Phi-1.5B-IFT-Math-Openrs / trainer_state.json

Model save

8234cfb verified 6 months ago

210 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 2.0,
	"eval_steps": 500,
	"global_step": 500,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.004,
	"grad_norm": 4.214743137359619,
	"kl": 0.0,
	"learning_rate": 2e-08,
	"loss": -0.0,
	"reward": -0.572140134871006,
	"reward_std": 0.3359133452177048,
	"rewards/cosine_scaled_reward": -0.286070067435503,
	"rewards/format_reward": 0.0,
	"step": 1
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.008,
	"grad_norm": 3.178635597229004,
	"kl": 0.0,
	"learning_rate": 4e-08,
	"loss": -0.0,
	"reward": -0.6001544743776321,
	"reward_std": 0.33404429256916046,
	"rewards/cosine_scaled_reward": -0.30007724463939667,
	"rewards/format_reward": 0.0,
	"step": 2
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.012,
	"grad_norm": 4.78328800201416,
	"kl": 6.908178329467773e-05,
	"learning_rate": 6e-08,
	"loss": 0.0,
	"reward": -0.502997636795044,
	"reward_std": 0.3310435339808464,
	"rewards/cosine_scaled_reward": -0.251498818397522,
	"rewards/format_reward": 0.0,
	"step": 3
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.016,
	"grad_norm": 3.9194376468658447,
	"kl": 6.488710641860962e-05,
	"learning_rate": 8e-08,
	"loss": 0.0,
	"reward": -0.5549568086862564,
	"reward_std": 0.3469474986195564,
	"rewards/cosine_scaled_reward": -0.2774783968925476,
	"rewards/format_reward": 0.0,
	"step": 4
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.02,
	"grad_norm": 3.903712511062622,
	"kl": 5.97834587097168e-05,
	"learning_rate": 1e-07,
	"loss": 0.0,
	"reward": -0.5800392031669617,
	"reward_std": 0.35274410992860794,
	"rewards/cosine_scaled_reward": -0.29001960158348083,
	"rewards/format_reward": 0.0,
	"step": 5
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.024,
	"grad_norm": 3.738009452819824,
	"kl": 6.499886512756348e-05,
	"learning_rate": 1.2e-07,
	"loss": 0.0,
	"reward": -0.5155884921550751,
	"reward_std": 0.37037966400384903,
	"rewards/cosine_scaled_reward": -0.25779424607753754,
	"rewards/format_reward": 0.0,
	"step": 6
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.028,
	"grad_norm": 2.794049024581909,
	"kl": 5.620718002319336e-05,
	"learning_rate": 1.4e-07,
	"loss": 0.0,
	"reward": -0.5175943374633789,
	"reward_std": 0.3494645953178406,
	"rewards/cosine_scaled_reward": -0.25879716128110886,
	"rewards/format_reward": 0.0,
	"step": 7
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.032,
	"grad_norm": 2.484722852706909,
	"kl": 8.106231689453125e-05,
	"learning_rate": 1.6e-07,
	"loss": 0.0,
	"reward": -0.5301882103085518,
	"reward_std": 0.3405821621417999,
	"rewards/cosine_scaled_reward": -0.2650941051542759,
	"rewards/format_reward": 0.0,
	"step": 8
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.036,
	"grad_norm": 3.1448230743408203,
	"kl": 7.554888725280762e-05,
	"learning_rate": 1.8e-07,
	"loss": 0.0,
	"reward": -0.5024237409234047,
	"reward_std": 0.3572370335459709,
	"rewards/cosine_scaled_reward": -0.25121185183525085,
	"rewards/format_reward": 0.0,
	"step": 9
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.04,
	"grad_norm": 4.125906944274902,
	"kl": 8.666515350341797e-05,
	"learning_rate": 2e-07,
	"loss": 0.0,
	"reward": -0.5732719898223877,
	"reward_std": 0.37079156190156937,
	"rewards/cosine_scaled_reward": -0.28663600236177444,
	"rewards/format_reward": 0.0,
	"step": 10
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.044,
	"grad_norm": 4.4225945472717285,
	"kl": 5.561113357543945e-05,
	"learning_rate": 2.1999999999999998e-07,
	"loss": 0.0,
	"reward": -0.5889493525028229,
	"reward_std": 0.3473696708679199,
	"rewards/cosine_scaled_reward": -0.29447468370199203,
	"rewards/format_reward": 0.0,
	"step": 11
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.048,
	"grad_norm": 3.891627550125122,
	"kl": 7.808208465576172e-05,
	"learning_rate": 2.4e-07,
	"loss": 0.0,
	"reward": -0.5409628972411156,
	"reward_std": 0.326653391122818,
	"rewards/cosine_scaled_reward": -0.2704814486205578,
	"rewards/format_reward": 0.0,
	"step": 12
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.052,
	"grad_norm": 3.552539587020874,
	"kl": 7.30752944946289e-05,
	"learning_rate": 2.6e-07,
	"loss": 0.0,
	"reward": -0.5389444306492805,
	"reward_std": 0.3649257719516754,
	"rewards/cosine_scaled_reward": -0.2694722190499306,
	"rewards/format_reward": 0.0,
	"step": 13
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.056,
	"grad_norm": 2.781034231185913,
	"kl": 7.081031799316406e-05,
	"learning_rate": 2.8e-07,
	"loss": 0.0,
	"reward": -0.6049635112285614,
	"reward_std": 0.3185788542032242,
	"rewards/cosine_scaled_reward": -0.3024817630648613,
	"rewards/format_reward": 0.0,
	"step": 14
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.06,
	"grad_norm": 3.412130355834961,
	"kl": 6.335973739624023e-05,
	"learning_rate": 3e-07,
	"loss": 0.0,
	"reward": -0.6299380213022232,
	"reward_std": 0.31315718591213226,
	"rewards/cosine_scaled_reward": -0.3149690255522728,
	"rewards/format_reward": 0.0,
	"step": 15
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.064,
	"grad_norm": 4.064192771911621,
	"kl": 0.00011527538299560547,
	"learning_rate": 3.2e-07,
	"loss": 0.0,
	"reward": -0.5638149380683899,
	"reward_std": 0.3539445400238037,
	"rewards/cosine_scaled_reward": -0.28190746903419495,
	"rewards/format_reward": 0.0,
	"step": 16
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.068,
	"grad_norm": 3.5826501846313477,
	"kl": 9.000301361083984e-05,
	"learning_rate": 3.4000000000000003e-07,
	"loss": 0.0,
	"reward": -0.5815131217241287,
	"reward_std": 0.3570765480399132,
	"rewards/cosine_scaled_reward": -0.29075656831264496,
	"rewards/format_reward": 0.0,
	"step": 17
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.072,
	"grad_norm": 3.4398193359375,
	"kl": 0.00013589859008789062,
	"learning_rate": 3.6e-07,
	"loss": 0.0,
	"reward": -0.5058030858635902,
	"reward_std": 0.3534058630466461,
	"rewards/cosine_scaled_reward": -0.2529015429317951,
	"rewards/format_reward": 0.0,
	"step": 18
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.076,
	"grad_norm": 3.1647567749023438,
	"kl": 0.00010588765144348145,
	"learning_rate": 3.7999999999999996e-07,
	"loss": 0.0,
	"reward": -0.5453799739480019,
	"reward_std": 0.3434706851840019,
	"rewards/cosine_scaled_reward": -0.27268998324871063,
	"rewards/format_reward": 0.0,
	"step": 19
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.08,
	"grad_norm": 4.028233528137207,
	"kl": 0.00011265277862548828,
	"learning_rate": 4e-07,
	"loss": 0.0,
	"reward": -0.5725424438714981,
	"reward_std": 0.33554956316947937,
	"rewards/cosine_scaled_reward": -0.28627122938632965,
	"rewards/format_reward": 0.0,
	"step": 20
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.084,
	"grad_norm": 3.0403409004211426,
	"kl": 0.00015485286712646484,
	"learning_rate": 4.1999999999999995e-07,
	"loss": 0.0,
	"reward": -0.5395064353942871,
	"reward_std": 0.3414423242211342,
	"rewards/cosine_scaled_reward": -0.26975322514772415,
	"rewards/format_reward": 0.0,
	"step": 21
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.088,
	"grad_norm": 3.5831127166748047,
	"kl": 0.0006537437438964844,
	"learning_rate": 4.3999999999999997e-07,
	"loss": 0.0,
	"reward": -0.5216317698359489,
	"reward_std": 0.3427959829568863,
	"rewards/cosine_scaled_reward": -0.2608158737421036,
	"rewards/format_reward": 0.0,
	"step": 22
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.092,
	"grad_norm": 3.5175235271453857,
	"kl": 0.0010776519775390625,
	"learning_rate": 4.6e-07,
	"loss": 0.0,
	"reward": -0.5413709655404091,
	"reward_std": 0.32718800008296967,
	"rewards/cosine_scaled_reward": -0.27068548277020454,
	"rewards/format_reward": 0.0,
	"step": 23
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.096,
	"grad_norm": 3.442873239517212,
	"kl": 0.0013303756713867188,
	"learning_rate": 4.8e-07,
	"loss": 0.0001,
	"reward": -0.5624926462769508,
	"reward_std": 0.3581688553094864,
	"rewards/cosine_scaled_reward": -0.2812463231384754,
	"rewards/format_reward": 0.0,
	"step": 24
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.1,
	"grad_norm": 2.6114015579223633,
	"kl": 0.0016193389892578125,
	"learning_rate": 5e-07,
	"loss": 0.0001,
	"reward": -0.5309188961982727,
	"reward_std": 0.33032629638910294,
	"rewards/cosine_scaled_reward": -0.26545944809913635,
	"rewards/format_reward": 0.0,
	"step": 25
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.104,
	"grad_norm": 4.818567752838135,
	"kl": 0.0026264190673828125,
	"learning_rate": 5.2e-07,
	"loss": 0.0001,
	"reward": -0.5884083956480026,
	"reward_std": 0.3386874794960022,
	"rewards/cosine_scaled_reward": -0.2942042052745819,
	"rewards/format_reward": 0.0,
	"step": 26
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.108,
	"grad_norm": 4.078734397888184,
	"kl": 0.002239227294921875,
	"learning_rate": 5.4e-07,
	"loss": 0.0001,
	"reward": -0.6157089024782181,
	"reward_std": 0.3308729752898216,
	"rewards/cosine_scaled_reward": -0.30785445868968964,
	"rewards/format_reward": 0.0,
	"step": 27
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.112,
	"grad_norm": 3.4599478244781494,
	"kl": 0.002338409423828125,
	"learning_rate": 5.6e-07,
	"loss": 0.0001,
	"reward": -0.5709060430526733,
	"reward_std": 0.3136204034090042,
	"rewards/cosine_scaled_reward": -0.28545302152633667,
	"rewards/format_reward": 0.0,
	"step": 28
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1533.9464416503906,
	"epoch": 0.116,
	"grad_norm": 3.461718797683716,
	"kl": 0.003444671630859375,
	"learning_rate": 5.8e-07,
	"loss": -0.001,
	"reward": -0.5237472280859947,
	"reward_std": 0.3601622208952904,
	"rewards/cosine_scaled_reward": -0.26187360659241676,
	"rewards/format_reward": 0.0,
	"step": 29
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.12,
	"grad_norm": 3.7205333709716797,
	"kl": 0.00542449951171875,
	"learning_rate": 6e-07,
	"loss": 0.0002,
	"reward": -0.5595864206552505,
	"reward_std": 0.3391585499048233,
	"rewards/cosine_scaled_reward": -0.2797932103276253,
	"rewards/format_reward": 0.0,
	"step": 30
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.124,
	"grad_norm": 3.639012575149536,
	"kl": 0.0102996826171875,
	"learning_rate": 6.2e-07,
	"loss": 0.0004,
	"reward": -0.5832120478153229,
	"reward_std": 0.34403981268405914,
	"rewards/cosine_scaled_reward": -0.29160603135824203,
	"rewards/format_reward": 0.0,
	"step": 31
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.128,
	"grad_norm": 3.499258041381836,
	"kl": 0.0159149169921875,
	"learning_rate": 6.4e-07,
	"loss": 0.0006,
	"reward": -0.5567401573061943,
	"reward_std": 0.3353060856461525,
	"rewards/cosine_scaled_reward": -0.27837007120251656,
	"rewards/format_reward": 0.0,
	"step": 32
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.132,
	"grad_norm": 3.564453601837158,
	"kl": 0.0182952880859375,
	"learning_rate": 6.6e-07,
	"loss": 0.0007,
	"reward": -0.5521366372704506,
	"reward_std": 0.3413034975528717,
	"rewards/cosine_scaled_reward": -0.2760683260858059,
	"rewards/format_reward": 0.0,
	"step": 33
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.136,
	"grad_norm": 3.567174196243286,
	"kl": 0.0237274169921875,
	"learning_rate": 6.800000000000001e-07,
	"loss": 0.0009,
	"reward": -0.5193822234869003,
	"reward_std": 0.35690775513648987,
	"rewards/cosine_scaled_reward": -0.25969111174345016,
	"rewards/format_reward": 0.0,
	"step": 34
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.14,
	"grad_norm": 2.247893810272217,
	"kl": 0.0149078369140625,
	"learning_rate": 7e-07,
	"loss": 0.0006,
	"reward": -0.5820326581597328,
	"reward_std": 0.3510446697473526,
	"rewards/cosine_scaled_reward": -0.2910163216292858,
	"rewards/format_reward": 0.0,
	"step": 35
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.144,
	"grad_norm": 2.9316084384918213,
	"kl": 0.022552490234375,
	"learning_rate": 7.2e-07,
	"loss": 0.0009,
	"reward": -0.5632490888237953,
	"reward_std": 0.3500733822584152,
	"rewards/cosine_scaled_reward": -0.28162455186247826,
	"rewards/format_reward": 0.0,
	"step": 36
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.148,
	"grad_norm": 3.5201869010925293,
	"kl": 0.02850341796875,
	"learning_rate": 7.4e-07,
	"loss": 0.0011,
	"reward": -0.5141241475939751,
	"reward_std": 0.3309687077999115,
	"rewards/cosine_scaled_reward": -0.25706208124756813,
	"rewards/format_reward": 0.0,
	"step": 37
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.152,
	"grad_norm": 2.7246434688568115,
	"kl": 0.0296630859375,
	"learning_rate": 7.599999999999999e-07,
	"loss": 0.0012,
	"reward": -0.5139049887657166,
	"reward_std": 0.33319953083992004,
	"rewards/cosine_scaled_reward": -0.25695250555872917,
	"rewards/format_reward": 0.0,
	"step": 38
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.156,
	"grad_norm": 2.880594491958618,
	"kl": 0.0258636474609375,
	"learning_rate": 7.799999999999999e-07,
	"loss": 0.001,
	"reward": -0.5646104216575623,
	"reward_std": 0.3474426791071892,
	"rewards/cosine_scaled_reward": -0.2823052257299423,
	"rewards/format_reward": 0.0,
	"step": 39
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.16,
	"grad_norm": 2.6734988689422607,
	"kl": 0.0321044921875,
	"learning_rate": 8e-07,
	"loss": 0.0013,
	"reward": -0.5586390048265457,
	"reward_std": 0.3474784344434738,
	"rewards/cosine_scaled_reward": -0.27931951731443405,
	"rewards/format_reward": 0.0,
	"step": 40
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.164,
	"grad_norm": 3.1370785236358643,
	"kl": 0.03369140625,
	"learning_rate": 8.199999999999999e-07,
	"loss": 0.0013,
	"reward": -0.5609789937734604,
	"reward_std": 0.3450735807418823,
	"rewards/cosine_scaled_reward": -0.280489519238472,
	"rewards/format_reward": 0.0,
	"step": 41
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.168,
	"grad_norm": 2.5502073764801025,
	"kl": 0.06072998046875,
	"learning_rate": 8.399999999999999e-07,
	"loss": 0.0024,
	"reward": -0.5195748135447502,
	"reward_std": 0.34474433213472366,
	"rewards/cosine_scaled_reward": -0.2597874030470848,
	"rewards/format_reward": 0.0,
	"step": 42
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.172,
	"grad_norm": 2.1381213665008545,
	"kl": 0.067474365234375,
	"learning_rate": 8.599999999999999e-07,
	"loss": 0.0027,
	"reward": -0.5580533072352409,
	"reward_std": 0.32987529784440994,
	"rewards/cosine_scaled_reward": -0.27902666106820107,
	"rewards/format_reward": 0.0,
	"step": 43
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.176,
	"grad_norm": 2.1730432510375977,
	"kl": 0.0958251953125,
	"learning_rate": 8.799999999999999e-07,
	"loss": 0.0038,
	"reward": -0.5585729256272316,
	"reward_std": 0.3295438587665558,
	"rewards/cosine_scaled_reward": -0.2792864739894867,
	"rewards/format_reward": 0.0,
	"step": 44
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.18,
	"grad_norm": 1.962768316268921,
	"kl": 0.079345703125,
	"learning_rate": 9e-07,
	"loss": 0.0032,
	"reward": -0.5980347394943237,
	"reward_std": 0.3284436762332916,
	"rewards/cosine_scaled_reward": -0.29901736974716187,
	"rewards/format_reward": 0.0,
	"step": 45
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.184,
	"grad_norm": 1.8276231288909912,
	"kl": 0.1153564453125,
	"learning_rate": 9.2e-07,
	"loss": 0.0046,
	"reward": -0.507519856095314,
	"reward_std": 0.33579862862825394,
	"rewards/cosine_scaled_reward": -0.2537599205970764,
	"rewards/format_reward": 0.0,
	"step": 46
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.188,
	"grad_norm": 2.608023166656494,
	"kl": 0.09033203125,
	"learning_rate": 9.399999999999999e-07,
	"loss": 0.0036,
	"reward": -0.5289521142840385,
	"reward_std": 0.31808041036129,
	"rewards/cosine_scaled_reward": -0.26447605714201927,
	"rewards/format_reward": 0.0,
	"step": 47
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.192,
	"grad_norm": 1.8956966400146484,
	"kl": 0.09814453125,
	"learning_rate": 9.6e-07,
	"loss": 0.0039,
	"reward": -0.566174179315567,
	"reward_std": 0.311339795589447,
	"rewards/cosine_scaled_reward": -0.2830870673060417,
	"rewards/format_reward": 0.0,
	"step": 48
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.196,
	"grad_norm": 1.7705461978912354,
	"kl": 0.1209716796875,
	"learning_rate": 9.8e-07,
	"loss": 0.0048,
	"reward": -0.528024435043335,
	"reward_std": 0.36330366879701614,
	"rewards/cosine_scaled_reward": -0.26401223987340927,
	"rewards/format_reward": 0.0,
	"step": 49
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.2,
	"grad_norm": 2.1113531589508057,
	"kl": 0.1171875,
	"learning_rate": 1e-06,
	"loss": 0.0047,
	"reward": -0.4406622089445591,
	"reward_std": 0.3163011893630028,
	"rewards/cosine_scaled_reward": -0.2203311063349247,
	"rewards/format_reward": 0.0,
	"step": 50
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.204,
	"grad_norm": 1.803585410118103,
	"kl": 0.1026611328125,
	"learning_rate": 9.999890338174275e-07,
	"loss": 0.0041,
	"reward": -0.5815826654434204,
	"reward_std": 0.3248438388109207,
	"rewards/cosine_scaled_reward": -0.2907913327217102,
	"rewards/format_reward": 0.0,
	"step": 51
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.208,
	"grad_norm": 1.7076486349105835,
	"kl": 0.157470703125,
	"learning_rate": 9.999561358041868e-07,
	"loss": 0.0063,
	"reward": -0.5362438708543777,
	"reward_std": 0.2975444979965687,
	"rewards/cosine_scaled_reward": -0.26812195032835007,
	"rewards/format_reward": 0.0,
	"step": 52
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.212,
	"grad_norm": 2.478224515914917,
	"kl": 0.144287109375,
	"learning_rate": 9.999013075636804e-07,
	"loss": 0.0058,
	"reward": -0.47916819900274277,
	"reward_std": 0.35621220618486404,
	"rewards/cosine_scaled_reward": -0.23958410695195198,
	"rewards/format_reward": 0.0,
	"step": 53
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.216,
	"grad_norm": 2.006901502609253,
	"kl": 0.1337890625,
	"learning_rate": 9.998245517681593e-07,
	"loss": 0.0053,
	"reward": -0.5450761765241623,
	"reward_std": 0.32576631009578705,
	"rewards/cosine_scaled_reward": -0.27253808826208115,
	"rewards/format_reward": 0.0,
	"step": 54
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.22,
	"grad_norm": 2.2259609699249268,
	"kl": 0.11669921875,
	"learning_rate": 9.997258721585931e-07,
	"loss": 0.0047,
	"reward": -0.5271478518843651,
	"reward_std": 0.34441374242305756,
	"rewards/cosine_scaled_reward": -0.26357391849160194,
	"rewards/format_reward": 0.0,
	"step": 55
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.224,
	"grad_norm": 2.020939588546753,
	"kl": 0.1907958984375,
	"learning_rate": 9.996052735444862e-07,
	"loss": 0.0076,
	"reward": -0.5367654263973236,
	"reward_std": 0.3470792919397354,
	"rewards/cosine_scaled_reward": -0.2683827131986618,
	"rewards/format_reward": 0.0,
	"step": 56
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.228,
	"grad_norm": 1.9356812238693237,
	"kl": 0.158935546875,
	"learning_rate": 9.994627618036452e-07,
	"loss": 0.0064,
	"reward": -0.505635529756546,
	"reward_std": 0.3292393088340759,
	"rewards/cosine_scaled_reward": -0.252817764878273,
	"rewards/format_reward": 0.0,
	"step": 57
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.232,
	"grad_norm": 3.2483060359954834,
	"kl": 0.188720703125,
	"learning_rate": 9.992983438818915e-07,
	"loss": 0.0075,
	"reward": -0.504822663962841,
	"reward_std": 0.35463710874319077,
	"rewards/cosine_scaled_reward": -0.2524113282561302,
	"rewards/format_reward": 0.0,
	"step": 58
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.236,
	"grad_norm": 2.2256879806518555,
	"kl": 0.205322265625,
	"learning_rate": 9.991120277927223e-07,
	"loss": 0.0082,
	"reward": -0.5851711928844452,
	"reward_std": 0.3146449252963066,
	"rewards/cosine_scaled_reward": -0.2925856038928032,
	"rewards/format_reward": 0.0,
	"step": 59
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.24,
	"grad_norm": 2.093649387359619,
	"kl": 0.198486328125,
	"learning_rate": 9.989038226169207e-07,
	"loss": 0.0079,
	"reward": -0.45284587889909744,
	"reward_std": 0.34760017693042755,
	"rewards/cosine_scaled_reward": -0.22642293944954872,
	"rewards/format_reward": 0.0,
	"step": 60
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.244,
	"grad_norm": 2.378591537475586,
	"kl": 0.24365234375,
	"learning_rate": 9.98673738502114e-07,
	"loss": 0.0097,
	"reward": -0.5091445297002792,
	"reward_std": 0.3452131450176239,
	"rewards/cosine_scaled_reward": -0.2545722760260105,
	"rewards/format_reward": 0.0,
	"step": 61
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.248,
	"grad_norm": 2.188553810119629,
	"kl": 0.29833984375,
	"learning_rate": 9.98421786662277e-07,
	"loss": 0.0119,
	"reward": -0.47440846264362335,
	"reward_std": 0.34785814583301544,
	"rewards/cosine_scaled_reward": -0.23720423132181168,
	"rewards/format_reward": 0.0,
	"step": 62
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.252,
	"grad_norm": 2.6211366653442383,
	"kl": 0.48095703125,
	"learning_rate": 9.981479793771866e-07,
	"loss": 0.0192,
	"reward": -0.46701501309871674,
	"reward_std": 0.3275434151291847,
	"rewards/cosine_scaled_reward": -0.23350750654935837,
	"rewards/format_reward": 0.0,
	"step": 63
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.256,
	"grad_norm": 3.608039617538452,
	"kl": 0.63720703125,
	"learning_rate": 9.97852329991824e-07,
	"loss": 0.0254,
	"reward": -0.4022144228219986,
	"reward_std": 0.3280187249183655,
	"rewards/cosine_scaled_reward": -0.2011072114109993,
	"rewards/format_reward": 0.0,
	"step": 64
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.26,
	"grad_norm": 2.1589713096618652,
	"kl": 0.587890625,
	"learning_rate": 9.975348529157229e-07,
	"loss": 0.0236,
	"reward": -0.4902011975646019,
	"reward_std": 0.33829304575920105,
	"rewards/cosine_scaled_reward": -0.24510059878230095,
	"rewards/format_reward": 0.0,
	"step": 65
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.264,
	"grad_norm": 4.391396522521973,
	"kl": 0.851806640625,
	"learning_rate": 9.971955636222684e-07,
	"loss": 0.034,
	"reward": -0.5337588116526604,
	"reward_std": 0.3271815627813339,
	"rewards/cosine_scaled_reward": -0.2668794058263302,
	"rewards/format_reward": 0.0,
	"step": 66
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.268,
	"grad_norm": 4.296882629394531,
	"kl": 0.892333984375,
	"learning_rate": 9.968344786479415e-07,
	"loss": 0.0357,
	"reward": -0.45740216970443726,
	"reward_std": 0.32497797161340714,
	"rewards/cosine_scaled_reward": -0.22870109230279922,
	"rewards/format_reward": 0.0,
	"step": 67
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.272,
	"grad_norm": 7.224793434143066,
	"kl": 1.29736328125,
	"learning_rate": 9.964516155915151e-07,
	"loss": 0.0519,
	"reward": -0.5055549815297127,
	"reward_std": 0.3318631425499916,
	"rewards/cosine_scaled_reward": -0.25277747586369514,
	"rewards/format_reward": 0.0,
	"step": 68
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.276,
	"grad_norm": 6.747034072875977,
	"kl": 1.3232421875,
	"learning_rate": 9.960469931131936e-07,
	"loss": 0.0531,
	"reward": -0.4314222186803818,
	"reward_std": 0.31476689875125885,
	"rewards/cosine_scaled_reward": -0.21571110002696514,
	"rewards/format_reward": 0.0,
	"step": 69
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.28,
	"grad_norm": 5.5595808029174805,
	"kl": 0.8935546875,
	"learning_rate": 9.956206309337066e-07,
	"loss": 0.0358,
	"reward": -0.4758576303720474,
	"reward_std": 0.33101003617048264,
	"rewards/cosine_scaled_reward": -0.2379288226366043,
	"rewards/format_reward": 0.0,
	"step": 70
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.284,
	"grad_norm": 2.4482791423797607,
	"kl": 0.521484375,
	"learning_rate": 9.951725498333448e-07,
	"loss": 0.0209,
	"reward": -0.4491276890039444,
	"reward_std": 0.3567735329270363,
	"rewards/cosine_scaled_reward": -0.2245638445019722,
	"rewards/format_reward": 0.0,
	"step": 71
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.288,
	"grad_norm": 3.1987600326538086,
	"kl": 0.6240234375,
	"learning_rate": 9.947027716509488e-07,
	"loss": 0.025,
	"reward": -0.43654023110866547,
	"reward_std": 0.3590875416994095,
	"rewards/cosine_scaled_reward": -0.21827011927962303,
	"rewards/format_reward": 0.0,
	"step": 72
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.292,
	"grad_norm": 4.885537147521973,
	"kl": 1.14599609375,
	"learning_rate": 9.942113192828444e-07,
	"loss": 0.0458,
	"reward": -0.5265215784311295,
	"reward_std": 0.3363535851240158,
	"rewards/cosine_scaled_reward": -0.26326077431440353,
	"rewards/format_reward": 0.0,
	"step": 73
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.296,
	"grad_norm": 3.4503629207611084,
	"kl": 1.14794921875,
	"learning_rate": 9.93698216681727e-07,
	"loss": 0.0459,
	"reward": -0.4836200848221779,
	"reward_std": 0.33076073229312897,
	"rewards/cosine_scaled_reward": -0.24181004241108894,
	"rewards/format_reward": 0.0,
	"step": 74
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.3,
	"grad_norm": 3.5954651832580566,
	"kl": 0.6767578125,
	"learning_rate": 9.931634888554935e-07,
	"loss": 0.027,
	"reward": -0.5548510551452637,
	"reward_std": 0.3006826713681221,
	"rewards/cosine_scaled_reward": -0.27742552757263184,
	"rewards/format_reward": 0.0,
	"step": 75
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.304,
	"grad_norm": 2.27148699760437,
	"kl": 0.69970703125,
	"learning_rate": 9.926071618660237e-07,
	"loss": 0.028,
	"reward": -0.5522997975349426,
	"reward_std": 0.32217612117528915,
	"rewards/cosine_scaled_reward": -0.2761498987674713,
	"rewards/format_reward": 0.0,
	"step": 76
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.308,
	"grad_norm": 2.421114206314087,
	"kl": 0.65234375,
	"learning_rate": 9.9202926282791e-07,
	"loss": 0.0261,
	"reward": -0.5491495952010155,
	"reward_std": 0.33891358226537704,
	"rewards/cosine_scaled_reward": -0.27457480505108833,
	"rewards/format_reward": 0.0,
	"step": 77
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.312,
	"grad_norm": 2.296977996826172,
	"kl": 0.4833984375,
	"learning_rate": 9.91429819907136e-07,
	"loss": 0.0193,
	"reward": -0.5332002714276314,
	"reward_std": 0.3453890234231949,
	"rewards/cosine_scaled_reward": -0.2666001245379448,
	"rewards/format_reward": 0.0,
	"step": 78
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.316,
	"grad_norm": 2.351818084716797,
	"kl": 0.5048828125,
	"learning_rate": 9.908088623197048e-07,
	"loss": 0.0202,
	"reward": -0.4974421188235283,
	"reward_std": 0.36291657388210297,
	"rewards/cosine_scaled_reward": -0.24872105196118355,
	"rewards/format_reward": 0.0,
	"step": 79
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.32,
	"grad_norm": 2.808706521987915,
	"kl": 0.53125,
	"learning_rate": 9.901664203302124e-07,
	"loss": 0.0212,
	"reward": -0.5026201903820038,
	"reward_std": 0.30610421299934387,
	"rewards/cosine_scaled_reward": -0.2513100877404213,
	"rewards/format_reward": 0.0,
	"step": 80
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.324,
	"grad_norm": 2.077920913696289,
	"kl": 0.68994140625,
	"learning_rate": 9.895025252503755e-07,
	"loss": 0.0276,
	"reward": -0.4621705636382103,
	"reward_std": 0.33135028183460236,
	"rewards/cosine_scaled_reward": -0.23108528181910515,
	"rewards/format_reward": 0.0,
	"step": 81
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.328,
	"grad_norm": 2.951878309249878,
	"kl": 0.6015625,
	"learning_rate": 9.888172094375033e-07,
	"loss": 0.024,
	"reward": -0.5148988738656044,
	"reward_std": 0.3465086743235588,
	"rewards/cosine_scaled_reward": -0.2574494294822216,
	"rewards/format_reward": 0.0,
	"step": 82
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.332,
	"grad_norm": 2.1016077995300293,
	"kl": 0.36376953125,
	"learning_rate": 9.881105062929221e-07,
	"loss": 0.0145,
	"reward": -0.48821673542261124,
	"reward_std": 0.35235296189785004,
	"rewards/cosine_scaled_reward": -0.24410836026072502,
	"rewards/format_reward": 0.0,
	"step": 83
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.336,
	"grad_norm": 2.276076555252075,
	"kl": 0.77734375,
	"learning_rate": 9.873824502603459e-07,
	"loss": 0.0311,
	"reward": -0.509700171649456,
	"reward_std": 0.3434828519821167,
	"rewards/cosine_scaled_reward": -0.2548500932753086,
	"rewards/format_reward": 0.0,
	"step": 84
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.34,
	"grad_norm": 1.9953871965408325,
	"kl": 0.45263671875,
	"learning_rate": 9.866330768241983e-07,
	"loss": 0.0181,
	"reward": -0.5046856477856636,
	"reward_std": 0.3276178315281868,
	"rewards/cosine_scaled_reward": -0.2523428313434124,
	"rewards/format_reward": 0.0,
	"step": 85
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.344,
	"grad_norm": 5.694060802459717,
	"kl": 1.50390625,
	"learning_rate": 9.85862422507884e-07,
	"loss": 0.06,
	"reward": -0.5268296301364899,
	"reward_std": 0.3594844192266464,
	"rewards/cosine_scaled_reward": -0.26341481506824493,
	"rewards/format_reward": 0.0,
	"step": 86
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.348,
	"grad_norm": 2.5820319652557373,
	"kl": 0.79931640625,
	"learning_rate": 9.850705248720068e-07,
	"loss": 0.0319,
	"reward": -0.5030437260866165,
	"reward_std": 0.33297523856163025,
	"rewards/cosine_scaled_reward": -0.25152185559272766,
	"rewards/format_reward": 0.0,
	"step": 87
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.352,
	"grad_norm": 2.748469829559326,
	"kl": 0.8642578125,
	"learning_rate": 9.8425742251254e-07,
	"loss": 0.0346,
	"reward": -0.511917307972908,
	"reward_std": 0.3373011276125908,
	"rewards/cosine_scaled_reward": -0.255958653986454,
	"rewards/format_reward": 0.0,
	"step": 88
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.356,
	"grad_norm": 2.941894054412842,
	"kl": 1.10400390625,
	"learning_rate": 9.83423155058946e-07,
	"loss": 0.0443,
	"reward": -0.49383244663476944,
	"reward_std": 0.3190907835960388,
	"rewards/cosine_scaled_reward": -0.24691622331738472,
	"rewards/format_reward": 0.0,
	"step": 89
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.36,
	"grad_norm": 2.5008065700531006,
	"kl": 0.7451171875,
	"learning_rate": 9.825677631722435e-07,
	"loss": 0.0298,
	"reward": -0.5015105679631233,
	"reward_std": 0.3283078894019127,
	"rewards/cosine_scaled_reward": -0.25075526908040047,
	"rewards/format_reward": 0.0,
	"step": 90
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.364,
	"grad_norm": 2.775805950164795,
	"kl": 0.8662109375,
	"learning_rate": 9.816912885430258e-07,
	"loss": 0.0347,
	"reward": -0.49317121505737305,
	"reward_std": 0.3281624838709831,
	"rewards/cosine_scaled_reward": -0.24658560752868652,
	"rewards/format_reward": 0.0,
	"step": 91
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.368,
	"grad_norm": 4.057337284088135,
	"kl": 1.3115234375,
	"learning_rate": 9.807937738894303e-07,
	"loss": 0.0525,
	"reward": -0.4923912510275841,
	"reward_std": 0.334882490336895,
	"rewards/cosine_scaled_reward": -0.24619561806321144,
	"rewards/format_reward": 0.0,
	"step": 92
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.372,
	"grad_norm": 3.3191726207733154,
	"kl": 1.416015625,
	"learning_rate": 9.798752629550546e-07,
	"loss": 0.0567,
	"reward": -0.4856347441673279,
	"reward_std": 0.3141849860548973,
	"rewards/cosine_scaled_reward": -0.24281736463308334,
	"rewards/format_reward": 0.0,
	"step": 93
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.376,
	"grad_norm": 38.36699676513672,
	"kl": 3.833984375,
	"learning_rate": 9.78935800506826e-07,
	"loss": 0.1535,
	"reward": -0.5001253262162209,
	"reward_std": 0.34716712683439255,
	"rewards/cosine_scaled_reward": -0.25006265565752983,
	"rewards/format_reward": 0.0,
	"step": 94
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.38,
	"grad_norm": 2.851670742034912,
	"kl": 0.93017578125,
	"learning_rate": 9.779754323328192e-07,
	"loss": 0.0372,
	"reward": -0.4462156817317009,
	"reward_std": 0.3170738257467747,
	"rewards/cosine_scaled_reward": -0.22310783341526985,
	"rewards/format_reward": 0.0,
	"step": 95
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.384,
	"grad_norm": 1.903143048286438,
	"kl": 0.662109375,
	"learning_rate": 9.769942052400235e-07,
	"loss": 0.0265,
	"reward": -0.44278524816036224,
	"reward_std": 0.340934194624424,
	"rewards/cosine_scaled_reward": -0.22139262408018112,
	"rewards/format_reward": 0.0,
	"step": 96
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.388,
	"grad_norm": 2.613619089126587,
	"kl": 1.0009765625,
	"learning_rate": 9.759921670520634e-07,
	"loss": 0.04,
	"reward": -0.4385986104607582,
	"reward_std": 0.3297598212957382,
	"rewards/cosine_scaled_reward": -0.2192993052303791,
	"rewards/format_reward": 0.0,
	"step": 97
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.392,
	"grad_norm": 2.1393027305603027,
	"kl": 0.84912109375,
	"learning_rate": 9.749693666068663e-07,
	"loss": 0.0339,
	"reward": -0.4335070326924324,
	"reward_std": 0.3084552064538002,
	"rewards/cosine_scaled_reward": -0.2167535126209259,
	"rewards/format_reward": 0.0,
	"step": 98
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.396,
	"grad_norm": 10.226459503173828,
	"kl": 1.9765625,
	"learning_rate": 9.739258537542835e-07,
	"loss": 0.0791,
	"reward": -0.5120433643460274,
	"reward_std": 0.3308994993567467,
	"rewards/cosine_scaled_reward": -0.2560216821730137,
	"rewards/format_reward": 0.0,
	"step": 99
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.4,
	"grad_norm": 2.7042365074157715,
	"kl": 1.140625,
	"learning_rate": 9.728616793536587e-07,
	"loss": 0.0456,
	"reward": -0.5387645438313484,
	"reward_std": 0.32419781386852264,
	"rewards/cosine_scaled_reward": -0.2693822719156742,
	"rewards/format_reward": 0.0,
	"step": 100
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.404,
	"grad_norm": 3.3440866470336914,
	"kl": 1.158203125,
	"learning_rate": 9.717768952713511e-07,
	"loss": 0.0464,
	"reward": -0.479642316699028,
	"reward_std": 0.3374394252896309,
	"rewards/cosine_scaled_reward": -0.2398211695253849,
	"rewards/format_reward": 0.0,
	"step": 101
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.408,
	"grad_norm": 2.1483707427978516,
	"kl": 0.55859375,
	"learning_rate": 9.706715543782064e-07,
	"loss": 0.0224,
	"reward": -0.4488200396299362,
	"reward_std": 0.3361233174800873,
	"rewards/cosine_scaled_reward": -0.2244100198149681,
	"rewards/format_reward": 0.0,
	"step": 102
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.412,
	"grad_norm": 4.173567771911621,
	"kl": 1.900390625,
	"learning_rate": 9.695457105469804e-07,
	"loss": 0.0759,
	"reward": -0.4979688450694084,
	"reward_std": 0.35078077018260956,
	"rewards/cosine_scaled_reward": -0.2489844374358654,
	"rewards/format_reward": 0.0,
	"step": 103
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.416,
	"grad_norm": 5.119884490966797,
	"kl": 1.611328125,
	"learning_rate": 9.683994186497132e-07,
	"loss": 0.0644,
	"reward": -0.513933926820755,
	"reward_std": 0.3170707896351814,
	"rewards/cosine_scaled_reward": -0.2569669596850872,
	"rewards/format_reward": 0.0,
	"step": 104
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.42,
	"grad_norm": 2.8145992755889893,
	"kl": 1.466796875,
	"learning_rate": 9.672327345550543e-07,
	"loss": 0.0587,
	"reward": -0.47269363701343536,
	"reward_std": 0.31501560658216476,
	"rewards/cosine_scaled_reward": -0.23634683340787888,
	"rewards/format_reward": 0.0,
	"step": 105
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.424,
	"grad_norm": 2.3274426460266113,
	"kl": 0.59033203125,
	"learning_rate": 9.66045715125541e-07,
	"loss": 0.0236,
	"reward": -0.44968922436237335,
	"reward_std": 0.3498781695961952,
	"rewards/cosine_scaled_reward": -0.22484461963176727,
	"rewards/format_reward": 0.0,
	"step": 106
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.428,
	"grad_norm": 2.2112016677856445,
	"kl": 1.126953125,
	"learning_rate": 9.648384182148252e-07,
	"loss": 0.0451,
	"reward": -0.5002073347568512,
	"reward_std": 0.34406865388154984,
	"rewards/cosine_scaled_reward": -0.2501036673784256,
	"rewards/format_reward": 0.0,
	"step": 107
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.432,
	"grad_norm": 2.4664499759674072,
	"kl": 1.0986328125,
	"learning_rate": 9.636109026648554e-07,
	"loss": 0.0439,
	"reward": -0.49009862542152405,
	"reward_std": 0.3558028042316437,
	"rewards/cosine_scaled_reward": -0.24504930526018143,
	"rewards/format_reward": 0.0,
	"step": 108
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.436,
	"grad_norm": 2.3740482330322266,
	"kl": 0.67578125,
	"learning_rate": 9.623632283030077e-07,
	"loss": 0.027,
	"reward": -0.4631711468100548,
	"reward_std": 0.34275270998477936,
	"rewards/cosine_scaled_reward": -0.2315855734050274,
	"rewards/format_reward": 0.0,
	"step": 109
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.44,
	"grad_norm": 2.9116501808166504,
	"kl": 1.1826171875,
	"learning_rate": 9.610954559391704e-07,
	"loss": 0.0473,
	"reward": -0.444116935133934,
	"reward_std": 0.37212707847356796,
	"rewards/cosine_scaled_reward": -0.2220584638416767,
	"rewards/format_reward": 0.0,
	"step": 110
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.444,
	"grad_norm": 2.24743390083313,
	"kl": 0.638671875,
	"learning_rate": 9.598076473627796e-07,
	"loss": 0.0255,
	"reward": -0.46286992728710175,
	"reward_std": 0.3208693787455559,
	"rewards/cosine_scaled_reward": -0.23143497854471207,
	"rewards/format_reward": 0.0,
	"step": 111
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.448,
	"grad_norm": 3.138840913772583,
	"kl": 1.14404296875,
	"learning_rate": 9.58499865339809e-07,
	"loss": 0.0458,
	"reward": -0.4803452715277672,
	"reward_std": 0.3449332043528557,
	"rewards/cosine_scaled_reward": -0.2401726357638836,
	"rewards/format_reward": 0.0,
	"step": 112
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.452,
	"grad_norm": 2.7688963413238525,
	"kl": 0.9462890625,
	"learning_rate": 9.571721736097088e-07,
	"loss": 0.0379,
	"reward": -0.4440384730696678,
	"reward_std": 0.3389856517314911,
	"rewards/cosine_scaled_reward": -0.2220192365348339,
	"rewards/format_reward": 0.0,
	"step": 113
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.456,
	"grad_norm": 2.7298948764801025,
	"kl": 1.3583984375,
	"learning_rate": 9.55824636882301e-07,
	"loss": 0.0544,
	"reward": -0.40611616894602776,
	"reward_std": 0.3120696693658829,
	"rewards/cosine_scaled_reward": -0.20305808261036873,
	"rewards/format_reward": 0.0,
	"step": 114
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.46,
	"grad_norm": 2.628330945968628,
	"kl": 0.84521484375,
	"learning_rate": 9.54457320834625e-07,
	"loss": 0.0338,
	"reward": -0.41812988370656967,
	"reward_std": 0.33337801694869995,
	"rewards/cosine_scaled_reward": -0.20906493440270424,
	"rewards/format_reward": 0.0,
	"step": 115
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.464,
	"grad_norm": 2.21708607673645,
	"kl": 1.125,
	"learning_rate": 9.530702921077358e-07,
	"loss": 0.0451,
	"reward": -0.4452592432498932,
	"reward_std": 0.34758392721414566,
	"rewards/cosine_scaled_reward": -0.2226296216249466,
	"rewards/format_reward": 0.0,
	"step": 116
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.468,
	"grad_norm": 3.4151782989501953,
	"kl": 1.5390625,
	"learning_rate": 9.516636183034564e-07,
	"loss": 0.0617,
	"reward": -0.5043663010001183,
	"reward_std": 0.3056981936097145,
	"rewards/cosine_scaled_reward": -0.25218314677476883,
	"rewards/format_reward": 0.0,
	"step": 117
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.472,
	"grad_norm": 2.8809969425201416,
	"kl": 1.498046875,
	"learning_rate": 9.502373679810839e-07,
	"loss": 0.0599,
	"reward": -0.44362927228212357,
	"reward_std": 0.32765333354473114,
	"rewards/cosine_scaled_reward": -0.22181464359164238,
	"rewards/format_reward": 0.0,
	"step": 118
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.476,
	"grad_norm": 3.092552661895752,
	"kl": 1.6640625,
	"learning_rate": 9.487916106540465e-07,
	"loss": 0.0665,
	"reward": -0.49818655103445053,
	"reward_std": 0.3495415672659874,
	"rewards/cosine_scaled_reward": -0.24909326806664467,
	"rewards/format_reward": 0.0,
	"step": 119
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.48,
	"grad_norm": 3.2943530082702637,
	"kl": 2.07421875,
	"learning_rate": 9.473264167865171e-07,
	"loss": 0.0829,
	"reward": -0.4802135229110718,
	"reward_std": 0.3453461080789566,
	"rewards/cosine_scaled_reward": -0.24010677635669708,
	"rewards/format_reward": 0.0,
	"step": 120
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.484,
	"grad_norm": 2.5681769847869873,
	"kl": 1.505859375,
	"learning_rate": 9.458418577899774e-07,
	"loss": 0.0603,
	"reward": -0.5175792872905731,
	"reward_std": 0.35768260806798935,
	"rewards/cosine_scaled_reward": -0.25878964737057686,
	"rewards/format_reward": 0.0,
	"step": 121
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.488,
	"grad_norm": 2.9190571308135986,
	"kl": 1.57373046875,
	"learning_rate": 9.443380060197385e-07,
	"loss": 0.063,
	"reward": -0.46548449248075485,
	"reward_std": 0.35348332673311234,
	"rewards/cosine_scaled_reward": -0.23274223506450653,
	"rewards/format_reward": 0.0,
	"step": 122
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.492,
	"grad_norm": 2.435157537460327,
	"kl": 1.0654296875,
	"learning_rate": 9.428149347714143e-07,
	"loss": 0.0427,
	"reward": -0.4281177818775177,
	"reward_std": 0.3503784313797951,
	"rewards/cosine_scaled_reward": -0.21405889093875885,
	"rewards/format_reward": 0.0,
	"step": 123
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.496,
	"grad_norm": 3.1375350952148438,
	"kl": 1.5625,
	"learning_rate": 9.412727182773486e-07,
	"loss": 0.0624,
	"reward": -0.4667646959424019,
	"reward_std": 0.3501163199543953,
	"rewards/cosine_scaled_reward": -0.23338234052062035,
	"rewards/format_reward": 0.0,
	"step": 124
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.5,
	"grad_norm": 2.1935606002807617,
	"kl": 1.3427734375,
	"learning_rate": 9.397114317029974e-07,
	"loss": 0.0537,
	"reward": -0.4283955693244934,
	"reward_std": 0.34814615547657013,
	"rewards/cosine_scaled_reward": -0.2141977809369564,
	"rewards/format_reward": 0.0,
	"step": 125
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.504,
	"grad_norm": 2.727754592895508,
	"kl": 1.35546875,
	"learning_rate": 9.381311511432658e-07,
	"loss": 0.0543,
	"reward": -0.4584430381655693,
	"reward_std": 0.3318573832511902,
	"rewards/cosine_scaled_reward": -0.22922151535749435,
	"rewards/format_reward": 0.0,
	"step": 126
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.508,
	"grad_norm": 2.9863674640655518,
	"kl": 1.509765625,
	"learning_rate": 9.36531953618799e-07,
	"loss": 0.0603,
	"reward": -0.4794049710035324,
	"reward_std": 0.3224741891026497,
	"rewards/cosine_scaled_reward": -0.2397024855017662,
	"rewards/format_reward": 0.0,
	"step": 127
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.512,
	"grad_norm": 3.0583863258361816,
	"kl": 1.5751953125,
	"learning_rate": 9.34913917072228e-07,
	"loss": 0.0631,
	"reward": -0.3896471783518791,
	"reward_std": 0.32155635207891464,
	"rewards/cosine_scaled_reward": -0.19482359662652016,
	"rewards/format_reward": 0.0,
	"step": 128
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.516,
	"grad_norm": 11.888484001159668,
	"kl": 2.1806640625,
	"learning_rate": 9.332771203643714e-07,
	"loss": 0.0874,
	"reward": -0.46486661583185196,
	"reward_std": 0.34625906497240067,
	"rewards/cosine_scaled_reward": -0.23243330791592598,
	"rewards/format_reward": 0.0,
	"step": 129
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.52,
	"grad_norm": 3.14744234085083,
	"kl": 1.1103515625,
	"learning_rate": 9.316216432703916e-07,
	"loss": 0.0445,
	"reward": -0.4691261351108551,
	"reward_std": 0.3357261121273041,
	"rewards/cosine_scaled_reward": -0.23456306010484695,
	"rewards/format_reward": 0.0,
	"step": 130
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.524,
	"grad_norm": 2.6933717727661133,
	"kl": 1.76171875,
	"learning_rate": 9.299475664759068e-07,
	"loss": 0.0705,
	"reward": -0.5458347946405411,
	"reward_std": 0.3296028599143028,
	"rewards/cosine_scaled_reward": -0.27291740477085114,
	"rewards/format_reward": 0.0,
	"step": 131
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.528,
	"grad_norm": 2.695984363555908,
	"kl": 1.2666015625,
	"learning_rate": 9.282549715730579e-07,
	"loss": 0.0506,
	"reward": -0.43337278813123703,
	"reward_std": 0.3223467916250229,
	"rewards/cosine_scaled_reward": -0.2166864052414894,
	"rewards/format_reward": 0.0,
	"step": 132
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.532,
	"grad_norm": 2.1844236850738525,
	"kl": 1.072265625,
	"learning_rate": 9.265439410565328e-07,
	"loss": 0.0429,
	"reward": -0.47815513610839844,
	"reward_std": 0.33408980816602707,
	"rewards/cosine_scaled_reward": -0.23907756060361862,
	"rewards/format_reward": 0.0,
	"step": 133
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.536,
	"grad_norm": 2.6240434646606445,
	"kl": 0.998046875,
	"learning_rate": 9.248145583195447e-07,
	"loss": 0.0399,
	"reward": -0.3596036769449711,
	"reward_std": 0.3202332779765129,
	"rewards/cosine_scaled_reward": -0.17980184871703386,
	"rewards/format_reward": 0.0,
	"step": 134
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.54,
	"grad_norm": 2.413489580154419,
	"kl": 1.515625,
	"learning_rate": 9.230669076497687e-07,
	"loss": 0.0607,
	"reward": -0.3980662524700165,
	"reward_std": 0.3146558068692684,
	"rewards/cosine_scaled_reward": -0.19903312623500824,
	"rewards/format_reward": 0.0,
	"step": 135
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.544,
	"grad_norm": 2.5466983318328857,
	"kl": 1.421875,
	"learning_rate": 9.213010742252327e-07,
	"loss": 0.0568,
	"reward": -0.4567502960562706,
	"reward_std": 0.36093486845493317,
	"rewards/cosine_scaled_reward": -0.2283751629292965,
	"rewards/format_reward": 0.0,
	"step": 136
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.548,
	"grad_norm": 2.670454263687134,
	"kl": 1.63671875,
	"learning_rate": 9.195171441101668e-07,
	"loss": 0.0655,
	"reward": -0.48265285044908524,
	"reward_std": 0.33601198345422745,
	"rewards/cosine_scaled_reward": -0.24132642522454262,
	"rewards/format_reward": 0.0,
	"step": 137
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.552,
	"grad_norm": 3.4489877223968506,
	"kl": 1.39453125,
	"learning_rate": 9.177152042508077e-07,
	"loss": 0.0558,
	"reward": -0.40766458958387375,
	"reward_std": 0.34357643127441406,
	"rewards/cosine_scaled_reward": -0.20383229106664658,
	"rewards/format_reward": 0.0,
	"step": 138
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.556,
	"grad_norm": 2.18890118598938,
	"kl": 1.30859375,
	"learning_rate": 9.158953424711624e-07,
	"loss": 0.0523,
	"reward": -0.4143947809934616,
	"reward_std": 0.323918879032135,
	"rewards/cosine_scaled_reward": -0.2071974016726017,
	"rewards/format_reward": 0.0,
	"step": 139
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.56,
	"grad_norm": 2.5627028942108154,
	"kl": 1.34423828125,
	"learning_rate": 9.140576474687263e-07,
	"loss": 0.0538,
	"reward": -0.4485241174697876,
	"reward_std": 0.3278198316693306,
	"rewards/cosine_scaled_reward": -0.2242620587348938,
	"rewards/format_reward": 0.0,
	"step": 140
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.564,
	"grad_norm": 2.086371660232544,
	"kl": 1.2802734375,
	"learning_rate": 9.122022088101613e-07,
	"loss": 0.0512,
	"reward": -0.32855524495244026,
	"reward_std": 0.33061159402132034,
	"rewards/cosine_scaled_reward": -0.16427762433886528,
	"rewards/format_reward": 0.0,
	"step": 141
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.568,
	"grad_norm": 2.45231556892395,
	"kl": 1.580078125,
	"learning_rate": 9.103291169269299e-07,
	"loss": 0.0632,
	"reward": -0.4703398421406746,
	"reward_std": 0.2972045987844467,
	"rewards/cosine_scaled_reward": -0.2351699210703373,
	"rewards/format_reward": 0.0,
	"step": 142
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.572,
	"grad_norm": 2.864070415496826,
	"kl": 1.984375,
	"learning_rate": 9.084384631108882e-07,
	"loss": 0.0794,
	"reward": -0.41980744898319244,
	"reward_std": 0.34404993802309036,
	"rewards/cosine_scaled_reward": -0.20990372076630592,
	"rewards/format_reward": 0.0,
	"step": 143
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.576,
	"grad_norm": 2.412257194519043,
	"kl": 1.544921875,
	"learning_rate": 9.065303395098358e-07,
	"loss": 0.0618,
	"reward": -0.43455804139375687,
	"reward_std": 0.32647445797920227,
	"rewards/cosine_scaled_reward": -0.21727901697158813,
	"rewards/format_reward": 0.0,
	"step": 144
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.58,
	"grad_norm": 2.952892780303955,
	"kl": 2.0595703125,
	"learning_rate": 9.046048391230247e-07,
	"loss": 0.0824,
	"reward": -0.4728480279445648,
	"reward_std": 0.33887017518281937,
	"rewards/cosine_scaled_reward": -0.2364240102469921,
	"rewards/format_reward": 0.0,
	"step": 145
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.584,
	"grad_norm": 2.3727328777313232,
	"kl": 1.7255859375,
	"learning_rate": 9.026620557966279e-07,
	"loss": 0.0692,
	"reward": -0.42372531443834305,
	"reward_std": 0.3417205289006233,
	"rewards/cosine_scaled_reward": -0.21186266466975212,
	"rewards/format_reward": 0.0,
	"step": 146
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.588,
	"grad_norm": 2.953756809234619,
	"kl": 2.353515625,
	"learning_rate": 9.007020842191634e-07,
	"loss": 0.0943,
	"reward": -0.43578075617551804,
	"reward_std": 0.34062809497117996,
	"rewards/cosine_scaled_reward": -0.21789037808775902,
	"rewards/format_reward": 0.0,
	"step": 147
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.592,
	"grad_norm": 2.5953478813171387,
	"kl": 1.38671875,
	"learning_rate": 8.987250199168808e-07,
	"loss": 0.0555,
	"reward": -0.4190576896071434,
	"reward_std": 0.34895560145378113,
	"rewards/cosine_scaled_reward": -0.2095288448035717,
	"rewards/format_reward": 0.0,
	"step": 148
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.596,
	"grad_norm": 2.4279496669769287,
	"kl": 1.62890625,
	"learning_rate": 8.967309592491052e-07,
	"loss": 0.0651,
	"reward": -0.4394699037075043,
	"reward_std": 0.3207908198237419,
	"rewards/cosine_scaled_reward": -0.21973494067788124,
	"rewards/format_reward": 0.0,
	"step": 149
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.6,
	"grad_norm": 2.974292516708374,
	"kl": 1.892578125,
	"learning_rate": 8.9471999940354e-07,
	"loss": 0.0757,
	"reward": -0.4797021597623825,
	"reward_std": 0.32065775990486145,
	"rewards/cosine_scaled_reward": -0.23985107988119125,
	"rewards/format_reward": 0.0,
	"step": 150
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.604,
	"grad_norm": 2.51299786567688,
	"kl": 0.87890625,
	"learning_rate": 8.926922383915315e-07,
	"loss": 0.0351,
	"reward": -0.4108778163790703,
	"reward_std": 0.326105996966362,
	"rewards/cosine_scaled_reward": -0.20543890818953514,
	"rewards/format_reward": 0.0,
	"step": 151
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.608,
	"grad_norm": 2.723388195037842,
	"kl": 1.2294921875,
	"learning_rate": 8.906477750432903e-07,
	"loss": 0.0492,
	"reward": -0.4178111329674721,
	"reward_std": 0.32895463705062866,
	"rewards/cosine_scaled_reward": -0.20890555530786514,
	"rewards/format_reward": 0.0,
	"step": 152
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.612,
	"grad_norm": 2.4097025394439697,
	"kl": 1.650390625,
	"learning_rate": 8.88586709003076e-07,
	"loss": 0.0659,
	"reward": -0.4825671687722206,
	"reward_std": 0.33990373462438583,
	"rewards/cosine_scaled_reward": -0.2412835843861103,
	"rewards/format_reward": 0.0,
	"step": 153
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.616,
	"grad_norm": 2.114370107650757,
	"kl": 1.390625,
	"learning_rate": 8.865091407243394e-07,
	"loss": 0.0556,
	"reward": -0.42671380192041397,
	"reward_std": 0.32950445264577866,
	"rewards/cosine_scaled_reward": -0.21335690841078758,
	"rewards/format_reward": 0.0,
	"step": 154
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.62,
	"grad_norm": 3.1770823001861572,
	"kl": 1.4287109375,
	"learning_rate": 8.844151714648274e-07,
	"loss": 0.0572,
	"reward": -0.4250905141234398,
	"reward_std": 0.3110942989587784,
	"rewards/cosine_scaled_reward": -0.2125452570617199,
	"rewards/format_reward": 0.0,
	"step": 155
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.624,
	"grad_norm": 2.6063926219940186,
	"kl": 1.796875,
	"learning_rate": 8.823049032816478e-07,
	"loss": 0.0719,
	"reward": -0.4206129387021065,
	"reward_std": 0.33140094578266144,
	"rewards/cosine_scaled_reward": -0.21030646935105324,
	"rewards/format_reward": 0.0,
	"step": 156
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.628,
	"grad_norm": 2.482637643814087,
	"kl": 1.525390625,
	"learning_rate": 8.801784390262943e-07,
	"loss": 0.061,
	"reward": -0.36781868524849415,
	"reward_std": 0.3281563073396683,
	"rewards/cosine_scaled_reward": -0.18390934821218252,
	"rewards/format_reward": 0.0,
	"step": 157
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.632,
	"grad_norm": 2.7100956439971924,
	"kl": 1.7861328125,
	"learning_rate": 8.780358823396352e-07,
	"loss": 0.0715,
	"reward": -0.3854188397526741,
	"reward_std": 0.31897617131471634,
	"rewards/cosine_scaled_reward": -0.19270941987633705,
	"rewards/format_reward": 0.0,
	"step": 158
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.636,
	"grad_norm": 2.3493990898132324,
	"kl": 1.859375,
	"learning_rate": 8.758773376468604e-07,
	"loss": 0.0746,
	"reward": -0.41636481136083603,
	"reward_std": 0.3308830112218857,
	"rewards/cosine_scaled_reward": -0.20818240568041801,
	"rewards/format_reward": 0.0,
	"step": 159
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.64,
	"grad_norm": 2.429762840270996,
	"kl": 1.78125,
	"learning_rate": 8.737029101523929e-07,
	"loss": 0.0714,
	"reward": -0.44961177557706833,
	"reward_std": 0.3425107002258301,
	"rewards/cosine_scaled_reward": -0.22480589523911476,
	"rewards/format_reward": 0.0,
	"step": 160
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.644,
	"grad_norm": 2.6372933387756348,
	"kl": 1.6474609375,
	"learning_rate": 8.715127058347614e-07,
	"loss": 0.066,
	"reward": -0.4204000309109688,
	"reward_std": 0.3256704956293106,
	"rewards/cosine_scaled_reward": -0.2102000191807747,
	"rewards/format_reward": 0.0,
	"step": 161
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.648,
	"grad_norm": 2.2505483627319336,
	"kl": 1.576171875,
	"learning_rate": 8.693068314414344e-07,
	"loss": 0.063,
	"reward": -0.4363863915205002,
	"reward_std": 0.3367513567209244,
	"rewards/cosine_scaled_reward": -0.2181931994855404,
	"rewards/format_reward": 0.0,
	"step": 162
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.652,
	"grad_norm": 2.781273603439331,
	"kl": 1.4375,
	"learning_rate": 8.670853944836176e-07,
	"loss": 0.0576,
	"reward": -0.44805190712213516,
	"reward_std": 0.3117773234844208,
	"rewards/cosine_scaled_reward": -0.22402595356106758,
	"rewards/format_reward": 0.0,
	"step": 163
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.656,
	"grad_norm": 2.573030710220337,
	"kl": 1.21435546875,
	"learning_rate": 8.648485032310144e-07,
	"loss": 0.0487,
	"reward": -0.40324684232473373,
	"reward_std": 0.3176472932100296,
	"rewards/cosine_scaled_reward": -0.20162343233823776,
	"rewards/format_reward": 0.0,
	"step": 164
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.66,
	"grad_norm": 4.171741485595703,
	"kl": 2.3125,
	"learning_rate": 8.625962667065487e-07,
	"loss": 0.0925,
	"reward": -0.4968671426177025,
	"reward_std": 0.3204089626669884,
	"rewards/cosine_scaled_reward": -0.24843357503414154,
	"rewards/format_reward": 0.0,
	"step": 165
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1528.702392578125,
	"epoch": 0.664,
	"grad_norm": 2.1756961345672607,
	"kl": 1.7578125,
	"learning_rate": 8.603287946810513e-07,
	"loss": 0.0706,
	"reward": -0.4272613450884819,
	"reward_std": 0.32390115410089493,
	"rewards/cosine_scaled_reward": -0.21363067999482155,
	"rewards/format_reward": 0.0,
	"step": 166
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.668,
	"grad_norm": 2.2742207050323486,
	"kl": 1.912109375,
	"learning_rate": 8.580461976679099e-07,
	"loss": 0.0763,
	"reward": -0.3418873958289623,
	"reward_std": 0.29924022778868675,
	"rewards/cosine_scaled_reward": -0.17094369884580374,
	"rewards/format_reward": 0.0,
	"step": 167
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.672,
	"grad_norm": 2.1837146282196045,
	"kl": 1.3330078125,
	"learning_rate": 8.557485869176825e-07,
	"loss": 0.0533,
	"reward": -0.4050525277853012,
	"reward_std": 0.3251590058207512,
	"rewards/cosine_scaled_reward": -0.2025262601673603,
	"rewards/format_reward": 0.0,
	"step": 168
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.676,
	"grad_norm": 2.1009020805358887,
	"kl": 1.9326171875,
	"learning_rate": 8.534360744126753e-07,
	"loss": 0.0774,
	"reward": -0.4387947544455528,
	"reward_std": 0.3307826817035675,
	"rewards/cosine_scaled_reward": -0.21939736977219582,
	"rewards/format_reward": 0.0,
	"step": 169
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.68,
	"grad_norm": 2.515617609024048,
	"kl": 1.884765625,
	"learning_rate": 8.511087728614862e-07,
	"loss": 0.0754,
	"reward": -0.41566915810108185,
	"reward_std": 0.34893494844436646,
	"rewards/cosine_scaled_reward": -0.20783457532525063,
	"rewards/format_reward": 0.0,
	"step": 170
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.684,
	"grad_norm": 2.3045356273651123,
	"kl": 1.5078125,
	"learning_rate": 8.487667956935087e-07,
	"loss": 0.0604,
	"reward": -0.3871946483850479,
	"reward_std": 0.3363000229001045,
	"rewards/cosine_scaled_reward": -0.19359732419252396,
	"rewards/format_reward": 0.0,
	"step": 171
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.688,
	"grad_norm": 2.1517364978790283,
	"kl": 1.4169921875,
	"learning_rate": 8.464102570534061e-07,
	"loss": 0.0567,
	"reward": -0.41495678573846817,
	"reward_std": 0.33959241211414337,
	"rewards/cosine_scaled_reward": -0.20747840031981468,
	"rewards/format_reward": 0.0,
	"step": 172
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.692,
	"grad_norm": 2.4767415523529053,
	"kl": 1.5654296875,
	"learning_rate": 8.440392717955475e-07,
	"loss": 0.0626,
	"reward": -0.3259017579257488,
	"reward_std": 0.3448467329144478,
	"rewards/cosine_scaled_reward": -0.16295087756589055,
	"rewards/format_reward": 0.0,
	"step": 173
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.696,
	"grad_norm": 2.1803934574127197,
	"kl": 1.5986328125,
	"learning_rate": 8.416539554784089e-07,
	"loss": 0.0639,
	"reward": -0.45371130108833313,
	"reward_std": 0.3770594820380211,
	"rewards/cosine_scaled_reward": -0.22685565054416656,
	"rewards/format_reward": 0.0,
	"step": 174
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.7,
	"grad_norm": 2.146838426589966,
	"kl": 1.3212890625,
	"learning_rate": 8.392544243589427e-07,
	"loss": 0.053,
	"reward": -0.39382801204919815,
	"reward_std": 0.3155653551220894,
	"rewards/cosine_scaled_reward": -0.19691400602459908,
	"rewards/format_reward": 0.0,
	"step": 175
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.704,
	"grad_norm": 2.3939132690429688,
	"kl": 1.498046875,
	"learning_rate": 8.368407953869103e-07,
	"loss": 0.06,
	"reward": -0.397233285009861,
	"reward_std": 0.3429732918739319,
	"rewards/cosine_scaled_reward": -0.1986166313290596,
	"rewards/format_reward": 0.0,
	"step": 176
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.708,
	"grad_norm": 2.2279624938964844,
	"kl": 1.3759765625,
	"learning_rate": 8.344131861991828e-07,
	"loss": 0.0551,
	"reward": -0.41151023656129837,
	"reward_std": 0.3277590796351433,
	"rewards/cosine_scaled_reward": -0.2057551108300686,
	"rewards/format_reward": 0.0,
	"step": 177
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.712,
	"grad_norm": 2.5055384635925293,
	"kl": 1.341796875,
	"learning_rate": 8.319717151140072e-07,
	"loss": 0.0537,
	"reward": -0.4148360714316368,
	"reward_std": 0.3054031655192375,
	"rewards/cosine_scaled_reward": -0.2074180319905281,
	"rewards/format_reward": 0.0,
	"step": 178
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.716,
	"grad_norm": 2.605672836303711,
	"kl": 2.421875,
	"learning_rate": 8.295165011252396e-07,
	"loss": 0.0969,
	"reward": -0.49764253944158554,
	"reward_std": 0.34468474239110947,
	"rewards/cosine_scaled_reward": -0.24882125481963158,
	"rewards/format_reward": 0.0,
	"step": 179
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.72,
	"grad_norm": 1.8612443208694458,
	"kl": 1.958984375,
	"learning_rate": 8.270476638965461e-07,
	"loss": 0.0784,
	"reward": -0.41104499250650406,
	"reward_std": 0.32857123762369156,
	"rewards/cosine_scaled_reward": -0.20552249625325203,
	"rewards/format_reward": 0.0,
	"step": 180
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.724,
	"grad_norm": 2.20760178565979,
	"kl": 1.4267578125,
	"learning_rate": 8.245653237555705e-07,
	"loss": 0.0571,
	"reward": -0.4070161208510399,
	"reward_std": 0.29896606504917145,
	"rewards/cosine_scaled_reward": -0.20350806042551994,
	"rewards/format_reward": 0.0,
	"step": 181
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.728,
	"grad_norm": 2.527832269668579,
	"kl": 1.3251953125,
	"learning_rate": 8.220696016880687e-07,
	"loss": 0.053,
	"reward": -0.40310006588697433,
	"reward_std": 0.33485615253448486,
	"rewards/cosine_scaled_reward": -0.20155002549290657,
	"rewards/format_reward": 0.0,
	"step": 182
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.732,
	"grad_norm": 2.0901362895965576,
	"kl": 1.25,
	"learning_rate": 8.195606193320136e-07,
	"loss": 0.0499,
	"reward": -0.39147457480430603,
	"reward_std": 0.3105906918644905,
	"rewards/cosine_scaled_reward": -0.19573728740215302,
	"rewards/format_reward": 0.0,
	"step": 183
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.736,
	"grad_norm": 2.0712454319000244,
	"kl": 1.3271484375,
	"learning_rate": 8.170384989716657e-07,
	"loss": 0.053,
	"reward": -0.36338385939598083,
	"reward_std": 0.29373297840356827,
	"rewards/cosine_scaled_reward": -0.18169192969799042,
	"rewards/format_reward": 0.0,
	"step": 184
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.74,
	"grad_norm": 4.567477226257324,
	"kl": 2.91015625,
	"learning_rate": 8.145033635316128e-07,
	"loss": 0.1167,
	"reward": -0.46033478528261185,
	"reward_std": 0.309500552713871,
	"rewards/cosine_scaled_reward": -0.23016740009188652,
	"rewards/format_reward": 0.0,
	"step": 185
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.744,
	"grad_norm": 2.8025710582733154,
	"kl": 1.982421875,
	"learning_rate": 8.119553365707802e-07,
	"loss": 0.0793,
	"reward": -0.3399934060871601,
	"reward_std": 0.3289627507328987,
	"rewards/cosine_scaled_reward": -0.16999670304358006,
	"rewards/format_reward": 0.0,
	"step": 186
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.748,
	"grad_norm": 2.41241192817688,
	"kl": 1.6513671875,
	"learning_rate": 8.093945422764069e-07,
	"loss": 0.0663,
	"reward": -0.4002522900700569,
	"reward_std": 0.3234091103076935,
	"rewards/cosine_scaled_reward": -0.20012613758444786,
	"rewards/format_reward": 0.0,
	"step": 187
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.752,
	"grad_norm": 3.6371164321899414,
	"kl": 2.470703125,
	"learning_rate": 8.068211054579943e-07,
	"loss": 0.0988,
	"reward": -0.44175921380519867,
	"reward_std": 0.33701298385858536,
	"rewards/cosine_scaled_reward": -0.22087960690259933,
	"rewards/format_reward": 0.0,
	"step": 188
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.756,
	"grad_norm": 2.704362154006958,
	"kl": 1.71875,
	"learning_rate": 8.04235151541222e-07,
	"loss": 0.0686,
	"reward": -0.3934633806347847,
	"reward_std": 0.31845808029174805,
	"rewards/cosine_scaled_reward": -0.19673169776797295,
	"rewards/format_reward": 0.0,
	"step": 189
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.76,
	"grad_norm": 2.5518999099731445,
	"kl": 1.865234375,
	"learning_rate": 8.01636806561836e-07,
	"loss": 0.0746,
	"reward": -0.48456476628780365,
	"reward_std": 0.3398968055844307,
	"rewards/cosine_scaled_reward": -0.24228239431977272,
	"rewards/format_reward": 0.0,
	"step": 190
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.764,
	"grad_norm": 4.733001232147217,
	"kl": 2.0537109375,
	"learning_rate": 7.990261971595048e-07,
	"loss": 0.0822,
	"reward": -0.44671063870191574,
	"reward_std": 0.32652025669813156,
	"rewards/cosine_scaled_reward": -0.22335530444979668,
	"rewards/format_reward": 0.0,
	"step": 191
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.768,
	"grad_norm": 2.217525005340576,
	"kl": 1.72265625,
	"learning_rate": 7.964034505716476e-07,
	"loss": 0.0689,
	"reward": -0.38292936980724335,
	"reward_std": 0.3729139119386673,
	"rewards/cosine_scaled_reward": -0.19146469235420227,
	"rewards/format_reward": 0.0,
	"step": 192
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.772,
	"grad_norm": 2.3045313358306885,
	"kl": 1.0576171875,
	"learning_rate": 7.93768694627233e-07,
	"loss": 0.0423,
	"reward": -0.36335285753011703,
	"reward_std": 0.3274284452199936,
	"rewards/cosine_scaled_reward": -0.18167642876505852,
	"rewards/format_reward": 0.0,
	"step": 193
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.776,
	"grad_norm": 2.220212936401367,
	"kl": 1.974609375,
	"learning_rate": 7.911220577405484e-07,
	"loss": 0.0791,
	"reward": -0.41132358461618423,
	"reward_std": 0.33213579654693604,
	"rewards/cosine_scaled_reward": -0.20566179975867271,
	"rewards/format_reward": 0.0,
	"step": 194
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.78,
	"grad_norm": 2.872774124145508,
	"kl": 2.04296875,
	"learning_rate": 7.884636689049422e-07,
	"loss": 0.0819,
	"reward": -0.41410720348358154,
	"reward_std": 0.3132774606347084,
	"rewards/cosine_scaled_reward": -0.20705359801650047,
	"rewards/format_reward": 0.0,
	"step": 195
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.784,
	"grad_norm": 3.354735851287842,
	"kl": 1.2236328125,
	"learning_rate": 7.857936576865356e-07,
	"loss": 0.0489,
	"reward": -0.34651997685432434,
	"reward_std": 0.27611755579710007,
	"rewards/cosine_scaled_reward": -0.17325998842716217,
	"rewards/format_reward": 0.0,
	"step": 196
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.788,
	"grad_norm": 2.019547939300537,
	"kl": 1.03515625,
	"learning_rate": 7.831121542179086e-07,
	"loss": 0.0414,
	"reward": -0.36961859464645386,
	"reward_std": 0.3042915388941765,
	"rewards/cosine_scaled_reward": -0.18480929359793663,
	"rewards/format_reward": 0.0,
	"step": 197
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.792,
	"grad_norm": 2.245211601257324,
	"kl": 1.408203125,
	"learning_rate": 7.804192891917571e-07,
	"loss": 0.0564,
	"reward": -0.3812807723879814,
	"reward_std": 0.30970512330532074,
	"rewards/cosine_scaled_reward": -0.190640389919281,
	"rewards/format_reward": 0.0,
	"step": 198
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.796,
	"grad_norm": 2.0456931591033936,
	"kl": 1.673828125,
	"learning_rate": 7.777151938545235e-07,
	"loss": 0.067,
	"reward": -0.38433101773262024,
	"reward_std": 0.3408072590827942,
	"rewards/cosine_scaled_reward": -0.19216550886631012,
	"rewards/format_reward": 0.0,
	"step": 199
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.8,
	"grad_norm": 6.253657817840576,
	"kl": 1.48876953125,
	"learning_rate": 7.75e-07,
	"loss": 0.0595,
	"reward": -0.3863793611526489,
	"reward_std": 0.3155966252088547,
	"rewards/cosine_scaled_reward": -0.19318969175219536,
	"rewards/format_reward": 0.0,
	"step": 200
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.804,
	"grad_norm": 2.2331368923187256,
	"kl": 1.96484375,
	"learning_rate": 7.72273839962904e-07,
	"loss": 0.0786,
	"reward": -0.41171175986528397,
	"reward_std": 0.34651194512844086,
	"rewards/cosine_scaled_reward": -0.20585588365793228,
	"rewards/format_reward": 0.0,
	"step": 201
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.808,
	"grad_norm": 2.1702663898468018,
	"kl": 1.296875,
	"learning_rate": 7.695368466124296e-07,
	"loss": 0.0519,
	"reward": -0.38244833052158356,
	"reward_std": 0.34267907589673996,
	"rewards/cosine_scaled_reward": -0.19122417271137238,
	"rewards/format_reward": 0.0,
	"step": 202
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.812,
	"grad_norm": 2.0549793243408203,
	"kl": 1.4345703125,
	"learning_rate": 7.667891533457718e-07,
	"loss": 0.0573,
	"reward": -0.4125688225030899,
	"reward_std": 0.33167801052331924,
	"rewards/cosine_scaled_reward": -0.20628441870212555,
	"rewards/format_reward": 0.0,
	"step": 203
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.816,
	"grad_norm": 2.7793009281158447,
	"kl": 1.958984375,
	"learning_rate": 7.640308940816239e-07,
	"loss": 0.0784,
	"reward": -0.45417842268943787,
	"reward_std": 0.3453121930360794,
	"rewards/cosine_scaled_reward": -0.22708921134471893,
	"rewards/format_reward": 0.0,
	"step": 204
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.82,
	"grad_norm": 8.324098587036133,
	"kl": 2.23388671875,
	"learning_rate": 7.612622032536507e-07,
	"loss": 0.0895,
	"reward": -0.3973395526409149,
	"reward_std": 0.32590440660715103,
	"rewards/cosine_scaled_reward": -0.19866977632045746,
	"rewards/format_reward": 0.0,
	"step": 205
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.824,
	"grad_norm": 2.22940993309021,
	"kl": 1.51171875,
	"learning_rate": 7.584832158039378e-07,
	"loss": 0.0605,
	"reward": -0.4044779762625694,
	"reward_std": 0.33285098522901535,
	"rewards/cosine_scaled_reward": -0.2022389993071556,
	"rewards/format_reward": 0.0,
	"step": 206
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.828,
	"grad_norm": 2.824735164642334,
	"kl": 1.310546875,
	"learning_rate": 7.556940671764124e-07,
	"loss": 0.0524,
	"reward": -0.4486440494656563,
	"reward_std": 0.33797865360975266,
	"rewards/cosine_scaled_reward": -0.22432202845811844,
	"rewards/format_reward": 0.0,
	"step": 207
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.832,
	"grad_norm": 2.2558631896972656,
	"kl": 1.1962890625,
	"learning_rate": 7.528948933102438e-07,
	"loss": 0.0478,
	"reward": -0.40251782536506653,
	"reward_std": 0.30128662288188934,
	"rewards/cosine_scaled_reward": -0.20125891268253326,
	"rewards/format_reward": 0.0,
	"step": 208
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.836,
	"grad_norm": 2.7602171897888184,
	"kl": 0.9951171875,
	"learning_rate": 7.500858306332172e-07,
	"loss": 0.0398,
	"reward": -0.31514767929911613,
	"reward_std": 0.3020384646952152,
	"rewards/cosine_scaled_reward": -0.15757383964955807,
	"rewards/format_reward": 0.0,
	"step": 209
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.84,
	"grad_norm": 2.6217448711395264,
	"kl": 1.71484375,
	"learning_rate": 7.472670160550848e-07,
	"loss": 0.0684,
	"reward": -0.3670196682214737,
	"reward_std": 0.31881674379110336,
	"rewards/cosine_scaled_reward": -0.18350983038544655,
	"rewards/format_reward": 0.0,
	"step": 210
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.844,
	"grad_norm": 2.0915112495422363,
	"kl": 1.2841796875,
	"learning_rate": 7.444385869608921e-07,
	"loss": 0.0514,
	"reward": -0.4177168160676956,
	"reward_std": 0.3398260995745659,
	"rewards/cosine_scaled_reward": -0.2088584043085575,
	"rewards/format_reward": 0.0,
	"step": 211
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.848,
	"grad_norm": 1.7296172380447388,
	"kl": 1.2724609375,
	"learning_rate": 7.416006812042827e-07,
	"loss": 0.051,
	"reward": -0.41255099326372147,
	"reward_std": 0.33872970938682556,
	"rewards/cosine_scaled_reward": -0.20627548918128014,
	"rewards/format_reward": 0.0,
	"step": 212
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.852,
	"grad_norm": 2.1323206424713135,
	"kl": 1.16162109375,
	"learning_rate": 7.387534371007797e-07,
	"loss": 0.0466,
	"reward": -0.2759926188737154,
	"reward_std": 0.30077088996768,
	"rewards/cosine_scaled_reward": -0.1379963019862771,
	"rewards/format_reward": 0.0,
	"step": 213
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.856,
	"grad_norm": 2.3771109580993652,
	"kl": 1.556640625,
	"learning_rate": 7.358969934210438e-07,
	"loss": 0.0622,
	"reward": -0.3614875078201294,
	"reward_std": 0.32025381922721863,
	"rewards/cosine_scaled_reward": -0.1807437539100647,
	"rewards/format_reward": 0.0,
	"step": 214
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.86,
	"grad_norm": 2.940969467163086,
	"kl": 1.8828125,
	"learning_rate": 7.330314893841101e-07,
	"loss": 0.0754,
	"reward": -0.29097072361037135,
	"reward_std": 0.28063248097896576,
	"rewards/cosine_scaled_reward": -0.14548537082737312,
	"rewards/format_reward": 0.0,
	"step": 215
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.864,
	"grad_norm": 1.9293019771575928,
	"kl": 1.62890625,
	"learning_rate": 7.301570646506027e-07,
	"loss": 0.0652,
	"reward": -0.4154031127691269,
	"reward_std": 0.34460632503032684,
	"rewards/cosine_scaled_reward": -0.20770153775811195,
	"rewards/format_reward": 0.0,
	"step": 216
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.868,
	"grad_norm": 2.745267391204834,
	"kl": 2.0888671875,
	"learning_rate": 7.27273859315928e-07,
	"loss": 0.0835,
	"reward": -0.4031589925289154,
	"reward_std": 0.31946661323308945,
	"rewards/cosine_scaled_reward": -0.2015794888138771,
	"rewards/format_reward": 0.0,
	"step": 217
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.872,
	"grad_norm": 2.873622179031372,
	"kl": 1.5078125,
	"learning_rate": 7.243820139034464e-07,
	"loss": 0.0604,
	"reward": -0.4128880575299263,
	"reward_std": 0.3311196342110634,
	"rewards/cosine_scaled_reward": -0.20644402503967285,
	"rewards/format_reward": 0.0,
	"step": 218
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.876,
	"grad_norm": 2.7079639434814453,
	"kl": 1.2607421875,
	"learning_rate": 7.214816693576234e-07,
	"loss": 0.0505,
	"reward": -0.3099018558859825,
	"reward_std": 0.2861209958791733,
	"rewards/cosine_scaled_reward": -0.15495092794299126,
	"rewards/format_reward": 0.0,
	"step": 219
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.88,
	"grad_norm": 1.9640864133834839,
	"kl": 1.234375,
	"learning_rate": 7.185729670371604e-07,
	"loss": 0.0493,
	"reward": -0.40535254031419754,
	"reward_std": 0.2874290943145752,
	"rewards/cosine_scaled_reward": -0.20267625898122787,
	"rewards/format_reward": 0.0,
	"step": 220
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.884,
	"grad_norm": 2.130681037902832,
	"kl": 1.486328125,
	"learning_rate": 7.156560487081051e-07,
	"loss": 0.0595,
	"reward": -0.3594564124941826,
	"reward_std": 0.3218042775988579,
	"rewards/cosine_scaled_reward": -0.1797281987965107,
	"rewards/format_reward": 0.0,
	"step": 221
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.888,
	"grad_norm": 2.1852834224700928,
	"kl": 1.48046875,
	"learning_rate": 7.127310565369415e-07,
	"loss": 0.0591,
	"reward": -0.331524558365345,
	"reward_std": 0.28531621396541595,
	"rewards/cosine_scaled_reward": -0.1657622903585434,
	"rewards/format_reward": 0.0,
	"step": 222
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.892,
	"grad_norm": 2.3731930255889893,
	"kl": 1.734375,
	"learning_rate": 7.097981330836616e-07,
	"loss": 0.0693,
	"reward": -0.38006093353033066,
	"reward_std": 0.3292882591485977,
	"rewards/cosine_scaled_reward": -0.19003047049045563,
	"rewards/format_reward": 0.0,
	"step": 223
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.896,
	"grad_norm": 2.3246822357177734,
	"kl": 1.0390625,
	"learning_rate": 7.068574212948169e-07,
	"loss": 0.0416,
	"reward": -0.3990800455212593,
	"reward_std": 0.3413678854703903,
	"rewards/cosine_scaled_reward": -0.19954002648591995,
	"rewards/format_reward": 0.0,
	"step": 224
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.9,
	"grad_norm": 2.4476959705352783,
	"kl": 1.45703125,
	"learning_rate": 7.039090644965509e-07,
	"loss": 0.0583,
	"reward": -0.39841071516275406,
	"reward_std": 0.31324755400419235,
	"rewards/cosine_scaled_reward": -0.19920538365840912,
	"rewards/format_reward": 0.0,
	"step": 225
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.904,
	"grad_norm": 3.0681633949279785,
	"kl": 1.75,
	"learning_rate": 7.009532063876148e-07,
	"loss": 0.0701,
	"reward": -0.35963694006204605,
	"reward_std": 0.3227182477712631,
	"rewards/cosine_scaled_reward": -0.17981846630573273,
	"rewards/format_reward": 0.0,
	"step": 226
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.908,
	"grad_norm": 3.8354952335357666,
	"kl": 1.5087890625,
	"learning_rate": 6.979899910323624e-07,
	"loss": 0.0604,
	"reward": -0.3886452168226242,
	"reward_std": 0.31125637143850327,
	"rewards/cosine_scaled_reward": -0.1943226121366024,
	"rewards/format_reward": 0.0,
	"step": 227
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.912,
	"grad_norm": 2.3208184242248535,
	"kl": 1.39453125,
	"learning_rate": 6.950195628537299e-07,
	"loss": 0.0558,
	"reward": -0.34270477294921875,
	"reward_std": 0.3698492497205734,
	"rewards/cosine_scaled_reward": -0.17135238647460938,
	"rewards/format_reward": 0.0,
	"step": 228
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.916,
	"grad_norm": 2.174126386642456,
	"kl": 2.009765625,
	"learning_rate": 6.920420666261961e-07,
	"loss": 0.0804,
	"reward": -0.37576939910650253,
	"reward_std": 0.3269713968038559,
	"rewards/cosine_scaled_reward": -0.18788469955325127,
	"rewards/format_reward": 0.0,
	"step": 229
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.92,
	"grad_norm": 2.081784725189209,
	"kl": 2.1728515625,
	"learning_rate": 6.890576474687263e-07,
	"loss": 0.0869,
	"reward": -0.3998561128973961,
	"reward_std": 0.32443511486053467,
	"rewards/cosine_scaled_reward": -0.19992805272340775,
	"rewards/format_reward": 0.0,
	"step": 230
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.924,
	"grad_norm": 2.3403866291046143,
	"kl": 1.17529296875,
	"learning_rate": 6.860664508377001e-07,
	"loss": 0.0469,
	"reward": -0.38807813823223114,
	"reward_std": 0.32711831480264664,
	"rewards/cosine_scaled_reward": -0.19403906539082527,
	"rewards/format_reward": 0.0,
	"step": 231
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.928,
	"grad_norm": 2.029927968978882,
	"kl": 1.32666015625,
	"learning_rate": 6.83068622519821e-07,
	"loss": 0.0531,
	"reward": -0.38948777318000793,
	"reward_std": 0.3195284381508827,
	"rewards/cosine_scaled_reward": -0.19474387168884277,
	"rewards/format_reward": 0.0,
	"step": 232
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.932,
	"grad_norm": 2.9124484062194824,
	"kl": 1.71484375,
	"learning_rate": 6.800643086250121e-07,
	"loss": 0.0685,
	"reward": -0.3806769847869873,
	"reward_std": 0.2985011041164398,
	"rewards/cosine_scaled_reward": -0.19033849611878395,
	"rewards/format_reward": 0.0,
	"step": 233
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.936,
	"grad_norm": 2.464742422103882,
	"kl": 1.2998046875,
	"learning_rate": 6.770536555792944e-07,
	"loss": 0.052,
	"reward": -0.3443439155817032,
	"reward_std": 0.29415207356214523,
	"rewards/cosine_scaled_reward": -0.1721719540655613,
	"rewards/format_reward": 0.0,
	"step": 234
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.94,
	"grad_norm": 2.1291651725769043,
	"kl": 1.001953125,
	"learning_rate": 6.740368101176495e-07,
	"loss": 0.0401,
	"reward": -0.33735504001379013,
	"reward_std": 0.28946489840745926,
	"rewards/cosine_scaled_reward": -0.16867752373218536,
	"rewards/format_reward": 0.0,
	"step": 235
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.944,
	"grad_norm": 2.9513416290283203,
	"kl": 1.6201171875,
	"learning_rate": 6.710139192768694e-07,
	"loss": 0.0649,
	"reward": -0.40289320796728134,
	"reward_std": 0.30230626463890076,
	"rewards/cosine_scaled_reward": -0.20144660398364067,
	"rewards/format_reward": 0.0,
	"step": 236
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.948,
	"grad_norm": 3.7395241260528564,
	"kl": 1.6240234375,
	"learning_rate": 6.679851303883891e-07,
	"loss": 0.065,
	"reward": -0.3659610077738762,
	"reward_std": 0.32638294249773026,
	"rewards/cosine_scaled_reward": -0.1829805038869381,
	"rewards/format_reward": 0.0,
	"step": 237
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.952,
	"grad_norm": 2.7872421741485596,
	"kl": 1.7919921875,
	"learning_rate": 6.649505910711058e-07,
	"loss": 0.0718,
	"reward": -0.4507276937365532,
	"reward_std": 0.35789574682712555,
	"rewards/cosine_scaled_reward": -0.2253638356924057,
	"rewards/format_reward": 0.0,
	"step": 238
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.956,
	"grad_norm": 2.139983654022217,
	"kl": 1.40234375,
	"learning_rate": 6.619104492241847e-07,
	"loss": 0.056,
	"reward": -0.3731803297996521,
	"reward_std": 0.30503255128860474,
	"rewards/cosine_scaled_reward": -0.18659016117453575,
	"rewards/format_reward": 0.0,
	"step": 239
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.96,
	"grad_norm": 6.420464515686035,
	"kl": 2.787109375,
	"learning_rate": 6.588648530198504e-07,
	"loss": 0.1116,
	"reward": -0.40894675999879837,
	"reward_std": 0.3296940475702286,
	"rewards/cosine_scaled_reward": -0.20447338744997978,
	"rewards/format_reward": 0.0,
	"step": 240
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.964,
	"grad_norm": 2.4638171195983887,
	"kl": 2.1806640625,
	"learning_rate": 6.558139508961654e-07,
	"loss": 0.0874,
	"reward": -0.42437078058719635,
	"reward_std": 0.3512648344039917,
	"rewards/cosine_scaled_reward": -0.21218538656830788,
	"rewards/format_reward": 0.0,
	"step": 241
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.968,
	"grad_norm": 2.8068432807922363,
	"kl": 1.884765625,
	"learning_rate": 6.527578915497951e-07,
	"loss": 0.0754,
	"reward": -0.394868440926075,
	"reward_std": 0.2916436865925789,
	"rewards/cosine_scaled_reward": -0.1974342130124569,
	"rewards/format_reward": 0.0,
	"step": 242
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.972,
	"grad_norm": 2.272479295730591,
	"kl": 1.453125,
	"learning_rate": 6.496968239287603e-07,
	"loss": 0.0581,
	"reward": -0.36773569136857986,
	"reward_std": 0.3104323297739029,
	"rewards/cosine_scaled_reward": -0.18386784568428993,
	"rewards/format_reward": 0.0,
	"step": 243
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.976,
	"grad_norm": 2.86352276802063,
	"kl": 1.8525390625,
	"learning_rate": 6.466308972251785e-07,
	"loss": 0.0742,
	"reward": -0.3895353376865387,
	"reward_std": 0.30376598984003067,
	"rewards/cosine_scaled_reward": -0.19476767256855965,
	"rewards/format_reward": 0.0,
	"step": 244
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.98,
	"grad_norm": 3.2674906253814697,
	"kl": 1.89453125,
	"learning_rate": 6.435602608679916e-07,
	"loss": 0.0758,
	"reward": -0.35536977648735046,
	"reward_std": 0.32461147010326385,
	"rewards/cosine_scaled_reward": -0.17768489941954613,
	"rewards/format_reward": 0.0,
	"step": 245
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.984,
	"grad_norm": 2.3651580810546875,
	"kl": 1.3994140625,
	"learning_rate": 6.404850645156841e-07,
	"loss": 0.0559,
	"reward": -0.2967621465213597,
	"reward_std": 0.29580704867839813,
	"rewards/cosine_scaled_reward": -0.1483810821082443,
	"rewards/format_reward": 0.0,
	"step": 246
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.988,
	"grad_norm": 2.6290199756622314,
	"kl": 1.544921875,
	"learning_rate": 6.374054580489873e-07,
	"loss": 0.0618,
	"reward": -0.3732440918684006,
	"reward_std": 0.28786107152700424,
	"rewards/cosine_scaled_reward": -0.1866220459342003,
	"rewards/format_reward": 0.0,
	"step": 247
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.992,
	"grad_norm": 2.474320650100708,
	"kl": 1.18359375,
	"learning_rate": 6.343215915635761e-07,
	"loss": 0.0473,
	"reward": -0.3813322111964226,
	"reward_std": 0.3196609243750572,
	"rewards/cosine_scaled_reward": -0.1906661055982113,
	"rewards/format_reward": 0.0,
	"step": 248
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 0.996,
	"grad_norm": 2.4096460342407227,
	"kl": 1.185546875,
	"learning_rate": 6.31233615362752e-07,
	"loss": 0.0475,
	"reward": -0.37723246961832047,
	"reward_std": 0.32298891991376877,
	"rewards/cosine_scaled_reward": -0.18861623480916023,
	"rewards/format_reward": 0.0,
	"step": 249
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0001220703125,
	"epoch": 1.0,
	"grad_norm": 2.414369821548462,
	"kl": 1.1552734375,
	"learning_rate": 6.281416799501187e-07,
	"loss": 0.0462,
	"reward": -0.3446759209036827,
	"reward_std": 0.30413854122161865,
	"rewards/cosine_scaled_reward": -0.17233795672655106,
	"rewards/format_reward": 0.0,
	"step": 250
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.004,
	"grad_norm": 2.3181285858154297,
	"kl": 1.4765625,
	"learning_rate": 6.25045936022246e-07,
	"loss": 0.0591,
	"reward": -0.39850035309791565,
	"reward_std": 0.3559228628873825,
	"rewards/cosine_scaled_reward": -0.19925018772482872,
	"rewards/format_reward": 0.0,
	"step": 251
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.008,
	"grad_norm": 2.3214640617370605,
	"kl": 1.59375,
	"learning_rate": 6.219465344613258e-07,
	"loss": 0.0637,
	"reward": -0.3477981239557266,
	"reward_std": 0.3031875118613243,
	"rewards/cosine_scaled_reward": -0.1738990694284439,
	"rewards/format_reward": 0.0,
	"step": 252
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.012,
	"grad_norm": 2.4848833084106445,
	"kl": 1.6416015625,
	"learning_rate": 6.188436263278172e-07,
	"loss": 0.0657,
	"reward": -0.402904212474823,
	"reward_std": 0.32011619955301285,
	"rewards/cosine_scaled_reward": -0.2014521062374115,
	"rewards/format_reward": 0.0,
	"step": 253
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.016,
	"grad_norm": 7.0177903175354,
	"kl": 3.015625,
	"learning_rate": 6.157373628530852e-07,
	"loss": 0.1206,
	"reward": -0.41366545110940933,
	"reward_std": 0.3347878158092499,
	"rewards/cosine_scaled_reward": -0.20683272555470467,
	"rewards/format_reward": 0.0,
	"step": 254
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1533.3928527832031,
	"epoch": 1.02,
	"grad_norm": 2.5155041217803955,
	"kl": 1.818359375,
	"learning_rate": 6.126278954320294e-07,
	"loss": 0.073,
	"reward": -0.41607701033353806,
	"reward_std": 0.33659277111291885,
	"rewards/cosine_scaled_reward": -0.20803850889205933,
	"rewards/format_reward": 0.0,
	"step": 255
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.024,
	"grad_norm": 3.175401449203491,
	"kl": 2.349609375,
	"learning_rate": 6.095153756157051e-07,
	"loss": 0.094,
	"reward": -0.3731570616364479,
	"reward_std": 0.3251727372407913,
	"rewards/cosine_scaled_reward": -0.18657853826880455,
	"rewards/format_reward": 0.0,
	"step": 256
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.028,
	"grad_norm": 2.345123052597046,
	"kl": 2.140625,
	"learning_rate": 6.06399955103937e-07,
	"loss": 0.0857,
	"reward": -0.4059467390179634,
	"reward_std": 0.3182907700538635,
	"rewards/cosine_scaled_reward": -0.2029733695089817,
	"rewards/format_reward": 0.0,
	"step": 257
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.032,
	"grad_norm": 2.636462688446045,
	"kl": 1.705078125,
	"learning_rate": 6.032817857379256e-07,
	"loss": 0.068,
	"reward": -0.343365378677845,
	"reward_std": 0.3163585662841797,
	"rewards/cosine_scaled_reward": -0.1716826893389225,
	"rewards/format_reward": 0.0,
	"step": 258
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.036,
	"grad_norm": 2.297900438308716,
	"kl": 1.51953125,
	"learning_rate": 6.001610194928464e-07,
	"loss": 0.0608,
	"reward": -0.3703172579407692,
	"reward_std": 0.3630036562681198,
	"rewards/cosine_scaled_reward": -0.1851586326956749,
	"rewards/format_reward": 0.0,
	"step": 259
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.04,
	"grad_norm": 2.311648368835449,
	"kl": 1.515625,
	"learning_rate": 5.97037808470444e-07,
	"loss": 0.0605,
	"reward": -0.3789840117096901,
	"reward_std": 0.330322228372097,
	"rewards/cosine_scaled_reward": -0.18949199840426445,
	"rewards/format_reward": 0.0,
	"step": 260
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.044,
	"grad_norm": 2.3599531650543213,
	"kl": 1.78515625,
	"learning_rate": 5.939123048916173e-07,
	"loss": 0.0714,
	"reward": -0.3447503596544266,
	"reward_std": 0.33612456917762756,
	"rewards/cosine_scaled_reward": -0.17237518727779388,
	"rewards/format_reward": 0.0,
	"step": 261
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1527.6190490722656,
	"epoch": 1.048,
	"grad_norm": 2.2337074279785156,
	"kl": 1.890625,
	"learning_rate": 5.907846610890011e-07,
	"loss": 0.0786,
	"reward": -0.39859064668416977,
	"reward_std": 0.32645051926374435,
	"rewards/cosine_scaled_reward": -0.1992953196167946,
	"rewards/format_reward": 0.0,
	"step": 262
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.052,
	"grad_norm": 2.818617582321167,
	"kl": 1.55859375,
	"learning_rate": 5.87655029499542e-07,
	"loss": 0.0624,
	"reward": -0.3537183925509453,
	"reward_std": 0.309035487473011,
	"rewards/cosine_scaled_reward": -0.17685920372605324,
	"rewards/format_reward": 0.0,
	"step": 263
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.056,
	"grad_norm": 2.3533854484558105,
	"kl": 1.3583984375,
	"learning_rate": 5.845235626570683e-07,
	"loss": 0.0543,
	"reward": -0.3672221526503563,
	"reward_std": 0.31650061905384064,
	"rewards/cosine_scaled_reward": -0.18361108005046844,
	"rewards/format_reward": 0.0,
	"step": 264
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.06,
	"grad_norm": 3.936475992202759,
	"kl": 2.265625,
	"learning_rate": 5.813904131848564e-07,
	"loss": 0.0907,
	"reward": -0.36572812497615814,
	"reward_std": 0.2912697494029999,
	"rewards/cosine_scaled_reward": -0.18286405876278877,
	"rewards/format_reward": 0.0,
	"step": 265
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.064,
	"grad_norm": 2.754866600036621,
	"kl": 1.943359375,
	"learning_rate": 5.78255733788191e-07,
	"loss": 0.0777,
	"reward": -0.37356945127248764,
	"reward_std": 0.34380726516246796,
	"rewards/cosine_scaled_reward": -0.18678472936153412,
	"rewards/format_reward": 0.0,
	"step": 266
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.068,
	"grad_norm": 2.374964952468872,
	"kl": 1.4267578125,
	"learning_rate": 5.751196772469237e-07,
	"loss": 0.0571,
	"reward": -0.3651036322116852,
	"reward_std": 0.30468039214611053,
	"rewards/cosine_scaled_reward": -0.1825518161058426,
	"rewards/format_reward": 0.0,
	"step": 267
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1533.0535888671875,
	"epoch": 1.072,
	"grad_norm": 2.618032693862915,
	"kl": 1.6171875,
	"learning_rate": 5.71982396408026e-07,
	"loss": 0.0651,
	"reward": -0.35353927314281464,
	"reward_std": 0.3086354061961174,
	"rewards/cosine_scaled_reward": -0.17676963657140732,
	"rewards/format_reward": 0.0,
	"step": 268
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.076,
	"grad_norm": 2.920133590698242,
	"kl": 1.8515625,
	"learning_rate": 5.688440441781398e-07,
	"loss": 0.074,
	"reward": -0.37572528421878815,
	"reward_std": 0.33292342722415924,
	"rewards/cosine_scaled_reward": -0.18786264210939407,
	"rewards/format_reward": 0.0,
	"step": 269
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.08,
	"grad_norm": 2.581885576248169,
	"kl": 1.830078125,
	"learning_rate": 5.657047735161255e-07,
	"loss": 0.0732,
	"reward": -0.34584221988916397,
	"reward_std": 0.3140456974506378,
	"rewards/cosine_scaled_reward": -0.17292110994458199,
	"rewards/format_reward": 0.0,
	"step": 270
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.084,
	"grad_norm": 8.366601943969727,
	"kl": 2.509765625,
	"learning_rate": 5.625647374256061e-07,
	"loss": 0.1003,
	"reward": -0.37314866855740547,
	"reward_std": 0.2792880907654762,
	"rewards/cosine_scaled_reward": -0.18657432682812214,
	"rewards/format_reward": 0.0,
	"step": 271
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.088,
	"grad_norm": 3.071047067642212,
	"kl": 1.9658203125,
	"learning_rate": 5.594240889475106e-07,
	"loss": 0.0785,
	"reward": -0.39643432199954987,
	"reward_std": 0.31065937131643295,
	"rewards/cosine_scaled_reward": -0.19821715354919434,
	"rewards/format_reward": 0.0,
	"step": 272
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.092,
	"grad_norm": 3.8571436405181885,
	"kl": 1.2626953125,
	"learning_rate": 5.562829811526154e-07,
	"loss": 0.0506,
	"reward": -0.3136083036661148,
	"reward_std": 0.28241100907325745,
	"rewards/cosine_scaled_reward": -0.1568041555583477,
	"rewards/format_reward": 0.0,
	"step": 273
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.096,
	"grad_norm": 2.1380457878112793,
	"kl": 1.96875,
	"learning_rate": 5.531415671340826e-07,
	"loss": 0.0786,
	"reward": -0.35791803896427155,
	"reward_std": 0.3191326707601547,
	"rewards/cosine_scaled_reward": -0.17895901948213577,
	"rewards/format_reward": 0.0,
	"step": 274
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.1,
	"grad_norm": 3.744987964630127,
	"kl": 2.048828125,
	"learning_rate": 5.5e-07,
	"loss": 0.0819,
	"reward": -0.3743599057197571,
	"reward_std": 0.3121279552578926,
	"rewards/cosine_scaled_reward": -0.18717995658516884,
	"rewards/format_reward": 0.0,
	"step": 275
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.104,
	"grad_norm": 2.783698081970215,
	"kl": 1.8984375,
	"learning_rate": 5.468584328659172e-07,
	"loss": 0.0761,
	"reward": -0.3865007609128952,
	"reward_std": 0.322613961994648,
	"rewards/cosine_scaled_reward": -0.1932503841817379,
	"rewards/format_reward": 0.0,
	"step": 276
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.108,
	"grad_norm": 3.2086503505706787,
	"kl": 1.865234375,
	"learning_rate": 5.437170188473847e-07,
	"loss": 0.0746,
	"reward": -0.41129884123802185,
	"reward_std": 0.3018573820590973,
	"rewards/cosine_scaled_reward": -0.20564941689372063,
	"rewards/format_reward": 0.0,
	"step": 277
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.112,
	"grad_norm": 2.4078729152679443,
	"kl": 1.4072265625,
	"learning_rate": 5.405759110524894e-07,
	"loss": 0.0563,
	"reward": -0.39701489359140396,
	"reward_std": 0.3126164525747299,
	"rewards/cosine_scaled_reward": -0.19850744307041168,
	"rewards/format_reward": 0.0,
	"step": 278
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.116,
	"grad_norm": 2.5043461322784424,
	"kl": 2.35546875,
	"learning_rate": 5.37435262574394e-07,
	"loss": 0.0944,
	"reward": -0.28278425987809896,
	"reward_std": 0.2714259997010231,
	"rewards/cosine_scaled_reward": -0.1413921354105696,
	"rewards/format_reward": 0.0,
	"step": 279
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1533.6190490722656,
	"epoch": 1.12,
	"grad_norm": 4.991820335388184,
	"kl": 1.83984375,
	"learning_rate": 5.342952264838747e-07,
	"loss": 0.0713,
	"reward": -0.3403998464345932,
	"reward_std": 0.3223363533616066,
	"rewards/cosine_scaled_reward": -0.1701999232172966,
	"rewards/format_reward": 0.0,
	"step": 280
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1534.0476379394531,
	"epoch": 1.124,
	"grad_norm": 2.818126916885376,
	"kl": 1.37890625,
	"learning_rate": 5.311559558218603e-07,
	"loss": 0.054,
	"reward": -0.3611769676208496,
	"reward_std": 0.3213232010602951,
	"rewards/cosine_scaled_reward": -0.1805884800851345,
	"rewards/format_reward": 0.0,
	"step": 281
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.1280000000000001,
	"grad_norm": 2.7234742641448975,
	"kl": 2.248046875,
	"learning_rate": 5.28017603591974e-07,
	"loss": 0.0899,
	"reward": -0.4201104864478111,
	"reward_std": 0.3131628781557083,
	"rewards/cosine_scaled_reward": -0.21005523577332497,
	"rewards/format_reward": 0.0,
	"step": 282
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.1320000000000001,
	"grad_norm": 6.938405990600586,
	"kl": 1.998046875,
	"learning_rate": 5.248803227530763e-07,
	"loss": 0.0799,
	"reward": -0.33411792665719986,
	"reward_std": 0.32330870628356934,
	"rewards/cosine_scaled_reward": -0.16705895960330963,
	"rewards/format_reward": 0.0,
	"step": 283
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.1360000000000001,
	"grad_norm": 3.5663974285125732,
	"kl": 1.3349609375,
	"learning_rate": 5.21744266211809e-07,
	"loss": 0.0534,
	"reward": -0.3633820191025734,
	"reward_std": 0.31287185102701187,
	"rewards/cosine_scaled_reward": -0.1816909983754158,
	"rewards/format_reward": 0.0,
	"step": 284
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.1400000000000001,
	"grad_norm": 2.0476882457733154,
	"kl": 1.708984375,
	"learning_rate": 5.186095868151436e-07,
	"loss": 0.0684,
	"reward": -0.3689531907439232,
	"reward_std": 0.32297470420598984,
	"rewards/cosine_scaled_reward": -0.184476587921381,
	"rewards/format_reward": 0.0,
	"step": 285
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1526.8869018554688,
	"epoch": 1.144,
	"grad_norm": 12.345512390136719,
	"kl": 2.966796875,
	"learning_rate": 5.154764373429315e-07,
	"loss": 0.1254,
	"reward": -0.3650151863694191,
	"reward_std": 0.31899186968803406,
	"rewards/cosine_scaled_reward": -0.18250760063529015,
	"rewards/format_reward": 0.0,
	"step": 286
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.148,
	"grad_norm": 2.059617519378662,
	"kl": 2.291015625,
	"learning_rate": 5.123449705004581e-07,
	"loss": 0.0916,
	"reward": -0.3706892877817154,
	"reward_std": 0.32747378945350647,
	"rewards/cosine_scaled_reward": -0.1853446513414383,
	"rewards/format_reward": 0.0,
	"step": 287
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.152,
	"grad_norm": 3.889174699783325,
	"kl": 2.0859375,
	"learning_rate": 5.09215338910999e-07,
	"loss": 0.0834,
	"reward": -0.4078289121389389,
	"reward_std": 0.3290611281991005,
	"rewards/cosine_scaled_reward": -0.20391445606946945,
	"rewards/format_reward": 0.0,
	"step": 288
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1533.2440490722656,
	"epoch": 1.156,
	"grad_norm": 2.5038888454437256,
	"kl": 0.93896484375,
	"learning_rate": 5.060876951083828e-07,
	"loss": 0.0354,
	"reward": -0.34110401570796967,
	"reward_std": 0.3122602626681328,
	"rewards/cosine_scaled_reward": -0.17055201157927513,
	"rewards/format_reward": 0.0,
	"step": 289
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.16,
	"grad_norm": 2.39719557762146,
	"kl": 1.8583984375,
	"learning_rate": 5.02962191529556e-07,
	"loss": 0.0744,
	"reward": -0.36911261081695557,
	"reward_std": 0.3288589343428612,
	"rewards/cosine_scaled_reward": -0.1845562942326069,
	"rewards/format_reward": 0.0,
	"step": 290
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.164,
	"grad_norm": 2.758849620819092,
	"kl": 1.626953125,
	"learning_rate": 4.998389805071536e-07,
	"loss": 0.0651,
	"reward": -0.3935117796063423,
	"reward_std": 0.3461349532008171,
	"rewards/cosine_scaled_reward": -0.19675587862730026,
	"rewards/format_reward": 0.0,
	"step": 291
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.168,
	"grad_norm": 2.310575246810913,
	"kl": 1.455078125,
	"learning_rate": 4.967182142620745e-07,
	"loss": 0.0583,
	"reward": -0.34184807538986206,
	"reward_std": 0.3021695464849472,
	"rewards/cosine_scaled_reward": -0.17092403396964073,
	"rewards/format_reward": 0.0,
	"step": 292
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.172,
	"grad_norm": 2.8417394161224365,
	"kl": 1.861328125,
	"learning_rate": 4.93600044896063e-07,
	"loss": 0.0744,
	"reward": -0.3772461339831352,
	"reward_std": 0.3044436201453209,
	"rewards/cosine_scaled_reward": -0.18862305954098701,
	"rewards/format_reward": 0.0,
	"step": 293
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.176,
	"grad_norm": 2.347404956817627,
	"kl": 1.28125,
	"learning_rate": 4.904846243842949e-07,
	"loss": 0.0513,
	"reward": -0.3517310842871666,
	"reward_std": 0.3094722405076027,
	"rewards/cosine_scaled_reward": -0.1758655458688736,
	"rewards/format_reward": 0.0,
	"step": 294
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1535.6130981445312,
	"epoch": 1.18,
	"grad_norm": 2.7739925384521484,
	"kl": 1.833984375,
	"learning_rate": 4.873721045679706e-07,
	"loss": 0.0731,
	"reward": -0.4288819953799248,
	"reward_std": 0.3247087821364403,
	"rewards/cosine_scaled_reward": -0.2144409976899624,
	"rewards/format_reward": 0.0,
	"step": 295
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.184,
	"grad_norm": 2.1470892429351807,
	"kl": 1.296875,
	"learning_rate": 4.842626371469149e-07,
	"loss": 0.0519,
	"reward": -0.35219819098711014,
	"reward_std": 0.3056294918060303,
	"rewards/cosine_scaled_reward": -0.17609910294413567,
	"rewards/format_reward": 0.0,
	"step": 296
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.188,
	"grad_norm": 3.177232503890991,
	"kl": 1.677734375,
	"learning_rate": 4.811563736721829e-07,
	"loss": 0.0671,
	"reward": -0.3717339485883713,
	"reward_std": 0.29695921391248703,
	"rewards/cosine_scaled_reward": -0.18586697429418564,
	"rewards/format_reward": 0.0,
	"step": 297
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.192,
	"grad_norm": 3.3333382606506348,
	"kl": 2.322265625,
	"learning_rate": 4.780534655386743e-07,
	"loss": 0.093,
	"reward": -0.3814833015203476,
	"reward_std": 0.28608307987451553,
	"rewards/cosine_scaled_reward": -0.1907416470348835,
	"rewards/format_reward": 0.0,
	"step": 298
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.196,
	"grad_norm": 2.842420816421509,
	"kl": 1.45703125,
	"learning_rate": 4.749540639777539e-07,
	"loss": 0.0583,
	"reward": -0.3840809538960457,
	"reward_std": 0.31393957883119583,
	"rewards/cosine_scaled_reward": -0.19204047322273254,
	"rewards/format_reward": 0.0,
	"step": 299
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.2,
	"grad_norm": 2.9220309257507324,
	"kl": 1.681640625,
	"learning_rate": 4.7185832004988133e-07,
	"loss": 0.0672,
	"reward": -0.39588408917188644,
	"reward_std": 0.33600132539868355,
	"rewards/cosine_scaled_reward": -0.19794204831123352,
	"rewards/format_reward": 0.0,
	"step": 300
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.204,
	"grad_norm": 3.4091219902038574,
	"kl": 1.44140625,
	"learning_rate": 4.68766384637248e-07,
	"loss": 0.0576,
	"reward": -0.2894315180601552,
	"reward_std": 0.30969203263521194,
	"rewards/cosine_scaled_reward": -0.14471576345385984,
	"rewards/format_reward": 0.0,
	"step": 301
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.208,
	"grad_norm": 2.0488741397857666,
	"kl": 1.5576171875,
	"learning_rate": 4.656784084364238e-07,
	"loss": 0.0624,
	"reward": -0.32318826019763947,
	"reward_std": 0.3031533695757389,
	"rewards/cosine_scaled_reward": -0.16159413009881973,
	"rewards/format_reward": 0.0,
	"step": 302
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.212,
	"grad_norm": 2.6755242347717285,
	"kl": 1.34765625,
	"learning_rate": 4.6259454195101267e-07,
	"loss": 0.0539,
	"reward": -0.37002843618392944,
	"reward_std": 0.31058184802532196,
	"rewards/cosine_scaled_reward": -0.18501422181725502,
	"rewards/format_reward": 0.0,
	"step": 303
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.216,
	"grad_norm": 6.160266399383545,
	"kl": 1.734375,
	"learning_rate": 4.59514935484316e-07,
	"loss": 0.0694,
	"reward": -0.38714154064655304,
	"reward_std": 0.3265160173177719,
	"rewards/cosine_scaled_reward": -0.19357078149914742,
	"rewards/format_reward": 0.0,
	"step": 304
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.22,
	"grad_norm": 2.3529880046844482,
	"kl": 1.2138671875,
	"learning_rate": 4.5643973913200837e-07,
	"loss": 0.0486,
	"reward": -0.3460870534181595,
	"reward_std": 0.3087117671966553,
	"rewards/cosine_scaled_reward": -0.17304353043437004,
	"rewards/format_reward": 0.0,
	"step": 305
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.224,
	"grad_norm": 2.48714280128479,
	"kl": 1.9453125,
	"learning_rate": 4.5336910277482155e-07,
	"loss": 0.0779,
	"reward": -0.3756335750222206,
	"reward_std": 0.32805445045232773,
	"rewards/cosine_scaled_reward": -0.1878167800605297,
	"rewards/format_reward": 0.0,
	"step": 306
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.228,
	"grad_norm": 8.46654987335205,
	"kl": 2.5107421875,
	"learning_rate": 4.503031760712397e-07,
	"loss": 0.1004,
	"reward": -0.385331392288208,
	"reward_std": 0.31344960629940033,
	"rewards/cosine_scaled_reward": -0.1926657035946846,
	"rewards/format_reward": 0.0,
	"step": 307
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.232,
	"grad_norm": 3.198944568634033,
	"kl": 2.140625,
	"learning_rate": 4.4724210845020494e-07,
	"loss": 0.0857,
	"reward": -0.36118319630622864,
	"reward_std": 0.3010380119085312,
	"rewards/cosine_scaled_reward": -0.18059159815311432,
	"rewards/format_reward": 0.0,
	"step": 308
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.236,
	"grad_norm": 2.745668411254883,
	"kl": 2.033203125,
	"learning_rate": 4.441860491038345e-07,
	"loss": 0.0813,
	"reward": -0.3596822917461395,
	"reward_std": 0.3092067465186119,
	"rewards/cosine_scaled_reward": -0.17984114587306976,
	"rewards/format_reward": 0.0,
	"step": 309
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.24,
	"grad_norm": 5.614748954772949,
	"kl": 2.34375,
	"learning_rate": 4.4113514698014953e-07,
	"loss": 0.094,
	"reward": -0.34773094952106476,
	"reward_std": 0.29645886272192,
	"rewards/cosine_scaled_reward": -0.17386547103524208,
	"rewards/format_reward": 0.0,
	"step": 310
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.244,
	"grad_norm": 2.089031219482422,
	"kl": 1.39453125,
	"learning_rate": 4.3808955077581546e-07,
	"loss": 0.0558,
	"reward": -0.33028923720121384,
	"reward_std": 0.2886582836508751,
	"rewards/cosine_scaled_reward": -0.16514462232589722,
	"rewards/format_reward": 0.0,
	"step": 311
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.248,
	"grad_norm": 5.366787433624268,
	"kl": 2.9599609375,
	"learning_rate": 4.350494089288943e-07,
	"loss": 0.1186,
	"reward": -0.4123021811246872,
	"reward_std": 0.337029866874218,
	"rewards/cosine_scaled_reward": -0.206151083111763,
	"rewards/format_reward": 0.0,
	"step": 312
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.252,
	"grad_norm": 8.391505241394043,
	"kl": 1.953125,
	"learning_rate": 4.3201486961161093e-07,
	"loss": 0.078,
	"reward": -0.3487403020262718,
	"reward_std": 0.3276291638612747,
	"rewards/cosine_scaled_reward": -0.1743701510131359,
	"rewards/format_reward": 0.0,
	"step": 313
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.256,
	"grad_norm": 2.623786449432373,
	"kl": 1.3193359375,
	"learning_rate": 4.2898608072313045e-07,
	"loss": 0.0528,
	"reward": -0.32606934756040573,
	"reward_std": 0.28208620101213455,
	"rewards/cosine_scaled_reward": -0.16303467005491257,
	"rewards/format_reward": 0.0,
	"step": 314
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.26,
	"grad_norm": 2.2247447967529297,
	"kl": 1.884765625,
	"learning_rate": 4.2596318988235037e-07,
	"loss": 0.0755,
	"reward": -0.2273978427692782,
	"reward_std": 0.28098014742136,
	"rewards/cosine_scaled_reward": -0.11369891960930545,
	"rewards/format_reward": 0.0,
	"step": 315
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.264,
	"grad_norm": 2.258469581604004,
	"kl": 1.14453125,
	"learning_rate": 4.2294634442070553e-07,
	"loss": 0.0457,
	"reward": -0.24764333851635456,
	"reward_std": 0.2835834100842476,
	"rewards/cosine_scaled_reward": -0.12382166367024183,
	"rewards/format_reward": 0.0,
	"step": 316
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.268,
	"grad_norm": 2.884620189666748,
	"kl": 1.5986328125,
	"learning_rate": 4.1993569137498776e-07,
	"loss": 0.064,
	"reward": -0.37140634655952454,
	"reward_std": 0.36573630571365356,
	"rewards/cosine_scaled_reward": -0.18570317327976227,
	"rewards/format_reward": 0.0,
	"step": 317
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.272,
	"grad_norm": 2.703934669494629,
	"kl": 1.912109375,
	"learning_rate": 4.1693137748017915e-07,
	"loss": 0.0763,
	"reward": -0.34411681443452835,
	"reward_std": 0.29631946235895157,
	"rewards/cosine_scaled_reward": -0.17205841839313507,
	"rewards/format_reward": 0.0,
	"step": 318
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.276,
	"grad_norm": 3.717240571975708,
	"kl": 2.224609375,
	"learning_rate": 4.1393354916230005e-07,
	"loss": 0.0891,
	"reward": -0.3324529230594635,
	"reward_std": 0.2552623227238655,
	"rewards/cosine_scaled_reward": -0.16622646152973175,
	"rewards/format_reward": 0.0,
	"step": 319
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.28,
	"grad_norm": 2.4941396713256836,
	"kl": 1.384765625,
	"learning_rate": 4.1094235253127374e-07,
	"loss": 0.0555,
	"reward": -0.30811577290296555,
	"reward_std": 0.2845884971320629,
	"rewards/cosine_scaled_reward": -0.15405788272619247,
	"rewards/format_reward": 0.0,
	"step": 320
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.284,
	"grad_norm": 3.229072332382202,
	"kl": 1.9453125,
	"learning_rate": 4.079579333738039e-07,
	"loss": 0.0778,
	"reward": -0.3366442248225212,
	"reward_std": 0.301740899682045,
	"rewards/cosine_scaled_reward": -0.1683221124112606,
	"rewards/format_reward": 0.0,
	"step": 321
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.288,
	"grad_norm": 3.3636343479156494,
	"kl": 1.8828125,
	"learning_rate": 4.0498043714627006e-07,
	"loss": 0.0752,
	"reward": -0.36845648288726807,
	"reward_std": 0.34283190220594406,
	"rewards/cosine_scaled_reward": -0.18422825261950493,
	"rewards/format_reward": 0.0,
	"step": 322
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.292,
	"grad_norm": 3.507054090499878,
	"kl": 1.4130859375,
	"learning_rate": 4.020100089676376e-07,
	"loss": 0.0566,
	"reward": -0.34711746126413345,
	"reward_std": 0.2960944324731827,
	"rewards/cosine_scaled_reward": -0.17355873063206673,
	"rewards/format_reward": 0.0,
	"step": 323
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.296,
	"grad_norm": 2.661647081375122,
	"kl": 1.736328125,
	"learning_rate": 3.9904679361238526e-07,
	"loss": 0.0694,
	"reward": -0.33277176320552826,
	"reward_std": 0.3034566268324852,
	"rewards/cosine_scaled_reward": -0.16638587787747383,
	"rewards/format_reward": 0.0,
	"step": 324
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3,
	"grad_norm": 3.079672336578369,
	"kl": 1.359375,
	"learning_rate": 3.9609093550344907e-07,
	"loss": 0.0544,
	"reward": -0.3246685415506363,
	"reward_std": 0.27341699600219727,
	"rewards/cosine_scaled_reward": -0.16233427450060844,
	"rewards/format_reward": 0.0,
	"step": 325
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.304,
	"grad_norm": 3.248324394226074,
	"kl": 1.1181640625,
	"learning_rate": 3.931425787051832e-07,
	"loss": 0.0447,
	"reward": -0.3214203119277954,
	"reward_std": 0.2835453376173973,
	"rewards/cosine_scaled_reward": -0.160710159689188,
	"rewards/format_reward": 0.0,
	"step": 326
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.308,
	"grad_norm": 3.676837205886841,
	"kl": 1.724609375,
	"learning_rate": 3.902018669163384e-07,
	"loss": 0.069,
	"reward": -0.32949286699295044,
	"reward_std": 0.30344782024621964,
	"rewards/cosine_scaled_reward": -0.16474644094705582,
	"rewards/format_reward": 0.0,
	"step": 327
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.312,
	"grad_norm": 2.3120462894439697,
	"kl": 1.537109375,
	"learning_rate": 3.872689434630585e-07,
	"loss": 0.0615,
	"reward": -0.3512613996863365,
	"reward_std": 0.3501633331179619,
	"rewards/cosine_scaled_reward": -0.17563070356845856,
	"rewards/format_reward": 0.0,
	"step": 328
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.316,
	"grad_norm": 2.4828386306762695,
	"kl": 1.6953125,
	"learning_rate": 3.843439512918949e-07,
	"loss": 0.0677,
	"reward": -0.31614498794078827,
	"reward_std": 0.29276788979768753,
	"rewards/cosine_scaled_reward": -0.15807249024510384,
	"rewards/format_reward": 0.0,
	"step": 329
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.32,
	"grad_norm": 3.356783151626587,
	"kl": 2.453125,
	"learning_rate": 3.8142703296283953e-07,
	"loss": 0.0982,
	"reward": -0.4576185494661331,
	"reward_std": 0.32832735031843185,
	"rewards/cosine_scaled_reward": -0.22880928218364716,
	"rewards/format_reward": 0.0,
	"step": 330
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.324,
	"grad_norm": 2.7885196208953857,
	"kl": 2.068359375,
	"learning_rate": 3.785183306423767e-07,
	"loss": 0.0827,
	"reward": -0.2943090833723545,
	"reward_std": 0.31652648001909256,
	"rewards/cosine_scaled_reward": -0.14715453796088696,
	"rewards/format_reward": 0.0,
	"step": 331
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.328,
	"grad_norm": 3.0415380001068115,
	"kl": 1.802734375,
	"learning_rate": 3.7561798609655373e-07,
	"loss": 0.0721,
	"reward": -0.3697570115327835,
	"reward_std": 0.3258262947201729,
	"rewards/cosine_scaled_reward": -0.18487850576639175,
	"rewards/format_reward": 0.0,
	"step": 332
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.332,
	"grad_norm": 3.139693021774292,
	"kl": 1.732421875,
	"learning_rate": 3.72726140684072e-07,
	"loss": 0.0693,
	"reward": -0.33471549302339554,
	"reward_std": 0.2794983647763729,
	"rewards/cosine_scaled_reward": -0.16735775396227837,
	"rewards/format_reward": 0.0,
	"step": 333
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.336,
	"grad_norm": 2.6243162155151367,
	"kl": 1.8369140625,
	"learning_rate": 3.6984293534939737e-07,
	"loss": 0.0733,
	"reward": -0.3382048085331917,
	"reward_std": 0.3457643389701843,
	"rewards/cosine_scaled_reward": -0.16910240054130554,
	"rewards/format_reward": 0.0,
	"step": 334
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.34,
	"grad_norm": 3.803060293197632,
	"kl": 1.8046875,
	"learning_rate": 3.6696851061588994e-07,
	"loss": 0.0723,
	"reward": -0.3406166359782219,
	"reward_std": 0.29876144975423813,
	"rewards/cosine_scaled_reward": -0.17030831426382065,
	"rewards/format_reward": 0.0,
	"step": 335
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3439999999999999,
	"grad_norm": 3.948391914367676,
	"kl": 1.365234375,
	"learning_rate": 3.641030065789562e-07,
	"loss": 0.0546,
	"reward": -0.2908342033624649,
	"reward_std": 0.26911235228180885,
	"rewards/cosine_scaled_reward": -0.14541710540652275,
	"rewards/format_reward": 0.0,
	"step": 336
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3479999999999999,
	"grad_norm": 2.9695639610290527,
	"kl": 2.19921875,
	"learning_rate": 3.612465628992203e-07,
	"loss": 0.0881,
	"reward": -0.37160656601190567,
	"reward_std": 0.3147331103682518,
	"rewards/cosine_scaled_reward": -0.18580328300595284,
	"rewards/format_reward": 0.0,
	"step": 337
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3519999999999999,
	"grad_norm": 3.1350209712982178,
	"kl": 2.1689453125,
	"learning_rate": 3.5839931879571725e-07,
	"loss": 0.087,
	"reward": -0.3230074942111969,
	"reward_std": 0.313438281416893,
	"rewards/cosine_scaled_reward": -0.16150375083088875,
	"rewards/format_reward": 0.0,
	"step": 338
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3559999999999999,
	"grad_norm": 3.882567882537842,
	"kl": 2.0546875,
	"learning_rate": 3.555614130391079e-07,
	"loss": 0.0821,
	"reward": -0.36975327879190445,
	"reward_std": 0.31242573261260986,
	"rewards/cosine_scaled_reward": -0.18487663567066193,
	"rewards/format_reward": 0.0,
	"step": 339
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3599999999999999,
	"grad_norm": 2.6699118614196777,
	"kl": 1.689453125,
	"learning_rate": 3.5273298394491515e-07,
	"loss": 0.0676,
	"reward": -0.368961863219738,
	"reward_std": 0.32627636194229126,
	"rewards/cosine_scaled_reward": -0.1844809353351593,
	"rewards/format_reward": 0.0,
	"step": 340
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3639999999999999,
	"grad_norm": 3.0782856941223145,
	"kl": 1.59765625,
	"learning_rate": 3.4991416936678276e-07,
	"loss": 0.064,
	"reward": -0.3320116475224495,
	"reward_std": 0.3151276856660843,
	"rewards/cosine_scaled_reward": -0.16600582748651505,
	"rewards/format_reward": 0.0,
	"step": 341
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3679999999999999,
	"grad_norm": 2.2419495582580566,
	"kl": 1.46484375,
	"learning_rate": 3.471051066897562e-07,
	"loss": 0.0585,
	"reward": -0.2764207161962986,
	"reward_std": 0.3390573188662529,
	"rewards/cosine_scaled_reward": -0.1382103539071977,
	"rewards/format_reward": 0.0,
	"step": 342
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.3719999999999999,
	"grad_norm": 4.397972106933594,
	"kl": 2.5,
	"learning_rate": 3.4430593282358777e-07,
	"loss": 0.1002,
	"reward": -0.33926407247781754,
	"reward_std": 0.31172922998666763,
	"rewards/cosine_scaled_reward": -0.16963203251361847,
	"rewards/format_reward": 0.0,
	"step": 343
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.376,
	"grad_norm": 3.441905975341797,
	"kl": 2.0234375,
	"learning_rate": 3.4151678419606233e-07,
	"loss": 0.0808,
	"reward": -0.3324861600995064,
	"reward_std": 0.2958858981728554,
	"rewards/cosine_scaled_reward": -0.1662430725991726,
	"rewards/format_reward": 0.0,
	"step": 344
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.38,
	"grad_norm": 2.7323975563049316,
	"kl": 1.4189453125,
	"learning_rate": 3.387377967463493e-07,
	"loss": 0.0566,
	"reward": -0.3314187452197075,
	"reward_std": 0.3164066970348358,
	"rewards/cosine_scaled_reward": -0.16570937633514404,
	"rewards/format_reward": 0.0,
	"step": 345
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.384,
	"grad_norm": 4.131885528564453,
	"kl": 2.45703125,
	"learning_rate": 3.359691059183761e-07,
	"loss": 0.0983,
	"reward": -0.37432391941547394,
	"reward_std": 0.33136965334415436,
	"rewards/cosine_scaled_reward": -0.18716195970773697,
	"rewards/format_reward": 0.0,
	"step": 346
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.388,
	"grad_norm": 2.9907569885253906,
	"kl": 1.732421875,
	"learning_rate": 3.3321084665422803e-07,
	"loss": 0.0693,
	"reward": -0.38256606459617615,
	"reward_std": 0.31782740354537964,
	"rewards/cosine_scaled_reward": -0.19128303229808807,
	"rewards/format_reward": 0.0,
	"step": 347
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.392,
	"grad_norm": 2.6049344539642334,
	"kl": 1.53515625,
	"learning_rate": 3.3046315338757026e-07,
	"loss": 0.0613,
	"reward": -0.2997368350625038,
	"reward_std": 0.3045838475227356,
	"rewards/cosine_scaled_reward": -0.1498684138059616,
	"rewards/format_reward": 0.0,
	"step": 348
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.396,
	"grad_norm": 4.5095295906066895,
	"kl": 1.5029296875,
	"learning_rate": 3.2772616003709616e-07,
	"loss": 0.0602,
	"reward": -0.3363025635480881,
	"reward_std": 0.30865515023469925,
	"rewards/cosine_scaled_reward": -0.16815128177404404,
	"rewards/format_reward": 0.0,
	"step": 349
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.4,
	"grad_norm": 3.3342795372009277,
	"kl": 1.908203125,
	"learning_rate": 3.250000000000001e-07,
	"loss": 0.0762,
	"reward": -0.3770889565348625,
	"reward_std": 0.30710920691490173,
	"rewards/cosine_scaled_reward": -0.18854447081685066,
	"rewards/format_reward": 0.0,
	"step": 350
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.404,
	"grad_norm": 2.795259714126587,
	"kl": 2.048828125,
	"learning_rate": 3.222848061454764e-07,
	"loss": 0.082,
	"reward": -0.3462035730481148,
	"reward_std": 0.32692621648311615,
	"rewards/cosine_scaled_reward": -0.1731017865240574,
	"rewards/format_reward": 0.0,
	"step": 351
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.408,
	"grad_norm": 2.563765287399292,
	"kl": 1.462890625,
	"learning_rate": 3.195807108082429e-07,
	"loss": 0.0586,
	"reward": -0.37373943626880646,
	"reward_std": 0.3041759356856346,
	"rewards/cosine_scaled_reward": -0.18686972558498383,
	"rewards/format_reward": 0.0,
	"step": 352
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.412,
	"grad_norm": 2.6194751262664795,
	"kl": 1.24609375,
	"learning_rate": 3.168878457820915e-07,
	"loss": 0.0498,
	"reward": -0.3196728527545929,
	"reward_std": 0.2953634150326252,
	"rewards/cosine_scaled_reward": -0.15983642637729645,
	"rewards/format_reward": 0.0,
	"step": 353
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.416,
	"grad_norm": 2.8382420539855957,
	"kl": 1.650390625,
	"learning_rate": 3.142063423134644e-07,
	"loss": 0.0662,
	"reward": -0.33513225615024567,
	"reward_std": 0.30527665093541145,
	"rewards/cosine_scaled_reward": -0.16756613552570343,
	"rewards/format_reward": 0.0,
	"step": 354
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.42,
	"grad_norm": 2.6078808307647705,
	"kl": 2.15234375,
	"learning_rate": 3.115363310950578e-07,
	"loss": 0.086,
	"reward": -0.3992829695343971,
	"reward_std": 0.31726495921611786,
	"rewards/cosine_scaled_reward": -0.19964147731661797,
	"rewards/format_reward": 0.0,
	"step": 355
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.424,
	"grad_norm": 4.192615985870361,
	"kl": 2.142578125,
	"learning_rate": 3.0887794225945143e-07,
	"loss": 0.0858,
	"reward": -0.39319509267807007,
	"reward_std": 0.3372880816459656,
	"rewards/cosine_scaled_reward": -0.19659754261374474,
	"rewards/format_reward": 0.0,
	"step": 356
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.428,
	"grad_norm": 3.196894407272339,
	"kl": 2.509765625,
	"learning_rate": 3.062313053727671e-07,
	"loss": 0.1006,
	"reward": -0.3694089204072952,
	"reward_std": 0.323252871632576,
	"rewards/cosine_scaled_reward": -0.1847044676542282,
	"rewards/format_reward": 0.0,
	"step": 357
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.432,
	"grad_norm": 3.348161458969116,
	"kl": 1.1142578125,
	"learning_rate": 3.0359654942835247e-07,
	"loss": 0.0447,
	"reward": -0.36088229715824127,
	"reward_std": 0.31483449041843414,
	"rewards/cosine_scaled_reward": -0.18044114857912064,
	"rewards/format_reward": 0.0,
	"step": 358
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.436,
	"grad_norm": 3.457472324371338,
	"kl": 2.2265625,
	"learning_rate": 3.0097380284049523e-07,
	"loss": 0.089,
	"reward": -0.3612442761659622,
	"reward_std": 0.28438059240579605,
	"rewards/cosine_scaled_reward": -0.1806221418082714,
	"rewards/format_reward": 0.0,
	"step": 359
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.44,
	"grad_norm": 3.285405397415161,
	"kl": 2.076171875,
	"learning_rate": 2.9836319343816397e-07,
	"loss": 0.0831,
	"reward": -0.32887883111834526,
	"reward_std": 0.3107897564768791,
	"rewards/cosine_scaled_reward": -0.16443941928446293,
	"rewards/format_reward": 0.0,
	"step": 360
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.444,
	"grad_norm": 2.9156711101531982,
	"kl": 1.7646484375,
	"learning_rate": 2.9576484845877793e-07,
	"loss": 0.0706,
	"reward": -0.3512116149067879,
	"reward_std": 0.32886873185634613,
	"rewards/cosine_scaled_reward": -0.17560580000281334,
	"rewards/format_reward": 0.0,
	"step": 361
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.448,
	"grad_norm": 2.42704439163208,
	"kl": 1.697265625,
	"learning_rate": 2.931788945420058e-07,
	"loss": 0.0679,
	"reward": -0.3639722764492035,
	"reward_std": 0.2881170064210892,
	"rewards/cosine_scaled_reward": -0.18198613449931145,
	"rewards/format_reward": 0.0,
	"step": 362
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.452,
	"grad_norm": 4.5008225440979,
	"kl": 2.177734375,
	"learning_rate": 2.9060545772359305e-07,
	"loss": 0.087,
	"reward": -0.3515865206718445,
	"reward_std": 0.290123887360096,
	"rewards/cosine_scaled_reward": -0.17579325661063194,
	"rewards/format_reward": 0.0,
	"step": 363
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.456,
	"grad_norm": 2.7479496002197266,
	"kl": 1.578125,
	"learning_rate": 2.8804466342921987e-07,
	"loss": 0.0632,
	"reward": -0.26583924936130643,
	"reward_std": 0.29539088532328606,
	"rewards/cosine_scaled_reward": -0.13291961723007262,
	"rewards/format_reward": 0.0,
	"step": 364
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.46,
	"grad_norm": 2.6749367713928223,
	"kl": 2.1796875,
	"learning_rate": 2.854966364683872e-07,
	"loss": 0.087,
	"reward": -0.36106909811496735,
	"reward_std": 0.2982637956738472,
	"rewards/cosine_scaled_reward": -0.18053454905748367,
	"rewards/format_reward": 0.0,
	"step": 365
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.464,
	"grad_norm": 3.6434812545776367,
	"kl": 1.4482421875,
	"learning_rate": 2.829615010283344e-07,
	"loss": 0.058,
	"reward": -0.35805001854896545,
	"reward_std": 0.31588251888751984,
	"rewards/cosine_scaled_reward": -0.17902500554919243,
	"rewards/format_reward": 0.0,
	"step": 366
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.468,
	"grad_norm": 2.877927780151367,
	"kl": 1.779296875,
	"learning_rate": 2.8043938066798645e-07,
	"loss": 0.0712,
	"reward": -0.35267870873212814,
	"reward_std": 0.3029713034629822,
	"rewards/cosine_scaled_reward": -0.17633935809135437,
	"rewards/format_reward": 0.0,
	"step": 367
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.472,
	"grad_norm": 2.9547438621520996,
	"kl": 1.3583984375,
	"learning_rate": 2.7793039831193133e-07,
	"loss": 0.0542,
	"reward": -0.34842824190855026,
	"reward_std": 0.28041965141892433,
	"rewards/cosine_scaled_reward": -0.17421411722898483,
	"rewards/format_reward": 0.0,
	"step": 368
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.476,
	"grad_norm": 2.4998183250427246,
	"kl": 1.712890625,
	"learning_rate": 2.7543467624442956e-07,
	"loss": 0.0686,
	"reward": -0.34311509132385254,
	"reward_std": 0.3226206302642822,
	"rewards/cosine_scaled_reward": -0.17155754193663597,
	"rewards/format_reward": 0.0,
	"step": 369
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.48,
	"grad_norm": 3.5822997093200684,
	"kl": 1.2568359375,
	"learning_rate": 2.729523361034538e-07,
	"loss": 0.0502,
	"reward": -0.31581661850214005,
	"reward_std": 0.27614113688468933,
	"rewards/cosine_scaled_reward": -0.15790832042694092,
	"rewards/format_reward": 0.0,
	"step": 370
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.484,
	"grad_norm": 2.638000965118408,
	"kl": 1.658203125,
	"learning_rate": 2.7048349887476037e-07,
	"loss": 0.0663,
	"reward": -0.3658217638731003,
	"reward_std": 0.3533295765519142,
	"rewards/cosine_scaled_reward": -0.18291086703538895,
	"rewards/format_reward": 0.0,
	"step": 371
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.488,
	"grad_norm": 2.4719886779785156,
	"kl": 1.470703125,
	"learning_rate": 2.6802828488599294e-07,
	"loss": 0.0588,
	"reward": -0.35377567261457443,
	"reward_std": 0.2872357815504074,
	"rewards/cosine_scaled_reward": -0.17688783630728722,
	"rewards/format_reward": 0.0,
	"step": 372
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.492,
	"grad_norm": 3.820688486099243,
	"kl": 1.65673828125,
	"learning_rate": 2.655868138008171e-07,
	"loss": 0.0662,
	"reward": -0.3673105686903,
	"reward_std": 0.29224705323576927,
	"rewards/cosine_scaled_reward": -0.1836552768945694,
	"rewards/format_reward": 0.0,
	"step": 373
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.496,
	"grad_norm": 3.1416916847229004,
	"kl": 1.4990234375,
	"learning_rate": 2.631592046130896e-07,
	"loss": 0.06,
	"reward": -0.3574133738875389,
	"reward_std": 0.2663569226861,
	"rewards/cosine_scaled_reward": -0.17870669439435005,
	"rewards/format_reward": 0.0,
	"step": 374
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.5,
	"grad_norm": 2.3712515830993652,
	"kl": 1.900390625,
	"learning_rate": 2.6074557564105724e-07,
	"loss": 0.0761,
	"reward": -0.34536080807447433,
	"reward_std": 0.3063738942146301,
	"rewards/cosine_scaled_reward": -0.17268040403723717,
	"rewards/format_reward": 0.0,
	"step": 375
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.504,
	"grad_norm": 2.792006254196167,
	"kl": 1.71875,
	"learning_rate": 2.583460445215911e-07,
	"loss": 0.0688,
	"reward": -0.3458981513977051,
	"reward_std": 0.3039686158299446,
	"rewards/cosine_scaled_reward": -0.17294907197356224,
	"rewards/format_reward": 0.0,
	"step": 376
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.508,
	"grad_norm": 2.985948085784912,
	"kl": 1.5625,
	"learning_rate": 2.5596072820445254e-07,
	"loss": 0.0625,
	"reward": -0.21606629202142358,
	"reward_std": 0.2749215438961983,
	"rewards/cosine_scaled_reward": -0.10803314973600209,
	"rewards/format_reward": 0.0,
	"step": 377
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1531.952392578125,
	"epoch": 1.512,
	"grad_norm": 2.396852970123291,
	"kl": 1.9921875,
	"learning_rate": 2.5358974294659373e-07,
	"loss": 0.0823,
	"reward": -0.38127752393484116,
	"reward_std": 0.32172612845897675,
	"rewards/cosine_scaled_reward": -0.19063876569271088,
	"rewards/format_reward": 0.0,
	"step": 378
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.516,
	"grad_norm": 2.503976345062256,
	"kl": 1.794921875,
	"learning_rate": 2.512332043064913e-07,
	"loss": 0.0718,
	"reward": -0.3479606434702873,
	"reward_std": 0.29174239560961723,
	"rewards/cosine_scaled_reward": -0.17398031428456306,
	"rewards/format_reward": 0.0,
	"step": 379
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1531.8035888671875,
	"epoch": 1.52,
	"grad_norm": 3.344243049621582,
	"kl": 2.080078125,
	"learning_rate": 2.488912271385139e-07,
	"loss": 0.083,
	"reward": -0.38203170895576477,
	"reward_std": 0.3180833086371422,
	"rewards/cosine_scaled_reward": -0.19101585447788239,
	"rewards/format_reward": 0.0,
	"step": 380
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.524,
	"grad_norm": 3.5073604583740234,
	"kl": 2.095703125,
	"learning_rate": 2.465639255873246e-07,
	"loss": 0.0837,
	"reward": -0.33683621138334274,
	"reward_std": 0.3141423165798187,
	"rewards/cosine_scaled_reward": -0.16841810569167137,
	"rewards/format_reward": 0.0,
	"step": 381
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.528,
	"grad_norm": 2.7634477615356445,
	"kl": 2.55859375,
	"learning_rate": 2.4425141308231765e-07,
	"loss": 0.1022,
	"reward": -0.3983701467514038,
	"reward_std": 0.31766583025455475,
	"rewards/cosine_scaled_reward": -0.199185062199831,
	"rewards/format_reward": 0.0,
	"step": 382
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.532,
	"grad_norm": 3.1601033210754395,
	"kl": 1.486328125,
	"learning_rate": 2.4195380233209006e-07,
	"loss": 0.0594,
	"reward": -0.37120404094457626,
	"reward_std": 0.3172856420278549,
	"rewards/cosine_scaled_reward": -0.18560202419757843,
	"rewards/format_reward": 0.0,
	"step": 383
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.536,
	"grad_norm": 2.475311040878296,
	"kl": 2.01953125,
	"learning_rate": 2.3967120531894857e-07,
	"loss": 0.0807,
	"reward": -0.3449181020259857,
	"reward_std": 0.3061336353421211,
	"rewards/cosine_scaled_reward": -0.17245905846357346,
	"rewards/format_reward": 0.0,
	"step": 384
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.54,
	"grad_norm": 3.9638140201568604,
	"kl": 1.6806640625,
	"learning_rate": 2.374037332934512e-07,
	"loss": 0.0673,
	"reward": -0.3139965161681175,
	"reward_std": 0.303245909512043,
	"rewards/cosine_scaled_reward": -0.15699823945760727,
	"rewards/format_reward": 0.0,
	"step": 385
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.544,
	"grad_norm": 3.2407708168029785,
	"kl": 1.89453125,
	"learning_rate": 2.3515149676898552e-07,
	"loss": 0.0757,
	"reward": -0.3049175813794136,
	"reward_std": 0.30845751613378525,
	"rewards/cosine_scaled_reward": -0.1524587944149971,
	"rewards/format_reward": 0.0,
	"step": 386
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.548,
	"grad_norm": 3.1065189838409424,
	"kl": 1.75390625,
	"learning_rate": 2.3291460551638237e-07,
	"loss": 0.0701,
	"reward": -0.3369733840227127,
	"reward_std": 0.30179525911808014,
	"rewards/cosine_scaled_reward": -0.16848668828606606,
	"rewards/format_reward": 0.0,
	"step": 387
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.552,
	"grad_norm": 2.6867339611053467,
	"kl": 2.06640625,
	"learning_rate": 2.306931685585657e-07,
	"loss": 0.0826,
	"reward": -0.3339100852608681,
	"reward_std": 0.3043428584933281,
	"rewards/cosine_scaled_reward": -0.16695504263043404,
	"rewards/format_reward": 0.0,
	"step": 388
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.556,
	"grad_norm": 3.1580567359924316,
	"kl": 2.291015625,
	"learning_rate": 2.2848729416523859e-07,
	"loss": 0.0915,
	"reward": -0.3744669333100319,
	"reward_std": 0.3249610960483551,
	"rewards/cosine_scaled_reward": -0.18723345920443535,
	"rewards/format_reward": 0.0,
	"step": 389
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.56,
	"grad_norm": 5.407771587371826,
	"kl": 1.609375,
	"learning_rate": 2.2629708984760706e-07,
	"loss": 0.0645,
	"reward": -0.3420454412698746,
	"reward_std": 0.3148321136832237,
	"rewards/cosine_scaled_reward": -0.1710227131843567,
	"rewards/format_reward": 0.0,
	"step": 390
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.564,
	"grad_norm": 4.492737770080566,
	"kl": 2.275390625,
	"learning_rate": 2.2412266235313973e-07,
	"loss": 0.0909,
	"reward": -0.36313918232917786,
	"reward_std": 0.29535526037216187,
	"rewards/cosine_scaled_reward": -0.18156958371400833,
	"rewards/format_reward": 0.0,
	"step": 391
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.568,
	"grad_norm": 3.0125086307525635,
	"kl": 2.029296875,
	"learning_rate": 2.2196411766036487e-07,
	"loss": 0.0812,
	"reward": -0.37769585102796555,
	"reward_std": 0.31776873767375946,
	"rewards/cosine_scaled_reward": -0.18884791806340218,
	"rewards/format_reward": 0.0,
	"step": 392
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.572,
	"grad_norm": 3.134265899658203,
	"kl": 2.47265625,
	"learning_rate": 2.1982156097370557e-07,
	"loss": 0.099,
	"reward": -0.38678842037916183,
	"reward_std": 0.30557621270418167,
	"rewards/cosine_scaled_reward": -0.19339420646429062,
	"rewards/format_reward": 0.0,
	"step": 393
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.576,
	"grad_norm": 2.9398727416992188,
	"kl": 1.404296875,
	"learning_rate": 2.1769509671835223e-07,
	"loss": 0.0562,
	"reward": -0.3609785735607147,
	"reward_std": 0.29732464998960495,
	"rewards/cosine_scaled_reward": -0.18048929050564766,
	"rewards/format_reward": 0.0,
	"step": 394
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.58,
	"grad_norm": 2.3901424407958984,
	"kl": 2.3291015625,
	"learning_rate": 2.1558482853517253e-07,
	"loss": 0.093,
	"reward": -0.38430536538362503,
	"reward_std": 0.32753758877515793,
	"rewards/cosine_scaled_reward": -0.19215268269181252,
	"rewards/format_reward": 0.0,
	"step": 395
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1526.9702453613281,
	"epoch": 1.584,
	"grad_norm": 3.9775447845458984,
	"kl": 2.06640625,
	"learning_rate": 2.134908592756607e-07,
	"loss": 0.0914,
	"reward": -0.33116257190704346,
	"reward_std": 0.2928163409233093,
	"rewards/cosine_scaled_reward": -0.16558128595352173,
	"rewards/format_reward": 0.0,
	"step": 396
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.588,
	"grad_norm": 2.9975955486297607,
	"kl": 2.318359375,
	"learning_rate": 2.1141329099692406e-07,
	"loss": 0.0928,
	"reward": -0.3710367754101753,
	"reward_std": 0.3226532116532326,
	"rewards/cosine_scaled_reward": -0.18551838770508766,
	"rewards/format_reward": 0.0,
	"step": 397
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1530.6845397949219,
	"epoch": 1.592,
	"grad_norm": 3.739922046661377,
	"kl": 2.025390625,
	"learning_rate": 2.0935222495670968e-07,
	"loss": 0.0747,
	"reward": -0.3954162746667862,
	"reward_std": 0.3323783427476883,
	"rewards/cosine_scaled_reward": -0.1977081410586834,
	"rewards/format_reward": 0.0,
	"step": 398
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.596,
	"grad_norm": 2.7063024044036865,
	"kl": 1.0927734375,
	"learning_rate": 2.0730776160846853e-07,
	"loss": 0.0437,
	"reward": -0.3006215952336788,
	"reward_std": 0.27692657709121704,
	"rewards/cosine_scaled_reward": -0.15031079947948456,
	"rewards/format_reward": 0.0,
	"step": 399
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6,
	"grad_norm": 2.469496726989746,
	"kl": 1.732421875,
	"learning_rate": 2.0528000059645995e-07,
	"loss": 0.0693,
	"reward": -0.36928267031908035,
	"reward_std": 0.30984392017126083,
	"rewards/cosine_scaled_reward": -0.18464133515954018,
	"rewards/format_reward": 0.0,
	"step": 400
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1522.3095397949219,
	"epoch": 1.604,
	"grad_norm": 2.855372190475464,
	"kl": 1.845703125,
	"learning_rate": 2.032690407508949e-07,
	"loss": 0.0636,
	"reward": -0.38443852961063385,
	"reward_std": 0.28470365703105927,
	"rewards/cosine_scaled_reward": -0.19221926480531693,
	"rewards/format_reward": 0.0,
	"step": 401
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.608,
	"grad_norm": 3.3847217559814453,
	"kl": 2.0390625,
	"learning_rate": 2.0127498008311922e-07,
	"loss": 0.0814,
	"reward": -0.3252910152077675,
	"reward_std": 0.2982725724577904,
	"rewards/cosine_scaled_reward": -0.16264550015330315,
	"rewards/format_reward": 0.0,
	"step": 402
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.612,
	"grad_norm": 3.0226523876190186,
	"kl": 1.81640625,
	"learning_rate": 1.9929791578083655e-07,
	"loss": 0.0727,
	"reward": -0.3527565225958824,
	"reward_std": 0.30437447875738144,
	"rewards/cosine_scaled_reward": -0.1763782650232315,
	"rewards/format_reward": 0.0,
	"step": 403
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.616,
	"grad_norm": 2.866734743118286,
	"kl": 1.7890625,
	"learning_rate": 1.9733794420337213e-07,
	"loss": 0.0716,
	"reward": -0.3746185079216957,
	"reward_std": 0.3078552633523941,
	"rewards/cosine_scaled_reward": -0.18730924278497696,
	"rewards/format_reward": 0.0,
	"step": 404
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.62,
	"grad_norm": 3.9170870780944824,
	"kl": 1.970703125,
	"learning_rate": 1.9539516087697517e-07,
	"loss": 0.0789,
	"reward": -0.41533301770687103,
	"reward_std": 0.3027655556797981,
	"rewards/cosine_scaled_reward": -0.20766650885343552,
	"rewards/format_reward": 0.0,
	"step": 405
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.624,
	"grad_norm": 3.470655679702759,
	"kl": 1.845703125,
	"learning_rate": 1.934696604901642e-07,
	"loss": 0.0738,
	"reward": -0.3191938251256943,
	"reward_std": 0.28303690254688263,
	"rewards/cosine_scaled_reward": -0.15959692373871803,
	"rewards/format_reward": 0.0,
	"step": 406
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6280000000000001,
	"grad_norm": 3.623340368270874,
	"kl": 1.31640625,
	"learning_rate": 1.915615368891117e-07,
	"loss": 0.0526,
	"reward": -0.3123548626899719,
	"reward_std": 0.29499682784080505,
	"rewards/cosine_scaled_reward": -0.15617743134498596,
	"rewards/format_reward": 0.0,
	"step": 407
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6320000000000001,
	"grad_norm": 2.282514810562134,
	"kl": 1.267578125,
	"learning_rate": 1.8967088307307e-07,
	"loss": 0.0507,
	"reward": -0.39642050117254257,
	"reward_std": 0.311983872205019,
	"rewards/cosine_scaled_reward": -0.19821025803685188,
	"rewards/format_reward": 0.0,
	"step": 408
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6360000000000001,
	"grad_norm": 2.5232083797454834,
	"kl": 1.681640625,
	"learning_rate": 1.8779779118983867e-07,
	"loss": 0.0672,
	"reward": -0.33888739347457886,
	"reward_std": 0.28087718039751053,
	"rewards/cosine_scaled_reward": -0.16944369673728943,
	"rewards/format_reward": 0.0,
	"step": 409
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6400000000000001,
	"grad_norm": 3.886439085006714,
	"kl": 2.09765625,
	"learning_rate": 1.8594235253127372e-07,
	"loss": 0.0838,
	"reward": -0.38627707213163376,
	"reward_std": 0.33190976083278656,
	"rewards/cosine_scaled_reward": -0.19313853234052658,
	"rewards/format_reward": 0.0,
	"step": 410
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6440000000000001,
	"grad_norm": 3.090627670288086,
	"kl": 2.140625,
	"learning_rate": 1.8410465752883758e-07,
	"loss": 0.0857,
	"reward": -0.3793156296014786,
	"reward_std": 0.30717378109693527,
	"rewards/cosine_scaled_reward": -0.1896577998995781,
	"rewards/format_reward": 0.0,
	"step": 411
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6480000000000001,
	"grad_norm": 3.867506980895996,
	"kl": 1.880859375,
	"learning_rate": 1.822847957491922e-07,
	"loss": 0.0753,
	"reward": -0.3565782457590103,
	"reward_std": 0.3352038711309433,
	"rewards/cosine_scaled_reward": -0.17828912287950516,
	"rewards/format_reward": 0.0,
	"step": 412
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6520000000000001,
	"grad_norm": 2.388094902038574,
	"kl": 1.751953125,
	"learning_rate": 1.804828558898332e-07,
	"loss": 0.0701,
	"reward": -0.3393707424402237,
	"reward_std": 0.3029238283634186,
	"rewards/cosine_scaled_reward": -0.16968537122011185,
	"rewards/format_reward": 0.0,
	"step": 413
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6560000000000001,
	"grad_norm": 2.5263466835021973,
	"kl": 1.748046875,
	"learning_rate": 1.7869892577476722e-07,
	"loss": 0.0698,
	"reward": -0.4274343103170395,
	"reward_std": 0.3449402078986168,
	"rewards/cosine_scaled_reward": -0.21371715888381004,
	"rewards/format_reward": 0.0,
	"step": 414
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6600000000000001,
	"grad_norm": 2.3268003463745117,
	"kl": 1.400390625,
	"learning_rate": 1.7693309235023127e-07,
	"loss": 0.0559,
	"reward": -0.3480057269334793,
	"reward_std": 0.29953421652317047,
	"rewards/cosine_scaled_reward": -0.17400285601615906,
	"rewards/format_reward": 0.0,
	"step": 415
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6640000000000001,
	"grad_norm": 3.2503533363342285,
	"kl": 1.9140625,
	"learning_rate": 1.7518544168045524e-07,
	"loss": 0.0767,
	"reward": -0.36937638372182846,
	"reward_std": 0.31766701489686966,
	"rewards/cosine_scaled_reward": -0.18468819558620453,
	"rewards/format_reward": 0.0,
	"step": 416
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6680000000000001,
	"grad_norm": 2.9895646572113037,
	"kl": 2.1796875,
	"learning_rate": 1.7345605894346726e-07,
	"loss": 0.0871,
	"reward": -0.3985458239912987,
	"reward_std": 0.33385203033685684,
	"rewards/cosine_scaled_reward": -0.19927291199564934,
	"rewards/format_reward": 0.0,
	"step": 417
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6720000000000002,
	"grad_norm": 3.2457692623138428,
	"kl": 1.71875,
	"learning_rate": 1.7174502842694212e-07,
	"loss": 0.0687,
	"reward": -0.2603262776392512,
	"reward_std": 0.3040950074791908,
	"rewards/cosine_scaled_reward": -0.13016314181732014,
	"rewards/format_reward": 0.0,
	"step": 418
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6760000000000002,
	"grad_norm": 2.8391411304473877,
	"kl": 1.798828125,
	"learning_rate": 1.7005243352409333e-07,
	"loss": 0.072,
	"reward": -0.2663672436028719,
	"reward_std": 0.29912005364894867,
	"rewards/cosine_scaled_reward": -0.13318362249992788,
	"rewards/format_reward": 0.0,
	"step": 419
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6800000000000002,
	"grad_norm": 3.1057238578796387,
	"kl": 1.5,
	"learning_rate": 1.6837835672960831e-07,
	"loss": 0.06,
	"reward": -0.34882377088069916,
	"reward_std": 0.3601520508527756,
	"rewards/cosine_scaled_reward": -0.17441189289093018,
	"rewards/format_reward": 0.0,
	"step": 420
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.6840000000000002,
	"grad_norm": 2.243816375732422,
	"kl": 1.541015625,
	"learning_rate": 1.6672287963562852e-07,
	"loss": 0.0616,
	"reward": -0.3832622766494751,
	"reward_std": 0.3413049802184105,
	"rewards/cosine_scaled_reward": -0.19163113832473755,
	"rewards/format_reward": 0.0,
	"step": 421
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.688,
	"grad_norm": 3.76218581199646,
	"kl": 1.880859375,
	"learning_rate": 1.6508608292777203e-07,
	"loss": 0.0752,
	"reward": -0.3700753226876259,
	"reward_std": 0.31324099004268646,
	"rewards/cosine_scaled_reward": -0.18503766134381294,
	"rewards/format_reward": 0.0,
	"step": 422
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.692,
	"grad_norm": 4.034151554107666,
	"kl": 1.70703125,
	"learning_rate": 1.6346804638120098e-07,
	"loss": 0.0682,
	"reward": -0.29791881144046783,
	"reward_std": 0.2801155336201191,
	"rewards/cosine_scaled_reward": -0.14895940944552422,
	"rewards/format_reward": 0.0,
	"step": 423
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.696,
	"grad_norm": 3.041618824005127,
	"kl": 1.81640625,
	"learning_rate": 1.6186884885673413e-07,
	"loss": 0.0725,
	"reward": -0.32316526770591736,
	"reward_std": 0.2970619350671768,
	"rewards/cosine_scaled_reward": -0.16158264502882957,
	"rewards/format_reward": 0.0,
	"step": 424
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.7,
	"grad_norm": 4.081668376922607,
	"kl": 1.4453125,
	"learning_rate": 1.6028856829700258e-07,
	"loss": 0.0576,
	"reward": -0.3476375713944435,
	"reward_std": 0.294509120285511,
	"rewards/cosine_scaled_reward": -0.17381878197193146,
	"rewards/format_reward": 0.0,
	"step": 425
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.704,
	"grad_norm": 3.166949510574341,
	"kl": 2.1015625,
	"learning_rate": 1.5872728172265146e-07,
	"loss": 0.0841,
	"reward": -0.3467593193054199,
	"reward_std": 0.30388573557138443,
	"rewards/cosine_scaled_reward": -0.17337966337800026,
	"rewards/format_reward": 0.0,
	"step": 426
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.708,
	"grad_norm": 4.211978435516357,
	"kl": 1.763671875,
	"learning_rate": 1.5718506522858572e-07,
	"loss": 0.0705,
	"reward": -0.3505774810910225,
	"reward_std": 0.30420946329832077,
	"rewards/cosine_scaled_reward": -0.17528874799609184,
	"rewards/format_reward": 0.0,
	"step": 427
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.712,
	"grad_norm": 4.166502952575684,
	"kl": 2.158203125,
	"learning_rate": 1.5566199398026147e-07,
	"loss": 0.0863,
	"reward": -0.361857570707798,
	"reward_std": 0.30119316279888153,
	"rewards/cosine_scaled_reward": -0.1809287928044796,
	"rewards/format_reward": 0.0,
	"step": 428
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.716,
	"grad_norm": 2.8889896869659424,
	"kl": 1.8671875,
	"learning_rate": 1.5415814221002265e-07,
	"loss": 0.0745,
	"reward": -0.32126056402921677,
	"reward_std": 0.27691005170345306,
	"rewards/cosine_scaled_reward": -0.16063029691576958,
	"rewards/format_reward": 0.0,
	"step": 429
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.72,
	"grad_norm": 3.3025801181793213,
	"kl": 1.904296875,
	"learning_rate": 1.5267358321348285e-07,
	"loss": 0.0761,
	"reward": -0.36847078800201416,
	"reward_std": 0.3445659205317497,
	"rewards/cosine_scaled_reward": -0.18423539400100708,
	"rewards/format_reward": 0.0,
	"step": 430
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.724,
	"grad_norm": 3.0440969467163086,
	"kl": 1.75,
	"learning_rate": 1.5120838934595337e-07,
	"loss": 0.07,
	"reward": -0.36113734543323517,
	"reward_std": 0.3412683606147766,
	"rewards/cosine_scaled_reward": -0.18056866899132729,
	"rewards/format_reward": 0.0,
	"step": 431
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1530.952392578125,
	"epoch": 1.728,
	"grad_norm": 2.575627326965332,
	"kl": 1.689453125,
	"learning_rate": 1.4976263201891613e-07,
	"loss": 0.0716,
	"reward": -0.3095761463046074,
	"reward_std": 0.32323335483670235,
	"rewards/cosine_scaled_reward": -0.1547880806028843,
	"rewards/format_reward": 0.0,
	"step": 432
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.732,
	"grad_norm": 3.186289072036743,
	"kl": 1.91015625,
	"learning_rate": 1.483363816965435e-07,
	"loss": 0.0765,
	"reward": -0.39015311002731323,
	"reward_std": 0.3067055642604828,
	"rewards/cosine_scaled_reward": -0.19507654383778572,
	"rewards/format_reward": 0.0,
	"step": 433
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.736,
	"grad_norm": 3.0739073753356934,
	"kl": 2.369140625,
	"learning_rate": 1.469297078922642e-07,
	"loss": 0.0946,
	"reward": -0.29091550246812403,
	"reward_std": 0.30687109380960464,
	"rewards/cosine_scaled_reward": -0.14545774972066283,
	"rewards/format_reward": 0.0,
	"step": 434
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.74,
	"grad_norm": 5.0029778480529785,
	"kl": 1.759765625,
	"learning_rate": 1.4554267916537495e-07,
	"loss": 0.0703,
	"reward": -0.34431006759405136,
	"reward_std": 0.27501973509788513,
	"rewards/cosine_scaled_reward": -0.17215503007173538,
	"rewards/format_reward": 0.0,
	"step": 435
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.744,
	"grad_norm": 5.139548301696777,
	"kl": 1.8203125,
	"learning_rate": 1.4417536311769885e-07,
	"loss": 0.0728,
	"reward": -0.31318235397338867,
	"reward_std": 0.2976163923740387,
	"rewards/cosine_scaled_reward": -0.15659117698669434,
	"rewards/format_reward": 0.0,
	"step": 436
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.748,
	"grad_norm": 2.881143808364868,
	"kl": 1.626953125,
	"learning_rate": 1.4282782639029128e-07,
	"loss": 0.065,
	"reward": -0.3547092080116272,
	"reward_std": 0.28170817345380783,
	"rewards/cosine_scaled_reward": -0.1773546040058136,
	"rewards/format_reward": 0.0,
	"step": 437
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.752,
	"grad_norm": 2.4268362522125244,
	"kl": 1.9609375,
	"learning_rate": 1.4150013466019114e-07,
	"loss": 0.0786,
	"reward": -0.3464732989668846,
	"reward_std": 0.3199189677834511,
	"rewards/cosine_scaled_reward": -0.173236645758152,
	"rewards/format_reward": 0.0,
	"step": 438
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.756,
	"grad_norm": 2.686417579650879,
	"kl": 2.318359375,
	"learning_rate": 1.4019235263722034e-07,
	"loss": 0.0926,
	"reward": -0.3557046577334404,
	"reward_std": 0.3187018297612667,
	"rewards/cosine_scaled_reward": -0.1778523214161396,
	"rewards/format_reward": 0.0,
	"step": 439
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.76,
	"grad_norm": 4.9666876792907715,
	"kl": 1.4619140625,
	"learning_rate": 1.3890454406082956e-07,
	"loss": 0.0584,
	"reward": -0.3234737552702427,
	"reward_std": 0.32776766270399094,
	"rewards/cosine_scaled_reward": -0.16173688508570194,
	"rewards/format_reward": 0.0,
	"step": 440
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.764,
	"grad_norm": 4.106746196746826,
	"kl": 2.49609375,
	"learning_rate": 1.3763677169699217e-07,
	"loss": 0.0999,
	"reward": -0.4192545562982559,
	"reward_std": 0.33375757187604904,
	"rewards/cosine_scaled_reward": -0.20962728559970856,
	"rewards/format_reward": 0.0,
	"step": 441
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1534.8690490722656,
	"epoch": 1.768,
	"grad_norm": 2.842816114425659,
	"kl": 2.2578125,
	"learning_rate": 1.3638909733514452e-07,
	"loss": 0.0898,
	"reward": -0.3652210012078285,
	"reward_std": 0.3345082625746727,
	"rewards/cosine_scaled_reward": -0.18261050805449486,
	"rewards/format_reward": 0.0,
	"step": 442
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.772,
	"grad_norm": 3.186333179473877,
	"kl": 2.375,
	"learning_rate": 1.351615817851748e-07,
	"loss": 0.0947,
	"reward": -0.40324729681015015,
	"reward_std": 0.32466883957386017,
	"rewards/cosine_scaled_reward": -0.20162366330623627,
	"rewards/format_reward": 0.0,
	"step": 443
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.776,
	"grad_norm": 4.4096360206604,
	"kl": 2.99609375,
	"learning_rate": 1.3395428487445914e-07,
	"loss": 0.1197,
	"reward": -0.3327697291970253,
	"reward_std": 0.3282741829752922,
	"rewards/cosine_scaled_reward": -0.16638486459851265,
	"rewards/format_reward": 0.0,
	"step": 444
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.78,
	"grad_norm": 2.8214669227600098,
	"kl": 1.8623046875,
	"learning_rate": 1.3276726544494571e-07,
	"loss": 0.0746,
	"reward": -0.39069636911153793,
	"reward_std": 0.33478184044361115,
	"rewards/cosine_scaled_reward": -0.19534818828105927,
	"rewards/format_reward": 0.0,
	"step": 445
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.784,
	"grad_norm": 2.96333646774292,
	"kl": 1.828125,
	"learning_rate": 1.316005813502869e-07,
	"loss": 0.073,
	"reward": -0.34233053401112556,
	"reward_std": 0.30314670503139496,
	"rewards/cosine_scaled_reward": -0.17116525955498219,
	"rewards/format_reward": 0.0,
	"step": 446
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.788,
	"grad_norm": 2.538837432861328,
	"kl": 1.615234375,
	"learning_rate": 1.3045428945301953e-07,
	"loss": 0.0647,
	"reward": -0.2668099580332637,
	"reward_std": 0.3087245300412178,
	"rewards/cosine_scaled_reward": -0.1334049835568294,
	"rewards/format_reward": 0.0,
	"step": 447
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.792,
	"grad_norm": 6.922802925109863,
	"kl": 1.9208984375,
	"learning_rate": 1.2932844562179352e-07,
	"loss": 0.0768,
	"reward": -0.3690221831202507,
	"reward_std": 0.3130299560725689,
	"rewards/cosine_scaled_reward": -0.18451109528541565,
	"rewards/format_reward": 0.0,
	"step": 448
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.796,
	"grad_norm": 3.2286629676818848,
	"kl": 1.990234375,
	"learning_rate": 1.2822310472864885e-07,
	"loss": 0.0795,
	"reward": -0.32342398166656494,
	"reward_std": 0.3065089136362076,
	"rewards/cosine_scaled_reward": -0.16171199083328247,
	"rewards/format_reward": 0.0,
	"step": 449
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8,
	"grad_norm": 3.7653493881225586,
	"kl": 1.904296875,
	"learning_rate": 1.2713832064634125e-07,
	"loss": 0.0763,
	"reward": -0.4029879495501518,
	"reward_std": 0.31490693986415863,
	"rewards/cosine_scaled_reward": -0.2014939747750759,
	"rewards/format_reward": 0.0,
	"step": 450
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.804,
	"grad_norm": 3.4150803089141846,
	"kl": 2.150390625,
	"learning_rate": 1.260741462457165e-07,
	"loss": 0.086,
	"reward": -0.3429009020328522,
	"reward_std": 0.29108157753944397,
	"rewards/cosine_scaled_reward": -0.1714504510164261,
	"rewards/format_reward": 0.0,
	"step": 451
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.808,
	"grad_norm": 4.145492076873779,
	"kl": 2.2421875,
	"learning_rate": 1.2503063339313356e-07,
	"loss": 0.0897,
	"reward": -0.42198269814252853,
	"reward_std": 0.3363164961338043,
	"rewards/cosine_scaled_reward": -0.21099134907126427,
	"rewards/format_reward": 0.0,
	"step": 452
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.812,
	"grad_norm": 4.779297351837158,
	"kl": 2.228515625,
	"learning_rate": 1.2400783294793668e-07,
	"loss": 0.0891,
	"reward": -0.3492959663271904,
	"reward_std": 0.2949202358722687,
	"rewards/cosine_scaled_reward": -0.1746479757130146,
	"rewards/format_reward": 0.0,
	"step": 453
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8159999999999998,
	"grad_norm": 2.905301570892334,
	"kl": 1.265625,
	"learning_rate": 1.2300579475997657e-07,
	"loss": 0.0506,
	"reward": -0.2935212664306164,
	"reward_std": 0.26374514773488045,
	"rewards/cosine_scaled_reward": -0.1467606294900179,
	"rewards/format_reward": 0.0,
	"step": 454
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8199999999999998,
	"grad_norm": 2.7079851627349854,
	"kl": 2.1337890625,
	"learning_rate": 1.220245676671809e-07,
	"loss": 0.0853,
	"reward": -0.3475092798471451,
	"reward_std": 0.30007384717464447,
	"rewards/cosine_scaled_reward": -0.17375463247299194,
	"rewards/format_reward": 0.0,
	"step": 455
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8239999999999998,
	"grad_norm": 2.6113271713256836,
	"kl": 1.6376953125,
	"learning_rate": 1.2106419949317388e-07,
	"loss": 0.0654,
	"reward": -0.330677293241024,
	"reward_std": 0.3133997842669487,
	"rewards/cosine_scaled_reward": -0.1653386428952217,
	"rewards/format_reward": 0.0,
	"step": 456
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8279999999999998,
	"grad_norm": 2.7393922805786133,
	"kl": 1.666015625,
	"learning_rate": 1.2012473704494537e-07,
	"loss": 0.0668,
	"reward": -0.3434924744069576,
	"reward_std": 0.3196050524711609,
	"rewards/cosine_scaled_reward": -0.17174622975289822,
	"rewards/format_reward": 0.0,
	"step": 457
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8319999999999999,
	"grad_norm": 4.49023962020874,
	"kl": 2.34375,
	"learning_rate": 1.1920622611056974e-07,
	"loss": 0.0938,
	"reward": -0.34944383054971695,
	"reward_std": 0.3238733857870102,
	"rewards/cosine_scaled_reward": -0.17472190782427788,
	"rewards/format_reward": 0.0,
	"step": 458
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8359999999999999,
	"grad_norm": 2.3561832904815674,
	"kl": 1.4501953125,
	"learning_rate": 1.1830871145697412e-07,
	"loss": 0.0579,
	"reward": -0.3565739244222641,
	"reward_std": 0.3099294453859329,
	"rewards/cosine_scaled_reward": -0.17828696221113205,
	"rewards/format_reward": 0.0,
	"step": 459
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8399999999999999,
	"grad_norm": 3.1239490509033203,
	"kl": 1.8984375,
	"learning_rate": 1.1743223682775649e-07,
	"loss": 0.0759,
	"reward": -0.3478566035628319,
	"reward_std": 0.28794750943779945,
	"rewards/cosine_scaled_reward": -0.17392829060554504,
	"rewards/format_reward": 0.0,
	"step": 460
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8439999999999999,
	"grad_norm": 2.673818826675415,
	"kl": 1.740234375,
	"learning_rate": 1.1657684494105386e-07,
	"loss": 0.0695,
	"reward": -0.339593730866909,
	"reward_std": 0.3045819625258446,
	"rewards/cosine_scaled_reward": -0.1697968691587448,
	"rewards/format_reward": 0.0,
	"step": 461
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8479999999999999,
	"grad_norm": 3.220402479171753,
	"kl": 1.626953125,
	"learning_rate": 1.1574257748745986e-07,
	"loss": 0.0651,
	"reward": -0.36886321753263474,
	"reward_std": 0.26985886320471764,
	"rewards/cosine_scaled_reward": -0.18443159759044647,
	"rewards/format_reward": 0.0,
	"step": 462
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1530.4642944335938,
	"epoch": 1.8519999999999999,
	"grad_norm": 2.8002877235412598,
	"kl": 2.23828125,
	"learning_rate": 1.1492947512799328e-07,
	"loss": 0.0941,
	"reward": -0.31243710219860077,
	"reward_std": 0.3104839473962784,
	"rewards/cosine_scaled_reward": -0.15621854737401009,
	"rewards/format_reward": 0.0,
	"step": 463
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8559999999999999,
	"grad_norm": 3.3076934814453125,
	"kl": 2.455078125,
	"learning_rate": 1.1413757749211602e-07,
	"loss": 0.098,
	"reward": -0.3408735916018486,
	"reward_std": 0.3259742558002472,
	"rewards/cosine_scaled_reward": -0.1704367958009243,
	"rewards/format_reward": 0.0,
	"step": 464
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8599999999999999,
	"grad_norm": 4.302088737487793,
	"kl": 2.005859375,
	"learning_rate": 1.1336692317580158e-07,
	"loss": 0.0802,
	"reward": -0.3594956621527672,
	"reward_std": 0.32260415703058243,
	"rewards/cosine_scaled_reward": -0.1797478273510933,
	"rewards/format_reward": 0.0,
	"step": 465
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8639999999999999,
	"grad_norm": 4.171574115753174,
	"kl": 2.490234375,
	"learning_rate": 1.1261754973965422e-07,
	"loss": 0.0995,
	"reward": -0.3996199369430542,
	"reward_std": 0.30815524607896805,
	"rewards/cosine_scaled_reward": -0.1998099721968174,
	"rewards/format_reward": 0.0,
	"step": 466
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8679999999999999,
	"grad_norm": 3.7009289264678955,
	"kl": 1.841796875,
	"learning_rate": 1.1188949370707787e-07,
	"loss": 0.0738,
	"reward": -0.3371664360165596,
	"reward_std": 0.3329595774412155,
	"rewards/cosine_scaled_reward": -0.1685832180082798,
	"rewards/format_reward": 0.0,
	"step": 467
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.8719999999999999,
	"grad_norm": 2.592533826828003,
	"kl": 2.251953125,
	"learning_rate": 1.1118279056249653e-07,
	"loss": 0.0901,
	"reward": -0.34844203293323517,
	"reward_std": 0.322611540555954,
	"rewards/cosine_scaled_reward": -0.1742210052907467,
	"rewards/format_reward": 0.0,
	"step": 468
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.876,
	"grad_norm": 4.633761405944824,
	"kl": 1.64013671875,
	"learning_rate": 1.1049747474962444e-07,
	"loss": 0.0656,
	"reward": -0.3193807154893875,
	"reward_std": 0.26448768377304077,
	"rewards/cosine_scaled_reward": -0.15969035774469376,
	"rewards/format_reward": 0.0,
	"step": 469
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.88,
	"grad_norm": 3.101719617843628,
	"kl": 2.033203125,
	"learning_rate": 1.0983357966978745e-07,
	"loss": 0.0812,
	"reward": -0.3662792518734932,
	"reward_std": 0.32248761504888535,
	"rewards/cosine_scaled_reward": -0.1831396110355854,
	"rewards/format_reward": 0.0,
	"step": 470
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.884,
	"grad_norm": 2.580354690551758,
	"kl": 1.607421875,
	"learning_rate": 1.0919113768029517e-07,
	"loss": 0.0643,
	"reward": -0.34900667518377304,
	"reward_std": 0.31430666893720627,
	"rewards/cosine_scaled_reward": -0.17450333759188652,
	"rewards/format_reward": 0.0,
	"step": 471
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.888,
	"grad_norm": 2.7384796142578125,
	"kl": 1.8046875,
	"learning_rate": 1.0857018009286381e-07,
	"loss": 0.0722,
	"reward": -0.32778534665703773,
	"reward_std": 0.3321828171610832,
	"rewards/cosine_scaled_reward": -0.16389267705380917,
	"rewards/format_reward": 0.0,
	"step": 472
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.892,
	"grad_norm": 3.759181499481201,
	"kl": 2.017578125,
	"learning_rate": 1.0797073717209013e-07,
	"loss": 0.0807,
	"reward": -0.32047825306653976,
	"reward_std": 0.28816820681095123,
	"rewards/cosine_scaled_reward": -0.16023912653326988,
	"rewards/format_reward": 0.0,
	"step": 473
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.896,
	"grad_norm": 2.8909876346588135,
	"kl": 1.603515625,
	"learning_rate": 1.0739283813397639e-07,
	"loss": 0.0642,
	"reward": -0.3390325605869293,
	"reward_std": 0.3011201545596123,
	"rewards/cosine_scaled_reward": -0.16951627284288406,
	"rewards/format_reward": 0.0,
	"step": 474
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.9,
	"grad_norm": 2.3281497955322266,
	"kl": 2.0234375,
	"learning_rate": 1.068365111445064e-07,
	"loss": 0.081,
	"reward": -0.36704741418361664,
	"reward_std": 0.3062589168548584,
	"rewards/cosine_scaled_reward": -0.18352371081709862,
	"rewards/format_reward": 0.0,
	"step": 475
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.904,
	"grad_norm": 3.56882643699646,
	"kl": 2.515625,
	"learning_rate": 1.063017833182728e-07,
	"loss": 0.1008,
	"reward": -0.39511261135339737,
	"reward_std": 0.3128170743584633,
	"rewards/cosine_scaled_reward": -0.19755630940198898,
	"rewards/format_reward": 0.0,
	"step": 476
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.908,
	"grad_norm": 2.958406925201416,
	"kl": 1.755859375,
	"learning_rate": 1.0578868071715544e-07,
	"loss": 0.0702,
	"reward": -0.3462023660540581,
	"reward_std": 0.322578527033329,
	"rewards/cosine_scaled_reward": -0.17310118675231934,
	"rewards/format_reward": 0.0,
	"step": 477
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.912,
	"grad_norm": 3.044797897338867,
	"kl": 2.375,
	"learning_rate": 1.0529722834905125e-07,
	"loss": 0.095,
	"reward": -0.3144143670797348,
	"reward_std": 0.29915551096200943,
	"rewards/cosine_scaled_reward": -0.1572071835398674,
	"rewards/format_reward": 0.0,
	"step": 478
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.916,
	"grad_norm": 4.031872272491455,
	"kl": 2.640625,
	"learning_rate": 1.0482745016665526e-07,
	"loss": 0.1057,
	"reward": -0.3763216808438301,
	"reward_std": 0.3211255893111229,
	"rewards/cosine_scaled_reward": -0.18816084042191505,
	"rewards/format_reward": 0.0,
	"step": 479
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.92,
	"grad_norm": 2.3054392337799072,
	"kl": 1.3173828125,
	"learning_rate": 1.0437936906629334e-07,
	"loss": 0.0528,
	"reward": -0.2678487957455218,
	"reward_std": 0.2627658285200596,
	"rewards/cosine_scaled_reward": -0.13392440509051085,
	"rewards/format_reward": 0.0,
	"step": 480
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.924,
	"grad_norm": 3.41572642326355,
	"kl": 1.353515625,
	"learning_rate": 1.0395300688680625e-07,
	"loss": 0.0541,
	"reward": -0.35157452523708344,
	"reward_std": 0.3239835053682327,
	"rewards/cosine_scaled_reward": -0.17578726634383202,
	"rewards/format_reward": 0.0,
	"step": 481
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.928,
	"grad_norm": 2.691436290740967,
	"kl": 2.041015625,
	"learning_rate": 1.0354838440848501e-07,
	"loss": 0.0816,
	"reward": -0.39503272622823715,
	"reward_std": 0.3050593361258507,
	"rewards/cosine_scaled_reward": -0.19751636311411858,
	"rewards/format_reward": 0.0,
	"step": 482
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.932,
	"grad_norm": 2.859536647796631,
	"kl": 1.494140625,
	"learning_rate": 1.0316552135205837e-07,
	"loss": 0.0599,
	"reward": -0.395970955491066,
	"reward_std": 0.27583859115839005,
	"rewards/cosine_scaled_reward": -0.197985477745533,
	"rewards/format_reward": 0.0,
	"step": 483
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.936,
	"grad_norm": 2.9280340671539307,
	"kl": 1.765625,
	"learning_rate": 1.0280443637773163e-07,
	"loss": 0.0708,
	"reward": -0.2913724035024643,
	"reward_std": 0.2617946192622185,
	"rewards/cosine_scaled_reward": -0.14568619430065155,
	"rewards/format_reward": 0.0,
	"step": 484
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.94,
	"grad_norm": 2.2830445766448975,
	"kl": 1.2158203125,
	"learning_rate": 1.0246514708427701e-07,
	"loss": 0.0487,
	"reward": -0.3095552623271942,
	"reward_std": 0.292842835187912,
	"rewards/cosine_scaled_reward": -0.1547776274383068,
	"rewards/format_reward": 0.0,
	"step": 485
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.944,
	"grad_norm": 3.141052007675171,
	"kl": 1.3427734375,
	"learning_rate": 1.0214767000817596e-07,
	"loss": 0.0537,
	"reward": -0.32299425452947617,
	"reward_std": 0.29863065481185913,
	"rewards/cosine_scaled_reward": -0.16149712353944778,
	"rewards/format_reward": 0.0,
	"step": 486
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.948,
	"grad_norm": 3.97387433052063,
	"kl": 1.931640625,
	"learning_rate": 1.0185202062281336e-07,
	"loss": 0.0773,
	"reward": -0.3765959292650223,
	"reward_std": 0.3192542716860771,
	"rewards/cosine_scaled_reward": -0.18829796463251114,
	"rewards/format_reward": 0.0,
	"step": 487
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.952,
	"grad_norm": 2.656202554702759,
	"kl": 1.578125,
	"learning_rate": 1.0157821333772304e-07,
	"loss": 0.0631,
	"reward": -0.31205643340945244,
	"reward_std": 0.31670553237199783,
	"rewards/cosine_scaled_reward": -0.15602822043001652,
	"rewards/format_reward": 0.0,
	"step": 488
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.956,
	"grad_norm": 3.296848773956299,
	"kl": 1.16796875,
	"learning_rate": 1.013262614978859e-07,
	"loss": 0.0468,
	"reward": -0.3039631359279156,
	"reward_std": 0.27847766503691673,
	"rewards/cosine_scaled_reward": -0.15198157727718353,
	"rewards/format_reward": 0.0,
	"step": 489
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.96,
	"grad_norm": 4.522839546203613,
	"kl": 1.8203125,
	"learning_rate": 1.0109617738307911e-07,
	"loss": 0.0728,
	"reward": -0.34008362144231796,
	"reward_std": 0.29262910783290863,
	"rewards/cosine_scaled_reward": -0.17004182189702988,
	"rewards/format_reward": 0.0,
	"step": 490
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.964,
	"grad_norm": 2.311014175415039,
	"kl": 2.244140625,
	"learning_rate": 1.0088797220727779e-07,
	"loss": 0.0898,
	"reward": -0.34849604219198227,
	"reward_std": 0.3044138178229332,
	"rewards/cosine_scaled_reward": -0.17424802854657173,
	"rewards/format_reward": 0.0,
	"step": 491
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.968,
	"grad_norm": 2.6442465782165527,
	"kl": 1.998046875,
	"learning_rate": 1.0070165611810855e-07,
	"loss": 0.0799,
	"reward": -0.34308964014053345,
	"reward_std": 0.3727850690484047,
	"rewards/cosine_scaled_reward": -0.17154482379555702,
	"rewards/format_reward": 0.0,
	"step": 492
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.972,
	"grad_norm": 2.6985509395599365,
	"kl": 1.41796875,
	"learning_rate": 1.005372381963547e-07,
	"loss": 0.0567,
	"reward": -0.3521110415458679,
	"reward_std": 0.30227896198630333,
	"rewards/cosine_scaled_reward": -0.17605552449822426,
	"rewards/format_reward": 0.0,
	"step": 493
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.976,
	"grad_norm": 3.240550994873047,
	"kl": 1.904296875,
	"learning_rate": 1.0039472645551372e-07,
	"loss": 0.076,
	"reward": -0.3422994837164879,
	"reward_std": 0.3251089081168175,
	"rewards/cosine_scaled_reward": -0.17114974185824394,
	"rewards/format_reward": 0.0,
	"step": 494
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.98,
	"grad_norm": 4.803572177886963,
	"kl": 3.177734375,
	"learning_rate": 1.002741278414069e-07,
	"loss": 0.1272,
	"reward": -0.3737839311361313,
	"reward_std": 0.3232840970158577,
	"rewards/cosine_scaled_reward": -0.18689196929335594,
	"rewards/format_reward": 0.0,
	"step": 495
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.984,
	"grad_norm": 2.532582998275757,
	"kl": 1.9375,
	"learning_rate": 1.0017544823184055e-07,
	"loss": 0.0776,
	"reward": -0.374487929046154,
	"reward_std": 0.32537975162267685,
	"rewards/cosine_scaled_reward": -0.187243964523077,
	"rewards/format_reward": 0.0,
	"step": 496
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.988,
	"grad_norm": 2.6129701137542725,
	"kl": 2.2734375,
	"learning_rate": 1.0009869243631952e-07,
	"loss": 0.091,
	"reward": -0.3434004709124565,
	"reward_std": 0.32708871364593506,
	"rewards/cosine_scaled_reward": -0.17170023545622826,
	"rewards/format_reward": 0.0,
	"step": 497
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.992,
	"grad_norm": 4.10455322265625,
	"kl": 1.595703125,
	"learning_rate": 1.000438641958131e-07,
	"loss": 0.0638,
	"reward": -0.3211556486785412,
	"reward_std": 0.2905324958264828,
	"rewards/cosine_scaled_reward": -0.1605778243392706,
	"rewards/format_reward": 0.0,
	"step": 498
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0,
	"epoch": 1.996,
	"grad_norm": 2.7520267963409424,
	"kl": 1.892578125,
	"learning_rate": 1.0001096618257236e-07,
	"loss": 0.0756,
	"reward": -0.35309676826000214,
	"reward_std": 0.31401190161705017,
	"rewards/cosine_scaled_reward": -0.17654838413000107,
	"rewards/format_reward": 0.0,
	"step": 499
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 1536.0001220703125,
	"epoch": 2.0,
	"grad_norm": 2.9658398628234863,
	"kl": 1.7763671875,
	"learning_rate": 1e-07,
	"loss": 0.0711,
	"reward": -0.343311108648777,
	"reward_std": 0.28952478244900703,
	"rewards/cosine_scaled_reward": -0.1716555580496788,
	"rewards/format_reward": 0.0,
	"step": 500
	},
	{
	"epoch": 2.0,
	"step": 500,
	"total_flos": 0.0,
	"train_loss": 0.05846181693652478,
	"train_runtime": 107214.2293,
	"train_samples_per_second": 0.783,
	"train_steps_per_second": 0.005
	}
	],
	"logging_steps": 1,
	"max_steps": 500,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 2,
	"save_steps": 250,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 6,
	"trial_name": null,
	"trial_params": null
	}