{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.004,
      "grad_norm": 4.214743137359619,
      "kl": 0.0,
      "learning_rate": 2e-08,
      "loss": -0.0,
      "reward": -0.572140134871006,
      "reward_std": 0.3359133452177048,
      "rewards/cosine_scaled_reward": -0.286070067435503,
      "rewards/format_reward": 0.0,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.008,
      "grad_norm": 3.178635597229004,
      "kl": 0.0,
      "learning_rate": 4e-08,
      "loss": -0.0,
      "reward": -0.6001544743776321,
      "reward_std": 0.33404429256916046,
      "rewards/cosine_scaled_reward": -0.30007724463939667,
      "rewards/format_reward": 0.0,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.012,
      "grad_norm": 4.78328800201416,
      "kl": 6.908178329467773e-05,
      "learning_rate": 6e-08,
      "loss": 0.0,
      "reward": -0.502997636795044,
      "reward_std": 0.3310435339808464,
      "rewards/cosine_scaled_reward": -0.251498818397522,
      "rewards/format_reward": 0.0,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.016,
      "grad_norm": 3.9194376468658447,
      "kl": 6.488710641860962e-05,
      "learning_rate": 8e-08,
      "loss": 0.0,
      "reward": -0.5549568086862564,
      "reward_std": 0.3469474986195564,
      "rewards/cosine_scaled_reward": -0.2774783968925476,
      "rewards/format_reward": 0.0,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.02,
      "grad_norm": 3.903712511062622,
      "kl": 5.97834587097168e-05,
      "learning_rate": 1e-07,
      "loss": 0.0,
      "reward": -0.5800392031669617,
      "reward_std": 0.35274410992860794,
      "rewards/cosine_scaled_reward": -0.29001960158348083,
      "rewards/format_reward": 0.0,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.024,
      "grad_norm": 3.738009452819824,
      "kl": 6.499886512756348e-05,
      "learning_rate": 1.2e-07,
      "loss": 0.0,
      "reward": -0.5155884921550751,
      "reward_std": 0.37037966400384903,
      "rewards/cosine_scaled_reward": -0.25779424607753754,
      "rewards/format_reward": 0.0,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.028,
      "grad_norm": 2.794049024581909,
      "kl": 5.620718002319336e-05,
      "learning_rate": 1.4e-07,
      "loss": 0.0,
      "reward": -0.5175943374633789,
      "reward_std": 0.3494645953178406,
      "rewards/cosine_scaled_reward": -0.25879716128110886,
      "rewards/format_reward": 0.0,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.032,
      "grad_norm": 2.484722852706909,
      "kl": 8.106231689453125e-05,
      "learning_rate": 1.6e-07,
      "loss": 0.0,
      "reward": -0.5301882103085518,
      "reward_std": 0.3405821621417999,
      "rewards/cosine_scaled_reward": -0.2650941051542759,
      "rewards/format_reward": 0.0,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.036,
      "grad_norm": 3.1448230743408203,
      "kl": 7.554888725280762e-05,
      "learning_rate": 1.8e-07,
      "loss": 0.0,
      "reward": -0.5024237409234047,
      "reward_std": 0.3572370335459709,
      "rewards/cosine_scaled_reward": -0.25121185183525085,
      "rewards/format_reward": 0.0,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.04,
      "grad_norm": 4.125906944274902,
      "kl": 8.666515350341797e-05,
      "learning_rate": 2e-07,
      "loss": 0.0,
      "reward": -0.5732719898223877,
      "reward_std": 0.37079156190156937,
      "rewards/cosine_scaled_reward": -0.28663600236177444,
      "rewards/format_reward": 0.0,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.044,
      "grad_norm": 4.4225945472717285,
      "kl": 5.561113357543945e-05,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.0,
      "reward": -0.5889493525028229,
      "reward_std": 0.3473696708679199,
      "rewards/cosine_scaled_reward": -0.29447468370199203,
      "rewards/format_reward": 0.0,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.048,
      "grad_norm": 3.891627550125122,
      "kl": 7.808208465576172e-05,
      "learning_rate": 2.4e-07,
      "loss": 0.0,
      "reward": -0.5409628972411156,
      "reward_std": 0.326653391122818,
      "rewards/cosine_scaled_reward": -0.2704814486205578,
      "rewards/format_reward": 0.0,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.052,
      "grad_norm": 3.552539587020874,
      "kl": 7.30752944946289e-05,
      "learning_rate": 2.6e-07,
      "loss": 0.0,
      "reward": -0.5389444306492805,
      "reward_std": 0.3649257719516754,
      "rewards/cosine_scaled_reward": -0.2694722190499306,
      "rewards/format_reward": 0.0,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.056,
      "grad_norm": 2.781034231185913,
      "kl": 7.081031799316406e-05,
      "learning_rate": 2.8e-07,
      "loss": 0.0,
      "reward": -0.6049635112285614,
      "reward_std": 0.3185788542032242,
      "rewards/cosine_scaled_reward": -0.3024817630648613,
      "rewards/format_reward": 0.0,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.06,
      "grad_norm": 3.412130355834961,
      "kl": 6.335973739624023e-05,
      "learning_rate": 3e-07,
      "loss": 0.0,
      "reward": -0.6299380213022232,
      "reward_std": 0.31315718591213226,
      "rewards/cosine_scaled_reward": -0.3149690255522728,
      "rewards/format_reward": 0.0,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.064,
      "grad_norm": 4.064192771911621,
      "kl": 0.00011527538299560547,
      "learning_rate": 3.2e-07,
      "loss": 0.0,
      "reward": -0.5638149380683899,
      "reward_std": 0.3539445400238037,
      "rewards/cosine_scaled_reward": -0.28190746903419495,
      "rewards/format_reward": 0.0,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.068,
      "grad_norm": 3.5826501846313477,
      "kl": 9.000301361083984e-05,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.0,
      "reward": -0.5815131217241287,
      "reward_std": 0.3570765480399132,
      "rewards/cosine_scaled_reward": -0.29075656831264496,
      "rewards/format_reward": 0.0,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.072,
      "grad_norm": 3.4398193359375,
      "kl": 0.00013589859008789062,
      "learning_rate": 3.6e-07,
      "loss": 0.0,
      "reward": -0.5058030858635902,
      "reward_std": 0.3534058630466461,
      "rewards/cosine_scaled_reward": -0.2529015429317951,
      "rewards/format_reward": 0.0,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.076,
      "grad_norm": 3.1647567749023438,
      "kl": 0.00010588765144348145,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.0,
      "reward": -0.5453799739480019,
      "reward_std": 0.3434706851840019,
      "rewards/cosine_scaled_reward": -0.27268998324871063,
      "rewards/format_reward": 0.0,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.08,
      "grad_norm": 4.028233528137207,
      "kl": 0.00011265277862548828,
      "learning_rate": 4e-07,
      "loss": 0.0,
      "reward": -0.5725424438714981,
      "reward_std": 0.33554956316947937,
      "rewards/cosine_scaled_reward": -0.28627122938632965,
      "rewards/format_reward": 0.0,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.084,
      "grad_norm": 3.0403409004211426,
      "kl": 0.00015485286712646484,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.0,
      "reward": -0.5395064353942871,
      "reward_std": 0.3414423242211342,
      "rewards/cosine_scaled_reward": -0.26975322514772415,
      "rewards/format_reward": 0.0,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.088,
      "grad_norm": 3.5831127166748047,
      "kl": 0.0006537437438964844,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 0.0,
      "reward": -0.5216317698359489,
      "reward_std": 0.3427959829568863,
      "rewards/cosine_scaled_reward": -0.2608158737421036,
      "rewards/format_reward": 0.0,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.092,
      "grad_norm": 3.5175235271453857,
      "kl": 0.0010776519775390625,
      "learning_rate": 4.6e-07,
      "loss": 0.0,
      "reward": -0.5413709655404091,
      "reward_std": 0.32718800008296967,
      "rewards/cosine_scaled_reward": -0.27068548277020454,
      "rewards/format_reward": 0.0,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.096,
      "grad_norm": 3.442873239517212,
      "kl": 0.0013303756713867188,
      "learning_rate": 4.8e-07,
      "loss": 0.0001,
      "reward": -0.5624926462769508,
      "reward_std": 0.3581688553094864,
      "rewards/cosine_scaled_reward": -0.2812463231384754,
      "rewards/format_reward": 0.0,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.1,
      "grad_norm": 2.6114015579223633,
      "kl": 0.0016193389892578125,
      "learning_rate": 5e-07,
      "loss": 0.0001,
      "reward": -0.5309188961982727,
      "reward_std": 0.33032629638910294,
      "rewards/cosine_scaled_reward": -0.26545944809913635,
      "rewards/format_reward": 0.0,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.104,
      "grad_norm": 4.818567752838135,
      "kl": 0.0026264190673828125,
      "learning_rate": 5.2e-07,
      "loss": 0.0001,
      "reward": -0.5884083956480026,
      "reward_std": 0.3386874794960022,
      "rewards/cosine_scaled_reward": -0.2942042052745819,
      "rewards/format_reward": 0.0,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.108,
      "grad_norm": 4.078734397888184,
      "kl": 0.002239227294921875,
      "learning_rate": 5.4e-07,
      "loss": 0.0001,
      "reward": -0.6157089024782181,
      "reward_std": 0.3308729752898216,
      "rewards/cosine_scaled_reward": -0.30785445868968964,
      "rewards/format_reward": 0.0,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.112,
      "grad_norm": 3.4599478244781494,
      "kl": 0.002338409423828125,
      "learning_rate": 5.6e-07,
      "loss": 0.0001,
      "reward": -0.5709060430526733,
      "reward_std": 0.3136204034090042,
      "rewards/cosine_scaled_reward": -0.28545302152633667,
      "rewards/format_reward": 0.0,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1533.9464416503906,
      "epoch": 0.116,
      "grad_norm": 3.461718797683716,
      "kl": 0.003444671630859375,
      "learning_rate": 5.8e-07,
      "loss": -0.001,
      "reward": -0.5237472280859947,
      "reward_std": 0.3601622208952904,
      "rewards/cosine_scaled_reward": -0.26187360659241676,
      "rewards/format_reward": 0.0,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.12,
      "grad_norm": 3.7205333709716797,
      "kl": 0.00542449951171875,
      "learning_rate": 6e-07,
      "loss": 0.0002,
      "reward": -0.5595864206552505,
      "reward_std": 0.3391585499048233,
      "rewards/cosine_scaled_reward": -0.2797932103276253,
      "rewards/format_reward": 0.0,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.124,
      "grad_norm": 3.639012575149536,
      "kl": 0.0102996826171875,
      "learning_rate": 6.2e-07,
      "loss": 0.0004,
      "reward": -0.5832120478153229,
      "reward_std": 0.34403981268405914,
      "rewards/cosine_scaled_reward": -0.29160603135824203,
      "rewards/format_reward": 0.0,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.128,
      "grad_norm": 3.499258041381836,
      "kl": 0.0159149169921875,
      "learning_rate": 6.4e-07,
      "loss": 0.0006,
      "reward": -0.5567401573061943,
      "reward_std": 0.3353060856461525,
      "rewards/cosine_scaled_reward": -0.27837007120251656,
      "rewards/format_reward": 0.0,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.132,
      "grad_norm": 3.564453601837158,
      "kl": 0.0182952880859375,
      "learning_rate": 6.6e-07,
      "loss": 0.0007,
      "reward": -0.5521366372704506,
      "reward_std": 0.3413034975528717,
      "rewards/cosine_scaled_reward": -0.2760683260858059,
      "rewards/format_reward": 0.0,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.136,
      "grad_norm": 3.567174196243286,
      "kl": 0.0237274169921875,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0009,
      "reward": -0.5193822234869003,
      "reward_std": 0.35690775513648987,
      "rewards/cosine_scaled_reward": -0.25969111174345016,
      "rewards/format_reward": 0.0,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.14,
      "grad_norm": 2.247893810272217,
      "kl": 0.0149078369140625,
      "learning_rate": 7e-07,
      "loss": 0.0006,
      "reward": -0.5820326581597328,
      "reward_std": 0.3510446697473526,
      "rewards/cosine_scaled_reward": -0.2910163216292858,
      "rewards/format_reward": 0.0,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.144,
      "grad_norm": 2.9316084384918213,
      "kl": 0.022552490234375,
      "learning_rate": 7.2e-07,
      "loss": 0.0009,
      "reward": -0.5632490888237953,
      "reward_std": 0.3500733822584152,
      "rewards/cosine_scaled_reward": -0.28162455186247826,
      "rewards/format_reward": 0.0,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.148,
      "grad_norm": 3.5201869010925293,
      "kl": 0.02850341796875,
      "learning_rate": 7.4e-07,
      "loss": 0.0011,
      "reward": -0.5141241475939751,
      "reward_std": 0.3309687077999115,
      "rewards/cosine_scaled_reward": -0.25706208124756813,
      "rewards/format_reward": 0.0,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.152,
      "grad_norm": 2.7246434688568115,
      "kl": 0.0296630859375,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.0012,
      "reward": -0.5139049887657166,
      "reward_std": 0.33319953083992004,
      "rewards/cosine_scaled_reward": -0.25695250555872917,
      "rewards/format_reward": 0.0,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.156,
      "grad_norm": 2.880594491958618,
      "kl": 0.0258636474609375,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.001,
      "reward": -0.5646104216575623,
      "reward_std": 0.3474426791071892,
      "rewards/cosine_scaled_reward": -0.2823052257299423,
      "rewards/format_reward": 0.0,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.16,
      "grad_norm": 2.6734988689422607,
      "kl": 0.0321044921875,
      "learning_rate": 8e-07,
      "loss": 0.0013,
      "reward": -0.5586390048265457,
      "reward_std": 0.3474784344434738,
      "rewards/cosine_scaled_reward": -0.27931951731443405,
      "rewards/format_reward": 0.0,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.164,
      "grad_norm": 3.1370785236358643,
      "kl": 0.03369140625,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.0013,
      "reward": -0.5609789937734604,
      "reward_std": 0.3450735807418823,
      "rewards/cosine_scaled_reward": -0.280489519238472,
      "rewards/format_reward": 0.0,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.168,
      "grad_norm": 2.5502073764801025,
      "kl": 0.06072998046875,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.0024,
      "reward": -0.5195748135447502,
      "reward_std": 0.34474433213472366,
      "rewards/cosine_scaled_reward": -0.2597874030470848,
      "rewards/format_reward": 0.0,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.172,
      "grad_norm": 2.1381213665008545,
      "kl": 0.067474365234375,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.0027,
      "reward": -0.5580533072352409,
      "reward_std": 0.32987529784440994,
      "rewards/cosine_scaled_reward": -0.27902666106820107,
      "rewards/format_reward": 0.0,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.176,
      "grad_norm": 2.1730432510375977,
      "kl": 0.0958251953125,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.0038,
      "reward": -0.5585729256272316,
      "reward_std": 0.3295438587665558,
      "rewards/cosine_scaled_reward": -0.2792864739894867,
      "rewards/format_reward": 0.0,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.18,
      "grad_norm": 1.962768316268921,
      "kl": 0.079345703125,
      "learning_rate": 9e-07,
      "loss": 0.0032,
      "reward": -0.5980347394943237,
      "reward_std": 0.3284436762332916,
      "rewards/cosine_scaled_reward": -0.29901736974716187,
      "rewards/format_reward": 0.0,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.184,
      "grad_norm": 1.8276231288909912,
      "kl": 0.1153564453125,
      "learning_rate": 9.2e-07,
      "loss": 0.0046,
      "reward": -0.507519856095314,
      "reward_std": 0.33579862862825394,
      "rewards/cosine_scaled_reward": -0.2537599205970764,
      "rewards/format_reward": 0.0,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.188,
      "grad_norm": 2.608023166656494,
      "kl": 0.09033203125,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.0036,
      "reward": -0.5289521142840385,
      "reward_std": 0.31808041036129,
      "rewards/cosine_scaled_reward": -0.26447605714201927,
      "rewards/format_reward": 0.0,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.192,
      "grad_norm": 1.8956966400146484,
      "kl": 0.09814453125,
      "learning_rate": 9.6e-07,
      "loss": 0.0039,
      "reward": -0.566174179315567,
      "reward_std": 0.311339795589447,
      "rewards/cosine_scaled_reward": -0.2830870673060417,
      "rewards/format_reward": 0.0,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.196,
      "grad_norm": 1.7705461978912354,
      "kl": 0.1209716796875,
      "learning_rate": 9.8e-07,
      "loss": 0.0048,
      "reward": -0.528024435043335,
      "reward_std": 0.36330366879701614,
      "rewards/cosine_scaled_reward": -0.26401223987340927,
      "rewards/format_reward": 0.0,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.2,
      "grad_norm": 2.1113531589508057,
      "kl": 0.1171875,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "reward": -0.4406622089445591,
      "reward_std": 0.3163011893630028,
      "rewards/cosine_scaled_reward": -0.2203311063349247,
      "rewards/format_reward": 0.0,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.204,
      "grad_norm": 1.803585410118103,
      "kl": 0.1026611328125,
      "learning_rate": 9.999890338174275e-07,
      "loss": 0.0041,
      "reward": -0.5815826654434204,
      "reward_std": 0.3248438388109207,
      "rewards/cosine_scaled_reward": -0.2907913327217102,
      "rewards/format_reward": 0.0,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.208,
      "grad_norm": 1.7076486349105835,
      "kl": 0.157470703125,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0063,
      "reward": -0.5362438708543777,
      "reward_std": 0.2975444979965687,
      "rewards/cosine_scaled_reward": -0.26812195032835007,
      "rewards/format_reward": 0.0,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.212,
      "grad_norm": 2.478224515914917,
      "kl": 0.144287109375,
      "learning_rate": 9.999013075636804e-07,
      "loss": 0.0058,
      "reward": -0.47916819900274277,
      "reward_std": 0.35621220618486404,
      "rewards/cosine_scaled_reward": -0.23958410695195198,
      "rewards/format_reward": 0.0,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.216,
      "grad_norm": 2.006901502609253,
      "kl": 0.1337890625,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.0053,
      "reward": -0.5450761765241623,
      "reward_std": 0.32576631009578705,
      "rewards/cosine_scaled_reward": -0.27253808826208115,
      "rewards/format_reward": 0.0,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.22,
      "grad_norm": 2.2259609699249268,
      "kl": 0.11669921875,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0047,
      "reward": -0.5271478518843651,
      "reward_std": 0.34441374242305756,
      "rewards/cosine_scaled_reward": -0.26357391849160194,
      "rewards/format_reward": 0.0,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.224,
      "grad_norm": 2.020939588546753,
      "kl": 0.1907958984375,
      "learning_rate": 9.996052735444862e-07,
      "loss": 0.0076,
      "reward": -0.5367654263973236,
      "reward_std": 0.3470792919397354,
      "rewards/cosine_scaled_reward": -0.2683827131986618,
      "rewards/format_reward": 0.0,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.228,
      "grad_norm": 1.9356812238693237,
      "kl": 0.158935546875,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0064,
      "reward": -0.505635529756546,
      "reward_std": 0.3292393088340759,
      "rewards/cosine_scaled_reward": -0.252817764878273,
      "rewards/format_reward": 0.0,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.232,
      "grad_norm": 3.2483060359954834,
      "kl": 0.188720703125,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.0075,
      "reward": -0.504822663962841,
      "reward_std": 0.35463710874319077,
      "rewards/cosine_scaled_reward": -0.2524113282561302,
      "rewards/format_reward": 0.0,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.236,
      "grad_norm": 2.2256879806518555,
      "kl": 0.205322265625,
      "learning_rate": 9.991120277927223e-07,
      "loss": 0.0082,
      "reward": -0.5851711928844452,
      "reward_std": 0.3146449252963066,
      "rewards/cosine_scaled_reward": -0.2925856038928032,
      "rewards/format_reward": 0.0,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.24,
      "grad_norm": 2.093649387359619,
      "kl": 0.198486328125,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0079,
      "reward": -0.45284587889909744,
      "reward_std": 0.34760017693042755,
      "rewards/cosine_scaled_reward": -0.22642293944954872,
      "rewards/format_reward": 0.0,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.244,
      "grad_norm": 2.378591537475586,
      "kl": 0.24365234375,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.0097,
      "reward": -0.5091445297002792,
      "reward_std": 0.3452131450176239,
      "rewards/cosine_scaled_reward": -0.2545722760260105,
      "rewards/format_reward": 0.0,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.248,
      "grad_norm": 2.188553810119629,
      "kl": 0.29833984375,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.0119,
      "reward": -0.47440846264362335,
      "reward_std": 0.34785814583301544,
      "rewards/cosine_scaled_reward": -0.23720423132181168,
      "rewards/format_reward": 0.0,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.252,
      "grad_norm": 2.6211366653442383,
      "kl": 0.48095703125,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.0192,
      "reward": -0.46701501309871674,
      "reward_std": 0.3275434151291847,
      "rewards/cosine_scaled_reward": -0.23350750654935837,
      "rewards/format_reward": 0.0,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.256,
      "grad_norm": 3.608039617538452,
      "kl": 0.63720703125,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.0254,
      "reward": -0.4022144228219986,
      "reward_std": 0.3280187249183655,
      "rewards/cosine_scaled_reward": -0.2011072114109993,
      "rewards/format_reward": 0.0,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.26,
      "grad_norm": 2.1589713096618652,
      "kl": 0.587890625,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.0236,
      "reward": -0.4902011975646019,
      "reward_std": 0.33829304575920105,
      "rewards/cosine_scaled_reward": -0.24510059878230095,
      "rewards/format_reward": 0.0,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.264,
      "grad_norm": 4.391396522521973,
      "kl": 0.851806640625,
      "learning_rate": 9.971955636222684e-07,
      "loss": 0.034,
      "reward": -0.5337588116526604,
      "reward_std": 0.3271815627813339,
      "rewards/cosine_scaled_reward": -0.2668794058263302,
      "rewards/format_reward": 0.0,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.268,
      "grad_norm": 4.296882629394531,
      "kl": 0.892333984375,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.0357,
      "reward": -0.45740216970443726,
      "reward_std": 0.32497797161340714,
      "rewards/cosine_scaled_reward": -0.22870109230279922,
      "rewards/format_reward": 0.0,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.272,
      "grad_norm": 7.224793434143066,
      "kl": 1.29736328125,
      "learning_rate": 9.964516155915151e-07,
      "loss": 0.0519,
      "reward": -0.5055549815297127,
      "reward_std": 0.3318631425499916,
      "rewards/cosine_scaled_reward": -0.25277747586369514,
      "rewards/format_reward": 0.0,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.276,
      "grad_norm": 6.747034072875977,
      "kl": 1.3232421875,
      "learning_rate": 9.960469931131936e-07,
      "loss": 0.0531,
      "reward": -0.4314222186803818,
      "reward_std": 0.31476689875125885,
      "rewards/cosine_scaled_reward": -0.21571110002696514,
      "rewards/format_reward": 0.0,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.28,
      "grad_norm": 5.5595808029174805,
      "kl": 0.8935546875,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.0358,
      "reward": -0.4758576303720474,
      "reward_std": 0.33101003617048264,
      "rewards/cosine_scaled_reward": -0.2379288226366043,
      "rewards/format_reward": 0.0,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.284,
      "grad_norm": 2.4482791423797607,
      "kl": 0.521484375,
      "learning_rate": 9.951725498333448e-07,
      "loss": 0.0209,
      "reward": -0.4491276890039444,
      "reward_std": 0.3567735329270363,
      "rewards/cosine_scaled_reward": -0.2245638445019722,
      "rewards/format_reward": 0.0,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.288,
      "grad_norm": 3.1987600326538086,
      "kl": 0.6240234375,
      "learning_rate": 9.947027716509488e-07,
      "loss": 0.025,
      "reward": -0.43654023110866547,
      "reward_std": 0.3590875416994095,
      "rewards/cosine_scaled_reward": -0.21827011927962303,
      "rewards/format_reward": 0.0,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.292,
      "grad_norm": 4.885537147521973,
      "kl": 1.14599609375,
      "learning_rate": 9.942113192828444e-07,
      "loss": 0.0458,
      "reward": -0.5265215784311295,
      "reward_std": 0.3363535851240158,
      "rewards/cosine_scaled_reward": -0.26326077431440353,
      "rewards/format_reward": 0.0,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.296,
      "grad_norm": 3.4503629207611084,
      "kl": 1.14794921875,
      "learning_rate": 9.93698216681727e-07,
      "loss": 0.0459,
      "reward": -0.4836200848221779,
      "reward_std": 0.33076073229312897,
      "rewards/cosine_scaled_reward": -0.24181004241108894,
      "rewards/format_reward": 0.0,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.3,
      "grad_norm": 3.5954651832580566,
      "kl": 0.6767578125,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.027,
      "reward": -0.5548510551452637,
      "reward_std": 0.3006826713681221,
      "rewards/cosine_scaled_reward": -0.27742552757263184,
      "rewards/format_reward": 0.0,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.304,
      "grad_norm": 2.27148699760437,
      "kl": 0.69970703125,
      "learning_rate": 9.926071618660237e-07,
      "loss": 0.028,
      "reward": -0.5522997975349426,
      "reward_std": 0.32217612117528915,
      "rewards/cosine_scaled_reward": -0.2761498987674713,
      "rewards/format_reward": 0.0,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.308,
      "grad_norm": 2.421114206314087,
      "kl": 0.65234375,
      "learning_rate": 9.9202926282791e-07,
      "loss": 0.0261,
      "reward": -0.5491495952010155,
      "reward_std": 0.33891358226537704,
      "rewards/cosine_scaled_reward": -0.27457480505108833,
      "rewards/format_reward": 0.0,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.312,
      "grad_norm": 2.296977996826172,
      "kl": 0.4833984375,
      "learning_rate": 9.91429819907136e-07,
      "loss": 0.0193,
      "reward": -0.5332002714276314,
      "reward_std": 0.3453890234231949,
      "rewards/cosine_scaled_reward": -0.2666001245379448,
      "rewards/format_reward": 0.0,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.316,
      "grad_norm": 2.351818084716797,
      "kl": 0.5048828125,
      "learning_rate": 9.908088623197048e-07,
      "loss": 0.0202,
      "reward": -0.4974421188235283,
      "reward_std": 0.36291657388210297,
      "rewards/cosine_scaled_reward": -0.24872105196118355,
      "rewards/format_reward": 0.0,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.32,
      "grad_norm": 2.808706521987915,
      "kl": 0.53125,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.0212,
      "reward": -0.5026201903820038,
      "reward_std": 0.30610421299934387,
      "rewards/cosine_scaled_reward": -0.2513100877404213,
      "rewards/format_reward": 0.0,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.324,
      "grad_norm": 2.077920913696289,
      "kl": 0.68994140625,
      "learning_rate": 9.895025252503755e-07,
      "loss": 0.0276,
      "reward": -0.4621705636382103,
      "reward_std": 0.33135028183460236,
      "rewards/cosine_scaled_reward": -0.23108528181910515,
      "rewards/format_reward": 0.0,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.328,
      "grad_norm": 2.951878309249878,
      "kl": 0.6015625,
      "learning_rate": 9.888172094375033e-07,
      "loss": 0.024,
      "reward": -0.5148988738656044,
      "reward_std": 0.3465086743235588,
      "rewards/cosine_scaled_reward": -0.2574494294822216,
      "rewards/format_reward": 0.0,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.332,
      "grad_norm": 2.1016077995300293,
      "kl": 0.36376953125,
      "learning_rate": 9.881105062929221e-07,
      "loss": 0.0145,
      "reward": -0.48821673542261124,
      "reward_std": 0.35235296189785004,
      "rewards/cosine_scaled_reward": -0.24410836026072502,
      "rewards/format_reward": 0.0,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.336,
      "grad_norm": 2.276076555252075,
      "kl": 0.77734375,
      "learning_rate": 9.873824502603459e-07,
      "loss": 0.0311,
      "reward": -0.509700171649456,
      "reward_std": 0.3434828519821167,
      "rewards/cosine_scaled_reward": -0.2548500932753086,
      "rewards/format_reward": 0.0,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.34,
      "grad_norm": 1.9953871965408325,
      "kl": 0.45263671875,
      "learning_rate": 9.866330768241983e-07,
      "loss": 0.0181,
      "reward": -0.5046856477856636,
      "reward_std": 0.3276178315281868,
      "rewards/cosine_scaled_reward": -0.2523428313434124,
      "rewards/format_reward": 0.0,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.344,
      "grad_norm": 5.694060802459717,
      "kl": 1.50390625,
      "learning_rate": 9.85862422507884e-07,
      "loss": 0.06,
      "reward": -0.5268296301364899,
      "reward_std": 0.3594844192266464,
      "rewards/cosine_scaled_reward": -0.26341481506824493,
      "rewards/format_reward": 0.0,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.348,
      "grad_norm": 2.5820319652557373,
      "kl": 0.79931640625,
      "learning_rate": 9.850705248720068e-07,
      "loss": 0.0319,
      "reward": -0.5030437260866165,
      "reward_std": 0.33297523856163025,
      "rewards/cosine_scaled_reward": -0.25152185559272766,
      "rewards/format_reward": 0.0,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.352,
      "grad_norm": 2.748469829559326,
      "kl": 0.8642578125,
      "learning_rate": 9.8425742251254e-07,
      "loss": 0.0346,
      "reward": -0.511917307972908,
      "reward_std": 0.3373011276125908,
      "rewards/cosine_scaled_reward": -0.255958653986454,
      "rewards/format_reward": 0.0,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.356,
      "grad_norm": 2.941894054412842,
      "kl": 1.10400390625,
      "learning_rate": 9.83423155058946e-07,
      "loss": 0.0443,
      "reward": -0.49383244663476944,
      "reward_std": 0.3190907835960388,
      "rewards/cosine_scaled_reward": -0.24691622331738472,
      "rewards/format_reward": 0.0,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.36,
      "grad_norm": 2.5008065700531006,
      "kl": 0.7451171875,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0298,
      "reward": -0.5015105679631233,
      "reward_std": 0.3283078894019127,
      "rewards/cosine_scaled_reward": -0.25075526908040047,
      "rewards/format_reward": 0.0,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.364,
      "grad_norm": 2.775805950164795,
      "kl": 0.8662109375,
      "learning_rate": 9.816912885430258e-07,
      "loss": 0.0347,
      "reward": -0.49317121505737305,
      "reward_std": 0.3281624838709831,
      "rewards/cosine_scaled_reward": -0.24658560752868652,
      "rewards/format_reward": 0.0,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.368,
      "grad_norm": 4.057337284088135,
      "kl": 1.3115234375,
      "learning_rate": 9.807937738894303e-07,
      "loss": 0.0525,
      "reward": -0.4923912510275841,
      "reward_std": 0.334882490336895,
      "rewards/cosine_scaled_reward": -0.24619561806321144,
      "rewards/format_reward": 0.0,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.372,
      "grad_norm": 3.3191726207733154,
      "kl": 1.416015625,
      "learning_rate": 9.798752629550546e-07,
      "loss": 0.0567,
      "reward": -0.4856347441673279,
      "reward_std": 0.3141849860548973,
      "rewards/cosine_scaled_reward": -0.24281736463308334,
      "rewards/format_reward": 0.0,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.376,
      "grad_norm": 38.36699676513672,
      "kl": 3.833984375,
      "learning_rate": 9.78935800506826e-07,
      "loss": 0.1535,
      "reward": -0.5001253262162209,
      "reward_std": 0.34716712683439255,
      "rewards/cosine_scaled_reward": -0.25006265565752983,
      "rewards/format_reward": 0.0,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.38,
      "grad_norm": 2.851670742034912,
      "kl": 0.93017578125,
      "learning_rate": 9.779754323328192e-07,
      "loss": 0.0372,
      "reward": -0.4462156817317009,
      "reward_std": 0.3170738257467747,
      "rewards/cosine_scaled_reward": -0.22310783341526985,
      "rewards/format_reward": 0.0,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.384,
      "grad_norm": 1.903143048286438,
      "kl": 0.662109375,
      "learning_rate": 9.769942052400235e-07,
      "loss": 0.0265,
      "reward": -0.44278524816036224,
      "reward_std": 0.340934194624424,
      "rewards/cosine_scaled_reward": -0.22139262408018112,
      "rewards/format_reward": 0.0,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.388,
      "grad_norm": 2.613619089126587,
      "kl": 1.0009765625,
      "learning_rate": 9.759921670520634e-07,
      "loss": 0.04,
      "reward": -0.4385986104607582,
      "reward_std": 0.3297598212957382,
      "rewards/cosine_scaled_reward": -0.2192993052303791,
      "rewards/format_reward": 0.0,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.392,
      "grad_norm": 2.1393027305603027,
      "kl": 0.84912109375,
      "learning_rate": 9.749693666068663e-07,
      "loss": 0.0339,
      "reward": -0.4335070326924324,
      "reward_std": 0.3084552064538002,
      "rewards/cosine_scaled_reward": -0.2167535126209259,
      "rewards/format_reward": 0.0,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.396,
      "grad_norm": 10.226459503173828,
      "kl": 1.9765625,
      "learning_rate": 9.739258537542835e-07,
      "loss": 0.0791,
      "reward": -0.5120433643460274,
      "reward_std": 0.3308994993567467,
      "rewards/cosine_scaled_reward": -0.2560216821730137,
      "rewards/format_reward": 0.0,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.4,
      "grad_norm": 2.7042365074157715,
      "kl": 1.140625,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0456,
      "reward": -0.5387645438313484,
      "reward_std": 0.32419781386852264,
      "rewards/cosine_scaled_reward": -0.2693822719156742,
      "rewards/format_reward": 0.0,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.404,
      "grad_norm": 3.3440866470336914,
      "kl": 1.158203125,
      "learning_rate": 9.717768952713511e-07,
      "loss": 0.0464,
      "reward": -0.479642316699028,
      "reward_std": 0.3374394252896309,
      "rewards/cosine_scaled_reward": -0.2398211695253849,
      "rewards/format_reward": 0.0,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.408,
      "grad_norm": 2.1483707427978516,
      "kl": 0.55859375,
      "learning_rate": 9.706715543782064e-07,
      "loss": 0.0224,
      "reward": -0.4488200396299362,
      "reward_std": 0.3361233174800873,
      "rewards/cosine_scaled_reward": -0.2244100198149681,
      "rewards/format_reward": 0.0,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.412,
      "grad_norm": 4.173567771911621,
      "kl": 1.900390625,
      "learning_rate": 9.695457105469804e-07,
      "loss": 0.0759,
      "reward": -0.4979688450694084,
      "reward_std": 0.35078077018260956,
      "rewards/cosine_scaled_reward": -0.2489844374358654,
      "rewards/format_reward": 0.0,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.416,
      "grad_norm": 5.119884490966797,
      "kl": 1.611328125,
      "learning_rate": 9.683994186497132e-07,
      "loss": 0.0644,
      "reward": -0.513933926820755,
      "reward_std": 0.3170707896351814,
      "rewards/cosine_scaled_reward": -0.2569669596850872,
      "rewards/format_reward": 0.0,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.42,
      "grad_norm": 2.8145992755889893,
      "kl": 1.466796875,
      "learning_rate": 9.672327345550543e-07,
      "loss": 0.0587,
      "reward": -0.47269363701343536,
      "reward_std": 0.31501560658216476,
      "rewards/cosine_scaled_reward": -0.23634683340787888,
      "rewards/format_reward": 0.0,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.424,
      "grad_norm": 2.3274426460266113,
      "kl": 0.59033203125,
      "learning_rate": 9.66045715125541e-07,
      "loss": 0.0236,
      "reward": -0.44968922436237335,
      "reward_std": 0.3498781695961952,
      "rewards/cosine_scaled_reward": -0.22484461963176727,
      "rewards/format_reward": 0.0,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.428,
      "grad_norm": 2.2112016677856445,
      "kl": 1.126953125,
      "learning_rate": 9.648384182148252e-07,
      "loss": 0.0451,
      "reward": -0.5002073347568512,
      "reward_std": 0.34406865388154984,
      "rewards/cosine_scaled_reward": -0.2501036673784256,
      "rewards/format_reward": 0.0,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.432,
      "grad_norm": 2.4664499759674072,
      "kl": 1.0986328125,
      "learning_rate": 9.636109026648554e-07,
      "loss": 0.0439,
      "reward": -0.49009862542152405,
      "reward_std": 0.3558028042316437,
      "rewards/cosine_scaled_reward": -0.24504930526018143,
      "rewards/format_reward": 0.0,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.436,
      "grad_norm": 2.3740482330322266,
      "kl": 0.67578125,
      "learning_rate": 9.623632283030077e-07,
      "loss": 0.027,
      "reward": -0.4631711468100548,
      "reward_std": 0.34275270998477936,
      "rewards/cosine_scaled_reward": -0.2315855734050274,
      "rewards/format_reward": 0.0,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.44,
      "grad_norm": 2.9116501808166504,
      "kl": 1.1826171875,
      "learning_rate": 9.610954559391704e-07,
      "loss": 0.0473,
      "reward": -0.444116935133934,
      "reward_std": 0.37212707847356796,
      "rewards/cosine_scaled_reward": -0.2220584638416767,
      "rewards/format_reward": 0.0,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.444,
      "grad_norm": 2.24743390083313,
      "kl": 0.638671875,
      "learning_rate": 9.598076473627796e-07,
      "loss": 0.0255,
      "reward": -0.46286992728710175,
      "reward_std": 0.3208693787455559,
      "rewards/cosine_scaled_reward": -0.23143497854471207,
      "rewards/format_reward": 0.0,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.448,
      "grad_norm": 3.138840913772583,
      "kl": 1.14404296875,
      "learning_rate": 9.58499865339809e-07,
      "loss": 0.0458,
      "reward": -0.4803452715277672,
      "reward_std": 0.3449332043528557,
      "rewards/cosine_scaled_reward": -0.2401726357638836,
      "rewards/format_reward": 0.0,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.452,
      "grad_norm": 2.7688963413238525,
      "kl": 0.9462890625,
      "learning_rate": 9.571721736097088e-07,
      "loss": 0.0379,
      "reward": -0.4440384730696678,
      "reward_std": 0.3389856517314911,
      "rewards/cosine_scaled_reward": -0.2220192365348339,
      "rewards/format_reward": 0.0,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.456,
      "grad_norm": 2.7298948764801025,
      "kl": 1.3583984375,
      "learning_rate": 9.55824636882301e-07,
      "loss": 0.0544,
      "reward": -0.40611616894602776,
      "reward_std": 0.3120696693658829,
      "rewards/cosine_scaled_reward": -0.20305808261036873,
      "rewards/format_reward": 0.0,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.46,
      "grad_norm": 2.628330945968628,
      "kl": 0.84521484375,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0338,
      "reward": -0.41812988370656967,
      "reward_std": 0.33337801694869995,
      "rewards/cosine_scaled_reward": -0.20906493440270424,
      "rewards/format_reward": 0.0,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.464,
      "grad_norm": 2.21708607673645,
      "kl": 1.125,
      "learning_rate": 9.530702921077358e-07,
      "loss": 0.0451,
      "reward": -0.4452592432498932,
      "reward_std": 0.34758392721414566,
      "rewards/cosine_scaled_reward": -0.2226296216249466,
      "rewards/format_reward": 0.0,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.468,
      "grad_norm": 3.4151782989501953,
      "kl": 1.5390625,
      "learning_rate": 9.516636183034564e-07,
      "loss": 0.0617,
      "reward": -0.5043663010001183,
      "reward_std": 0.3056981936097145,
      "rewards/cosine_scaled_reward": -0.25218314677476883,
      "rewards/format_reward": 0.0,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.472,
      "grad_norm": 2.8809969425201416,
      "kl": 1.498046875,
      "learning_rate": 9.502373679810839e-07,
      "loss": 0.0599,
      "reward": -0.44362927228212357,
      "reward_std": 0.32765333354473114,
      "rewards/cosine_scaled_reward": -0.22181464359164238,
      "rewards/format_reward": 0.0,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.476,
      "grad_norm": 3.092552661895752,
      "kl": 1.6640625,
      "learning_rate": 9.487916106540465e-07,
      "loss": 0.0665,
      "reward": -0.49818655103445053,
      "reward_std": 0.3495415672659874,
      "rewards/cosine_scaled_reward": -0.24909326806664467,
      "rewards/format_reward": 0.0,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.48,
      "grad_norm": 3.2943530082702637,
      "kl": 2.07421875,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0829,
      "reward": -0.4802135229110718,
      "reward_std": 0.3453461080789566,
      "rewards/cosine_scaled_reward": -0.24010677635669708,
      "rewards/format_reward": 0.0,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.484,
      "grad_norm": 2.5681769847869873,
      "kl": 1.505859375,
      "learning_rate": 9.458418577899774e-07,
      "loss": 0.0603,
      "reward": -0.5175792872905731,
      "reward_std": 0.35768260806798935,
      "rewards/cosine_scaled_reward": -0.25878964737057686,
      "rewards/format_reward": 0.0,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.488,
      "grad_norm": 2.9190571308135986,
      "kl": 1.57373046875,
      "learning_rate": 9.443380060197385e-07,
      "loss": 0.063,
      "reward": -0.46548449248075485,
      "reward_std": 0.35348332673311234,
      "rewards/cosine_scaled_reward": -0.23274223506450653,
      "rewards/format_reward": 0.0,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.492,
      "grad_norm": 2.435157537460327,
      "kl": 1.0654296875,
      "learning_rate": 9.428149347714143e-07,
      "loss": 0.0427,
      "reward": -0.4281177818775177,
      "reward_std": 0.3503784313797951,
      "rewards/cosine_scaled_reward": -0.21405889093875885,
      "rewards/format_reward": 0.0,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.496,
      "grad_norm": 3.1375350952148438,
      "kl": 1.5625,
      "learning_rate": 9.412727182773486e-07,
      "loss": 0.0624,
      "reward": -0.4667646959424019,
      "reward_std": 0.3501163199543953,
      "rewards/cosine_scaled_reward": -0.23338234052062035,
      "rewards/format_reward": 0.0,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.5,
      "grad_norm": 2.1935606002807617,
      "kl": 1.3427734375,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.0537,
      "reward": -0.4283955693244934,
      "reward_std": 0.34814615547657013,
      "rewards/cosine_scaled_reward": -0.2141977809369564,
      "rewards/format_reward": 0.0,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.504,
      "grad_norm": 2.727754592895508,
      "kl": 1.35546875,
      "learning_rate": 9.381311511432658e-07,
      "loss": 0.0543,
      "reward": -0.4584430381655693,
      "reward_std": 0.3318573832511902,
      "rewards/cosine_scaled_reward": -0.22922151535749435,
      "rewards/format_reward": 0.0,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.508,
      "grad_norm": 2.9863674640655518,
      "kl": 1.509765625,
      "learning_rate": 9.36531953618799e-07,
      "loss": 0.0603,
      "reward": -0.4794049710035324,
      "reward_std": 0.3224741891026497,
      "rewards/cosine_scaled_reward": -0.2397024855017662,
      "rewards/format_reward": 0.0,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.512,
      "grad_norm": 3.0583863258361816,
      "kl": 1.5751953125,
      "learning_rate": 9.34913917072228e-07,
      "loss": 0.0631,
      "reward": -0.3896471783518791,
      "reward_std": 0.32155635207891464,
      "rewards/cosine_scaled_reward": -0.19482359662652016,
      "rewards/format_reward": 0.0,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.516,
      "grad_norm": 11.888484001159668,
      "kl": 2.1806640625,
      "learning_rate": 9.332771203643714e-07,
      "loss": 0.0874,
      "reward": -0.46486661583185196,
      "reward_std": 0.34625906497240067,
      "rewards/cosine_scaled_reward": -0.23243330791592598,
      "rewards/format_reward": 0.0,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.52,
      "grad_norm": 3.14744234085083,
      "kl": 1.1103515625,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.0445,
      "reward": -0.4691261351108551,
      "reward_std": 0.3357261121273041,
      "rewards/cosine_scaled_reward": -0.23456306010484695,
      "rewards/format_reward": 0.0,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.524,
      "grad_norm": 2.6933717727661133,
      "kl": 1.76171875,
      "learning_rate": 9.299475664759068e-07,
      "loss": 0.0705,
      "reward": -0.5458347946405411,
      "reward_std": 0.3296028599143028,
      "rewards/cosine_scaled_reward": -0.27291740477085114,
      "rewards/format_reward": 0.0,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.528,
      "grad_norm": 2.695984363555908,
      "kl": 1.2666015625,
      "learning_rate": 9.282549715730579e-07,
      "loss": 0.0506,
      "reward": -0.43337278813123703,
      "reward_std": 0.3223467916250229,
      "rewards/cosine_scaled_reward": -0.2166864052414894,
      "rewards/format_reward": 0.0,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.532,
      "grad_norm": 2.1844236850738525,
      "kl": 1.072265625,
      "learning_rate": 9.265439410565328e-07,
      "loss": 0.0429,
      "reward": -0.47815513610839844,
      "reward_std": 0.33408980816602707,
      "rewards/cosine_scaled_reward": -0.23907756060361862,
      "rewards/format_reward": 0.0,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.536,
      "grad_norm": 2.6240434646606445,
      "kl": 0.998046875,
      "learning_rate": 9.248145583195447e-07,
      "loss": 0.0399,
      "reward": -0.3596036769449711,
      "reward_std": 0.3202332779765129,
      "rewards/cosine_scaled_reward": -0.17980184871703386,
      "rewards/format_reward": 0.0,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.54,
      "grad_norm": 2.413489580154419,
      "kl": 1.515625,
      "learning_rate": 9.230669076497687e-07,
      "loss": 0.0607,
      "reward": -0.3980662524700165,
      "reward_std": 0.3146558068692684,
      "rewards/cosine_scaled_reward": -0.19903312623500824,
      "rewards/format_reward": 0.0,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.544,
      "grad_norm": 2.5466983318328857,
      "kl": 1.421875,
      "learning_rate": 9.213010742252327e-07,
      "loss": 0.0568,
      "reward": -0.4567502960562706,
      "reward_std": 0.36093486845493317,
      "rewards/cosine_scaled_reward": -0.2283751629292965,
      "rewards/format_reward": 0.0,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.548,
      "grad_norm": 2.670454263687134,
      "kl": 1.63671875,
      "learning_rate": 9.195171441101668e-07,
      "loss": 0.0655,
      "reward": -0.48265285044908524,
      "reward_std": 0.33601198345422745,
      "rewards/cosine_scaled_reward": -0.24132642522454262,
      "rewards/format_reward": 0.0,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.552,
      "grad_norm": 3.4489877223968506,
      "kl": 1.39453125,
      "learning_rate": 9.177152042508077e-07,
      "loss": 0.0558,
      "reward": -0.40766458958387375,
      "reward_std": 0.34357643127441406,
      "rewards/cosine_scaled_reward": -0.20383229106664658,
      "rewards/format_reward": 0.0,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.556,
      "grad_norm": 2.18890118598938,
      "kl": 1.30859375,
      "learning_rate": 9.158953424711624e-07,
      "loss": 0.0523,
      "reward": -0.4143947809934616,
      "reward_std": 0.323918879032135,
      "rewards/cosine_scaled_reward": -0.2071974016726017,
      "rewards/format_reward": 0.0,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.56,
      "grad_norm": 2.5627028942108154,
      "kl": 1.34423828125,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.0538,
      "reward": -0.4485241174697876,
      "reward_std": 0.3278198316693306,
      "rewards/cosine_scaled_reward": -0.2242620587348938,
      "rewards/format_reward": 0.0,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.564,
      "grad_norm": 2.086371660232544,
      "kl": 1.2802734375,
      "learning_rate": 9.122022088101613e-07,
      "loss": 0.0512,
      "reward": -0.32855524495244026,
      "reward_std": 0.33061159402132034,
      "rewards/cosine_scaled_reward": -0.16427762433886528,
      "rewards/format_reward": 0.0,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.568,
      "grad_norm": 2.45231556892395,
      "kl": 1.580078125,
      "learning_rate": 9.103291169269299e-07,
      "loss": 0.0632,
      "reward": -0.4703398421406746,
      "reward_std": 0.2972045987844467,
      "rewards/cosine_scaled_reward": -0.2351699210703373,
      "rewards/format_reward": 0.0,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.572,
      "grad_norm": 2.864070415496826,
      "kl": 1.984375,
      "learning_rate": 9.084384631108882e-07,
      "loss": 0.0794,
      "reward": -0.41980744898319244,
      "reward_std": 0.34404993802309036,
      "rewards/cosine_scaled_reward": -0.20990372076630592,
      "rewards/format_reward": 0.0,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.576,
      "grad_norm": 2.412257194519043,
      "kl": 1.544921875,
      "learning_rate": 9.065303395098358e-07,
      "loss": 0.0618,
      "reward": -0.43455804139375687,
      "reward_std": 0.32647445797920227,
      "rewards/cosine_scaled_reward": -0.21727901697158813,
      "rewards/format_reward": 0.0,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.58,
      "grad_norm": 2.952892780303955,
      "kl": 2.0595703125,
      "learning_rate": 9.046048391230247e-07,
      "loss": 0.0824,
      "reward": -0.4728480279445648,
      "reward_std": 0.33887017518281937,
      "rewards/cosine_scaled_reward": -0.2364240102469921,
      "rewards/format_reward": 0.0,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.584,
      "grad_norm": 2.3727328777313232,
      "kl": 1.7255859375,
      "learning_rate": 9.026620557966279e-07,
      "loss": 0.0692,
      "reward": -0.42372531443834305,
      "reward_std": 0.3417205289006233,
      "rewards/cosine_scaled_reward": -0.21186266466975212,
      "rewards/format_reward": 0.0,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.588,
      "grad_norm": 2.953756809234619,
      "kl": 2.353515625,
      "learning_rate": 9.007020842191634e-07,
      "loss": 0.0943,
      "reward": -0.43578075617551804,
      "reward_std": 0.34062809497117996,
      "rewards/cosine_scaled_reward": -0.21789037808775902,
      "rewards/format_reward": 0.0,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.592,
      "grad_norm": 2.5953478813171387,
      "kl": 1.38671875,
      "learning_rate": 8.987250199168808e-07,
      "loss": 0.0555,
      "reward": -0.4190576896071434,
      "reward_std": 0.34895560145378113,
      "rewards/cosine_scaled_reward": -0.2095288448035717,
      "rewards/format_reward": 0.0,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.596,
      "grad_norm": 2.4279496669769287,
      "kl": 1.62890625,
      "learning_rate": 8.967309592491052e-07,
      "loss": 0.0651,
      "reward": -0.4394699037075043,
      "reward_std": 0.3207908198237419,
      "rewards/cosine_scaled_reward": -0.21973494067788124,
      "rewards/format_reward": 0.0,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.6,
      "grad_norm": 2.974292516708374,
      "kl": 1.892578125,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.0757,
      "reward": -0.4797021597623825,
      "reward_std": 0.32065775990486145,
      "rewards/cosine_scaled_reward": -0.23985107988119125,
      "rewards/format_reward": 0.0,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.604,
      "grad_norm": 2.51299786567688,
      "kl": 0.87890625,
      "learning_rate": 8.926922383915315e-07,
      "loss": 0.0351,
      "reward": -0.4108778163790703,
      "reward_std": 0.326105996966362,
      "rewards/cosine_scaled_reward": -0.20543890818953514,
      "rewards/format_reward": 0.0,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.608,
      "grad_norm": 2.723388195037842,
      "kl": 1.2294921875,
      "learning_rate": 8.906477750432903e-07,
      "loss": 0.0492,
      "reward": -0.4178111329674721,
      "reward_std": 0.32895463705062866,
      "rewards/cosine_scaled_reward": -0.20890555530786514,
      "rewards/format_reward": 0.0,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.612,
      "grad_norm": 2.4097025394439697,
      "kl": 1.650390625,
      "learning_rate": 8.88586709003076e-07,
      "loss": 0.0659,
      "reward": -0.4825671687722206,
      "reward_std": 0.33990373462438583,
      "rewards/cosine_scaled_reward": -0.2412835843861103,
      "rewards/format_reward": 0.0,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.616,
      "grad_norm": 2.114370107650757,
      "kl": 1.390625,
      "learning_rate": 8.865091407243394e-07,
      "loss": 0.0556,
      "reward": -0.42671380192041397,
      "reward_std": 0.32950445264577866,
      "rewards/cosine_scaled_reward": -0.21335690841078758,
      "rewards/format_reward": 0.0,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.62,
      "grad_norm": 3.1770823001861572,
      "kl": 1.4287109375,
      "learning_rate": 8.844151714648274e-07,
      "loss": 0.0572,
      "reward": -0.4250905141234398,
      "reward_std": 0.3110942989587784,
      "rewards/cosine_scaled_reward": -0.2125452570617199,
      "rewards/format_reward": 0.0,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.624,
      "grad_norm": 2.6063926219940186,
      "kl": 1.796875,
      "learning_rate": 8.823049032816478e-07,
      "loss": 0.0719,
      "reward": -0.4206129387021065,
      "reward_std": 0.33140094578266144,
      "rewards/cosine_scaled_reward": -0.21030646935105324,
      "rewards/format_reward": 0.0,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.628,
      "grad_norm": 2.482637643814087,
      "kl": 1.525390625,
      "learning_rate": 8.801784390262943e-07,
      "loss": 0.061,
      "reward": -0.36781868524849415,
      "reward_std": 0.3281563073396683,
      "rewards/cosine_scaled_reward": -0.18390934821218252,
      "rewards/format_reward": 0.0,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.632,
      "grad_norm": 2.7100956439971924,
      "kl": 1.7861328125,
      "learning_rate": 8.780358823396352e-07,
      "loss": 0.0715,
      "reward": -0.3854188397526741,
      "reward_std": 0.31897617131471634,
      "rewards/cosine_scaled_reward": -0.19270941987633705,
      "rewards/format_reward": 0.0,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.636,
      "grad_norm": 2.3493990898132324,
      "kl": 1.859375,
      "learning_rate": 8.758773376468604e-07,
      "loss": 0.0746,
      "reward": -0.41636481136083603,
      "reward_std": 0.3308830112218857,
      "rewards/cosine_scaled_reward": -0.20818240568041801,
      "rewards/format_reward": 0.0,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.64,
      "grad_norm": 2.429762840270996,
      "kl": 1.78125,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.0714,
      "reward": -0.44961177557706833,
      "reward_std": 0.3425107002258301,
      "rewards/cosine_scaled_reward": -0.22480589523911476,
      "rewards/format_reward": 0.0,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.644,
      "grad_norm": 2.6372933387756348,
      "kl": 1.6474609375,
      "learning_rate": 8.715127058347614e-07,
      "loss": 0.066,
      "reward": -0.4204000309109688,
      "reward_std": 0.3256704956293106,
      "rewards/cosine_scaled_reward": -0.2102000191807747,
      "rewards/format_reward": 0.0,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.648,
      "grad_norm": 2.2505483627319336,
      "kl": 1.576171875,
      "learning_rate": 8.693068314414344e-07,
      "loss": 0.063,
      "reward": -0.4363863915205002,
      "reward_std": 0.3367513567209244,
      "rewards/cosine_scaled_reward": -0.2181931994855404,
      "rewards/format_reward": 0.0,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.652,
      "grad_norm": 2.781273603439331,
      "kl": 1.4375,
      "learning_rate": 8.670853944836176e-07,
      "loss": 0.0576,
      "reward": -0.44805190712213516,
      "reward_std": 0.3117773234844208,
      "rewards/cosine_scaled_reward": -0.22402595356106758,
      "rewards/format_reward": 0.0,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.656,
      "grad_norm": 2.573030710220337,
      "kl": 1.21435546875,
      "learning_rate": 8.648485032310144e-07,
      "loss": 0.0487,
      "reward": -0.40324684232473373,
      "reward_std": 0.3176472932100296,
      "rewards/cosine_scaled_reward": -0.20162343233823776,
      "rewards/format_reward": 0.0,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.66,
      "grad_norm": 4.171741485595703,
      "kl": 2.3125,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.0925,
      "reward": -0.4968671426177025,
      "reward_std": 0.3204089626669884,
      "rewards/cosine_scaled_reward": -0.24843357503414154,
      "rewards/format_reward": 0.0,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1528.702392578125,
      "epoch": 0.664,
      "grad_norm": 2.1756961345672607,
      "kl": 1.7578125,
      "learning_rate": 8.603287946810513e-07,
      "loss": 0.0706,
      "reward": -0.4272613450884819,
      "reward_std": 0.32390115410089493,
      "rewards/cosine_scaled_reward": -0.21363067999482155,
      "rewards/format_reward": 0.0,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.668,
      "grad_norm": 2.2742207050323486,
      "kl": 1.912109375,
      "learning_rate": 8.580461976679099e-07,
      "loss": 0.0763,
      "reward": -0.3418873958289623,
      "reward_std": 0.29924022778868675,
      "rewards/cosine_scaled_reward": -0.17094369884580374,
      "rewards/format_reward": 0.0,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.672,
      "grad_norm": 2.1837146282196045,
      "kl": 1.3330078125,
      "learning_rate": 8.557485869176825e-07,
      "loss": 0.0533,
      "reward": -0.4050525277853012,
      "reward_std": 0.3251590058207512,
      "rewards/cosine_scaled_reward": -0.2025262601673603,
      "rewards/format_reward": 0.0,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.676,
      "grad_norm": 2.1009020805358887,
      "kl": 1.9326171875,
      "learning_rate": 8.534360744126753e-07,
      "loss": 0.0774,
      "reward": -0.4387947544455528,
      "reward_std": 0.3307826817035675,
      "rewards/cosine_scaled_reward": -0.21939736977219582,
      "rewards/format_reward": 0.0,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.68,
      "grad_norm": 2.515617609024048,
      "kl": 1.884765625,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.0754,
      "reward": -0.41566915810108185,
      "reward_std": 0.34893494844436646,
      "rewards/cosine_scaled_reward": -0.20783457532525063,
      "rewards/format_reward": 0.0,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.684,
      "grad_norm": 2.3045356273651123,
      "kl": 1.5078125,
      "learning_rate": 8.487667956935087e-07,
      "loss": 0.0604,
      "reward": -0.3871946483850479,
      "reward_std": 0.3363000229001045,
      "rewards/cosine_scaled_reward": -0.19359732419252396,
      "rewards/format_reward": 0.0,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.688,
      "grad_norm": 2.1517364978790283,
      "kl": 1.4169921875,
      "learning_rate": 8.464102570534061e-07,
      "loss": 0.0567,
      "reward": -0.41495678573846817,
      "reward_std": 0.33959241211414337,
      "rewards/cosine_scaled_reward": -0.20747840031981468,
      "rewards/format_reward": 0.0,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.692,
      "grad_norm": 2.4767415523529053,
      "kl": 1.5654296875,
      "learning_rate": 8.440392717955475e-07,
      "loss": 0.0626,
      "reward": -0.3259017579257488,
      "reward_std": 0.3448467329144478,
      "rewards/cosine_scaled_reward": -0.16295087756589055,
      "rewards/format_reward": 0.0,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.696,
      "grad_norm": 2.1803934574127197,
      "kl": 1.5986328125,
      "learning_rate": 8.416539554784089e-07,
      "loss": 0.0639,
      "reward": -0.45371130108833313,
      "reward_std": 0.3770594820380211,
      "rewards/cosine_scaled_reward": -0.22685565054416656,
      "rewards/format_reward": 0.0,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.7,
      "grad_norm": 2.146838426589966,
      "kl": 1.3212890625,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.053,
      "reward": -0.39382801204919815,
      "reward_std": 0.3155653551220894,
      "rewards/cosine_scaled_reward": -0.19691400602459908,
      "rewards/format_reward": 0.0,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.704,
      "grad_norm": 2.3939132690429688,
      "kl": 1.498046875,
      "learning_rate": 8.368407953869103e-07,
      "loss": 0.06,
      "reward": -0.397233285009861,
      "reward_std": 0.3429732918739319,
      "rewards/cosine_scaled_reward": -0.1986166313290596,
      "rewards/format_reward": 0.0,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.708,
      "grad_norm": 2.2279624938964844,
      "kl": 1.3759765625,
      "learning_rate": 8.344131861991828e-07,
      "loss": 0.0551,
      "reward": -0.41151023656129837,
      "reward_std": 0.3277590796351433,
      "rewards/cosine_scaled_reward": -0.2057551108300686,
      "rewards/format_reward": 0.0,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.712,
      "grad_norm": 2.5055384635925293,
      "kl": 1.341796875,
      "learning_rate": 8.319717151140072e-07,
      "loss": 0.0537,
      "reward": -0.4148360714316368,
      "reward_std": 0.3054031655192375,
      "rewards/cosine_scaled_reward": -0.2074180319905281,
      "rewards/format_reward": 0.0,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.716,
      "grad_norm": 2.605672836303711,
      "kl": 2.421875,
      "learning_rate": 8.295165011252396e-07,
      "loss": 0.0969,
      "reward": -0.49764253944158554,
      "reward_std": 0.34468474239110947,
      "rewards/cosine_scaled_reward": -0.24882125481963158,
      "rewards/format_reward": 0.0,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.72,
      "grad_norm": 1.8612443208694458,
      "kl": 1.958984375,
      "learning_rate": 8.270476638965461e-07,
      "loss": 0.0784,
      "reward": -0.41104499250650406,
      "reward_std": 0.32857123762369156,
      "rewards/cosine_scaled_reward": -0.20552249625325203,
      "rewards/format_reward": 0.0,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.724,
      "grad_norm": 2.20760178565979,
      "kl": 1.4267578125,
      "learning_rate": 8.245653237555705e-07,
      "loss": 0.0571,
      "reward": -0.4070161208510399,
      "reward_std": 0.29896606504917145,
      "rewards/cosine_scaled_reward": -0.20350806042551994,
      "rewards/format_reward": 0.0,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.728,
      "grad_norm": 2.527832269668579,
      "kl": 1.3251953125,
      "learning_rate": 8.220696016880687e-07,
      "loss": 0.053,
      "reward": -0.40310006588697433,
      "reward_std": 0.33485615253448486,
      "rewards/cosine_scaled_reward": -0.20155002549290657,
      "rewards/format_reward": 0.0,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.732,
      "grad_norm": 2.0901362895965576,
      "kl": 1.25,
      "learning_rate": 8.195606193320136e-07,
      "loss": 0.0499,
      "reward": -0.39147457480430603,
      "reward_std": 0.3105906918644905,
      "rewards/cosine_scaled_reward": -0.19573728740215302,
      "rewards/format_reward": 0.0,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.736,
      "grad_norm": 2.0712454319000244,
      "kl": 1.3271484375,
      "learning_rate": 8.170384989716657e-07,
      "loss": 0.053,
      "reward": -0.36338385939598083,
      "reward_std": 0.29373297840356827,
      "rewards/cosine_scaled_reward": -0.18169192969799042,
      "rewards/format_reward": 0.0,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.74,
      "grad_norm": 4.567477226257324,
      "kl": 2.91015625,
      "learning_rate": 8.145033635316128e-07,
      "loss": 0.1167,
      "reward": -0.46033478528261185,
      "reward_std": 0.309500552713871,
      "rewards/cosine_scaled_reward": -0.23016740009188652,
      "rewards/format_reward": 0.0,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.744,
      "grad_norm": 2.8025710582733154,
      "kl": 1.982421875,
      "learning_rate": 8.119553365707802e-07,
      "loss": 0.0793,
      "reward": -0.3399934060871601,
      "reward_std": 0.3289627507328987,
      "rewards/cosine_scaled_reward": -0.16999670304358006,
      "rewards/format_reward": 0.0,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.748,
      "grad_norm": 2.41241192817688,
      "kl": 1.6513671875,
      "learning_rate": 8.093945422764069e-07,
      "loss": 0.0663,
      "reward": -0.4002522900700569,
      "reward_std": 0.3234091103076935,
      "rewards/cosine_scaled_reward": -0.20012613758444786,
      "rewards/format_reward": 0.0,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.752,
      "grad_norm": 3.6371164321899414,
      "kl": 2.470703125,
      "learning_rate": 8.068211054579943e-07,
      "loss": 0.0988,
      "reward": -0.44175921380519867,
      "reward_std": 0.33701298385858536,
      "rewards/cosine_scaled_reward": -0.22087960690259933,
      "rewards/format_reward": 0.0,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.756,
      "grad_norm": 2.704362154006958,
      "kl": 1.71875,
      "learning_rate": 8.04235151541222e-07,
      "loss": 0.0686,
      "reward": -0.3934633806347847,
      "reward_std": 0.31845808029174805,
      "rewards/cosine_scaled_reward": -0.19673169776797295,
      "rewards/format_reward": 0.0,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.76,
      "grad_norm": 2.5518999099731445,
      "kl": 1.865234375,
      "learning_rate": 8.01636806561836e-07,
      "loss": 0.0746,
      "reward": -0.48456476628780365,
      "reward_std": 0.3398968055844307,
      "rewards/cosine_scaled_reward": -0.24228239431977272,
      "rewards/format_reward": 0.0,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.764,
      "grad_norm": 4.733001232147217,
      "kl": 2.0537109375,
      "learning_rate": 7.990261971595048e-07,
      "loss": 0.0822,
      "reward": -0.44671063870191574,
      "reward_std": 0.32652025669813156,
      "rewards/cosine_scaled_reward": -0.22335530444979668,
      "rewards/format_reward": 0.0,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.768,
      "grad_norm": 2.217525005340576,
      "kl": 1.72265625,
      "learning_rate": 7.964034505716476e-07,
      "loss": 0.0689,
      "reward": -0.38292936980724335,
      "reward_std": 0.3729139119386673,
      "rewards/cosine_scaled_reward": -0.19146469235420227,
      "rewards/format_reward": 0.0,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.772,
      "grad_norm": 2.3045313358306885,
      "kl": 1.0576171875,
      "learning_rate": 7.93768694627233e-07,
      "loss": 0.0423,
      "reward": -0.36335285753011703,
      "reward_std": 0.3274284452199936,
      "rewards/cosine_scaled_reward": -0.18167642876505852,
      "rewards/format_reward": 0.0,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.776,
      "grad_norm": 2.220212936401367,
      "kl": 1.974609375,
      "learning_rate": 7.911220577405484e-07,
      "loss": 0.0791,
      "reward": -0.41132358461618423,
      "reward_std": 0.33213579654693604,
      "rewards/cosine_scaled_reward": -0.20566179975867271,
      "rewards/format_reward": 0.0,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.78,
      "grad_norm": 2.872774124145508,
      "kl": 2.04296875,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.0819,
      "reward": -0.41410720348358154,
      "reward_std": 0.3132774606347084,
      "rewards/cosine_scaled_reward": -0.20705359801650047,
      "rewards/format_reward": 0.0,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.784,
      "grad_norm": 3.354735851287842,
      "kl": 1.2236328125,
      "learning_rate": 7.857936576865356e-07,
      "loss": 0.0489,
      "reward": -0.34651997685432434,
      "reward_std": 0.27611755579710007,
      "rewards/cosine_scaled_reward": -0.17325998842716217,
      "rewards/format_reward": 0.0,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.788,
      "grad_norm": 2.019547939300537,
      "kl": 1.03515625,
      "learning_rate": 7.831121542179086e-07,
      "loss": 0.0414,
      "reward": -0.36961859464645386,
      "reward_std": 0.3042915388941765,
      "rewards/cosine_scaled_reward": -0.18480929359793663,
      "rewards/format_reward": 0.0,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.792,
      "grad_norm": 2.245211601257324,
      "kl": 1.408203125,
      "learning_rate": 7.804192891917571e-07,
      "loss": 0.0564,
      "reward": -0.3812807723879814,
      "reward_std": 0.30970512330532074,
      "rewards/cosine_scaled_reward": -0.190640389919281,
      "rewards/format_reward": 0.0,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.796,
      "grad_norm": 2.0456931591033936,
      "kl": 1.673828125,
      "learning_rate": 7.777151938545235e-07,
      "loss": 0.067,
      "reward": -0.38433101773262024,
      "reward_std": 0.3408072590827942,
      "rewards/cosine_scaled_reward": -0.19216550886631012,
      "rewards/format_reward": 0.0,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.8,
      "grad_norm": 6.253657817840576,
      "kl": 1.48876953125,
      "learning_rate": 7.75e-07,
      "loss": 0.0595,
      "reward": -0.3863793611526489,
      "reward_std": 0.3155966252088547,
      "rewards/cosine_scaled_reward": -0.19318969175219536,
      "rewards/format_reward": 0.0,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.804,
      "grad_norm": 2.2331368923187256,
      "kl": 1.96484375,
      "learning_rate": 7.72273839962904e-07,
      "loss": 0.0786,
      "reward": -0.41171175986528397,
      "reward_std": 0.34651194512844086,
      "rewards/cosine_scaled_reward": -0.20585588365793228,
      "rewards/format_reward": 0.0,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.808,
      "grad_norm": 2.1702663898468018,
      "kl": 1.296875,
      "learning_rate": 7.695368466124296e-07,
      "loss": 0.0519,
      "reward": -0.38244833052158356,
      "reward_std": 0.34267907589673996,
      "rewards/cosine_scaled_reward": -0.19122417271137238,
      "rewards/format_reward": 0.0,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.812,
      "grad_norm": 2.0549793243408203,
      "kl": 1.4345703125,
      "learning_rate": 7.667891533457718e-07,
      "loss": 0.0573,
      "reward": -0.4125688225030899,
      "reward_std": 0.33167801052331924,
      "rewards/cosine_scaled_reward": -0.20628441870212555,
      "rewards/format_reward": 0.0,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.816,
      "grad_norm": 2.7793009281158447,
      "kl": 1.958984375,
      "learning_rate": 7.640308940816239e-07,
      "loss": 0.0784,
      "reward": -0.45417842268943787,
      "reward_std": 0.3453121930360794,
      "rewards/cosine_scaled_reward": -0.22708921134471893,
      "rewards/format_reward": 0.0,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.82,
      "grad_norm": 8.324098587036133,
      "kl": 2.23388671875,
      "learning_rate": 7.612622032536507e-07,
      "loss": 0.0895,
      "reward": -0.3973395526409149,
      "reward_std": 0.32590440660715103,
      "rewards/cosine_scaled_reward": -0.19866977632045746,
      "rewards/format_reward": 0.0,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.824,
      "grad_norm": 2.22940993309021,
      "kl": 1.51171875,
      "learning_rate": 7.584832158039378e-07,
      "loss": 0.0605,
      "reward": -0.4044779762625694,
      "reward_std": 0.33285098522901535,
      "rewards/cosine_scaled_reward": -0.2022389993071556,
      "rewards/format_reward": 0.0,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.828,
      "grad_norm": 2.824735164642334,
      "kl": 1.310546875,
      "learning_rate": 7.556940671764124e-07,
      "loss": 0.0524,
      "reward": -0.4486440494656563,
      "reward_std": 0.33797865360975266,
      "rewards/cosine_scaled_reward": -0.22432202845811844,
      "rewards/format_reward": 0.0,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.832,
      "grad_norm": 2.2558631896972656,
      "kl": 1.1962890625,
      "learning_rate": 7.528948933102438e-07,
      "loss": 0.0478,
      "reward": -0.40251782536506653,
      "reward_std": 0.30128662288188934,
      "rewards/cosine_scaled_reward": -0.20125891268253326,
      "rewards/format_reward": 0.0,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.836,
      "grad_norm": 2.7602171897888184,
      "kl": 0.9951171875,
      "learning_rate": 7.500858306332172e-07,
      "loss": 0.0398,
      "reward": -0.31514767929911613,
      "reward_std": 0.3020384646952152,
      "rewards/cosine_scaled_reward": -0.15757383964955807,
      "rewards/format_reward": 0.0,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.84,
      "grad_norm": 2.6217448711395264,
      "kl": 1.71484375,
      "learning_rate": 7.472670160550848e-07,
      "loss": 0.0684,
      "reward": -0.3670196682214737,
      "reward_std": 0.31881674379110336,
      "rewards/cosine_scaled_reward": -0.18350983038544655,
      "rewards/format_reward": 0.0,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.844,
      "grad_norm": 2.0915112495422363,
      "kl": 1.2841796875,
      "learning_rate": 7.444385869608921e-07,
      "loss": 0.0514,
      "reward": -0.4177168160676956,
      "reward_std": 0.3398260995745659,
      "rewards/cosine_scaled_reward": -0.2088584043085575,
      "rewards/format_reward": 0.0,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.848,
      "grad_norm": 1.7296172380447388,
      "kl": 1.2724609375,
      "learning_rate": 7.416006812042827e-07,
      "loss": 0.051,
      "reward": -0.41255099326372147,
      "reward_std": 0.33872970938682556,
      "rewards/cosine_scaled_reward": -0.20627548918128014,
      "rewards/format_reward": 0.0,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.852,
      "grad_norm": 2.1323206424713135,
      "kl": 1.16162109375,
      "learning_rate": 7.387534371007797e-07,
      "loss": 0.0466,
      "reward": -0.2759926188737154,
      "reward_std": 0.30077088996768,
      "rewards/cosine_scaled_reward": -0.1379963019862771,
      "rewards/format_reward": 0.0,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.856,
      "grad_norm": 2.3771109580993652,
      "kl": 1.556640625,
      "learning_rate": 7.358969934210438e-07,
      "loss": 0.0622,
      "reward": -0.3614875078201294,
      "reward_std": 0.32025381922721863,
      "rewards/cosine_scaled_reward": -0.1807437539100647,
      "rewards/format_reward": 0.0,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.86,
      "grad_norm": 2.940969467163086,
      "kl": 1.8828125,
      "learning_rate": 7.330314893841101e-07,
      "loss": 0.0754,
      "reward": -0.29097072361037135,
      "reward_std": 0.28063248097896576,
      "rewards/cosine_scaled_reward": -0.14548537082737312,
      "rewards/format_reward": 0.0,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.864,
      "grad_norm": 1.9293019771575928,
      "kl": 1.62890625,
      "learning_rate": 7.301570646506027e-07,
      "loss": 0.0652,
      "reward": -0.4154031127691269,
      "reward_std": 0.34460632503032684,
      "rewards/cosine_scaled_reward": -0.20770153775811195,
      "rewards/format_reward": 0.0,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.868,
      "grad_norm": 2.745267391204834,
      "kl": 2.0888671875,
      "learning_rate": 7.27273859315928e-07,
      "loss": 0.0835,
      "reward": -0.4031589925289154,
      "reward_std": 0.31946661323308945,
      "rewards/cosine_scaled_reward": -0.2015794888138771,
      "rewards/format_reward": 0.0,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.872,
      "grad_norm": 2.873622179031372,
      "kl": 1.5078125,
      "learning_rate": 7.243820139034464e-07,
      "loss": 0.0604,
      "reward": -0.4128880575299263,
      "reward_std": 0.3311196342110634,
      "rewards/cosine_scaled_reward": -0.20644402503967285,
      "rewards/format_reward": 0.0,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.876,
      "grad_norm": 2.7079639434814453,
      "kl": 1.2607421875,
      "learning_rate": 7.214816693576234e-07,
      "loss": 0.0505,
      "reward": -0.3099018558859825,
      "reward_std": 0.2861209958791733,
      "rewards/cosine_scaled_reward": -0.15495092794299126,
      "rewards/format_reward": 0.0,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.88,
      "grad_norm": 1.9640864133834839,
      "kl": 1.234375,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.0493,
      "reward": -0.40535254031419754,
      "reward_std": 0.2874290943145752,
      "rewards/cosine_scaled_reward": -0.20267625898122787,
      "rewards/format_reward": 0.0,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.884,
      "grad_norm": 2.130681037902832,
      "kl": 1.486328125,
      "learning_rate": 7.156560487081051e-07,
      "loss": 0.0595,
      "reward": -0.3594564124941826,
      "reward_std": 0.3218042775988579,
      "rewards/cosine_scaled_reward": -0.1797281987965107,
      "rewards/format_reward": 0.0,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.888,
      "grad_norm": 2.1852834224700928,
      "kl": 1.48046875,
      "learning_rate": 7.127310565369415e-07,
      "loss": 0.0591,
      "reward": -0.331524558365345,
      "reward_std": 0.28531621396541595,
      "rewards/cosine_scaled_reward": -0.1657622903585434,
      "rewards/format_reward": 0.0,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.892,
      "grad_norm": 2.3731930255889893,
      "kl": 1.734375,
      "learning_rate": 7.097981330836616e-07,
      "loss": 0.0693,
      "reward": -0.38006093353033066,
      "reward_std": 0.3292882591485977,
      "rewards/cosine_scaled_reward": -0.19003047049045563,
      "rewards/format_reward": 0.0,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.896,
      "grad_norm": 2.3246822357177734,
      "kl": 1.0390625,
      "learning_rate": 7.068574212948169e-07,
      "loss": 0.0416,
      "reward": -0.3990800455212593,
      "reward_std": 0.3413678854703903,
      "rewards/cosine_scaled_reward": -0.19954002648591995,
      "rewards/format_reward": 0.0,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.9,
      "grad_norm": 2.4476959705352783,
      "kl": 1.45703125,
      "learning_rate": 7.039090644965509e-07,
      "loss": 0.0583,
      "reward": -0.39841071516275406,
      "reward_std": 0.31324755400419235,
      "rewards/cosine_scaled_reward": -0.19920538365840912,
      "rewards/format_reward": 0.0,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.904,
      "grad_norm": 3.0681633949279785,
      "kl": 1.75,
      "learning_rate": 7.009532063876148e-07,
      "loss": 0.0701,
      "reward": -0.35963694006204605,
      "reward_std": 0.3227182477712631,
      "rewards/cosine_scaled_reward": -0.17981846630573273,
      "rewards/format_reward": 0.0,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.908,
      "grad_norm": 3.8354952335357666,
      "kl": 1.5087890625,
      "learning_rate": 6.979899910323624e-07,
      "loss": 0.0604,
      "reward": -0.3886452168226242,
      "reward_std": 0.31125637143850327,
      "rewards/cosine_scaled_reward": -0.1943226121366024,
      "rewards/format_reward": 0.0,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.912,
      "grad_norm": 2.3208184242248535,
      "kl": 1.39453125,
      "learning_rate": 6.950195628537299e-07,
      "loss": 0.0558,
      "reward": -0.34270477294921875,
      "reward_std": 0.3698492497205734,
      "rewards/cosine_scaled_reward": -0.17135238647460938,
      "rewards/format_reward": 0.0,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.916,
      "grad_norm": 2.174126386642456,
      "kl": 2.009765625,
      "learning_rate": 6.920420666261961e-07,
      "loss": 0.0804,
      "reward": -0.37576939910650253,
      "reward_std": 0.3269713968038559,
      "rewards/cosine_scaled_reward": -0.18788469955325127,
      "rewards/format_reward": 0.0,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.92,
      "grad_norm": 2.081784725189209,
      "kl": 2.1728515625,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.0869,
      "reward": -0.3998561128973961,
      "reward_std": 0.32443511486053467,
      "rewards/cosine_scaled_reward": -0.19992805272340775,
      "rewards/format_reward": 0.0,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.924,
      "grad_norm": 2.3403866291046143,
      "kl": 1.17529296875,
      "learning_rate": 6.860664508377001e-07,
      "loss": 0.0469,
      "reward": -0.38807813823223114,
      "reward_std": 0.32711831480264664,
      "rewards/cosine_scaled_reward": -0.19403906539082527,
      "rewards/format_reward": 0.0,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.928,
      "grad_norm": 2.029927968978882,
      "kl": 1.32666015625,
      "learning_rate": 6.83068622519821e-07,
      "loss": 0.0531,
      "reward": -0.38948777318000793,
      "reward_std": 0.3195284381508827,
      "rewards/cosine_scaled_reward": -0.19474387168884277,
      "rewards/format_reward": 0.0,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.932,
      "grad_norm": 2.9124484062194824,
      "kl": 1.71484375,
      "learning_rate": 6.800643086250121e-07,
      "loss": 0.0685,
      "reward": -0.3806769847869873,
      "reward_std": 0.2985011041164398,
      "rewards/cosine_scaled_reward": -0.19033849611878395,
      "rewards/format_reward": 0.0,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.936,
      "grad_norm": 2.464742422103882,
      "kl": 1.2998046875,
      "learning_rate": 6.770536555792944e-07,
      "loss": 0.052,
      "reward": -0.3443439155817032,
      "reward_std": 0.29415207356214523,
      "rewards/cosine_scaled_reward": -0.1721719540655613,
      "rewards/format_reward": 0.0,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.94,
      "grad_norm": 2.1291651725769043,
      "kl": 1.001953125,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.0401,
      "reward": -0.33735504001379013,
      "reward_std": 0.28946489840745926,
      "rewards/cosine_scaled_reward": -0.16867752373218536,
      "rewards/format_reward": 0.0,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.944,
      "grad_norm": 2.9513416290283203,
      "kl": 1.6201171875,
      "learning_rate": 6.710139192768694e-07,
      "loss": 0.0649,
      "reward": -0.40289320796728134,
      "reward_std": 0.30230626463890076,
      "rewards/cosine_scaled_reward": -0.20144660398364067,
      "rewards/format_reward": 0.0,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.948,
      "grad_norm": 3.7395241260528564,
      "kl": 1.6240234375,
      "learning_rate": 6.679851303883891e-07,
      "loss": 0.065,
      "reward": -0.3659610077738762,
      "reward_std": 0.32638294249773026,
      "rewards/cosine_scaled_reward": -0.1829805038869381,
      "rewards/format_reward": 0.0,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.952,
      "grad_norm": 2.7872421741485596,
      "kl": 1.7919921875,
      "learning_rate": 6.649505910711058e-07,
      "loss": 0.0718,
      "reward": -0.4507276937365532,
      "reward_std": 0.35789574682712555,
      "rewards/cosine_scaled_reward": -0.2253638356924057,
      "rewards/format_reward": 0.0,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.956,
      "grad_norm": 2.139983654022217,
      "kl": 1.40234375,
      "learning_rate": 6.619104492241847e-07,
      "loss": 0.056,
      "reward": -0.3731803297996521,
      "reward_std": 0.30503255128860474,
      "rewards/cosine_scaled_reward": -0.18659016117453575,
      "rewards/format_reward": 0.0,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.96,
      "grad_norm": 6.420464515686035,
      "kl": 2.787109375,
      "learning_rate": 6.588648530198504e-07,
      "loss": 0.1116,
      "reward": -0.40894675999879837,
      "reward_std": 0.3296940475702286,
      "rewards/cosine_scaled_reward": -0.20447338744997978,
      "rewards/format_reward": 0.0,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.964,
      "grad_norm": 2.4638171195983887,
      "kl": 2.1806640625,
      "learning_rate": 6.558139508961654e-07,
      "loss": 0.0874,
      "reward": -0.42437078058719635,
      "reward_std": 0.3512648344039917,
      "rewards/cosine_scaled_reward": -0.21218538656830788,
      "rewards/format_reward": 0.0,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.968,
      "grad_norm": 2.8068432807922363,
      "kl": 1.884765625,
      "learning_rate": 6.527578915497951e-07,
      "loss": 0.0754,
      "reward": -0.394868440926075,
      "reward_std": 0.2916436865925789,
      "rewards/cosine_scaled_reward": -0.1974342130124569,
      "rewards/format_reward": 0.0,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.972,
      "grad_norm": 2.272479295730591,
      "kl": 1.453125,
      "learning_rate": 6.496968239287603e-07,
      "loss": 0.0581,
      "reward": -0.36773569136857986,
      "reward_std": 0.3104323297739029,
      "rewards/cosine_scaled_reward": -0.18386784568428993,
      "rewards/format_reward": 0.0,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.976,
      "grad_norm": 2.86352276802063,
      "kl": 1.8525390625,
      "learning_rate": 6.466308972251785e-07,
      "loss": 0.0742,
      "reward": -0.3895353376865387,
      "reward_std": 0.30376598984003067,
      "rewards/cosine_scaled_reward": -0.19476767256855965,
      "rewards/format_reward": 0.0,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.98,
      "grad_norm": 3.2674906253814697,
      "kl": 1.89453125,
      "learning_rate": 6.435602608679916e-07,
      "loss": 0.0758,
      "reward": -0.35536977648735046,
      "reward_std": 0.32461147010326385,
      "rewards/cosine_scaled_reward": -0.17768489941954613,
      "rewards/format_reward": 0.0,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.984,
      "grad_norm": 2.3651580810546875,
      "kl": 1.3994140625,
      "learning_rate": 6.404850645156841e-07,
      "loss": 0.0559,
      "reward": -0.2967621465213597,
      "reward_std": 0.29580704867839813,
      "rewards/cosine_scaled_reward": -0.1483810821082443,
      "rewards/format_reward": 0.0,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.988,
      "grad_norm": 2.6290199756622314,
      "kl": 1.544921875,
      "learning_rate": 6.374054580489873e-07,
      "loss": 0.0618,
      "reward": -0.3732440918684006,
      "reward_std": 0.28786107152700424,
      "rewards/cosine_scaled_reward": -0.1866220459342003,
      "rewards/format_reward": 0.0,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.992,
      "grad_norm": 2.474320650100708,
      "kl": 1.18359375,
      "learning_rate": 6.343215915635761e-07,
      "loss": 0.0473,
      "reward": -0.3813322111964226,
      "reward_std": 0.3196609243750572,
      "rewards/cosine_scaled_reward": -0.1906661055982113,
      "rewards/format_reward": 0.0,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 0.996,
      "grad_norm": 2.4096460342407227,
      "kl": 1.185546875,
      "learning_rate": 6.31233615362752e-07,
      "loss": 0.0475,
      "reward": -0.37723246961832047,
      "reward_std": 0.32298891991376877,
      "rewards/cosine_scaled_reward": -0.18861623480916023,
      "rewards/format_reward": 0.0,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0001220703125,
      "epoch": 1.0,
      "grad_norm": 2.414369821548462,
      "kl": 1.1552734375,
      "learning_rate": 6.281416799501187e-07,
      "loss": 0.0462,
      "reward": -0.3446759209036827,
      "reward_std": 0.30413854122161865,
      "rewards/cosine_scaled_reward": -0.17233795672655106,
      "rewards/format_reward": 0.0,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.004,
      "grad_norm": 2.3181285858154297,
      "kl": 1.4765625,
      "learning_rate": 6.25045936022246e-07,
      "loss": 0.0591,
      "reward": -0.39850035309791565,
      "reward_std": 0.3559228628873825,
      "rewards/cosine_scaled_reward": -0.19925018772482872,
      "rewards/format_reward": 0.0,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.008,
      "grad_norm": 2.3214640617370605,
      "kl": 1.59375,
      "learning_rate": 6.219465344613258e-07,
      "loss": 0.0637,
      "reward": -0.3477981239557266,
      "reward_std": 0.3031875118613243,
      "rewards/cosine_scaled_reward": -0.1738990694284439,
      "rewards/format_reward": 0.0,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.012,
      "grad_norm": 2.4848833084106445,
      "kl": 1.6416015625,
      "learning_rate": 6.188436263278172e-07,
      "loss": 0.0657,
      "reward": -0.402904212474823,
      "reward_std": 0.32011619955301285,
      "rewards/cosine_scaled_reward": -0.2014521062374115,
      "rewards/format_reward": 0.0,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.016,
      "grad_norm": 7.0177903175354,
      "kl": 3.015625,
      "learning_rate": 6.157373628530852e-07,
      "loss": 0.1206,
      "reward": -0.41366545110940933,
      "reward_std": 0.3347878158092499,
      "rewards/cosine_scaled_reward": -0.20683272555470467,
      "rewards/format_reward": 0.0,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1533.3928527832031,
      "epoch": 1.02,
      "grad_norm": 2.5155041217803955,
      "kl": 1.818359375,
      "learning_rate": 6.126278954320294e-07,
      "loss": 0.073,
      "reward": -0.41607701033353806,
      "reward_std": 0.33659277111291885,
      "rewards/cosine_scaled_reward": -0.20803850889205933,
      "rewards/format_reward": 0.0,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.024,
      "grad_norm": 3.175401449203491,
      "kl": 2.349609375,
      "learning_rate": 6.095153756157051e-07,
      "loss": 0.094,
      "reward": -0.3731570616364479,
      "reward_std": 0.3251727372407913,
      "rewards/cosine_scaled_reward": -0.18657853826880455,
      "rewards/format_reward": 0.0,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.028,
      "grad_norm": 2.345123052597046,
      "kl": 2.140625,
      "learning_rate": 6.06399955103937e-07,
      "loss": 0.0857,
      "reward": -0.4059467390179634,
      "reward_std": 0.3182907700538635,
      "rewards/cosine_scaled_reward": -0.2029733695089817,
      "rewards/format_reward": 0.0,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.032,
      "grad_norm": 2.636462688446045,
      "kl": 1.705078125,
      "learning_rate": 6.032817857379256e-07,
      "loss": 0.068,
      "reward": -0.343365378677845,
      "reward_std": 0.3163585662841797,
      "rewards/cosine_scaled_reward": -0.1716826893389225,
      "rewards/format_reward": 0.0,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.036,
      "grad_norm": 2.297900438308716,
      "kl": 1.51953125,
      "learning_rate": 6.001610194928464e-07,
      "loss": 0.0608,
      "reward": -0.3703172579407692,
      "reward_std": 0.3630036562681198,
      "rewards/cosine_scaled_reward": -0.1851586326956749,
      "rewards/format_reward": 0.0,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.04,
      "grad_norm": 2.311648368835449,
      "kl": 1.515625,
      "learning_rate": 5.97037808470444e-07,
      "loss": 0.0605,
      "reward": -0.3789840117096901,
      "reward_std": 0.330322228372097,
      "rewards/cosine_scaled_reward": -0.18949199840426445,
      "rewards/format_reward": 0.0,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.044,
      "grad_norm": 2.3599531650543213,
      "kl": 1.78515625,
      "learning_rate": 5.939123048916173e-07,
      "loss": 0.0714,
      "reward": -0.3447503596544266,
      "reward_std": 0.33612456917762756,
      "rewards/cosine_scaled_reward": -0.17237518727779388,
      "rewards/format_reward": 0.0,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1527.6190490722656,
      "epoch": 1.048,
      "grad_norm": 2.2337074279785156,
      "kl": 1.890625,
      "learning_rate": 5.907846610890011e-07,
      "loss": 0.0786,
      "reward": -0.39859064668416977,
      "reward_std": 0.32645051926374435,
      "rewards/cosine_scaled_reward": -0.1992953196167946,
      "rewards/format_reward": 0.0,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.052,
      "grad_norm": 2.818617582321167,
      "kl": 1.55859375,
      "learning_rate": 5.87655029499542e-07,
      "loss": 0.0624,
      "reward": -0.3537183925509453,
      "reward_std": 0.309035487473011,
      "rewards/cosine_scaled_reward": -0.17685920372605324,
      "rewards/format_reward": 0.0,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.056,
      "grad_norm": 2.3533854484558105,
      "kl": 1.3583984375,
      "learning_rate": 5.845235626570683e-07,
      "loss": 0.0543,
      "reward": -0.3672221526503563,
      "reward_std": 0.31650061905384064,
      "rewards/cosine_scaled_reward": -0.18361108005046844,
      "rewards/format_reward": 0.0,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.06,
      "grad_norm": 3.936475992202759,
      "kl": 2.265625,
      "learning_rate": 5.813904131848564e-07,
      "loss": 0.0907,
      "reward": -0.36572812497615814,
      "reward_std": 0.2912697494029999,
      "rewards/cosine_scaled_reward": -0.18286405876278877,
      "rewards/format_reward": 0.0,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.064,
      "grad_norm": 2.754866600036621,
      "kl": 1.943359375,
      "learning_rate": 5.78255733788191e-07,
      "loss": 0.0777,
      "reward": -0.37356945127248764,
      "reward_std": 0.34380726516246796,
      "rewards/cosine_scaled_reward": -0.18678472936153412,
      "rewards/format_reward": 0.0,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.068,
      "grad_norm": 2.374964952468872,
      "kl": 1.4267578125,
      "learning_rate": 5.751196772469237e-07,
      "loss": 0.0571,
      "reward": -0.3651036322116852,
      "reward_std": 0.30468039214611053,
      "rewards/cosine_scaled_reward": -0.1825518161058426,
      "rewards/format_reward": 0.0,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1533.0535888671875,
      "epoch": 1.072,
      "grad_norm": 2.618032693862915,
      "kl": 1.6171875,
      "learning_rate": 5.71982396408026e-07,
      "loss": 0.0651,
      "reward": -0.35353927314281464,
      "reward_std": 0.3086354061961174,
      "rewards/cosine_scaled_reward": -0.17676963657140732,
      "rewards/format_reward": 0.0,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.076,
      "grad_norm": 2.920133590698242,
      "kl": 1.8515625,
      "learning_rate": 5.688440441781398e-07,
      "loss": 0.074,
      "reward": -0.37572528421878815,
      "reward_std": 0.33292342722415924,
      "rewards/cosine_scaled_reward": -0.18786264210939407,
      "rewards/format_reward": 0.0,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.08,
      "grad_norm": 2.581885576248169,
      "kl": 1.830078125,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.0732,
      "reward": -0.34584221988916397,
      "reward_std": 0.3140456974506378,
      "rewards/cosine_scaled_reward": -0.17292110994458199,
      "rewards/format_reward": 0.0,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.084,
      "grad_norm": 8.366601943969727,
      "kl": 2.509765625,
      "learning_rate": 5.625647374256061e-07,
      "loss": 0.1003,
      "reward": -0.37314866855740547,
      "reward_std": 0.2792880907654762,
      "rewards/cosine_scaled_reward": -0.18657432682812214,
      "rewards/format_reward": 0.0,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.088,
      "grad_norm": 3.071047067642212,
      "kl": 1.9658203125,
      "learning_rate": 5.594240889475106e-07,
      "loss": 0.0785,
      "reward": -0.39643432199954987,
      "reward_std": 0.31065937131643295,
      "rewards/cosine_scaled_reward": -0.19821715354919434,
      "rewards/format_reward": 0.0,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.092,
      "grad_norm": 3.8571436405181885,
      "kl": 1.2626953125,
      "learning_rate": 5.562829811526154e-07,
      "loss": 0.0506,
      "reward": -0.3136083036661148,
      "reward_std": 0.28241100907325745,
      "rewards/cosine_scaled_reward": -0.1568041555583477,
      "rewards/format_reward": 0.0,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.096,
      "grad_norm": 2.1380457878112793,
      "kl": 1.96875,
      "learning_rate": 5.531415671340826e-07,
      "loss": 0.0786,
      "reward": -0.35791803896427155,
      "reward_std": 0.3191326707601547,
      "rewards/cosine_scaled_reward": -0.17895901948213577,
      "rewards/format_reward": 0.0,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.1,
      "grad_norm": 3.744987964630127,
      "kl": 2.048828125,
      "learning_rate": 5.5e-07,
      "loss": 0.0819,
      "reward": -0.3743599057197571,
      "reward_std": 0.3121279552578926,
      "rewards/cosine_scaled_reward": -0.18717995658516884,
      "rewards/format_reward": 0.0,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.104,
      "grad_norm": 2.783698081970215,
      "kl": 1.8984375,
      "learning_rate": 5.468584328659172e-07,
      "loss": 0.0761,
      "reward": -0.3865007609128952,
      "reward_std": 0.322613961994648,
      "rewards/cosine_scaled_reward": -0.1932503841817379,
      "rewards/format_reward": 0.0,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.108,
      "grad_norm": 3.2086503505706787,
      "kl": 1.865234375,
      "learning_rate": 5.437170188473847e-07,
      "loss": 0.0746,
      "reward": -0.41129884123802185,
      "reward_std": 0.3018573820590973,
      "rewards/cosine_scaled_reward": -0.20564941689372063,
      "rewards/format_reward": 0.0,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.112,
      "grad_norm": 2.4078729152679443,
      "kl": 1.4072265625,
      "learning_rate": 5.405759110524894e-07,
      "loss": 0.0563,
      "reward": -0.39701489359140396,
      "reward_std": 0.3126164525747299,
      "rewards/cosine_scaled_reward": -0.19850744307041168,
      "rewards/format_reward": 0.0,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.116,
      "grad_norm": 2.5043461322784424,
      "kl": 2.35546875,
      "learning_rate": 5.37435262574394e-07,
      "loss": 0.0944,
      "reward": -0.28278425987809896,
      "reward_std": 0.2714259997010231,
      "rewards/cosine_scaled_reward": -0.1413921354105696,
      "rewards/format_reward": 0.0,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1533.6190490722656,
      "epoch": 1.12,
      "grad_norm": 4.991820335388184,
      "kl": 1.83984375,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.0713,
      "reward": -0.3403998464345932,
      "reward_std": 0.3223363533616066,
      "rewards/cosine_scaled_reward": -0.1701999232172966,
      "rewards/format_reward": 0.0,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1534.0476379394531,
      "epoch": 1.124,
      "grad_norm": 2.818126916885376,
      "kl": 1.37890625,
      "learning_rate": 5.311559558218603e-07,
      "loss": 0.054,
      "reward": -0.3611769676208496,
      "reward_std": 0.3213232010602951,
      "rewards/cosine_scaled_reward": -0.1805884800851345,
      "rewards/format_reward": 0.0,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.1280000000000001,
      "grad_norm": 2.7234742641448975,
      "kl": 2.248046875,
      "learning_rate": 5.28017603591974e-07,
      "loss": 0.0899,
      "reward": -0.4201104864478111,
      "reward_std": 0.3131628781557083,
      "rewards/cosine_scaled_reward": -0.21005523577332497,
      "rewards/format_reward": 0.0,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.1320000000000001,
      "grad_norm": 6.938405990600586,
      "kl": 1.998046875,
      "learning_rate": 5.248803227530763e-07,
      "loss": 0.0799,
      "reward": -0.33411792665719986,
      "reward_std": 0.32330870628356934,
      "rewards/cosine_scaled_reward": -0.16705895960330963,
      "rewards/format_reward": 0.0,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.1360000000000001,
      "grad_norm": 3.5663974285125732,
      "kl": 1.3349609375,
      "learning_rate": 5.21744266211809e-07,
      "loss": 0.0534,
      "reward": -0.3633820191025734,
      "reward_std": 0.31287185102701187,
      "rewards/cosine_scaled_reward": -0.1816909983754158,
      "rewards/format_reward": 0.0,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.1400000000000001,
      "grad_norm": 2.0476882457733154,
      "kl": 1.708984375,
      "learning_rate": 5.186095868151436e-07,
      "loss": 0.0684,
      "reward": -0.3689531907439232,
      "reward_std": 0.32297470420598984,
      "rewards/cosine_scaled_reward": -0.184476587921381,
      "rewards/format_reward": 0.0,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1526.8869018554688,
      "epoch": 1.144,
      "grad_norm": 12.345512390136719,
      "kl": 2.966796875,
      "learning_rate": 5.154764373429315e-07,
      "loss": 0.1254,
      "reward": -0.3650151863694191,
      "reward_std": 0.31899186968803406,
      "rewards/cosine_scaled_reward": -0.18250760063529015,
      "rewards/format_reward": 0.0,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.148,
      "grad_norm": 2.059617519378662,
      "kl": 2.291015625,
      "learning_rate": 5.123449705004581e-07,
      "loss": 0.0916,
      "reward": -0.3706892877817154,
      "reward_std": 0.32747378945350647,
      "rewards/cosine_scaled_reward": -0.1853446513414383,
      "rewards/format_reward": 0.0,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.152,
      "grad_norm": 3.889174699783325,
      "kl": 2.0859375,
      "learning_rate": 5.09215338910999e-07,
      "loss": 0.0834,
      "reward": -0.4078289121389389,
      "reward_std": 0.3290611281991005,
      "rewards/cosine_scaled_reward": -0.20391445606946945,
      "rewards/format_reward": 0.0,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1533.2440490722656,
      "epoch": 1.156,
      "grad_norm": 2.5038888454437256,
      "kl": 0.93896484375,
      "learning_rate": 5.060876951083828e-07,
      "loss": 0.0354,
      "reward": -0.34110401570796967,
      "reward_std": 0.3122602626681328,
      "rewards/cosine_scaled_reward": -0.17055201157927513,
      "rewards/format_reward": 0.0,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.16,
      "grad_norm": 2.39719557762146,
      "kl": 1.8583984375,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0744,
      "reward": -0.36911261081695557,
      "reward_std": 0.3288589343428612,
      "rewards/cosine_scaled_reward": -0.1845562942326069,
      "rewards/format_reward": 0.0,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.164,
      "grad_norm": 2.758849620819092,
      "kl": 1.626953125,
      "learning_rate": 4.998389805071536e-07,
      "loss": 0.0651,
      "reward": -0.3935117796063423,
      "reward_std": 0.3461349532008171,
      "rewards/cosine_scaled_reward": -0.19675587862730026,
      "rewards/format_reward": 0.0,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.168,
      "grad_norm": 2.310575246810913,
      "kl": 1.455078125,
      "learning_rate": 4.967182142620745e-07,
      "loss": 0.0583,
      "reward": -0.34184807538986206,
      "reward_std": 0.3021695464849472,
      "rewards/cosine_scaled_reward": -0.17092403396964073,
      "rewards/format_reward": 0.0,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.172,
      "grad_norm": 2.8417394161224365,
      "kl": 1.861328125,
      "learning_rate": 4.93600044896063e-07,
      "loss": 0.0744,
      "reward": -0.3772461339831352,
      "reward_std": 0.3044436201453209,
      "rewards/cosine_scaled_reward": -0.18862305954098701,
      "rewards/format_reward": 0.0,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.176,
      "grad_norm": 2.347404956817627,
      "kl": 1.28125,
      "learning_rate": 4.904846243842949e-07,
      "loss": 0.0513,
      "reward": -0.3517310842871666,
      "reward_std": 0.3094722405076027,
      "rewards/cosine_scaled_reward": -0.1758655458688736,
      "rewards/format_reward": 0.0,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1535.6130981445312,
      "epoch": 1.18,
      "grad_norm": 2.7739925384521484,
      "kl": 1.833984375,
      "learning_rate": 4.873721045679706e-07,
      "loss": 0.0731,
      "reward": -0.4288819953799248,
      "reward_std": 0.3247087821364403,
      "rewards/cosine_scaled_reward": -0.2144409976899624,
      "rewards/format_reward": 0.0,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.184,
      "grad_norm": 2.1470892429351807,
      "kl": 1.296875,
      "learning_rate": 4.842626371469149e-07,
      "loss": 0.0519,
      "reward": -0.35219819098711014,
      "reward_std": 0.3056294918060303,
      "rewards/cosine_scaled_reward": -0.17609910294413567,
      "rewards/format_reward": 0.0,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.188,
      "grad_norm": 3.177232503890991,
      "kl": 1.677734375,
      "learning_rate": 4.811563736721829e-07,
      "loss": 0.0671,
      "reward": -0.3717339485883713,
      "reward_std": 0.29695921391248703,
      "rewards/cosine_scaled_reward": -0.18586697429418564,
      "rewards/format_reward": 0.0,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.192,
      "grad_norm": 3.3333382606506348,
      "kl": 2.322265625,
      "learning_rate": 4.780534655386743e-07,
      "loss": 0.093,
      "reward": -0.3814833015203476,
      "reward_std": 0.28608307987451553,
      "rewards/cosine_scaled_reward": -0.1907416470348835,
      "rewards/format_reward": 0.0,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.196,
      "grad_norm": 2.842420816421509,
      "kl": 1.45703125,
      "learning_rate": 4.749540639777539e-07,
      "loss": 0.0583,
      "reward": -0.3840809538960457,
      "reward_std": 0.31393957883119583,
      "rewards/cosine_scaled_reward": -0.19204047322273254,
      "rewards/format_reward": 0.0,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.2,
      "grad_norm": 2.9220309257507324,
      "kl": 1.681640625,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.0672,
      "reward": -0.39588408917188644,
      "reward_std": 0.33600132539868355,
      "rewards/cosine_scaled_reward": -0.19794204831123352,
      "rewards/format_reward": 0.0,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.204,
      "grad_norm": 3.4091219902038574,
      "kl": 1.44140625,
      "learning_rate": 4.68766384637248e-07,
      "loss": 0.0576,
      "reward": -0.2894315180601552,
      "reward_std": 0.30969203263521194,
      "rewards/cosine_scaled_reward": -0.14471576345385984,
      "rewards/format_reward": 0.0,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.208,
      "grad_norm": 2.0488741397857666,
      "kl": 1.5576171875,
      "learning_rate": 4.656784084364238e-07,
      "loss": 0.0624,
      "reward": -0.32318826019763947,
      "reward_std": 0.3031533695757389,
      "rewards/cosine_scaled_reward": -0.16159413009881973,
      "rewards/format_reward": 0.0,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.212,
      "grad_norm": 2.6755242347717285,
      "kl": 1.34765625,
      "learning_rate": 4.6259454195101267e-07,
      "loss": 0.0539,
      "reward": -0.37002843618392944,
      "reward_std": 0.31058184802532196,
      "rewards/cosine_scaled_reward": -0.18501422181725502,
      "rewards/format_reward": 0.0,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.216,
      "grad_norm": 6.160266399383545,
      "kl": 1.734375,
      "learning_rate": 4.59514935484316e-07,
      "loss": 0.0694,
      "reward": -0.38714154064655304,
      "reward_std": 0.3265160173177719,
      "rewards/cosine_scaled_reward": -0.19357078149914742,
      "rewards/format_reward": 0.0,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.22,
      "grad_norm": 2.3529880046844482,
      "kl": 1.2138671875,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.0486,
      "reward": -0.3460870534181595,
      "reward_std": 0.3087117671966553,
      "rewards/cosine_scaled_reward": -0.17304353043437004,
      "rewards/format_reward": 0.0,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.224,
      "grad_norm": 2.48714280128479,
      "kl": 1.9453125,
      "learning_rate": 4.5336910277482155e-07,
      "loss": 0.0779,
      "reward": -0.3756335750222206,
      "reward_std": 0.32805445045232773,
      "rewards/cosine_scaled_reward": -0.1878167800605297,
      "rewards/format_reward": 0.0,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.228,
      "grad_norm": 8.46654987335205,
      "kl": 2.5107421875,
      "learning_rate": 4.503031760712397e-07,
      "loss": 0.1004,
      "reward": -0.385331392288208,
      "reward_std": 0.31344960629940033,
      "rewards/cosine_scaled_reward": -0.1926657035946846,
      "rewards/format_reward": 0.0,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.232,
      "grad_norm": 3.198944568634033,
      "kl": 2.140625,
      "learning_rate": 4.4724210845020494e-07,
      "loss": 0.0857,
      "reward": -0.36118319630622864,
      "reward_std": 0.3010380119085312,
      "rewards/cosine_scaled_reward": -0.18059159815311432,
      "rewards/format_reward": 0.0,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.236,
      "grad_norm": 2.745668411254883,
      "kl": 2.033203125,
      "learning_rate": 4.441860491038345e-07,
      "loss": 0.0813,
      "reward": -0.3596822917461395,
      "reward_std": 0.3092067465186119,
      "rewards/cosine_scaled_reward": -0.17984114587306976,
      "rewards/format_reward": 0.0,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.24,
      "grad_norm": 5.614748954772949,
      "kl": 2.34375,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.094,
      "reward": -0.34773094952106476,
      "reward_std": 0.29645886272192,
      "rewards/cosine_scaled_reward": -0.17386547103524208,
      "rewards/format_reward": 0.0,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.244,
      "grad_norm": 2.089031219482422,
      "kl": 1.39453125,
      "learning_rate": 4.3808955077581546e-07,
      "loss": 0.0558,
      "reward": -0.33028923720121384,
      "reward_std": 0.2886582836508751,
      "rewards/cosine_scaled_reward": -0.16514462232589722,
      "rewards/format_reward": 0.0,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.248,
      "grad_norm": 5.366787433624268,
      "kl": 2.9599609375,
      "learning_rate": 4.350494089288943e-07,
      "loss": 0.1186,
      "reward": -0.4123021811246872,
      "reward_std": 0.337029866874218,
      "rewards/cosine_scaled_reward": -0.206151083111763,
      "rewards/format_reward": 0.0,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.252,
      "grad_norm": 8.391505241394043,
      "kl": 1.953125,
      "learning_rate": 4.3201486961161093e-07,
      "loss": 0.078,
      "reward": -0.3487403020262718,
      "reward_std": 0.3276291638612747,
      "rewards/cosine_scaled_reward": -0.1743701510131359,
      "rewards/format_reward": 0.0,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.256,
      "grad_norm": 2.623786449432373,
      "kl": 1.3193359375,
      "learning_rate": 4.2898608072313045e-07,
      "loss": 0.0528,
      "reward": -0.32606934756040573,
      "reward_std": 0.28208620101213455,
      "rewards/cosine_scaled_reward": -0.16303467005491257,
      "rewards/format_reward": 0.0,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.26,
      "grad_norm": 2.2247447967529297,
      "kl": 1.884765625,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.0755,
      "reward": -0.2273978427692782,
      "reward_std": 0.28098014742136,
      "rewards/cosine_scaled_reward": -0.11369891960930545,
      "rewards/format_reward": 0.0,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.264,
      "grad_norm": 2.258469581604004,
      "kl": 1.14453125,
      "learning_rate": 4.2294634442070553e-07,
      "loss": 0.0457,
      "reward": -0.24764333851635456,
      "reward_std": 0.2835834100842476,
      "rewards/cosine_scaled_reward": -0.12382166367024183,
      "rewards/format_reward": 0.0,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.268,
      "grad_norm": 2.884620189666748,
      "kl": 1.5986328125,
      "learning_rate": 4.1993569137498776e-07,
      "loss": 0.064,
      "reward": -0.37140634655952454,
      "reward_std": 0.36573630571365356,
      "rewards/cosine_scaled_reward": -0.18570317327976227,
      "rewards/format_reward": 0.0,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.272,
      "grad_norm": 2.703934669494629,
      "kl": 1.912109375,
      "learning_rate": 4.1693137748017915e-07,
      "loss": 0.0763,
      "reward": -0.34411681443452835,
      "reward_std": 0.29631946235895157,
      "rewards/cosine_scaled_reward": -0.17205841839313507,
      "rewards/format_reward": 0.0,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.276,
      "grad_norm": 3.717240571975708,
      "kl": 2.224609375,
      "learning_rate": 4.1393354916230005e-07,
      "loss": 0.0891,
      "reward": -0.3324529230594635,
      "reward_std": 0.2552623227238655,
      "rewards/cosine_scaled_reward": -0.16622646152973175,
      "rewards/format_reward": 0.0,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.28,
      "grad_norm": 2.4941396713256836,
      "kl": 1.384765625,
      "learning_rate": 4.1094235253127374e-07,
      "loss": 0.0555,
      "reward": -0.30811577290296555,
      "reward_std": 0.2845884971320629,
      "rewards/cosine_scaled_reward": -0.15405788272619247,
      "rewards/format_reward": 0.0,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.284,
      "grad_norm": 3.229072332382202,
      "kl": 1.9453125,
      "learning_rate": 4.079579333738039e-07,
      "loss": 0.0778,
      "reward": -0.3366442248225212,
      "reward_std": 0.301740899682045,
      "rewards/cosine_scaled_reward": -0.1683221124112606,
      "rewards/format_reward": 0.0,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.288,
      "grad_norm": 3.3636343479156494,
      "kl": 1.8828125,
      "learning_rate": 4.0498043714627006e-07,
      "loss": 0.0752,
      "reward": -0.36845648288726807,
      "reward_std": 0.34283190220594406,
      "rewards/cosine_scaled_reward": -0.18422825261950493,
      "rewards/format_reward": 0.0,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.292,
      "grad_norm": 3.507054090499878,
      "kl": 1.4130859375,
      "learning_rate": 4.020100089676376e-07,
      "loss": 0.0566,
      "reward": -0.34711746126413345,
      "reward_std": 0.2960944324731827,
      "rewards/cosine_scaled_reward": -0.17355873063206673,
      "rewards/format_reward": 0.0,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.296,
      "grad_norm": 2.661647081375122,
      "kl": 1.736328125,
      "learning_rate": 3.9904679361238526e-07,
      "loss": 0.0694,
      "reward": -0.33277176320552826,
      "reward_std": 0.3034566268324852,
      "rewards/cosine_scaled_reward": -0.16638587787747383,
      "rewards/format_reward": 0.0,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3,
      "grad_norm": 3.079672336578369,
      "kl": 1.359375,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.0544,
      "reward": -0.3246685415506363,
      "reward_std": 0.27341699600219727,
      "rewards/cosine_scaled_reward": -0.16233427450060844,
      "rewards/format_reward": 0.0,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.304,
      "grad_norm": 3.248324394226074,
      "kl": 1.1181640625,
      "learning_rate": 3.931425787051832e-07,
      "loss": 0.0447,
      "reward": -0.3214203119277954,
      "reward_std": 0.2835453376173973,
      "rewards/cosine_scaled_reward": -0.160710159689188,
      "rewards/format_reward": 0.0,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.308,
      "grad_norm": 3.676837205886841,
      "kl": 1.724609375,
      "learning_rate": 3.902018669163384e-07,
      "loss": 0.069,
      "reward": -0.32949286699295044,
      "reward_std": 0.30344782024621964,
      "rewards/cosine_scaled_reward": -0.16474644094705582,
      "rewards/format_reward": 0.0,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.312,
      "grad_norm": 2.3120462894439697,
      "kl": 1.537109375,
      "learning_rate": 3.872689434630585e-07,
      "loss": 0.0615,
      "reward": -0.3512613996863365,
      "reward_std": 0.3501633331179619,
      "rewards/cosine_scaled_reward": -0.17563070356845856,
      "rewards/format_reward": 0.0,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.316,
      "grad_norm": 2.4828386306762695,
      "kl": 1.6953125,
      "learning_rate": 3.843439512918949e-07,
      "loss": 0.0677,
      "reward": -0.31614498794078827,
      "reward_std": 0.29276788979768753,
      "rewards/cosine_scaled_reward": -0.15807249024510384,
      "rewards/format_reward": 0.0,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.32,
      "grad_norm": 3.356783151626587,
      "kl": 2.453125,
      "learning_rate": 3.8142703296283953e-07,
      "loss": 0.0982,
      "reward": -0.4576185494661331,
      "reward_std": 0.32832735031843185,
      "rewards/cosine_scaled_reward": -0.22880928218364716,
      "rewards/format_reward": 0.0,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.324,
      "grad_norm": 2.7885196208953857,
      "kl": 2.068359375,
      "learning_rate": 3.785183306423767e-07,
      "loss": 0.0827,
      "reward": -0.2943090833723545,
      "reward_std": 0.31652648001909256,
      "rewards/cosine_scaled_reward": -0.14715453796088696,
      "rewards/format_reward": 0.0,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.328,
      "grad_norm": 3.0415380001068115,
      "kl": 1.802734375,
      "learning_rate": 3.7561798609655373e-07,
      "loss": 0.0721,
      "reward": -0.3697570115327835,
      "reward_std": 0.3258262947201729,
      "rewards/cosine_scaled_reward": -0.18487850576639175,
      "rewards/format_reward": 0.0,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.332,
      "grad_norm": 3.139693021774292,
      "kl": 1.732421875,
      "learning_rate": 3.72726140684072e-07,
      "loss": 0.0693,
      "reward": -0.33471549302339554,
      "reward_std": 0.2794983647763729,
      "rewards/cosine_scaled_reward": -0.16735775396227837,
      "rewards/format_reward": 0.0,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.336,
      "grad_norm": 2.6243162155151367,
      "kl": 1.8369140625,
      "learning_rate": 3.6984293534939737e-07,
      "loss": 0.0733,
      "reward": -0.3382048085331917,
      "reward_std": 0.3457643389701843,
      "rewards/cosine_scaled_reward": -0.16910240054130554,
      "rewards/format_reward": 0.0,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.34,
      "grad_norm": 3.803060293197632,
      "kl": 1.8046875,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.0723,
      "reward": -0.3406166359782219,
      "reward_std": 0.29876144975423813,
      "rewards/cosine_scaled_reward": -0.17030831426382065,
      "rewards/format_reward": 0.0,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3439999999999999,
      "grad_norm": 3.948391914367676,
      "kl": 1.365234375,
      "learning_rate": 3.641030065789562e-07,
      "loss": 0.0546,
      "reward": -0.2908342033624649,
      "reward_std": 0.26911235228180885,
      "rewards/cosine_scaled_reward": -0.14541710540652275,
      "rewards/format_reward": 0.0,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3479999999999999,
      "grad_norm": 2.9695639610290527,
      "kl": 2.19921875,
      "learning_rate": 3.612465628992203e-07,
      "loss": 0.0881,
      "reward": -0.37160656601190567,
      "reward_std": 0.3147331103682518,
      "rewards/cosine_scaled_reward": -0.18580328300595284,
      "rewards/format_reward": 0.0,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3519999999999999,
      "grad_norm": 3.1350209712982178,
      "kl": 2.1689453125,
      "learning_rate": 3.5839931879571725e-07,
      "loss": 0.087,
      "reward": -0.3230074942111969,
      "reward_std": 0.313438281416893,
      "rewards/cosine_scaled_reward": -0.16150375083088875,
      "rewards/format_reward": 0.0,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3559999999999999,
      "grad_norm": 3.882567882537842,
      "kl": 2.0546875,
      "learning_rate": 3.555614130391079e-07,
      "loss": 0.0821,
      "reward": -0.36975327879190445,
      "reward_std": 0.31242573261260986,
      "rewards/cosine_scaled_reward": -0.18487663567066193,
      "rewards/format_reward": 0.0,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3599999999999999,
      "grad_norm": 2.6699118614196777,
      "kl": 1.689453125,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.0676,
      "reward": -0.368961863219738,
      "reward_std": 0.32627636194229126,
      "rewards/cosine_scaled_reward": -0.1844809353351593,
      "rewards/format_reward": 0.0,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3639999999999999,
      "grad_norm": 3.0782856941223145,
      "kl": 1.59765625,
      "learning_rate": 3.4991416936678276e-07,
      "loss": 0.064,
      "reward": -0.3320116475224495,
      "reward_std": 0.3151276856660843,
      "rewards/cosine_scaled_reward": -0.16600582748651505,
      "rewards/format_reward": 0.0,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3679999999999999,
      "grad_norm": 2.2419495582580566,
      "kl": 1.46484375,
      "learning_rate": 3.471051066897562e-07,
      "loss": 0.0585,
      "reward": -0.2764207161962986,
      "reward_std": 0.3390573188662529,
      "rewards/cosine_scaled_reward": -0.1382103539071977,
      "rewards/format_reward": 0.0,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.3719999999999999,
      "grad_norm": 4.397972106933594,
      "kl": 2.5,
      "learning_rate": 3.4430593282358777e-07,
      "loss": 0.1002,
      "reward": -0.33926407247781754,
      "reward_std": 0.31172922998666763,
      "rewards/cosine_scaled_reward": -0.16963203251361847,
      "rewards/format_reward": 0.0,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.376,
      "grad_norm": 3.441905975341797,
      "kl": 2.0234375,
      "learning_rate": 3.4151678419606233e-07,
      "loss": 0.0808,
      "reward": -0.3324861600995064,
      "reward_std": 0.2958858981728554,
      "rewards/cosine_scaled_reward": -0.1662430725991726,
      "rewards/format_reward": 0.0,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.38,
      "grad_norm": 2.7323975563049316,
      "kl": 1.4189453125,
      "learning_rate": 3.387377967463493e-07,
      "loss": 0.0566,
      "reward": -0.3314187452197075,
      "reward_std": 0.3164066970348358,
      "rewards/cosine_scaled_reward": -0.16570937633514404,
      "rewards/format_reward": 0.0,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.384,
      "grad_norm": 4.131885528564453,
      "kl": 2.45703125,
      "learning_rate": 3.359691059183761e-07,
      "loss": 0.0983,
      "reward": -0.37432391941547394,
      "reward_std": 0.33136965334415436,
      "rewards/cosine_scaled_reward": -0.18716195970773697,
      "rewards/format_reward": 0.0,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.388,
      "grad_norm": 2.9907569885253906,
      "kl": 1.732421875,
      "learning_rate": 3.3321084665422803e-07,
      "loss": 0.0693,
      "reward": -0.38256606459617615,
      "reward_std": 0.31782740354537964,
      "rewards/cosine_scaled_reward": -0.19128303229808807,
      "rewards/format_reward": 0.0,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.392,
      "grad_norm": 2.6049344539642334,
      "kl": 1.53515625,
      "learning_rate": 3.3046315338757026e-07,
      "loss": 0.0613,
      "reward": -0.2997368350625038,
      "reward_std": 0.3045838475227356,
      "rewards/cosine_scaled_reward": -0.1498684138059616,
      "rewards/format_reward": 0.0,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.396,
      "grad_norm": 4.5095295906066895,
      "kl": 1.5029296875,
      "learning_rate": 3.2772616003709616e-07,
      "loss": 0.0602,
      "reward": -0.3363025635480881,
      "reward_std": 0.30865515023469925,
      "rewards/cosine_scaled_reward": -0.16815128177404404,
      "rewards/format_reward": 0.0,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.4,
      "grad_norm": 3.3342795372009277,
      "kl": 1.908203125,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.0762,
      "reward": -0.3770889565348625,
      "reward_std": 0.30710920691490173,
      "rewards/cosine_scaled_reward": -0.18854447081685066,
      "rewards/format_reward": 0.0,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.404,
      "grad_norm": 2.795259714126587,
      "kl": 2.048828125,
      "learning_rate": 3.222848061454764e-07,
      "loss": 0.082,
      "reward": -0.3462035730481148,
      "reward_std": 0.32692621648311615,
      "rewards/cosine_scaled_reward": -0.1731017865240574,
      "rewards/format_reward": 0.0,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.408,
      "grad_norm": 2.563765287399292,
      "kl": 1.462890625,
      "learning_rate": 3.195807108082429e-07,
      "loss": 0.0586,
      "reward": -0.37373943626880646,
      "reward_std": 0.3041759356856346,
      "rewards/cosine_scaled_reward": -0.18686972558498383,
      "rewards/format_reward": 0.0,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.412,
      "grad_norm": 2.6194751262664795,
      "kl": 1.24609375,
      "learning_rate": 3.168878457820915e-07,
      "loss": 0.0498,
      "reward": -0.3196728527545929,
      "reward_std": 0.2953634150326252,
      "rewards/cosine_scaled_reward": -0.15983642637729645,
      "rewards/format_reward": 0.0,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.416,
      "grad_norm": 2.8382420539855957,
      "kl": 1.650390625,
      "learning_rate": 3.142063423134644e-07,
      "loss": 0.0662,
      "reward": -0.33513225615024567,
      "reward_std": 0.30527665093541145,
      "rewards/cosine_scaled_reward": -0.16756613552570343,
      "rewards/format_reward": 0.0,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.42,
      "grad_norm": 2.6078808307647705,
      "kl": 2.15234375,
      "learning_rate": 3.115363310950578e-07,
      "loss": 0.086,
      "reward": -0.3992829695343971,
      "reward_std": 0.31726495921611786,
      "rewards/cosine_scaled_reward": -0.19964147731661797,
      "rewards/format_reward": 0.0,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.424,
      "grad_norm": 4.192615985870361,
      "kl": 2.142578125,
      "learning_rate": 3.0887794225945143e-07,
      "loss": 0.0858,
      "reward": -0.39319509267807007,
      "reward_std": 0.3372880816459656,
      "rewards/cosine_scaled_reward": -0.19659754261374474,
      "rewards/format_reward": 0.0,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.428,
      "grad_norm": 3.196894407272339,
      "kl": 2.509765625,
      "learning_rate": 3.062313053727671e-07,
      "loss": 0.1006,
      "reward": -0.3694089204072952,
      "reward_std": 0.323252871632576,
      "rewards/cosine_scaled_reward": -0.1847044676542282,
      "rewards/format_reward": 0.0,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.432,
      "grad_norm": 3.348161458969116,
      "kl": 1.1142578125,
      "learning_rate": 3.0359654942835247e-07,
      "loss": 0.0447,
      "reward": -0.36088229715824127,
      "reward_std": 0.31483449041843414,
      "rewards/cosine_scaled_reward": -0.18044114857912064,
      "rewards/format_reward": 0.0,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.436,
      "grad_norm": 3.457472324371338,
      "kl": 2.2265625,
      "learning_rate": 3.0097380284049523e-07,
      "loss": 0.089,
      "reward": -0.3612442761659622,
      "reward_std": 0.28438059240579605,
      "rewards/cosine_scaled_reward": -0.1806221418082714,
      "rewards/format_reward": 0.0,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.44,
      "grad_norm": 3.285405397415161,
      "kl": 2.076171875,
      "learning_rate": 2.9836319343816397e-07,
      "loss": 0.0831,
      "reward": -0.32887883111834526,
      "reward_std": 0.3107897564768791,
      "rewards/cosine_scaled_reward": -0.16443941928446293,
      "rewards/format_reward": 0.0,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.444,
      "grad_norm": 2.9156711101531982,
      "kl": 1.7646484375,
      "learning_rate": 2.9576484845877793e-07,
      "loss": 0.0706,
      "reward": -0.3512116149067879,
      "reward_std": 0.32886873185634613,
      "rewards/cosine_scaled_reward": -0.17560580000281334,
      "rewards/format_reward": 0.0,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.448,
      "grad_norm": 2.42704439163208,
      "kl": 1.697265625,
      "learning_rate": 2.931788945420058e-07,
      "loss": 0.0679,
      "reward": -0.3639722764492035,
      "reward_std": 0.2881170064210892,
      "rewards/cosine_scaled_reward": -0.18198613449931145,
      "rewards/format_reward": 0.0,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.452,
      "grad_norm": 4.5008225440979,
      "kl": 2.177734375,
      "learning_rate": 2.9060545772359305e-07,
      "loss": 0.087,
      "reward": -0.3515865206718445,
      "reward_std": 0.290123887360096,
      "rewards/cosine_scaled_reward": -0.17579325661063194,
      "rewards/format_reward": 0.0,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.456,
      "grad_norm": 2.7479496002197266,
      "kl": 1.578125,
      "learning_rate": 2.8804466342921987e-07,
      "loss": 0.0632,
      "reward": -0.26583924936130643,
      "reward_std": 0.29539088532328606,
      "rewards/cosine_scaled_reward": -0.13291961723007262,
      "rewards/format_reward": 0.0,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.46,
      "grad_norm": 2.6749367713928223,
      "kl": 2.1796875,
      "learning_rate": 2.854966364683872e-07,
      "loss": 0.087,
      "reward": -0.36106909811496735,
      "reward_std": 0.2982637956738472,
      "rewards/cosine_scaled_reward": -0.18053454905748367,
      "rewards/format_reward": 0.0,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.464,
      "grad_norm": 3.6434812545776367,
      "kl": 1.4482421875,
      "learning_rate": 2.829615010283344e-07,
      "loss": 0.058,
      "reward": -0.35805001854896545,
      "reward_std": 0.31588251888751984,
      "rewards/cosine_scaled_reward": -0.17902500554919243,
      "rewards/format_reward": 0.0,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.468,
      "grad_norm": 2.877927780151367,
      "kl": 1.779296875,
      "learning_rate": 2.8043938066798645e-07,
      "loss": 0.0712,
      "reward": -0.35267870873212814,
      "reward_std": 0.3029713034629822,
      "rewards/cosine_scaled_reward": -0.17633935809135437,
      "rewards/format_reward": 0.0,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.472,
      "grad_norm": 2.9547438621520996,
      "kl": 1.3583984375,
      "learning_rate": 2.7793039831193133e-07,
      "loss": 0.0542,
      "reward": -0.34842824190855026,
      "reward_std": 0.28041965141892433,
      "rewards/cosine_scaled_reward": -0.17421411722898483,
      "rewards/format_reward": 0.0,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.476,
      "grad_norm": 2.4998183250427246,
      "kl": 1.712890625,
      "learning_rate": 2.7543467624442956e-07,
      "loss": 0.0686,
      "reward": -0.34311509132385254,
      "reward_std": 0.3226206302642822,
      "rewards/cosine_scaled_reward": -0.17155754193663597,
      "rewards/format_reward": 0.0,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.48,
      "grad_norm": 3.5822997093200684,
      "kl": 1.2568359375,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.0502,
      "reward": -0.31581661850214005,
      "reward_std": 0.27614113688468933,
      "rewards/cosine_scaled_reward": -0.15790832042694092,
      "rewards/format_reward": 0.0,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.484,
      "grad_norm": 2.638000965118408,
      "kl": 1.658203125,
      "learning_rate": 2.7048349887476037e-07,
      "loss": 0.0663,
      "reward": -0.3658217638731003,
      "reward_std": 0.3533295765519142,
      "rewards/cosine_scaled_reward": -0.18291086703538895,
      "rewards/format_reward": 0.0,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.488,
      "grad_norm": 2.4719886779785156,
      "kl": 1.470703125,
      "learning_rate": 2.6802828488599294e-07,
      "loss": 0.0588,
      "reward": -0.35377567261457443,
      "reward_std": 0.2872357815504074,
      "rewards/cosine_scaled_reward": -0.17688783630728722,
      "rewards/format_reward": 0.0,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.492,
      "grad_norm": 3.820688486099243,
      "kl": 1.65673828125,
      "learning_rate": 2.655868138008171e-07,
      "loss": 0.0662,
      "reward": -0.3673105686903,
      "reward_std": 0.29224705323576927,
      "rewards/cosine_scaled_reward": -0.1836552768945694,
      "rewards/format_reward": 0.0,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.496,
      "grad_norm": 3.1416916847229004,
      "kl": 1.4990234375,
      "learning_rate": 2.631592046130896e-07,
      "loss": 0.06,
      "reward": -0.3574133738875389,
      "reward_std": 0.2663569226861,
      "rewards/cosine_scaled_reward": -0.17870669439435005,
      "rewards/format_reward": 0.0,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.5,
      "grad_norm": 2.3712515830993652,
      "kl": 1.900390625,
      "learning_rate": 2.6074557564105724e-07,
      "loss": 0.0761,
      "reward": -0.34536080807447433,
      "reward_std": 0.3063738942146301,
      "rewards/cosine_scaled_reward": -0.17268040403723717,
      "rewards/format_reward": 0.0,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.504,
      "grad_norm": 2.792006254196167,
      "kl": 1.71875,
      "learning_rate": 2.583460445215911e-07,
      "loss": 0.0688,
      "reward": -0.3458981513977051,
      "reward_std": 0.3039686158299446,
      "rewards/cosine_scaled_reward": -0.17294907197356224,
      "rewards/format_reward": 0.0,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.508,
      "grad_norm": 2.985948085784912,
      "kl": 1.5625,
      "learning_rate": 2.5596072820445254e-07,
      "loss": 0.0625,
      "reward": -0.21606629202142358,
      "reward_std": 0.2749215438961983,
      "rewards/cosine_scaled_reward": -0.10803314973600209,
      "rewards/format_reward": 0.0,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1531.952392578125,
      "epoch": 1.512,
      "grad_norm": 2.396852970123291,
      "kl": 1.9921875,
      "learning_rate": 2.5358974294659373e-07,
      "loss": 0.0823,
      "reward": -0.38127752393484116,
      "reward_std": 0.32172612845897675,
      "rewards/cosine_scaled_reward": -0.19063876569271088,
      "rewards/format_reward": 0.0,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.516,
      "grad_norm": 2.503976345062256,
      "kl": 1.794921875,
      "learning_rate": 2.512332043064913e-07,
      "loss": 0.0718,
      "reward": -0.3479606434702873,
      "reward_std": 0.29174239560961723,
      "rewards/cosine_scaled_reward": -0.17398031428456306,
      "rewards/format_reward": 0.0,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1531.8035888671875,
      "epoch": 1.52,
      "grad_norm": 3.344243049621582,
      "kl": 2.080078125,
      "learning_rate": 2.488912271385139e-07,
      "loss": 0.083,
      "reward": -0.38203170895576477,
      "reward_std": 0.3180833086371422,
      "rewards/cosine_scaled_reward": -0.19101585447788239,
      "rewards/format_reward": 0.0,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.524,
      "grad_norm": 3.5073604583740234,
      "kl": 2.095703125,
      "learning_rate": 2.465639255873246e-07,
      "loss": 0.0837,
      "reward": -0.33683621138334274,
      "reward_std": 0.3141423165798187,
      "rewards/cosine_scaled_reward": -0.16841810569167137,
      "rewards/format_reward": 0.0,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.528,
      "grad_norm": 2.7634477615356445,
      "kl": 2.55859375,
      "learning_rate": 2.4425141308231765e-07,
      "loss": 0.1022,
      "reward": -0.3983701467514038,
      "reward_std": 0.31766583025455475,
      "rewards/cosine_scaled_reward": -0.199185062199831,
      "rewards/format_reward": 0.0,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.532,
      "grad_norm": 3.1601033210754395,
      "kl": 1.486328125,
      "learning_rate": 2.4195380233209006e-07,
      "loss": 0.0594,
      "reward": -0.37120404094457626,
      "reward_std": 0.3172856420278549,
      "rewards/cosine_scaled_reward": -0.18560202419757843,
      "rewards/format_reward": 0.0,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.536,
      "grad_norm": 2.475311040878296,
      "kl": 2.01953125,
      "learning_rate": 2.3967120531894857e-07,
      "loss": 0.0807,
      "reward": -0.3449181020259857,
      "reward_std": 0.3061336353421211,
      "rewards/cosine_scaled_reward": -0.17245905846357346,
      "rewards/format_reward": 0.0,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.54,
      "grad_norm": 3.9638140201568604,
      "kl": 1.6806640625,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.0673,
      "reward": -0.3139965161681175,
      "reward_std": 0.303245909512043,
      "rewards/cosine_scaled_reward": -0.15699823945760727,
      "rewards/format_reward": 0.0,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.544,
      "grad_norm": 3.2407708168029785,
      "kl": 1.89453125,
      "learning_rate": 2.3515149676898552e-07,
      "loss": 0.0757,
      "reward": -0.3049175813794136,
      "reward_std": 0.30845751613378525,
      "rewards/cosine_scaled_reward": -0.1524587944149971,
      "rewards/format_reward": 0.0,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.548,
      "grad_norm": 3.1065189838409424,
      "kl": 1.75390625,
      "learning_rate": 2.3291460551638237e-07,
      "loss": 0.0701,
      "reward": -0.3369733840227127,
      "reward_std": 0.30179525911808014,
      "rewards/cosine_scaled_reward": -0.16848668828606606,
      "rewards/format_reward": 0.0,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.552,
      "grad_norm": 2.6867339611053467,
      "kl": 2.06640625,
      "learning_rate": 2.306931685585657e-07,
      "loss": 0.0826,
      "reward": -0.3339100852608681,
      "reward_std": 0.3043428584933281,
      "rewards/cosine_scaled_reward": -0.16695504263043404,
      "rewards/format_reward": 0.0,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.556,
      "grad_norm": 3.1580567359924316,
      "kl": 2.291015625,
      "learning_rate": 2.2848729416523859e-07,
      "loss": 0.0915,
      "reward": -0.3744669333100319,
      "reward_std": 0.3249610960483551,
      "rewards/cosine_scaled_reward": -0.18723345920443535,
      "rewards/format_reward": 0.0,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.56,
      "grad_norm": 5.407771587371826,
      "kl": 1.609375,
      "learning_rate": 2.2629708984760706e-07,
      "loss": 0.0645,
      "reward": -0.3420454412698746,
      "reward_std": 0.3148321136832237,
      "rewards/cosine_scaled_reward": -0.1710227131843567,
      "rewards/format_reward": 0.0,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.564,
      "grad_norm": 4.492737770080566,
      "kl": 2.275390625,
      "learning_rate": 2.2412266235313973e-07,
      "loss": 0.0909,
      "reward": -0.36313918232917786,
      "reward_std": 0.29535526037216187,
      "rewards/cosine_scaled_reward": -0.18156958371400833,
      "rewards/format_reward": 0.0,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.568,
      "grad_norm": 3.0125086307525635,
      "kl": 2.029296875,
      "learning_rate": 2.2196411766036487e-07,
      "loss": 0.0812,
      "reward": -0.37769585102796555,
      "reward_std": 0.31776873767375946,
      "rewards/cosine_scaled_reward": -0.18884791806340218,
      "rewards/format_reward": 0.0,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.572,
      "grad_norm": 3.134265899658203,
      "kl": 2.47265625,
      "learning_rate": 2.1982156097370557e-07,
      "loss": 0.099,
      "reward": -0.38678842037916183,
      "reward_std": 0.30557621270418167,
      "rewards/cosine_scaled_reward": -0.19339420646429062,
      "rewards/format_reward": 0.0,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.576,
      "grad_norm": 2.9398727416992188,
      "kl": 1.404296875,
      "learning_rate": 2.1769509671835223e-07,
      "loss": 0.0562,
      "reward": -0.3609785735607147,
      "reward_std": 0.29732464998960495,
      "rewards/cosine_scaled_reward": -0.18048929050564766,
      "rewards/format_reward": 0.0,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.58,
      "grad_norm": 2.3901424407958984,
      "kl": 2.3291015625,
      "learning_rate": 2.1558482853517253e-07,
      "loss": 0.093,
      "reward": -0.38430536538362503,
      "reward_std": 0.32753758877515793,
      "rewards/cosine_scaled_reward": -0.19215268269181252,
      "rewards/format_reward": 0.0,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1526.9702453613281,
      "epoch": 1.584,
      "grad_norm": 3.9775447845458984,
      "kl": 2.06640625,
      "learning_rate": 2.134908592756607e-07,
      "loss": 0.0914,
      "reward": -0.33116257190704346,
      "reward_std": 0.2928163409233093,
      "rewards/cosine_scaled_reward": -0.16558128595352173,
      "rewards/format_reward": 0.0,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.588,
      "grad_norm": 2.9975955486297607,
      "kl": 2.318359375,
      "learning_rate": 2.1141329099692406e-07,
      "loss": 0.0928,
      "reward": -0.3710367754101753,
      "reward_std": 0.3226532116532326,
      "rewards/cosine_scaled_reward": -0.18551838770508766,
      "rewards/format_reward": 0.0,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1530.6845397949219,
      "epoch": 1.592,
      "grad_norm": 3.739922046661377,
      "kl": 2.025390625,
      "learning_rate": 2.0935222495670968e-07,
      "loss": 0.0747,
      "reward": -0.3954162746667862,
      "reward_std": 0.3323783427476883,
      "rewards/cosine_scaled_reward": -0.1977081410586834,
      "rewards/format_reward": 0.0,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.596,
      "grad_norm": 2.7063024044036865,
      "kl": 1.0927734375,
      "learning_rate": 2.0730776160846853e-07,
      "loss": 0.0437,
      "reward": -0.3006215952336788,
      "reward_std": 0.27692657709121704,
      "rewards/cosine_scaled_reward": -0.15031079947948456,
      "rewards/format_reward": 0.0,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6,
      "grad_norm": 2.469496726989746,
      "kl": 1.732421875,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.0693,
      "reward": -0.36928267031908035,
      "reward_std": 0.30984392017126083,
      "rewards/cosine_scaled_reward": -0.18464133515954018,
      "rewards/format_reward": 0.0,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1522.3095397949219,
      "epoch": 1.604,
      "grad_norm": 2.855372190475464,
      "kl": 1.845703125,
      "learning_rate": 2.032690407508949e-07,
      "loss": 0.0636,
      "reward": -0.38443852961063385,
      "reward_std": 0.28470365703105927,
      "rewards/cosine_scaled_reward": -0.19221926480531693,
      "rewards/format_reward": 0.0,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.608,
      "grad_norm": 3.3847217559814453,
      "kl": 2.0390625,
      "learning_rate": 2.0127498008311922e-07,
      "loss": 0.0814,
      "reward": -0.3252910152077675,
      "reward_std": 0.2982725724577904,
      "rewards/cosine_scaled_reward": -0.16264550015330315,
      "rewards/format_reward": 0.0,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.612,
      "grad_norm": 3.0226523876190186,
      "kl": 1.81640625,
      "learning_rate": 1.9929791578083655e-07,
      "loss": 0.0727,
      "reward": -0.3527565225958824,
      "reward_std": 0.30437447875738144,
      "rewards/cosine_scaled_reward": -0.1763782650232315,
      "rewards/format_reward": 0.0,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.616,
      "grad_norm": 2.866734743118286,
      "kl": 1.7890625,
      "learning_rate": 1.9733794420337213e-07,
      "loss": 0.0716,
      "reward": -0.3746185079216957,
      "reward_std": 0.3078552633523941,
      "rewards/cosine_scaled_reward": -0.18730924278497696,
      "rewards/format_reward": 0.0,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.62,
      "grad_norm": 3.9170870780944824,
      "kl": 1.970703125,
      "learning_rate": 1.9539516087697517e-07,
      "loss": 0.0789,
      "reward": -0.41533301770687103,
      "reward_std": 0.3027655556797981,
      "rewards/cosine_scaled_reward": -0.20766650885343552,
      "rewards/format_reward": 0.0,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.624,
      "grad_norm": 3.470655679702759,
      "kl": 1.845703125,
      "learning_rate": 1.934696604901642e-07,
      "loss": 0.0738,
      "reward": -0.3191938251256943,
      "reward_std": 0.28303690254688263,
      "rewards/cosine_scaled_reward": -0.15959692373871803,
      "rewards/format_reward": 0.0,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6280000000000001,
      "grad_norm": 3.623340368270874,
      "kl": 1.31640625,
      "learning_rate": 1.915615368891117e-07,
      "loss": 0.0526,
      "reward": -0.3123548626899719,
      "reward_std": 0.29499682784080505,
      "rewards/cosine_scaled_reward": -0.15617743134498596,
      "rewards/format_reward": 0.0,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6320000000000001,
      "grad_norm": 2.282514810562134,
      "kl": 1.267578125,
      "learning_rate": 1.8967088307307e-07,
      "loss": 0.0507,
      "reward": -0.39642050117254257,
      "reward_std": 0.311983872205019,
      "rewards/cosine_scaled_reward": -0.19821025803685188,
      "rewards/format_reward": 0.0,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6360000000000001,
      "grad_norm": 2.5232083797454834,
      "kl": 1.681640625,
      "learning_rate": 1.8779779118983867e-07,
      "loss": 0.0672,
      "reward": -0.33888739347457886,
      "reward_std": 0.28087718039751053,
      "rewards/cosine_scaled_reward": -0.16944369673728943,
      "rewards/format_reward": 0.0,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6400000000000001,
      "grad_norm": 3.886439085006714,
      "kl": 2.09765625,
      "learning_rate": 1.8594235253127372e-07,
      "loss": 0.0838,
      "reward": -0.38627707213163376,
      "reward_std": 0.33190976083278656,
      "rewards/cosine_scaled_reward": -0.19313853234052658,
      "rewards/format_reward": 0.0,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6440000000000001,
      "grad_norm": 3.090627670288086,
      "kl": 2.140625,
      "learning_rate": 1.8410465752883758e-07,
      "loss": 0.0857,
      "reward": -0.3793156296014786,
      "reward_std": 0.30717378109693527,
      "rewards/cosine_scaled_reward": -0.1896577998995781,
      "rewards/format_reward": 0.0,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6480000000000001,
      "grad_norm": 3.867506980895996,
      "kl": 1.880859375,
      "learning_rate": 1.822847957491922e-07,
      "loss": 0.0753,
      "reward": -0.3565782457590103,
      "reward_std": 0.3352038711309433,
      "rewards/cosine_scaled_reward": -0.17828912287950516,
      "rewards/format_reward": 0.0,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6520000000000001,
      "grad_norm": 2.388094902038574,
      "kl": 1.751953125,
      "learning_rate": 1.804828558898332e-07,
      "loss": 0.0701,
      "reward": -0.3393707424402237,
      "reward_std": 0.3029238283634186,
      "rewards/cosine_scaled_reward": -0.16968537122011185,
      "rewards/format_reward": 0.0,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6560000000000001,
      "grad_norm": 2.5263466835021973,
      "kl": 1.748046875,
      "learning_rate": 1.7869892577476722e-07,
      "loss": 0.0698,
      "reward": -0.4274343103170395,
      "reward_std": 0.3449402078986168,
      "rewards/cosine_scaled_reward": -0.21371715888381004,
      "rewards/format_reward": 0.0,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6600000000000001,
      "grad_norm": 2.3268003463745117,
      "kl": 1.400390625,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.0559,
      "reward": -0.3480057269334793,
      "reward_std": 0.29953421652317047,
      "rewards/cosine_scaled_reward": -0.17400285601615906,
      "rewards/format_reward": 0.0,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6640000000000001,
      "grad_norm": 3.2503533363342285,
      "kl": 1.9140625,
      "learning_rate": 1.7518544168045524e-07,
      "loss": 0.0767,
      "reward": -0.36937638372182846,
      "reward_std": 0.31766701489686966,
      "rewards/cosine_scaled_reward": -0.18468819558620453,
      "rewards/format_reward": 0.0,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6680000000000001,
      "grad_norm": 2.9895646572113037,
      "kl": 2.1796875,
      "learning_rate": 1.7345605894346726e-07,
      "loss": 0.0871,
      "reward": -0.3985458239912987,
      "reward_std": 0.33385203033685684,
      "rewards/cosine_scaled_reward": -0.19927291199564934,
      "rewards/format_reward": 0.0,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6720000000000002,
      "grad_norm": 3.2457692623138428,
      "kl": 1.71875,
      "learning_rate": 1.7174502842694212e-07,
      "loss": 0.0687,
      "reward": -0.2603262776392512,
      "reward_std": 0.3040950074791908,
      "rewards/cosine_scaled_reward": -0.13016314181732014,
      "rewards/format_reward": 0.0,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6760000000000002,
      "grad_norm": 2.8391411304473877,
      "kl": 1.798828125,
      "learning_rate": 1.7005243352409333e-07,
      "loss": 0.072,
      "reward": -0.2663672436028719,
      "reward_std": 0.29912005364894867,
      "rewards/cosine_scaled_reward": -0.13318362249992788,
      "rewards/format_reward": 0.0,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6800000000000002,
      "grad_norm": 3.1057238578796387,
      "kl": 1.5,
      "learning_rate": 1.6837835672960831e-07,
      "loss": 0.06,
      "reward": -0.34882377088069916,
      "reward_std": 0.3601520508527756,
      "rewards/cosine_scaled_reward": -0.17441189289093018,
      "rewards/format_reward": 0.0,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.6840000000000002,
      "grad_norm": 2.243816375732422,
      "kl": 1.541015625,
      "learning_rate": 1.6672287963562852e-07,
      "loss": 0.0616,
      "reward": -0.3832622766494751,
      "reward_std": 0.3413049802184105,
      "rewards/cosine_scaled_reward": -0.19163113832473755,
      "rewards/format_reward": 0.0,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.688,
      "grad_norm": 3.76218581199646,
      "kl": 1.880859375,
      "learning_rate": 1.6508608292777203e-07,
      "loss": 0.0752,
      "reward": -0.3700753226876259,
      "reward_std": 0.31324099004268646,
      "rewards/cosine_scaled_reward": -0.18503766134381294,
      "rewards/format_reward": 0.0,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.692,
      "grad_norm": 4.034151554107666,
      "kl": 1.70703125,
      "learning_rate": 1.6346804638120098e-07,
      "loss": 0.0682,
      "reward": -0.29791881144046783,
      "reward_std": 0.2801155336201191,
      "rewards/cosine_scaled_reward": -0.14895940944552422,
      "rewards/format_reward": 0.0,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.696,
      "grad_norm": 3.041618824005127,
      "kl": 1.81640625,
      "learning_rate": 1.6186884885673413e-07,
      "loss": 0.0725,
      "reward": -0.32316526770591736,
      "reward_std": 0.2970619350671768,
      "rewards/cosine_scaled_reward": -0.16158264502882957,
      "rewards/format_reward": 0.0,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.7,
      "grad_norm": 4.081668376922607,
      "kl": 1.4453125,
      "learning_rate": 1.6028856829700258e-07,
      "loss": 0.0576,
      "reward": -0.3476375713944435,
      "reward_std": 0.294509120285511,
      "rewards/cosine_scaled_reward": -0.17381878197193146,
      "rewards/format_reward": 0.0,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.704,
      "grad_norm": 3.166949510574341,
      "kl": 2.1015625,
      "learning_rate": 1.5872728172265146e-07,
      "loss": 0.0841,
      "reward": -0.3467593193054199,
      "reward_std": 0.30388573557138443,
      "rewards/cosine_scaled_reward": -0.17337966337800026,
      "rewards/format_reward": 0.0,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.708,
      "grad_norm": 4.211978435516357,
      "kl": 1.763671875,
      "learning_rate": 1.5718506522858572e-07,
      "loss": 0.0705,
      "reward": -0.3505774810910225,
      "reward_std": 0.30420946329832077,
      "rewards/cosine_scaled_reward": -0.17528874799609184,
      "rewards/format_reward": 0.0,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.712,
      "grad_norm": 4.166502952575684,
      "kl": 2.158203125,
      "learning_rate": 1.5566199398026147e-07,
      "loss": 0.0863,
      "reward": -0.361857570707798,
      "reward_std": 0.30119316279888153,
      "rewards/cosine_scaled_reward": -0.1809287928044796,
      "rewards/format_reward": 0.0,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.716,
      "grad_norm": 2.8889896869659424,
      "kl": 1.8671875,
      "learning_rate": 1.5415814221002265e-07,
      "loss": 0.0745,
      "reward": -0.32126056402921677,
      "reward_std": 0.27691005170345306,
      "rewards/cosine_scaled_reward": -0.16063029691576958,
      "rewards/format_reward": 0.0,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.72,
      "grad_norm": 3.3025801181793213,
      "kl": 1.904296875,
      "learning_rate": 1.5267358321348285e-07,
      "loss": 0.0761,
      "reward": -0.36847078800201416,
      "reward_std": 0.3445659205317497,
      "rewards/cosine_scaled_reward": -0.18423539400100708,
      "rewards/format_reward": 0.0,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.724,
      "grad_norm": 3.0440969467163086,
      "kl": 1.75,
      "learning_rate": 1.5120838934595337e-07,
      "loss": 0.07,
      "reward": -0.36113734543323517,
      "reward_std": 0.3412683606147766,
      "rewards/cosine_scaled_reward": -0.18056866899132729,
      "rewards/format_reward": 0.0,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1530.952392578125,
      "epoch": 1.728,
      "grad_norm": 2.575627326965332,
      "kl": 1.689453125,
      "learning_rate": 1.4976263201891613e-07,
      "loss": 0.0716,
      "reward": -0.3095761463046074,
      "reward_std": 0.32323335483670235,
      "rewards/cosine_scaled_reward": -0.1547880806028843,
      "rewards/format_reward": 0.0,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.732,
      "grad_norm": 3.186289072036743,
      "kl": 1.91015625,
      "learning_rate": 1.483363816965435e-07,
      "loss": 0.0765,
      "reward": -0.39015311002731323,
      "reward_std": 0.3067055642604828,
      "rewards/cosine_scaled_reward": -0.19507654383778572,
      "rewards/format_reward": 0.0,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.736,
      "grad_norm": 3.0739073753356934,
      "kl": 2.369140625,
      "learning_rate": 1.469297078922642e-07,
      "loss": 0.0946,
      "reward": -0.29091550246812403,
      "reward_std": 0.30687109380960464,
      "rewards/cosine_scaled_reward": -0.14545774972066283,
      "rewards/format_reward": 0.0,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.74,
      "grad_norm": 5.0029778480529785,
      "kl": 1.759765625,
      "learning_rate": 1.4554267916537495e-07,
      "loss": 0.0703,
      "reward": -0.34431006759405136,
      "reward_std": 0.27501973509788513,
      "rewards/cosine_scaled_reward": -0.17215503007173538,
      "rewards/format_reward": 0.0,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.744,
      "grad_norm": 5.139548301696777,
      "kl": 1.8203125,
      "learning_rate": 1.4417536311769885e-07,
      "loss": 0.0728,
      "reward": -0.31318235397338867,
      "reward_std": 0.2976163923740387,
      "rewards/cosine_scaled_reward": -0.15659117698669434,
      "rewards/format_reward": 0.0,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.748,
      "grad_norm": 2.881143808364868,
      "kl": 1.626953125,
      "learning_rate": 1.4282782639029128e-07,
      "loss": 0.065,
      "reward": -0.3547092080116272,
      "reward_std": 0.28170817345380783,
      "rewards/cosine_scaled_reward": -0.1773546040058136,
      "rewards/format_reward": 0.0,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.752,
      "grad_norm": 2.4268362522125244,
      "kl": 1.9609375,
      "learning_rate": 1.4150013466019114e-07,
      "loss": 0.0786,
      "reward": -0.3464732989668846,
      "reward_std": 0.3199189677834511,
      "rewards/cosine_scaled_reward": -0.173236645758152,
      "rewards/format_reward": 0.0,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.756,
      "grad_norm": 2.686417579650879,
      "kl": 2.318359375,
      "learning_rate": 1.4019235263722034e-07,
      "loss": 0.0926,
      "reward": -0.3557046577334404,
      "reward_std": 0.3187018297612667,
      "rewards/cosine_scaled_reward": -0.1778523214161396,
      "rewards/format_reward": 0.0,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.76,
      "grad_norm": 4.9666876792907715,
      "kl": 1.4619140625,
      "learning_rate": 1.3890454406082956e-07,
      "loss": 0.0584,
      "reward": -0.3234737552702427,
      "reward_std": 0.32776766270399094,
      "rewards/cosine_scaled_reward": -0.16173688508570194,
      "rewards/format_reward": 0.0,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.764,
      "grad_norm": 4.106746196746826,
      "kl": 2.49609375,
      "learning_rate": 1.3763677169699217e-07,
      "loss": 0.0999,
      "reward": -0.4192545562982559,
      "reward_std": 0.33375757187604904,
      "rewards/cosine_scaled_reward": -0.20962728559970856,
      "rewards/format_reward": 0.0,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1534.8690490722656,
      "epoch": 1.768,
      "grad_norm": 2.842816114425659,
      "kl": 2.2578125,
      "learning_rate": 1.3638909733514452e-07,
      "loss": 0.0898,
      "reward": -0.3652210012078285,
      "reward_std": 0.3345082625746727,
      "rewards/cosine_scaled_reward": -0.18261050805449486,
      "rewards/format_reward": 0.0,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.772,
      "grad_norm": 3.186333179473877,
      "kl": 2.375,
      "learning_rate": 1.351615817851748e-07,
      "loss": 0.0947,
      "reward": -0.40324729681015015,
      "reward_std": 0.32466883957386017,
      "rewards/cosine_scaled_reward": -0.20162366330623627,
      "rewards/format_reward": 0.0,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.776,
      "grad_norm": 4.4096360206604,
      "kl": 2.99609375,
      "learning_rate": 1.3395428487445914e-07,
      "loss": 0.1197,
      "reward": -0.3327697291970253,
      "reward_std": 0.3282741829752922,
      "rewards/cosine_scaled_reward": -0.16638486459851265,
      "rewards/format_reward": 0.0,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.78,
      "grad_norm": 2.8214669227600098,
      "kl": 1.8623046875,
      "learning_rate": 1.3276726544494571e-07,
      "loss": 0.0746,
      "reward": -0.39069636911153793,
      "reward_std": 0.33478184044361115,
      "rewards/cosine_scaled_reward": -0.19534818828105927,
      "rewards/format_reward": 0.0,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.784,
      "grad_norm": 2.96333646774292,
      "kl": 1.828125,
      "learning_rate": 1.316005813502869e-07,
      "loss": 0.073,
      "reward": -0.34233053401112556,
      "reward_std": 0.30314670503139496,
      "rewards/cosine_scaled_reward": -0.17116525955498219,
      "rewards/format_reward": 0.0,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.788,
      "grad_norm": 2.538837432861328,
      "kl": 1.615234375,
      "learning_rate": 1.3045428945301953e-07,
      "loss": 0.0647,
      "reward": -0.2668099580332637,
      "reward_std": 0.3087245300412178,
      "rewards/cosine_scaled_reward": -0.1334049835568294,
      "rewards/format_reward": 0.0,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.792,
      "grad_norm": 6.922802925109863,
      "kl": 1.9208984375,
      "learning_rate": 1.2932844562179352e-07,
      "loss": 0.0768,
      "reward": -0.3690221831202507,
      "reward_std": 0.3130299560725689,
      "rewards/cosine_scaled_reward": -0.18451109528541565,
      "rewards/format_reward": 0.0,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.796,
      "grad_norm": 3.2286629676818848,
      "kl": 1.990234375,
      "learning_rate": 1.2822310472864885e-07,
      "loss": 0.0795,
      "reward": -0.32342398166656494,
      "reward_std": 0.3065089136362076,
      "rewards/cosine_scaled_reward": -0.16171199083328247,
      "rewards/format_reward": 0.0,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8,
      "grad_norm": 3.7653493881225586,
      "kl": 1.904296875,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.0763,
      "reward": -0.4029879495501518,
      "reward_std": 0.31490693986415863,
      "rewards/cosine_scaled_reward": -0.2014939747750759,
      "rewards/format_reward": 0.0,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.804,
      "grad_norm": 3.4150803089141846,
      "kl": 2.150390625,
      "learning_rate": 1.260741462457165e-07,
      "loss": 0.086,
      "reward": -0.3429009020328522,
      "reward_std": 0.29108157753944397,
      "rewards/cosine_scaled_reward": -0.1714504510164261,
      "rewards/format_reward": 0.0,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.808,
      "grad_norm": 4.145492076873779,
      "kl": 2.2421875,
      "learning_rate": 1.2503063339313356e-07,
      "loss": 0.0897,
      "reward": -0.42198269814252853,
      "reward_std": 0.3363164961338043,
      "rewards/cosine_scaled_reward": -0.21099134907126427,
      "rewards/format_reward": 0.0,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.812,
      "grad_norm": 4.779297351837158,
      "kl": 2.228515625,
      "learning_rate": 1.2400783294793668e-07,
      "loss": 0.0891,
      "reward": -0.3492959663271904,
      "reward_std": 0.2949202358722687,
      "rewards/cosine_scaled_reward": -0.1746479757130146,
      "rewards/format_reward": 0.0,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8159999999999998,
      "grad_norm": 2.905301570892334,
      "kl": 1.265625,
      "learning_rate": 1.2300579475997657e-07,
      "loss": 0.0506,
      "reward": -0.2935212664306164,
      "reward_std": 0.26374514773488045,
      "rewards/cosine_scaled_reward": -0.1467606294900179,
      "rewards/format_reward": 0.0,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8199999999999998,
      "grad_norm": 2.7079851627349854,
      "kl": 2.1337890625,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.0853,
      "reward": -0.3475092798471451,
      "reward_std": 0.30007384717464447,
      "rewards/cosine_scaled_reward": -0.17375463247299194,
      "rewards/format_reward": 0.0,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8239999999999998,
      "grad_norm": 2.6113271713256836,
      "kl": 1.6376953125,
      "learning_rate": 1.2106419949317388e-07,
      "loss": 0.0654,
      "reward": -0.330677293241024,
      "reward_std": 0.3133997842669487,
      "rewards/cosine_scaled_reward": -0.1653386428952217,
      "rewards/format_reward": 0.0,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8279999999999998,
      "grad_norm": 2.7393922805786133,
      "kl": 1.666015625,
      "learning_rate": 1.2012473704494537e-07,
      "loss": 0.0668,
      "reward": -0.3434924744069576,
      "reward_std": 0.3196050524711609,
      "rewards/cosine_scaled_reward": -0.17174622975289822,
      "rewards/format_reward": 0.0,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8319999999999999,
      "grad_norm": 4.49023962020874,
      "kl": 2.34375,
      "learning_rate": 1.1920622611056974e-07,
      "loss": 0.0938,
      "reward": -0.34944383054971695,
      "reward_std": 0.3238733857870102,
      "rewards/cosine_scaled_reward": -0.17472190782427788,
      "rewards/format_reward": 0.0,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8359999999999999,
      "grad_norm": 2.3561832904815674,
      "kl": 1.4501953125,
      "learning_rate": 1.1830871145697412e-07,
      "loss": 0.0579,
      "reward": -0.3565739244222641,
      "reward_std": 0.3099294453859329,
      "rewards/cosine_scaled_reward": -0.17828696221113205,
      "rewards/format_reward": 0.0,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8399999999999999,
      "grad_norm": 3.1239490509033203,
      "kl": 1.8984375,
      "learning_rate": 1.1743223682775649e-07,
      "loss": 0.0759,
      "reward": -0.3478566035628319,
      "reward_std": 0.28794750943779945,
      "rewards/cosine_scaled_reward": -0.17392829060554504,
      "rewards/format_reward": 0.0,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8439999999999999,
      "grad_norm": 2.673818826675415,
      "kl": 1.740234375,
      "learning_rate": 1.1657684494105386e-07,
      "loss": 0.0695,
      "reward": -0.339593730866909,
      "reward_std": 0.3045819625258446,
      "rewards/cosine_scaled_reward": -0.1697968691587448,
      "rewards/format_reward": 0.0,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8479999999999999,
      "grad_norm": 3.220402479171753,
      "kl": 1.626953125,
      "learning_rate": 1.1574257748745986e-07,
      "loss": 0.0651,
      "reward": -0.36886321753263474,
      "reward_std": 0.26985886320471764,
      "rewards/cosine_scaled_reward": -0.18443159759044647,
      "rewards/format_reward": 0.0,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1530.4642944335938,
      "epoch": 1.8519999999999999,
      "grad_norm": 2.8002877235412598,
      "kl": 2.23828125,
      "learning_rate": 1.1492947512799328e-07,
      "loss": 0.0941,
      "reward": -0.31243710219860077,
      "reward_std": 0.3104839473962784,
      "rewards/cosine_scaled_reward": -0.15621854737401009,
      "rewards/format_reward": 0.0,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8559999999999999,
      "grad_norm": 3.3076934814453125,
      "kl": 2.455078125,
      "learning_rate": 1.1413757749211602e-07,
      "loss": 0.098,
      "reward": -0.3408735916018486,
      "reward_std": 0.3259742558002472,
      "rewards/cosine_scaled_reward": -0.1704367958009243,
      "rewards/format_reward": 0.0,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8599999999999999,
      "grad_norm": 4.302088737487793,
      "kl": 2.005859375,
      "learning_rate": 1.1336692317580158e-07,
      "loss": 0.0802,
      "reward": -0.3594956621527672,
      "reward_std": 0.32260415703058243,
      "rewards/cosine_scaled_reward": -0.1797478273510933,
      "rewards/format_reward": 0.0,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8639999999999999,
      "grad_norm": 4.171574115753174,
      "kl": 2.490234375,
      "learning_rate": 1.1261754973965422e-07,
      "loss": 0.0995,
      "reward": -0.3996199369430542,
      "reward_std": 0.30815524607896805,
      "rewards/cosine_scaled_reward": -0.1998099721968174,
      "rewards/format_reward": 0.0,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8679999999999999,
      "grad_norm": 3.7009289264678955,
      "kl": 1.841796875,
      "learning_rate": 1.1188949370707787e-07,
      "loss": 0.0738,
      "reward": -0.3371664360165596,
      "reward_std": 0.3329595774412155,
      "rewards/cosine_scaled_reward": -0.1685832180082798,
      "rewards/format_reward": 0.0,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.8719999999999999,
      "grad_norm": 2.592533826828003,
      "kl": 2.251953125,
      "learning_rate": 1.1118279056249653e-07,
      "loss": 0.0901,
      "reward": -0.34844203293323517,
      "reward_std": 0.322611540555954,
      "rewards/cosine_scaled_reward": -0.1742210052907467,
      "rewards/format_reward": 0.0,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.876,
      "grad_norm": 4.633761405944824,
      "kl": 1.64013671875,
      "learning_rate": 1.1049747474962444e-07,
      "loss": 0.0656,
      "reward": -0.3193807154893875,
      "reward_std": 0.26448768377304077,
      "rewards/cosine_scaled_reward": -0.15969035774469376,
      "rewards/format_reward": 0.0,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.88,
      "grad_norm": 3.101719617843628,
      "kl": 2.033203125,
      "learning_rate": 1.0983357966978745e-07,
      "loss": 0.0812,
      "reward": -0.3662792518734932,
      "reward_std": 0.32248761504888535,
      "rewards/cosine_scaled_reward": -0.1831396110355854,
      "rewards/format_reward": 0.0,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.884,
      "grad_norm": 2.580354690551758,
      "kl": 1.607421875,
      "learning_rate": 1.0919113768029517e-07,
      "loss": 0.0643,
      "reward": -0.34900667518377304,
      "reward_std": 0.31430666893720627,
      "rewards/cosine_scaled_reward": -0.17450333759188652,
      "rewards/format_reward": 0.0,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.888,
      "grad_norm": 2.7384796142578125,
      "kl": 1.8046875,
      "learning_rate": 1.0857018009286381e-07,
      "loss": 0.0722,
      "reward": -0.32778534665703773,
      "reward_std": 0.3321828171610832,
      "rewards/cosine_scaled_reward": -0.16389267705380917,
      "rewards/format_reward": 0.0,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.892,
      "grad_norm": 3.759181499481201,
      "kl": 2.017578125,
      "learning_rate": 1.0797073717209013e-07,
      "loss": 0.0807,
      "reward": -0.32047825306653976,
      "reward_std": 0.28816820681095123,
      "rewards/cosine_scaled_reward": -0.16023912653326988,
      "rewards/format_reward": 0.0,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.896,
      "grad_norm": 2.8909876346588135,
      "kl": 1.603515625,
      "learning_rate": 1.0739283813397639e-07,
      "loss": 0.0642,
      "reward": -0.3390325605869293,
      "reward_std": 0.3011201545596123,
      "rewards/cosine_scaled_reward": -0.16951627284288406,
      "rewards/format_reward": 0.0,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.9,
      "grad_norm": 2.3281497955322266,
      "kl": 2.0234375,
      "learning_rate": 1.068365111445064e-07,
      "loss": 0.081,
      "reward": -0.36704741418361664,
      "reward_std": 0.3062589168548584,
      "rewards/cosine_scaled_reward": -0.18352371081709862,
      "rewards/format_reward": 0.0,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.904,
      "grad_norm": 3.56882643699646,
      "kl": 2.515625,
      "learning_rate": 1.063017833182728e-07,
      "loss": 0.1008,
      "reward": -0.39511261135339737,
      "reward_std": 0.3128170743584633,
      "rewards/cosine_scaled_reward": -0.19755630940198898,
      "rewards/format_reward": 0.0,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.908,
      "grad_norm": 2.958406925201416,
      "kl": 1.755859375,
      "learning_rate": 1.0578868071715544e-07,
      "loss": 0.0702,
      "reward": -0.3462023660540581,
      "reward_std": 0.322578527033329,
      "rewards/cosine_scaled_reward": -0.17310118675231934,
      "rewards/format_reward": 0.0,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.912,
      "grad_norm": 3.044797897338867,
      "kl": 2.375,
      "learning_rate": 1.0529722834905125e-07,
      "loss": 0.095,
      "reward": -0.3144143670797348,
      "reward_std": 0.29915551096200943,
      "rewards/cosine_scaled_reward": -0.1572071835398674,
      "rewards/format_reward": 0.0,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.916,
      "grad_norm": 4.031872272491455,
      "kl": 2.640625,
      "learning_rate": 1.0482745016665526e-07,
      "loss": 0.1057,
      "reward": -0.3763216808438301,
      "reward_std": 0.3211255893111229,
      "rewards/cosine_scaled_reward": -0.18816084042191505,
      "rewards/format_reward": 0.0,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.92,
      "grad_norm": 2.3054392337799072,
      "kl": 1.3173828125,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.0528,
      "reward": -0.2678487957455218,
      "reward_std": 0.2627658285200596,
      "rewards/cosine_scaled_reward": -0.13392440509051085,
      "rewards/format_reward": 0.0,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.924,
      "grad_norm": 3.41572642326355,
      "kl": 1.353515625,
      "learning_rate": 1.0395300688680625e-07,
      "loss": 0.0541,
      "reward": -0.35157452523708344,
      "reward_std": 0.3239835053682327,
      "rewards/cosine_scaled_reward": -0.17578726634383202,
      "rewards/format_reward": 0.0,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.928,
      "grad_norm": 2.691436290740967,
      "kl": 2.041015625,
      "learning_rate": 1.0354838440848501e-07,
      "loss": 0.0816,
      "reward": -0.39503272622823715,
      "reward_std": 0.3050593361258507,
      "rewards/cosine_scaled_reward": -0.19751636311411858,
      "rewards/format_reward": 0.0,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.932,
      "grad_norm": 2.859536647796631,
      "kl": 1.494140625,
      "learning_rate": 1.0316552135205837e-07,
      "loss": 0.0599,
      "reward": -0.395970955491066,
      "reward_std": 0.27583859115839005,
      "rewards/cosine_scaled_reward": -0.197985477745533,
      "rewards/format_reward": 0.0,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.936,
      "grad_norm": 2.9280340671539307,
      "kl": 1.765625,
      "learning_rate": 1.0280443637773163e-07,
      "loss": 0.0708,
      "reward": -0.2913724035024643,
      "reward_std": 0.2617946192622185,
      "rewards/cosine_scaled_reward": -0.14568619430065155,
      "rewards/format_reward": 0.0,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.94,
      "grad_norm": 2.2830445766448975,
      "kl": 1.2158203125,
      "learning_rate": 1.0246514708427701e-07,
      "loss": 0.0487,
      "reward": -0.3095552623271942,
      "reward_std": 0.292842835187912,
      "rewards/cosine_scaled_reward": -0.1547776274383068,
      "rewards/format_reward": 0.0,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.944,
      "grad_norm": 3.141052007675171,
      "kl": 1.3427734375,
      "learning_rate": 1.0214767000817596e-07,
      "loss": 0.0537,
      "reward": -0.32299425452947617,
      "reward_std": 0.29863065481185913,
      "rewards/cosine_scaled_reward": -0.16149712353944778,
      "rewards/format_reward": 0.0,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.948,
      "grad_norm": 3.97387433052063,
      "kl": 1.931640625,
      "learning_rate": 1.0185202062281336e-07,
      "loss": 0.0773,
      "reward": -0.3765959292650223,
      "reward_std": 0.3192542716860771,
      "rewards/cosine_scaled_reward": -0.18829796463251114,
      "rewards/format_reward": 0.0,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.952,
      "grad_norm": 2.656202554702759,
      "kl": 1.578125,
      "learning_rate": 1.0157821333772304e-07,
      "loss": 0.0631,
      "reward": -0.31205643340945244,
      "reward_std": 0.31670553237199783,
      "rewards/cosine_scaled_reward": -0.15602822043001652,
      "rewards/format_reward": 0.0,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.956,
      "grad_norm": 3.296848773956299,
      "kl": 1.16796875,
      "learning_rate": 1.013262614978859e-07,
      "loss": 0.0468,
      "reward": -0.3039631359279156,
      "reward_std": 0.27847766503691673,
      "rewards/cosine_scaled_reward": -0.15198157727718353,
      "rewards/format_reward": 0.0,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.96,
      "grad_norm": 4.522839546203613,
      "kl": 1.8203125,
      "learning_rate": 1.0109617738307911e-07,
      "loss": 0.0728,
      "reward": -0.34008362144231796,
      "reward_std": 0.29262910783290863,
      "rewards/cosine_scaled_reward": -0.17004182189702988,
      "rewards/format_reward": 0.0,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.964,
      "grad_norm": 2.311014175415039,
      "kl": 2.244140625,
      "learning_rate": 1.0088797220727779e-07,
      "loss": 0.0898,
      "reward": -0.34849604219198227,
      "reward_std": 0.3044138178229332,
      "rewards/cosine_scaled_reward": -0.17424802854657173,
      "rewards/format_reward": 0.0,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.968,
      "grad_norm": 2.6442465782165527,
      "kl": 1.998046875,
      "learning_rate": 1.0070165611810855e-07,
      "loss": 0.0799,
      "reward": -0.34308964014053345,
      "reward_std": 0.3727850690484047,
      "rewards/cosine_scaled_reward": -0.17154482379555702,
      "rewards/format_reward": 0.0,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.972,
      "grad_norm": 2.6985509395599365,
      "kl": 1.41796875,
      "learning_rate": 1.005372381963547e-07,
      "loss": 0.0567,
      "reward": -0.3521110415458679,
      "reward_std": 0.30227896198630333,
      "rewards/cosine_scaled_reward": -0.17605552449822426,
      "rewards/format_reward": 0.0,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.976,
      "grad_norm": 3.240550994873047,
      "kl": 1.904296875,
      "learning_rate": 1.0039472645551372e-07,
      "loss": 0.076,
      "reward": -0.3422994837164879,
      "reward_std": 0.3251089081168175,
      "rewards/cosine_scaled_reward": -0.17114974185824394,
      "rewards/format_reward": 0.0,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.98,
      "grad_norm": 4.803572177886963,
      "kl": 3.177734375,
      "learning_rate": 1.002741278414069e-07,
      "loss": 0.1272,
      "reward": -0.3737839311361313,
      "reward_std": 0.3232840970158577,
      "rewards/cosine_scaled_reward": -0.18689196929335594,
      "rewards/format_reward": 0.0,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.984,
      "grad_norm": 2.532582998275757,
      "kl": 1.9375,
      "learning_rate": 1.0017544823184055e-07,
      "loss": 0.0776,
      "reward": -0.374487929046154,
      "reward_std": 0.32537975162267685,
      "rewards/cosine_scaled_reward": -0.187243964523077,
      "rewards/format_reward": 0.0,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.988,
      "grad_norm": 2.6129701137542725,
      "kl": 2.2734375,
      "learning_rate": 1.0009869243631952e-07,
      "loss": 0.091,
      "reward": -0.3434004709124565,
      "reward_std": 0.32708871364593506,
      "rewards/cosine_scaled_reward": -0.17170023545622826,
      "rewards/format_reward": 0.0,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.992,
      "grad_norm": 4.10455322265625,
      "kl": 1.595703125,
      "learning_rate": 1.000438641958131e-07,
      "loss": 0.0638,
      "reward": -0.3211556486785412,
      "reward_std": 0.2905324958264828,
      "rewards/cosine_scaled_reward": -0.1605778243392706,
      "rewards/format_reward": 0.0,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0,
      "epoch": 1.996,
      "grad_norm": 2.7520267963409424,
      "kl": 1.892578125,
      "learning_rate": 1.0001096618257236e-07,
      "loss": 0.0756,
      "reward": -0.35309676826000214,
      "reward_std": 0.31401190161705017,
      "rewards/cosine_scaled_reward": -0.17654838413000107,
      "rewards/format_reward": 0.0,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1536.0001220703125,
      "epoch": 2.0,
      "grad_norm": 2.9658398628234863,
      "kl": 1.7763671875,
      "learning_rate": 1e-07,
      "loss": 0.0711,
      "reward": -0.343311108648777,
      "reward_std": 0.28952478244900703,
      "rewards/cosine_scaled_reward": -0.1716555580496788,
      "rewards/format_reward": 0.0,
      "step": 500
    },
    {
      "epoch": 2.0,
      "step": 500,
      "total_flos": 0.0,
      "train_loss": 0.05846181693652478,
      "train_runtime": 107214.2293,
      "train_samples_per_second": 0.783,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}