Phi-1.5B-IFT-Math-Openrs / trainer_state.json
advaithc's picture
Model save
8234cfb verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.004,
"grad_norm": 4.214743137359619,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": -0.0,
"reward": -0.572140134871006,
"reward_std": 0.3359133452177048,
"rewards/cosine_scaled_reward": -0.286070067435503,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.008,
"grad_norm": 3.178635597229004,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": -0.0,
"reward": -0.6001544743776321,
"reward_std": 0.33404429256916046,
"rewards/cosine_scaled_reward": -0.30007724463939667,
"rewards/format_reward": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.012,
"grad_norm": 4.78328800201416,
"kl": 6.908178329467773e-05,
"learning_rate": 6e-08,
"loss": 0.0,
"reward": -0.502997636795044,
"reward_std": 0.3310435339808464,
"rewards/cosine_scaled_reward": -0.251498818397522,
"rewards/format_reward": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.016,
"grad_norm": 3.9194376468658447,
"kl": 6.488710641860962e-05,
"learning_rate": 8e-08,
"loss": 0.0,
"reward": -0.5549568086862564,
"reward_std": 0.3469474986195564,
"rewards/cosine_scaled_reward": -0.2774783968925476,
"rewards/format_reward": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.02,
"grad_norm": 3.903712511062622,
"kl": 5.97834587097168e-05,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": -0.5800392031669617,
"reward_std": 0.35274410992860794,
"rewards/cosine_scaled_reward": -0.29001960158348083,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.024,
"grad_norm": 3.738009452819824,
"kl": 6.499886512756348e-05,
"learning_rate": 1.2e-07,
"loss": 0.0,
"reward": -0.5155884921550751,
"reward_std": 0.37037966400384903,
"rewards/cosine_scaled_reward": -0.25779424607753754,
"rewards/format_reward": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.028,
"grad_norm": 2.794049024581909,
"kl": 5.620718002319336e-05,
"learning_rate": 1.4e-07,
"loss": 0.0,
"reward": -0.5175943374633789,
"reward_std": 0.3494645953178406,
"rewards/cosine_scaled_reward": -0.25879716128110886,
"rewards/format_reward": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.032,
"grad_norm": 2.484722852706909,
"kl": 8.106231689453125e-05,
"learning_rate": 1.6e-07,
"loss": 0.0,
"reward": -0.5301882103085518,
"reward_std": 0.3405821621417999,
"rewards/cosine_scaled_reward": -0.2650941051542759,
"rewards/format_reward": 0.0,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.036,
"grad_norm": 3.1448230743408203,
"kl": 7.554888725280762e-05,
"learning_rate": 1.8e-07,
"loss": 0.0,
"reward": -0.5024237409234047,
"reward_std": 0.3572370335459709,
"rewards/cosine_scaled_reward": -0.25121185183525085,
"rewards/format_reward": 0.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.04,
"grad_norm": 4.125906944274902,
"kl": 8.666515350341797e-05,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": -0.5732719898223877,
"reward_std": 0.37079156190156937,
"rewards/cosine_scaled_reward": -0.28663600236177444,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.044,
"grad_norm": 4.4225945472717285,
"kl": 5.561113357543945e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"reward": -0.5889493525028229,
"reward_std": 0.3473696708679199,
"rewards/cosine_scaled_reward": -0.29447468370199203,
"rewards/format_reward": 0.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.048,
"grad_norm": 3.891627550125122,
"kl": 7.808208465576172e-05,
"learning_rate": 2.4e-07,
"loss": 0.0,
"reward": -0.5409628972411156,
"reward_std": 0.326653391122818,
"rewards/cosine_scaled_reward": -0.2704814486205578,
"rewards/format_reward": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.052,
"grad_norm": 3.552539587020874,
"kl": 7.30752944946289e-05,
"learning_rate": 2.6e-07,
"loss": 0.0,
"reward": -0.5389444306492805,
"reward_std": 0.3649257719516754,
"rewards/cosine_scaled_reward": -0.2694722190499306,
"rewards/format_reward": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.056,
"grad_norm": 2.781034231185913,
"kl": 7.081031799316406e-05,
"learning_rate": 2.8e-07,
"loss": 0.0,
"reward": -0.6049635112285614,
"reward_std": 0.3185788542032242,
"rewards/cosine_scaled_reward": -0.3024817630648613,
"rewards/format_reward": 0.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.06,
"grad_norm": 3.412130355834961,
"kl": 6.335973739624023e-05,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": -0.6299380213022232,
"reward_std": 0.31315718591213226,
"rewards/cosine_scaled_reward": -0.3149690255522728,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.064,
"grad_norm": 4.064192771911621,
"kl": 0.00011527538299560547,
"learning_rate": 3.2e-07,
"loss": 0.0,
"reward": -0.5638149380683899,
"reward_std": 0.3539445400238037,
"rewards/cosine_scaled_reward": -0.28190746903419495,
"rewards/format_reward": 0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.068,
"grad_norm": 3.5826501846313477,
"kl": 9.000301361083984e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"reward": -0.5815131217241287,
"reward_std": 0.3570765480399132,
"rewards/cosine_scaled_reward": -0.29075656831264496,
"rewards/format_reward": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.072,
"grad_norm": 3.4398193359375,
"kl": 0.00013589859008789062,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": -0.5058030858635902,
"reward_std": 0.3534058630466461,
"rewards/cosine_scaled_reward": -0.2529015429317951,
"rewards/format_reward": 0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.076,
"grad_norm": 3.1647567749023438,
"kl": 0.00010588765144348145,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"reward": -0.5453799739480019,
"reward_std": 0.3434706851840019,
"rewards/cosine_scaled_reward": -0.27268998324871063,
"rewards/format_reward": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.08,
"grad_norm": 4.028233528137207,
"kl": 0.00011265277862548828,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": -0.5725424438714981,
"reward_std": 0.33554956316947937,
"rewards/cosine_scaled_reward": -0.28627122938632965,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.084,
"grad_norm": 3.0403409004211426,
"kl": 0.00015485286712646484,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"reward": -0.5395064353942871,
"reward_std": 0.3414423242211342,
"rewards/cosine_scaled_reward": -0.26975322514772415,
"rewards/format_reward": 0.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.088,
"grad_norm": 3.5831127166748047,
"kl": 0.0006537437438964844,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"reward": -0.5216317698359489,
"reward_std": 0.3427959829568863,
"rewards/cosine_scaled_reward": -0.2608158737421036,
"rewards/format_reward": 0.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.092,
"grad_norm": 3.5175235271453857,
"kl": 0.0010776519775390625,
"learning_rate": 4.6e-07,
"loss": 0.0,
"reward": -0.5413709655404091,
"reward_std": 0.32718800008296967,
"rewards/cosine_scaled_reward": -0.27068548277020454,
"rewards/format_reward": 0.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.096,
"grad_norm": 3.442873239517212,
"kl": 0.0013303756713867188,
"learning_rate": 4.8e-07,
"loss": 0.0001,
"reward": -0.5624926462769508,
"reward_std": 0.3581688553094864,
"rewards/cosine_scaled_reward": -0.2812463231384754,
"rewards/format_reward": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.1,
"grad_norm": 2.6114015579223633,
"kl": 0.0016193389892578125,
"learning_rate": 5e-07,
"loss": 0.0001,
"reward": -0.5309188961982727,
"reward_std": 0.33032629638910294,
"rewards/cosine_scaled_reward": -0.26545944809913635,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.104,
"grad_norm": 4.818567752838135,
"kl": 0.0026264190673828125,
"learning_rate": 5.2e-07,
"loss": 0.0001,
"reward": -0.5884083956480026,
"reward_std": 0.3386874794960022,
"rewards/cosine_scaled_reward": -0.2942042052745819,
"rewards/format_reward": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.108,
"grad_norm": 4.078734397888184,
"kl": 0.002239227294921875,
"learning_rate": 5.4e-07,
"loss": 0.0001,
"reward": -0.6157089024782181,
"reward_std": 0.3308729752898216,
"rewards/cosine_scaled_reward": -0.30785445868968964,
"rewards/format_reward": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.112,
"grad_norm": 3.4599478244781494,
"kl": 0.002338409423828125,
"learning_rate": 5.6e-07,
"loss": 0.0001,
"reward": -0.5709060430526733,
"reward_std": 0.3136204034090042,
"rewards/cosine_scaled_reward": -0.28545302152633667,
"rewards/format_reward": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1533.9464416503906,
"epoch": 0.116,
"grad_norm": 3.461718797683716,
"kl": 0.003444671630859375,
"learning_rate": 5.8e-07,
"loss": -0.001,
"reward": -0.5237472280859947,
"reward_std": 0.3601622208952904,
"rewards/cosine_scaled_reward": -0.26187360659241676,
"rewards/format_reward": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.12,
"grad_norm": 3.7205333709716797,
"kl": 0.00542449951171875,
"learning_rate": 6e-07,
"loss": 0.0002,
"reward": -0.5595864206552505,
"reward_std": 0.3391585499048233,
"rewards/cosine_scaled_reward": -0.2797932103276253,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.124,
"grad_norm": 3.639012575149536,
"kl": 0.0102996826171875,
"learning_rate": 6.2e-07,
"loss": 0.0004,
"reward": -0.5832120478153229,
"reward_std": 0.34403981268405914,
"rewards/cosine_scaled_reward": -0.29160603135824203,
"rewards/format_reward": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.128,
"grad_norm": 3.499258041381836,
"kl": 0.0159149169921875,
"learning_rate": 6.4e-07,
"loss": 0.0006,
"reward": -0.5567401573061943,
"reward_std": 0.3353060856461525,
"rewards/cosine_scaled_reward": -0.27837007120251656,
"rewards/format_reward": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.132,
"grad_norm": 3.564453601837158,
"kl": 0.0182952880859375,
"learning_rate": 6.6e-07,
"loss": 0.0007,
"reward": -0.5521366372704506,
"reward_std": 0.3413034975528717,
"rewards/cosine_scaled_reward": -0.2760683260858059,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.136,
"grad_norm": 3.567174196243286,
"kl": 0.0237274169921875,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0009,
"reward": -0.5193822234869003,
"reward_std": 0.35690775513648987,
"rewards/cosine_scaled_reward": -0.25969111174345016,
"rewards/format_reward": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.14,
"grad_norm": 2.247893810272217,
"kl": 0.0149078369140625,
"learning_rate": 7e-07,
"loss": 0.0006,
"reward": -0.5820326581597328,
"reward_std": 0.3510446697473526,
"rewards/cosine_scaled_reward": -0.2910163216292858,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.144,
"grad_norm": 2.9316084384918213,
"kl": 0.022552490234375,
"learning_rate": 7.2e-07,
"loss": 0.0009,
"reward": -0.5632490888237953,
"reward_std": 0.3500733822584152,
"rewards/cosine_scaled_reward": -0.28162455186247826,
"rewards/format_reward": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.148,
"grad_norm": 3.5201869010925293,
"kl": 0.02850341796875,
"learning_rate": 7.4e-07,
"loss": 0.0011,
"reward": -0.5141241475939751,
"reward_std": 0.3309687077999115,
"rewards/cosine_scaled_reward": -0.25706208124756813,
"rewards/format_reward": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.152,
"grad_norm": 2.7246434688568115,
"kl": 0.0296630859375,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0012,
"reward": -0.5139049887657166,
"reward_std": 0.33319953083992004,
"rewards/cosine_scaled_reward": -0.25695250555872917,
"rewards/format_reward": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.156,
"grad_norm": 2.880594491958618,
"kl": 0.0258636474609375,
"learning_rate": 7.799999999999999e-07,
"loss": 0.001,
"reward": -0.5646104216575623,
"reward_std": 0.3474426791071892,
"rewards/cosine_scaled_reward": -0.2823052257299423,
"rewards/format_reward": 0.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.16,
"grad_norm": 2.6734988689422607,
"kl": 0.0321044921875,
"learning_rate": 8e-07,
"loss": 0.0013,
"reward": -0.5586390048265457,
"reward_std": 0.3474784344434738,
"rewards/cosine_scaled_reward": -0.27931951731443405,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.164,
"grad_norm": 3.1370785236358643,
"kl": 0.03369140625,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0013,
"reward": -0.5609789937734604,
"reward_std": 0.3450735807418823,
"rewards/cosine_scaled_reward": -0.280489519238472,
"rewards/format_reward": 0.0,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.168,
"grad_norm": 2.5502073764801025,
"kl": 0.06072998046875,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0024,
"reward": -0.5195748135447502,
"reward_std": 0.34474433213472366,
"rewards/cosine_scaled_reward": -0.2597874030470848,
"rewards/format_reward": 0.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.172,
"grad_norm": 2.1381213665008545,
"kl": 0.067474365234375,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0027,
"reward": -0.5580533072352409,
"reward_std": 0.32987529784440994,
"rewards/cosine_scaled_reward": -0.27902666106820107,
"rewards/format_reward": 0.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.176,
"grad_norm": 2.1730432510375977,
"kl": 0.0958251953125,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0038,
"reward": -0.5585729256272316,
"reward_std": 0.3295438587665558,
"rewards/cosine_scaled_reward": -0.2792864739894867,
"rewards/format_reward": 0.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.18,
"grad_norm": 1.962768316268921,
"kl": 0.079345703125,
"learning_rate": 9e-07,
"loss": 0.0032,
"reward": -0.5980347394943237,
"reward_std": 0.3284436762332916,
"rewards/cosine_scaled_reward": -0.29901736974716187,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.184,
"grad_norm": 1.8276231288909912,
"kl": 0.1153564453125,
"learning_rate": 9.2e-07,
"loss": 0.0046,
"reward": -0.507519856095314,
"reward_std": 0.33579862862825394,
"rewards/cosine_scaled_reward": -0.2537599205970764,
"rewards/format_reward": 0.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.188,
"grad_norm": 2.608023166656494,
"kl": 0.09033203125,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0036,
"reward": -0.5289521142840385,
"reward_std": 0.31808041036129,
"rewards/cosine_scaled_reward": -0.26447605714201927,
"rewards/format_reward": 0.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.192,
"grad_norm": 1.8956966400146484,
"kl": 0.09814453125,
"learning_rate": 9.6e-07,
"loss": 0.0039,
"reward": -0.566174179315567,
"reward_std": 0.311339795589447,
"rewards/cosine_scaled_reward": -0.2830870673060417,
"rewards/format_reward": 0.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.196,
"grad_norm": 1.7705461978912354,
"kl": 0.1209716796875,
"learning_rate": 9.8e-07,
"loss": 0.0048,
"reward": -0.528024435043335,
"reward_std": 0.36330366879701614,
"rewards/cosine_scaled_reward": -0.26401223987340927,
"rewards/format_reward": 0.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.2,
"grad_norm": 2.1113531589508057,
"kl": 0.1171875,
"learning_rate": 1e-06,
"loss": 0.0047,
"reward": -0.4406622089445591,
"reward_std": 0.3163011893630028,
"rewards/cosine_scaled_reward": -0.2203311063349247,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.204,
"grad_norm": 1.803585410118103,
"kl": 0.1026611328125,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0041,
"reward": -0.5815826654434204,
"reward_std": 0.3248438388109207,
"rewards/cosine_scaled_reward": -0.2907913327217102,
"rewards/format_reward": 0.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.208,
"grad_norm": 1.7076486349105835,
"kl": 0.157470703125,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0063,
"reward": -0.5362438708543777,
"reward_std": 0.2975444979965687,
"rewards/cosine_scaled_reward": -0.26812195032835007,
"rewards/format_reward": 0.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.212,
"grad_norm": 2.478224515914917,
"kl": 0.144287109375,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0058,
"reward": -0.47916819900274277,
"reward_std": 0.35621220618486404,
"rewards/cosine_scaled_reward": -0.23958410695195198,
"rewards/format_reward": 0.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.216,
"grad_norm": 2.006901502609253,
"kl": 0.1337890625,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0053,
"reward": -0.5450761765241623,
"reward_std": 0.32576631009578705,
"rewards/cosine_scaled_reward": -0.27253808826208115,
"rewards/format_reward": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.22,
"grad_norm": 2.2259609699249268,
"kl": 0.11669921875,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0047,
"reward": -0.5271478518843651,
"reward_std": 0.34441374242305756,
"rewards/cosine_scaled_reward": -0.26357391849160194,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.224,
"grad_norm": 2.020939588546753,
"kl": 0.1907958984375,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0076,
"reward": -0.5367654263973236,
"reward_std": 0.3470792919397354,
"rewards/cosine_scaled_reward": -0.2683827131986618,
"rewards/format_reward": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.228,
"grad_norm": 1.9356812238693237,
"kl": 0.158935546875,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0064,
"reward": -0.505635529756546,
"reward_std": 0.3292393088340759,
"rewards/cosine_scaled_reward": -0.252817764878273,
"rewards/format_reward": 0.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.232,
"grad_norm": 3.2483060359954834,
"kl": 0.188720703125,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0075,
"reward": -0.504822663962841,
"reward_std": 0.35463710874319077,
"rewards/cosine_scaled_reward": -0.2524113282561302,
"rewards/format_reward": 0.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.236,
"grad_norm": 2.2256879806518555,
"kl": 0.205322265625,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0082,
"reward": -0.5851711928844452,
"reward_std": 0.3146449252963066,
"rewards/cosine_scaled_reward": -0.2925856038928032,
"rewards/format_reward": 0.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.24,
"grad_norm": 2.093649387359619,
"kl": 0.198486328125,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0079,
"reward": -0.45284587889909744,
"reward_std": 0.34760017693042755,
"rewards/cosine_scaled_reward": -0.22642293944954872,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.244,
"grad_norm": 2.378591537475586,
"kl": 0.24365234375,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0097,
"reward": -0.5091445297002792,
"reward_std": 0.3452131450176239,
"rewards/cosine_scaled_reward": -0.2545722760260105,
"rewards/format_reward": 0.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.248,
"grad_norm": 2.188553810119629,
"kl": 0.29833984375,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0119,
"reward": -0.47440846264362335,
"reward_std": 0.34785814583301544,
"rewards/cosine_scaled_reward": -0.23720423132181168,
"rewards/format_reward": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.252,
"grad_norm": 2.6211366653442383,
"kl": 0.48095703125,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0192,
"reward": -0.46701501309871674,
"reward_std": 0.3275434151291847,
"rewards/cosine_scaled_reward": -0.23350750654935837,
"rewards/format_reward": 0.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.256,
"grad_norm": 3.608039617538452,
"kl": 0.63720703125,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0254,
"reward": -0.4022144228219986,
"reward_std": 0.3280187249183655,
"rewards/cosine_scaled_reward": -0.2011072114109993,
"rewards/format_reward": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.26,
"grad_norm": 2.1589713096618652,
"kl": 0.587890625,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0236,
"reward": -0.4902011975646019,
"reward_std": 0.33829304575920105,
"rewards/cosine_scaled_reward": -0.24510059878230095,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.264,
"grad_norm": 4.391396522521973,
"kl": 0.851806640625,
"learning_rate": 9.971955636222684e-07,
"loss": 0.034,
"reward": -0.5337588116526604,
"reward_std": 0.3271815627813339,
"rewards/cosine_scaled_reward": -0.2668794058263302,
"rewards/format_reward": 0.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.268,
"grad_norm": 4.296882629394531,
"kl": 0.892333984375,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0357,
"reward": -0.45740216970443726,
"reward_std": 0.32497797161340714,
"rewards/cosine_scaled_reward": -0.22870109230279922,
"rewards/format_reward": 0.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.272,
"grad_norm": 7.224793434143066,
"kl": 1.29736328125,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0519,
"reward": -0.5055549815297127,
"reward_std": 0.3318631425499916,
"rewards/cosine_scaled_reward": -0.25277747586369514,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.276,
"grad_norm": 6.747034072875977,
"kl": 1.3232421875,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0531,
"reward": -0.4314222186803818,
"reward_std": 0.31476689875125885,
"rewards/cosine_scaled_reward": -0.21571110002696514,
"rewards/format_reward": 0.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.28,
"grad_norm": 5.5595808029174805,
"kl": 0.8935546875,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0358,
"reward": -0.4758576303720474,
"reward_std": 0.33101003617048264,
"rewards/cosine_scaled_reward": -0.2379288226366043,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.284,
"grad_norm": 2.4482791423797607,
"kl": 0.521484375,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0209,
"reward": -0.4491276890039444,
"reward_std": 0.3567735329270363,
"rewards/cosine_scaled_reward": -0.2245638445019722,
"rewards/format_reward": 0.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.288,
"grad_norm": 3.1987600326538086,
"kl": 0.6240234375,
"learning_rate": 9.947027716509488e-07,
"loss": 0.025,
"reward": -0.43654023110866547,
"reward_std": 0.3590875416994095,
"rewards/cosine_scaled_reward": -0.21827011927962303,
"rewards/format_reward": 0.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.292,
"grad_norm": 4.885537147521973,
"kl": 1.14599609375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0458,
"reward": -0.5265215784311295,
"reward_std": 0.3363535851240158,
"rewards/cosine_scaled_reward": -0.26326077431440353,
"rewards/format_reward": 0.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.296,
"grad_norm": 3.4503629207611084,
"kl": 1.14794921875,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0459,
"reward": -0.4836200848221779,
"reward_std": 0.33076073229312897,
"rewards/cosine_scaled_reward": -0.24181004241108894,
"rewards/format_reward": 0.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.3,
"grad_norm": 3.5954651832580566,
"kl": 0.6767578125,
"learning_rate": 9.931634888554935e-07,
"loss": 0.027,
"reward": -0.5548510551452637,
"reward_std": 0.3006826713681221,
"rewards/cosine_scaled_reward": -0.27742552757263184,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.304,
"grad_norm": 2.27148699760437,
"kl": 0.69970703125,
"learning_rate": 9.926071618660237e-07,
"loss": 0.028,
"reward": -0.5522997975349426,
"reward_std": 0.32217612117528915,
"rewards/cosine_scaled_reward": -0.2761498987674713,
"rewards/format_reward": 0.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.308,
"grad_norm": 2.421114206314087,
"kl": 0.65234375,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0261,
"reward": -0.5491495952010155,
"reward_std": 0.33891358226537704,
"rewards/cosine_scaled_reward": -0.27457480505108833,
"rewards/format_reward": 0.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.312,
"grad_norm": 2.296977996826172,
"kl": 0.4833984375,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0193,
"reward": -0.5332002714276314,
"reward_std": 0.3453890234231949,
"rewards/cosine_scaled_reward": -0.2666001245379448,
"rewards/format_reward": 0.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.316,
"grad_norm": 2.351818084716797,
"kl": 0.5048828125,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0202,
"reward": -0.4974421188235283,
"reward_std": 0.36291657388210297,
"rewards/cosine_scaled_reward": -0.24872105196118355,
"rewards/format_reward": 0.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.32,
"grad_norm": 2.808706521987915,
"kl": 0.53125,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0212,
"reward": -0.5026201903820038,
"reward_std": 0.30610421299934387,
"rewards/cosine_scaled_reward": -0.2513100877404213,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.324,
"grad_norm": 2.077920913696289,
"kl": 0.68994140625,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0276,
"reward": -0.4621705636382103,
"reward_std": 0.33135028183460236,
"rewards/cosine_scaled_reward": -0.23108528181910515,
"rewards/format_reward": 0.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.328,
"grad_norm": 2.951878309249878,
"kl": 0.6015625,
"learning_rate": 9.888172094375033e-07,
"loss": 0.024,
"reward": -0.5148988738656044,
"reward_std": 0.3465086743235588,
"rewards/cosine_scaled_reward": -0.2574494294822216,
"rewards/format_reward": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.332,
"grad_norm": 2.1016077995300293,
"kl": 0.36376953125,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0145,
"reward": -0.48821673542261124,
"reward_std": 0.35235296189785004,
"rewards/cosine_scaled_reward": -0.24410836026072502,
"rewards/format_reward": 0.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.336,
"grad_norm": 2.276076555252075,
"kl": 0.77734375,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0311,
"reward": -0.509700171649456,
"reward_std": 0.3434828519821167,
"rewards/cosine_scaled_reward": -0.2548500932753086,
"rewards/format_reward": 0.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.34,
"grad_norm": 1.9953871965408325,
"kl": 0.45263671875,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0181,
"reward": -0.5046856477856636,
"reward_std": 0.3276178315281868,
"rewards/cosine_scaled_reward": -0.2523428313434124,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.344,
"grad_norm": 5.694060802459717,
"kl": 1.50390625,
"learning_rate": 9.85862422507884e-07,
"loss": 0.06,
"reward": -0.5268296301364899,
"reward_std": 0.3594844192266464,
"rewards/cosine_scaled_reward": -0.26341481506824493,
"rewards/format_reward": 0.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.348,
"grad_norm": 2.5820319652557373,
"kl": 0.79931640625,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0319,
"reward": -0.5030437260866165,
"reward_std": 0.33297523856163025,
"rewards/cosine_scaled_reward": -0.25152185559272766,
"rewards/format_reward": 0.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.352,
"grad_norm": 2.748469829559326,
"kl": 0.8642578125,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0346,
"reward": -0.511917307972908,
"reward_std": 0.3373011276125908,
"rewards/cosine_scaled_reward": -0.255958653986454,
"rewards/format_reward": 0.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.356,
"grad_norm": 2.941894054412842,
"kl": 1.10400390625,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0443,
"reward": -0.49383244663476944,
"reward_std": 0.3190907835960388,
"rewards/cosine_scaled_reward": -0.24691622331738472,
"rewards/format_reward": 0.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.36,
"grad_norm": 2.5008065700531006,
"kl": 0.7451171875,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0298,
"reward": -0.5015105679631233,
"reward_std": 0.3283078894019127,
"rewards/cosine_scaled_reward": -0.25075526908040047,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.364,
"grad_norm": 2.775805950164795,
"kl": 0.8662109375,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0347,
"reward": -0.49317121505737305,
"reward_std": 0.3281624838709831,
"rewards/cosine_scaled_reward": -0.24658560752868652,
"rewards/format_reward": 0.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.368,
"grad_norm": 4.057337284088135,
"kl": 1.3115234375,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0525,
"reward": -0.4923912510275841,
"reward_std": 0.334882490336895,
"rewards/cosine_scaled_reward": -0.24619561806321144,
"rewards/format_reward": 0.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.372,
"grad_norm": 3.3191726207733154,
"kl": 1.416015625,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0567,
"reward": -0.4856347441673279,
"reward_std": 0.3141849860548973,
"rewards/cosine_scaled_reward": -0.24281736463308334,
"rewards/format_reward": 0.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.376,
"grad_norm": 38.36699676513672,
"kl": 3.833984375,
"learning_rate": 9.78935800506826e-07,
"loss": 0.1535,
"reward": -0.5001253262162209,
"reward_std": 0.34716712683439255,
"rewards/cosine_scaled_reward": -0.25006265565752983,
"rewards/format_reward": 0.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.38,
"grad_norm": 2.851670742034912,
"kl": 0.93017578125,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0372,
"reward": -0.4462156817317009,
"reward_std": 0.3170738257467747,
"rewards/cosine_scaled_reward": -0.22310783341526985,
"rewards/format_reward": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.384,
"grad_norm": 1.903143048286438,
"kl": 0.662109375,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0265,
"reward": -0.44278524816036224,
"reward_std": 0.340934194624424,
"rewards/cosine_scaled_reward": -0.22139262408018112,
"rewards/format_reward": 0.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.388,
"grad_norm": 2.613619089126587,
"kl": 1.0009765625,
"learning_rate": 9.759921670520634e-07,
"loss": 0.04,
"reward": -0.4385986104607582,
"reward_std": 0.3297598212957382,
"rewards/cosine_scaled_reward": -0.2192993052303791,
"rewards/format_reward": 0.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.392,
"grad_norm": 2.1393027305603027,
"kl": 0.84912109375,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0339,
"reward": -0.4335070326924324,
"reward_std": 0.3084552064538002,
"rewards/cosine_scaled_reward": -0.2167535126209259,
"rewards/format_reward": 0.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.396,
"grad_norm": 10.226459503173828,
"kl": 1.9765625,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0791,
"reward": -0.5120433643460274,
"reward_std": 0.3308994993567467,
"rewards/cosine_scaled_reward": -0.2560216821730137,
"rewards/format_reward": 0.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.4,
"grad_norm": 2.7042365074157715,
"kl": 1.140625,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0456,
"reward": -0.5387645438313484,
"reward_std": 0.32419781386852264,
"rewards/cosine_scaled_reward": -0.2693822719156742,
"rewards/format_reward": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.404,
"grad_norm": 3.3440866470336914,
"kl": 1.158203125,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0464,
"reward": -0.479642316699028,
"reward_std": 0.3374394252896309,
"rewards/cosine_scaled_reward": -0.2398211695253849,
"rewards/format_reward": 0.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.408,
"grad_norm": 2.1483707427978516,
"kl": 0.55859375,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0224,
"reward": -0.4488200396299362,
"reward_std": 0.3361233174800873,
"rewards/cosine_scaled_reward": -0.2244100198149681,
"rewards/format_reward": 0.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.412,
"grad_norm": 4.173567771911621,
"kl": 1.900390625,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0759,
"reward": -0.4979688450694084,
"reward_std": 0.35078077018260956,
"rewards/cosine_scaled_reward": -0.2489844374358654,
"rewards/format_reward": 0.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.416,
"grad_norm": 5.119884490966797,
"kl": 1.611328125,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0644,
"reward": -0.513933926820755,
"reward_std": 0.3170707896351814,
"rewards/cosine_scaled_reward": -0.2569669596850872,
"rewards/format_reward": 0.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.42,
"grad_norm": 2.8145992755889893,
"kl": 1.466796875,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0587,
"reward": -0.47269363701343536,
"reward_std": 0.31501560658216476,
"rewards/cosine_scaled_reward": -0.23634683340787888,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.424,
"grad_norm": 2.3274426460266113,
"kl": 0.59033203125,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0236,
"reward": -0.44968922436237335,
"reward_std": 0.3498781695961952,
"rewards/cosine_scaled_reward": -0.22484461963176727,
"rewards/format_reward": 0.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.428,
"grad_norm": 2.2112016677856445,
"kl": 1.126953125,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0451,
"reward": -0.5002073347568512,
"reward_std": 0.34406865388154984,
"rewards/cosine_scaled_reward": -0.2501036673784256,
"rewards/format_reward": 0.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.432,
"grad_norm": 2.4664499759674072,
"kl": 1.0986328125,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0439,
"reward": -0.49009862542152405,
"reward_std": 0.3558028042316437,
"rewards/cosine_scaled_reward": -0.24504930526018143,
"rewards/format_reward": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.436,
"grad_norm": 2.3740482330322266,
"kl": 0.67578125,
"learning_rate": 9.623632283030077e-07,
"loss": 0.027,
"reward": -0.4631711468100548,
"reward_std": 0.34275270998477936,
"rewards/cosine_scaled_reward": -0.2315855734050274,
"rewards/format_reward": 0.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.44,
"grad_norm": 2.9116501808166504,
"kl": 1.1826171875,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0473,
"reward": -0.444116935133934,
"reward_std": 0.37212707847356796,
"rewards/cosine_scaled_reward": -0.2220584638416767,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.444,
"grad_norm": 2.24743390083313,
"kl": 0.638671875,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0255,
"reward": -0.46286992728710175,
"reward_std": 0.3208693787455559,
"rewards/cosine_scaled_reward": -0.23143497854471207,
"rewards/format_reward": 0.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.448,
"grad_norm": 3.138840913772583,
"kl": 1.14404296875,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0458,
"reward": -0.4803452715277672,
"reward_std": 0.3449332043528557,
"rewards/cosine_scaled_reward": -0.2401726357638836,
"rewards/format_reward": 0.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.452,
"grad_norm": 2.7688963413238525,
"kl": 0.9462890625,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0379,
"reward": -0.4440384730696678,
"reward_std": 0.3389856517314911,
"rewards/cosine_scaled_reward": -0.2220192365348339,
"rewards/format_reward": 0.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.456,
"grad_norm": 2.7298948764801025,
"kl": 1.3583984375,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0544,
"reward": -0.40611616894602776,
"reward_std": 0.3120696693658829,
"rewards/cosine_scaled_reward": -0.20305808261036873,
"rewards/format_reward": 0.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.46,
"grad_norm": 2.628330945968628,
"kl": 0.84521484375,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0338,
"reward": -0.41812988370656967,
"reward_std": 0.33337801694869995,
"rewards/cosine_scaled_reward": -0.20906493440270424,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.464,
"grad_norm": 2.21708607673645,
"kl": 1.125,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0451,
"reward": -0.4452592432498932,
"reward_std": 0.34758392721414566,
"rewards/cosine_scaled_reward": -0.2226296216249466,
"rewards/format_reward": 0.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.468,
"grad_norm": 3.4151782989501953,
"kl": 1.5390625,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0617,
"reward": -0.5043663010001183,
"reward_std": 0.3056981936097145,
"rewards/cosine_scaled_reward": -0.25218314677476883,
"rewards/format_reward": 0.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.472,
"grad_norm": 2.8809969425201416,
"kl": 1.498046875,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0599,
"reward": -0.44362927228212357,
"reward_std": 0.32765333354473114,
"rewards/cosine_scaled_reward": -0.22181464359164238,
"rewards/format_reward": 0.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.476,
"grad_norm": 3.092552661895752,
"kl": 1.6640625,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0665,
"reward": -0.49818655103445053,
"reward_std": 0.3495415672659874,
"rewards/cosine_scaled_reward": -0.24909326806664467,
"rewards/format_reward": 0.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.48,
"grad_norm": 3.2943530082702637,
"kl": 2.07421875,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0829,
"reward": -0.4802135229110718,
"reward_std": 0.3453461080789566,
"rewards/cosine_scaled_reward": -0.24010677635669708,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.484,
"grad_norm": 2.5681769847869873,
"kl": 1.505859375,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0603,
"reward": -0.5175792872905731,
"reward_std": 0.35768260806798935,
"rewards/cosine_scaled_reward": -0.25878964737057686,
"rewards/format_reward": 0.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.488,
"grad_norm": 2.9190571308135986,
"kl": 1.57373046875,
"learning_rate": 9.443380060197385e-07,
"loss": 0.063,
"reward": -0.46548449248075485,
"reward_std": 0.35348332673311234,
"rewards/cosine_scaled_reward": -0.23274223506450653,
"rewards/format_reward": 0.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.492,
"grad_norm": 2.435157537460327,
"kl": 1.0654296875,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0427,
"reward": -0.4281177818775177,
"reward_std": 0.3503784313797951,
"rewards/cosine_scaled_reward": -0.21405889093875885,
"rewards/format_reward": 0.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.496,
"grad_norm": 3.1375350952148438,
"kl": 1.5625,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0624,
"reward": -0.4667646959424019,
"reward_std": 0.3501163199543953,
"rewards/cosine_scaled_reward": -0.23338234052062035,
"rewards/format_reward": 0.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.5,
"grad_norm": 2.1935606002807617,
"kl": 1.3427734375,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0537,
"reward": -0.4283955693244934,
"reward_std": 0.34814615547657013,
"rewards/cosine_scaled_reward": -0.2141977809369564,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.504,
"grad_norm": 2.727754592895508,
"kl": 1.35546875,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0543,
"reward": -0.4584430381655693,
"reward_std": 0.3318573832511902,
"rewards/cosine_scaled_reward": -0.22922151535749435,
"rewards/format_reward": 0.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.508,
"grad_norm": 2.9863674640655518,
"kl": 1.509765625,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0603,
"reward": -0.4794049710035324,
"reward_std": 0.3224741891026497,
"rewards/cosine_scaled_reward": -0.2397024855017662,
"rewards/format_reward": 0.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.512,
"grad_norm": 3.0583863258361816,
"kl": 1.5751953125,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0631,
"reward": -0.3896471783518791,
"reward_std": 0.32155635207891464,
"rewards/cosine_scaled_reward": -0.19482359662652016,
"rewards/format_reward": 0.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.516,
"grad_norm": 11.888484001159668,
"kl": 2.1806640625,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0874,
"reward": -0.46486661583185196,
"reward_std": 0.34625906497240067,
"rewards/cosine_scaled_reward": -0.23243330791592598,
"rewards/format_reward": 0.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.52,
"grad_norm": 3.14744234085083,
"kl": 1.1103515625,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0445,
"reward": -0.4691261351108551,
"reward_std": 0.3357261121273041,
"rewards/cosine_scaled_reward": -0.23456306010484695,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.524,
"grad_norm": 2.6933717727661133,
"kl": 1.76171875,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0705,
"reward": -0.5458347946405411,
"reward_std": 0.3296028599143028,
"rewards/cosine_scaled_reward": -0.27291740477085114,
"rewards/format_reward": 0.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.528,
"grad_norm": 2.695984363555908,
"kl": 1.2666015625,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0506,
"reward": -0.43337278813123703,
"reward_std": 0.3223467916250229,
"rewards/cosine_scaled_reward": -0.2166864052414894,
"rewards/format_reward": 0.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.532,
"grad_norm": 2.1844236850738525,
"kl": 1.072265625,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0429,
"reward": -0.47815513610839844,
"reward_std": 0.33408980816602707,
"rewards/cosine_scaled_reward": -0.23907756060361862,
"rewards/format_reward": 0.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.536,
"grad_norm": 2.6240434646606445,
"kl": 0.998046875,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0399,
"reward": -0.3596036769449711,
"reward_std": 0.3202332779765129,
"rewards/cosine_scaled_reward": -0.17980184871703386,
"rewards/format_reward": 0.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.54,
"grad_norm": 2.413489580154419,
"kl": 1.515625,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0607,
"reward": -0.3980662524700165,
"reward_std": 0.3146558068692684,
"rewards/cosine_scaled_reward": -0.19903312623500824,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.544,
"grad_norm": 2.5466983318328857,
"kl": 1.421875,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0568,
"reward": -0.4567502960562706,
"reward_std": 0.36093486845493317,
"rewards/cosine_scaled_reward": -0.2283751629292965,
"rewards/format_reward": 0.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.548,
"grad_norm": 2.670454263687134,
"kl": 1.63671875,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0655,
"reward": -0.48265285044908524,
"reward_std": 0.33601198345422745,
"rewards/cosine_scaled_reward": -0.24132642522454262,
"rewards/format_reward": 0.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.552,
"grad_norm": 3.4489877223968506,
"kl": 1.39453125,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0558,
"reward": -0.40766458958387375,
"reward_std": 0.34357643127441406,
"rewards/cosine_scaled_reward": -0.20383229106664658,
"rewards/format_reward": 0.0,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.556,
"grad_norm": 2.18890118598938,
"kl": 1.30859375,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0523,
"reward": -0.4143947809934616,
"reward_std": 0.323918879032135,
"rewards/cosine_scaled_reward": -0.2071974016726017,
"rewards/format_reward": 0.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.56,
"grad_norm": 2.5627028942108154,
"kl": 1.34423828125,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0538,
"reward": -0.4485241174697876,
"reward_std": 0.3278198316693306,
"rewards/cosine_scaled_reward": -0.2242620587348938,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.564,
"grad_norm": 2.086371660232544,
"kl": 1.2802734375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0512,
"reward": -0.32855524495244026,
"reward_std": 0.33061159402132034,
"rewards/cosine_scaled_reward": -0.16427762433886528,
"rewards/format_reward": 0.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.568,
"grad_norm": 2.45231556892395,
"kl": 1.580078125,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0632,
"reward": -0.4703398421406746,
"reward_std": 0.2972045987844467,
"rewards/cosine_scaled_reward": -0.2351699210703373,
"rewards/format_reward": 0.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.572,
"grad_norm": 2.864070415496826,
"kl": 1.984375,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0794,
"reward": -0.41980744898319244,
"reward_std": 0.34404993802309036,
"rewards/cosine_scaled_reward": -0.20990372076630592,
"rewards/format_reward": 0.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.576,
"grad_norm": 2.412257194519043,
"kl": 1.544921875,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0618,
"reward": -0.43455804139375687,
"reward_std": 0.32647445797920227,
"rewards/cosine_scaled_reward": -0.21727901697158813,
"rewards/format_reward": 0.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.58,
"grad_norm": 2.952892780303955,
"kl": 2.0595703125,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0824,
"reward": -0.4728480279445648,
"reward_std": 0.33887017518281937,
"rewards/cosine_scaled_reward": -0.2364240102469921,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.584,
"grad_norm": 2.3727328777313232,
"kl": 1.7255859375,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0692,
"reward": -0.42372531443834305,
"reward_std": 0.3417205289006233,
"rewards/cosine_scaled_reward": -0.21186266466975212,
"rewards/format_reward": 0.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.588,
"grad_norm": 2.953756809234619,
"kl": 2.353515625,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0943,
"reward": -0.43578075617551804,
"reward_std": 0.34062809497117996,
"rewards/cosine_scaled_reward": -0.21789037808775902,
"rewards/format_reward": 0.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.592,
"grad_norm": 2.5953478813171387,
"kl": 1.38671875,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0555,
"reward": -0.4190576896071434,
"reward_std": 0.34895560145378113,
"rewards/cosine_scaled_reward": -0.2095288448035717,
"rewards/format_reward": 0.0,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.596,
"grad_norm": 2.4279496669769287,
"kl": 1.62890625,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0651,
"reward": -0.4394699037075043,
"reward_std": 0.3207908198237419,
"rewards/cosine_scaled_reward": -0.21973494067788124,
"rewards/format_reward": 0.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.6,
"grad_norm": 2.974292516708374,
"kl": 1.892578125,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0757,
"reward": -0.4797021597623825,
"reward_std": 0.32065775990486145,
"rewards/cosine_scaled_reward": -0.23985107988119125,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.604,
"grad_norm": 2.51299786567688,
"kl": 0.87890625,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0351,
"reward": -0.4108778163790703,
"reward_std": 0.326105996966362,
"rewards/cosine_scaled_reward": -0.20543890818953514,
"rewards/format_reward": 0.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.608,
"grad_norm": 2.723388195037842,
"kl": 1.2294921875,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0492,
"reward": -0.4178111329674721,
"reward_std": 0.32895463705062866,
"rewards/cosine_scaled_reward": -0.20890555530786514,
"rewards/format_reward": 0.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.612,
"grad_norm": 2.4097025394439697,
"kl": 1.650390625,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0659,
"reward": -0.4825671687722206,
"reward_std": 0.33990373462438583,
"rewards/cosine_scaled_reward": -0.2412835843861103,
"rewards/format_reward": 0.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.616,
"grad_norm": 2.114370107650757,
"kl": 1.390625,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0556,
"reward": -0.42671380192041397,
"reward_std": 0.32950445264577866,
"rewards/cosine_scaled_reward": -0.21335690841078758,
"rewards/format_reward": 0.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.62,
"grad_norm": 3.1770823001861572,
"kl": 1.4287109375,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0572,
"reward": -0.4250905141234398,
"reward_std": 0.3110942989587784,
"rewards/cosine_scaled_reward": -0.2125452570617199,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.624,
"grad_norm": 2.6063926219940186,
"kl": 1.796875,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0719,
"reward": -0.4206129387021065,
"reward_std": 0.33140094578266144,
"rewards/cosine_scaled_reward": -0.21030646935105324,
"rewards/format_reward": 0.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.628,
"grad_norm": 2.482637643814087,
"kl": 1.525390625,
"learning_rate": 8.801784390262943e-07,
"loss": 0.061,
"reward": -0.36781868524849415,
"reward_std": 0.3281563073396683,
"rewards/cosine_scaled_reward": -0.18390934821218252,
"rewards/format_reward": 0.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.632,
"grad_norm": 2.7100956439971924,
"kl": 1.7861328125,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0715,
"reward": -0.3854188397526741,
"reward_std": 0.31897617131471634,
"rewards/cosine_scaled_reward": -0.19270941987633705,
"rewards/format_reward": 0.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.636,
"grad_norm": 2.3493990898132324,
"kl": 1.859375,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0746,
"reward": -0.41636481136083603,
"reward_std": 0.3308830112218857,
"rewards/cosine_scaled_reward": -0.20818240568041801,
"rewards/format_reward": 0.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.64,
"grad_norm": 2.429762840270996,
"kl": 1.78125,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0714,
"reward": -0.44961177557706833,
"reward_std": 0.3425107002258301,
"rewards/cosine_scaled_reward": -0.22480589523911476,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.644,
"grad_norm": 2.6372933387756348,
"kl": 1.6474609375,
"learning_rate": 8.715127058347614e-07,
"loss": 0.066,
"reward": -0.4204000309109688,
"reward_std": 0.3256704956293106,
"rewards/cosine_scaled_reward": -0.2102000191807747,
"rewards/format_reward": 0.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.648,
"grad_norm": 2.2505483627319336,
"kl": 1.576171875,
"learning_rate": 8.693068314414344e-07,
"loss": 0.063,
"reward": -0.4363863915205002,
"reward_std": 0.3367513567209244,
"rewards/cosine_scaled_reward": -0.2181931994855404,
"rewards/format_reward": 0.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.652,
"grad_norm": 2.781273603439331,
"kl": 1.4375,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0576,
"reward": -0.44805190712213516,
"reward_std": 0.3117773234844208,
"rewards/cosine_scaled_reward": -0.22402595356106758,
"rewards/format_reward": 0.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.656,
"grad_norm": 2.573030710220337,
"kl": 1.21435546875,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0487,
"reward": -0.40324684232473373,
"reward_std": 0.3176472932100296,
"rewards/cosine_scaled_reward": -0.20162343233823776,
"rewards/format_reward": 0.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.66,
"grad_norm": 4.171741485595703,
"kl": 2.3125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0925,
"reward": -0.4968671426177025,
"reward_std": 0.3204089626669884,
"rewards/cosine_scaled_reward": -0.24843357503414154,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1528.702392578125,
"epoch": 0.664,
"grad_norm": 2.1756961345672607,
"kl": 1.7578125,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0706,
"reward": -0.4272613450884819,
"reward_std": 0.32390115410089493,
"rewards/cosine_scaled_reward": -0.21363067999482155,
"rewards/format_reward": 0.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.668,
"grad_norm": 2.2742207050323486,
"kl": 1.912109375,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0763,
"reward": -0.3418873958289623,
"reward_std": 0.29924022778868675,
"rewards/cosine_scaled_reward": -0.17094369884580374,
"rewards/format_reward": 0.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.672,
"grad_norm": 2.1837146282196045,
"kl": 1.3330078125,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0533,
"reward": -0.4050525277853012,
"reward_std": 0.3251590058207512,
"rewards/cosine_scaled_reward": -0.2025262601673603,
"rewards/format_reward": 0.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.676,
"grad_norm": 2.1009020805358887,
"kl": 1.9326171875,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0774,
"reward": -0.4387947544455528,
"reward_std": 0.3307826817035675,
"rewards/cosine_scaled_reward": -0.21939736977219582,
"rewards/format_reward": 0.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.68,
"grad_norm": 2.515617609024048,
"kl": 1.884765625,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0754,
"reward": -0.41566915810108185,
"reward_std": 0.34893494844436646,
"rewards/cosine_scaled_reward": -0.20783457532525063,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.684,
"grad_norm": 2.3045356273651123,
"kl": 1.5078125,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0604,
"reward": -0.3871946483850479,
"reward_std": 0.3363000229001045,
"rewards/cosine_scaled_reward": -0.19359732419252396,
"rewards/format_reward": 0.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.688,
"grad_norm": 2.1517364978790283,
"kl": 1.4169921875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0567,
"reward": -0.41495678573846817,
"reward_std": 0.33959241211414337,
"rewards/cosine_scaled_reward": -0.20747840031981468,
"rewards/format_reward": 0.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.692,
"grad_norm": 2.4767415523529053,
"kl": 1.5654296875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0626,
"reward": -0.3259017579257488,
"reward_std": 0.3448467329144478,
"rewards/cosine_scaled_reward": -0.16295087756589055,
"rewards/format_reward": 0.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.696,
"grad_norm": 2.1803934574127197,
"kl": 1.5986328125,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0639,
"reward": -0.45371130108833313,
"reward_std": 0.3770594820380211,
"rewards/cosine_scaled_reward": -0.22685565054416656,
"rewards/format_reward": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.7,
"grad_norm": 2.146838426589966,
"kl": 1.3212890625,
"learning_rate": 8.392544243589427e-07,
"loss": 0.053,
"reward": -0.39382801204919815,
"reward_std": 0.3155653551220894,
"rewards/cosine_scaled_reward": -0.19691400602459908,
"rewards/format_reward": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.704,
"grad_norm": 2.3939132690429688,
"kl": 1.498046875,
"learning_rate": 8.368407953869103e-07,
"loss": 0.06,
"reward": -0.397233285009861,
"reward_std": 0.3429732918739319,
"rewards/cosine_scaled_reward": -0.1986166313290596,
"rewards/format_reward": 0.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.708,
"grad_norm": 2.2279624938964844,
"kl": 1.3759765625,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0551,
"reward": -0.41151023656129837,
"reward_std": 0.3277590796351433,
"rewards/cosine_scaled_reward": -0.2057551108300686,
"rewards/format_reward": 0.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.712,
"grad_norm": 2.5055384635925293,
"kl": 1.341796875,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0537,
"reward": -0.4148360714316368,
"reward_std": 0.3054031655192375,
"rewards/cosine_scaled_reward": -0.2074180319905281,
"rewards/format_reward": 0.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.716,
"grad_norm": 2.605672836303711,
"kl": 2.421875,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0969,
"reward": -0.49764253944158554,
"reward_std": 0.34468474239110947,
"rewards/cosine_scaled_reward": -0.24882125481963158,
"rewards/format_reward": 0.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.72,
"grad_norm": 1.8612443208694458,
"kl": 1.958984375,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0784,
"reward": -0.41104499250650406,
"reward_std": 0.32857123762369156,
"rewards/cosine_scaled_reward": -0.20552249625325203,
"rewards/format_reward": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.724,
"grad_norm": 2.20760178565979,
"kl": 1.4267578125,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0571,
"reward": -0.4070161208510399,
"reward_std": 0.29896606504917145,
"rewards/cosine_scaled_reward": -0.20350806042551994,
"rewards/format_reward": 0.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.728,
"grad_norm": 2.527832269668579,
"kl": 1.3251953125,
"learning_rate": 8.220696016880687e-07,
"loss": 0.053,
"reward": -0.40310006588697433,
"reward_std": 0.33485615253448486,
"rewards/cosine_scaled_reward": -0.20155002549290657,
"rewards/format_reward": 0.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.732,
"grad_norm": 2.0901362895965576,
"kl": 1.25,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0499,
"reward": -0.39147457480430603,
"reward_std": 0.3105906918644905,
"rewards/cosine_scaled_reward": -0.19573728740215302,
"rewards/format_reward": 0.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.736,
"grad_norm": 2.0712454319000244,
"kl": 1.3271484375,
"learning_rate": 8.170384989716657e-07,
"loss": 0.053,
"reward": -0.36338385939598083,
"reward_std": 0.29373297840356827,
"rewards/cosine_scaled_reward": -0.18169192969799042,
"rewards/format_reward": 0.0,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.74,
"grad_norm": 4.567477226257324,
"kl": 2.91015625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.1167,
"reward": -0.46033478528261185,
"reward_std": 0.309500552713871,
"rewards/cosine_scaled_reward": -0.23016740009188652,
"rewards/format_reward": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.744,
"grad_norm": 2.8025710582733154,
"kl": 1.982421875,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0793,
"reward": -0.3399934060871601,
"reward_std": 0.3289627507328987,
"rewards/cosine_scaled_reward": -0.16999670304358006,
"rewards/format_reward": 0.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.748,
"grad_norm": 2.41241192817688,
"kl": 1.6513671875,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0663,
"reward": -0.4002522900700569,
"reward_std": 0.3234091103076935,
"rewards/cosine_scaled_reward": -0.20012613758444786,
"rewards/format_reward": 0.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.752,
"grad_norm": 3.6371164321899414,
"kl": 2.470703125,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0988,
"reward": -0.44175921380519867,
"reward_std": 0.33701298385858536,
"rewards/cosine_scaled_reward": -0.22087960690259933,
"rewards/format_reward": 0.0,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.756,
"grad_norm": 2.704362154006958,
"kl": 1.71875,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0686,
"reward": -0.3934633806347847,
"reward_std": 0.31845808029174805,
"rewards/cosine_scaled_reward": -0.19673169776797295,
"rewards/format_reward": 0.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.76,
"grad_norm": 2.5518999099731445,
"kl": 1.865234375,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0746,
"reward": -0.48456476628780365,
"reward_std": 0.3398968055844307,
"rewards/cosine_scaled_reward": -0.24228239431977272,
"rewards/format_reward": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.764,
"grad_norm": 4.733001232147217,
"kl": 2.0537109375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0822,
"reward": -0.44671063870191574,
"reward_std": 0.32652025669813156,
"rewards/cosine_scaled_reward": -0.22335530444979668,
"rewards/format_reward": 0.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.768,
"grad_norm": 2.217525005340576,
"kl": 1.72265625,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0689,
"reward": -0.38292936980724335,
"reward_std": 0.3729139119386673,
"rewards/cosine_scaled_reward": -0.19146469235420227,
"rewards/format_reward": 0.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.772,
"grad_norm": 2.3045313358306885,
"kl": 1.0576171875,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0423,
"reward": -0.36335285753011703,
"reward_std": 0.3274284452199936,
"rewards/cosine_scaled_reward": -0.18167642876505852,
"rewards/format_reward": 0.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.776,
"grad_norm": 2.220212936401367,
"kl": 1.974609375,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0791,
"reward": -0.41132358461618423,
"reward_std": 0.33213579654693604,
"rewards/cosine_scaled_reward": -0.20566179975867271,
"rewards/format_reward": 0.0,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.78,
"grad_norm": 2.872774124145508,
"kl": 2.04296875,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0819,
"reward": -0.41410720348358154,
"reward_std": 0.3132774606347084,
"rewards/cosine_scaled_reward": -0.20705359801650047,
"rewards/format_reward": 0.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.784,
"grad_norm": 3.354735851287842,
"kl": 1.2236328125,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0489,
"reward": -0.34651997685432434,
"reward_std": 0.27611755579710007,
"rewards/cosine_scaled_reward": -0.17325998842716217,
"rewards/format_reward": 0.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.788,
"grad_norm": 2.019547939300537,
"kl": 1.03515625,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0414,
"reward": -0.36961859464645386,
"reward_std": 0.3042915388941765,
"rewards/cosine_scaled_reward": -0.18480929359793663,
"rewards/format_reward": 0.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.792,
"grad_norm": 2.245211601257324,
"kl": 1.408203125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0564,
"reward": -0.3812807723879814,
"reward_std": 0.30970512330532074,
"rewards/cosine_scaled_reward": -0.190640389919281,
"rewards/format_reward": 0.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.796,
"grad_norm": 2.0456931591033936,
"kl": 1.673828125,
"learning_rate": 7.777151938545235e-07,
"loss": 0.067,
"reward": -0.38433101773262024,
"reward_std": 0.3408072590827942,
"rewards/cosine_scaled_reward": -0.19216550886631012,
"rewards/format_reward": 0.0,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.8,
"grad_norm": 6.253657817840576,
"kl": 1.48876953125,
"learning_rate": 7.75e-07,
"loss": 0.0595,
"reward": -0.3863793611526489,
"reward_std": 0.3155966252088547,
"rewards/cosine_scaled_reward": -0.19318969175219536,
"rewards/format_reward": 0.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.804,
"grad_norm": 2.2331368923187256,
"kl": 1.96484375,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0786,
"reward": -0.41171175986528397,
"reward_std": 0.34651194512844086,
"rewards/cosine_scaled_reward": -0.20585588365793228,
"rewards/format_reward": 0.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.808,
"grad_norm": 2.1702663898468018,
"kl": 1.296875,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0519,
"reward": -0.38244833052158356,
"reward_std": 0.34267907589673996,
"rewards/cosine_scaled_reward": -0.19122417271137238,
"rewards/format_reward": 0.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.812,
"grad_norm": 2.0549793243408203,
"kl": 1.4345703125,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0573,
"reward": -0.4125688225030899,
"reward_std": 0.33167801052331924,
"rewards/cosine_scaled_reward": -0.20628441870212555,
"rewards/format_reward": 0.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.816,
"grad_norm": 2.7793009281158447,
"kl": 1.958984375,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0784,
"reward": -0.45417842268943787,
"reward_std": 0.3453121930360794,
"rewards/cosine_scaled_reward": -0.22708921134471893,
"rewards/format_reward": 0.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.82,
"grad_norm": 8.324098587036133,
"kl": 2.23388671875,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0895,
"reward": -0.3973395526409149,
"reward_std": 0.32590440660715103,
"rewards/cosine_scaled_reward": -0.19866977632045746,
"rewards/format_reward": 0.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.824,
"grad_norm": 2.22940993309021,
"kl": 1.51171875,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0605,
"reward": -0.4044779762625694,
"reward_std": 0.33285098522901535,
"rewards/cosine_scaled_reward": -0.2022389993071556,
"rewards/format_reward": 0.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.828,
"grad_norm": 2.824735164642334,
"kl": 1.310546875,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0524,
"reward": -0.4486440494656563,
"reward_std": 0.33797865360975266,
"rewards/cosine_scaled_reward": -0.22432202845811844,
"rewards/format_reward": 0.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.832,
"grad_norm": 2.2558631896972656,
"kl": 1.1962890625,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0478,
"reward": -0.40251782536506653,
"reward_std": 0.30128662288188934,
"rewards/cosine_scaled_reward": -0.20125891268253326,
"rewards/format_reward": 0.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.836,
"grad_norm": 2.7602171897888184,
"kl": 0.9951171875,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0398,
"reward": -0.31514767929911613,
"reward_std": 0.3020384646952152,
"rewards/cosine_scaled_reward": -0.15757383964955807,
"rewards/format_reward": 0.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.84,
"grad_norm": 2.6217448711395264,
"kl": 1.71484375,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0684,
"reward": -0.3670196682214737,
"reward_std": 0.31881674379110336,
"rewards/cosine_scaled_reward": -0.18350983038544655,
"rewards/format_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.844,
"grad_norm": 2.0915112495422363,
"kl": 1.2841796875,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0514,
"reward": -0.4177168160676956,
"reward_std": 0.3398260995745659,
"rewards/cosine_scaled_reward": -0.2088584043085575,
"rewards/format_reward": 0.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.848,
"grad_norm": 1.7296172380447388,
"kl": 1.2724609375,
"learning_rate": 7.416006812042827e-07,
"loss": 0.051,
"reward": -0.41255099326372147,
"reward_std": 0.33872970938682556,
"rewards/cosine_scaled_reward": -0.20627548918128014,
"rewards/format_reward": 0.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.852,
"grad_norm": 2.1323206424713135,
"kl": 1.16162109375,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0466,
"reward": -0.2759926188737154,
"reward_std": 0.30077088996768,
"rewards/cosine_scaled_reward": -0.1379963019862771,
"rewards/format_reward": 0.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.856,
"grad_norm": 2.3771109580993652,
"kl": 1.556640625,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0622,
"reward": -0.3614875078201294,
"reward_std": 0.32025381922721863,
"rewards/cosine_scaled_reward": -0.1807437539100647,
"rewards/format_reward": 0.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.86,
"grad_norm": 2.940969467163086,
"kl": 1.8828125,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0754,
"reward": -0.29097072361037135,
"reward_std": 0.28063248097896576,
"rewards/cosine_scaled_reward": -0.14548537082737312,
"rewards/format_reward": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.864,
"grad_norm": 1.9293019771575928,
"kl": 1.62890625,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0652,
"reward": -0.4154031127691269,
"reward_std": 0.34460632503032684,
"rewards/cosine_scaled_reward": -0.20770153775811195,
"rewards/format_reward": 0.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.868,
"grad_norm": 2.745267391204834,
"kl": 2.0888671875,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0835,
"reward": -0.4031589925289154,
"reward_std": 0.31946661323308945,
"rewards/cosine_scaled_reward": -0.2015794888138771,
"rewards/format_reward": 0.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.872,
"grad_norm": 2.873622179031372,
"kl": 1.5078125,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0604,
"reward": -0.4128880575299263,
"reward_std": 0.3311196342110634,
"rewards/cosine_scaled_reward": -0.20644402503967285,
"rewards/format_reward": 0.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.876,
"grad_norm": 2.7079639434814453,
"kl": 1.2607421875,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0505,
"reward": -0.3099018558859825,
"reward_std": 0.2861209958791733,
"rewards/cosine_scaled_reward": -0.15495092794299126,
"rewards/format_reward": 0.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.88,
"grad_norm": 1.9640864133834839,
"kl": 1.234375,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0493,
"reward": -0.40535254031419754,
"reward_std": 0.2874290943145752,
"rewards/cosine_scaled_reward": -0.20267625898122787,
"rewards/format_reward": 0.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.884,
"grad_norm": 2.130681037902832,
"kl": 1.486328125,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0595,
"reward": -0.3594564124941826,
"reward_std": 0.3218042775988579,
"rewards/cosine_scaled_reward": -0.1797281987965107,
"rewards/format_reward": 0.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.888,
"grad_norm": 2.1852834224700928,
"kl": 1.48046875,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0591,
"reward": -0.331524558365345,
"reward_std": 0.28531621396541595,
"rewards/cosine_scaled_reward": -0.1657622903585434,
"rewards/format_reward": 0.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.892,
"grad_norm": 2.3731930255889893,
"kl": 1.734375,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0693,
"reward": -0.38006093353033066,
"reward_std": 0.3292882591485977,
"rewards/cosine_scaled_reward": -0.19003047049045563,
"rewards/format_reward": 0.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.896,
"grad_norm": 2.3246822357177734,
"kl": 1.0390625,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0416,
"reward": -0.3990800455212593,
"reward_std": 0.3413678854703903,
"rewards/cosine_scaled_reward": -0.19954002648591995,
"rewards/format_reward": 0.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.9,
"grad_norm": 2.4476959705352783,
"kl": 1.45703125,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0583,
"reward": -0.39841071516275406,
"reward_std": 0.31324755400419235,
"rewards/cosine_scaled_reward": -0.19920538365840912,
"rewards/format_reward": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.904,
"grad_norm": 3.0681633949279785,
"kl": 1.75,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0701,
"reward": -0.35963694006204605,
"reward_std": 0.3227182477712631,
"rewards/cosine_scaled_reward": -0.17981846630573273,
"rewards/format_reward": 0.0,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.908,
"grad_norm": 3.8354952335357666,
"kl": 1.5087890625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0604,
"reward": -0.3886452168226242,
"reward_std": 0.31125637143850327,
"rewards/cosine_scaled_reward": -0.1943226121366024,
"rewards/format_reward": 0.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.912,
"grad_norm": 2.3208184242248535,
"kl": 1.39453125,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0558,
"reward": -0.34270477294921875,
"reward_std": 0.3698492497205734,
"rewards/cosine_scaled_reward": -0.17135238647460938,
"rewards/format_reward": 0.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.916,
"grad_norm": 2.174126386642456,
"kl": 2.009765625,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0804,
"reward": -0.37576939910650253,
"reward_std": 0.3269713968038559,
"rewards/cosine_scaled_reward": -0.18788469955325127,
"rewards/format_reward": 0.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.92,
"grad_norm": 2.081784725189209,
"kl": 2.1728515625,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0869,
"reward": -0.3998561128973961,
"reward_std": 0.32443511486053467,
"rewards/cosine_scaled_reward": -0.19992805272340775,
"rewards/format_reward": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.924,
"grad_norm": 2.3403866291046143,
"kl": 1.17529296875,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0469,
"reward": -0.38807813823223114,
"reward_std": 0.32711831480264664,
"rewards/cosine_scaled_reward": -0.19403906539082527,
"rewards/format_reward": 0.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.928,
"grad_norm": 2.029927968978882,
"kl": 1.32666015625,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0531,
"reward": -0.38948777318000793,
"reward_std": 0.3195284381508827,
"rewards/cosine_scaled_reward": -0.19474387168884277,
"rewards/format_reward": 0.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.932,
"grad_norm": 2.9124484062194824,
"kl": 1.71484375,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0685,
"reward": -0.3806769847869873,
"reward_std": 0.2985011041164398,
"rewards/cosine_scaled_reward": -0.19033849611878395,
"rewards/format_reward": 0.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.936,
"grad_norm": 2.464742422103882,
"kl": 1.2998046875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.052,
"reward": -0.3443439155817032,
"reward_std": 0.29415207356214523,
"rewards/cosine_scaled_reward": -0.1721719540655613,
"rewards/format_reward": 0.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.94,
"grad_norm": 2.1291651725769043,
"kl": 1.001953125,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0401,
"reward": -0.33735504001379013,
"reward_std": 0.28946489840745926,
"rewards/cosine_scaled_reward": -0.16867752373218536,
"rewards/format_reward": 0.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.944,
"grad_norm": 2.9513416290283203,
"kl": 1.6201171875,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0649,
"reward": -0.40289320796728134,
"reward_std": 0.30230626463890076,
"rewards/cosine_scaled_reward": -0.20144660398364067,
"rewards/format_reward": 0.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.948,
"grad_norm": 3.7395241260528564,
"kl": 1.6240234375,
"learning_rate": 6.679851303883891e-07,
"loss": 0.065,
"reward": -0.3659610077738762,
"reward_std": 0.32638294249773026,
"rewards/cosine_scaled_reward": -0.1829805038869381,
"rewards/format_reward": 0.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.952,
"grad_norm": 2.7872421741485596,
"kl": 1.7919921875,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0718,
"reward": -0.4507276937365532,
"reward_std": 0.35789574682712555,
"rewards/cosine_scaled_reward": -0.2253638356924057,
"rewards/format_reward": 0.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.956,
"grad_norm": 2.139983654022217,
"kl": 1.40234375,
"learning_rate": 6.619104492241847e-07,
"loss": 0.056,
"reward": -0.3731803297996521,
"reward_std": 0.30503255128860474,
"rewards/cosine_scaled_reward": -0.18659016117453575,
"rewards/format_reward": 0.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.96,
"grad_norm": 6.420464515686035,
"kl": 2.787109375,
"learning_rate": 6.588648530198504e-07,
"loss": 0.1116,
"reward": -0.40894675999879837,
"reward_std": 0.3296940475702286,
"rewards/cosine_scaled_reward": -0.20447338744997978,
"rewards/format_reward": 0.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.964,
"grad_norm": 2.4638171195983887,
"kl": 2.1806640625,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0874,
"reward": -0.42437078058719635,
"reward_std": 0.3512648344039917,
"rewards/cosine_scaled_reward": -0.21218538656830788,
"rewards/format_reward": 0.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.968,
"grad_norm": 2.8068432807922363,
"kl": 1.884765625,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0754,
"reward": -0.394868440926075,
"reward_std": 0.2916436865925789,
"rewards/cosine_scaled_reward": -0.1974342130124569,
"rewards/format_reward": 0.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.972,
"grad_norm": 2.272479295730591,
"kl": 1.453125,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0581,
"reward": -0.36773569136857986,
"reward_std": 0.3104323297739029,
"rewards/cosine_scaled_reward": -0.18386784568428993,
"rewards/format_reward": 0.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.976,
"grad_norm": 2.86352276802063,
"kl": 1.8525390625,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0742,
"reward": -0.3895353376865387,
"reward_std": 0.30376598984003067,
"rewards/cosine_scaled_reward": -0.19476767256855965,
"rewards/format_reward": 0.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.98,
"grad_norm": 3.2674906253814697,
"kl": 1.89453125,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0758,
"reward": -0.35536977648735046,
"reward_std": 0.32461147010326385,
"rewards/cosine_scaled_reward": -0.17768489941954613,
"rewards/format_reward": 0.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.984,
"grad_norm": 2.3651580810546875,
"kl": 1.3994140625,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0559,
"reward": -0.2967621465213597,
"reward_std": 0.29580704867839813,
"rewards/cosine_scaled_reward": -0.1483810821082443,
"rewards/format_reward": 0.0,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.988,
"grad_norm": 2.6290199756622314,
"kl": 1.544921875,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0618,
"reward": -0.3732440918684006,
"reward_std": 0.28786107152700424,
"rewards/cosine_scaled_reward": -0.1866220459342003,
"rewards/format_reward": 0.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.992,
"grad_norm": 2.474320650100708,
"kl": 1.18359375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0473,
"reward": -0.3813322111964226,
"reward_std": 0.3196609243750572,
"rewards/cosine_scaled_reward": -0.1906661055982113,
"rewards/format_reward": 0.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.996,
"grad_norm": 2.4096460342407227,
"kl": 1.185546875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0475,
"reward": -0.37723246961832047,
"reward_std": 0.32298891991376877,
"rewards/cosine_scaled_reward": -0.18861623480916023,
"rewards/format_reward": 0.0,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0001220703125,
"epoch": 1.0,
"grad_norm": 2.414369821548462,
"kl": 1.1552734375,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0462,
"reward": -0.3446759209036827,
"reward_std": 0.30413854122161865,
"rewards/cosine_scaled_reward": -0.17233795672655106,
"rewards/format_reward": 0.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.004,
"grad_norm": 2.3181285858154297,
"kl": 1.4765625,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0591,
"reward": -0.39850035309791565,
"reward_std": 0.3559228628873825,
"rewards/cosine_scaled_reward": -0.19925018772482872,
"rewards/format_reward": 0.0,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.008,
"grad_norm": 2.3214640617370605,
"kl": 1.59375,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0637,
"reward": -0.3477981239557266,
"reward_std": 0.3031875118613243,
"rewards/cosine_scaled_reward": -0.1738990694284439,
"rewards/format_reward": 0.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.012,
"grad_norm": 2.4848833084106445,
"kl": 1.6416015625,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0657,
"reward": -0.402904212474823,
"reward_std": 0.32011619955301285,
"rewards/cosine_scaled_reward": -0.2014521062374115,
"rewards/format_reward": 0.0,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.016,
"grad_norm": 7.0177903175354,
"kl": 3.015625,
"learning_rate": 6.157373628530852e-07,
"loss": 0.1206,
"reward": -0.41366545110940933,
"reward_std": 0.3347878158092499,
"rewards/cosine_scaled_reward": -0.20683272555470467,
"rewards/format_reward": 0.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 1533.3928527832031,
"epoch": 1.02,
"grad_norm": 2.5155041217803955,
"kl": 1.818359375,
"learning_rate": 6.126278954320294e-07,
"loss": 0.073,
"reward": -0.41607701033353806,
"reward_std": 0.33659277111291885,
"rewards/cosine_scaled_reward": -0.20803850889205933,
"rewards/format_reward": 0.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.024,
"grad_norm": 3.175401449203491,
"kl": 2.349609375,
"learning_rate": 6.095153756157051e-07,
"loss": 0.094,
"reward": -0.3731570616364479,
"reward_std": 0.3251727372407913,
"rewards/cosine_scaled_reward": -0.18657853826880455,
"rewards/format_reward": 0.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.028,
"grad_norm": 2.345123052597046,
"kl": 2.140625,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0857,
"reward": -0.4059467390179634,
"reward_std": 0.3182907700538635,
"rewards/cosine_scaled_reward": -0.2029733695089817,
"rewards/format_reward": 0.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.032,
"grad_norm": 2.636462688446045,
"kl": 1.705078125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.068,
"reward": -0.343365378677845,
"reward_std": 0.3163585662841797,
"rewards/cosine_scaled_reward": -0.1716826893389225,
"rewards/format_reward": 0.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.036,
"grad_norm": 2.297900438308716,
"kl": 1.51953125,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0608,
"reward": -0.3703172579407692,
"reward_std": 0.3630036562681198,
"rewards/cosine_scaled_reward": -0.1851586326956749,
"rewards/format_reward": 0.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.04,
"grad_norm": 2.311648368835449,
"kl": 1.515625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0605,
"reward": -0.3789840117096901,
"reward_std": 0.330322228372097,
"rewards/cosine_scaled_reward": -0.18949199840426445,
"rewards/format_reward": 0.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.044,
"grad_norm": 2.3599531650543213,
"kl": 1.78515625,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0714,
"reward": -0.3447503596544266,
"reward_std": 0.33612456917762756,
"rewards/cosine_scaled_reward": -0.17237518727779388,
"rewards/format_reward": 0.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 1527.6190490722656,
"epoch": 1.048,
"grad_norm": 2.2337074279785156,
"kl": 1.890625,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0786,
"reward": -0.39859064668416977,
"reward_std": 0.32645051926374435,
"rewards/cosine_scaled_reward": -0.1992953196167946,
"rewards/format_reward": 0.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.052,
"grad_norm": 2.818617582321167,
"kl": 1.55859375,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0624,
"reward": -0.3537183925509453,
"reward_std": 0.309035487473011,
"rewards/cosine_scaled_reward": -0.17685920372605324,
"rewards/format_reward": 0.0,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.056,
"grad_norm": 2.3533854484558105,
"kl": 1.3583984375,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0543,
"reward": -0.3672221526503563,
"reward_std": 0.31650061905384064,
"rewards/cosine_scaled_reward": -0.18361108005046844,
"rewards/format_reward": 0.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.06,
"grad_norm": 3.936475992202759,
"kl": 2.265625,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0907,
"reward": -0.36572812497615814,
"reward_std": 0.2912697494029999,
"rewards/cosine_scaled_reward": -0.18286405876278877,
"rewards/format_reward": 0.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.064,
"grad_norm": 2.754866600036621,
"kl": 1.943359375,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0777,
"reward": -0.37356945127248764,
"reward_std": 0.34380726516246796,
"rewards/cosine_scaled_reward": -0.18678472936153412,
"rewards/format_reward": 0.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.068,
"grad_norm": 2.374964952468872,
"kl": 1.4267578125,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0571,
"reward": -0.3651036322116852,
"reward_std": 0.30468039214611053,
"rewards/cosine_scaled_reward": -0.1825518161058426,
"rewards/format_reward": 0.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 1533.0535888671875,
"epoch": 1.072,
"grad_norm": 2.618032693862915,
"kl": 1.6171875,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0651,
"reward": -0.35353927314281464,
"reward_std": 0.3086354061961174,
"rewards/cosine_scaled_reward": -0.17676963657140732,
"rewards/format_reward": 0.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.076,
"grad_norm": 2.920133590698242,
"kl": 1.8515625,
"learning_rate": 5.688440441781398e-07,
"loss": 0.074,
"reward": -0.37572528421878815,
"reward_std": 0.33292342722415924,
"rewards/cosine_scaled_reward": -0.18786264210939407,
"rewards/format_reward": 0.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.08,
"grad_norm": 2.581885576248169,
"kl": 1.830078125,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0732,
"reward": -0.34584221988916397,
"reward_std": 0.3140456974506378,
"rewards/cosine_scaled_reward": -0.17292110994458199,
"rewards/format_reward": 0.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.084,
"grad_norm": 8.366601943969727,
"kl": 2.509765625,
"learning_rate": 5.625647374256061e-07,
"loss": 0.1003,
"reward": -0.37314866855740547,
"reward_std": 0.2792880907654762,
"rewards/cosine_scaled_reward": -0.18657432682812214,
"rewards/format_reward": 0.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.088,
"grad_norm": 3.071047067642212,
"kl": 1.9658203125,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0785,
"reward": -0.39643432199954987,
"reward_std": 0.31065937131643295,
"rewards/cosine_scaled_reward": -0.19821715354919434,
"rewards/format_reward": 0.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.092,
"grad_norm": 3.8571436405181885,
"kl": 1.2626953125,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0506,
"reward": -0.3136083036661148,
"reward_std": 0.28241100907325745,
"rewards/cosine_scaled_reward": -0.1568041555583477,
"rewards/format_reward": 0.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.096,
"grad_norm": 2.1380457878112793,
"kl": 1.96875,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0786,
"reward": -0.35791803896427155,
"reward_std": 0.3191326707601547,
"rewards/cosine_scaled_reward": -0.17895901948213577,
"rewards/format_reward": 0.0,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.1,
"grad_norm": 3.744987964630127,
"kl": 2.048828125,
"learning_rate": 5.5e-07,
"loss": 0.0819,
"reward": -0.3743599057197571,
"reward_std": 0.3121279552578926,
"rewards/cosine_scaled_reward": -0.18717995658516884,
"rewards/format_reward": 0.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.104,
"grad_norm": 2.783698081970215,
"kl": 1.8984375,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0761,
"reward": -0.3865007609128952,
"reward_std": 0.322613961994648,
"rewards/cosine_scaled_reward": -0.1932503841817379,
"rewards/format_reward": 0.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.108,
"grad_norm": 3.2086503505706787,
"kl": 1.865234375,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0746,
"reward": -0.41129884123802185,
"reward_std": 0.3018573820590973,
"rewards/cosine_scaled_reward": -0.20564941689372063,
"rewards/format_reward": 0.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.112,
"grad_norm": 2.4078729152679443,
"kl": 1.4072265625,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0563,
"reward": -0.39701489359140396,
"reward_std": 0.3126164525747299,
"rewards/cosine_scaled_reward": -0.19850744307041168,
"rewards/format_reward": 0.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.116,
"grad_norm": 2.5043461322784424,
"kl": 2.35546875,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0944,
"reward": -0.28278425987809896,
"reward_std": 0.2714259997010231,
"rewards/cosine_scaled_reward": -0.1413921354105696,
"rewards/format_reward": 0.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 1533.6190490722656,
"epoch": 1.12,
"grad_norm": 4.991820335388184,
"kl": 1.83984375,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0713,
"reward": -0.3403998464345932,
"reward_std": 0.3223363533616066,
"rewards/cosine_scaled_reward": -0.1701999232172966,
"rewards/format_reward": 0.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1534.0476379394531,
"epoch": 1.124,
"grad_norm": 2.818126916885376,
"kl": 1.37890625,
"learning_rate": 5.311559558218603e-07,
"loss": 0.054,
"reward": -0.3611769676208496,
"reward_std": 0.3213232010602951,
"rewards/cosine_scaled_reward": -0.1805884800851345,
"rewards/format_reward": 0.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.1280000000000001,
"grad_norm": 2.7234742641448975,
"kl": 2.248046875,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0899,
"reward": -0.4201104864478111,
"reward_std": 0.3131628781557083,
"rewards/cosine_scaled_reward": -0.21005523577332497,
"rewards/format_reward": 0.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.1320000000000001,
"grad_norm": 6.938405990600586,
"kl": 1.998046875,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0799,
"reward": -0.33411792665719986,
"reward_std": 0.32330870628356934,
"rewards/cosine_scaled_reward": -0.16705895960330963,
"rewards/format_reward": 0.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.1360000000000001,
"grad_norm": 3.5663974285125732,
"kl": 1.3349609375,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0534,
"reward": -0.3633820191025734,
"reward_std": 0.31287185102701187,
"rewards/cosine_scaled_reward": -0.1816909983754158,
"rewards/format_reward": 0.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.1400000000000001,
"grad_norm": 2.0476882457733154,
"kl": 1.708984375,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0684,
"reward": -0.3689531907439232,
"reward_std": 0.32297470420598984,
"rewards/cosine_scaled_reward": -0.184476587921381,
"rewards/format_reward": 0.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1526.8869018554688,
"epoch": 1.144,
"grad_norm": 12.345512390136719,
"kl": 2.966796875,
"learning_rate": 5.154764373429315e-07,
"loss": 0.1254,
"reward": -0.3650151863694191,
"reward_std": 0.31899186968803406,
"rewards/cosine_scaled_reward": -0.18250760063529015,
"rewards/format_reward": 0.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.148,
"grad_norm": 2.059617519378662,
"kl": 2.291015625,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0916,
"reward": -0.3706892877817154,
"reward_std": 0.32747378945350647,
"rewards/cosine_scaled_reward": -0.1853446513414383,
"rewards/format_reward": 0.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.152,
"grad_norm": 3.889174699783325,
"kl": 2.0859375,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0834,
"reward": -0.4078289121389389,
"reward_std": 0.3290611281991005,
"rewards/cosine_scaled_reward": -0.20391445606946945,
"rewards/format_reward": 0.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1533.2440490722656,
"epoch": 1.156,
"grad_norm": 2.5038888454437256,
"kl": 0.93896484375,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0354,
"reward": -0.34110401570796967,
"reward_std": 0.3122602626681328,
"rewards/cosine_scaled_reward": -0.17055201157927513,
"rewards/format_reward": 0.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.16,
"grad_norm": 2.39719557762146,
"kl": 1.8583984375,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0744,
"reward": -0.36911261081695557,
"reward_std": 0.3288589343428612,
"rewards/cosine_scaled_reward": -0.1845562942326069,
"rewards/format_reward": 0.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.164,
"grad_norm": 2.758849620819092,
"kl": 1.626953125,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0651,
"reward": -0.3935117796063423,
"reward_std": 0.3461349532008171,
"rewards/cosine_scaled_reward": -0.19675587862730026,
"rewards/format_reward": 0.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.168,
"grad_norm": 2.310575246810913,
"kl": 1.455078125,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0583,
"reward": -0.34184807538986206,
"reward_std": 0.3021695464849472,
"rewards/cosine_scaled_reward": -0.17092403396964073,
"rewards/format_reward": 0.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.172,
"grad_norm": 2.8417394161224365,
"kl": 1.861328125,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0744,
"reward": -0.3772461339831352,
"reward_std": 0.3044436201453209,
"rewards/cosine_scaled_reward": -0.18862305954098701,
"rewards/format_reward": 0.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.176,
"grad_norm": 2.347404956817627,
"kl": 1.28125,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0513,
"reward": -0.3517310842871666,
"reward_std": 0.3094722405076027,
"rewards/cosine_scaled_reward": -0.1758655458688736,
"rewards/format_reward": 0.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 1535.6130981445312,
"epoch": 1.18,
"grad_norm": 2.7739925384521484,
"kl": 1.833984375,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0731,
"reward": -0.4288819953799248,
"reward_std": 0.3247087821364403,
"rewards/cosine_scaled_reward": -0.2144409976899624,
"rewards/format_reward": 0.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.184,
"grad_norm": 2.1470892429351807,
"kl": 1.296875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0519,
"reward": -0.35219819098711014,
"reward_std": 0.3056294918060303,
"rewards/cosine_scaled_reward": -0.17609910294413567,
"rewards/format_reward": 0.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.188,
"grad_norm": 3.177232503890991,
"kl": 1.677734375,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0671,
"reward": -0.3717339485883713,
"reward_std": 0.29695921391248703,
"rewards/cosine_scaled_reward": -0.18586697429418564,
"rewards/format_reward": 0.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.192,
"grad_norm": 3.3333382606506348,
"kl": 2.322265625,
"learning_rate": 4.780534655386743e-07,
"loss": 0.093,
"reward": -0.3814833015203476,
"reward_std": 0.28608307987451553,
"rewards/cosine_scaled_reward": -0.1907416470348835,
"rewards/format_reward": 0.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.196,
"grad_norm": 2.842420816421509,
"kl": 1.45703125,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0583,
"reward": -0.3840809538960457,
"reward_std": 0.31393957883119583,
"rewards/cosine_scaled_reward": -0.19204047322273254,
"rewards/format_reward": 0.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.2,
"grad_norm": 2.9220309257507324,
"kl": 1.681640625,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0672,
"reward": -0.39588408917188644,
"reward_std": 0.33600132539868355,
"rewards/cosine_scaled_reward": -0.19794204831123352,
"rewards/format_reward": 0.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.204,
"grad_norm": 3.4091219902038574,
"kl": 1.44140625,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0576,
"reward": -0.2894315180601552,
"reward_std": 0.30969203263521194,
"rewards/cosine_scaled_reward": -0.14471576345385984,
"rewards/format_reward": 0.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.208,
"grad_norm": 2.0488741397857666,
"kl": 1.5576171875,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0624,
"reward": -0.32318826019763947,
"reward_std": 0.3031533695757389,
"rewards/cosine_scaled_reward": -0.16159413009881973,
"rewards/format_reward": 0.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.212,
"grad_norm": 2.6755242347717285,
"kl": 1.34765625,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0539,
"reward": -0.37002843618392944,
"reward_std": 0.31058184802532196,
"rewards/cosine_scaled_reward": -0.18501422181725502,
"rewards/format_reward": 0.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.216,
"grad_norm": 6.160266399383545,
"kl": 1.734375,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0694,
"reward": -0.38714154064655304,
"reward_std": 0.3265160173177719,
"rewards/cosine_scaled_reward": -0.19357078149914742,
"rewards/format_reward": 0.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.22,
"grad_norm": 2.3529880046844482,
"kl": 1.2138671875,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0486,
"reward": -0.3460870534181595,
"reward_std": 0.3087117671966553,
"rewards/cosine_scaled_reward": -0.17304353043437004,
"rewards/format_reward": 0.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.224,
"grad_norm": 2.48714280128479,
"kl": 1.9453125,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0779,
"reward": -0.3756335750222206,
"reward_std": 0.32805445045232773,
"rewards/cosine_scaled_reward": -0.1878167800605297,
"rewards/format_reward": 0.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.228,
"grad_norm": 8.46654987335205,
"kl": 2.5107421875,
"learning_rate": 4.503031760712397e-07,
"loss": 0.1004,
"reward": -0.385331392288208,
"reward_std": 0.31344960629940033,
"rewards/cosine_scaled_reward": -0.1926657035946846,
"rewards/format_reward": 0.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.232,
"grad_norm": 3.198944568634033,
"kl": 2.140625,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0857,
"reward": -0.36118319630622864,
"reward_std": 0.3010380119085312,
"rewards/cosine_scaled_reward": -0.18059159815311432,
"rewards/format_reward": 0.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.236,
"grad_norm": 2.745668411254883,
"kl": 2.033203125,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0813,
"reward": -0.3596822917461395,
"reward_std": 0.3092067465186119,
"rewards/cosine_scaled_reward": -0.17984114587306976,
"rewards/format_reward": 0.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.24,
"grad_norm": 5.614748954772949,
"kl": 2.34375,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.094,
"reward": -0.34773094952106476,
"reward_std": 0.29645886272192,
"rewards/cosine_scaled_reward": -0.17386547103524208,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.244,
"grad_norm": 2.089031219482422,
"kl": 1.39453125,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0558,
"reward": -0.33028923720121384,
"reward_std": 0.2886582836508751,
"rewards/cosine_scaled_reward": -0.16514462232589722,
"rewards/format_reward": 0.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.248,
"grad_norm": 5.366787433624268,
"kl": 2.9599609375,
"learning_rate": 4.350494089288943e-07,
"loss": 0.1186,
"reward": -0.4123021811246872,
"reward_std": 0.337029866874218,
"rewards/cosine_scaled_reward": -0.206151083111763,
"rewards/format_reward": 0.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.252,
"grad_norm": 8.391505241394043,
"kl": 1.953125,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.078,
"reward": -0.3487403020262718,
"reward_std": 0.3276291638612747,
"rewards/cosine_scaled_reward": -0.1743701510131359,
"rewards/format_reward": 0.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.256,
"grad_norm": 2.623786449432373,
"kl": 1.3193359375,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0528,
"reward": -0.32606934756040573,
"reward_std": 0.28208620101213455,
"rewards/cosine_scaled_reward": -0.16303467005491257,
"rewards/format_reward": 0.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.26,
"grad_norm": 2.2247447967529297,
"kl": 1.884765625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0755,
"reward": -0.2273978427692782,
"reward_std": 0.28098014742136,
"rewards/cosine_scaled_reward": -0.11369891960930545,
"rewards/format_reward": 0.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.264,
"grad_norm": 2.258469581604004,
"kl": 1.14453125,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0457,
"reward": -0.24764333851635456,
"reward_std": 0.2835834100842476,
"rewards/cosine_scaled_reward": -0.12382166367024183,
"rewards/format_reward": 0.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.268,
"grad_norm": 2.884620189666748,
"kl": 1.5986328125,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.064,
"reward": -0.37140634655952454,
"reward_std": 0.36573630571365356,
"rewards/cosine_scaled_reward": -0.18570317327976227,
"rewards/format_reward": 0.0,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.272,
"grad_norm": 2.703934669494629,
"kl": 1.912109375,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0763,
"reward": -0.34411681443452835,
"reward_std": 0.29631946235895157,
"rewards/cosine_scaled_reward": -0.17205841839313507,
"rewards/format_reward": 0.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.276,
"grad_norm": 3.717240571975708,
"kl": 2.224609375,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0891,
"reward": -0.3324529230594635,
"reward_std": 0.2552623227238655,
"rewards/cosine_scaled_reward": -0.16622646152973175,
"rewards/format_reward": 0.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.28,
"grad_norm": 2.4941396713256836,
"kl": 1.384765625,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0555,
"reward": -0.30811577290296555,
"reward_std": 0.2845884971320629,
"rewards/cosine_scaled_reward": -0.15405788272619247,
"rewards/format_reward": 0.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.284,
"grad_norm": 3.229072332382202,
"kl": 1.9453125,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0778,
"reward": -0.3366442248225212,
"reward_std": 0.301740899682045,
"rewards/cosine_scaled_reward": -0.1683221124112606,
"rewards/format_reward": 0.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.288,
"grad_norm": 3.3636343479156494,
"kl": 1.8828125,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0752,
"reward": -0.36845648288726807,
"reward_std": 0.34283190220594406,
"rewards/cosine_scaled_reward": -0.18422825261950493,
"rewards/format_reward": 0.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.292,
"grad_norm": 3.507054090499878,
"kl": 1.4130859375,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0566,
"reward": -0.34711746126413345,
"reward_std": 0.2960944324731827,
"rewards/cosine_scaled_reward": -0.17355873063206673,
"rewards/format_reward": 0.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.296,
"grad_norm": 2.661647081375122,
"kl": 1.736328125,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0694,
"reward": -0.33277176320552826,
"reward_std": 0.3034566268324852,
"rewards/cosine_scaled_reward": -0.16638587787747383,
"rewards/format_reward": 0.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3,
"grad_norm": 3.079672336578369,
"kl": 1.359375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0544,
"reward": -0.3246685415506363,
"reward_std": 0.27341699600219727,
"rewards/cosine_scaled_reward": -0.16233427450060844,
"rewards/format_reward": 0.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.304,
"grad_norm": 3.248324394226074,
"kl": 1.1181640625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0447,
"reward": -0.3214203119277954,
"reward_std": 0.2835453376173973,
"rewards/cosine_scaled_reward": -0.160710159689188,
"rewards/format_reward": 0.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.308,
"grad_norm": 3.676837205886841,
"kl": 1.724609375,
"learning_rate": 3.902018669163384e-07,
"loss": 0.069,
"reward": -0.32949286699295044,
"reward_std": 0.30344782024621964,
"rewards/cosine_scaled_reward": -0.16474644094705582,
"rewards/format_reward": 0.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.312,
"grad_norm": 2.3120462894439697,
"kl": 1.537109375,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0615,
"reward": -0.3512613996863365,
"reward_std": 0.3501633331179619,
"rewards/cosine_scaled_reward": -0.17563070356845856,
"rewards/format_reward": 0.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.316,
"grad_norm": 2.4828386306762695,
"kl": 1.6953125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0677,
"reward": -0.31614498794078827,
"reward_std": 0.29276788979768753,
"rewards/cosine_scaled_reward": -0.15807249024510384,
"rewards/format_reward": 0.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.32,
"grad_norm": 3.356783151626587,
"kl": 2.453125,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0982,
"reward": -0.4576185494661331,
"reward_std": 0.32832735031843185,
"rewards/cosine_scaled_reward": -0.22880928218364716,
"rewards/format_reward": 0.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.324,
"grad_norm": 2.7885196208953857,
"kl": 2.068359375,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0827,
"reward": -0.2943090833723545,
"reward_std": 0.31652648001909256,
"rewards/cosine_scaled_reward": -0.14715453796088696,
"rewards/format_reward": 0.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.328,
"grad_norm": 3.0415380001068115,
"kl": 1.802734375,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0721,
"reward": -0.3697570115327835,
"reward_std": 0.3258262947201729,
"rewards/cosine_scaled_reward": -0.18487850576639175,
"rewards/format_reward": 0.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.332,
"grad_norm": 3.139693021774292,
"kl": 1.732421875,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0693,
"reward": -0.33471549302339554,
"reward_std": 0.2794983647763729,
"rewards/cosine_scaled_reward": -0.16735775396227837,
"rewards/format_reward": 0.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.336,
"grad_norm": 2.6243162155151367,
"kl": 1.8369140625,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0733,
"reward": -0.3382048085331917,
"reward_std": 0.3457643389701843,
"rewards/cosine_scaled_reward": -0.16910240054130554,
"rewards/format_reward": 0.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.34,
"grad_norm": 3.803060293197632,
"kl": 1.8046875,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0723,
"reward": -0.3406166359782219,
"reward_std": 0.29876144975423813,
"rewards/cosine_scaled_reward": -0.17030831426382065,
"rewards/format_reward": 0.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3439999999999999,
"grad_norm": 3.948391914367676,
"kl": 1.365234375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0546,
"reward": -0.2908342033624649,
"reward_std": 0.26911235228180885,
"rewards/cosine_scaled_reward": -0.14541710540652275,
"rewards/format_reward": 0.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3479999999999999,
"grad_norm": 2.9695639610290527,
"kl": 2.19921875,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0881,
"reward": -0.37160656601190567,
"reward_std": 0.3147331103682518,
"rewards/cosine_scaled_reward": -0.18580328300595284,
"rewards/format_reward": 0.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3519999999999999,
"grad_norm": 3.1350209712982178,
"kl": 2.1689453125,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.087,
"reward": -0.3230074942111969,
"reward_std": 0.313438281416893,
"rewards/cosine_scaled_reward": -0.16150375083088875,
"rewards/format_reward": 0.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3559999999999999,
"grad_norm": 3.882567882537842,
"kl": 2.0546875,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0821,
"reward": -0.36975327879190445,
"reward_std": 0.31242573261260986,
"rewards/cosine_scaled_reward": -0.18487663567066193,
"rewards/format_reward": 0.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3599999999999999,
"grad_norm": 2.6699118614196777,
"kl": 1.689453125,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0676,
"reward": -0.368961863219738,
"reward_std": 0.32627636194229126,
"rewards/cosine_scaled_reward": -0.1844809353351593,
"rewards/format_reward": 0.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3639999999999999,
"grad_norm": 3.0782856941223145,
"kl": 1.59765625,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.064,
"reward": -0.3320116475224495,
"reward_std": 0.3151276856660843,
"rewards/cosine_scaled_reward": -0.16600582748651505,
"rewards/format_reward": 0.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3679999999999999,
"grad_norm": 2.2419495582580566,
"kl": 1.46484375,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0585,
"reward": -0.2764207161962986,
"reward_std": 0.3390573188662529,
"rewards/cosine_scaled_reward": -0.1382103539071977,
"rewards/format_reward": 0.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.3719999999999999,
"grad_norm": 4.397972106933594,
"kl": 2.5,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.1002,
"reward": -0.33926407247781754,
"reward_std": 0.31172922998666763,
"rewards/cosine_scaled_reward": -0.16963203251361847,
"rewards/format_reward": 0.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.376,
"grad_norm": 3.441905975341797,
"kl": 2.0234375,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0808,
"reward": -0.3324861600995064,
"reward_std": 0.2958858981728554,
"rewards/cosine_scaled_reward": -0.1662430725991726,
"rewards/format_reward": 0.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.38,
"grad_norm": 2.7323975563049316,
"kl": 1.4189453125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0566,
"reward": -0.3314187452197075,
"reward_std": 0.3164066970348358,
"rewards/cosine_scaled_reward": -0.16570937633514404,
"rewards/format_reward": 0.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.384,
"grad_norm": 4.131885528564453,
"kl": 2.45703125,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0983,
"reward": -0.37432391941547394,
"reward_std": 0.33136965334415436,
"rewards/cosine_scaled_reward": -0.18716195970773697,
"rewards/format_reward": 0.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.388,
"grad_norm": 2.9907569885253906,
"kl": 1.732421875,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0693,
"reward": -0.38256606459617615,
"reward_std": 0.31782740354537964,
"rewards/cosine_scaled_reward": -0.19128303229808807,
"rewards/format_reward": 0.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.392,
"grad_norm": 2.6049344539642334,
"kl": 1.53515625,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0613,
"reward": -0.2997368350625038,
"reward_std": 0.3045838475227356,
"rewards/cosine_scaled_reward": -0.1498684138059616,
"rewards/format_reward": 0.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.396,
"grad_norm": 4.5095295906066895,
"kl": 1.5029296875,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0602,
"reward": -0.3363025635480881,
"reward_std": 0.30865515023469925,
"rewards/cosine_scaled_reward": -0.16815128177404404,
"rewards/format_reward": 0.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.4,
"grad_norm": 3.3342795372009277,
"kl": 1.908203125,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0762,
"reward": -0.3770889565348625,
"reward_std": 0.30710920691490173,
"rewards/cosine_scaled_reward": -0.18854447081685066,
"rewards/format_reward": 0.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.404,
"grad_norm": 2.795259714126587,
"kl": 2.048828125,
"learning_rate": 3.222848061454764e-07,
"loss": 0.082,
"reward": -0.3462035730481148,
"reward_std": 0.32692621648311615,
"rewards/cosine_scaled_reward": -0.1731017865240574,
"rewards/format_reward": 0.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.408,
"grad_norm": 2.563765287399292,
"kl": 1.462890625,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0586,
"reward": -0.37373943626880646,
"reward_std": 0.3041759356856346,
"rewards/cosine_scaled_reward": -0.18686972558498383,
"rewards/format_reward": 0.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.412,
"grad_norm": 2.6194751262664795,
"kl": 1.24609375,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0498,
"reward": -0.3196728527545929,
"reward_std": 0.2953634150326252,
"rewards/cosine_scaled_reward": -0.15983642637729645,
"rewards/format_reward": 0.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.416,
"grad_norm": 2.8382420539855957,
"kl": 1.650390625,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0662,
"reward": -0.33513225615024567,
"reward_std": 0.30527665093541145,
"rewards/cosine_scaled_reward": -0.16756613552570343,
"rewards/format_reward": 0.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.42,
"grad_norm": 2.6078808307647705,
"kl": 2.15234375,
"learning_rate": 3.115363310950578e-07,
"loss": 0.086,
"reward": -0.3992829695343971,
"reward_std": 0.31726495921611786,
"rewards/cosine_scaled_reward": -0.19964147731661797,
"rewards/format_reward": 0.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.424,
"grad_norm": 4.192615985870361,
"kl": 2.142578125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0858,
"reward": -0.39319509267807007,
"reward_std": 0.3372880816459656,
"rewards/cosine_scaled_reward": -0.19659754261374474,
"rewards/format_reward": 0.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.428,
"grad_norm": 3.196894407272339,
"kl": 2.509765625,
"learning_rate": 3.062313053727671e-07,
"loss": 0.1006,
"reward": -0.3694089204072952,
"reward_std": 0.323252871632576,
"rewards/cosine_scaled_reward": -0.1847044676542282,
"rewards/format_reward": 0.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.432,
"grad_norm": 3.348161458969116,
"kl": 1.1142578125,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0447,
"reward": -0.36088229715824127,
"reward_std": 0.31483449041843414,
"rewards/cosine_scaled_reward": -0.18044114857912064,
"rewards/format_reward": 0.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.436,
"grad_norm": 3.457472324371338,
"kl": 2.2265625,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.089,
"reward": -0.3612442761659622,
"reward_std": 0.28438059240579605,
"rewards/cosine_scaled_reward": -0.1806221418082714,
"rewards/format_reward": 0.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.44,
"grad_norm": 3.285405397415161,
"kl": 2.076171875,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0831,
"reward": -0.32887883111834526,
"reward_std": 0.3107897564768791,
"rewards/cosine_scaled_reward": -0.16443941928446293,
"rewards/format_reward": 0.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.444,
"grad_norm": 2.9156711101531982,
"kl": 1.7646484375,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0706,
"reward": -0.3512116149067879,
"reward_std": 0.32886873185634613,
"rewards/cosine_scaled_reward": -0.17560580000281334,
"rewards/format_reward": 0.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.448,
"grad_norm": 2.42704439163208,
"kl": 1.697265625,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0679,
"reward": -0.3639722764492035,
"reward_std": 0.2881170064210892,
"rewards/cosine_scaled_reward": -0.18198613449931145,
"rewards/format_reward": 0.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.452,
"grad_norm": 4.5008225440979,
"kl": 2.177734375,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.087,
"reward": -0.3515865206718445,
"reward_std": 0.290123887360096,
"rewards/cosine_scaled_reward": -0.17579325661063194,
"rewards/format_reward": 0.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.456,
"grad_norm": 2.7479496002197266,
"kl": 1.578125,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0632,
"reward": -0.26583924936130643,
"reward_std": 0.29539088532328606,
"rewards/cosine_scaled_reward": -0.13291961723007262,
"rewards/format_reward": 0.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.46,
"grad_norm": 2.6749367713928223,
"kl": 2.1796875,
"learning_rate": 2.854966364683872e-07,
"loss": 0.087,
"reward": -0.36106909811496735,
"reward_std": 0.2982637956738472,
"rewards/cosine_scaled_reward": -0.18053454905748367,
"rewards/format_reward": 0.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.464,
"grad_norm": 3.6434812545776367,
"kl": 1.4482421875,
"learning_rate": 2.829615010283344e-07,
"loss": 0.058,
"reward": -0.35805001854896545,
"reward_std": 0.31588251888751984,
"rewards/cosine_scaled_reward": -0.17902500554919243,
"rewards/format_reward": 0.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.468,
"grad_norm": 2.877927780151367,
"kl": 1.779296875,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0712,
"reward": -0.35267870873212814,
"reward_std": 0.3029713034629822,
"rewards/cosine_scaled_reward": -0.17633935809135437,
"rewards/format_reward": 0.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.472,
"grad_norm": 2.9547438621520996,
"kl": 1.3583984375,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0542,
"reward": -0.34842824190855026,
"reward_std": 0.28041965141892433,
"rewards/cosine_scaled_reward": -0.17421411722898483,
"rewards/format_reward": 0.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.476,
"grad_norm": 2.4998183250427246,
"kl": 1.712890625,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0686,
"reward": -0.34311509132385254,
"reward_std": 0.3226206302642822,
"rewards/cosine_scaled_reward": -0.17155754193663597,
"rewards/format_reward": 0.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.48,
"grad_norm": 3.5822997093200684,
"kl": 1.2568359375,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0502,
"reward": -0.31581661850214005,
"reward_std": 0.27614113688468933,
"rewards/cosine_scaled_reward": -0.15790832042694092,
"rewards/format_reward": 0.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.484,
"grad_norm": 2.638000965118408,
"kl": 1.658203125,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0663,
"reward": -0.3658217638731003,
"reward_std": 0.3533295765519142,
"rewards/cosine_scaled_reward": -0.18291086703538895,
"rewards/format_reward": 0.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.488,
"grad_norm": 2.4719886779785156,
"kl": 1.470703125,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0588,
"reward": -0.35377567261457443,
"reward_std": 0.2872357815504074,
"rewards/cosine_scaled_reward": -0.17688783630728722,
"rewards/format_reward": 0.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.492,
"grad_norm": 3.820688486099243,
"kl": 1.65673828125,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0662,
"reward": -0.3673105686903,
"reward_std": 0.29224705323576927,
"rewards/cosine_scaled_reward": -0.1836552768945694,
"rewards/format_reward": 0.0,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.496,
"grad_norm": 3.1416916847229004,
"kl": 1.4990234375,
"learning_rate": 2.631592046130896e-07,
"loss": 0.06,
"reward": -0.3574133738875389,
"reward_std": 0.2663569226861,
"rewards/cosine_scaled_reward": -0.17870669439435005,
"rewards/format_reward": 0.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.5,
"grad_norm": 2.3712515830993652,
"kl": 1.900390625,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0761,
"reward": -0.34536080807447433,
"reward_std": 0.3063738942146301,
"rewards/cosine_scaled_reward": -0.17268040403723717,
"rewards/format_reward": 0.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.504,
"grad_norm": 2.792006254196167,
"kl": 1.71875,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0688,
"reward": -0.3458981513977051,
"reward_std": 0.3039686158299446,
"rewards/cosine_scaled_reward": -0.17294907197356224,
"rewards/format_reward": 0.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.508,
"grad_norm": 2.985948085784912,
"kl": 1.5625,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0625,
"reward": -0.21606629202142358,
"reward_std": 0.2749215438961983,
"rewards/cosine_scaled_reward": -0.10803314973600209,
"rewards/format_reward": 0.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 1531.952392578125,
"epoch": 1.512,
"grad_norm": 2.396852970123291,
"kl": 1.9921875,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0823,
"reward": -0.38127752393484116,
"reward_std": 0.32172612845897675,
"rewards/cosine_scaled_reward": -0.19063876569271088,
"rewards/format_reward": 0.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.516,
"grad_norm": 2.503976345062256,
"kl": 1.794921875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0718,
"reward": -0.3479606434702873,
"reward_std": 0.29174239560961723,
"rewards/cosine_scaled_reward": -0.17398031428456306,
"rewards/format_reward": 0.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 1531.8035888671875,
"epoch": 1.52,
"grad_norm": 3.344243049621582,
"kl": 2.080078125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.083,
"reward": -0.38203170895576477,
"reward_std": 0.3180833086371422,
"rewards/cosine_scaled_reward": -0.19101585447788239,
"rewards/format_reward": 0.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.524,
"grad_norm": 3.5073604583740234,
"kl": 2.095703125,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0837,
"reward": -0.33683621138334274,
"reward_std": 0.3141423165798187,
"rewards/cosine_scaled_reward": -0.16841810569167137,
"rewards/format_reward": 0.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.528,
"grad_norm": 2.7634477615356445,
"kl": 2.55859375,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.1022,
"reward": -0.3983701467514038,
"reward_std": 0.31766583025455475,
"rewards/cosine_scaled_reward": -0.199185062199831,
"rewards/format_reward": 0.0,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.532,
"grad_norm": 3.1601033210754395,
"kl": 1.486328125,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0594,
"reward": -0.37120404094457626,
"reward_std": 0.3172856420278549,
"rewards/cosine_scaled_reward": -0.18560202419757843,
"rewards/format_reward": 0.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.536,
"grad_norm": 2.475311040878296,
"kl": 2.01953125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0807,
"reward": -0.3449181020259857,
"reward_std": 0.3061336353421211,
"rewards/cosine_scaled_reward": -0.17245905846357346,
"rewards/format_reward": 0.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.54,
"grad_norm": 3.9638140201568604,
"kl": 1.6806640625,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0673,
"reward": -0.3139965161681175,
"reward_std": 0.303245909512043,
"rewards/cosine_scaled_reward": -0.15699823945760727,
"rewards/format_reward": 0.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.544,
"grad_norm": 3.2407708168029785,
"kl": 1.89453125,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0757,
"reward": -0.3049175813794136,
"reward_std": 0.30845751613378525,
"rewards/cosine_scaled_reward": -0.1524587944149971,
"rewards/format_reward": 0.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.548,
"grad_norm": 3.1065189838409424,
"kl": 1.75390625,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0701,
"reward": -0.3369733840227127,
"reward_std": 0.30179525911808014,
"rewards/cosine_scaled_reward": -0.16848668828606606,
"rewards/format_reward": 0.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.552,
"grad_norm": 2.6867339611053467,
"kl": 2.06640625,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0826,
"reward": -0.3339100852608681,
"reward_std": 0.3043428584933281,
"rewards/cosine_scaled_reward": -0.16695504263043404,
"rewards/format_reward": 0.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.556,
"grad_norm": 3.1580567359924316,
"kl": 2.291015625,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0915,
"reward": -0.3744669333100319,
"reward_std": 0.3249610960483551,
"rewards/cosine_scaled_reward": -0.18723345920443535,
"rewards/format_reward": 0.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.56,
"grad_norm": 5.407771587371826,
"kl": 1.609375,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0645,
"reward": -0.3420454412698746,
"reward_std": 0.3148321136832237,
"rewards/cosine_scaled_reward": -0.1710227131843567,
"rewards/format_reward": 0.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.564,
"grad_norm": 4.492737770080566,
"kl": 2.275390625,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0909,
"reward": -0.36313918232917786,
"reward_std": 0.29535526037216187,
"rewards/cosine_scaled_reward": -0.18156958371400833,
"rewards/format_reward": 0.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.568,
"grad_norm": 3.0125086307525635,
"kl": 2.029296875,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0812,
"reward": -0.37769585102796555,
"reward_std": 0.31776873767375946,
"rewards/cosine_scaled_reward": -0.18884791806340218,
"rewards/format_reward": 0.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.572,
"grad_norm": 3.134265899658203,
"kl": 2.47265625,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.099,
"reward": -0.38678842037916183,
"reward_std": 0.30557621270418167,
"rewards/cosine_scaled_reward": -0.19339420646429062,
"rewards/format_reward": 0.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.576,
"grad_norm": 2.9398727416992188,
"kl": 1.404296875,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0562,
"reward": -0.3609785735607147,
"reward_std": 0.29732464998960495,
"rewards/cosine_scaled_reward": -0.18048929050564766,
"rewards/format_reward": 0.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.58,
"grad_norm": 2.3901424407958984,
"kl": 2.3291015625,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.093,
"reward": -0.38430536538362503,
"reward_std": 0.32753758877515793,
"rewards/cosine_scaled_reward": -0.19215268269181252,
"rewards/format_reward": 0.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 1526.9702453613281,
"epoch": 1.584,
"grad_norm": 3.9775447845458984,
"kl": 2.06640625,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0914,
"reward": -0.33116257190704346,
"reward_std": 0.2928163409233093,
"rewards/cosine_scaled_reward": -0.16558128595352173,
"rewards/format_reward": 0.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.588,
"grad_norm": 2.9975955486297607,
"kl": 2.318359375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0928,
"reward": -0.3710367754101753,
"reward_std": 0.3226532116532326,
"rewards/cosine_scaled_reward": -0.18551838770508766,
"rewards/format_reward": 0.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 1530.6845397949219,
"epoch": 1.592,
"grad_norm": 3.739922046661377,
"kl": 2.025390625,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0747,
"reward": -0.3954162746667862,
"reward_std": 0.3323783427476883,
"rewards/cosine_scaled_reward": -0.1977081410586834,
"rewards/format_reward": 0.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.596,
"grad_norm": 2.7063024044036865,
"kl": 1.0927734375,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0437,
"reward": -0.3006215952336788,
"reward_std": 0.27692657709121704,
"rewards/cosine_scaled_reward": -0.15031079947948456,
"rewards/format_reward": 0.0,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6,
"grad_norm": 2.469496726989746,
"kl": 1.732421875,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0693,
"reward": -0.36928267031908035,
"reward_std": 0.30984392017126083,
"rewards/cosine_scaled_reward": -0.18464133515954018,
"rewards/format_reward": 0.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 1522.3095397949219,
"epoch": 1.604,
"grad_norm": 2.855372190475464,
"kl": 1.845703125,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0636,
"reward": -0.38443852961063385,
"reward_std": 0.28470365703105927,
"rewards/cosine_scaled_reward": -0.19221926480531693,
"rewards/format_reward": 0.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.608,
"grad_norm": 3.3847217559814453,
"kl": 2.0390625,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0814,
"reward": -0.3252910152077675,
"reward_std": 0.2982725724577904,
"rewards/cosine_scaled_reward": -0.16264550015330315,
"rewards/format_reward": 0.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.612,
"grad_norm": 3.0226523876190186,
"kl": 1.81640625,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0727,
"reward": -0.3527565225958824,
"reward_std": 0.30437447875738144,
"rewards/cosine_scaled_reward": -0.1763782650232315,
"rewards/format_reward": 0.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.616,
"grad_norm": 2.866734743118286,
"kl": 1.7890625,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0716,
"reward": -0.3746185079216957,
"reward_std": 0.3078552633523941,
"rewards/cosine_scaled_reward": -0.18730924278497696,
"rewards/format_reward": 0.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.62,
"grad_norm": 3.9170870780944824,
"kl": 1.970703125,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0789,
"reward": -0.41533301770687103,
"reward_std": 0.3027655556797981,
"rewards/cosine_scaled_reward": -0.20766650885343552,
"rewards/format_reward": 0.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.624,
"grad_norm": 3.470655679702759,
"kl": 1.845703125,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0738,
"reward": -0.3191938251256943,
"reward_std": 0.28303690254688263,
"rewards/cosine_scaled_reward": -0.15959692373871803,
"rewards/format_reward": 0.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6280000000000001,
"grad_norm": 3.623340368270874,
"kl": 1.31640625,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0526,
"reward": -0.3123548626899719,
"reward_std": 0.29499682784080505,
"rewards/cosine_scaled_reward": -0.15617743134498596,
"rewards/format_reward": 0.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6320000000000001,
"grad_norm": 2.282514810562134,
"kl": 1.267578125,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0507,
"reward": -0.39642050117254257,
"reward_std": 0.311983872205019,
"rewards/cosine_scaled_reward": -0.19821025803685188,
"rewards/format_reward": 0.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6360000000000001,
"grad_norm": 2.5232083797454834,
"kl": 1.681640625,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0672,
"reward": -0.33888739347457886,
"reward_std": 0.28087718039751053,
"rewards/cosine_scaled_reward": -0.16944369673728943,
"rewards/format_reward": 0.0,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6400000000000001,
"grad_norm": 3.886439085006714,
"kl": 2.09765625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0838,
"reward": -0.38627707213163376,
"reward_std": 0.33190976083278656,
"rewards/cosine_scaled_reward": -0.19313853234052658,
"rewards/format_reward": 0.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6440000000000001,
"grad_norm": 3.090627670288086,
"kl": 2.140625,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0857,
"reward": -0.3793156296014786,
"reward_std": 0.30717378109693527,
"rewards/cosine_scaled_reward": -0.1896577998995781,
"rewards/format_reward": 0.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6480000000000001,
"grad_norm": 3.867506980895996,
"kl": 1.880859375,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0753,
"reward": -0.3565782457590103,
"reward_std": 0.3352038711309433,
"rewards/cosine_scaled_reward": -0.17828912287950516,
"rewards/format_reward": 0.0,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6520000000000001,
"grad_norm": 2.388094902038574,
"kl": 1.751953125,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0701,
"reward": -0.3393707424402237,
"reward_std": 0.3029238283634186,
"rewards/cosine_scaled_reward": -0.16968537122011185,
"rewards/format_reward": 0.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6560000000000001,
"grad_norm": 2.5263466835021973,
"kl": 1.748046875,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0698,
"reward": -0.4274343103170395,
"reward_std": 0.3449402078986168,
"rewards/cosine_scaled_reward": -0.21371715888381004,
"rewards/format_reward": 0.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6600000000000001,
"grad_norm": 2.3268003463745117,
"kl": 1.400390625,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0559,
"reward": -0.3480057269334793,
"reward_std": 0.29953421652317047,
"rewards/cosine_scaled_reward": -0.17400285601615906,
"rewards/format_reward": 0.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6640000000000001,
"grad_norm": 3.2503533363342285,
"kl": 1.9140625,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0767,
"reward": -0.36937638372182846,
"reward_std": 0.31766701489686966,
"rewards/cosine_scaled_reward": -0.18468819558620453,
"rewards/format_reward": 0.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6680000000000001,
"grad_norm": 2.9895646572113037,
"kl": 2.1796875,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0871,
"reward": -0.3985458239912987,
"reward_std": 0.33385203033685684,
"rewards/cosine_scaled_reward": -0.19927291199564934,
"rewards/format_reward": 0.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6720000000000002,
"grad_norm": 3.2457692623138428,
"kl": 1.71875,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0687,
"reward": -0.2603262776392512,
"reward_std": 0.3040950074791908,
"rewards/cosine_scaled_reward": -0.13016314181732014,
"rewards/format_reward": 0.0,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6760000000000002,
"grad_norm": 2.8391411304473877,
"kl": 1.798828125,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.072,
"reward": -0.2663672436028719,
"reward_std": 0.29912005364894867,
"rewards/cosine_scaled_reward": -0.13318362249992788,
"rewards/format_reward": 0.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6800000000000002,
"grad_norm": 3.1057238578796387,
"kl": 1.5,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.06,
"reward": -0.34882377088069916,
"reward_std": 0.3601520508527756,
"rewards/cosine_scaled_reward": -0.17441189289093018,
"rewards/format_reward": 0.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.6840000000000002,
"grad_norm": 2.243816375732422,
"kl": 1.541015625,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0616,
"reward": -0.3832622766494751,
"reward_std": 0.3413049802184105,
"rewards/cosine_scaled_reward": -0.19163113832473755,
"rewards/format_reward": 0.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.688,
"grad_norm": 3.76218581199646,
"kl": 1.880859375,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0752,
"reward": -0.3700753226876259,
"reward_std": 0.31324099004268646,
"rewards/cosine_scaled_reward": -0.18503766134381294,
"rewards/format_reward": 0.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.692,
"grad_norm": 4.034151554107666,
"kl": 1.70703125,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0682,
"reward": -0.29791881144046783,
"reward_std": 0.2801155336201191,
"rewards/cosine_scaled_reward": -0.14895940944552422,
"rewards/format_reward": 0.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.696,
"grad_norm": 3.041618824005127,
"kl": 1.81640625,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0725,
"reward": -0.32316526770591736,
"reward_std": 0.2970619350671768,
"rewards/cosine_scaled_reward": -0.16158264502882957,
"rewards/format_reward": 0.0,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.7,
"grad_norm": 4.081668376922607,
"kl": 1.4453125,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0576,
"reward": -0.3476375713944435,
"reward_std": 0.294509120285511,
"rewards/cosine_scaled_reward": -0.17381878197193146,
"rewards/format_reward": 0.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.704,
"grad_norm": 3.166949510574341,
"kl": 2.1015625,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0841,
"reward": -0.3467593193054199,
"reward_std": 0.30388573557138443,
"rewards/cosine_scaled_reward": -0.17337966337800026,
"rewards/format_reward": 0.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.708,
"grad_norm": 4.211978435516357,
"kl": 1.763671875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0705,
"reward": -0.3505774810910225,
"reward_std": 0.30420946329832077,
"rewards/cosine_scaled_reward": -0.17528874799609184,
"rewards/format_reward": 0.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.712,
"grad_norm": 4.166502952575684,
"kl": 2.158203125,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0863,
"reward": -0.361857570707798,
"reward_std": 0.30119316279888153,
"rewards/cosine_scaled_reward": -0.1809287928044796,
"rewards/format_reward": 0.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.716,
"grad_norm": 2.8889896869659424,
"kl": 1.8671875,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0745,
"reward": -0.32126056402921677,
"reward_std": 0.27691005170345306,
"rewards/cosine_scaled_reward": -0.16063029691576958,
"rewards/format_reward": 0.0,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.72,
"grad_norm": 3.3025801181793213,
"kl": 1.904296875,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0761,
"reward": -0.36847078800201416,
"reward_std": 0.3445659205317497,
"rewards/cosine_scaled_reward": -0.18423539400100708,
"rewards/format_reward": 0.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.724,
"grad_norm": 3.0440969467163086,
"kl": 1.75,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.07,
"reward": -0.36113734543323517,
"reward_std": 0.3412683606147766,
"rewards/cosine_scaled_reward": -0.18056866899132729,
"rewards/format_reward": 0.0,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 1530.952392578125,
"epoch": 1.728,
"grad_norm": 2.575627326965332,
"kl": 1.689453125,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0716,
"reward": -0.3095761463046074,
"reward_std": 0.32323335483670235,
"rewards/cosine_scaled_reward": -0.1547880806028843,
"rewards/format_reward": 0.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.732,
"grad_norm": 3.186289072036743,
"kl": 1.91015625,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0765,
"reward": -0.39015311002731323,
"reward_std": 0.3067055642604828,
"rewards/cosine_scaled_reward": -0.19507654383778572,
"rewards/format_reward": 0.0,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.736,
"grad_norm": 3.0739073753356934,
"kl": 2.369140625,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0946,
"reward": -0.29091550246812403,
"reward_std": 0.30687109380960464,
"rewards/cosine_scaled_reward": -0.14545774972066283,
"rewards/format_reward": 0.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.74,
"grad_norm": 5.0029778480529785,
"kl": 1.759765625,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0703,
"reward": -0.34431006759405136,
"reward_std": 0.27501973509788513,
"rewards/cosine_scaled_reward": -0.17215503007173538,
"rewards/format_reward": 0.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.744,
"grad_norm": 5.139548301696777,
"kl": 1.8203125,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0728,
"reward": -0.31318235397338867,
"reward_std": 0.2976163923740387,
"rewards/cosine_scaled_reward": -0.15659117698669434,
"rewards/format_reward": 0.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.748,
"grad_norm": 2.881143808364868,
"kl": 1.626953125,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.065,
"reward": -0.3547092080116272,
"reward_std": 0.28170817345380783,
"rewards/cosine_scaled_reward": -0.1773546040058136,
"rewards/format_reward": 0.0,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.752,
"grad_norm": 2.4268362522125244,
"kl": 1.9609375,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0786,
"reward": -0.3464732989668846,
"reward_std": 0.3199189677834511,
"rewards/cosine_scaled_reward": -0.173236645758152,
"rewards/format_reward": 0.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.756,
"grad_norm": 2.686417579650879,
"kl": 2.318359375,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0926,
"reward": -0.3557046577334404,
"reward_std": 0.3187018297612667,
"rewards/cosine_scaled_reward": -0.1778523214161396,
"rewards/format_reward": 0.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.76,
"grad_norm": 4.9666876792907715,
"kl": 1.4619140625,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0584,
"reward": -0.3234737552702427,
"reward_std": 0.32776766270399094,
"rewards/cosine_scaled_reward": -0.16173688508570194,
"rewards/format_reward": 0.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.764,
"grad_norm": 4.106746196746826,
"kl": 2.49609375,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0999,
"reward": -0.4192545562982559,
"reward_std": 0.33375757187604904,
"rewards/cosine_scaled_reward": -0.20962728559970856,
"rewards/format_reward": 0.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 1534.8690490722656,
"epoch": 1.768,
"grad_norm": 2.842816114425659,
"kl": 2.2578125,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0898,
"reward": -0.3652210012078285,
"reward_std": 0.3345082625746727,
"rewards/cosine_scaled_reward": -0.18261050805449486,
"rewards/format_reward": 0.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.772,
"grad_norm": 3.186333179473877,
"kl": 2.375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0947,
"reward": -0.40324729681015015,
"reward_std": 0.32466883957386017,
"rewards/cosine_scaled_reward": -0.20162366330623627,
"rewards/format_reward": 0.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.776,
"grad_norm": 4.4096360206604,
"kl": 2.99609375,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.1197,
"reward": -0.3327697291970253,
"reward_std": 0.3282741829752922,
"rewards/cosine_scaled_reward": -0.16638486459851265,
"rewards/format_reward": 0.0,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.78,
"grad_norm": 2.8214669227600098,
"kl": 1.8623046875,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0746,
"reward": -0.39069636911153793,
"reward_std": 0.33478184044361115,
"rewards/cosine_scaled_reward": -0.19534818828105927,
"rewards/format_reward": 0.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.784,
"grad_norm": 2.96333646774292,
"kl": 1.828125,
"learning_rate": 1.316005813502869e-07,
"loss": 0.073,
"reward": -0.34233053401112556,
"reward_std": 0.30314670503139496,
"rewards/cosine_scaled_reward": -0.17116525955498219,
"rewards/format_reward": 0.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.788,
"grad_norm": 2.538837432861328,
"kl": 1.615234375,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0647,
"reward": -0.2668099580332637,
"reward_std": 0.3087245300412178,
"rewards/cosine_scaled_reward": -0.1334049835568294,
"rewards/format_reward": 0.0,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.792,
"grad_norm": 6.922802925109863,
"kl": 1.9208984375,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0768,
"reward": -0.3690221831202507,
"reward_std": 0.3130299560725689,
"rewards/cosine_scaled_reward": -0.18451109528541565,
"rewards/format_reward": 0.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.796,
"grad_norm": 3.2286629676818848,
"kl": 1.990234375,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0795,
"reward": -0.32342398166656494,
"reward_std": 0.3065089136362076,
"rewards/cosine_scaled_reward": -0.16171199083328247,
"rewards/format_reward": 0.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8,
"grad_norm": 3.7653493881225586,
"kl": 1.904296875,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0763,
"reward": -0.4029879495501518,
"reward_std": 0.31490693986415863,
"rewards/cosine_scaled_reward": -0.2014939747750759,
"rewards/format_reward": 0.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.804,
"grad_norm": 3.4150803089141846,
"kl": 2.150390625,
"learning_rate": 1.260741462457165e-07,
"loss": 0.086,
"reward": -0.3429009020328522,
"reward_std": 0.29108157753944397,
"rewards/cosine_scaled_reward": -0.1714504510164261,
"rewards/format_reward": 0.0,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.808,
"grad_norm": 4.145492076873779,
"kl": 2.2421875,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0897,
"reward": -0.42198269814252853,
"reward_std": 0.3363164961338043,
"rewards/cosine_scaled_reward": -0.21099134907126427,
"rewards/format_reward": 0.0,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.812,
"grad_norm": 4.779297351837158,
"kl": 2.228515625,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0891,
"reward": -0.3492959663271904,
"reward_std": 0.2949202358722687,
"rewards/cosine_scaled_reward": -0.1746479757130146,
"rewards/format_reward": 0.0,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8159999999999998,
"grad_norm": 2.905301570892334,
"kl": 1.265625,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0506,
"reward": -0.2935212664306164,
"reward_std": 0.26374514773488045,
"rewards/cosine_scaled_reward": -0.1467606294900179,
"rewards/format_reward": 0.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8199999999999998,
"grad_norm": 2.7079851627349854,
"kl": 2.1337890625,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0853,
"reward": -0.3475092798471451,
"reward_std": 0.30007384717464447,
"rewards/cosine_scaled_reward": -0.17375463247299194,
"rewards/format_reward": 0.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8239999999999998,
"grad_norm": 2.6113271713256836,
"kl": 1.6376953125,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0654,
"reward": -0.330677293241024,
"reward_std": 0.3133997842669487,
"rewards/cosine_scaled_reward": -0.1653386428952217,
"rewards/format_reward": 0.0,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8279999999999998,
"grad_norm": 2.7393922805786133,
"kl": 1.666015625,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0668,
"reward": -0.3434924744069576,
"reward_std": 0.3196050524711609,
"rewards/cosine_scaled_reward": -0.17174622975289822,
"rewards/format_reward": 0.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8319999999999999,
"grad_norm": 4.49023962020874,
"kl": 2.34375,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0938,
"reward": -0.34944383054971695,
"reward_std": 0.3238733857870102,
"rewards/cosine_scaled_reward": -0.17472190782427788,
"rewards/format_reward": 0.0,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8359999999999999,
"grad_norm": 2.3561832904815674,
"kl": 1.4501953125,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0579,
"reward": -0.3565739244222641,
"reward_std": 0.3099294453859329,
"rewards/cosine_scaled_reward": -0.17828696221113205,
"rewards/format_reward": 0.0,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8399999999999999,
"grad_norm": 3.1239490509033203,
"kl": 1.8984375,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0759,
"reward": -0.3478566035628319,
"reward_std": 0.28794750943779945,
"rewards/cosine_scaled_reward": -0.17392829060554504,
"rewards/format_reward": 0.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8439999999999999,
"grad_norm": 2.673818826675415,
"kl": 1.740234375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0695,
"reward": -0.339593730866909,
"reward_std": 0.3045819625258446,
"rewards/cosine_scaled_reward": -0.1697968691587448,
"rewards/format_reward": 0.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8479999999999999,
"grad_norm": 3.220402479171753,
"kl": 1.626953125,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0651,
"reward": -0.36886321753263474,
"reward_std": 0.26985886320471764,
"rewards/cosine_scaled_reward": -0.18443159759044647,
"rewards/format_reward": 0.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 1530.4642944335938,
"epoch": 1.8519999999999999,
"grad_norm": 2.8002877235412598,
"kl": 2.23828125,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0941,
"reward": -0.31243710219860077,
"reward_std": 0.3104839473962784,
"rewards/cosine_scaled_reward": -0.15621854737401009,
"rewards/format_reward": 0.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8559999999999999,
"grad_norm": 3.3076934814453125,
"kl": 2.455078125,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.098,
"reward": -0.3408735916018486,
"reward_std": 0.3259742558002472,
"rewards/cosine_scaled_reward": -0.1704367958009243,
"rewards/format_reward": 0.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8599999999999999,
"grad_norm": 4.302088737487793,
"kl": 2.005859375,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0802,
"reward": -0.3594956621527672,
"reward_std": 0.32260415703058243,
"rewards/cosine_scaled_reward": -0.1797478273510933,
"rewards/format_reward": 0.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8639999999999999,
"grad_norm": 4.171574115753174,
"kl": 2.490234375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0995,
"reward": -0.3996199369430542,
"reward_std": 0.30815524607896805,
"rewards/cosine_scaled_reward": -0.1998099721968174,
"rewards/format_reward": 0.0,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8679999999999999,
"grad_norm": 3.7009289264678955,
"kl": 1.841796875,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0738,
"reward": -0.3371664360165596,
"reward_std": 0.3329595774412155,
"rewards/cosine_scaled_reward": -0.1685832180082798,
"rewards/format_reward": 0.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8719999999999999,
"grad_norm": 2.592533826828003,
"kl": 2.251953125,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0901,
"reward": -0.34844203293323517,
"reward_std": 0.322611540555954,
"rewards/cosine_scaled_reward": -0.1742210052907467,
"rewards/format_reward": 0.0,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.876,
"grad_norm": 4.633761405944824,
"kl": 1.64013671875,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0656,
"reward": -0.3193807154893875,
"reward_std": 0.26448768377304077,
"rewards/cosine_scaled_reward": -0.15969035774469376,
"rewards/format_reward": 0.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.88,
"grad_norm": 3.101719617843628,
"kl": 2.033203125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0812,
"reward": -0.3662792518734932,
"reward_std": 0.32248761504888535,
"rewards/cosine_scaled_reward": -0.1831396110355854,
"rewards/format_reward": 0.0,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.884,
"grad_norm": 2.580354690551758,
"kl": 1.607421875,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0643,
"reward": -0.34900667518377304,
"reward_std": 0.31430666893720627,
"rewards/cosine_scaled_reward": -0.17450333759188652,
"rewards/format_reward": 0.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.888,
"grad_norm": 2.7384796142578125,
"kl": 1.8046875,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0722,
"reward": -0.32778534665703773,
"reward_std": 0.3321828171610832,
"rewards/cosine_scaled_reward": -0.16389267705380917,
"rewards/format_reward": 0.0,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.892,
"grad_norm": 3.759181499481201,
"kl": 2.017578125,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0807,
"reward": -0.32047825306653976,
"reward_std": 0.28816820681095123,
"rewards/cosine_scaled_reward": -0.16023912653326988,
"rewards/format_reward": 0.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.896,
"grad_norm": 2.8909876346588135,
"kl": 1.603515625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0642,
"reward": -0.3390325605869293,
"reward_std": 0.3011201545596123,
"rewards/cosine_scaled_reward": -0.16951627284288406,
"rewards/format_reward": 0.0,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.9,
"grad_norm": 2.3281497955322266,
"kl": 2.0234375,
"learning_rate": 1.068365111445064e-07,
"loss": 0.081,
"reward": -0.36704741418361664,
"reward_std": 0.3062589168548584,
"rewards/cosine_scaled_reward": -0.18352371081709862,
"rewards/format_reward": 0.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.904,
"grad_norm": 3.56882643699646,
"kl": 2.515625,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1008,
"reward": -0.39511261135339737,
"reward_std": 0.3128170743584633,
"rewards/cosine_scaled_reward": -0.19755630940198898,
"rewards/format_reward": 0.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.908,
"grad_norm": 2.958406925201416,
"kl": 1.755859375,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0702,
"reward": -0.3462023660540581,
"reward_std": 0.322578527033329,
"rewards/cosine_scaled_reward": -0.17310118675231934,
"rewards/format_reward": 0.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.912,
"grad_norm": 3.044797897338867,
"kl": 2.375,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.095,
"reward": -0.3144143670797348,
"reward_std": 0.29915551096200943,
"rewards/cosine_scaled_reward": -0.1572071835398674,
"rewards/format_reward": 0.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.916,
"grad_norm": 4.031872272491455,
"kl": 2.640625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.1057,
"reward": -0.3763216808438301,
"reward_std": 0.3211255893111229,
"rewards/cosine_scaled_reward": -0.18816084042191505,
"rewards/format_reward": 0.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.92,
"grad_norm": 2.3054392337799072,
"kl": 1.3173828125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0528,
"reward": -0.2678487957455218,
"reward_std": 0.2627658285200596,
"rewards/cosine_scaled_reward": -0.13392440509051085,
"rewards/format_reward": 0.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.924,
"grad_norm": 3.41572642326355,
"kl": 1.353515625,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0541,
"reward": -0.35157452523708344,
"reward_std": 0.3239835053682327,
"rewards/cosine_scaled_reward": -0.17578726634383202,
"rewards/format_reward": 0.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.928,
"grad_norm": 2.691436290740967,
"kl": 2.041015625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0816,
"reward": -0.39503272622823715,
"reward_std": 0.3050593361258507,
"rewards/cosine_scaled_reward": -0.19751636311411858,
"rewards/format_reward": 0.0,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.932,
"grad_norm": 2.859536647796631,
"kl": 1.494140625,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0599,
"reward": -0.395970955491066,
"reward_std": 0.27583859115839005,
"rewards/cosine_scaled_reward": -0.197985477745533,
"rewards/format_reward": 0.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.936,
"grad_norm": 2.9280340671539307,
"kl": 1.765625,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0708,
"reward": -0.2913724035024643,
"reward_std": 0.2617946192622185,
"rewards/cosine_scaled_reward": -0.14568619430065155,
"rewards/format_reward": 0.0,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.94,
"grad_norm": 2.2830445766448975,
"kl": 1.2158203125,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0487,
"reward": -0.3095552623271942,
"reward_std": 0.292842835187912,
"rewards/cosine_scaled_reward": -0.1547776274383068,
"rewards/format_reward": 0.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.944,
"grad_norm": 3.141052007675171,
"kl": 1.3427734375,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0537,
"reward": -0.32299425452947617,
"reward_std": 0.29863065481185913,
"rewards/cosine_scaled_reward": -0.16149712353944778,
"rewards/format_reward": 0.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.948,
"grad_norm": 3.97387433052063,
"kl": 1.931640625,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0773,
"reward": -0.3765959292650223,
"reward_std": 0.3192542716860771,
"rewards/cosine_scaled_reward": -0.18829796463251114,
"rewards/format_reward": 0.0,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.952,
"grad_norm": 2.656202554702759,
"kl": 1.578125,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0631,
"reward": -0.31205643340945244,
"reward_std": 0.31670553237199783,
"rewards/cosine_scaled_reward": -0.15602822043001652,
"rewards/format_reward": 0.0,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.956,
"grad_norm": 3.296848773956299,
"kl": 1.16796875,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0468,
"reward": -0.3039631359279156,
"reward_std": 0.27847766503691673,
"rewards/cosine_scaled_reward": -0.15198157727718353,
"rewards/format_reward": 0.0,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.96,
"grad_norm": 4.522839546203613,
"kl": 1.8203125,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0728,
"reward": -0.34008362144231796,
"reward_std": 0.29262910783290863,
"rewards/cosine_scaled_reward": -0.17004182189702988,
"rewards/format_reward": 0.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.964,
"grad_norm": 2.311014175415039,
"kl": 2.244140625,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0898,
"reward": -0.34849604219198227,
"reward_std": 0.3044138178229332,
"rewards/cosine_scaled_reward": -0.17424802854657173,
"rewards/format_reward": 0.0,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.968,
"grad_norm": 2.6442465782165527,
"kl": 1.998046875,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0799,
"reward": -0.34308964014053345,
"reward_std": 0.3727850690484047,
"rewards/cosine_scaled_reward": -0.17154482379555702,
"rewards/format_reward": 0.0,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.972,
"grad_norm": 2.6985509395599365,
"kl": 1.41796875,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0567,
"reward": -0.3521110415458679,
"reward_std": 0.30227896198630333,
"rewards/cosine_scaled_reward": -0.17605552449822426,
"rewards/format_reward": 0.0,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.976,
"grad_norm": 3.240550994873047,
"kl": 1.904296875,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.076,
"reward": -0.3422994837164879,
"reward_std": 0.3251089081168175,
"rewards/cosine_scaled_reward": -0.17114974185824394,
"rewards/format_reward": 0.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.98,
"grad_norm": 4.803572177886963,
"kl": 3.177734375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.1272,
"reward": -0.3737839311361313,
"reward_std": 0.3232840970158577,
"rewards/cosine_scaled_reward": -0.18689196929335594,
"rewards/format_reward": 0.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.984,
"grad_norm": 2.532582998275757,
"kl": 1.9375,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0776,
"reward": -0.374487929046154,
"reward_std": 0.32537975162267685,
"rewards/cosine_scaled_reward": -0.187243964523077,
"rewards/format_reward": 0.0,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.988,
"grad_norm": 2.6129701137542725,
"kl": 2.2734375,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.091,
"reward": -0.3434004709124565,
"reward_std": 0.32708871364593506,
"rewards/cosine_scaled_reward": -0.17170023545622826,
"rewards/format_reward": 0.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.992,
"grad_norm": 4.10455322265625,
"kl": 1.595703125,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0638,
"reward": -0.3211556486785412,
"reward_std": 0.2905324958264828,
"rewards/cosine_scaled_reward": -0.1605778243392706,
"rewards/format_reward": 0.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.996,
"grad_norm": 2.7520267963409424,
"kl": 1.892578125,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0756,
"reward": -0.35309676826000214,
"reward_std": 0.31401190161705017,
"rewards/cosine_scaled_reward": -0.17654838413000107,
"rewards/format_reward": 0.0,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0001220703125,
"epoch": 2.0,
"grad_norm": 2.9658398628234863,
"kl": 1.7763671875,
"learning_rate": 1e-07,
"loss": 0.0711,
"reward": -0.343311108648777,
"reward_std": 0.28952478244900703,
"rewards/cosine_scaled_reward": -0.1716555580496788,
"rewards/format_reward": 0.0,
"step": 500
},
{
"epoch": 2.0,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.05846181693652478,
"train_runtime": 107214.2293,
"train_samples_per_second": 0.783,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}