{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.004, "grad_norm": 4.214743137359619, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, "reward": -0.572140134871006, "reward_std": 0.3359133452177048, "rewards/cosine_scaled_reward": -0.286070067435503, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.008, "grad_norm": 3.178635597229004, "kl": 0.0, "learning_rate": 4e-08, "loss": -0.0, "reward": -0.6001544743776321, "reward_std": 0.33404429256916046, "rewards/cosine_scaled_reward": -0.30007724463939667, "rewards/format_reward": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.012, "grad_norm": 4.78328800201416, "kl": 6.908178329467773e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.502997636795044, "reward_std": 0.3310435339808464, "rewards/cosine_scaled_reward": -0.251498818397522, "rewards/format_reward": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.016, "grad_norm": 3.9194376468658447, "kl": 6.488710641860962e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.5549568086862564, "reward_std": 0.3469474986195564, "rewards/cosine_scaled_reward": -0.2774783968925476, "rewards/format_reward": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.02, "grad_norm": 3.903712511062622, "kl": 5.97834587097168e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.5800392031669617, "reward_std": 0.35274410992860794, "rewards/cosine_scaled_reward": -0.29001960158348083, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.024, "grad_norm": 3.738009452819824, "kl": 6.499886512756348e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.5155884921550751, "reward_std": 0.37037966400384903, "rewards/cosine_scaled_reward": -0.25779424607753754, "rewards/format_reward": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.028, "grad_norm": 2.794049024581909, "kl": 5.620718002319336e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": -0.5175943374633789, "reward_std": 0.3494645953178406, "rewards/cosine_scaled_reward": -0.25879716128110886, "rewards/format_reward": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.032, "grad_norm": 2.484722852706909, "kl": 8.106231689453125e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": -0.5301882103085518, "reward_std": 0.3405821621417999, "rewards/cosine_scaled_reward": -0.2650941051542759, "rewards/format_reward": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.036, "grad_norm": 3.1448230743408203, "kl": 7.554888725280762e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.5024237409234047, "reward_std": 0.3572370335459709, "rewards/cosine_scaled_reward": -0.25121185183525085, "rewards/format_reward": 0.0, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.04, "grad_norm": 4.125906944274902, "kl": 8.666515350341797e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": -0.5732719898223877, "reward_std": 0.37079156190156937, "rewards/cosine_scaled_reward": -0.28663600236177444, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.044, "grad_norm": 4.4225945472717285, "kl": 5.561113357543945e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.5889493525028229, "reward_std": 0.3473696708679199, "rewards/cosine_scaled_reward": -0.29447468370199203, "rewards/format_reward": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.048, "grad_norm": 3.891627550125122, "kl": 7.808208465576172e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": -0.5409628972411156, "reward_std": 0.326653391122818, "rewards/cosine_scaled_reward": -0.2704814486205578, "rewards/format_reward": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.052, "grad_norm": 3.552539587020874, "kl": 7.30752944946289e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": -0.5389444306492805, "reward_std": 0.3649257719516754, "rewards/cosine_scaled_reward": -0.2694722190499306, "rewards/format_reward": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.056, "grad_norm": 2.781034231185913, "kl": 7.081031799316406e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": -0.6049635112285614, "reward_std": 0.3185788542032242, "rewards/cosine_scaled_reward": -0.3024817630648613, "rewards/format_reward": 0.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.06, "grad_norm": 3.412130355834961, "kl": 6.335973739624023e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": -0.6299380213022232, "reward_std": 0.31315718591213226, "rewards/cosine_scaled_reward": -0.3149690255522728, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.064, "grad_norm": 4.064192771911621, "kl": 0.00011527538299560547, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.5638149380683899, "reward_std": 0.3539445400238037, "rewards/cosine_scaled_reward": -0.28190746903419495, "rewards/format_reward": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.068, "grad_norm": 3.5826501846313477, "kl": 9.000301361083984e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": -0.5815131217241287, "reward_std": 0.3570765480399132, "rewards/cosine_scaled_reward": -0.29075656831264496, "rewards/format_reward": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.072, "grad_norm": 3.4398193359375, "kl": 0.00013589859008789062, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.5058030858635902, "reward_std": 0.3534058630466461, "rewards/cosine_scaled_reward": -0.2529015429317951, "rewards/format_reward": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.076, "grad_norm": 3.1647567749023438, "kl": 0.00010588765144348145, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": -0.5453799739480019, "reward_std": 0.3434706851840019, "rewards/cosine_scaled_reward": -0.27268998324871063, "rewards/format_reward": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.08, "grad_norm": 4.028233528137207, "kl": 0.00011265277862548828, "learning_rate": 4e-07, "loss": 0.0, "reward": -0.5725424438714981, "reward_std": 0.33554956316947937, "rewards/cosine_scaled_reward": -0.28627122938632965, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.084, "grad_norm": 3.0403409004211426, "kl": 0.00015485286712646484, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": -0.5395064353942871, "reward_std": 0.3414423242211342, "rewards/cosine_scaled_reward": -0.26975322514772415, "rewards/format_reward": 0.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.088, "grad_norm": 3.5831127166748047, "kl": 0.0006537437438964844, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": -0.5216317698359489, "reward_std": 0.3427959829568863, "rewards/cosine_scaled_reward": -0.2608158737421036, "rewards/format_reward": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.092, "grad_norm": 3.5175235271453857, "kl": 0.0010776519775390625, "learning_rate": 4.6e-07, "loss": 0.0, "reward": -0.5413709655404091, "reward_std": 0.32718800008296967, "rewards/cosine_scaled_reward": -0.27068548277020454, "rewards/format_reward": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.096, "grad_norm": 3.442873239517212, "kl": 0.0013303756713867188, "learning_rate": 4.8e-07, "loss": 0.0001, "reward": -0.5624926462769508, "reward_std": 0.3581688553094864, "rewards/cosine_scaled_reward": -0.2812463231384754, "rewards/format_reward": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.1, "grad_norm": 2.6114015579223633, "kl": 0.0016193389892578125, "learning_rate": 5e-07, "loss": 0.0001, "reward": -0.5309188961982727, "reward_std": 0.33032629638910294, "rewards/cosine_scaled_reward": -0.26545944809913635, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.104, "grad_norm": 4.818567752838135, "kl": 0.0026264190673828125, "learning_rate": 5.2e-07, "loss": 0.0001, "reward": -0.5884083956480026, "reward_std": 0.3386874794960022, "rewards/cosine_scaled_reward": -0.2942042052745819, "rewards/format_reward": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.108, "grad_norm": 4.078734397888184, "kl": 0.002239227294921875, "learning_rate": 5.4e-07, "loss": 0.0001, "reward": -0.6157089024782181, "reward_std": 0.3308729752898216, "rewards/cosine_scaled_reward": -0.30785445868968964, "rewards/format_reward": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.112, "grad_norm": 3.4599478244781494, "kl": 0.002338409423828125, "learning_rate": 5.6e-07, "loss": 0.0001, "reward": -0.5709060430526733, "reward_std": 0.3136204034090042, "rewards/cosine_scaled_reward": -0.28545302152633667, "rewards/format_reward": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1533.9464416503906, "epoch": 0.116, "grad_norm": 3.461718797683716, "kl": 0.003444671630859375, "learning_rate": 5.8e-07, "loss": -0.001, "reward": -0.5237472280859947, "reward_std": 0.3601622208952904, "rewards/cosine_scaled_reward": -0.26187360659241676, "rewards/format_reward": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.12, "grad_norm": 3.7205333709716797, "kl": 0.00542449951171875, "learning_rate": 6e-07, "loss": 0.0002, "reward": -0.5595864206552505, "reward_std": 0.3391585499048233, "rewards/cosine_scaled_reward": -0.2797932103276253, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.124, "grad_norm": 3.639012575149536, "kl": 0.0102996826171875, "learning_rate": 6.2e-07, "loss": 0.0004, "reward": -0.5832120478153229, "reward_std": 0.34403981268405914, "rewards/cosine_scaled_reward": -0.29160603135824203, "rewards/format_reward": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.128, "grad_norm": 3.499258041381836, "kl": 0.0159149169921875, "learning_rate": 6.4e-07, "loss": 0.0006, "reward": -0.5567401573061943, "reward_std": 0.3353060856461525, "rewards/cosine_scaled_reward": -0.27837007120251656, "rewards/format_reward": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.132, "grad_norm": 3.564453601837158, "kl": 0.0182952880859375, "learning_rate": 6.6e-07, "loss": 0.0007, "reward": -0.5521366372704506, "reward_std": 0.3413034975528717, "rewards/cosine_scaled_reward": -0.2760683260858059, "rewards/format_reward": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.136, "grad_norm": 3.567174196243286, "kl": 0.0237274169921875, "learning_rate": 6.800000000000001e-07, "loss": 0.0009, "reward": -0.5193822234869003, "reward_std": 0.35690775513648987, "rewards/cosine_scaled_reward": -0.25969111174345016, "rewards/format_reward": 0.0, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.14, "grad_norm": 2.247893810272217, "kl": 0.0149078369140625, "learning_rate": 7e-07, "loss": 0.0006, "reward": -0.5820326581597328, "reward_std": 0.3510446697473526, "rewards/cosine_scaled_reward": -0.2910163216292858, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.144, "grad_norm": 2.9316084384918213, "kl": 0.022552490234375, "learning_rate": 7.2e-07, "loss": 0.0009, "reward": -0.5632490888237953, "reward_std": 0.3500733822584152, "rewards/cosine_scaled_reward": -0.28162455186247826, "rewards/format_reward": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.148, "grad_norm": 3.5201869010925293, "kl": 0.02850341796875, "learning_rate": 7.4e-07, "loss": 0.0011, "reward": -0.5141241475939751, "reward_std": 0.3309687077999115, "rewards/cosine_scaled_reward": -0.25706208124756813, "rewards/format_reward": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.152, "grad_norm": 2.7246434688568115, "kl": 0.0296630859375, "learning_rate": 7.599999999999999e-07, "loss": 0.0012, "reward": -0.5139049887657166, "reward_std": 0.33319953083992004, "rewards/cosine_scaled_reward": -0.25695250555872917, "rewards/format_reward": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.156, "grad_norm": 2.880594491958618, "kl": 0.0258636474609375, "learning_rate": 7.799999999999999e-07, "loss": 0.001, "reward": -0.5646104216575623, "reward_std": 0.3474426791071892, "rewards/cosine_scaled_reward": -0.2823052257299423, "rewards/format_reward": 0.0, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.16, "grad_norm": 2.6734988689422607, "kl": 0.0321044921875, "learning_rate": 8e-07, "loss": 0.0013, "reward": -0.5586390048265457, "reward_std": 0.3474784344434738, "rewards/cosine_scaled_reward": -0.27931951731443405, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.164, "grad_norm": 3.1370785236358643, "kl": 0.03369140625, "learning_rate": 8.199999999999999e-07, "loss": 0.0013, "reward": -0.5609789937734604, "reward_std": 0.3450735807418823, "rewards/cosine_scaled_reward": -0.280489519238472, "rewards/format_reward": 0.0, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.168, "grad_norm": 2.5502073764801025, "kl": 0.06072998046875, "learning_rate": 8.399999999999999e-07, "loss": 0.0024, "reward": -0.5195748135447502, "reward_std": 0.34474433213472366, "rewards/cosine_scaled_reward": -0.2597874030470848, "rewards/format_reward": 0.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.172, "grad_norm": 2.1381213665008545, "kl": 0.067474365234375, "learning_rate": 8.599999999999999e-07, "loss": 0.0027, "reward": -0.5580533072352409, "reward_std": 0.32987529784440994, "rewards/cosine_scaled_reward": -0.27902666106820107, "rewards/format_reward": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.176, "grad_norm": 2.1730432510375977, "kl": 0.0958251953125, "learning_rate": 8.799999999999999e-07, "loss": 0.0038, "reward": -0.5585729256272316, "reward_std": 0.3295438587665558, "rewards/cosine_scaled_reward": -0.2792864739894867, "rewards/format_reward": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.18, "grad_norm": 1.962768316268921, "kl": 0.079345703125, "learning_rate": 9e-07, "loss": 0.0032, "reward": -0.5980347394943237, "reward_std": 0.3284436762332916, "rewards/cosine_scaled_reward": -0.29901736974716187, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.184, "grad_norm": 1.8276231288909912, "kl": 0.1153564453125, "learning_rate": 9.2e-07, "loss": 0.0046, "reward": -0.507519856095314, "reward_std": 0.33579862862825394, "rewards/cosine_scaled_reward": -0.2537599205970764, "rewards/format_reward": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.188, "grad_norm": 2.608023166656494, "kl": 0.09033203125, "learning_rate": 9.399999999999999e-07, "loss": 0.0036, "reward": -0.5289521142840385, "reward_std": 0.31808041036129, "rewards/cosine_scaled_reward": -0.26447605714201927, "rewards/format_reward": 0.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.192, "grad_norm": 1.8956966400146484, "kl": 0.09814453125, "learning_rate": 9.6e-07, "loss": 0.0039, "reward": -0.566174179315567, "reward_std": 0.311339795589447, "rewards/cosine_scaled_reward": -0.2830870673060417, "rewards/format_reward": 0.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.196, "grad_norm": 1.7705461978912354, "kl": 0.1209716796875, "learning_rate": 9.8e-07, "loss": 0.0048, "reward": -0.528024435043335, "reward_std": 0.36330366879701614, "rewards/cosine_scaled_reward": -0.26401223987340927, "rewards/format_reward": 0.0, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.2, "grad_norm": 2.1113531589508057, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": -0.4406622089445591, "reward_std": 0.3163011893630028, "rewards/cosine_scaled_reward": -0.2203311063349247, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.204, "grad_norm": 1.803585410118103, "kl": 0.1026611328125, "learning_rate": 9.999890338174275e-07, "loss": 0.0041, "reward": -0.5815826654434204, "reward_std": 0.3248438388109207, "rewards/cosine_scaled_reward": -0.2907913327217102, "rewards/format_reward": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.208, "grad_norm": 1.7076486349105835, "kl": 0.157470703125, "learning_rate": 9.999561358041868e-07, "loss": 0.0063, "reward": -0.5362438708543777, "reward_std": 0.2975444979965687, "rewards/cosine_scaled_reward": -0.26812195032835007, "rewards/format_reward": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.212, "grad_norm": 2.478224515914917, "kl": 0.144287109375, "learning_rate": 9.999013075636804e-07, "loss": 0.0058, "reward": -0.47916819900274277, "reward_std": 0.35621220618486404, "rewards/cosine_scaled_reward": -0.23958410695195198, "rewards/format_reward": 0.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.216, "grad_norm": 2.006901502609253, "kl": 0.1337890625, "learning_rate": 9.998245517681593e-07, "loss": 0.0053, "reward": -0.5450761765241623, "reward_std": 0.32576631009578705, "rewards/cosine_scaled_reward": -0.27253808826208115, "rewards/format_reward": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.22, "grad_norm": 2.2259609699249268, "kl": 0.11669921875, "learning_rate": 9.997258721585931e-07, "loss": 0.0047, "reward": -0.5271478518843651, "reward_std": 0.34441374242305756, "rewards/cosine_scaled_reward": -0.26357391849160194, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.224, "grad_norm": 2.020939588546753, "kl": 0.1907958984375, "learning_rate": 9.996052735444862e-07, "loss": 0.0076, "reward": -0.5367654263973236, "reward_std": 0.3470792919397354, "rewards/cosine_scaled_reward": -0.2683827131986618, "rewards/format_reward": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.228, "grad_norm": 1.9356812238693237, "kl": 0.158935546875, "learning_rate": 9.994627618036452e-07, "loss": 0.0064, "reward": -0.505635529756546, "reward_std": 0.3292393088340759, "rewards/cosine_scaled_reward": -0.252817764878273, "rewards/format_reward": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.232, "grad_norm": 3.2483060359954834, "kl": 0.188720703125, "learning_rate": 9.992983438818915e-07, "loss": 0.0075, "reward": -0.504822663962841, "reward_std": 0.35463710874319077, "rewards/cosine_scaled_reward": -0.2524113282561302, "rewards/format_reward": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.236, "grad_norm": 2.2256879806518555, "kl": 0.205322265625, "learning_rate": 9.991120277927223e-07, "loss": 0.0082, "reward": -0.5851711928844452, "reward_std": 0.3146449252963066, "rewards/cosine_scaled_reward": -0.2925856038928032, "rewards/format_reward": 0.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.24, "grad_norm": 2.093649387359619, "kl": 0.198486328125, "learning_rate": 9.989038226169207e-07, "loss": 0.0079, "reward": -0.45284587889909744, "reward_std": 0.34760017693042755, "rewards/cosine_scaled_reward": -0.22642293944954872, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.244, "grad_norm": 2.378591537475586, "kl": 0.24365234375, "learning_rate": 9.98673738502114e-07, "loss": 0.0097, "reward": -0.5091445297002792, "reward_std": 0.3452131450176239, "rewards/cosine_scaled_reward": -0.2545722760260105, "rewards/format_reward": 0.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.248, "grad_norm": 2.188553810119629, "kl": 0.29833984375, "learning_rate": 9.98421786662277e-07, "loss": 0.0119, "reward": -0.47440846264362335, "reward_std": 0.34785814583301544, "rewards/cosine_scaled_reward": -0.23720423132181168, "rewards/format_reward": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.252, "grad_norm": 2.6211366653442383, "kl": 0.48095703125, "learning_rate": 9.981479793771866e-07, "loss": 0.0192, "reward": -0.46701501309871674, "reward_std": 0.3275434151291847, "rewards/cosine_scaled_reward": -0.23350750654935837, "rewards/format_reward": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.256, "grad_norm": 3.608039617538452, "kl": 0.63720703125, "learning_rate": 9.97852329991824e-07, "loss": 0.0254, "reward": -0.4022144228219986, "reward_std": 0.3280187249183655, "rewards/cosine_scaled_reward": -0.2011072114109993, "rewards/format_reward": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.26, "grad_norm": 2.1589713096618652, "kl": 0.587890625, "learning_rate": 9.975348529157229e-07, "loss": 0.0236, "reward": -0.4902011975646019, "reward_std": 0.33829304575920105, "rewards/cosine_scaled_reward": -0.24510059878230095, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.264, "grad_norm": 4.391396522521973, "kl": 0.851806640625, "learning_rate": 9.971955636222684e-07, "loss": 0.034, "reward": -0.5337588116526604, "reward_std": 0.3271815627813339, "rewards/cosine_scaled_reward": -0.2668794058263302, "rewards/format_reward": 0.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.268, "grad_norm": 4.296882629394531, "kl": 0.892333984375, "learning_rate": 9.968344786479415e-07, "loss": 0.0357, "reward": -0.45740216970443726, "reward_std": 0.32497797161340714, "rewards/cosine_scaled_reward": -0.22870109230279922, "rewards/format_reward": 0.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.272, "grad_norm": 7.224793434143066, "kl": 1.29736328125, "learning_rate": 9.964516155915151e-07, "loss": 0.0519, "reward": -0.5055549815297127, "reward_std": 0.3318631425499916, "rewards/cosine_scaled_reward": -0.25277747586369514, "rewards/format_reward": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.276, "grad_norm": 6.747034072875977, "kl": 1.3232421875, "learning_rate": 9.960469931131936e-07, "loss": 0.0531, "reward": -0.4314222186803818, "reward_std": 0.31476689875125885, "rewards/cosine_scaled_reward": -0.21571110002696514, "rewards/format_reward": 0.0, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.28, "grad_norm": 5.5595808029174805, "kl": 0.8935546875, "learning_rate": 9.956206309337066e-07, "loss": 0.0358, "reward": -0.4758576303720474, "reward_std": 0.33101003617048264, "rewards/cosine_scaled_reward": -0.2379288226366043, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.284, "grad_norm": 2.4482791423797607, "kl": 0.521484375, "learning_rate": 9.951725498333448e-07, "loss": 0.0209, "reward": -0.4491276890039444, "reward_std": 0.3567735329270363, "rewards/cosine_scaled_reward": -0.2245638445019722, "rewards/format_reward": 0.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.288, "grad_norm": 3.1987600326538086, "kl": 0.6240234375, "learning_rate": 9.947027716509488e-07, "loss": 0.025, "reward": -0.43654023110866547, "reward_std": 0.3590875416994095, "rewards/cosine_scaled_reward": -0.21827011927962303, "rewards/format_reward": 0.0, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.292, "grad_norm": 4.885537147521973, "kl": 1.14599609375, "learning_rate": 9.942113192828444e-07, "loss": 0.0458, "reward": -0.5265215784311295, "reward_std": 0.3363535851240158, "rewards/cosine_scaled_reward": -0.26326077431440353, "rewards/format_reward": 0.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.296, "grad_norm": 3.4503629207611084, "kl": 1.14794921875, "learning_rate": 9.93698216681727e-07, "loss": 0.0459, "reward": -0.4836200848221779, "reward_std": 0.33076073229312897, "rewards/cosine_scaled_reward": -0.24181004241108894, "rewards/format_reward": 0.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.3, "grad_norm": 3.5954651832580566, "kl": 0.6767578125, "learning_rate": 9.931634888554935e-07, "loss": 0.027, "reward": -0.5548510551452637, "reward_std": 0.3006826713681221, "rewards/cosine_scaled_reward": -0.27742552757263184, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.304, "grad_norm": 2.27148699760437, "kl": 0.69970703125, "learning_rate": 9.926071618660237e-07, "loss": 0.028, "reward": -0.5522997975349426, "reward_std": 0.32217612117528915, "rewards/cosine_scaled_reward": -0.2761498987674713, "rewards/format_reward": 0.0, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.308, "grad_norm": 2.421114206314087, "kl": 0.65234375, "learning_rate": 9.9202926282791e-07, "loss": 0.0261, "reward": -0.5491495952010155, "reward_std": 0.33891358226537704, "rewards/cosine_scaled_reward": -0.27457480505108833, "rewards/format_reward": 0.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.312, "grad_norm": 2.296977996826172, "kl": 0.4833984375, "learning_rate": 9.91429819907136e-07, "loss": 0.0193, "reward": -0.5332002714276314, "reward_std": 0.3453890234231949, "rewards/cosine_scaled_reward": -0.2666001245379448, "rewards/format_reward": 0.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.316, "grad_norm": 2.351818084716797, "kl": 0.5048828125, "learning_rate": 9.908088623197048e-07, "loss": 0.0202, "reward": -0.4974421188235283, "reward_std": 0.36291657388210297, "rewards/cosine_scaled_reward": -0.24872105196118355, "rewards/format_reward": 0.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.32, "grad_norm": 2.808706521987915, "kl": 0.53125, "learning_rate": 9.901664203302124e-07, "loss": 0.0212, "reward": -0.5026201903820038, "reward_std": 0.30610421299934387, "rewards/cosine_scaled_reward": -0.2513100877404213, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.324, "grad_norm": 2.077920913696289, "kl": 0.68994140625, "learning_rate": 9.895025252503755e-07, "loss": 0.0276, "reward": -0.4621705636382103, "reward_std": 0.33135028183460236, "rewards/cosine_scaled_reward": -0.23108528181910515, "rewards/format_reward": 0.0, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.328, "grad_norm": 2.951878309249878, "kl": 0.6015625, "learning_rate": 9.888172094375033e-07, "loss": 0.024, "reward": -0.5148988738656044, "reward_std": 0.3465086743235588, "rewards/cosine_scaled_reward": -0.2574494294822216, "rewards/format_reward": 0.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.332, "grad_norm": 2.1016077995300293, "kl": 0.36376953125, "learning_rate": 9.881105062929221e-07, "loss": 0.0145, "reward": -0.48821673542261124, "reward_std": 0.35235296189785004, "rewards/cosine_scaled_reward": -0.24410836026072502, "rewards/format_reward": 0.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.336, "grad_norm": 2.276076555252075, "kl": 0.77734375, "learning_rate": 9.873824502603459e-07, "loss": 0.0311, "reward": -0.509700171649456, "reward_std": 0.3434828519821167, "rewards/cosine_scaled_reward": -0.2548500932753086, "rewards/format_reward": 0.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.34, "grad_norm": 1.9953871965408325, "kl": 0.45263671875, "learning_rate": 9.866330768241983e-07, "loss": 0.0181, "reward": -0.5046856477856636, "reward_std": 0.3276178315281868, "rewards/cosine_scaled_reward": -0.2523428313434124, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.344, "grad_norm": 5.694060802459717, "kl": 1.50390625, "learning_rate": 9.85862422507884e-07, "loss": 0.06, "reward": -0.5268296301364899, "reward_std": 0.3594844192266464, "rewards/cosine_scaled_reward": -0.26341481506824493, "rewards/format_reward": 0.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.348, "grad_norm": 2.5820319652557373, "kl": 0.79931640625, "learning_rate": 9.850705248720068e-07, "loss": 0.0319, "reward": -0.5030437260866165, "reward_std": 0.33297523856163025, "rewards/cosine_scaled_reward": -0.25152185559272766, "rewards/format_reward": 0.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.352, "grad_norm": 2.748469829559326, "kl": 0.8642578125, "learning_rate": 9.8425742251254e-07, "loss": 0.0346, "reward": -0.511917307972908, "reward_std": 0.3373011276125908, "rewards/cosine_scaled_reward": -0.255958653986454, "rewards/format_reward": 0.0, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.356, "grad_norm": 2.941894054412842, "kl": 1.10400390625, "learning_rate": 9.83423155058946e-07, "loss": 0.0443, "reward": -0.49383244663476944, "reward_std": 0.3190907835960388, "rewards/cosine_scaled_reward": -0.24691622331738472, "rewards/format_reward": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.36, "grad_norm": 2.5008065700531006, "kl": 0.7451171875, "learning_rate": 9.825677631722435e-07, "loss": 0.0298, "reward": -0.5015105679631233, "reward_std": 0.3283078894019127, "rewards/cosine_scaled_reward": -0.25075526908040047, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.364, "grad_norm": 2.775805950164795, "kl": 0.8662109375, "learning_rate": 9.816912885430258e-07, "loss": 0.0347, "reward": -0.49317121505737305, "reward_std": 0.3281624838709831, "rewards/cosine_scaled_reward": -0.24658560752868652, "rewards/format_reward": 0.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.368, "grad_norm": 4.057337284088135, "kl": 1.3115234375, "learning_rate": 9.807937738894303e-07, "loss": 0.0525, "reward": -0.4923912510275841, "reward_std": 0.334882490336895, "rewards/cosine_scaled_reward": -0.24619561806321144, "rewards/format_reward": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.372, "grad_norm": 3.3191726207733154, "kl": 1.416015625, "learning_rate": 9.798752629550546e-07, "loss": 0.0567, "reward": -0.4856347441673279, "reward_std": 0.3141849860548973, "rewards/cosine_scaled_reward": -0.24281736463308334, "rewards/format_reward": 0.0, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.376, "grad_norm": 38.36699676513672, "kl": 3.833984375, "learning_rate": 9.78935800506826e-07, "loss": 0.1535, "reward": -0.5001253262162209, "reward_std": 0.34716712683439255, "rewards/cosine_scaled_reward": -0.25006265565752983, "rewards/format_reward": 0.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.38, "grad_norm": 2.851670742034912, "kl": 0.93017578125, "learning_rate": 9.779754323328192e-07, "loss": 0.0372, "reward": -0.4462156817317009, "reward_std": 0.3170738257467747, "rewards/cosine_scaled_reward": -0.22310783341526985, "rewards/format_reward": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.384, "grad_norm": 1.903143048286438, "kl": 0.662109375, "learning_rate": 9.769942052400235e-07, "loss": 0.0265, "reward": -0.44278524816036224, "reward_std": 0.340934194624424, "rewards/cosine_scaled_reward": -0.22139262408018112, "rewards/format_reward": 0.0, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.388, "grad_norm": 2.613619089126587, "kl": 1.0009765625, "learning_rate": 9.759921670520634e-07, "loss": 0.04, "reward": -0.4385986104607582, "reward_std": 0.3297598212957382, "rewards/cosine_scaled_reward": -0.2192993052303791, "rewards/format_reward": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.392, "grad_norm": 2.1393027305603027, "kl": 0.84912109375, "learning_rate": 9.749693666068663e-07, "loss": 0.0339, "reward": -0.4335070326924324, "reward_std": 0.3084552064538002, "rewards/cosine_scaled_reward": -0.2167535126209259, "rewards/format_reward": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.396, "grad_norm": 10.226459503173828, "kl": 1.9765625, "learning_rate": 9.739258537542835e-07, "loss": 0.0791, "reward": -0.5120433643460274, "reward_std": 0.3308994993567467, "rewards/cosine_scaled_reward": -0.2560216821730137, "rewards/format_reward": 0.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.4, "grad_norm": 2.7042365074157715, "kl": 1.140625, "learning_rate": 9.728616793536587e-07, "loss": 0.0456, "reward": -0.5387645438313484, "reward_std": 0.32419781386852264, "rewards/cosine_scaled_reward": -0.2693822719156742, "rewards/format_reward": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.404, "grad_norm": 3.3440866470336914, "kl": 1.158203125, "learning_rate": 9.717768952713511e-07, "loss": 0.0464, "reward": -0.479642316699028, "reward_std": 0.3374394252896309, "rewards/cosine_scaled_reward": -0.2398211695253849, "rewards/format_reward": 0.0, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.408, "grad_norm": 2.1483707427978516, "kl": 0.55859375, "learning_rate": 9.706715543782064e-07, "loss": 0.0224, "reward": -0.4488200396299362, "reward_std": 0.3361233174800873, "rewards/cosine_scaled_reward": -0.2244100198149681, "rewards/format_reward": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.412, "grad_norm": 4.173567771911621, "kl": 1.900390625, "learning_rate": 9.695457105469804e-07, "loss": 0.0759, "reward": -0.4979688450694084, "reward_std": 0.35078077018260956, "rewards/cosine_scaled_reward": -0.2489844374358654, "rewards/format_reward": 0.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.416, "grad_norm": 5.119884490966797, "kl": 1.611328125, "learning_rate": 9.683994186497132e-07, "loss": 0.0644, "reward": -0.513933926820755, "reward_std": 0.3170707896351814, "rewards/cosine_scaled_reward": -0.2569669596850872, "rewards/format_reward": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.42, "grad_norm": 2.8145992755889893, "kl": 1.466796875, "learning_rate": 9.672327345550543e-07, "loss": 0.0587, "reward": -0.47269363701343536, "reward_std": 0.31501560658216476, "rewards/cosine_scaled_reward": -0.23634683340787888, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.424, "grad_norm": 2.3274426460266113, "kl": 0.59033203125, "learning_rate": 9.66045715125541e-07, "loss": 0.0236, "reward": -0.44968922436237335, "reward_std": 0.3498781695961952, "rewards/cosine_scaled_reward": -0.22484461963176727, "rewards/format_reward": 0.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.428, "grad_norm": 2.2112016677856445, "kl": 1.126953125, "learning_rate": 9.648384182148252e-07, "loss": 0.0451, "reward": -0.5002073347568512, "reward_std": 0.34406865388154984, "rewards/cosine_scaled_reward": -0.2501036673784256, "rewards/format_reward": 0.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.432, "grad_norm": 2.4664499759674072, "kl": 1.0986328125, "learning_rate": 9.636109026648554e-07, "loss": 0.0439, "reward": -0.49009862542152405, "reward_std": 0.3558028042316437, "rewards/cosine_scaled_reward": -0.24504930526018143, "rewards/format_reward": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.436, "grad_norm": 2.3740482330322266, "kl": 0.67578125, "learning_rate": 9.623632283030077e-07, "loss": 0.027, "reward": -0.4631711468100548, "reward_std": 0.34275270998477936, "rewards/cosine_scaled_reward": -0.2315855734050274, "rewards/format_reward": 0.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.44, "grad_norm": 2.9116501808166504, "kl": 1.1826171875, "learning_rate": 9.610954559391704e-07, "loss": 0.0473, "reward": -0.444116935133934, "reward_std": 0.37212707847356796, "rewards/cosine_scaled_reward": -0.2220584638416767, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.444, "grad_norm": 2.24743390083313, "kl": 0.638671875, "learning_rate": 9.598076473627796e-07, "loss": 0.0255, "reward": -0.46286992728710175, "reward_std": 0.3208693787455559, "rewards/cosine_scaled_reward": -0.23143497854471207, "rewards/format_reward": 0.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.448, "grad_norm": 3.138840913772583, "kl": 1.14404296875, "learning_rate": 9.58499865339809e-07, "loss": 0.0458, "reward": -0.4803452715277672, "reward_std": 0.3449332043528557, "rewards/cosine_scaled_reward": -0.2401726357638836, "rewards/format_reward": 0.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.452, "grad_norm": 2.7688963413238525, "kl": 0.9462890625, "learning_rate": 9.571721736097088e-07, "loss": 0.0379, "reward": -0.4440384730696678, "reward_std": 0.3389856517314911, "rewards/cosine_scaled_reward": -0.2220192365348339, "rewards/format_reward": 0.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.456, "grad_norm": 2.7298948764801025, "kl": 1.3583984375, "learning_rate": 9.55824636882301e-07, "loss": 0.0544, "reward": -0.40611616894602776, "reward_std": 0.3120696693658829, "rewards/cosine_scaled_reward": -0.20305808261036873, "rewards/format_reward": 0.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.46, "grad_norm": 2.628330945968628, "kl": 0.84521484375, "learning_rate": 9.54457320834625e-07, "loss": 0.0338, "reward": -0.41812988370656967, "reward_std": 0.33337801694869995, "rewards/cosine_scaled_reward": -0.20906493440270424, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.464, "grad_norm": 2.21708607673645, "kl": 1.125, "learning_rate": 9.530702921077358e-07, "loss": 0.0451, "reward": -0.4452592432498932, "reward_std": 0.34758392721414566, "rewards/cosine_scaled_reward": -0.2226296216249466, "rewards/format_reward": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.468, "grad_norm": 3.4151782989501953, "kl": 1.5390625, "learning_rate": 9.516636183034564e-07, "loss": 0.0617, "reward": -0.5043663010001183, "reward_std": 0.3056981936097145, "rewards/cosine_scaled_reward": -0.25218314677476883, "rewards/format_reward": 0.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.472, "grad_norm": 2.8809969425201416, "kl": 1.498046875, "learning_rate": 9.502373679810839e-07, "loss": 0.0599, "reward": -0.44362927228212357, "reward_std": 0.32765333354473114, "rewards/cosine_scaled_reward": -0.22181464359164238, "rewards/format_reward": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.476, "grad_norm": 3.092552661895752, "kl": 1.6640625, "learning_rate": 9.487916106540465e-07, "loss": 0.0665, "reward": -0.49818655103445053, "reward_std": 0.3495415672659874, "rewards/cosine_scaled_reward": -0.24909326806664467, "rewards/format_reward": 0.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.48, "grad_norm": 3.2943530082702637, "kl": 2.07421875, "learning_rate": 9.473264167865171e-07, "loss": 0.0829, "reward": -0.4802135229110718, "reward_std": 0.3453461080789566, "rewards/cosine_scaled_reward": -0.24010677635669708, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.484, "grad_norm": 2.5681769847869873, "kl": 1.505859375, "learning_rate": 9.458418577899774e-07, "loss": 0.0603, "reward": -0.5175792872905731, "reward_std": 0.35768260806798935, "rewards/cosine_scaled_reward": -0.25878964737057686, "rewards/format_reward": 0.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.488, "grad_norm": 2.9190571308135986, "kl": 1.57373046875, "learning_rate": 9.443380060197385e-07, "loss": 0.063, "reward": -0.46548449248075485, "reward_std": 0.35348332673311234, "rewards/cosine_scaled_reward": -0.23274223506450653, "rewards/format_reward": 0.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.492, "grad_norm": 2.435157537460327, "kl": 1.0654296875, "learning_rate": 9.428149347714143e-07, "loss": 0.0427, "reward": -0.4281177818775177, "reward_std": 0.3503784313797951, "rewards/cosine_scaled_reward": -0.21405889093875885, "rewards/format_reward": 0.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.496, "grad_norm": 3.1375350952148438, "kl": 1.5625, "learning_rate": 9.412727182773486e-07, "loss": 0.0624, "reward": -0.4667646959424019, "reward_std": 0.3501163199543953, "rewards/cosine_scaled_reward": -0.23338234052062035, "rewards/format_reward": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.5, "grad_norm": 2.1935606002807617, "kl": 1.3427734375, "learning_rate": 9.397114317029974e-07, "loss": 0.0537, "reward": -0.4283955693244934, "reward_std": 0.34814615547657013, "rewards/cosine_scaled_reward": -0.2141977809369564, "rewards/format_reward": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.504, "grad_norm": 2.727754592895508, "kl": 1.35546875, "learning_rate": 9.381311511432658e-07, "loss": 0.0543, "reward": -0.4584430381655693, "reward_std": 0.3318573832511902, "rewards/cosine_scaled_reward": -0.22922151535749435, "rewards/format_reward": 0.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.508, "grad_norm": 2.9863674640655518, "kl": 1.509765625, "learning_rate": 9.36531953618799e-07, "loss": 0.0603, "reward": -0.4794049710035324, "reward_std": 0.3224741891026497, "rewards/cosine_scaled_reward": -0.2397024855017662, "rewards/format_reward": 0.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.512, "grad_norm": 3.0583863258361816, "kl": 1.5751953125, "learning_rate": 9.34913917072228e-07, "loss": 0.0631, "reward": -0.3896471783518791, "reward_std": 0.32155635207891464, "rewards/cosine_scaled_reward": -0.19482359662652016, "rewards/format_reward": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.516, "grad_norm": 11.888484001159668, "kl": 2.1806640625, "learning_rate": 9.332771203643714e-07, "loss": 0.0874, "reward": -0.46486661583185196, "reward_std": 0.34625906497240067, "rewards/cosine_scaled_reward": -0.23243330791592598, "rewards/format_reward": 0.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.52, "grad_norm": 3.14744234085083, "kl": 1.1103515625, "learning_rate": 9.316216432703916e-07, "loss": 0.0445, "reward": -0.4691261351108551, "reward_std": 0.3357261121273041, "rewards/cosine_scaled_reward": -0.23456306010484695, "rewards/format_reward": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.524, "grad_norm": 2.6933717727661133, "kl": 1.76171875, "learning_rate": 9.299475664759068e-07, "loss": 0.0705, "reward": -0.5458347946405411, "reward_std": 0.3296028599143028, "rewards/cosine_scaled_reward": -0.27291740477085114, "rewards/format_reward": 0.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.528, "grad_norm": 2.695984363555908, "kl": 1.2666015625, "learning_rate": 9.282549715730579e-07, "loss": 0.0506, "reward": -0.43337278813123703, "reward_std": 0.3223467916250229, "rewards/cosine_scaled_reward": -0.2166864052414894, "rewards/format_reward": 0.0, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.532, "grad_norm": 2.1844236850738525, "kl": 1.072265625, "learning_rate": 9.265439410565328e-07, "loss": 0.0429, "reward": -0.47815513610839844, "reward_std": 0.33408980816602707, "rewards/cosine_scaled_reward": -0.23907756060361862, "rewards/format_reward": 0.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.536, "grad_norm": 2.6240434646606445, "kl": 0.998046875, "learning_rate": 9.248145583195447e-07, "loss": 0.0399, "reward": -0.3596036769449711, "reward_std": 0.3202332779765129, "rewards/cosine_scaled_reward": -0.17980184871703386, "rewards/format_reward": 0.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.54, "grad_norm": 2.413489580154419, "kl": 1.515625, "learning_rate": 9.230669076497687e-07, "loss": 0.0607, "reward": -0.3980662524700165, "reward_std": 0.3146558068692684, "rewards/cosine_scaled_reward": -0.19903312623500824, "rewards/format_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.544, "grad_norm": 2.5466983318328857, "kl": 1.421875, "learning_rate": 9.213010742252327e-07, "loss": 0.0568, "reward": -0.4567502960562706, "reward_std": 0.36093486845493317, "rewards/cosine_scaled_reward": -0.2283751629292965, "rewards/format_reward": 0.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.548, "grad_norm": 2.670454263687134, "kl": 1.63671875, "learning_rate": 9.195171441101668e-07, "loss": 0.0655, "reward": -0.48265285044908524, "reward_std": 0.33601198345422745, "rewards/cosine_scaled_reward": -0.24132642522454262, "rewards/format_reward": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.552, "grad_norm": 3.4489877223968506, "kl": 1.39453125, "learning_rate": 9.177152042508077e-07, "loss": 0.0558, "reward": -0.40766458958387375, "reward_std": 0.34357643127441406, "rewards/cosine_scaled_reward": -0.20383229106664658, "rewards/format_reward": 0.0, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.556, "grad_norm": 2.18890118598938, "kl": 1.30859375, "learning_rate": 9.158953424711624e-07, "loss": 0.0523, "reward": -0.4143947809934616, "reward_std": 0.323918879032135, "rewards/cosine_scaled_reward": -0.2071974016726017, "rewards/format_reward": 0.0, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.56, "grad_norm": 2.5627028942108154, "kl": 1.34423828125, "learning_rate": 9.140576474687263e-07, "loss": 0.0538, "reward": -0.4485241174697876, "reward_std": 0.3278198316693306, "rewards/cosine_scaled_reward": -0.2242620587348938, "rewards/format_reward": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.564, "grad_norm": 2.086371660232544, "kl": 1.2802734375, "learning_rate": 9.122022088101613e-07, "loss": 0.0512, "reward": -0.32855524495244026, "reward_std": 0.33061159402132034, "rewards/cosine_scaled_reward": -0.16427762433886528, "rewards/format_reward": 0.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.568, "grad_norm": 2.45231556892395, "kl": 1.580078125, "learning_rate": 9.103291169269299e-07, "loss": 0.0632, "reward": -0.4703398421406746, "reward_std": 0.2972045987844467, "rewards/cosine_scaled_reward": -0.2351699210703373, "rewards/format_reward": 0.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.572, "grad_norm": 2.864070415496826, "kl": 1.984375, "learning_rate": 9.084384631108882e-07, "loss": 0.0794, "reward": -0.41980744898319244, "reward_std": 0.34404993802309036, "rewards/cosine_scaled_reward": -0.20990372076630592, "rewards/format_reward": 0.0, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.576, "grad_norm": 2.412257194519043, "kl": 1.544921875, "learning_rate": 9.065303395098358e-07, "loss": 0.0618, "reward": -0.43455804139375687, "reward_std": 0.32647445797920227, "rewards/cosine_scaled_reward": -0.21727901697158813, "rewards/format_reward": 0.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.58, "grad_norm": 2.952892780303955, "kl": 2.0595703125, "learning_rate": 9.046048391230247e-07, "loss": 0.0824, "reward": -0.4728480279445648, "reward_std": 0.33887017518281937, "rewards/cosine_scaled_reward": -0.2364240102469921, "rewards/format_reward": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.584, "grad_norm": 2.3727328777313232, "kl": 1.7255859375, "learning_rate": 9.026620557966279e-07, "loss": 0.0692, "reward": -0.42372531443834305, "reward_std": 0.3417205289006233, "rewards/cosine_scaled_reward": -0.21186266466975212, "rewards/format_reward": 0.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.588, "grad_norm": 2.953756809234619, "kl": 2.353515625, "learning_rate": 9.007020842191634e-07, "loss": 0.0943, "reward": -0.43578075617551804, "reward_std": 0.34062809497117996, "rewards/cosine_scaled_reward": -0.21789037808775902, "rewards/format_reward": 0.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.592, "grad_norm": 2.5953478813171387, "kl": 1.38671875, "learning_rate": 8.987250199168808e-07, "loss": 0.0555, "reward": -0.4190576896071434, "reward_std": 0.34895560145378113, "rewards/cosine_scaled_reward": -0.2095288448035717, "rewards/format_reward": 0.0, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.596, "grad_norm": 2.4279496669769287, "kl": 1.62890625, "learning_rate": 8.967309592491052e-07, "loss": 0.0651, "reward": -0.4394699037075043, "reward_std": 0.3207908198237419, "rewards/cosine_scaled_reward": -0.21973494067788124, "rewards/format_reward": 0.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.6, "grad_norm": 2.974292516708374, "kl": 1.892578125, "learning_rate": 8.9471999940354e-07, "loss": 0.0757, "reward": -0.4797021597623825, "reward_std": 0.32065775990486145, "rewards/cosine_scaled_reward": -0.23985107988119125, "rewards/format_reward": 0.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.604, "grad_norm": 2.51299786567688, "kl": 0.87890625, "learning_rate": 8.926922383915315e-07, "loss": 0.0351, "reward": -0.4108778163790703, "reward_std": 0.326105996966362, "rewards/cosine_scaled_reward": -0.20543890818953514, "rewards/format_reward": 0.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.608, "grad_norm": 2.723388195037842, "kl": 1.2294921875, "learning_rate": 8.906477750432903e-07, "loss": 0.0492, "reward": -0.4178111329674721, "reward_std": 0.32895463705062866, "rewards/cosine_scaled_reward": -0.20890555530786514, "rewards/format_reward": 0.0, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.612, "grad_norm": 2.4097025394439697, "kl": 1.650390625, "learning_rate": 8.88586709003076e-07, "loss": 0.0659, "reward": -0.4825671687722206, "reward_std": 0.33990373462438583, "rewards/cosine_scaled_reward": -0.2412835843861103, "rewards/format_reward": 0.0, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.616, "grad_norm": 2.114370107650757, "kl": 1.390625, "learning_rate": 8.865091407243394e-07, "loss": 0.0556, "reward": -0.42671380192041397, "reward_std": 0.32950445264577866, "rewards/cosine_scaled_reward": -0.21335690841078758, "rewards/format_reward": 0.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.62, "grad_norm": 3.1770823001861572, "kl": 1.4287109375, "learning_rate": 8.844151714648274e-07, "loss": 0.0572, "reward": -0.4250905141234398, "reward_std": 0.3110942989587784, "rewards/cosine_scaled_reward": -0.2125452570617199, "rewards/format_reward": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.624, "grad_norm": 2.6063926219940186, "kl": 1.796875, "learning_rate": 8.823049032816478e-07, "loss": 0.0719, "reward": -0.4206129387021065, "reward_std": 0.33140094578266144, "rewards/cosine_scaled_reward": -0.21030646935105324, "rewards/format_reward": 0.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.628, "grad_norm": 2.482637643814087, "kl": 1.525390625, "learning_rate": 8.801784390262943e-07, "loss": 0.061, "reward": -0.36781868524849415, "reward_std": 0.3281563073396683, "rewards/cosine_scaled_reward": -0.18390934821218252, "rewards/format_reward": 0.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.632, "grad_norm": 2.7100956439971924, "kl": 1.7861328125, "learning_rate": 8.780358823396352e-07, "loss": 0.0715, "reward": -0.3854188397526741, "reward_std": 0.31897617131471634, "rewards/cosine_scaled_reward": -0.19270941987633705, "rewards/format_reward": 0.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.636, "grad_norm": 2.3493990898132324, "kl": 1.859375, "learning_rate": 8.758773376468604e-07, "loss": 0.0746, "reward": -0.41636481136083603, "reward_std": 0.3308830112218857, "rewards/cosine_scaled_reward": -0.20818240568041801, "rewards/format_reward": 0.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.64, "grad_norm": 2.429762840270996, "kl": 1.78125, "learning_rate": 8.737029101523929e-07, "loss": 0.0714, "reward": -0.44961177557706833, "reward_std": 0.3425107002258301, "rewards/cosine_scaled_reward": -0.22480589523911476, "rewards/format_reward": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.644, "grad_norm": 2.6372933387756348, "kl": 1.6474609375, "learning_rate": 8.715127058347614e-07, "loss": 0.066, "reward": -0.4204000309109688, "reward_std": 0.3256704956293106, "rewards/cosine_scaled_reward": -0.2102000191807747, "rewards/format_reward": 0.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.648, "grad_norm": 2.2505483627319336, "kl": 1.576171875, "learning_rate": 8.693068314414344e-07, "loss": 0.063, "reward": -0.4363863915205002, "reward_std": 0.3367513567209244, "rewards/cosine_scaled_reward": -0.2181931994855404, "rewards/format_reward": 0.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.652, "grad_norm": 2.781273603439331, "kl": 1.4375, "learning_rate": 8.670853944836176e-07, "loss": 0.0576, "reward": -0.44805190712213516, "reward_std": 0.3117773234844208, "rewards/cosine_scaled_reward": -0.22402595356106758, "rewards/format_reward": 0.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.656, "grad_norm": 2.573030710220337, "kl": 1.21435546875, "learning_rate": 8.648485032310144e-07, "loss": 0.0487, "reward": -0.40324684232473373, "reward_std": 0.3176472932100296, "rewards/cosine_scaled_reward": -0.20162343233823776, "rewards/format_reward": 0.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.66, "grad_norm": 4.171741485595703, "kl": 2.3125, "learning_rate": 8.625962667065487e-07, "loss": 0.0925, "reward": -0.4968671426177025, "reward_std": 0.3204089626669884, "rewards/cosine_scaled_reward": -0.24843357503414154, "rewards/format_reward": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1528.702392578125, "epoch": 0.664, "grad_norm": 2.1756961345672607, "kl": 1.7578125, "learning_rate": 8.603287946810513e-07, "loss": 0.0706, "reward": -0.4272613450884819, "reward_std": 0.32390115410089493, "rewards/cosine_scaled_reward": -0.21363067999482155, "rewards/format_reward": 0.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.668, "grad_norm": 2.2742207050323486, "kl": 1.912109375, "learning_rate": 8.580461976679099e-07, "loss": 0.0763, "reward": -0.3418873958289623, "reward_std": 0.29924022778868675, "rewards/cosine_scaled_reward": -0.17094369884580374, "rewards/format_reward": 0.0, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.672, "grad_norm": 2.1837146282196045, "kl": 1.3330078125, "learning_rate": 8.557485869176825e-07, "loss": 0.0533, "reward": -0.4050525277853012, "reward_std": 0.3251590058207512, "rewards/cosine_scaled_reward": -0.2025262601673603, "rewards/format_reward": 0.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.676, "grad_norm": 2.1009020805358887, "kl": 1.9326171875, "learning_rate": 8.534360744126753e-07, "loss": 0.0774, "reward": -0.4387947544455528, "reward_std": 0.3307826817035675, "rewards/cosine_scaled_reward": -0.21939736977219582, "rewards/format_reward": 0.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.68, "grad_norm": 2.515617609024048, "kl": 1.884765625, "learning_rate": 8.511087728614862e-07, "loss": 0.0754, "reward": -0.41566915810108185, "reward_std": 0.34893494844436646, "rewards/cosine_scaled_reward": -0.20783457532525063, "rewards/format_reward": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.684, "grad_norm": 2.3045356273651123, "kl": 1.5078125, "learning_rate": 8.487667956935087e-07, "loss": 0.0604, "reward": -0.3871946483850479, "reward_std": 0.3363000229001045, "rewards/cosine_scaled_reward": -0.19359732419252396, "rewards/format_reward": 0.0, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.688, "grad_norm": 2.1517364978790283, "kl": 1.4169921875, "learning_rate": 8.464102570534061e-07, "loss": 0.0567, "reward": -0.41495678573846817, "reward_std": 0.33959241211414337, "rewards/cosine_scaled_reward": -0.20747840031981468, "rewards/format_reward": 0.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.692, "grad_norm": 2.4767415523529053, "kl": 1.5654296875, "learning_rate": 8.440392717955475e-07, "loss": 0.0626, "reward": -0.3259017579257488, "reward_std": 0.3448467329144478, "rewards/cosine_scaled_reward": -0.16295087756589055, "rewards/format_reward": 0.0, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.696, "grad_norm": 2.1803934574127197, "kl": 1.5986328125, "learning_rate": 8.416539554784089e-07, "loss": 0.0639, "reward": -0.45371130108833313, "reward_std": 0.3770594820380211, "rewards/cosine_scaled_reward": -0.22685565054416656, "rewards/format_reward": 0.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.7, "grad_norm": 2.146838426589966, "kl": 1.3212890625, "learning_rate": 8.392544243589427e-07, "loss": 0.053, "reward": -0.39382801204919815, "reward_std": 0.3155653551220894, "rewards/cosine_scaled_reward": -0.19691400602459908, "rewards/format_reward": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.704, "grad_norm": 2.3939132690429688, "kl": 1.498046875, "learning_rate": 8.368407953869103e-07, "loss": 0.06, "reward": -0.397233285009861, "reward_std": 0.3429732918739319, "rewards/cosine_scaled_reward": -0.1986166313290596, "rewards/format_reward": 0.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.708, "grad_norm": 2.2279624938964844, "kl": 1.3759765625, "learning_rate": 8.344131861991828e-07, "loss": 0.0551, "reward": -0.41151023656129837, "reward_std": 0.3277590796351433, "rewards/cosine_scaled_reward": -0.2057551108300686, "rewards/format_reward": 0.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.712, "grad_norm": 2.5055384635925293, "kl": 1.341796875, "learning_rate": 8.319717151140072e-07, "loss": 0.0537, "reward": -0.4148360714316368, "reward_std": 0.3054031655192375, "rewards/cosine_scaled_reward": -0.2074180319905281, "rewards/format_reward": 0.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.716, "grad_norm": 2.605672836303711, "kl": 2.421875, "learning_rate": 8.295165011252396e-07, "loss": 0.0969, "reward": -0.49764253944158554, "reward_std": 0.34468474239110947, "rewards/cosine_scaled_reward": -0.24882125481963158, "rewards/format_reward": 0.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.72, "grad_norm": 1.8612443208694458, "kl": 1.958984375, "learning_rate": 8.270476638965461e-07, "loss": 0.0784, "reward": -0.41104499250650406, "reward_std": 0.32857123762369156, "rewards/cosine_scaled_reward": -0.20552249625325203, "rewards/format_reward": 0.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.724, "grad_norm": 2.20760178565979, "kl": 1.4267578125, "learning_rate": 8.245653237555705e-07, "loss": 0.0571, "reward": -0.4070161208510399, "reward_std": 0.29896606504917145, "rewards/cosine_scaled_reward": -0.20350806042551994, "rewards/format_reward": 0.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.728, "grad_norm": 2.527832269668579, "kl": 1.3251953125, "learning_rate": 8.220696016880687e-07, "loss": 0.053, "reward": -0.40310006588697433, "reward_std": 0.33485615253448486, "rewards/cosine_scaled_reward": -0.20155002549290657, "rewards/format_reward": 0.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.732, "grad_norm": 2.0901362895965576, "kl": 1.25, "learning_rate": 8.195606193320136e-07, "loss": 0.0499, "reward": -0.39147457480430603, "reward_std": 0.3105906918644905, "rewards/cosine_scaled_reward": -0.19573728740215302, "rewards/format_reward": 0.0, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.736, "grad_norm": 2.0712454319000244, "kl": 1.3271484375, "learning_rate": 8.170384989716657e-07, "loss": 0.053, "reward": -0.36338385939598083, "reward_std": 0.29373297840356827, "rewards/cosine_scaled_reward": -0.18169192969799042, "rewards/format_reward": 0.0, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.74, "grad_norm": 4.567477226257324, "kl": 2.91015625, "learning_rate": 8.145033635316128e-07, "loss": 0.1167, "reward": -0.46033478528261185, "reward_std": 0.309500552713871, "rewards/cosine_scaled_reward": -0.23016740009188652, "rewards/format_reward": 0.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.744, "grad_norm": 2.8025710582733154, "kl": 1.982421875, "learning_rate": 8.119553365707802e-07, "loss": 0.0793, "reward": -0.3399934060871601, "reward_std": 0.3289627507328987, "rewards/cosine_scaled_reward": -0.16999670304358006, "rewards/format_reward": 0.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.748, "grad_norm": 2.41241192817688, "kl": 1.6513671875, "learning_rate": 8.093945422764069e-07, "loss": 0.0663, "reward": -0.4002522900700569, "reward_std": 0.3234091103076935, "rewards/cosine_scaled_reward": -0.20012613758444786, "rewards/format_reward": 0.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.752, "grad_norm": 3.6371164321899414, "kl": 2.470703125, "learning_rate": 8.068211054579943e-07, "loss": 0.0988, "reward": -0.44175921380519867, "reward_std": 0.33701298385858536, "rewards/cosine_scaled_reward": -0.22087960690259933, "rewards/format_reward": 0.0, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.756, "grad_norm": 2.704362154006958, "kl": 1.71875, "learning_rate": 8.04235151541222e-07, "loss": 0.0686, "reward": -0.3934633806347847, "reward_std": 0.31845808029174805, "rewards/cosine_scaled_reward": -0.19673169776797295, "rewards/format_reward": 0.0, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.76, "grad_norm": 2.5518999099731445, "kl": 1.865234375, "learning_rate": 8.01636806561836e-07, "loss": 0.0746, "reward": -0.48456476628780365, "reward_std": 0.3398968055844307, "rewards/cosine_scaled_reward": -0.24228239431977272, "rewards/format_reward": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.764, "grad_norm": 4.733001232147217, "kl": 2.0537109375, "learning_rate": 7.990261971595048e-07, "loss": 0.0822, "reward": -0.44671063870191574, "reward_std": 0.32652025669813156, "rewards/cosine_scaled_reward": -0.22335530444979668, "rewards/format_reward": 0.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.768, "grad_norm": 2.217525005340576, "kl": 1.72265625, "learning_rate": 7.964034505716476e-07, "loss": 0.0689, "reward": -0.38292936980724335, "reward_std": 0.3729139119386673, "rewards/cosine_scaled_reward": -0.19146469235420227, "rewards/format_reward": 0.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.772, "grad_norm": 2.3045313358306885, "kl": 1.0576171875, "learning_rate": 7.93768694627233e-07, "loss": 0.0423, "reward": -0.36335285753011703, "reward_std": 0.3274284452199936, "rewards/cosine_scaled_reward": -0.18167642876505852, "rewards/format_reward": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.776, "grad_norm": 2.220212936401367, "kl": 1.974609375, "learning_rate": 7.911220577405484e-07, "loss": 0.0791, "reward": -0.41132358461618423, "reward_std": 0.33213579654693604, "rewards/cosine_scaled_reward": -0.20566179975867271, "rewards/format_reward": 0.0, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.78, "grad_norm": 2.872774124145508, "kl": 2.04296875, "learning_rate": 7.884636689049422e-07, "loss": 0.0819, "reward": -0.41410720348358154, "reward_std": 0.3132774606347084, "rewards/cosine_scaled_reward": -0.20705359801650047, "rewards/format_reward": 0.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.784, "grad_norm": 3.354735851287842, "kl": 1.2236328125, "learning_rate": 7.857936576865356e-07, "loss": 0.0489, "reward": -0.34651997685432434, "reward_std": 0.27611755579710007, "rewards/cosine_scaled_reward": -0.17325998842716217, "rewards/format_reward": 0.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.788, "grad_norm": 2.019547939300537, "kl": 1.03515625, "learning_rate": 7.831121542179086e-07, "loss": 0.0414, "reward": -0.36961859464645386, "reward_std": 0.3042915388941765, "rewards/cosine_scaled_reward": -0.18480929359793663, "rewards/format_reward": 0.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.792, "grad_norm": 2.245211601257324, "kl": 1.408203125, "learning_rate": 7.804192891917571e-07, "loss": 0.0564, "reward": -0.3812807723879814, "reward_std": 0.30970512330532074, "rewards/cosine_scaled_reward": -0.190640389919281, "rewards/format_reward": 0.0, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.796, "grad_norm": 2.0456931591033936, "kl": 1.673828125, "learning_rate": 7.777151938545235e-07, "loss": 0.067, "reward": -0.38433101773262024, "reward_std": 0.3408072590827942, "rewards/cosine_scaled_reward": -0.19216550886631012, "rewards/format_reward": 0.0, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.8, "grad_norm": 6.253657817840576, "kl": 1.48876953125, "learning_rate": 7.75e-07, "loss": 0.0595, "reward": -0.3863793611526489, "reward_std": 0.3155966252088547, "rewards/cosine_scaled_reward": -0.19318969175219536, "rewards/format_reward": 0.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.804, "grad_norm": 2.2331368923187256, "kl": 1.96484375, "learning_rate": 7.72273839962904e-07, "loss": 0.0786, "reward": -0.41171175986528397, "reward_std": 0.34651194512844086, "rewards/cosine_scaled_reward": -0.20585588365793228, "rewards/format_reward": 0.0, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.808, "grad_norm": 2.1702663898468018, "kl": 1.296875, "learning_rate": 7.695368466124296e-07, "loss": 0.0519, "reward": -0.38244833052158356, "reward_std": 0.34267907589673996, "rewards/cosine_scaled_reward": -0.19122417271137238, "rewards/format_reward": 0.0, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.812, "grad_norm": 2.0549793243408203, "kl": 1.4345703125, "learning_rate": 7.667891533457718e-07, "loss": 0.0573, "reward": -0.4125688225030899, "reward_std": 0.33167801052331924, "rewards/cosine_scaled_reward": -0.20628441870212555, "rewards/format_reward": 0.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.816, "grad_norm": 2.7793009281158447, "kl": 1.958984375, "learning_rate": 7.640308940816239e-07, "loss": 0.0784, "reward": -0.45417842268943787, "reward_std": 0.3453121930360794, "rewards/cosine_scaled_reward": -0.22708921134471893, "rewards/format_reward": 0.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.82, "grad_norm": 8.324098587036133, "kl": 2.23388671875, "learning_rate": 7.612622032536507e-07, "loss": 0.0895, "reward": -0.3973395526409149, "reward_std": 0.32590440660715103, "rewards/cosine_scaled_reward": -0.19866977632045746, "rewards/format_reward": 0.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.824, "grad_norm": 2.22940993309021, "kl": 1.51171875, "learning_rate": 7.584832158039378e-07, "loss": 0.0605, "reward": -0.4044779762625694, "reward_std": 0.33285098522901535, "rewards/cosine_scaled_reward": -0.2022389993071556, "rewards/format_reward": 0.0, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.828, "grad_norm": 2.824735164642334, "kl": 1.310546875, "learning_rate": 7.556940671764124e-07, "loss": 0.0524, "reward": -0.4486440494656563, "reward_std": 0.33797865360975266, "rewards/cosine_scaled_reward": -0.22432202845811844, "rewards/format_reward": 0.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.832, "grad_norm": 2.2558631896972656, "kl": 1.1962890625, "learning_rate": 7.528948933102438e-07, "loss": 0.0478, "reward": -0.40251782536506653, "reward_std": 0.30128662288188934, "rewards/cosine_scaled_reward": -0.20125891268253326, "rewards/format_reward": 0.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.836, "grad_norm": 2.7602171897888184, "kl": 0.9951171875, "learning_rate": 7.500858306332172e-07, "loss": 0.0398, "reward": -0.31514767929911613, "reward_std": 0.3020384646952152, "rewards/cosine_scaled_reward": -0.15757383964955807, "rewards/format_reward": 0.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.84, "grad_norm": 2.6217448711395264, "kl": 1.71484375, "learning_rate": 7.472670160550848e-07, "loss": 0.0684, "reward": -0.3670196682214737, "reward_std": 0.31881674379110336, "rewards/cosine_scaled_reward": -0.18350983038544655, "rewards/format_reward": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.844, "grad_norm": 2.0915112495422363, "kl": 1.2841796875, "learning_rate": 7.444385869608921e-07, "loss": 0.0514, "reward": -0.4177168160676956, "reward_std": 0.3398260995745659, "rewards/cosine_scaled_reward": -0.2088584043085575, "rewards/format_reward": 0.0, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.848, "grad_norm": 1.7296172380447388, "kl": 1.2724609375, "learning_rate": 7.416006812042827e-07, "loss": 0.051, "reward": -0.41255099326372147, "reward_std": 0.33872970938682556, "rewards/cosine_scaled_reward": -0.20627548918128014, "rewards/format_reward": 0.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.852, "grad_norm": 2.1323206424713135, "kl": 1.16162109375, "learning_rate": 7.387534371007797e-07, "loss": 0.0466, "reward": -0.2759926188737154, "reward_std": 0.30077088996768, "rewards/cosine_scaled_reward": -0.1379963019862771, "rewards/format_reward": 0.0, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.856, "grad_norm": 2.3771109580993652, "kl": 1.556640625, "learning_rate": 7.358969934210438e-07, "loss": 0.0622, "reward": -0.3614875078201294, "reward_std": 0.32025381922721863, "rewards/cosine_scaled_reward": -0.1807437539100647, "rewards/format_reward": 0.0, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.86, "grad_norm": 2.940969467163086, "kl": 1.8828125, "learning_rate": 7.330314893841101e-07, "loss": 0.0754, "reward": -0.29097072361037135, "reward_std": 0.28063248097896576, "rewards/cosine_scaled_reward": -0.14548537082737312, "rewards/format_reward": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.864, "grad_norm": 1.9293019771575928, "kl": 1.62890625, "learning_rate": 7.301570646506027e-07, "loss": 0.0652, "reward": -0.4154031127691269, "reward_std": 0.34460632503032684, "rewards/cosine_scaled_reward": -0.20770153775811195, "rewards/format_reward": 0.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.868, "grad_norm": 2.745267391204834, "kl": 2.0888671875, "learning_rate": 7.27273859315928e-07, "loss": 0.0835, "reward": -0.4031589925289154, "reward_std": 0.31946661323308945, "rewards/cosine_scaled_reward": -0.2015794888138771, "rewards/format_reward": 0.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.872, "grad_norm": 2.873622179031372, "kl": 1.5078125, "learning_rate": 7.243820139034464e-07, "loss": 0.0604, "reward": -0.4128880575299263, "reward_std": 0.3311196342110634, "rewards/cosine_scaled_reward": -0.20644402503967285, "rewards/format_reward": 0.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.876, "grad_norm": 2.7079639434814453, "kl": 1.2607421875, "learning_rate": 7.214816693576234e-07, "loss": 0.0505, "reward": -0.3099018558859825, "reward_std": 0.2861209958791733, "rewards/cosine_scaled_reward": -0.15495092794299126, "rewards/format_reward": 0.0, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.88, "grad_norm": 1.9640864133834839, "kl": 1.234375, "learning_rate": 7.185729670371604e-07, "loss": 0.0493, "reward": -0.40535254031419754, "reward_std": 0.2874290943145752, "rewards/cosine_scaled_reward": -0.20267625898122787, "rewards/format_reward": 0.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.884, "grad_norm": 2.130681037902832, "kl": 1.486328125, "learning_rate": 7.156560487081051e-07, "loss": 0.0595, "reward": -0.3594564124941826, "reward_std": 0.3218042775988579, "rewards/cosine_scaled_reward": -0.1797281987965107, "rewards/format_reward": 0.0, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.888, "grad_norm": 2.1852834224700928, "kl": 1.48046875, "learning_rate": 7.127310565369415e-07, "loss": 0.0591, "reward": -0.331524558365345, "reward_std": 0.28531621396541595, "rewards/cosine_scaled_reward": -0.1657622903585434, "rewards/format_reward": 0.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.892, "grad_norm": 2.3731930255889893, "kl": 1.734375, "learning_rate": 7.097981330836616e-07, "loss": 0.0693, "reward": -0.38006093353033066, "reward_std": 0.3292882591485977, "rewards/cosine_scaled_reward": -0.19003047049045563, "rewards/format_reward": 0.0, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.896, "grad_norm": 2.3246822357177734, "kl": 1.0390625, "learning_rate": 7.068574212948169e-07, "loss": 0.0416, "reward": -0.3990800455212593, "reward_std": 0.3413678854703903, "rewards/cosine_scaled_reward": -0.19954002648591995, "rewards/format_reward": 0.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.9, "grad_norm": 2.4476959705352783, "kl": 1.45703125, "learning_rate": 7.039090644965509e-07, "loss": 0.0583, "reward": -0.39841071516275406, "reward_std": 0.31324755400419235, "rewards/cosine_scaled_reward": -0.19920538365840912, "rewards/format_reward": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.904, "grad_norm": 3.0681633949279785, "kl": 1.75, "learning_rate": 7.009532063876148e-07, "loss": 0.0701, "reward": -0.35963694006204605, "reward_std": 0.3227182477712631, "rewards/cosine_scaled_reward": -0.17981846630573273, "rewards/format_reward": 0.0, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.908, "grad_norm": 3.8354952335357666, "kl": 1.5087890625, "learning_rate": 6.979899910323624e-07, "loss": 0.0604, "reward": -0.3886452168226242, "reward_std": 0.31125637143850327, "rewards/cosine_scaled_reward": -0.1943226121366024, "rewards/format_reward": 0.0, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.912, "grad_norm": 2.3208184242248535, "kl": 1.39453125, "learning_rate": 6.950195628537299e-07, "loss": 0.0558, "reward": -0.34270477294921875, "reward_std": 0.3698492497205734, "rewards/cosine_scaled_reward": -0.17135238647460938, "rewards/format_reward": 0.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.916, "grad_norm": 2.174126386642456, "kl": 2.009765625, "learning_rate": 6.920420666261961e-07, "loss": 0.0804, "reward": -0.37576939910650253, "reward_std": 0.3269713968038559, "rewards/cosine_scaled_reward": -0.18788469955325127, "rewards/format_reward": 0.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.92, "grad_norm": 2.081784725189209, "kl": 2.1728515625, "learning_rate": 6.890576474687263e-07, "loss": 0.0869, "reward": -0.3998561128973961, "reward_std": 0.32443511486053467, "rewards/cosine_scaled_reward": -0.19992805272340775, "rewards/format_reward": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.924, "grad_norm": 2.3403866291046143, "kl": 1.17529296875, "learning_rate": 6.860664508377001e-07, "loss": 0.0469, "reward": -0.38807813823223114, "reward_std": 0.32711831480264664, "rewards/cosine_scaled_reward": -0.19403906539082527, "rewards/format_reward": 0.0, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.928, "grad_norm": 2.029927968978882, "kl": 1.32666015625, "learning_rate": 6.83068622519821e-07, "loss": 0.0531, "reward": -0.38948777318000793, "reward_std": 0.3195284381508827, "rewards/cosine_scaled_reward": -0.19474387168884277, "rewards/format_reward": 0.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.932, "grad_norm": 2.9124484062194824, "kl": 1.71484375, "learning_rate": 6.800643086250121e-07, "loss": 0.0685, "reward": -0.3806769847869873, "reward_std": 0.2985011041164398, "rewards/cosine_scaled_reward": -0.19033849611878395, "rewards/format_reward": 0.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.936, "grad_norm": 2.464742422103882, "kl": 1.2998046875, "learning_rate": 6.770536555792944e-07, "loss": 0.052, "reward": -0.3443439155817032, "reward_std": 0.29415207356214523, "rewards/cosine_scaled_reward": -0.1721719540655613, "rewards/format_reward": 0.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.94, "grad_norm": 2.1291651725769043, "kl": 1.001953125, "learning_rate": 6.740368101176495e-07, "loss": 0.0401, "reward": -0.33735504001379013, "reward_std": 0.28946489840745926, "rewards/cosine_scaled_reward": -0.16867752373218536, "rewards/format_reward": 0.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.944, "grad_norm": 2.9513416290283203, "kl": 1.6201171875, "learning_rate": 6.710139192768694e-07, "loss": 0.0649, "reward": -0.40289320796728134, "reward_std": 0.30230626463890076, "rewards/cosine_scaled_reward": -0.20144660398364067, "rewards/format_reward": 0.0, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.948, "grad_norm": 3.7395241260528564, "kl": 1.6240234375, "learning_rate": 6.679851303883891e-07, "loss": 0.065, "reward": -0.3659610077738762, "reward_std": 0.32638294249773026, "rewards/cosine_scaled_reward": -0.1829805038869381, "rewards/format_reward": 0.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.952, "grad_norm": 2.7872421741485596, "kl": 1.7919921875, "learning_rate": 6.649505910711058e-07, "loss": 0.0718, "reward": -0.4507276937365532, "reward_std": 0.35789574682712555, "rewards/cosine_scaled_reward": -0.2253638356924057, "rewards/format_reward": 0.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.956, "grad_norm": 2.139983654022217, "kl": 1.40234375, "learning_rate": 6.619104492241847e-07, "loss": 0.056, "reward": -0.3731803297996521, "reward_std": 0.30503255128860474, "rewards/cosine_scaled_reward": -0.18659016117453575, "rewards/format_reward": 0.0, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.96, "grad_norm": 6.420464515686035, "kl": 2.787109375, "learning_rate": 6.588648530198504e-07, "loss": 0.1116, "reward": -0.40894675999879837, "reward_std": 0.3296940475702286, "rewards/cosine_scaled_reward": -0.20447338744997978, "rewards/format_reward": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.964, "grad_norm": 2.4638171195983887, "kl": 2.1806640625, "learning_rate": 6.558139508961654e-07, "loss": 0.0874, "reward": -0.42437078058719635, "reward_std": 0.3512648344039917, "rewards/cosine_scaled_reward": -0.21218538656830788, "rewards/format_reward": 0.0, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.968, "grad_norm": 2.8068432807922363, "kl": 1.884765625, "learning_rate": 6.527578915497951e-07, "loss": 0.0754, "reward": -0.394868440926075, "reward_std": 0.2916436865925789, "rewards/cosine_scaled_reward": -0.1974342130124569, "rewards/format_reward": 0.0, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.972, "grad_norm": 2.272479295730591, "kl": 1.453125, "learning_rate": 6.496968239287603e-07, "loss": 0.0581, "reward": -0.36773569136857986, "reward_std": 0.3104323297739029, "rewards/cosine_scaled_reward": -0.18386784568428993, "rewards/format_reward": 0.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.976, "grad_norm": 2.86352276802063, "kl": 1.8525390625, "learning_rate": 6.466308972251785e-07, "loss": 0.0742, "reward": -0.3895353376865387, "reward_std": 0.30376598984003067, "rewards/cosine_scaled_reward": -0.19476767256855965, "rewards/format_reward": 0.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.98, "grad_norm": 3.2674906253814697, "kl": 1.89453125, "learning_rate": 6.435602608679916e-07, "loss": 0.0758, "reward": -0.35536977648735046, "reward_std": 0.32461147010326385, "rewards/cosine_scaled_reward": -0.17768489941954613, "rewards/format_reward": 0.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.984, "grad_norm": 2.3651580810546875, "kl": 1.3994140625, "learning_rate": 6.404850645156841e-07, "loss": 0.0559, "reward": -0.2967621465213597, "reward_std": 0.29580704867839813, "rewards/cosine_scaled_reward": -0.1483810821082443, "rewards/format_reward": 0.0, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.988, "grad_norm": 2.6290199756622314, "kl": 1.544921875, "learning_rate": 6.374054580489873e-07, "loss": 0.0618, "reward": -0.3732440918684006, "reward_std": 0.28786107152700424, "rewards/cosine_scaled_reward": -0.1866220459342003, "rewards/format_reward": 0.0, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.992, "grad_norm": 2.474320650100708, "kl": 1.18359375, "learning_rate": 6.343215915635761e-07, "loss": 0.0473, "reward": -0.3813322111964226, "reward_std": 0.3196609243750572, "rewards/cosine_scaled_reward": -0.1906661055982113, "rewards/format_reward": 0.0, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 0.996, "grad_norm": 2.4096460342407227, "kl": 1.185546875, "learning_rate": 6.31233615362752e-07, "loss": 0.0475, "reward": -0.37723246961832047, "reward_std": 0.32298891991376877, "rewards/cosine_scaled_reward": -0.18861623480916023, "rewards/format_reward": 0.0, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 1536.0001220703125, "epoch": 1.0, "grad_norm": 2.414369821548462, "kl": 1.1552734375, "learning_rate": 6.281416799501187e-07, "loss": 0.0462, "reward": -0.3446759209036827, "reward_std": 0.30413854122161865, "rewards/cosine_scaled_reward": -0.17233795672655106, "rewards/format_reward": 0.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.004, "grad_norm": 2.3181285858154297, "kl": 1.4765625, "learning_rate": 6.25045936022246e-07, "loss": 0.0591, "reward": -0.39850035309791565, "reward_std": 0.3559228628873825, "rewards/cosine_scaled_reward": -0.19925018772482872, "rewards/format_reward": 0.0, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.008, "grad_norm": 2.3214640617370605, "kl": 1.59375, "learning_rate": 6.219465344613258e-07, "loss": 0.0637, "reward": -0.3477981239557266, "reward_std": 0.3031875118613243, "rewards/cosine_scaled_reward": -0.1738990694284439, "rewards/format_reward": 0.0, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.012, "grad_norm": 2.4848833084106445, "kl": 1.6416015625, "learning_rate": 6.188436263278172e-07, "loss": 0.0657, "reward": -0.402904212474823, "reward_std": 0.32011619955301285, "rewards/cosine_scaled_reward": -0.2014521062374115, "rewards/format_reward": 0.0, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.016, "grad_norm": 7.0177903175354, "kl": 3.015625, "learning_rate": 6.157373628530852e-07, "loss": 0.1206, "reward": -0.41366545110940933, "reward_std": 0.3347878158092499, "rewards/cosine_scaled_reward": -0.20683272555470467, "rewards/format_reward": 0.0, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1533.3928527832031, "epoch": 1.02, "grad_norm": 2.5155041217803955, "kl": 1.818359375, "learning_rate": 6.126278954320294e-07, "loss": 0.073, "reward": -0.41607701033353806, "reward_std": 0.33659277111291885, "rewards/cosine_scaled_reward": -0.20803850889205933, "rewards/format_reward": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.024, "grad_norm": 3.175401449203491, "kl": 2.349609375, "learning_rate": 6.095153756157051e-07, "loss": 0.094, "reward": -0.3731570616364479, "reward_std": 0.3251727372407913, "rewards/cosine_scaled_reward": -0.18657853826880455, "rewards/format_reward": 0.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.028, "grad_norm": 2.345123052597046, "kl": 2.140625, "learning_rate": 6.06399955103937e-07, "loss": 0.0857, "reward": -0.4059467390179634, "reward_std": 0.3182907700538635, "rewards/cosine_scaled_reward": -0.2029733695089817, "rewards/format_reward": 0.0, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.032, "grad_norm": 2.636462688446045, "kl": 1.705078125, "learning_rate": 6.032817857379256e-07, "loss": 0.068, "reward": -0.343365378677845, "reward_std": 0.3163585662841797, "rewards/cosine_scaled_reward": -0.1716826893389225, "rewards/format_reward": 0.0, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.036, "grad_norm": 2.297900438308716, "kl": 1.51953125, "learning_rate": 6.001610194928464e-07, "loss": 0.0608, "reward": -0.3703172579407692, "reward_std": 0.3630036562681198, "rewards/cosine_scaled_reward": -0.1851586326956749, "rewards/format_reward": 0.0, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.04, "grad_norm": 2.311648368835449, "kl": 1.515625, "learning_rate": 5.97037808470444e-07, "loss": 0.0605, "reward": -0.3789840117096901, "reward_std": 0.330322228372097, "rewards/cosine_scaled_reward": -0.18949199840426445, "rewards/format_reward": 0.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.044, "grad_norm": 2.3599531650543213, "kl": 1.78515625, "learning_rate": 5.939123048916173e-07, "loss": 0.0714, "reward": -0.3447503596544266, "reward_std": 0.33612456917762756, "rewards/cosine_scaled_reward": -0.17237518727779388, "rewards/format_reward": 0.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1527.6190490722656, "epoch": 1.048, "grad_norm": 2.2337074279785156, "kl": 1.890625, "learning_rate": 5.907846610890011e-07, "loss": 0.0786, "reward": -0.39859064668416977, "reward_std": 0.32645051926374435, "rewards/cosine_scaled_reward": -0.1992953196167946, "rewards/format_reward": 0.0, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.052, "grad_norm": 2.818617582321167, "kl": 1.55859375, "learning_rate": 5.87655029499542e-07, "loss": 0.0624, "reward": -0.3537183925509453, "reward_std": 0.309035487473011, "rewards/cosine_scaled_reward": -0.17685920372605324, "rewards/format_reward": 0.0, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.056, "grad_norm": 2.3533854484558105, "kl": 1.3583984375, "learning_rate": 5.845235626570683e-07, "loss": 0.0543, "reward": -0.3672221526503563, "reward_std": 0.31650061905384064, "rewards/cosine_scaled_reward": -0.18361108005046844, "rewards/format_reward": 0.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.06, "grad_norm": 3.936475992202759, "kl": 2.265625, "learning_rate": 5.813904131848564e-07, "loss": 0.0907, "reward": -0.36572812497615814, "reward_std": 0.2912697494029999, "rewards/cosine_scaled_reward": -0.18286405876278877, "rewards/format_reward": 0.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.064, "grad_norm": 2.754866600036621, "kl": 1.943359375, "learning_rate": 5.78255733788191e-07, "loss": 0.0777, "reward": -0.37356945127248764, "reward_std": 0.34380726516246796, "rewards/cosine_scaled_reward": -0.18678472936153412, "rewards/format_reward": 0.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.068, "grad_norm": 2.374964952468872, "kl": 1.4267578125, "learning_rate": 5.751196772469237e-07, "loss": 0.0571, "reward": -0.3651036322116852, "reward_std": 0.30468039214611053, "rewards/cosine_scaled_reward": -0.1825518161058426, "rewards/format_reward": 0.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1533.0535888671875, "epoch": 1.072, "grad_norm": 2.618032693862915, "kl": 1.6171875, "learning_rate": 5.71982396408026e-07, "loss": 0.0651, "reward": -0.35353927314281464, "reward_std": 0.3086354061961174, "rewards/cosine_scaled_reward": -0.17676963657140732, "rewards/format_reward": 0.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.076, "grad_norm": 2.920133590698242, "kl": 1.8515625, "learning_rate": 5.688440441781398e-07, "loss": 0.074, "reward": -0.37572528421878815, "reward_std": 0.33292342722415924, "rewards/cosine_scaled_reward": -0.18786264210939407, "rewards/format_reward": 0.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.08, "grad_norm": 2.581885576248169, "kl": 1.830078125, "learning_rate": 5.657047735161255e-07, "loss": 0.0732, "reward": -0.34584221988916397, "reward_std": 0.3140456974506378, "rewards/cosine_scaled_reward": -0.17292110994458199, "rewards/format_reward": 0.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.084, "grad_norm": 8.366601943969727, "kl": 2.509765625, "learning_rate": 5.625647374256061e-07, "loss": 0.1003, "reward": -0.37314866855740547, "reward_std": 0.2792880907654762, "rewards/cosine_scaled_reward": -0.18657432682812214, "rewards/format_reward": 0.0, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.088, "grad_norm": 3.071047067642212, "kl": 1.9658203125, "learning_rate": 5.594240889475106e-07, "loss": 0.0785, "reward": -0.39643432199954987, "reward_std": 0.31065937131643295, "rewards/cosine_scaled_reward": -0.19821715354919434, "rewards/format_reward": 0.0, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.092, "grad_norm": 3.8571436405181885, "kl": 1.2626953125, "learning_rate": 5.562829811526154e-07, "loss": 0.0506, "reward": -0.3136083036661148, "reward_std": 0.28241100907325745, "rewards/cosine_scaled_reward": -0.1568041555583477, "rewards/format_reward": 0.0, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.096, "grad_norm": 2.1380457878112793, "kl": 1.96875, "learning_rate": 5.531415671340826e-07, "loss": 0.0786, "reward": -0.35791803896427155, "reward_std": 0.3191326707601547, "rewards/cosine_scaled_reward": -0.17895901948213577, "rewards/format_reward": 0.0, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.1, "grad_norm": 3.744987964630127, "kl": 2.048828125, "learning_rate": 5.5e-07, "loss": 0.0819, "reward": -0.3743599057197571, "reward_std": 0.3121279552578926, "rewards/cosine_scaled_reward": -0.18717995658516884, "rewards/format_reward": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.104, "grad_norm": 2.783698081970215, "kl": 1.8984375, "learning_rate": 5.468584328659172e-07, "loss": 0.0761, "reward": -0.3865007609128952, "reward_std": 0.322613961994648, "rewards/cosine_scaled_reward": -0.1932503841817379, "rewards/format_reward": 0.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.108, "grad_norm": 3.2086503505706787, "kl": 1.865234375, "learning_rate": 5.437170188473847e-07, "loss": 0.0746, "reward": -0.41129884123802185, "reward_std": 0.3018573820590973, "rewards/cosine_scaled_reward": -0.20564941689372063, "rewards/format_reward": 0.0, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.112, "grad_norm": 2.4078729152679443, "kl": 1.4072265625, "learning_rate": 5.405759110524894e-07, "loss": 0.0563, "reward": -0.39701489359140396, "reward_std": 0.3126164525747299, "rewards/cosine_scaled_reward": -0.19850744307041168, "rewards/format_reward": 0.0, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.116, "grad_norm": 2.5043461322784424, "kl": 2.35546875, "learning_rate": 5.37435262574394e-07, "loss": 0.0944, "reward": -0.28278425987809896, "reward_std": 0.2714259997010231, "rewards/cosine_scaled_reward": -0.1413921354105696, "rewards/format_reward": 0.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1533.6190490722656, "epoch": 1.12, "grad_norm": 4.991820335388184, "kl": 1.83984375, "learning_rate": 5.342952264838747e-07, "loss": 0.0713, "reward": -0.3403998464345932, "reward_std": 0.3223363533616066, "rewards/cosine_scaled_reward": -0.1701999232172966, "rewards/format_reward": 0.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1534.0476379394531, "epoch": 1.124, "grad_norm": 2.818126916885376, "kl": 1.37890625, "learning_rate": 5.311559558218603e-07, "loss": 0.054, "reward": -0.3611769676208496, "reward_std": 0.3213232010602951, "rewards/cosine_scaled_reward": -0.1805884800851345, "rewards/format_reward": 0.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.1280000000000001, "grad_norm": 2.7234742641448975, "kl": 2.248046875, "learning_rate": 5.28017603591974e-07, "loss": 0.0899, "reward": -0.4201104864478111, "reward_std": 0.3131628781557083, "rewards/cosine_scaled_reward": -0.21005523577332497, "rewards/format_reward": 0.0, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.1320000000000001, "grad_norm": 6.938405990600586, "kl": 1.998046875, "learning_rate": 5.248803227530763e-07, "loss": 0.0799, "reward": -0.33411792665719986, "reward_std": 0.32330870628356934, "rewards/cosine_scaled_reward": -0.16705895960330963, "rewards/format_reward": 0.0, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.1360000000000001, "grad_norm": 3.5663974285125732, "kl": 1.3349609375, "learning_rate": 5.21744266211809e-07, "loss": 0.0534, "reward": -0.3633820191025734, "reward_std": 0.31287185102701187, "rewards/cosine_scaled_reward": -0.1816909983754158, "rewards/format_reward": 0.0, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.1400000000000001, "grad_norm": 2.0476882457733154, "kl": 1.708984375, "learning_rate": 5.186095868151436e-07, "loss": 0.0684, "reward": -0.3689531907439232, "reward_std": 0.32297470420598984, "rewards/cosine_scaled_reward": -0.184476587921381, "rewards/format_reward": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1526.8869018554688, "epoch": 1.144, "grad_norm": 12.345512390136719, "kl": 2.966796875, "learning_rate": 5.154764373429315e-07, "loss": 0.1254, "reward": -0.3650151863694191, "reward_std": 0.31899186968803406, "rewards/cosine_scaled_reward": -0.18250760063529015, "rewards/format_reward": 0.0, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.148, "grad_norm": 2.059617519378662, "kl": 2.291015625, "learning_rate": 5.123449705004581e-07, "loss": 0.0916, "reward": -0.3706892877817154, "reward_std": 0.32747378945350647, "rewards/cosine_scaled_reward": -0.1853446513414383, "rewards/format_reward": 0.0, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.152, "grad_norm": 3.889174699783325, "kl": 2.0859375, "learning_rate": 5.09215338910999e-07, "loss": 0.0834, "reward": -0.4078289121389389, "reward_std": 0.3290611281991005, "rewards/cosine_scaled_reward": -0.20391445606946945, "rewards/format_reward": 0.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1533.2440490722656, "epoch": 1.156, "grad_norm": 2.5038888454437256, "kl": 0.93896484375, "learning_rate": 5.060876951083828e-07, "loss": 0.0354, "reward": -0.34110401570796967, "reward_std": 0.3122602626681328, "rewards/cosine_scaled_reward": -0.17055201157927513, "rewards/format_reward": 0.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.16, "grad_norm": 2.39719557762146, "kl": 1.8583984375, "learning_rate": 5.02962191529556e-07, "loss": 0.0744, "reward": -0.36911261081695557, "reward_std": 0.3288589343428612, "rewards/cosine_scaled_reward": -0.1845562942326069, "rewards/format_reward": 0.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.164, "grad_norm": 2.758849620819092, "kl": 1.626953125, "learning_rate": 4.998389805071536e-07, "loss": 0.0651, "reward": -0.3935117796063423, "reward_std": 0.3461349532008171, "rewards/cosine_scaled_reward": -0.19675587862730026, "rewards/format_reward": 0.0, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.168, "grad_norm": 2.310575246810913, "kl": 1.455078125, "learning_rate": 4.967182142620745e-07, "loss": 0.0583, "reward": -0.34184807538986206, "reward_std": 0.3021695464849472, "rewards/cosine_scaled_reward": -0.17092403396964073, "rewards/format_reward": 0.0, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.172, "grad_norm": 2.8417394161224365, "kl": 1.861328125, "learning_rate": 4.93600044896063e-07, "loss": 0.0744, "reward": -0.3772461339831352, "reward_std": 0.3044436201453209, "rewards/cosine_scaled_reward": -0.18862305954098701, "rewards/format_reward": 0.0, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.176, "grad_norm": 2.347404956817627, "kl": 1.28125, "learning_rate": 4.904846243842949e-07, "loss": 0.0513, "reward": -0.3517310842871666, "reward_std": 0.3094722405076027, "rewards/cosine_scaled_reward": -0.1758655458688736, "rewards/format_reward": 0.0, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1535.6130981445312, "epoch": 1.18, "grad_norm": 2.7739925384521484, "kl": 1.833984375, "learning_rate": 4.873721045679706e-07, "loss": 0.0731, "reward": -0.4288819953799248, "reward_std": 0.3247087821364403, "rewards/cosine_scaled_reward": -0.2144409976899624, "rewards/format_reward": 0.0, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.184, "grad_norm": 2.1470892429351807, "kl": 1.296875, "learning_rate": 4.842626371469149e-07, "loss": 0.0519, "reward": -0.35219819098711014, "reward_std": 0.3056294918060303, "rewards/cosine_scaled_reward": -0.17609910294413567, "rewards/format_reward": 0.0, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.188, "grad_norm": 3.177232503890991, "kl": 1.677734375, "learning_rate": 4.811563736721829e-07, "loss": 0.0671, "reward": -0.3717339485883713, "reward_std": 0.29695921391248703, "rewards/cosine_scaled_reward": -0.18586697429418564, "rewards/format_reward": 0.0, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.192, "grad_norm": 3.3333382606506348, "kl": 2.322265625, "learning_rate": 4.780534655386743e-07, "loss": 0.093, "reward": -0.3814833015203476, "reward_std": 0.28608307987451553, "rewards/cosine_scaled_reward": -0.1907416470348835, "rewards/format_reward": 0.0, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.196, "grad_norm": 2.842420816421509, "kl": 1.45703125, "learning_rate": 4.749540639777539e-07, "loss": 0.0583, "reward": -0.3840809538960457, "reward_std": 0.31393957883119583, "rewards/cosine_scaled_reward": -0.19204047322273254, "rewards/format_reward": 0.0, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.2, "grad_norm": 2.9220309257507324, "kl": 1.681640625, "learning_rate": 4.7185832004988133e-07, "loss": 0.0672, "reward": -0.39588408917188644, "reward_std": 0.33600132539868355, "rewards/cosine_scaled_reward": -0.19794204831123352, "rewards/format_reward": 0.0, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.204, "grad_norm": 3.4091219902038574, "kl": 1.44140625, "learning_rate": 4.68766384637248e-07, "loss": 0.0576, "reward": -0.2894315180601552, "reward_std": 0.30969203263521194, "rewards/cosine_scaled_reward": -0.14471576345385984, "rewards/format_reward": 0.0, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.208, "grad_norm": 2.0488741397857666, "kl": 1.5576171875, "learning_rate": 4.656784084364238e-07, "loss": 0.0624, "reward": -0.32318826019763947, "reward_std": 0.3031533695757389, "rewards/cosine_scaled_reward": -0.16159413009881973, "rewards/format_reward": 0.0, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.212, "grad_norm": 2.6755242347717285, "kl": 1.34765625, "learning_rate": 4.6259454195101267e-07, "loss": 0.0539, "reward": -0.37002843618392944, "reward_std": 0.31058184802532196, "rewards/cosine_scaled_reward": -0.18501422181725502, "rewards/format_reward": 0.0, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.216, "grad_norm": 6.160266399383545, "kl": 1.734375, "learning_rate": 4.59514935484316e-07, "loss": 0.0694, "reward": -0.38714154064655304, "reward_std": 0.3265160173177719, "rewards/cosine_scaled_reward": -0.19357078149914742, "rewards/format_reward": 0.0, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.22, "grad_norm": 2.3529880046844482, "kl": 1.2138671875, "learning_rate": 4.5643973913200837e-07, "loss": 0.0486, "reward": -0.3460870534181595, "reward_std": 0.3087117671966553, "rewards/cosine_scaled_reward": -0.17304353043437004, "rewards/format_reward": 0.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.224, "grad_norm": 2.48714280128479, "kl": 1.9453125, "learning_rate": 4.5336910277482155e-07, "loss": 0.0779, "reward": -0.3756335750222206, "reward_std": 0.32805445045232773, "rewards/cosine_scaled_reward": -0.1878167800605297, "rewards/format_reward": 0.0, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.228, "grad_norm": 8.46654987335205, "kl": 2.5107421875, "learning_rate": 4.503031760712397e-07, "loss": 0.1004, "reward": -0.385331392288208, "reward_std": 0.31344960629940033, "rewards/cosine_scaled_reward": -0.1926657035946846, "rewards/format_reward": 0.0, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.232, "grad_norm": 3.198944568634033, "kl": 2.140625, "learning_rate": 4.4724210845020494e-07, "loss": 0.0857, "reward": -0.36118319630622864, "reward_std": 0.3010380119085312, "rewards/cosine_scaled_reward": -0.18059159815311432, "rewards/format_reward": 0.0, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.236, "grad_norm": 2.745668411254883, "kl": 2.033203125, "learning_rate": 4.441860491038345e-07, "loss": 0.0813, "reward": -0.3596822917461395, "reward_std": 0.3092067465186119, "rewards/cosine_scaled_reward": -0.17984114587306976, "rewards/format_reward": 0.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.24, "grad_norm": 5.614748954772949, "kl": 2.34375, "learning_rate": 4.4113514698014953e-07, "loss": 0.094, "reward": -0.34773094952106476, "reward_std": 0.29645886272192, "rewards/cosine_scaled_reward": -0.17386547103524208, "rewards/format_reward": 0.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.244, "grad_norm": 2.089031219482422, "kl": 1.39453125, "learning_rate": 4.3808955077581546e-07, "loss": 0.0558, "reward": -0.33028923720121384, "reward_std": 0.2886582836508751, "rewards/cosine_scaled_reward": -0.16514462232589722, "rewards/format_reward": 0.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.248, "grad_norm": 5.366787433624268, "kl": 2.9599609375, "learning_rate": 4.350494089288943e-07, "loss": 0.1186, "reward": -0.4123021811246872, "reward_std": 0.337029866874218, "rewards/cosine_scaled_reward": -0.206151083111763, "rewards/format_reward": 0.0, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.252, "grad_norm": 8.391505241394043, "kl": 1.953125, "learning_rate": 4.3201486961161093e-07, "loss": 0.078, "reward": -0.3487403020262718, "reward_std": 0.3276291638612747, "rewards/cosine_scaled_reward": -0.1743701510131359, "rewards/format_reward": 0.0, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.256, "grad_norm": 2.623786449432373, "kl": 1.3193359375, "learning_rate": 4.2898608072313045e-07, "loss": 0.0528, "reward": -0.32606934756040573, "reward_std": 0.28208620101213455, "rewards/cosine_scaled_reward": -0.16303467005491257, "rewards/format_reward": 0.0, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.26, "grad_norm": 2.2247447967529297, "kl": 1.884765625, "learning_rate": 4.2596318988235037e-07, "loss": 0.0755, "reward": -0.2273978427692782, "reward_std": 0.28098014742136, "rewards/cosine_scaled_reward": -0.11369891960930545, "rewards/format_reward": 0.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.264, "grad_norm": 2.258469581604004, "kl": 1.14453125, "learning_rate": 4.2294634442070553e-07, "loss": 0.0457, "reward": -0.24764333851635456, "reward_std": 0.2835834100842476, "rewards/cosine_scaled_reward": -0.12382166367024183, "rewards/format_reward": 0.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.268, "grad_norm": 2.884620189666748, "kl": 1.5986328125, "learning_rate": 4.1993569137498776e-07, "loss": 0.064, "reward": -0.37140634655952454, "reward_std": 0.36573630571365356, "rewards/cosine_scaled_reward": -0.18570317327976227, "rewards/format_reward": 0.0, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.272, "grad_norm": 2.703934669494629, "kl": 1.912109375, "learning_rate": 4.1693137748017915e-07, "loss": 0.0763, "reward": -0.34411681443452835, "reward_std": 0.29631946235895157, "rewards/cosine_scaled_reward": -0.17205841839313507, "rewards/format_reward": 0.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.276, "grad_norm": 3.717240571975708, "kl": 2.224609375, "learning_rate": 4.1393354916230005e-07, "loss": 0.0891, "reward": -0.3324529230594635, "reward_std": 0.2552623227238655, "rewards/cosine_scaled_reward": -0.16622646152973175, "rewards/format_reward": 0.0, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.28, "grad_norm": 2.4941396713256836, "kl": 1.384765625, "learning_rate": 4.1094235253127374e-07, "loss": 0.0555, "reward": -0.30811577290296555, "reward_std": 0.2845884971320629, "rewards/cosine_scaled_reward": -0.15405788272619247, "rewards/format_reward": 0.0, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.284, "grad_norm": 3.229072332382202, "kl": 1.9453125, "learning_rate": 4.079579333738039e-07, "loss": 0.0778, "reward": -0.3366442248225212, "reward_std": 0.301740899682045, "rewards/cosine_scaled_reward": -0.1683221124112606, "rewards/format_reward": 0.0, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.288, "grad_norm": 3.3636343479156494, "kl": 1.8828125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0752, "reward": -0.36845648288726807, "reward_std": 0.34283190220594406, "rewards/cosine_scaled_reward": -0.18422825261950493, "rewards/format_reward": 0.0, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.292, "grad_norm": 3.507054090499878, "kl": 1.4130859375, "learning_rate": 4.020100089676376e-07, "loss": 0.0566, "reward": -0.34711746126413345, "reward_std": 0.2960944324731827, "rewards/cosine_scaled_reward": -0.17355873063206673, "rewards/format_reward": 0.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.296, "grad_norm": 2.661647081375122, "kl": 1.736328125, "learning_rate": 3.9904679361238526e-07, "loss": 0.0694, "reward": -0.33277176320552826, "reward_std": 0.3034566268324852, "rewards/cosine_scaled_reward": -0.16638587787747383, "rewards/format_reward": 0.0, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3, "grad_norm": 3.079672336578369, "kl": 1.359375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0544, "reward": -0.3246685415506363, "reward_std": 0.27341699600219727, "rewards/cosine_scaled_reward": -0.16233427450060844, "rewards/format_reward": 0.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.304, "grad_norm": 3.248324394226074, "kl": 1.1181640625, "learning_rate": 3.931425787051832e-07, "loss": 0.0447, "reward": -0.3214203119277954, "reward_std": 0.2835453376173973, "rewards/cosine_scaled_reward": -0.160710159689188, "rewards/format_reward": 0.0, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.308, "grad_norm": 3.676837205886841, "kl": 1.724609375, "learning_rate": 3.902018669163384e-07, "loss": 0.069, "reward": -0.32949286699295044, "reward_std": 0.30344782024621964, "rewards/cosine_scaled_reward": -0.16474644094705582, "rewards/format_reward": 0.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.312, "grad_norm": 2.3120462894439697, "kl": 1.537109375, "learning_rate": 3.872689434630585e-07, "loss": 0.0615, "reward": -0.3512613996863365, "reward_std": 0.3501633331179619, "rewards/cosine_scaled_reward": -0.17563070356845856, "rewards/format_reward": 0.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.316, "grad_norm": 2.4828386306762695, "kl": 1.6953125, "learning_rate": 3.843439512918949e-07, "loss": 0.0677, "reward": -0.31614498794078827, "reward_std": 0.29276788979768753, "rewards/cosine_scaled_reward": -0.15807249024510384, "rewards/format_reward": 0.0, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.32, "grad_norm": 3.356783151626587, "kl": 2.453125, "learning_rate": 3.8142703296283953e-07, "loss": 0.0982, "reward": -0.4576185494661331, "reward_std": 0.32832735031843185, "rewards/cosine_scaled_reward": -0.22880928218364716, "rewards/format_reward": 0.0, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.324, "grad_norm": 2.7885196208953857, "kl": 2.068359375, "learning_rate": 3.785183306423767e-07, "loss": 0.0827, "reward": -0.2943090833723545, "reward_std": 0.31652648001909256, "rewards/cosine_scaled_reward": -0.14715453796088696, "rewards/format_reward": 0.0, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.328, "grad_norm": 3.0415380001068115, "kl": 1.802734375, "learning_rate": 3.7561798609655373e-07, "loss": 0.0721, "reward": -0.3697570115327835, "reward_std": 0.3258262947201729, "rewards/cosine_scaled_reward": -0.18487850576639175, "rewards/format_reward": 0.0, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.332, "grad_norm": 3.139693021774292, "kl": 1.732421875, "learning_rate": 3.72726140684072e-07, "loss": 0.0693, "reward": -0.33471549302339554, "reward_std": 0.2794983647763729, "rewards/cosine_scaled_reward": -0.16735775396227837, "rewards/format_reward": 0.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.336, "grad_norm": 2.6243162155151367, "kl": 1.8369140625, "learning_rate": 3.6984293534939737e-07, "loss": 0.0733, "reward": -0.3382048085331917, "reward_std": 0.3457643389701843, "rewards/cosine_scaled_reward": -0.16910240054130554, "rewards/format_reward": 0.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.34, "grad_norm": 3.803060293197632, "kl": 1.8046875, "learning_rate": 3.6696851061588994e-07, "loss": 0.0723, "reward": -0.3406166359782219, "reward_std": 0.29876144975423813, "rewards/cosine_scaled_reward": -0.17030831426382065, "rewards/format_reward": 0.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3439999999999999, "grad_norm": 3.948391914367676, "kl": 1.365234375, "learning_rate": 3.641030065789562e-07, "loss": 0.0546, "reward": -0.2908342033624649, "reward_std": 0.26911235228180885, "rewards/cosine_scaled_reward": -0.14541710540652275, "rewards/format_reward": 0.0, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3479999999999999, "grad_norm": 2.9695639610290527, "kl": 2.19921875, "learning_rate": 3.612465628992203e-07, "loss": 0.0881, "reward": -0.37160656601190567, "reward_std": 0.3147331103682518, "rewards/cosine_scaled_reward": -0.18580328300595284, "rewards/format_reward": 0.0, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3519999999999999, "grad_norm": 3.1350209712982178, "kl": 2.1689453125, "learning_rate": 3.5839931879571725e-07, "loss": 0.087, "reward": -0.3230074942111969, "reward_std": 0.313438281416893, "rewards/cosine_scaled_reward": -0.16150375083088875, "rewards/format_reward": 0.0, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3559999999999999, "grad_norm": 3.882567882537842, "kl": 2.0546875, "learning_rate": 3.555614130391079e-07, "loss": 0.0821, "reward": -0.36975327879190445, "reward_std": 0.31242573261260986, "rewards/cosine_scaled_reward": -0.18487663567066193, "rewards/format_reward": 0.0, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3599999999999999, "grad_norm": 2.6699118614196777, "kl": 1.689453125, "learning_rate": 3.5273298394491515e-07, "loss": 0.0676, "reward": -0.368961863219738, "reward_std": 0.32627636194229126, "rewards/cosine_scaled_reward": -0.1844809353351593, "rewards/format_reward": 0.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3639999999999999, "grad_norm": 3.0782856941223145, "kl": 1.59765625, "learning_rate": 3.4991416936678276e-07, "loss": 0.064, "reward": -0.3320116475224495, "reward_std": 0.3151276856660843, "rewards/cosine_scaled_reward": -0.16600582748651505, "rewards/format_reward": 0.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3679999999999999, "grad_norm": 2.2419495582580566, "kl": 1.46484375, "learning_rate": 3.471051066897562e-07, "loss": 0.0585, "reward": -0.2764207161962986, "reward_std": 0.3390573188662529, "rewards/cosine_scaled_reward": -0.1382103539071977, "rewards/format_reward": 0.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.3719999999999999, "grad_norm": 4.397972106933594, "kl": 2.5, "learning_rate": 3.4430593282358777e-07, "loss": 0.1002, "reward": -0.33926407247781754, "reward_std": 0.31172922998666763, "rewards/cosine_scaled_reward": -0.16963203251361847, "rewards/format_reward": 0.0, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.376, "grad_norm": 3.441905975341797, "kl": 2.0234375, "learning_rate": 3.4151678419606233e-07, "loss": 0.0808, "reward": -0.3324861600995064, "reward_std": 0.2958858981728554, "rewards/cosine_scaled_reward": -0.1662430725991726, "rewards/format_reward": 0.0, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.38, "grad_norm": 2.7323975563049316, "kl": 1.4189453125, "learning_rate": 3.387377967463493e-07, "loss": 0.0566, "reward": -0.3314187452197075, "reward_std": 0.3164066970348358, "rewards/cosine_scaled_reward": -0.16570937633514404, "rewards/format_reward": 0.0, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.384, "grad_norm": 4.131885528564453, "kl": 2.45703125, "learning_rate": 3.359691059183761e-07, "loss": 0.0983, "reward": -0.37432391941547394, "reward_std": 0.33136965334415436, "rewards/cosine_scaled_reward": -0.18716195970773697, "rewards/format_reward": 0.0, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.388, "grad_norm": 2.9907569885253906, "kl": 1.732421875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0693, "reward": -0.38256606459617615, "reward_std": 0.31782740354537964, "rewards/cosine_scaled_reward": -0.19128303229808807, "rewards/format_reward": 0.0, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.392, "grad_norm": 2.6049344539642334, "kl": 1.53515625, "learning_rate": 3.3046315338757026e-07, "loss": 0.0613, "reward": -0.2997368350625038, "reward_std": 0.3045838475227356, "rewards/cosine_scaled_reward": -0.1498684138059616, "rewards/format_reward": 0.0, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.396, "grad_norm": 4.5095295906066895, "kl": 1.5029296875, "learning_rate": 3.2772616003709616e-07, "loss": 0.0602, "reward": -0.3363025635480881, "reward_std": 0.30865515023469925, "rewards/cosine_scaled_reward": -0.16815128177404404, "rewards/format_reward": 0.0, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.4, "grad_norm": 3.3342795372009277, "kl": 1.908203125, "learning_rate": 3.250000000000001e-07, "loss": 0.0762, "reward": -0.3770889565348625, "reward_std": 0.30710920691490173, "rewards/cosine_scaled_reward": -0.18854447081685066, "rewards/format_reward": 0.0, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.404, "grad_norm": 2.795259714126587, "kl": 2.048828125, "learning_rate": 3.222848061454764e-07, "loss": 0.082, "reward": -0.3462035730481148, "reward_std": 0.32692621648311615, "rewards/cosine_scaled_reward": -0.1731017865240574, "rewards/format_reward": 0.0, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.408, "grad_norm": 2.563765287399292, "kl": 1.462890625, "learning_rate": 3.195807108082429e-07, "loss": 0.0586, "reward": -0.37373943626880646, "reward_std": 0.3041759356856346, "rewards/cosine_scaled_reward": -0.18686972558498383, "rewards/format_reward": 0.0, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.412, "grad_norm": 2.6194751262664795, "kl": 1.24609375, "learning_rate": 3.168878457820915e-07, "loss": 0.0498, "reward": -0.3196728527545929, "reward_std": 0.2953634150326252, "rewards/cosine_scaled_reward": -0.15983642637729645, "rewards/format_reward": 0.0, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.416, "grad_norm": 2.8382420539855957, "kl": 1.650390625, "learning_rate": 3.142063423134644e-07, "loss": 0.0662, "reward": -0.33513225615024567, "reward_std": 0.30527665093541145, "rewards/cosine_scaled_reward": -0.16756613552570343, "rewards/format_reward": 0.0, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.42, "grad_norm": 2.6078808307647705, "kl": 2.15234375, "learning_rate": 3.115363310950578e-07, "loss": 0.086, "reward": -0.3992829695343971, "reward_std": 0.31726495921611786, "rewards/cosine_scaled_reward": -0.19964147731661797, "rewards/format_reward": 0.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.424, "grad_norm": 4.192615985870361, "kl": 2.142578125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0858, "reward": -0.39319509267807007, "reward_std": 0.3372880816459656, "rewards/cosine_scaled_reward": -0.19659754261374474, "rewards/format_reward": 0.0, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.428, "grad_norm": 3.196894407272339, "kl": 2.509765625, "learning_rate": 3.062313053727671e-07, "loss": 0.1006, "reward": -0.3694089204072952, "reward_std": 0.323252871632576, "rewards/cosine_scaled_reward": -0.1847044676542282, "rewards/format_reward": 0.0, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.432, "grad_norm": 3.348161458969116, "kl": 1.1142578125, "learning_rate": 3.0359654942835247e-07, "loss": 0.0447, "reward": -0.36088229715824127, "reward_std": 0.31483449041843414, "rewards/cosine_scaled_reward": -0.18044114857912064, "rewards/format_reward": 0.0, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.436, "grad_norm": 3.457472324371338, "kl": 2.2265625, "learning_rate": 3.0097380284049523e-07, "loss": 0.089, "reward": -0.3612442761659622, "reward_std": 0.28438059240579605, "rewards/cosine_scaled_reward": -0.1806221418082714, "rewards/format_reward": 0.0, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.44, "grad_norm": 3.285405397415161, "kl": 2.076171875, "learning_rate": 2.9836319343816397e-07, "loss": 0.0831, "reward": -0.32887883111834526, "reward_std": 0.3107897564768791, "rewards/cosine_scaled_reward": -0.16443941928446293, "rewards/format_reward": 0.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.444, "grad_norm": 2.9156711101531982, "kl": 1.7646484375, "learning_rate": 2.9576484845877793e-07, "loss": 0.0706, "reward": -0.3512116149067879, "reward_std": 0.32886873185634613, "rewards/cosine_scaled_reward": -0.17560580000281334, "rewards/format_reward": 0.0, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.448, "grad_norm": 2.42704439163208, "kl": 1.697265625, "learning_rate": 2.931788945420058e-07, "loss": 0.0679, "reward": -0.3639722764492035, "reward_std": 0.2881170064210892, "rewards/cosine_scaled_reward": -0.18198613449931145, "rewards/format_reward": 0.0, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.452, "grad_norm": 4.5008225440979, "kl": 2.177734375, "learning_rate": 2.9060545772359305e-07, "loss": 0.087, "reward": -0.3515865206718445, "reward_std": 0.290123887360096, "rewards/cosine_scaled_reward": -0.17579325661063194, "rewards/format_reward": 0.0, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.456, "grad_norm": 2.7479496002197266, "kl": 1.578125, "learning_rate": 2.8804466342921987e-07, "loss": 0.0632, "reward": -0.26583924936130643, "reward_std": 0.29539088532328606, "rewards/cosine_scaled_reward": -0.13291961723007262, "rewards/format_reward": 0.0, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.46, "grad_norm": 2.6749367713928223, "kl": 2.1796875, "learning_rate": 2.854966364683872e-07, "loss": 0.087, "reward": -0.36106909811496735, "reward_std": 0.2982637956738472, "rewards/cosine_scaled_reward": -0.18053454905748367, "rewards/format_reward": 0.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.464, "grad_norm": 3.6434812545776367, "kl": 1.4482421875, "learning_rate": 2.829615010283344e-07, "loss": 0.058, "reward": -0.35805001854896545, "reward_std": 0.31588251888751984, "rewards/cosine_scaled_reward": -0.17902500554919243, "rewards/format_reward": 0.0, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.468, "grad_norm": 2.877927780151367, "kl": 1.779296875, "learning_rate": 2.8043938066798645e-07, "loss": 0.0712, "reward": -0.35267870873212814, "reward_std": 0.3029713034629822, "rewards/cosine_scaled_reward": -0.17633935809135437, "rewards/format_reward": 0.0, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.472, "grad_norm": 2.9547438621520996, "kl": 1.3583984375, "learning_rate": 2.7793039831193133e-07, "loss": 0.0542, "reward": -0.34842824190855026, "reward_std": 0.28041965141892433, "rewards/cosine_scaled_reward": -0.17421411722898483, "rewards/format_reward": 0.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.476, "grad_norm": 2.4998183250427246, "kl": 1.712890625, "learning_rate": 2.7543467624442956e-07, "loss": 0.0686, "reward": -0.34311509132385254, "reward_std": 0.3226206302642822, "rewards/cosine_scaled_reward": -0.17155754193663597, "rewards/format_reward": 0.0, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.48, "grad_norm": 3.5822997093200684, "kl": 1.2568359375, "learning_rate": 2.729523361034538e-07, "loss": 0.0502, "reward": -0.31581661850214005, "reward_std": 0.27614113688468933, "rewards/cosine_scaled_reward": -0.15790832042694092, "rewards/format_reward": 0.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.484, "grad_norm": 2.638000965118408, "kl": 1.658203125, "learning_rate": 2.7048349887476037e-07, "loss": 0.0663, "reward": -0.3658217638731003, "reward_std": 0.3533295765519142, "rewards/cosine_scaled_reward": -0.18291086703538895, "rewards/format_reward": 0.0, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.488, "grad_norm": 2.4719886779785156, "kl": 1.470703125, "learning_rate": 2.6802828488599294e-07, "loss": 0.0588, "reward": -0.35377567261457443, "reward_std": 0.2872357815504074, "rewards/cosine_scaled_reward": -0.17688783630728722, "rewards/format_reward": 0.0, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.492, "grad_norm": 3.820688486099243, "kl": 1.65673828125, "learning_rate": 2.655868138008171e-07, "loss": 0.0662, "reward": -0.3673105686903, "reward_std": 0.29224705323576927, "rewards/cosine_scaled_reward": -0.1836552768945694, "rewards/format_reward": 0.0, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.496, "grad_norm": 3.1416916847229004, "kl": 1.4990234375, "learning_rate": 2.631592046130896e-07, "loss": 0.06, "reward": -0.3574133738875389, "reward_std": 0.2663569226861, "rewards/cosine_scaled_reward": -0.17870669439435005, "rewards/format_reward": 0.0, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.5, "grad_norm": 2.3712515830993652, "kl": 1.900390625, "learning_rate": 2.6074557564105724e-07, "loss": 0.0761, "reward": -0.34536080807447433, "reward_std": 0.3063738942146301, "rewards/cosine_scaled_reward": -0.17268040403723717, "rewards/format_reward": 0.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.504, "grad_norm": 2.792006254196167, "kl": 1.71875, "learning_rate": 2.583460445215911e-07, "loss": 0.0688, "reward": -0.3458981513977051, "reward_std": 0.3039686158299446, "rewards/cosine_scaled_reward": -0.17294907197356224, "rewards/format_reward": 0.0, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.508, "grad_norm": 2.985948085784912, "kl": 1.5625, "learning_rate": 2.5596072820445254e-07, "loss": 0.0625, "reward": -0.21606629202142358, "reward_std": 0.2749215438961983, "rewards/cosine_scaled_reward": -0.10803314973600209, "rewards/format_reward": 0.0, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 1531.952392578125, "epoch": 1.512, "grad_norm": 2.396852970123291, "kl": 1.9921875, "learning_rate": 2.5358974294659373e-07, "loss": 0.0823, "reward": -0.38127752393484116, "reward_std": 0.32172612845897675, "rewards/cosine_scaled_reward": -0.19063876569271088, "rewards/format_reward": 0.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.516, "grad_norm": 2.503976345062256, "kl": 1.794921875, "learning_rate": 2.512332043064913e-07, "loss": 0.0718, "reward": -0.3479606434702873, "reward_std": 0.29174239560961723, "rewards/cosine_scaled_reward": -0.17398031428456306, "rewards/format_reward": 0.0, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1531.8035888671875, "epoch": 1.52, "grad_norm": 3.344243049621582, "kl": 2.080078125, "learning_rate": 2.488912271385139e-07, "loss": 0.083, "reward": -0.38203170895576477, "reward_std": 0.3180833086371422, "rewards/cosine_scaled_reward": -0.19101585447788239, "rewards/format_reward": 0.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.524, "grad_norm": 3.5073604583740234, "kl": 2.095703125, "learning_rate": 2.465639255873246e-07, "loss": 0.0837, "reward": -0.33683621138334274, "reward_std": 0.3141423165798187, "rewards/cosine_scaled_reward": -0.16841810569167137, "rewards/format_reward": 0.0, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.528, "grad_norm": 2.7634477615356445, "kl": 2.55859375, "learning_rate": 2.4425141308231765e-07, "loss": 0.1022, "reward": -0.3983701467514038, "reward_std": 0.31766583025455475, "rewards/cosine_scaled_reward": -0.199185062199831, "rewards/format_reward": 0.0, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.532, "grad_norm": 3.1601033210754395, "kl": 1.486328125, "learning_rate": 2.4195380233209006e-07, "loss": 0.0594, "reward": -0.37120404094457626, "reward_std": 0.3172856420278549, "rewards/cosine_scaled_reward": -0.18560202419757843, "rewards/format_reward": 0.0, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.536, "grad_norm": 2.475311040878296, "kl": 2.01953125, "learning_rate": 2.3967120531894857e-07, "loss": 0.0807, "reward": -0.3449181020259857, "reward_std": 0.3061336353421211, "rewards/cosine_scaled_reward": -0.17245905846357346, "rewards/format_reward": 0.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.54, "grad_norm": 3.9638140201568604, "kl": 1.6806640625, "learning_rate": 2.374037332934512e-07, "loss": 0.0673, "reward": -0.3139965161681175, "reward_std": 0.303245909512043, "rewards/cosine_scaled_reward": -0.15699823945760727, "rewards/format_reward": 0.0, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.544, "grad_norm": 3.2407708168029785, "kl": 1.89453125, "learning_rate": 2.3515149676898552e-07, "loss": 0.0757, "reward": -0.3049175813794136, "reward_std": 0.30845751613378525, "rewards/cosine_scaled_reward": -0.1524587944149971, "rewards/format_reward": 0.0, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.548, "grad_norm": 3.1065189838409424, "kl": 1.75390625, "learning_rate": 2.3291460551638237e-07, "loss": 0.0701, "reward": -0.3369733840227127, "reward_std": 0.30179525911808014, "rewards/cosine_scaled_reward": -0.16848668828606606, "rewards/format_reward": 0.0, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.552, "grad_norm": 2.6867339611053467, "kl": 2.06640625, "learning_rate": 2.306931685585657e-07, "loss": 0.0826, "reward": -0.3339100852608681, "reward_std": 0.3043428584933281, "rewards/cosine_scaled_reward": -0.16695504263043404, "rewards/format_reward": 0.0, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.556, "grad_norm": 3.1580567359924316, "kl": 2.291015625, "learning_rate": 2.2848729416523859e-07, "loss": 0.0915, "reward": -0.3744669333100319, "reward_std": 0.3249610960483551, "rewards/cosine_scaled_reward": -0.18723345920443535, "rewards/format_reward": 0.0, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.56, "grad_norm": 5.407771587371826, "kl": 1.609375, "learning_rate": 2.2629708984760706e-07, "loss": 0.0645, "reward": -0.3420454412698746, "reward_std": 0.3148321136832237, "rewards/cosine_scaled_reward": -0.1710227131843567, "rewards/format_reward": 0.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.564, "grad_norm": 4.492737770080566, "kl": 2.275390625, "learning_rate": 2.2412266235313973e-07, "loss": 0.0909, "reward": -0.36313918232917786, "reward_std": 0.29535526037216187, "rewards/cosine_scaled_reward": -0.18156958371400833, "rewards/format_reward": 0.0, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.568, "grad_norm": 3.0125086307525635, "kl": 2.029296875, "learning_rate": 2.2196411766036487e-07, "loss": 0.0812, "reward": -0.37769585102796555, "reward_std": 0.31776873767375946, "rewards/cosine_scaled_reward": -0.18884791806340218, "rewards/format_reward": 0.0, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.572, "grad_norm": 3.134265899658203, "kl": 2.47265625, "learning_rate": 2.1982156097370557e-07, "loss": 0.099, "reward": -0.38678842037916183, "reward_std": 0.30557621270418167, "rewards/cosine_scaled_reward": -0.19339420646429062, "rewards/format_reward": 0.0, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.576, "grad_norm": 2.9398727416992188, "kl": 1.404296875, "learning_rate": 2.1769509671835223e-07, "loss": 0.0562, "reward": -0.3609785735607147, "reward_std": 0.29732464998960495, "rewards/cosine_scaled_reward": -0.18048929050564766, "rewards/format_reward": 0.0, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.58, "grad_norm": 2.3901424407958984, "kl": 2.3291015625, "learning_rate": 2.1558482853517253e-07, "loss": 0.093, "reward": -0.38430536538362503, "reward_std": 0.32753758877515793, "rewards/cosine_scaled_reward": -0.19215268269181252, "rewards/format_reward": 0.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1526.9702453613281, "epoch": 1.584, "grad_norm": 3.9775447845458984, "kl": 2.06640625, "learning_rate": 2.134908592756607e-07, "loss": 0.0914, "reward": -0.33116257190704346, "reward_std": 0.2928163409233093, "rewards/cosine_scaled_reward": -0.16558128595352173, "rewards/format_reward": 0.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.588, "grad_norm": 2.9975955486297607, "kl": 2.318359375, "learning_rate": 2.1141329099692406e-07, "loss": 0.0928, "reward": -0.3710367754101753, "reward_std": 0.3226532116532326, "rewards/cosine_scaled_reward": -0.18551838770508766, "rewards/format_reward": 0.0, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1530.6845397949219, "epoch": 1.592, "grad_norm": 3.739922046661377, "kl": 2.025390625, "learning_rate": 2.0935222495670968e-07, "loss": 0.0747, "reward": -0.3954162746667862, "reward_std": 0.3323783427476883, "rewards/cosine_scaled_reward": -0.1977081410586834, "rewards/format_reward": 0.0, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.596, "grad_norm": 2.7063024044036865, "kl": 1.0927734375, "learning_rate": 2.0730776160846853e-07, "loss": 0.0437, "reward": -0.3006215952336788, "reward_std": 0.27692657709121704, "rewards/cosine_scaled_reward": -0.15031079947948456, "rewards/format_reward": 0.0, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6, "grad_norm": 2.469496726989746, "kl": 1.732421875, "learning_rate": 2.0528000059645995e-07, "loss": 0.0693, "reward": -0.36928267031908035, "reward_std": 0.30984392017126083, "rewards/cosine_scaled_reward": -0.18464133515954018, "rewards/format_reward": 0.0, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 1522.3095397949219, "epoch": 1.604, "grad_norm": 2.855372190475464, "kl": 1.845703125, "learning_rate": 2.032690407508949e-07, "loss": 0.0636, "reward": -0.38443852961063385, "reward_std": 0.28470365703105927, "rewards/cosine_scaled_reward": -0.19221926480531693, "rewards/format_reward": 0.0, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.608, "grad_norm": 3.3847217559814453, "kl": 2.0390625, "learning_rate": 2.0127498008311922e-07, "loss": 0.0814, "reward": -0.3252910152077675, "reward_std": 0.2982725724577904, "rewards/cosine_scaled_reward": -0.16264550015330315, "rewards/format_reward": 0.0, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.612, "grad_norm": 3.0226523876190186, "kl": 1.81640625, "learning_rate": 1.9929791578083655e-07, "loss": 0.0727, "reward": -0.3527565225958824, "reward_std": 0.30437447875738144, "rewards/cosine_scaled_reward": -0.1763782650232315, "rewards/format_reward": 0.0, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.616, "grad_norm": 2.866734743118286, "kl": 1.7890625, "learning_rate": 1.9733794420337213e-07, "loss": 0.0716, "reward": -0.3746185079216957, "reward_std": 0.3078552633523941, "rewards/cosine_scaled_reward": -0.18730924278497696, "rewards/format_reward": 0.0, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.62, "grad_norm": 3.9170870780944824, "kl": 1.970703125, "learning_rate": 1.9539516087697517e-07, "loss": 0.0789, "reward": -0.41533301770687103, "reward_std": 0.3027655556797981, "rewards/cosine_scaled_reward": -0.20766650885343552, "rewards/format_reward": 0.0, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.624, "grad_norm": 3.470655679702759, "kl": 1.845703125, "learning_rate": 1.934696604901642e-07, "loss": 0.0738, "reward": -0.3191938251256943, "reward_std": 0.28303690254688263, "rewards/cosine_scaled_reward": -0.15959692373871803, "rewards/format_reward": 0.0, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6280000000000001, "grad_norm": 3.623340368270874, "kl": 1.31640625, "learning_rate": 1.915615368891117e-07, "loss": 0.0526, "reward": -0.3123548626899719, "reward_std": 0.29499682784080505, "rewards/cosine_scaled_reward": -0.15617743134498596, "rewards/format_reward": 0.0, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6320000000000001, "grad_norm": 2.282514810562134, "kl": 1.267578125, "learning_rate": 1.8967088307307e-07, "loss": 0.0507, "reward": -0.39642050117254257, "reward_std": 0.311983872205019, "rewards/cosine_scaled_reward": -0.19821025803685188, "rewards/format_reward": 0.0, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6360000000000001, "grad_norm": 2.5232083797454834, "kl": 1.681640625, "learning_rate": 1.8779779118983867e-07, "loss": 0.0672, "reward": -0.33888739347457886, "reward_std": 0.28087718039751053, "rewards/cosine_scaled_reward": -0.16944369673728943, "rewards/format_reward": 0.0, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6400000000000001, "grad_norm": 3.886439085006714, "kl": 2.09765625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0838, "reward": -0.38627707213163376, "reward_std": 0.33190976083278656, "rewards/cosine_scaled_reward": -0.19313853234052658, "rewards/format_reward": 0.0, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6440000000000001, "grad_norm": 3.090627670288086, "kl": 2.140625, "learning_rate": 1.8410465752883758e-07, "loss": 0.0857, "reward": -0.3793156296014786, "reward_std": 0.30717378109693527, "rewards/cosine_scaled_reward": -0.1896577998995781, "rewards/format_reward": 0.0, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6480000000000001, "grad_norm": 3.867506980895996, "kl": 1.880859375, "learning_rate": 1.822847957491922e-07, "loss": 0.0753, "reward": -0.3565782457590103, "reward_std": 0.3352038711309433, "rewards/cosine_scaled_reward": -0.17828912287950516, "rewards/format_reward": 0.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6520000000000001, "grad_norm": 2.388094902038574, "kl": 1.751953125, "learning_rate": 1.804828558898332e-07, "loss": 0.0701, "reward": -0.3393707424402237, "reward_std": 0.3029238283634186, "rewards/cosine_scaled_reward": -0.16968537122011185, "rewards/format_reward": 0.0, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6560000000000001, "grad_norm": 2.5263466835021973, "kl": 1.748046875, "learning_rate": 1.7869892577476722e-07, "loss": 0.0698, "reward": -0.4274343103170395, "reward_std": 0.3449402078986168, "rewards/cosine_scaled_reward": -0.21371715888381004, "rewards/format_reward": 0.0, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6600000000000001, "grad_norm": 2.3268003463745117, "kl": 1.400390625, "learning_rate": 1.7693309235023127e-07, "loss": 0.0559, "reward": -0.3480057269334793, "reward_std": 0.29953421652317047, "rewards/cosine_scaled_reward": -0.17400285601615906, "rewards/format_reward": 0.0, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6640000000000001, "grad_norm": 3.2503533363342285, "kl": 1.9140625, "learning_rate": 1.7518544168045524e-07, "loss": 0.0767, "reward": -0.36937638372182846, "reward_std": 0.31766701489686966, "rewards/cosine_scaled_reward": -0.18468819558620453, "rewards/format_reward": 0.0, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6680000000000001, "grad_norm": 2.9895646572113037, "kl": 2.1796875, "learning_rate": 1.7345605894346726e-07, "loss": 0.0871, "reward": -0.3985458239912987, "reward_std": 0.33385203033685684, "rewards/cosine_scaled_reward": -0.19927291199564934, "rewards/format_reward": 0.0, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6720000000000002, "grad_norm": 3.2457692623138428, "kl": 1.71875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0687, "reward": -0.2603262776392512, "reward_std": 0.3040950074791908, "rewards/cosine_scaled_reward": -0.13016314181732014, "rewards/format_reward": 0.0, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6760000000000002, "grad_norm": 2.8391411304473877, "kl": 1.798828125, "learning_rate": 1.7005243352409333e-07, "loss": 0.072, "reward": -0.2663672436028719, "reward_std": 0.29912005364894867, "rewards/cosine_scaled_reward": -0.13318362249992788, "rewards/format_reward": 0.0, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6800000000000002, "grad_norm": 3.1057238578796387, "kl": 1.5, "learning_rate": 1.6837835672960831e-07, "loss": 0.06, "reward": -0.34882377088069916, "reward_std": 0.3601520508527756, "rewards/cosine_scaled_reward": -0.17441189289093018, "rewards/format_reward": 0.0, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.6840000000000002, "grad_norm": 2.243816375732422, "kl": 1.541015625, "learning_rate": 1.6672287963562852e-07, "loss": 0.0616, "reward": -0.3832622766494751, "reward_std": 0.3413049802184105, "rewards/cosine_scaled_reward": -0.19163113832473755, "rewards/format_reward": 0.0, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.688, "grad_norm": 3.76218581199646, "kl": 1.880859375, "learning_rate": 1.6508608292777203e-07, "loss": 0.0752, "reward": -0.3700753226876259, "reward_std": 0.31324099004268646, "rewards/cosine_scaled_reward": -0.18503766134381294, "rewards/format_reward": 0.0, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.692, "grad_norm": 4.034151554107666, "kl": 1.70703125, "learning_rate": 1.6346804638120098e-07, "loss": 0.0682, "reward": -0.29791881144046783, "reward_std": 0.2801155336201191, "rewards/cosine_scaled_reward": -0.14895940944552422, "rewards/format_reward": 0.0, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.696, "grad_norm": 3.041618824005127, "kl": 1.81640625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0725, "reward": -0.32316526770591736, "reward_std": 0.2970619350671768, "rewards/cosine_scaled_reward": -0.16158264502882957, "rewards/format_reward": 0.0, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.7, "grad_norm": 4.081668376922607, "kl": 1.4453125, "learning_rate": 1.6028856829700258e-07, "loss": 0.0576, "reward": -0.3476375713944435, "reward_std": 0.294509120285511, "rewards/cosine_scaled_reward": -0.17381878197193146, "rewards/format_reward": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.704, "grad_norm": 3.166949510574341, "kl": 2.1015625, "learning_rate": 1.5872728172265146e-07, "loss": 0.0841, "reward": -0.3467593193054199, "reward_std": 0.30388573557138443, "rewards/cosine_scaled_reward": -0.17337966337800026, "rewards/format_reward": 0.0, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.708, "grad_norm": 4.211978435516357, "kl": 1.763671875, "learning_rate": 1.5718506522858572e-07, "loss": 0.0705, "reward": -0.3505774810910225, "reward_std": 0.30420946329832077, "rewards/cosine_scaled_reward": -0.17528874799609184, "rewards/format_reward": 0.0, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.712, "grad_norm": 4.166502952575684, "kl": 2.158203125, "learning_rate": 1.5566199398026147e-07, "loss": 0.0863, "reward": -0.361857570707798, "reward_std": 0.30119316279888153, "rewards/cosine_scaled_reward": -0.1809287928044796, "rewards/format_reward": 0.0, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.716, "grad_norm": 2.8889896869659424, "kl": 1.8671875, "learning_rate": 1.5415814221002265e-07, "loss": 0.0745, "reward": -0.32126056402921677, "reward_std": 0.27691005170345306, "rewards/cosine_scaled_reward": -0.16063029691576958, "rewards/format_reward": 0.0, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.72, "grad_norm": 3.3025801181793213, "kl": 1.904296875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0761, "reward": -0.36847078800201416, "reward_std": 0.3445659205317497, "rewards/cosine_scaled_reward": -0.18423539400100708, "rewards/format_reward": 0.0, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.724, "grad_norm": 3.0440969467163086, "kl": 1.75, "learning_rate": 1.5120838934595337e-07, "loss": 0.07, "reward": -0.36113734543323517, "reward_std": 0.3412683606147766, "rewards/cosine_scaled_reward": -0.18056866899132729, "rewards/format_reward": 0.0, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1530.952392578125, "epoch": 1.728, "grad_norm": 2.575627326965332, "kl": 1.689453125, "learning_rate": 1.4976263201891613e-07, "loss": 0.0716, "reward": -0.3095761463046074, "reward_std": 0.32323335483670235, "rewards/cosine_scaled_reward": -0.1547880806028843, "rewards/format_reward": 0.0, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.732, "grad_norm": 3.186289072036743, "kl": 1.91015625, "learning_rate": 1.483363816965435e-07, "loss": 0.0765, "reward": -0.39015311002731323, "reward_std": 0.3067055642604828, "rewards/cosine_scaled_reward": -0.19507654383778572, "rewards/format_reward": 0.0, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.736, "grad_norm": 3.0739073753356934, "kl": 2.369140625, "learning_rate": 1.469297078922642e-07, "loss": 0.0946, "reward": -0.29091550246812403, "reward_std": 0.30687109380960464, "rewards/cosine_scaled_reward": -0.14545774972066283, "rewards/format_reward": 0.0, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.74, "grad_norm": 5.0029778480529785, "kl": 1.759765625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0703, "reward": -0.34431006759405136, "reward_std": 0.27501973509788513, "rewards/cosine_scaled_reward": -0.17215503007173538, "rewards/format_reward": 0.0, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.744, "grad_norm": 5.139548301696777, "kl": 1.8203125, "learning_rate": 1.4417536311769885e-07, "loss": 0.0728, "reward": -0.31318235397338867, "reward_std": 0.2976163923740387, "rewards/cosine_scaled_reward": -0.15659117698669434, "rewards/format_reward": 0.0, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.748, "grad_norm": 2.881143808364868, "kl": 1.626953125, "learning_rate": 1.4282782639029128e-07, "loss": 0.065, "reward": -0.3547092080116272, "reward_std": 0.28170817345380783, "rewards/cosine_scaled_reward": -0.1773546040058136, "rewards/format_reward": 0.0, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.752, "grad_norm": 2.4268362522125244, "kl": 1.9609375, "learning_rate": 1.4150013466019114e-07, "loss": 0.0786, "reward": -0.3464732989668846, "reward_std": 0.3199189677834511, "rewards/cosine_scaled_reward": -0.173236645758152, "rewards/format_reward": 0.0, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.756, "grad_norm": 2.686417579650879, "kl": 2.318359375, "learning_rate": 1.4019235263722034e-07, "loss": 0.0926, "reward": -0.3557046577334404, "reward_std": 0.3187018297612667, "rewards/cosine_scaled_reward": -0.1778523214161396, "rewards/format_reward": 0.0, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.76, "grad_norm": 4.9666876792907715, "kl": 1.4619140625, "learning_rate": 1.3890454406082956e-07, "loss": 0.0584, "reward": -0.3234737552702427, "reward_std": 0.32776766270399094, "rewards/cosine_scaled_reward": -0.16173688508570194, "rewards/format_reward": 0.0, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.764, "grad_norm": 4.106746196746826, "kl": 2.49609375, "learning_rate": 1.3763677169699217e-07, "loss": 0.0999, "reward": -0.4192545562982559, "reward_std": 0.33375757187604904, "rewards/cosine_scaled_reward": -0.20962728559970856, "rewards/format_reward": 0.0, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 1534.8690490722656, "epoch": 1.768, "grad_norm": 2.842816114425659, "kl": 2.2578125, "learning_rate": 1.3638909733514452e-07, "loss": 0.0898, "reward": -0.3652210012078285, "reward_std": 0.3345082625746727, "rewards/cosine_scaled_reward": -0.18261050805449486, "rewards/format_reward": 0.0, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.772, "grad_norm": 3.186333179473877, "kl": 2.375, "learning_rate": 1.351615817851748e-07, "loss": 0.0947, "reward": -0.40324729681015015, "reward_std": 0.32466883957386017, "rewards/cosine_scaled_reward": -0.20162366330623627, "rewards/format_reward": 0.0, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.776, "grad_norm": 4.4096360206604, "kl": 2.99609375, "learning_rate": 1.3395428487445914e-07, "loss": 0.1197, "reward": -0.3327697291970253, "reward_std": 0.3282741829752922, "rewards/cosine_scaled_reward": -0.16638486459851265, "rewards/format_reward": 0.0, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.78, "grad_norm": 2.8214669227600098, "kl": 1.8623046875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0746, "reward": -0.39069636911153793, "reward_std": 0.33478184044361115, "rewards/cosine_scaled_reward": -0.19534818828105927, "rewards/format_reward": 0.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.784, "grad_norm": 2.96333646774292, "kl": 1.828125, "learning_rate": 1.316005813502869e-07, "loss": 0.073, "reward": -0.34233053401112556, "reward_std": 0.30314670503139496, "rewards/cosine_scaled_reward": -0.17116525955498219, "rewards/format_reward": 0.0, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.788, "grad_norm": 2.538837432861328, "kl": 1.615234375, "learning_rate": 1.3045428945301953e-07, "loss": 0.0647, "reward": -0.2668099580332637, "reward_std": 0.3087245300412178, "rewards/cosine_scaled_reward": -0.1334049835568294, "rewards/format_reward": 0.0, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.792, "grad_norm": 6.922802925109863, "kl": 1.9208984375, "learning_rate": 1.2932844562179352e-07, "loss": 0.0768, "reward": -0.3690221831202507, "reward_std": 0.3130299560725689, "rewards/cosine_scaled_reward": -0.18451109528541565, "rewards/format_reward": 0.0, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.796, "grad_norm": 3.2286629676818848, "kl": 1.990234375, "learning_rate": 1.2822310472864885e-07, "loss": 0.0795, "reward": -0.32342398166656494, "reward_std": 0.3065089136362076, "rewards/cosine_scaled_reward": -0.16171199083328247, "rewards/format_reward": 0.0, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8, "grad_norm": 3.7653493881225586, "kl": 1.904296875, "learning_rate": 1.2713832064634125e-07, "loss": 0.0763, "reward": -0.4029879495501518, "reward_std": 0.31490693986415863, "rewards/cosine_scaled_reward": -0.2014939747750759, "rewards/format_reward": 0.0, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.804, "grad_norm": 3.4150803089141846, "kl": 2.150390625, "learning_rate": 1.260741462457165e-07, "loss": 0.086, "reward": -0.3429009020328522, "reward_std": 0.29108157753944397, "rewards/cosine_scaled_reward": -0.1714504510164261, "rewards/format_reward": 0.0, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.808, "grad_norm": 4.145492076873779, "kl": 2.2421875, "learning_rate": 1.2503063339313356e-07, "loss": 0.0897, "reward": -0.42198269814252853, "reward_std": 0.3363164961338043, "rewards/cosine_scaled_reward": -0.21099134907126427, "rewards/format_reward": 0.0, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.812, "grad_norm": 4.779297351837158, "kl": 2.228515625, "learning_rate": 1.2400783294793668e-07, "loss": 0.0891, "reward": -0.3492959663271904, "reward_std": 0.2949202358722687, "rewards/cosine_scaled_reward": -0.1746479757130146, "rewards/format_reward": 0.0, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8159999999999998, "grad_norm": 2.905301570892334, "kl": 1.265625, "learning_rate": 1.2300579475997657e-07, "loss": 0.0506, "reward": -0.2935212664306164, "reward_std": 0.26374514773488045, "rewards/cosine_scaled_reward": -0.1467606294900179, "rewards/format_reward": 0.0, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8199999999999998, "grad_norm": 2.7079851627349854, "kl": 2.1337890625, "learning_rate": 1.220245676671809e-07, "loss": 0.0853, "reward": -0.3475092798471451, "reward_std": 0.30007384717464447, "rewards/cosine_scaled_reward": -0.17375463247299194, "rewards/format_reward": 0.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8239999999999998, "grad_norm": 2.6113271713256836, "kl": 1.6376953125, "learning_rate": 1.2106419949317388e-07, "loss": 0.0654, "reward": -0.330677293241024, "reward_std": 0.3133997842669487, "rewards/cosine_scaled_reward": -0.1653386428952217, "rewards/format_reward": 0.0, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8279999999999998, "grad_norm": 2.7393922805786133, "kl": 1.666015625, "learning_rate": 1.2012473704494537e-07, "loss": 0.0668, "reward": -0.3434924744069576, "reward_std": 0.3196050524711609, "rewards/cosine_scaled_reward": -0.17174622975289822, "rewards/format_reward": 0.0, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8319999999999999, "grad_norm": 4.49023962020874, "kl": 2.34375, "learning_rate": 1.1920622611056974e-07, "loss": 0.0938, "reward": -0.34944383054971695, "reward_std": 0.3238733857870102, "rewards/cosine_scaled_reward": -0.17472190782427788, "rewards/format_reward": 0.0, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8359999999999999, "grad_norm": 2.3561832904815674, "kl": 1.4501953125, "learning_rate": 1.1830871145697412e-07, "loss": 0.0579, "reward": -0.3565739244222641, "reward_std": 0.3099294453859329, "rewards/cosine_scaled_reward": -0.17828696221113205, "rewards/format_reward": 0.0, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8399999999999999, "grad_norm": 3.1239490509033203, "kl": 1.8984375, "learning_rate": 1.1743223682775649e-07, "loss": 0.0759, "reward": -0.3478566035628319, "reward_std": 0.28794750943779945, "rewards/cosine_scaled_reward": -0.17392829060554504, "rewards/format_reward": 0.0, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8439999999999999, "grad_norm": 2.673818826675415, "kl": 1.740234375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0695, "reward": -0.339593730866909, "reward_std": 0.3045819625258446, "rewards/cosine_scaled_reward": -0.1697968691587448, "rewards/format_reward": 0.0, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8479999999999999, "grad_norm": 3.220402479171753, "kl": 1.626953125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0651, "reward": -0.36886321753263474, "reward_std": 0.26985886320471764, "rewards/cosine_scaled_reward": -0.18443159759044647, "rewards/format_reward": 0.0, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1530.4642944335938, "epoch": 1.8519999999999999, "grad_norm": 2.8002877235412598, "kl": 2.23828125, "learning_rate": 1.1492947512799328e-07, "loss": 0.0941, "reward": -0.31243710219860077, "reward_std": 0.3104839473962784, "rewards/cosine_scaled_reward": -0.15621854737401009, "rewards/format_reward": 0.0, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8559999999999999, "grad_norm": 3.3076934814453125, "kl": 2.455078125, "learning_rate": 1.1413757749211602e-07, "loss": 0.098, "reward": -0.3408735916018486, "reward_std": 0.3259742558002472, "rewards/cosine_scaled_reward": -0.1704367958009243, "rewards/format_reward": 0.0, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8599999999999999, "grad_norm": 4.302088737487793, "kl": 2.005859375, "learning_rate": 1.1336692317580158e-07, "loss": 0.0802, "reward": -0.3594956621527672, "reward_std": 0.32260415703058243, "rewards/cosine_scaled_reward": -0.1797478273510933, "rewards/format_reward": 0.0, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8639999999999999, "grad_norm": 4.171574115753174, "kl": 2.490234375, "learning_rate": 1.1261754973965422e-07, "loss": 0.0995, "reward": -0.3996199369430542, "reward_std": 0.30815524607896805, "rewards/cosine_scaled_reward": -0.1998099721968174, "rewards/format_reward": 0.0, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8679999999999999, "grad_norm": 3.7009289264678955, "kl": 1.841796875, "learning_rate": 1.1188949370707787e-07, "loss": 0.0738, "reward": -0.3371664360165596, "reward_std": 0.3329595774412155, "rewards/cosine_scaled_reward": -0.1685832180082798, "rewards/format_reward": 0.0, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.8719999999999999, "grad_norm": 2.592533826828003, "kl": 2.251953125, "learning_rate": 1.1118279056249653e-07, "loss": 0.0901, "reward": -0.34844203293323517, "reward_std": 0.322611540555954, "rewards/cosine_scaled_reward": -0.1742210052907467, "rewards/format_reward": 0.0, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.876, "grad_norm": 4.633761405944824, "kl": 1.64013671875, "learning_rate": 1.1049747474962444e-07, "loss": 0.0656, "reward": -0.3193807154893875, "reward_std": 0.26448768377304077, "rewards/cosine_scaled_reward": -0.15969035774469376, "rewards/format_reward": 0.0, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.88, "grad_norm": 3.101719617843628, "kl": 2.033203125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0812, "reward": -0.3662792518734932, "reward_std": 0.32248761504888535, "rewards/cosine_scaled_reward": -0.1831396110355854, "rewards/format_reward": 0.0, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.884, "grad_norm": 2.580354690551758, "kl": 1.607421875, "learning_rate": 1.0919113768029517e-07, "loss": 0.0643, "reward": -0.34900667518377304, "reward_std": 0.31430666893720627, "rewards/cosine_scaled_reward": -0.17450333759188652, "rewards/format_reward": 0.0, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.888, "grad_norm": 2.7384796142578125, "kl": 1.8046875, "learning_rate": 1.0857018009286381e-07, "loss": 0.0722, "reward": -0.32778534665703773, "reward_std": 0.3321828171610832, "rewards/cosine_scaled_reward": -0.16389267705380917, "rewards/format_reward": 0.0, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.892, "grad_norm": 3.759181499481201, "kl": 2.017578125, "learning_rate": 1.0797073717209013e-07, "loss": 0.0807, "reward": -0.32047825306653976, "reward_std": 0.28816820681095123, "rewards/cosine_scaled_reward": -0.16023912653326988, "rewards/format_reward": 0.0, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.896, "grad_norm": 2.8909876346588135, "kl": 1.603515625, "learning_rate": 1.0739283813397639e-07, "loss": 0.0642, "reward": -0.3390325605869293, "reward_std": 0.3011201545596123, "rewards/cosine_scaled_reward": -0.16951627284288406, "rewards/format_reward": 0.0, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.9, "grad_norm": 2.3281497955322266, "kl": 2.0234375, "learning_rate": 1.068365111445064e-07, "loss": 0.081, "reward": -0.36704741418361664, "reward_std": 0.3062589168548584, "rewards/cosine_scaled_reward": -0.18352371081709862, "rewards/format_reward": 0.0, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.904, "grad_norm": 3.56882643699646, "kl": 2.515625, "learning_rate": 1.063017833182728e-07, "loss": 0.1008, "reward": -0.39511261135339737, "reward_std": 0.3128170743584633, "rewards/cosine_scaled_reward": -0.19755630940198898, "rewards/format_reward": 0.0, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.908, "grad_norm": 2.958406925201416, "kl": 1.755859375, "learning_rate": 1.0578868071715544e-07, "loss": 0.0702, "reward": -0.3462023660540581, "reward_std": 0.322578527033329, "rewards/cosine_scaled_reward": -0.17310118675231934, "rewards/format_reward": 0.0, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.912, "grad_norm": 3.044797897338867, "kl": 2.375, "learning_rate": 1.0529722834905125e-07, "loss": 0.095, "reward": -0.3144143670797348, "reward_std": 0.29915551096200943, "rewards/cosine_scaled_reward": -0.1572071835398674, "rewards/format_reward": 0.0, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.916, "grad_norm": 4.031872272491455, "kl": 2.640625, "learning_rate": 1.0482745016665526e-07, "loss": 0.1057, "reward": -0.3763216808438301, "reward_std": 0.3211255893111229, "rewards/cosine_scaled_reward": -0.18816084042191505, "rewards/format_reward": 0.0, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.92, "grad_norm": 2.3054392337799072, "kl": 1.3173828125, "learning_rate": 1.0437936906629334e-07, "loss": 0.0528, "reward": -0.2678487957455218, "reward_std": 0.2627658285200596, "rewards/cosine_scaled_reward": -0.13392440509051085, "rewards/format_reward": 0.0, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.924, "grad_norm": 3.41572642326355, "kl": 1.353515625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0541, "reward": -0.35157452523708344, "reward_std": 0.3239835053682327, "rewards/cosine_scaled_reward": -0.17578726634383202, "rewards/format_reward": 0.0, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.928, "grad_norm": 2.691436290740967, "kl": 2.041015625, "learning_rate": 1.0354838440848501e-07, "loss": 0.0816, "reward": -0.39503272622823715, "reward_std": 0.3050593361258507, "rewards/cosine_scaled_reward": -0.19751636311411858, "rewards/format_reward": 0.0, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.932, "grad_norm": 2.859536647796631, "kl": 1.494140625, "learning_rate": 1.0316552135205837e-07, "loss": 0.0599, "reward": -0.395970955491066, "reward_std": 0.27583859115839005, "rewards/cosine_scaled_reward": -0.197985477745533, "rewards/format_reward": 0.0, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.936, "grad_norm": 2.9280340671539307, "kl": 1.765625, "learning_rate": 1.0280443637773163e-07, "loss": 0.0708, "reward": -0.2913724035024643, "reward_std": 0.2617946192622185, "rewards/cosine_scaled_reward": -0.14568619430065155, "rewards/format_reward": 0.0, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.94, "grad_norm": 2.2830445766448975, "kl": 1.2158203125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0487, "reward": -0.3095552623271942, "reward_std": 0.292842835187912, "rewards/cosine_scaled_reward": -0.1547776274383068, "rewards/format_reward": 0.0, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.944, "grad_norm": 3.141052007675171, "kl": 1.3427734375, "learning_rate": 1.0214767000817596e-07, "loss": 0.0537, "reward": -0.32299425452947617, "reward_std": 0.29863065481185913, "rewards/cosine_scaled_reward": -0.16149712353944778, "rewards/format_reward": 0.0, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.948, "grad_norm": 3.97387433052063, "kl": 1.931640625, "learning_rate": 1.0185202062281336e-07, "loss": 0.0773, "reward": -0.3765959292650223, "reward_std": 0.3192542716860771, "rewards/cosine_scaled_reward": -0.18829796463251114, "rewards/format_reward": 0.0, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.952, "grad_norm": 2.656202554702759, "kl": 1.578125, "learning_rate": 1.0157821333772304e-07, "loss": 0.0631, "reward": -0.31205643340945244, "reward_std": 0.31670553237199783, "rewards/cosine_scaled_reward": -0.15602822043001652, "rewards/format_reward": 0.0, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.956, "grad_norm": 3.296848773956299, "kl": 1.16796875, "learning_rate": 1.013262614978859e-07, "loss": 0.0468, "reward": -0.3039631359279156, "reward_std": 0.27847766503691673, "rewards/cosine_scaled_reward": -0.15198157727718353, "rewards/format_reward": 0.0, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.96, "grad_norm": 4.522839546203613, "kl": 1.8203125, "learning_rate": 1.0109617738307911e-07, "loss": 0.0728, "reward": -0.34008362144231796, "reward_std": 0.29262910783290863, "rewards/cosine_scaled_reward": -0.17004182189702988, "rewards/format_reward": 0.0, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.964, "grad_norm": 2.311014175415039, "kl": 2.244140625, "learning_rate": 1.0088797220727779e-07, "loss": 0.0898, "reward": -0.34849604219198227, "reward_std": 0.3044138178229332, "rewards/cosine_scaled_reward": -0.17424802854657173, "rewards/format_reward": 0.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.968, "grad_norm": 2.6442465782165527, "kl": 1.998046875, "learning_rate": 1.0070165611810855e-07, "loss": 0.0799, "reward": -0.34308964014053345, "reward_std": 0.3727850690484047, "rewards/cosine_scaled_reward": -0.17154482379555702, "rewards/format_reward": 0.0, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.972, "grad_norm": 2.6985509395599365, "kl": 1.41796875, "learning_rate": 1.005372381963547e-07, "loss": 0.0567, "reward": -0.3521110415458679, "reward_std": 0.30227896198630333, "rewards/cosine_scaled_reward": -0.17605552449822426, "rewards/format_reward": 0.0, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.976, "grad_norm": 3.240550994873047, "kl": 1.904296875, "learning_rate": 1.0039472645551372e-07, "loss": 0.076, "reward": -0.3422994837164879, "reward_std": 0.3251089081168175, "rewards/cosine_scaled_reward": -0.17114974185824394, "rewards/format_reward": 0.0, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.98, "grad_norm": 4.803572177886963, "kl": 3.177734375, "learning_rate": 1.002741278414069e-07, "loss": 0.1272, "reward": -0.3737839311361313, "reward_std": 0.3232840970158577, "rewards/cosine_scaled_reward": -0.18689196929335594, "rewards/format_reward": 0.0, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.984, "grad_norm": 2.532582998275757, "kl": 1.9375, "learning_rate": 1.0017544823184055e-07, "loss": 0.0776, "reward": -0.374487929046154, "reward_std": 0.32537975162267685, "rewards/cosine_scaled_reward": -0.187243964523077, "rewards/format_reward": 0.0, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.988, "grad_norm": 2.6129701137542725, "kl": 2.2734375, "learning_rate": 1.0009869243631952e-07, "loss": 0.091, "reward": -0.3434004709124565, "reward_std": 0.32708871364593506, "rewards/cosine_scaled_reward": -0.17170023545622826, "rewards/format_reward": 0.0, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.992, "grad_norm": 4.10455322265625, "kl": 1.595703125, "learning_rate": 1.000438641958131e-07, "loss": 0.0638, "reward": -0.3211556486785412, "reward_std": 0.2905324958264828, "rewards/cosine_scaled_reward": -0.1605778243392706, "rewards/format_reward": 0.0, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1536.0, "epoch": 1.996, "grad_norm": 2.7520267963409424, "kl": 1.892578125, "learning_rate": 1.0001096618257236e-07, "loss": 0.0756, "reward": -0.35309676826000214, "reward_std": 0.31401190161705017, "rewards/cosine_scaled_reward": -0.17654838413000107, "rewards/format_reward": 0.0, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1536.0001220703125, "epoch": 2.0, "grad_norm": 2.9658398628234863, "kl": 1.7763671875, "learning_rate": 1e-07, "loss": 0.0711, "reward": -0.343311108648777, "reward_std": 0.28952478244900703, "rewards/cosine_scaled_reward": -0.1716555580496788, "rewards/format_reward": 0.0, "step": 500 }, { "epoch": 2.0, "step": 500, "total_flos": 0.0, "train_loss": 0.05846181693652478, "train_runtime": 107214.2293, "train_samples_per_second": 0.783, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }