| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.004, | |
| "grad_norm": 4.214743137359619, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": -0.0, | |
| "reward": -0.572140134871006, | |
| "reward_std": 0.3359133452177048, | |
| "rewards/cosine_scaled_reward": -0.286070067435503, | |
| "rewards/format_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.008, | |
| "grad_norm": 3.178635597229004, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": -0.0, | |
| "reward": -0.6001544743776321, | |
| "reward_std": 0.33404429256916046, | |
| "rewards/cosine_scaled_reward": -0.30007724463939667, | |
| "rewards/format_reward": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.012, | |
| "grad_norm": 4.78328800201416, | |
| "kl": 6.908178329467773e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": -0.502997636795044, | |
| "reward_std": 0.3310435339808464, | |
| "rewards/cosine_scaled_reward": -0.251498818397522, | |
| "rewards/format_reward": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.016, | |
| "grad_norm": 3.9194376468658447, | |
| "kl": 6.488710641860962e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": -0.5549568086862564, | |
| "reward_std": 0.3469474986195564, | |
| "rewards/cosine_scaled_reward": -0.2774783968925476, | |
| "rewards/format_reward": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.02, | |
| "grad_norm": 3.903712511062622, | |
| "kl": 5.97834587097168e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": -0.5800392031669617, | |
| "reward_std": 0.35274410992860794, | |
| "rewards/cosine_scaled_reward": -0.29001960158348083, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.024, | |
| "grad_norm": 3.738009452819824, | |
| "kl": 6.499886512756348e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.5155884921550751, | |
| "reward_std": 0.37037966400384903, | |
| "rewards/cosine_scaled_reward": -0.25779424607753754, | |
| "rewards/format_reward": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.028, | |
| "grad_norm": 2.794049024581909, | |
| "kl": 5.620718002319336e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.5175943374633789, | |
| "reward_std": 0.3494645953178406, | |
| "rewards/cosine_scaled_reward": -0.25879716128110886, | |
| "rewards/format_reward": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.032, | |
| "grad_norm": 2.484722852706909, | |
| "kl": 8.106231689453125e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.5301882103085518, | |
| "reward_std": 0.3405821621417999, | |
| "rewards/cosine_scaled_reward": -0.2650941051542759, | |
| "rewards/format_reward": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.036, | |
| "grad_norm": 3.1448230743408203, | |
| "kl": 7.554888725280762e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.5024237409234047, | |
| "reward_std": 0.3572370335459709, | |
| "rewards/cosine_scaled_reward": -0.25121185183525085, | |
| "rewards/format_reward": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.04, | |
| "grad_norm": 4.125906944274902, | |
| "kl": 8.666515350341797e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": -0.5732719898223877, | |
| "reward_std": 0.37079156190156937, | |
| "rewards/cosine_scaled_reward": -0.28663600236177444, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.044, | |
| "grad_norm": 4.4225945472717285, | |
| "kl": 5.561113357543945e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": -0.5889493525028229, | |
| "reward_std": 0.3473696708679199, | |
| "rewards/cosine_scaled_reward": -0.29447468370199203, | |
| "rewards/format_reward": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.048, | |
| "grad_norm": 3.891627550125122, | |
| "kl": 7.808208465576172e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.5409628972411156, | |
| "reward_std": 0.326653391122818, | |
| "rewards/cosine_scaled_reward": -0.2704814486205578, | |
| "rewards/format_reward": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.052, | |
| "grad_norm": 3.552539587020874, | |
| "kl": 7.30752944946289e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.5389444306492805, | |
| "reward_std": 0.3649257719516754, | |
| "rewards/cosine_scaled_reward": -0.2694722190499306, | |
| "rewards/format_reward": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.056, | |
| "grad_norm": 2.781034231185913, | |
| "kl": 7.081031799316406e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.6049635112285614, | |
| "reward_std": 0.3185788542032242, | |
| "rewards/cosine_scaled_reward": -0.3024817630648613, | |
| "rewards/format_reward": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.06, | |
| "grad_norm": 3.412130355834961, | |
| "kl": 6.335973739624023e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": -0.6299380213022232, | |
| "reward_std": 0.31315718591213226, | |
| "rewards/cosine_scaled_reward": -0.3149690255522728, | |
| "rewards/format_reward": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.064, | |
| "grad_norm": 4.064192771911621, | |
| "kl": 0.00011527538299560547, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.5638149380683899, | |
| "reward_std": 0.3539445400238037, | |
| "rewards/cosine_scaled_reward": -0.28190746903419495, | |
| "rewards/format_reward": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.068, | |
| "grad_norm": 3.5826501846313477, | |
| "kl": 9.000301361083984e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": -0.5815131217241287, | |
| "reward_std": 0.3570765480399132, | |
| "rewards/cosine_scaled_reward": -0.29075656831264496, | |
| "rewards/format_reward": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.072, | |
| "grad_norm": 3.4398193359375, | |
| "kl": 0.00013589859008789062, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.5058030858635902, | |
| "reward_std": 0.3534058630466461, | |
| "rewards/cosine_scaled_reward": -0.2529015429317951, | |
| "rewards/format_reward": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.076, | |
| "grad_norm": 3.1647567749023438, | |
| "kl": 0.00010588765144348145, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": -0.5453799739480019, | |
| "reward_std": 0.3434706851840019, | |
| "rewards/cosine_scaled_reward": -0.27268998324871063, | |
| "rewards/format_reward": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.08, | |
| "grad_norm": 4.028233528137207, | |
| "kl": 0.00011265277862548828, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": -0.5725424438714981, | |
| "reward_std": 0.33554956316947937, | |
| "rewards/cosine_scaled_reward": -0.28627122938632965, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.084, | |
| "grad_norm": 3.0403409004211426, | |
| "kl": 0.00015485286712646484, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": -0.5395064353942871, | |
| "reward_std": 0.3414423242211342, | |
| "rewards/cosine_scaled_reward": -0.26975322514772415, | |
| "rewards/format_reward": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.088, | |
| "grad_norm": 3.5831127166748047, | |
| "kl": 0.0006537437438964844, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": -0.5216317698359489, | |
| "reward_std": 0.3427959829568863, | |
| "rewards/cosine_scaled_reward": -0.2608158737421036, | |
| "rewards/format_reward": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.092, | |
| "grad_norm": 3.5175235271453857, | |
| "kl": 0.0010776519775390625, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.5413709655404091, | |
| "reward_std": 0.32718800008296967, | |
| "rewards/cosine_scaled_reward": -0.27068548277020454, | |
| "rewards/format_reward": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.096, | |
| "grad_norm": 3.442873239517212, | |
| "kl": 0.0013303756713867188, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0001, | |
| "reward": -0.5624926462769508, | |
| "reward_std": 0.3581688553094864, | |
| "rewards/cosine_scaled_reward": -0.2812463231384754, | |
| "rewards/format_reward": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.1, | |
| "grad_norm": 2.6114015579223633, | |
| "kl": 0.0016193389892578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0001, | |
| "reward": -0.5309188961982727, | |
| "reward_std": 0.33032629638910294, | |
| "rewards/cosine_scaled_reward": -0.26545944809913635, | |
| "rewards/format_reward": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.104, | |
| "grad_norm": 4.818567752838135, | |
| "kl": 0.0026264190673828125, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0001, | |
| "reward": -0.5884083956480026, | |
| "reward_std": 0.3386874794960022, | |
| "rewards/cosine_scaled_reward": -0.2942042052745819, | |
| "rewards/format_reward": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.108, | |
| "grad_norm": 4.078734397888184, | |
| "kl": 0.002239227294921875, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0001, | |
| "reward": -0.6157089024782181, | |
| "reward_std": 0.3308729752898216, | |
| "rewards/cosine_scaled_reward": -0.30785445868968964, | |
| "rewards/format_reward": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.112, | |
| "grad_norm": 3.4599478244781494, | |
| "kl": 0.002338409423828125, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0001, | |
| "reward": -0.5709060430526733, | |
| "reward_std": 0.3136204034090042, | |
| "rewards/cosine_scaled_reward": -0.28545302152633667, | |
| "rewards/format_reward": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.9464416503906, | |
| "epoch": 0.116, | |
| "grad_norm": 3.461718797683716, | |
| "kl": 0.003444671630859375, | |
| "learning_rate": 5.8e-07, | |
| "loss": -0.001, | |
| "reward": -0.5237472280859947, | |
| "reward_std": 0.3601622208952904, | |
| "rewards/cosine_scaled_reward": -0.26187360659241676, | |
| "rewards/format_reward": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.12, | |
| "grad_norm": 3.7205333709716797, | |
| "kl": 0.00542449951171875, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0002, | |
| "reward": -0.5595864206552505, | |
| "reward_std": 0.3391585499048233, | |
| "rewards/cosine_scaled_reward": -0.2797932103276253, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.124, | |
| "grad_norm": 3.639012575149536, | |
| "kl": 0.0102996826171875, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0004, | |
| "reward": -0.5832120478153229, | |
| "reward_std": 0.34403981268405914, | |
| "rewards/cosine_scaled_reward": -0.29160603135824203, | |
| "rewards/format_reward": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.128, | |
| "grad_norm": 3.499258041381836, | |
| "kl": 0.0159149169921875, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0006, | |
| "reward": -0.5567401573061943, | |
| "reward_std": 0.3353060856461525, | |
| "rewards/cosine_scaled_reward": -0.27837007120251656, | |
| "rewards/format_reward": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.132, | |
| "grad_norm": 3.564453601837158, | |
| "kl": 0.0182952880859375, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0007, | |
| "reward": -0.5521366372704506, | |
| "reward_std": 0.3413034975528717, | |
| "rewards/cosine_scaled_reward": -0.2760683260858059, | |
| "rewards/format_reward": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.136, | |
| "grad_norm": 3.567174196243286, | |
| "kl": 0.0237274169921875, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0009, | |
| "reward": -0.5193822234869003, | |
| "reward_std": 0.35690775513648987, | |
| "rewards/cosine_scaled_reward": -0.25969111174345016, | |
| "rewards/format_reward": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.14, | |
| "grad_norm": 2.247893810272217, | |
| "kl": 0.0149078369140625, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0006, | |
| "reward": -0.5820326581597328, | |
| "reward_std": 0.3510446697473526, | |
| "rewards/cosine_scaled_reward": -0.2910163216292858, | |
| "rewards/format_reward": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.144, | |
| "grad_norm": 2.9316084384918213, | |
| "kl": 0.022552490234375, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0009, | |
| "reward": -0.5632490888237953, | |
| "reward_std": 0.3500733822584152, | |
| "rewards/cosine_scaled_reward": -0.28162455186247826, | |
| "rewards/format_reward": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.148, | |
| "grad_norm": 3.5201869010925293, | |
| "kl": 0.02850341796875, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0011, | |
| "reward": -0.5141241475939751, | |
| "reward_std": 0.3309687077999115, | |
| "rewards/cosine_scaled_reward": -0.25706208124756813, | |
| "rewards/format_reward": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.152, | |
| "grad_norm": 2.7246434688568115, | |
| "kl": 0.0296630859375, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0012, | |
| "reward": -0.5139049887657166, | |
| "reward_std": 0.33319953083992004, | |
| "rewards/cosine_scaled_reward": -0.25695250555872917, | |
| "rewards/format_reward": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.156, | |
| "grad_norm": 2.880594491958618, | |
| "kl": 0.0258636474609375, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.001, | |
| "reward": -0.5646104216575623, | |
| "reward_std": 0.3474426791071892, | |
| "rewards/cosine_scaled_reward": -0.2823052257299423, | |
| "rewards/format_reward": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.16, | |
| "grad_norm": 2.6734988689422607, | |
| "kl": 0.0321044921875, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0013, | |
| "reward": -0.5586390048265457, | |
| "reward_std": 0.3474784344434738, | |
| "rewards/cosine_scaled_reward": -0.27931951731443405, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.164, | |
| "grad_norm": 3.1370785236358643, | |
| "kl": 0.03369140625, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0013, | |
| "reward": -0.5609789937734604, | |
| "reward_std": 0.3450735807418823, | |
| "rewards/cosine_scaled_reward": -0.280489519238472, | |
| "rewards/format_reward": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.168, | |
| "grad_norm": 2.5502073764801025, | |
| "kl": 0.06072998046875, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0024, | |
| "reward": -0.5195748135447502, | |
| "reward_std": 0.34474433213472366, | |
| "rewards/cosine_scaled_reward": -0.2597874030470848, | |
| "rewards/format_reward": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.172, | |
| "grad_norm": 2.1381213665008545, | |
| "kl": 0.067474365234375, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0027, | |
| "reward": -0.5580533072352409, | |
| "reward_std": 0.32987529784440994, | |
| "rewards/cosine_scaled_reward": -0.27902666106820107, | |
| "rewards/format_reward": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.176, | |
| "grad_norm": 2.1730432510375977, | |
| "kl": 0.0958251953125, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0038, | |
| "reward": -0.5585729256272316, | |
| "reward_std": 0.3295438587665558, | |
| "rewards/cosine_scaled_reward": -0.2792864739894867, | |
| "rewards/format_reward": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.18, | |
| "grad_norm": 1.962768316268921, | |
| "kl": 0.079345703125, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0032, | |
| "reward": -0.5980347394943237, | |
| "reward_std": 0.3284436762332916, | |
| "rewards/cosine_scaled_reward": -0.29901736974716187, | |
| "rewards/format_reward": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.184, | |
| "grad_norm": 1.8276231288909912, | |
| "kl": 0.1153564453125, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0046, | |
| "reward": -0.507519856095314, | |
| "reward_std": 0.33579862862825394, | |
| "rewards/cosine_scaled_reward": -0.2537599205970764, | |
| "rewards/format_reward": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.188, | |
| "grad_norm": 2.608023166656494, | |
| "kl": 0.09033203125, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0036, | |
| "reward": -0.5289521142840385, | |
| "reward_std": 0.31808041036129, | |
| "rewards/cosine_scaled_reward": -0.26447605714201927, | |
| "rewards/format_reward": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.192, | |
| "grad_norm": 1.8956966400146484, | |
| "kl": 0.09814453125, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0039, | |
| "reward": -0.566174179315567, | |
| "reward_std": 0.311339795589447, | |
| "rewards/cosine_scaled_reward": -0.2830870673060417, | |
| "rewards/format_reward": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.196, | |
| "grad_norm": 1.7705461978912354, | |
| "kl": 0.1209716796875, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0048, | |
| "reward": -0.528024435043335, | |
| "reward_std": 0.36330366879701614, | |
| "rewards/cosine_scaled_reward": -0.26401223987340927, | |
| "rewards/format_reward": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.2, | |
| "grad_norm": 2.1113531589508057, | |
| "kl": 0.1171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0047, | |
| "reward": -0.4406622089445591, | |
| "reward_std": 0.3163011893630028, | |
| "rewards/cosine_scaled_reward": -0.2203311063349247, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.204, | |
| "grad_norm": 1.803585410118103, | |
| "kl": 0.1026611328125, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0041, | |
| "reward": -0.5815826654434204, | |
| "reward_std": 0.3248438388109207, | |
| "rewards/cosine_scaled_reward": -0.2907913327217102, | |
| "rewards/format_reward": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.208, | |
| "grad_norm": 1.7076486349105835, | |
| "kl": 0.157470703125, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0063, | |
| "reward": -0.5362438708543777, | |
| "reward_std": 0.2975444979965687, | |
| "rewards/cosine_scaled_reward": -0.26812195032835007, | |
| "rewards/format_reward": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.212, | |
| "grad_norm": 2.478224515914917, | |
| "kl": 0.144287109375, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0058, | |
| "reward": -0.47916819900274277, | |
| "reward_std": 0.35621220618486404, | |
| "rewards/cosine_scaled_reward": -0.23958410695195198, | |
| "rewards/format_reward": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.216, | |
| "grad_norm": 2.006901502609253, | |
| "kl": 0.1337890625, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0053, | |
| "reward": -0.5450761765241623, | |
| "reward_std": 0.32576631009578705, | |
| "rewards/cosine_scaled_reward": -0.27253808826208115, | |
| "rewards/format_reward": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.22, | |
| "grad_norm": 2.2259609699249268, | |
| "kl": 0.11669921875, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0047, | |
| "reward": -0.5271478518843651, | |
| "reward_std": 0.34441374242305756, | |
| "rewards/cosine_scaled_reward": -0.26357391849160194, | |
| "rewards/format_reward": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.224, | |
| "grad_norm": 2.020939588546753, | |
| "kl": 0.1907958984375, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0076, | |
| "reward": -0.5367654263973236, | |
| "reward_std": 0.3470792919397354, | |
| "rewards/cosine_scaled_reward": -0.2683827131986618, | |
| "rewards/format_reward": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.228, | |
| "grad_norm": 1.9356812238693237, | |
| "kl": 0.158935546875, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0064, | |
| "reward": -0.505635529756546, | |
| "reward_std": 0.3292393088340759, | |
| "rewards/cosine_scaled_reward": -0.252817764878273, | |
| "rewards/format_reward": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.232, | |
| "grad_norm": 3.2483060359954834, | |
| "kl": 0.188720703125, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0075, | |
| "reward": -0.504822663962841, | |
| "reward_std": 0.35463710874319077, | |
| "rewards/cosine_scaled_reward": -0.2524113282561302, | |
| "rewards/format_reward": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.236, | |
| "grad_norm": 2.2256879806518555, | |
| "kl": 0.205322265625, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0082, | |
| "reward": -0.5851711928844452, | |
| "reward_std": 0.3146449252963066, | |
| "rewards/cosine_scaled_reward": -0.2925856038928032, | |
| "rewards/format_reward": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.24, | |
| "grad_norm": 2.093649387359619, | |
| "kl": 0.198486328125, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0079, | |
| "reward": -0.45284587889909744, | |
| "reward_std": 0.34760017693042755, | |
| "rewards/cosine_scaled_reward": -0.22642293944954872, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.244, | |
| "grad_norm": 2.378591537475586, | |
| "kl": 0.24365234375, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0097, | |
| "reward": -0.5091445297002792, | |
| "reward_std": 0.3452131450176239, | |
| "rewards/cosine_scaled_reward": -0.2545722760260105, | |
| "rewards/format_reward": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.248, | |
| "grad_norm": 2.188553810119629, | |
| "kl": 0.29833984375, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0119, | |
| "reward": -0.47440846264362335, | |
| "reward_std": 0.34785814583301544, | |
| "rewards/cosine_scaled_reward": -0.23720423132181168, | |
| "rewards/format_reward": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.252, | |
| "grad_norm": 2.6211366653442383, | |
| "kl": 0.48095703125, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0192, | |
| "reward": -0.46701501309871674, | |
| "reward_std": 0.3275434151291847, | |
| "rewards/cosine_scaled_reward": -0.23350750654935837, | |
| "rewards/format_reward": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.256, | |
| "grad_norm": 3.608039617538452, | |
| "kl": 0.63720703125, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0254, | |
| "reward": -0.4022144228219986, | |
| "reward_std": 0.3280187249183655, | |
| "rewards/cosine_scaled_reward": -0.2011072114109993, | |
| "rewards/format_reward": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.26, | |
| "grad_norm": 2.1589713096618652, | |
| "kl": 0.587890625, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0236, | |
| "reward": -0.4902011975646019, | |
| "reward_std": 0.33829304575920105, | |
| "rewards/cosine_scaled_reward": -0.24510059878230095, | |
| "rewards/format_reward": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.264, | |
| "grad_norm": 4.391396522521973, | |
| "kl": 0.851806640625, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.034, | |
| "reward": -0.5337588116526604, | |
| "reward_std": 0.3271815627813339, | |
| "rewards/cosine_scaled_reward": -0.2668794058263302, | |
| "rewards/format_reward": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.268, | |
| "grad_norm": 4.296882629394531, | |
| "kl": 0.892333984375, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0357, | |
| "reward": -0.45740216970443726, | |
| "reward_std": 0.32497797161340714, | |
| "rewards/cosine_scaled_reward": -0.22870109230279922, | |
| "rewards/format_reward": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.272, | |
| "grad_norm": 7.224793434143066, | |
| "kl": 1.29736328125, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0519, | |
| "reward": -0.5055549815297127, | |
| "reward_std": 0.3318631425499916, | |
| "rewards/cosine_scaled_reward": -0.25277747586369514, | |
| "rewards/format_reward": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.276, | |
| "grad_norm": 6.747034072875977, | |
| "kl": 1.3232421875, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0531, | |
| "reward": -0.4314222186803818, | |
| "reward_std": 0.31476689875125885, | |
| "rewards/cosine_scaled_reward": -0.21571110002696514, | |
| "rewards/format_reward": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.28, | |
| "grad_norm": 5.5595808029174805, | |
| "kl": 0.8935546875, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0358, | |
| "reward": -0.4758576303720474, | |
| "reward_std": 0.33101003617048264, | |
| "rewards/cosine_scaled_reward": -0.2379288226366043, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.284, | |
| "grad_norm": 2.4482791423797607, | |
| "kl": 0.521484375, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0209, | |
| "reward": -0.4491276890039444, | |
| "reward_std": 0.3567735329270363, | |
| "rewards/cosine_scaled_reward": -0.2245638445019722, | |
| "rewards/format_reward": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.288, | |
| "grad_norm": 3.1987600326538086, | |
| "kl": 0.6240234375, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.025, | |
| "reward": -0.43654023110866547, | |
| "reward_std": 0.3590875416994095, | |
| "rewards/cosine_scaled_reward": -0.21827011927962303, | |
| "rewards/format_reward": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.292, | |
| "grad_norm": 4.885537147521973, | |
| "kl": 1.14599609375, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0458, | |
| "reward": -0.5265215784311295, | |
| "reward_std": 0.3363535851240158, | |
| "rewards/cosine_scaled_reward": -0.26326077431440353, | |
| "rewards/format_reward": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.296, | |
| "grad_norm": 3.4503629207611084, | |
| "kl": 1.14794921875, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0459, | |
| "reward": -0.4836200848221779, | |
| "reward_std": 0.33076073229312897, | |
| "rewards/cosine_scaled_reward": -0.24181004241108894, | |
| "rewards/format_reward": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.3, | |
| "grad_norm": 3.5954651832580566, | |
| "kl": 0.6767578125, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.027, | |
| "reward": -0.5548510551452637, | |
| "reward_std": 0.3006826713681221, | |
| "rewards/cosine_scaled_reward": -0.27742552757263184, | |
| "rewards/format_reward": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.304, | |
| "grad_norm": 2.27148699760437, | |
| "kl": 0.69970703125, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.028, | |
| "reward": -0.5522997975349426, | |
| "reward_std": 0.32217612117528915, | |
| "rewards/cosine_scaled_reward": -0.2761498987674713, | |
| "rewards/format_reward": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.308, | |
| "grad_norm": 2.421114206314087, | |
| "kl": 0.65234375, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0261, | |
| "reward": -0.5491495952010155, | |
| "reward_std": 0.33891358226537704, | |
| "rewards/cosine_scaled_reward": -0.27457480505108833, | |
| "rewards/format_reward": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.312, | |
| "grad_norm": 2.296977996826172, | |
| "kl": 0.4833984375, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0193, | |
| "reward": -0.5332002714276314, | |
| "reward_std": 0.3453890234231949, | |
| "rewards/cosine_scaled_reward": -0.2666001245379448, | |
| "rewards/format_reward": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.316, | |
| "grad_norm": 2.351818084716797, | |
| "kl": 0.5048828125, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0202, | |
| "reward": -0.4974421188235283, | |
| "reward_std": 0.36291657388210297, | |
| "rewards/cosine_scaled_reward": -0.24872105196118355, | |
| "rewards/format_reward": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.32, | |
| "grad_norm": 2.808706521987915, | |
| "kl": 0.53125, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0212, | |
| "reward": -0.5026201903820038, | |
| "reward_std": 0.30610421299934387, | |
| "rewards/cosine_scaled_reward": -0.2513100877404213, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.324, | |
| "grad_norm": 2.077920913696289, | |
| "kl": 0.68994140625, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0276, | |
| "reward": -0.4621705636382103, | |
| "reward_std": 0.33135028183460236, | |
| "rewards/cosine_scaled_reward": -0.23108528181910515, | |
| "rewards/format_reward": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.328, | |
| "grad_norm": 2.951878309249878, | |
| "kl": 0.6015625, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.024, | |
| "reward": -0.5148988738656044, | |
| "reward_std": 0.3465086743235588, | |
| "rewards/cosine_scaled_reward": -0.2574494294822216, | |
| "rewards/format_reward": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.332, | |
| "grad_norm": 2.1016077995300293, | |
| "kl": 0.36376953125, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0145, | |
| "reward": -0.48821673542261124, | |
| "reward_std": 0.35235296189785004, | |
| "rewards/cosine_scaled_reward": -0.24410836026072502, | |
| "rewards/format_reward": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.336, | |
| "grad_norm": 2.276076555252075, | |
| "kl": 0.77734375, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0311, | |
| "reward": -0.509700171649456, | |
| "reward_std": 0.3434828519821167, | |
| "rewards/cosine_scaled_reward": -0.2548500932753086, | |
| "rewards/format_reward": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.34, | |
| "grad_norm": 1.9953871965408325, | |
| "kl": 0.45263671875, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0181, | |
| "reward": -0.5046856477856636, | |
| "reward_std": 0.3276178315281868, | |
| "rewards/cosine_scaled_reward": -0.2523428313434124, | |
| "rewards/format_reward": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.344, | |
| "grad_norm": 5.694060802459717, | |
| "kl": 1.50390625, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.06, | |
| "reward": -0.5268296301364899, | |
| "reward_std": 0.3594844192266464, | |
| "rewards/cosine_scaled_reward": -0.26341481506824493, | |
| "rewards/format_reward": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.348, | |
| "grad_norm": 2.5820319652557373, | |
| "kl": 0.79931640625, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0319, | |
| "reward": -0.5030437260866165, | |
| "reward_std": 0.33297523856163025, | |
| "rewards/cosine_scaled_reward": -0.25152185559272766, | |
| "rewards/format_reward": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.352, | |
| "grad_norm": 2.748469829559326, | |
| "kl": 0.8642578125, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0346, | |
| "reward": -0.511917307972908, | |
| "reward_std": 0.3373011276125908, | |
| "rewards/cosine_scaled_reward": -0.255958653986454, | |
| "rewards/format_reward": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.356, | |
| "grad_norm": 2.941894054412842, | |
| "kl": 1.10400390625, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0443, | |
| "reward": -0.49383244663476944, | |
| "reward_std": 0.3190907835960388, | |
| "rewards/cosine_scaled_reward": -0.24691622331738472, | |
| "rewards/format_reward": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.36, | |
| "grad_norm": 2.5008065700531006, | |
| "kl": 0.7451171875, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0298, | |
| "reward": -0.5015105679631233, | |
| "reward_std": 0.3283078894019127, | |
| "rewards/cosine_scaled_reward": -0.25075526908040047, | |
| "rewards/format_reward": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.364, | |
| "grad_norm": 2.775805950164795, | |
| "kl": 0.8662109375, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0347, | |
| "reward": -0.49317121505737305, | |
| "reward_std": 0.3281624838709831, | |
| "rewards/cosine_scaled_reward": -0.24658560752868652, | |
| "rewards/format_reward": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.368, | |
| "grad_norm": 4.057337284088135, | |
| "kl": 1.3115234375, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0525, | |
| "reward": -0.4923912510275841, | |
| "reward_std": 0.334882490336895, | |
| "rewards/cosine_scaled_reward": -0.24619561806321144, | |
| "rewards/format_reward": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.372, | |
| "grad_norm": 3.3191726207733154, | |
| "kl": 1.416015625, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0567, | |
| "reward": -0.4856347441673279, | |
| "reward_std": 0.3141849860548973, | |
| "rewards/cosine_scaled_reward": -0.24281736463308334, | |
| "rewards/format_reward": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.376, | |
| "grad_norm": 38.36699676513672, | |
| "kl": 3.833984375, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.1535, | |
| "reward": -0.5001253262162209, | |
| "reward_std": 0.34716712683439255, | |
| "rewards/cosine_scaled_reward": -0.25006265565752983, | |
| "rewards/format_reward": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.38, | |
| "grad_norm": 2.851670742034912, | |
| "kl": 0.93017578125, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0372, | |
| "reward": -0.4462156817317009, | |
| "reward_std": 0.3170738257467747, | |
| "rewards/cosine_scaled_reward": -0.22310783341526985, | |
| "rewards/format_reward": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.384, | |
| "grad_norm": 1.903143048286438, | |
| "kl": 0.662109375, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0265, | |
| "reward": -0.44278524816036224, | |
| "reward_std": 0.340934194624424, | |
| "rewards/cosine_scaled_reward": -0.22139262408018112, | |
| "rewards/format_reward": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.388, | |
| "grad_norm": 2.613619089126587, | |
| "kl": 1.0009765625, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.04, | |
| "reward": -0.4385986104607582, | |
| "reward_std": 0.3297598212957382, | |
| "rewards/cosine_scaled_reward": -0.2192993052303791, | |
| "rewards/format_reward": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.392, | |
| "grad_norm": 2.1393027305603027, | |
| "kl": 0.84912109375, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0339, | |
| "reward": -0.4335070326924324, | |
| "reward_std": 0.3084552064538002, | |
| "rewards/cosine_scaled_reward": -0.2167535126209259, | |
| "rewards/format_reward": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.396, | |
| "grad_norm": 10.226459503173828, | |
| "kl": 1.9765625, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0791, | |
| "reward": -0.5120433643460274, | |
| "reward_std": 0.3308994993567467, | |
| "rewards/cosine_scaled_reward": -0.2560216821730137, | |
| "rewards/format_reward": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.4, | |
| "grad_norm": 2.7042365074157715, | |
| "kl": 1.140625, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0456, | |
| "reward": -0.5387645438313484, | |
| "reward_std": 0.32419781386852264, | |
| "rewards/cosine_scaled_reward": -0.2693822719156742, | |
| "rewards/format_reward": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.404, | |
| "grad_norm": 3.3440866470336914, | |
| "kl": 1.158203125, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0464, | |
| "reward": -0.479642316699028, | |
| "reward_std": 0.3374394252896309, | |
| "rewards/cosine_scaled_reward": -0.2398211695253849, | |
| "rewards/format_reward": 0.0, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.408, | |
| "grad_norm": 2.1483707427978516, | |
| "kl": 0.55859375, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0224, | |
| "reward": -0.4488200396299362, | |
| "reward_std": 0.3361233174800873, | |
| "rewards/cosine_scaled_reward": -0.2244100198149681, | |
| "rewards/format_reward": 0.0, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.412, | |
| "grad_norm": 4.173567771911621, | |
| "kl": 1.900390625, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0759, | |
| "reward": -0.4979688450694084, | |
| "reward_std": 0.35078077018260956, | |
| "rewards/cosine_scaled_reward": -0.2489844374358654, | |
| "rewards/format_reward": 0.0, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.416, | |
| "grad_norm": 5.119884490966797, | |
| "kl": 1.611328125, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0644, | |
| "reward": -0.513933926820755, | |
| "reward_std": 0.3170707896351814, | |
| "rewards/cosine_scaled_reward": -0.2569669596850872, | |
| "rewards/format_reward": 0.0, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.42, | |
| "grad_norm": 2.8145992755889893, | |
| "kl": 1.466796875, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0587, | |
| "reward": -0.47269363701343536, | |
| "reward_std": 0.31501560658216476, | |
| "rewards/cosine_scaled_reward": -0.23634683340787888, | |
| "rewards/format_reward": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.424, | |
| "grad_norm": 2.3274426460266113, | |
| "kl": 0.59033203125, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0236, | |
| "reward": -0.44968922436237335, | |
| "reward_std": 0.3498781695961952, | |
| "rewards/cosine_scaled_reward": -0.22484461963176727, | |
| "rewards/format_reward": 0.0, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.428, | |
| "grad_norm": 2.2112016677856445, | |
| "kl": 1.126953125, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0451, | |
| "reward": -0.5002073347568512, | |
| "reward_std": 0.34406865388154984, | |
| "rewards/cosine_scaled_reward": -0.2501036673784256, | |
| "rewards/format_reward": 0.0, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.432, | |
| "grad_norm": 2.4664499759674072, | |
| "kl": 1.0986328125, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0439, | |
| "reward": -0.49009862542152405, | |
| "reward_std": 0.3558028042316437, | |
| "rewards/cosine_scaled_reward": -0.24504930526018143, | |
| "rewards/format_reward": 0.0, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.436, | |
| "grad_norm": 2.3740482330322266, | |
| "kl": 0.67578125, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.027, | |
| "reward": -0.4631711468100548, | |
| "reward_std": 0.34275270998477936, | |
| "rewards/cosine_scaled_reward": -0.2315855734050274, | |
| "rewards/format_reward": 0.0, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.44, | |
| "grad_norm": 2.9116501808166504, | |
| "kl": 1.1826171875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0473, | |
| "reward": -0.444116935133934, | |
| "reward_std": 0.37212707847356796, | |
| "rewards/cosine_scaled_reward": -0.2220584638416767, | |
| "rewards/format_reward": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.444, | |
| "grad_norm": 2.24743390083313, | |
| "kl": 0.638671875, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0255, | |
| "reward": -0.46286992728710175, | |
| "reward_std": 0.3208693787455559, | |
| "rewards/cosine_scaled_reward": -0.23143497854471207, | |
| "rewards/format_reward": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.448, | |
| "grad_norm": 3.138840913772583, | |
| "kl": 1.14404296875, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0458, | |
| "reward": -0.4803452715277672, | |
| "reward_std": 0.3449332043528557, | |
| "rewards/cosine_scaled_reward": -0.2401726357638836, | |
| "rewards/format_reward": 0.0, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.452, | |
| "grad_norm": 2.7688963413238525, | |
| "kl": 0.9462890625, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0379, | |
| "reward": -0.4440384730696678, | |
| "reward_std": 0.3389856517314911, | |
| "rewards/cosine_scaled_reward": -0.2220192365348339, | |
| "rewards/format_reward": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.456, | |
| "grad_norm": 2.7298948764801025, | |
| "kl": 1.3583984375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0544, | |
| "reward": -0.40611616894602776, | |
| "reward_std": 0.3120696693658829, | |
| "rewards/cosine_scaled_reward": -0.20305808261036873, | |
| "rewards/format_reward": 0.0, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.46, | |
| "grad_norm": 2.628330945968628, | |
| "kl": 0.84521484375, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0338, | |
| "reward": -0.41812988370656967, | |
| "reward_std": 0.33337801694869995, | |
| "rewards/cosine_scaled_reward": -0.20906493440270424, | |
| "rewards/format_reward": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.464, | |
| "grad_norm": 2.21708607673645, | |
| "kl": 1.125, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0451, | |
| "reward": -0.4452592432498932, | |
| "reward_std": 0.34758392721414566, | |
| "rewards/cosine_scaled_reward": -0.2226296216249466, | |
| "rewards/format_reward": 0.0, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.468, | |
| "grad_norm": 3.4151782989501953, | |
| "kl": 1.5390625, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0617, | |
| "reward": -0.5043663010001183, | |
| "reward_std": 0.3056981936097145, | |
| "rewards/cosine_scaled_reward": -0.25218314677476883, | |
| "rewards/format_reward": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.472, | |
| "grad_norm": 2.8809969425201416, | |
| "kl": 1.498046875, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0599, | |
| "reward": -0.44362927228212357, | |
| "reward_std": 0.32765333354473114, | |
| "rewards/cosine_scaled_reward": -0.22181464359164238, | |
| "rewards/format_reward": 0.0, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.476, | |
| "grad_norm": 3.092552661895752, | |
| "kl": 1.6640625, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0665, | |
| "reward": -0.49818655103445053, | |
| "reward_std": 0.3495415672659874, | |
| "rewards/cosine_scaled_reward": -0.24909326806664467, | |
| "rewards/format_reward": 0.0, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.48, | |
| "grad_norm": 3.2943530082702637, | |
| "kl": 2.07421875, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0829, | |
| "reward": -0.4802135229110718, | |
| "reward_std": 0.3453461080789566, | |
| "rewards/cosine_scaled_reward": -0.24010677635669708, | |
| "rewards/format_reward": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.484, | |
| "grad_norm": 2.5681769847869873, | |
| "kl": 1.505859375, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0603, | |
| "reward": -0.5175792872905731, | |
| "reward_std": 0.35768260806798935, | |
| "rewards/cosine_scaled_reward": -0.25878964737057686, | |
| "rewards/format_reward": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.488, | |
| "grad_norm": 2.9190571308135986, | |
| "kl": 1.57373046875, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.063, | |
| "reward": -0.46548449248075485, | |
| "reward_std": 0.35348332673311234, | |
| "rewards/cosine_scaled_reward": -0.23274223506450653, | |
| "rewards/format_reward": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.492, | |
| "grad_norm": 2.435157537460327, | |
| "kl": 1.0654296875, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0427, | |
| "reward": -0.4281177818775177, | |
| "reward_std": 0.3503784313797951, | |
| "rewards/cosine_scaled_reward": -0.21405889093875885, | |
| "rewards/format_reward": 0.0, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.496, | |
| "grad_norm": 3.1375350952148438, | |
| "kl": 1.5625, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0624, | |
| "reward": -0.4667646959424019, | |
| "reward_std": 0.3501163199543953, | |
| "rewards/cosine_scaled_reward": -0.23338234052062035, | |
| "rewards/format_reward": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.5, | |
| "grad_norm": 2.1935606002807617, | |
| "kl": 1.3427734375, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0537, | |
| "reward": -0.4283955693244934, | |
| "reward_std": 0.34814615547657013, | |
| "rewards/cosine_scaled_reward": -0.2141977809369564, | |
| "rewards/format_reward": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.504, | |
| "grad_norm": 2.727754592895508, | |
| "kl": 1.35546875, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0543, | |
| "reward": -0.4584430381655693, | |
| "reward_std": 0.3318573832511902, | |
| "rewards/cosine_scaled_reward": -0.22922151535749435, | |
| "rewards/format_reward": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.508, | |
| "grad_norm": 2.9863674640655518, | |
| "kl": 1.509765625, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0603, | |
| "reward": -0.4794049710035324, | |
| "reward_std": 0.3224741891026497, | |
| "rewards/cosine_scaled_reward": -0.2397024855017662, | |
| "rewards/format_reward": 0.0, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.512, | |
| "grad_norm": 3.0583863258361816, | |
| "kl": 1.5751953125, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0631, | |
| "reward": -0.3896471783518791, | |
| "reward_std": 0.32155635207891464, | |
| "rewards/cosine_scaled_reward": -0.19482359662652016, | |
| "rewards/format_reward": 0.0, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.516, | |
| "grad_norm": 11.888484001159668, | |
| "kl": 2.1806640625, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0874, | |
| "reward": -0.46486661583185196, | |
| "reward_std": 0.34625906497240067, | |
| "rewards/cosine_scaled_reward": -0.23243330791592598, | |
| "rewards/format_reward": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.52, | |
| "grad_norm": 3.14744234085083, | |
| "kl": 1.1103515625, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0445, | |
| "reward": -0.4691261351108551, | |
| "reward_std": 0.3357261121273041, | |
| "rewards/cosine_scaled_reward": -0.23456306010484695, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.524, | |
| "grad_norm": 2.6933717727661133, | |
| "kl": 1.76171875, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0705, | |
| "reward": -0.5458347946405411, | |
| "reward_std": 0.3296028599143028, | |
| "rewards/cosine_scaled_reward": -0.27291740477085114, | |
| "rewards/format_reward": 0.0, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.528, | |
| "grad_norm": 2.695984363555908, | |
| "kl": 1.2666015625, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0506, | |
| "reward": -0.43337278813123703, | |
| "reward_std": 0.3223467916250229, | |
| "rewards/cosine_scaled_reward": -0.2166864052414894, | |
| "rewards/format_reward": 0.0, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.532, | |
| "grad_norm": 2.1844236850738525, | |
| "kl": 1.072265625, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0429, | |
| "reward": -0.47815513610839844, | |
| "reward_std": 0.33408980816602707, | |
| "rewards/cosine_scaled_reward": -0.23907756060361862, | |
| "rewards/format_reward": 0.0, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.536, | |
| "grad_norm": 2.6240434646606445, | |
| "kl": 0.998046875, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0399, | |
| "reward": -0.3596036769449711, | |
| "reward_std": 0.3202332779765129, | |
| "rewards/cosine_scaled_reward": -0.17980184871703386, | |
| "rewards/format_reward": 0.0, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.54, | |
| "grad_norm": 2.413489580154419, | |
| "kl": 1.515625, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0607, | |
| "reward": -0.3980662524700165, | |
| "reward_std": 0.3146558068692684, | |
| "rewards/cosine_scaled_reward": -0.19903312623500824, | |
| "rewards/format_reward": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.544, | |
| "grad_norm": 2.5466983318328857, | |
| "kl": 1.421875, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0568, | |
| "reward": -0.4567502960562706, | |
| "reward_std": 0.36093486845493317, | |
| "rewards/cosine_scaled_reward": -0.2283751629292965, | |
| "rewards/format_reward": 0.0, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.548, | |
| "grad_norm": 2.670454263687134, | |
| "kl": 1.63671875, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0655, | |
| "reward": -0.48265285044908524, | |
| "reward_std": 0.33601198345422745, | |
| "rewards/cosine_scaled_reward": -0.24132642522454262, | |
| "rewards/format_reward": 0.0, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.552, | |
| "grad_norm": 3.4489877223968506, | |
| "kl": 1.39453125, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0558, | |
| "reward": -0.40766458958387375, | |
| "reward_std": 0.34357643127441406, | |
| "rewards/cosine_scaled_reward": -0.20383229106664658, | |
| "rewards/format_reward": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.556, | |
| "grad_norm": 2.18890118598938, | |
| "kl": 1.30859375, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0523, | |
| "reward": -0.4143947809934616, | |
| "reward_std": 0.323918879032135, | |
| "rewards/cosine_scaled_reward": -0.2071974016726017, | |
| "rewards/format_reward": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.56, | |
| "grad_norm": 2.5627028942108154, | |
| "kl": 1.34423828125, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0538, | |
| "reward": -0.4485241174697876, | |
| "reward_std": 0.3278198316693306, | |
| "rewards/cosine_scaled_reward": -0.2242620587348938, | |
| "rewards/format_reward": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.564, | |
| "grad_norm": 2.086371660232544, | |
| "kl": 1.2802734375, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0512, | |
| "reward": -0.32855524495244026, | |
| "reward_std": 0.33061159402132034, | |
| "rewards/cosine_scaled_reward": -0.16427762433886528, | |
| "rewards/format_reward": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.568, | |
| "grad_norm": 2.45231556892395, | |
| "kl": 1.580078125, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0632, | |
| "reward": -0.4703398421406746, | |
| "reward_std": 0.2972045987844467, | |
| "rewards/cosine_scaled_reward": -0.2351699210703373, | |
| "rewards/format_reward": 0.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.572, | |
| "grad_norm": 2.864070415496826, | |
| "kl": 1.984375, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0794, | |
| "reward": -0.41980744898319244, | |
| "reward_std": 0.34404993802309036, | |
| "rewards/cosine_scaled_reward": -0.20990372076630592, | |
| "rewards/format_reward": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.576, | |
| "grad_norm": 2.412257194519043, | |
| "kl": 1.544921875, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0618, | |
| "reward": -0.43455804139375687, | |
| "reward_std": 0.32647445797920227, | |
| "rewards/cosine_scaled_reward": -0.21727901697158813, | |
| "rewards/format_reward": 0.0, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.58, | |
| "grad_norm": 2.952892780303955, | |
| "kl": 2.0595703125, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0824, | |
| "reward": -0.4728480279445648, | |
| "reward_std": 0.33887017518281937, | |
| "rewards/cosine_scaled_reward": -0.2364240102469921, | |
| "rewards/format_reward": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.584, | |
| "grad_norm": 2.3727328777313232, | |
| "kl": 1.7255859375, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0692, | |
| "reward": -0.42372531443834305, | |
| "reward_std": 0.3417205289006233, | |
| "rewards/cosine_scaled_reward": -0.21186266466975212, | |
| "rewards/format_reward": 0.0, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.588, | |
| "grad_norm": 2.953756809234619, | |
| "kl": 2.353515625, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0943, | |
| "reward": -0.43578075617551804, | |
| "reward_std": 0.34062809497117996, | |
| "rewards/cosine_scaled_reward": -0.21789037808775902, | |
| "rewards/format_reward": 0.0, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.592, | |
| "grad_norm": 2.5953478813171387, | |
| "kl": 1.38671875, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0555, | |
| "reward": -0.4190576896071434, | |
| "reward_std": 0.34895560145378113, | |
| "rewards/cosine_scaled_reward": -0.2095288448035717, | |
| "rewards/format_reward": 0.0, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.596, | |
| "grad_norm": 2.4279496669769287, | |
| "kl": 1.62890625, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0651, | |
| "reward": -0.4394699037075043, | |
| "reward_std": 0.3207908198237419, | |
| "rewards/cosine_scaled_reward": -0.21973494067788124, | |
| "rewards/format_reward": 0.0, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.6, | |
| "grad_norm": 2.974292516708374, | |
| "kl": 1.892578125, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0757, | |
| "reward": -0.4797021597623825, | |
| "reward_std": 0.32065775990486145, | |
| "rewards/cosine_scaled_reward": -0.23985107988119125, | |
| "rewards/format_reward": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.604, | |
| "grad_norm": 2.51299786567688, | |
| "kl": 0.87890625, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0351, | |
| "reward": -0.4108778163790703, | |
| "reward_std": 0.326105996966362, | |
| "rewards/cosine_scaled_reward": -0.20543890818953514, | |
| "rewards/format_reward": 0.0, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.608, | |
| "grad_norm": 2.723388195037842, | |
| "kl": 1.2294921875, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0492, | |
| "reward": -0.4178111329674721, | |
| "reward_std": 0.32895463705062866, | |
| "rewards/cosine_scaled_reward": -0.20890555530786514, | |
| "rewards/format_reward": 0.0, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.612, | |
| "grad_norm": 2.4097025394439697, | |
| "kl": 1.650390625, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0659, | |
| "reward": -0.4825671687722206, | |
| "reward_std": 0.33990373462438583, | |
| "rewards/cosine_scaled_reward": -0.2412835843861103, | |
| "rewards/format_reward": 0.0, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.616, | |
| "grad_norm": 2.114370107650757, | |
| "kl": 1.390625, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0556, | |
| "reward": -0.42671380192041397, | |
| "reward_std": 0.32950445264577866, | |
| "rewards/cosine_scaled_reward": -0.21335690841078758, | |
| "rewards/format_reward": 0.0, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.62, | |
| "grad_norm": 3.1770823001861572, | |
| "kl": 1.4287109375, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0572, | |
| "reward": -0.4250905141234398, | |
| "reward_std": 0.3110942989587784, | |
| "rewards/cosine_scaled_reward": -0.2125452570617199, | |
| "rewards/format_reward": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.624, | |
| "grad_norm": 2.6063926219940186, | |
| "kl": 1.796875, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0719, | |
| "reward": -0.4206129387021065, | |
| "reward_std": 0.33140094578266144, | |
| "rewards/cosine_scaled_reward": -0.21030646935105324, | |
| "rewards/format_reward": 0.0, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.628, | |
| "grad_norm": 2.482637643814087, | |
| "kl": 1.525390625, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.061, | |
| "reward": -0.36781868524849415, | |
| "reward_std": 0.3281563073396683, | |
| "rewards/cosine_scaled_reward": -0.18390934821218252, | |
| "rewards/format_reward": 0.0, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.632, | |
| "grad_norm": 2.7100956439971924, | |
| "kl": 1.7861328125, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0715, | |
| "reward": -0.3854188397526741, | |
| "reward_std": 0.31897617131471634, | |
| "rewards/cosine_scaled_reward": -0.19270941987633705, | |
| "rewards/format_reward": 0.0, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.636, | |
| "grad_norm": 2.3493990898132324, | |
| "kl": 1.859375, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0746, | |
| "reward": -0.41636481136083603, | |
| "reward_std": 0.3308830112218857, | |
| "rewards/cosine_scaled_reward": -0.20818240568041801, | |
| "rewards/format_reward": 0.0, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.64, | |
| "grad_norm": 2.429762840270996, | |
| "kl": 1.78125, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0714, | |
| "reward": -0.44961177557706833, | |
| "reward_std": 0.3425107002258301, | |
| "rewards/cosine_scaled_reward": -0.22480589523911476, | |
| "rewards/format_reward": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.644, | |
| "grad_norm": 2.6372933387756348, | |
| "kl": 1.6474609375, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.066, | |
| "reward": -0.4204000309109688, | |
| "reward_std": 0.3256704956293106, | |
| "rewards/cosine_scaled_reward": -0.2102000191807747, | |
| "rewards/format_reward": 0.0, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.648, | |
| "grad_norm": 2.2505483627319336, | |
| "kl": 1.576171875, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.063, | |
| "reward": -0.4363863915205002, | |
| "reward_std": 0.3367513567209244, | |
| "rewards/cosine_scaled_reward": -0.2181931994855404, | |
| "rewards/format_reward": 0.0, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.652, | |
| "grad_norm": 2.781273603439331, | |
| "kl": 1.4375, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0576, | |
| "reward": -0.44805190712213516, | |
| "reward_std": 0.3117773234844208, | |
| "rewards/cosine_scaled_reward": -0.22402595356106758, | |
| "rewards/format_reward": 0.0, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.656, | |
| "grad_norm": 2.573030710220337, | |
| "kl": 1.21435546875, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0487, | |
| "reward": -0.40324684232473373, | |
| "reward_std": 0.3176472932100296, | |
| "rewards/cosine_scaled_reward": -0.20162343233823776, | |
| "rewards/format_reward": 0.0, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.66, | |
| "grad_norm": 4.171741485595703, | |
| "kl": 2.3125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0925, | |
| "reward": -0.4968671426177025, | |
| "reward_std": 0.3204089626669884, | |
| "rewards/cosine_scaled_reward": -0.24843357503414154, | |
| "rewards/format_reward": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1528.702392578125, | |
| "epoch": 0.664, | |
| "grad_norm": 2.1756961345672607, | |
| "kl": 1.7578125, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0706, | |
| "reward": -0.4272613450884819, | |
| "reward_std": 0.32390115410089493, | |
| "rewards/cosine_scaled_reward": -0.21363067999482155, | |
| "rewards/format_reward": 0.0, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.668, | |
| "grad_norm": 2.2742207050323486, | |
| "kl": 1.912109375, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0763, | |
| "reward": -0.3418873958289623, | |
| "reward_std": 0.29924022778868675, | |
| "rewards/cosine_scaled_reward": -0.17094369884580374, | |
| "rewards/format_reward": 0.0, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.672, | |
| "grad_norm": 2.1837146282196045, | |
| "kl": 1.3330078125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0533, | |
| "reward": -0.4050525277853012, | |
| "reward_std": 0.3251590058207512, | |
| "rewards/cosine_scaled_reward": -0.2025262601673603, | |
| "rewards/format_reward": 0.0, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.676, | |
| "grad_norm": 2.1009020805358887, | |
| "kl": 1.9326171875, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0774, | |
| "reward": -0.4387947544455528, | |
| "reward_std": 0.3307826817035675, | |
| "rewards/cosine_scaled_reward": -0.21939736977219582, | |
| "rewards/format_reward": 0.0, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.68, | |
| "grad_norm": 2.515617609024048, | |
| "kl": 1.884765625, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0754, | |
| "reward": -0.41566915810108185, | |
| "reward_std": 0.34893494844436646, | |
| "rewards/cosine_scaled_reward": -0.20783457532525063, | |
| "rewards/format_reward": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.684, | |
| "grad_norm": 2.3045356273651123, | |
| "kl": 1.5078125, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0604, | |
| "reward": -0.3871946483850479, | |
| "reward_std": 0.3363000229001045, | |
| "rewards/cosine_scaled_reward": -0.19359732419252396, | |
| "rewards/format_reward": 0.0, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.688, | |
| "grad_norm": 2.1517364978790283, | |
| "kl": 1.4169921875, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0567, | |
| "reward": -0.41495678573846817, | |
| "reward_std": 0.33959241211414337, | |
| "rewards/cosine_scaled_reward": -0.20747840031981468, | |
| "rewards/format_reward": 0.0, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.692, | |
| "grad_norm": 2.4767415523529053, | |
| "kl": 1.5654296875, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0626, | |
| "reward": -0.3259017579257488, | |
| "reward_std": 0.3448467329144478, | |
| "rewards/cosine_scaled_reward": -0.16295087756589055, | |
| "rewards/format_reward": 0.0, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.696, | |
| "grad_norm": 2.1803934574127197, | |
| "kl": 1.5986328125, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0639, | |
| "reward": -0.45371130108833313, | |
| "reward_std": 0.3770594820380211, | |
| "rewards/cosine_scaled_reward": -0.22685565054416656, | |
| "rewards/format_reward": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.7, | |
| "grad_norm": 2.146838426589966, | |
| "kl": 1.3212890625, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.053, | |
| "reward": -0.39382801204919815, | |
| "reward_std": 0.3155653551220894, | |
| "rewards/cosine_scaled_reward": -0.19691400602459908, | |
| "rewards/format_reward": 0.0, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.704, | |
| "grad_norm": 2.3939132690429688, | |
| "kl": 1.498046875, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.06, | |
| "reward": -0.397233285009861, | |
| "reward_std": 0.3429732918739319, | |
| "rewards/cosine_scaled_reward": -0.1986166313290596, | |
| "rewards/format_reward": 0.0, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.708, | |
| "grad_norm": 2.2279624938964844, | |
| "kl": 1.3759765625, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0551, | |
| "reward": -0.41151023656129837, | |
| "reward_std": 0.3277590796351433, | |
| "rewards/cosine_scaled_reward": -0.2057551108300686, | |
| "rewards/format_reward": 0.0, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.712, | |
| "grad_norm": 2.5055384635925293, | |
| "kl": 1.341796875, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0537, | |
| "reward": -0.4148360714316368, | |
| "reward_std": 0.3054031655192375, | |
| "rewards/cosine_scaled_reward": -0.2074180319905281, | |
| "rewards/format_reward": 0.0, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.716, | |
| "grad_norm": 2.605672836303711, | |
| "kl": 2.421875, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0969, | |
| "reward": -0.49764253944158554, | |
| "reward_std": 0.34468474239110947, | |
| "rewards/cosine_scaled_reward": -0.24882125481963158, | |
| "rewards/format_reward": 0.0, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.72, | |
| "grad_norm": 1.8612443208694458, | |
| "kl": 1.958984375, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0784, | |
| "reward": -0.41104499250650406, | |
| "reward_std": 0.32857123762369156, | |
| "rewards/cosine_scaled_reward": -0.20552249625325203, | |
| "rewards/format_reward": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.724, | |
| "grad_norm": 2.20760178565979, | |
| "kl": 1.4267578125, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0571, | |
| "reward": -0.4070161208510399, | |
| "reward_std": 0.29896606504917145, | |
| "rewards/cosine_scaled_reward": -0.20350806042551994, | |
| "rewards/format_reward": 0.0, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.728, | |
| "grad_norm": 2.527832269668579, | |
| "kl": 1.3251953125, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.053, | |
| "reward": -0.40310006588697433, | |
| "reward_std": 0.33485615253448486, | |
| "rewards/cosine_scaled_reward": -0.20155002549290657, | |
| "rewards/format_reward": 0.0, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.732, | |
| "grad_norm": 2.0901362895965576, | |
| "kl": 1.25, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0499, | |
| "reward": -0.39147457480430603, | |
| "reward_std": 0.3105906918644905, | |
| "rewards/cosine_scaled_reward": -0.19573728740215302, | |
| "rewards/format_reward": 0.0, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.736, | |
| "grad_norm": 2.0712454319000244, | |
| "kl": 1.3271484375, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.053, | |
| "reward": -0.36338385939598083, | |
| "reward_std": 0.29373297840356827, | |
| "rewards/cosine_scaled_reward": -0.18169192969799042, | |
| "rewards/format_reward": 0.0, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.74, | |
| "grad_norm": 4.567477226257324, | |
| "kl": 2.91015625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.1167, | |
| "reward": -0.46033478528261185, | |
| "reward_std": 0.309500552713871, | |
| "rewards/cosine_scaled_reward": -0.23016740009188652, | |
| "rewards/format_reward": 0.0, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.744, | |
| "grad_norm": 2.8025710582733154, | |
| "kl": 1.982421875, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0793, | |
| "reward": -0.3399934060871601, | |
| "reward_std": 0.3289627507328987, | |
| "rewards/cosine_scaled_reward": -0.16999670304358006, | |
| "rewards/format_reward": 0.0, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.748, | |
| "grad_norm": 2.41241192817688, | |
| "kl": 1.6513671875, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0663, | |
| "reward": -0.4002522900700569, | |
| "reward_std": 0.3234091103076935, | |
| "rewards/cosine_scaled_reward": -0.20012613758444786, | |
| "rewards/format_reward": 0.0, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.752, | |
| "grad_norm": 3.6371164321899414, | |
| "kl": 2.470703125, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0988, | |
| "reward": -0.44175921380519867, | |
| "reward_std": 0.33701298385858536, | |
| "rewards/cosine_scaled_reward": -0.22087960690259933, | |
| "rewards/format_reward": 0.0, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.756, | |
| "grad_norm": 2.704362154006958, | |
| "kl": 1.71875, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0686, | |
| "reward": -0.3934633806347847, | |
| "reward_std": 0.31845808029174805, | |
| "rewards/cosine_scaled_reward": -0.19673169776797295, | |
| "rewards/format_reward": 0.0, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.76, | |
| "grad_norm": 2.5518999099731445, | |
| "kl": 1.865234375, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0746, | |
| "reward": -0.48456476628780365, | |
| "reward_std": 0.3398968055844307, | |
| "rewards/cosine_scaled_reward": -0.24228239431977272, | |
| "rewards/format_reward": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.764, | |
| "grad_norm": 4.733001232147217, | |
| "kl": 2.0537109375, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0822, | |
| "reward": -0.44671063870191574, | |
| "reward_std": 0.32652025669813156, | |
| "rewards/cosine_scaled_reward": -0.22335530444979668, | |
| "rewards/format_reward": 0.0, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.768, | |
| "grad_norm": 2.217525005340576, | |
| "kl": 1.72265625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0689, | |
| "reward": -0.38292936980724335, | |
| "reward_std": 0.3729139119386673, | |
| "rewards/cosine_scaled_reward": -0.19146469235420227, | |
| "rewards/format_reward": 0.0, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.772, | |
| "grad_norm": 2.3045313358306885, | |
| "kl": 1.0576171875, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0423, | |
| "reward": -0.36335285753011703, | |
| "reward_std": 0.3274284452199936, | |
| "rewards/cosine_scaled_reward": -0.18167642876505852, | |
| "rewards/format_reward": 0.0, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.776, | |
| "grad_norm": 2.220212936401367, | |
| "kl": 1.974609375, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0791, | |
| "reward": -0.41132358461618423, | |
| "reward_std": 0.33213579654693604, | |
| "rewards/cosine_scaled_reward": -0.20566179975867271, | |
| "rewards/format_reward": 0.0, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.78, | |
| "grad_norm": 2.872774124145508, | |
| "kl": 2.04296875, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0819, | |
| "reward": -0.41410720348358154, | |
| "reward_std": 0.3132774606347084, | |
| "rewards/cosine_scaled_reward": -0.20705359801650047, | |
| "rewards/format_reward": 0.0, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.784, | |
| "grad_norm": 3.354735851287842, | |
| "kl": 1.2236328125, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0489, | |
| "reward": -0.34651997685432434, | |
| "reward_std": 0.27611755579710007, | |
| "rewards/cosine_scaled_reward": -0.17325998842716217, | |
| "rewards/format_reward": 0.0, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.788, | |
| "grad_norm": 2.019547939300537, | |
| "kl": 1.03515625, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0414, | |
| "reward": -0.36961859464645386, | |
| "reward_std": 0.3042915388941765, | |
| "rewards/cosine_scaled_reward": -0.18480929359793663, | |
| "rewards/format_reward": 0.0, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.792, | |
| "grad_norm": 2.245211601257324, | |
| "kl": 1.408203125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0564, | |
| "reward": -0.3812807723879814, | |
| "reward_std": 0.30970512330532074, | |
| "rewards/cosine_scaled_reward": -0.190640389919281, | |
| "rewards/format_reward": 0.0, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.796, | |
| "grad_norm": 2.0456931591033936, | |
| "kl": 1.673828125, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.067, | |
| "reward": -0.38433101773262024, | |
| "reward_std": 0.3408072590827942, | |
| "rewards/cosine_scaled_reward": -0.19216550886631012, | |
| "rewards/format_reward": 0.0, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.8, | |
| "grad_norm": 6.253657817840576, | |
| "kl": 1.48876953125, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0595, | |
| "reward": -0.3863793611526489, | |
| "reward_std": 0.3155966252088547, | |
| "rewards/cosine_scaled_reward": -0.19318969175219536, | |
| "rewards/format_reward": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.804, | |
| "grad_norm": 2.2331368923187256, | |
| "kl": 1.96484375, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0786, | |
| "reward": -0.41171175986528397, | |
| "reward_std": 0.34651194512844086, | |
| "rewards/cosine_scaled_reward": -0.20585588365793228, | |
| "rewards/format_reward": 0.0, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.808, | |
| "grad_norm": 2.1702663898468018, | |
| "kl": 1.296875, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0519, | |
| "reward": -0.38244833052158356, | |
| "reward_std": 0.34267907589673996, | |
| "rewards/cosine_scaled_reward": -0.19122417271137238, | |
| "rewards/format_reward": 0.0, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.812, | |
| "grad_norm": 2.0549793243408203, | |
| "kl": 1.4345703125, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0573, | |
| "reward": -0.4125688225030899, | |
| "reward_std": 0.33167801052331924, | |
| "rewards/cosine_scaled_reward": -0.20628441870212555, | |
| "rewards/format_reward": 0.0, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.816, | |
| "grad_norm": 2.7793009281158447, | |
| "kl": 1.958984375, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0784, | |
| "reward": -0.45417842268943787, | |
| "reward_std": 0.3453121930360794, | |
| "rewards/cosine_scaled_reward": -0.22708921134471893, | |
| "rewards/format_reward": 0.0, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.82, | |
| "grad_norm": 8.324098587036133, | |
| "kl": 2.23388671875, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0895, | |
| "reward": -0.3973395526409149, | |
| "reward_std": 0.32590440660715103, | |
| "rewards/cosine_scaled_reward": -0.19866977632045746, | |
| "rewards/format_reward": 0.0, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.824, | |
| "grad_norm": 2.22940993309021, | |
| "kl": 1.51171875, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0605, | |
| "reward": -0.4044779762625694, | |
| "reward_std": 0.33285098522901535, | |
| "rewards/cosine_scaled_reward": -0.2022389993071556, | |
| "rewards/format_reward": 0.0, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.828, | |
| "grad_norm": 2.824735164642334, | |
| "kl": 1.310546875, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0524, | |
| "reward": -0.4486440494656563, | |
| "reward_std": 0.33797865360975266, | |
| "rewards/cosine_scaled_reward": -0.22432202845811844, | |
| "rewards/format_reward": 0.0, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.832, | |
| "grad_norm": 2.2558631896972656, | |
| "kl": 1.1962890625, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0478, | |
| "reward": -0.40251782536506653, | |
| "reward_std": 0.30128662288188934, | |
| "rewards/cosine_scaled_reward": -0.20125891268253326, | |
| "rewards/format_reward": 0.0, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.836, | |
| "grad_norm": 2.7602171897888184, | |
| "kl": 0.9951171875, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0398, | |
| "reward": -0.31514767929911613, | |
| "reward_std": 0.3020384646952152, | |
| "rewards/cosine_scaled_reward": -0.15757383964955807, | |
| "rewards/format_reward": 0.0, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.84, | |
| "grad_norm": 2.6217448711395264, | |
| "kl": 1.71484375, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0684, | |
| "reward": -0.3670196682214737, | |
| "reward_std": 0.31881674379110336, | |
| "rewards/cosine_scaled_reward": -0.18350983038544655, | |
| "rewards/format_reward": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.844, | |
| "grad_norm": 2.0915112495422363, | |
| "kl": 1.2841796875, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0514, | |
| "reward": -0.4177168160676956, | |
| "reward_std": 0.3398260995745659, | |
| "rewards/cosine_scaled_reward": -0.2088584043085575, | |
| "rewards/format_reward": 0.0, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.848, | |
| "grad_norm": 1.7296172380447388, | |
| "kl": 1.2724609375, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.051, | |
| "reward": -0.41255099326372147, | |
| "reward_std": 0.33872970938682556, | |
| "rewards/cosine_scaled_reward": -0.20627548918128014, | |
| "rewards/format_reward": 0.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.852, | |
| "grad_norm": 2.1323206424713135, | |
| "kl": 1.16162109375, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0466, | |
| "reward": -0.2759926188737154, | |
| "reward_std": 0.30077088996768, | |
| "rewards/cosine_scaled_reward": -0.1379963019862771, | |
| "rewards/format_reward": 0.0, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.856, | |
| "grad_norm": 2.3771109580993652, | |
| "kl": 1.556640625, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0622, | |
| "reward": -0.3614875078201294, | |
| "reward_std": 0.32025381922721863, | |
| "rewards/cosine_scaled_reward": -0.1807437539100647, | |
| "rewards/format_reward": 0.0, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.86, | |
| "grad_norm": 2.940969467163086, | |
| "kl": 1.8828125, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0754, | |
| "reward": -0.29097072361037135, | |
| "reward_std": 0.28063248097896576, | |
| "rewards/cosine_scaled_reward": -0.14548537082737312, | |
| "rewards/format_reward": 0.0, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.864, | |
| "grad_norm": 1.9293019771575928, | |
| "kl": 1.62890625, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0652, | |
| "reward": -0.4154031127691269, | |
| "reward_std": 0.34460632503032684, | |
| "rewards/cosine_scaled_reward": -0.20770153775811195, | |
| "rewards/format_reward": 0.0, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.868, | |
| "grad_norm": 2.745267391204834, | |
| "kl": 2.0888671875, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0835, | |
| "reward": -0.4031589925289154, | |
| "reward_std": 0.31946661323308945, | |
| "rewards/cosine_scaled_reward": -0.2015794888138771, | |
| "rewards/format_reward": 0.0, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.872, | |
| "grad_norm": 2.873622179031372, | |
| "kl": 1.5078125, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0604, | |
| "reward": -0.4128880575299263, | |
| "reward_std": 0.3311196342110634, | |
| "rewards/cosine_scaled_reward": -0.20644402503967285, | |
| "rewards/format_reward": 0.0, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.876, | |
| "grad_norm": 2.7079639434814453, | |
| "kl": 1.2607421875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0505, | |
| "reward": -0.3099018558859825, | |
| "reward_std": 0.2861209958791733, | |
| "rewards/cosine_scaled_reward": -0.15495092794299126, | |
| "rewards/format_reward": 0.0, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.88, | |
| "grad_norm": 1.9640864133834839, | |
| "kl": 1.234375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0493, | |
| "reward": -0.40535254031419754, | |
| "reward_std": 0.2874290943145752, | |
| "rewards/cosine_scaled_reward": -0.20267625898122787, | |
| "rewards/format_reward": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.884, | |
| "grad_norm": 2.130681037902832, | |
| "kl": 1.486328125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0595, | |
| "reward": -0.3594564124941826, | |
| "reward_std": 0.3218042775988579, | |
| "rewards/cosine_scaled_reward": -0.1797281987965107, | |
| "rewards/format_reward": 0.0, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.888, | |
| "grad_norm": 2.1852834224700928, | |
| "kl": 1.48046875, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0591, | |
| "reward": -0.331524558365345, | |
| "reward_std": 0.28531621396541595, | |
| "rewards/cosine_scaled_reward": -0.1657622903585434, | |
| "rewards/format_reward": 0.0, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.892, | |
| "grad_norm": 2.3731930255889893, | |
| "kl": 1.734375, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0693, | |
| "reward": -0.38006093353033066, | |
| "reward_std": 0.3292882591485977, | |
| "rewards/cosine_scaled_reward": -0.19003047049045563, | |
| "rewards/format_reward": 0.0, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.896, | |
| "grad_norm": 2.3246822357177734, | |
| "kl": 1.0390625, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0416, | |
| "reward": -0.3990800455212593, | |
| "reward_std": 0.3413678854703903, | |
| "rewards/cosine_scaled_reward": -0.19954002648591995, | |
| "rewards/format_reward": 0.0, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.9, | |
| "grad_norm": 2.4476959705352783, | |
| "kl": 1.45703125, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0583, | |
| "reward": -0.39841071516275406, | |
| "reward_std": 0.31324755400419235, | |
| "rewards/cosine_scaled_reward": -0.19920538365840912, | |
| "rewards/format_reward": 0.0, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.904, | |
| "grad_norm": 3.0681633949279785, | |
| "kl": 1.75, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0701, | |
| "reward": -0.35963694006204605, | |
| "reward_std": 0.3227182477712631, | |
| "rewards/cosine_scaled_reward": -0.17981846630573273, | |
| "rewards/format_reward": 0.0, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.908, | |
| "grad_norm": 3.8354952335357666, | |
| "kl": 1.5087890625, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0604, | |
| "reward": -0.3886452168226242, | |
| "reward_std": 0.31125637143850327, | |
| "rewards/cosine_scaled_reward": -0.1943226121366024, | |
| "rewards/format_reward": 0.0, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.912, | |
| "grad_norm": 2.3208184242248535, | |
| "kl": 1.39453125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0558, | |
| "reward": -0.34270477294921875, | |
| "reward_std": 0.3698492497205734, | |
| "rewards/cosine_scaled_reward": -0.17135238647460938, | |
| "rewards/format_reward": 0.0, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.916, | |
| "grad_norm": 2.174126386642456, | |
| "kl": 2.009765625, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0804, | |
| "reward": -0.37576939910650253, | |
| "reward_std": 0.3269713968038559, | |
| "rewards/cosine_scaled_reward": -0.18788469955325127, | |
| "rewards/format_reward": 0.0, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.92, | |
| "grad_norm": 2.081784725189209, | |
| "kl": 2.1728515625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0869, | |
| "reward": -0.3998561128973961, | |
| "reward_std": 0.32443511486053467, | |
| "rewards/cosine_scaled_reward": -0.19992805272340775, | |
| "rewards/format_reward": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.924, | |
| "grad_norm": 2.3403866291046143, | |
| "kl": 1.17529296875, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0469, | |
| "reward": -0.38807813823223114, | |
| "reward_std": 0.32711831480264664, | |
| "rewards/cosine_scaled_reward": -0.19403906539082527, | |
| "rewards/format_reward": 0.0, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.928, | |
| "grad_norm": 2.029927968978882, | |
| "kl": 1.32666015625, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0531, | |
| "reward": -0.38948777318000793, | |
| "reward_std": 0.3195284381508827, | |
| "rewards/cosine_scaled_reward": -0.19474387168884277, | |
| "rewards/format_reward": 0.0, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.932, | |
| "grad_norm": 2.9124484062194824, | |
| "kl": 1.71484375, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0685, | |
| "reward": -0.3806769847869873, | |
| "reward_std": 0.2985011041164398, | |
| "rewards/cosine_scaled_reward": -0.19033849611878395, | |
| "rewards/format_reward": 0.0, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.936, | |
| "grad_norm": 2.464742422103882, | |
| "kl": 1.2998046875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.052, | |
| "reward": -0.3443439155817032, | |
| "reward_std": 0.29415207356214523, | |
| "rewards/cosine_scaled_reward": -0.1721719540655613, | |
| "rewards/format_reward": 0.0, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.94, | |
| "grad_norm": 2.1291651725769043, | |
| "kl": 1.001953125, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0401, | |
| "reward": -0.33735504001379013, | |
| "reward_std": 0.28946489840745926, | |
| "rewards/cosine_scaled_reward": -0.16867752373218536, | |
| "rewards/format_reward": 0.0, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.944, | |
| "grad_norm": 2.9513416290283203, | |
| "kl": 1.6201171875, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0649, | |
| "reward": -0.40289320796728134, | |
| "reward_std": 0.30230626463890076, | |
| "rewards/cosine_scaled_reward": -0.20144660398364067, | |
| "rewards/format_reward": 0.0, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.948, | |
| "grad_norm": 3.7395241260528564, | |
| "kl": 1.6240234375, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.065, | |
| "reward": -0.3659610077738762, | |
| "reward_std": 0.32638294249773026, | |
| "rewards/cosine_scaled_reward": -0.1829805038869381, | |
| "rewards/format_reward": 0.0, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.952, | |
| "grad_norm": 2.7872421741485596, | |
| "kl": 1.7919921875, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0718, | |
| "reward": -0.4507276937365532, | |
| "reward_std": 0.35789574682712555, | |
| "rewards/cosine_scaled_reward": -0.2253638356924057, | |
| "rewards/format_reward": 0.0, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.956, | |
| "grad_norm": 2.139983654022217, | |
| "kl": 1.40234375, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.056, | |
| "reward": -0.3731803297996521, | |
| "reward_std": 0.30503255128860474, | |
| "rewards/cosine_scaled_reward": -0.18659016117453575, | |
| "rewards/format_reward": 0.0, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.96, | |
| "grad_norm": 6.420464515686035, | |
| "kl": 2.787109375, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.1116, | |
| "reward": -0.40894675999879837, | |
| "reward_std": 0.3296940475702286, | |
| "rewards/cosine_scaled_reward": -0.20447338744997978, | |
| "rewards/format_reward": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.964, | |
| "grad_norm": 2.4638171195983887, | |
| "kl": 2.1806640625, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0874, | |
| "reward": -0.42437078058719635, | |
| "reward_std": 0.3512648344039917, | |
| "rewards/cosine_scaled_reward": -0.21218538656830788, | |
| "rewards/format_reward": 0.0, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.968, | |
| "grad_norm": 2.8068432807922363, | |
| "kl": 1.884765625, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0754, | |
| "reward": -0.394868440926075, | |
| "reward_std": 0.2916436865925789, | |
| "rewards/cosine_scaled_reward": -0.1974342130124569, | |
| "rewards/format_reward": 0.0, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.972, | |
| "grad_norm": 2.272479295730591, | |
| "kl": 1.453125, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0581, | |
| "reward": -0.36773569136857986, | |
| "reward_std": 0.3104323297739029, | |
| "rewards/cosine_scaled_reward": -0.18386784568428993, | |
| "rewards/format_reward": 0.0, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.976, | |
| "grad_norm": 2.86352276802063, | |
| "kl": 1.8525390625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0742, | |
| "reward": -0.3895353376865387, | |
| "reward_std": 0.30376598984003067, | |
| "rewards/cosine_scaled_reward": -0.19476767256855965, | |
| "rewards/format_reward": 0.0, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.98, | |
| "grad_norm": 3.2674906253814697, | |
| "kl": 1.89453125, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0758, | |
| "reward": -0.35536977648735046, | |
| "reward_std": 0.32461147010326385, | |
| "rewards/cosine_scaled_reward": -0.17768489941954613, | |
| "rewards/format_reward": 0.0, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.984, | |
| "grad_norm": 2.3651580810546875, | |
| "kl": 1.3994140625, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0559, | |
| "reward": -0.2967621465213597, | |
| "reward_std": 0.29580704867839813, | |
| "rewards/cosine_scaled_reward": -0.1483810821082443, | |
| "rewards/format_reward": 0.0, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.988, | |
| "grad_norm": 2.6290199756622314, | |
| "kl": 1.544921875, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0618, | |
| "reward": -0.3732440918684006, | |
| "reward_std": 0.28786107152700424, | |
| "rewards/cosine_scaled_reward": -0.1866220459342003, | |
| "rewards/format_reward": 0.0, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.992, | |
| "grad_norm": 2.474320650100708, | |
| "kl": 1.18359375, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0473, | |
| "reward": -0.3813322111964226, | |
| "reward_std": 0.3196609243750572, | |
| "rewards/cosine_scaled_reward": -0.1906661055982113, | |
| "rewards/format_reward": 0.0, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 0.996, | |
| "grad_norm": 2.4096460342407227, | |
| "kl": 1.185546875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0475, | |
| "reward": -0.37723246961832047, | |
| "reward_std": 0.32298891991376877, | |
| "rewards/cosine_scaled_reward": -0.18861623480916023, | |
| "rewards/format_reward": 0.0, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0001220703125, | |
| "epoch": 1.0, | |
| "grad_norm": 2.414369821548462, | |
| "kl": 1.1552734375, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0462, | |
| "reward": -0.3446759209036827, | |
| "reward_std": 0.30413854122161865, | |
| "rewards/cosine_scaled_reward": -0.17233795672655106, | |
| "rewards/format_reward": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.004, | |
| "grad_norm": 2.3181285858154297, | |
| "kl": 1.4765625, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0591, | |
| "reward": -0.39850035309791565, | |
| "reward_std": 0.3559228628873825, | |
| "rewards/cosine_scaled_reward": -0.19925018772482872, | |
| "rewards/format_reward": 0.0, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.008, | |
| "grad_norm": 2.3214640617370605, | |
| "kl": 1.59375, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0637, | |
| "reward": -0.3477981239557266, | |
| "reward_std": 0.3031875118613243, | |
| "rewards/cosine_scaled_reward": -0.1738990694284439, | |
| "rewards/format_reward": 0.0, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.012, | |
| "grad_norm": 2.4848833084106445, | |
| "kl": 1.6416015625, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0657, | |
| "reward": -0.402904212474823, | |
| "reward_std": 0.32011619955301285, | |
| "rewards/cosine_scaled_reward": -0.2014521062374115, | |
| "rewards/format_reward": 0.0, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.016, | |
| "grad_norm": 7.0177903175354, | |
| "kl": 3.015625, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.1206, | |
| "reward": -0.41366545110940933, | |
| "reward_std": 0.3347878158092499, | |
| "rewards/cosine_scaled_reward": -0.20683272555470467, | |
| "rewards/format_reward": 0.0, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.3928527832031, | |
| "epoch": 1.02, | |
| "grad_norm": 2.5155041217803955, | |
| "kl": 1.818359375, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.073, | |
| "reward": -0.41607701033353806, | |
| "reward_std": 0.33659277111291885, | |
| "rewards/cosine_scaled_reward": -0.20803850889205933, | |
| "rewards/format_reward": 0.0, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.024, | |
| "grad_norm": 3.175401449203491, | |
| "kl": 2.349609375, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.094, | |
| "reward": -0.3731570616364479, | |
| "reward_std": 0.3251727372407913, | |
| "rewards/cosine_scaled_reward": -0.18657853826880455, | |
| "rewards/format_reward": 0.0, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.028, | |
| "grad_norm": 2.345123052597046, | |
| "kl": 2.140625, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0857, | |
| "reward": -0.4059467390179634, | |
| "reward_std": 0.3182907700538635, | |
| "rewards/cosine_scaled_reward": -0.2029733695089817, | |
| "rewards/format_reward": 0.0, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.032, | |
| "grad_norm": 2.636462688446045, | |
| "kl": 1.705078125, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.068, | |
| "reward": -0.343365378677845, | |
| "reward_std": 0.3163585662841797, | |
| "rewards/cosine_scaled_reward": -0.1716826893389225, | |
| "rewards/format_reward": 0.0, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.036, | |
| "grad_norm": 2.297900438308716, | |
| "kl": 1.51953125, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0608, | |
| "reward": -0.3703172579407692, | |
| "reward_std": 0.3630036562681198, | |
| "rewards/cosine_scaled_reward": -0.1851586326956749, | |
| "rewards/format_reward": 0.0, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.04, | |
| "grad_norm": 2.311648368835449, | |
| "kl": 1.515625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0605, | |
| "reward": -0.3789840117096901, | |
| "reward_std": 0.330322228372097, | |
| "rewards/cosine_scaled_reward": -0.18949199840426445, | |
| "rewards/format_reward": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.044, | |
| "grad_norm": 2.3599531650543213, | |
| "kl": 1.78515625, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0714, | |
| "reward": -0.3447503596544266, | |
| "reward_std": 0.33612456917762756, | |
| "rewards/cosine_scaled_reward": -0.17237518727779388, | |
| "rewards/format_reward": 0.0, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1527.6190490722656, | |
| "epoch": 1.048, | |
| "grad_norm": 2.2337074279785156, | |
| "kl": 1.890625, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0786, | |
| "reward": -0.39859064668416977, | |
| "reward_std": 0.32645051926374435, | |
| "rewards/cosine_scaled_reward": -0.1992953196167946, | |
| "rewards/format_reward": 0.0, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.052, | |
| "grad_norm": 2.818617582321167, | |
| "kl": 1.55859375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0624, | |
| "reward": -0.3537183925509453, | |
| "reward_std": 0.309035487473011, | |
| "rewards/cosine_scaled_reward": -0.17685920372605324, | |
| "rewards/format_reward": 0.0, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.056, | |
| "grad_norm": 2.3533854484558105, | |
| "kl": 1.3583984375, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0543, | |
| "reward": -0.3672221526503563, | |
| "reward_std": 0.31650061905384064, | |
| "rewards/cosine_scaled_reward": -0.18361108005046844, | |
| "rewards/format_reward": 0.0, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.06, | |
| "grad_norm": 3.936475992202759, | |
| "kl": 2.265625, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0907, | |
| "reward": -0.36572812497615814, | |
| "reward_std": 0.2912697494029999, | |
| "rewards/cosine_scaled_reward": -0.18286405876278877, | |
| "rewards/format_reward": 0.0, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.064, | |
| "grad_norm": 2.754866600036621, | |
| "kl": 1.943359375, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0777, | |
| "reward": -0.37356945127248764, | |
| "reward_std": 0.34380726516246796, | |
| "rewards/cosine_scaled_reward": -0.18678472936153412, | |
| "rewards/format_reward": 0.0, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.068, | |
| "grad_norm": 2.374964952468872, | |
| "kl": 1.4267578125, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0571, | |
| "reward": -0.3651036322116852, | |
| "reward_std": 0.30468039214611053, | |
| "rewards/cosine_scaled_reward": -0.1825518161058426, | |
| "rewards/format_reward": 0.0, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.0535888671875, | |
| "epoch": 1.072, | |
| "grad_norm": 2.618032693862915, | |
| "kl": 1.6171875, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0651, | |
| "reward": -0.35353927314281464, | |
| "reward_std": 0.3086354061961174, | |
| "rewards/cosine_scaled_reward": -0.17676963657140732, | |
| "rewards/format_reward": 0.0, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.076, | |
| "grad_norm": 2.920133590698242, | |
| "kl": 1.8515625, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.074, | |
| "reward": -0.37572528421878815, | |
| "reward_std": 0.33292342722415924, | |
| "rewards/cosine_scaled_reward": -0.18786264210939407, | |
| "rewards/format_reward": 0.0, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.08, | |
| "grad_norm": 2.581885576248169, | |
| "kl": 1.830078125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0732, | |
| "reward": -0.34584221988916397, | |
| "reward_std": 0.3140456974506378, | |
| "rewards/cosine_scaled_reward": -0.17292110994458199, | |
| "rewards/format_reward": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.084, | |
| "grad_norm": 8.366601943969727, | |
| "kl": 2.509765625, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.1003, | |
| "reward": -0.37314866855740547, | |
| "reward_std": 0.2792880907654762, | |
| "rewards/cosine_scaled_reward": -0.18657432682812214, | |
| "rewards/format_reward": 0.0, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.088, | |
| "grad_norm": 3.071047067642212, | |
| "kl": 1.9658203125, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0785, | |
| "reward": -0.39643432199954987, | |
| "reward_std": 0.31065937131643295, | |
| "rewards/cosine_scaled_reward": -0.19821715354919434, | |
| "rewards/format_reward": 0.0, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.092, | |
| "grad_norm": 3.8571436405181885, | |
| "kl": 1.2626953125, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0506, | |
| "reward": -0.3136083036661148, | |
| "reward_std": 0.28241100907325745, | |
| "rewards/cosine_scaled_reward": -0.1568041555583477, | |
| "rewards/format_reward": 0.0, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.096, | |
| "grad_norm": 2.1380457878112793, | |
| "kl": 1.96875, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0786, | |
| "reward": -0.35791803896427155, | |
| "reward_std": 0.3191326707601547, | |
| "rewards/cosine_scaled_reward": -0.17895901948213577, | |
| "rewards/format_reward": 0.0, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.1, | |
| "grad_norm": 3.744987964630127, | |
| "kl": 2.048828125, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0819, | |
| "reward": -0.3743599057197571, | |
| "reward_std": 0.3121279552578926, | |
| "rewards/cosine_scaled_reward": -0.18717995658516884, | |
| "rewards/format_reward": 0.0, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.104, | |
| "grad_norm": 2.783698081970215, | |
| "kl": 1.8984375, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0761, | |
| "reward": -0.3865007609128952, | |
| "reward_std": 0.322613961994648, | |
| "rewards/cosine_scaled_reward": -0.1932503841817379, | |
| "rewards/format_reward": 0.0, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.108, | |
| "grad_norm": 3.2086503505706787, | |
| "kl": 1.865234375, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0746, | |
| "reward": -0.41129884123802185, | |
| "reward_std": 0.3018573820590973, | |
| "rewards/cosine_scaled_reward": -0.20564941689372063, | |
| "rewards/format_reward": 0.0, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.112, | |
| "grad_norm": 2.4078729152679443, | |
| "kl": 1.4072265625, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0563, | |
| "reward": -0.39701489359140396, | |
| "reward_std": 0.3126164525747299, | |
| "rewards/cosine_scaled_reward": -0.19850744307041168, | |
| "rewards/format_reward": 0.0, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.116, | |
| "grad_norm": 2.5043461322784424, | |
| "kl": 2.35546875, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0944, | |
| "reward": -0.28278425987809896, | |
| "reward_std": 0.2714259997010231, | |
| "rewards/cosine_scaled_reward": -0.1413921354105696, | |
| "rewards/format_reward": 0.0, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.6190490722656, | |
| "epoch": 1.12, | |
| "grad_norm": 4.991820335388184, | |
| "kl": 1.83984375, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0713, | |
| "reward": -0.3403998464345932, | |
| "reward_std": 0.3223363533616066, | |
| "rewards/cosine_scaled_reward": -0.1701999232172966, | |
| "rewards/format_reward": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1534.0476379394531, | |
| "epoch": 1.124, | |
| "grad_norm": 2.818126916885376, | |
| "kl": 1.37890625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.054, | |
| "reward": -0.3611769676208496, | |
| "reward_std": 0.3213232010602951, | |
| "rewards/cosine_scaled_reward": -0.1805884800851345, | |
| "rewards/format_reward": 0.0, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 2.7234742641448975, | |
| "kl": 2.248046875, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0899, | |
| "reward": -0.4201104864478111, | |
| "reward_std": 0.3131628781557083, | |
| "rewards/cosine_scaled_reward": -0.21005523577332497, | |
| "rewards/format_reward": 0.0, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.1320000000000001, | |
| "grad_norm": 6.938405990600586, | |
| "kl": 1.998046875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0799, | |
| "reward": -0.33411792665719986, | |
| "reward_std": 0.32330870628356934, | |
| "rewards/cosine_scaled_reward": -0.16705895960330963, | |
| "rewards/format_reward": 0.0, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 3.5663974285125732, | |
| "kl": 1.3349609375, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0534, | |
| "reward": -0.3633820191025734, | |
| "reward_std": 0.31287185102701187, | |
| "rewards/cosine_scaled_reward": -0.1816909983754158, | |
| "rewards/format_reward": 0.0, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.1400000000000001, | |
| "grad_norm": 2.0476882457733154, | |
| "kl": 1.708984375, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0684, | |
| "reward": -0.3689531907439232, | |
| "reward_std": 0.32297470420598984, | |
| "rewards/cosine_scaled_reward": -0.184476587921381, | |
| "rewards/format_reward": 0.0, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1526.8869018554688, | |
| "epoch": 1.144, | |
| "grad_norm": 12.345512390136719, | |
| "kl": 2.966796875, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.1254, | |
| "reward": -0.3650151863694191, | |
| "reward_std": 0.31899186968803406, | |
| "rewards/cosine_scaled_reward": -0.18250760063529015, | |
| "rewards/format_reward": 0.0, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.148, | |
| "grad_norm": 2.059617519378662, | |
| "kl": 2.291015625, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0916, | |
| "reward": -0.3706892877817154, | |
| "reward_std": 0.32747378945350647, | |
| "rewards/cosine_scaled_reward": -0.1853446513414383, | |
| "rewards/format_reward": 0.0, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.152, | |
| "grad_norm": 3.889174699783325, | |
| "kl": 2.0859375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0834, | |
| "reward": -0.4078289121389389, | |
| "reward_std": 0.3290611281991005, | |
| "rewards/cosine_scaled_reward": -0.20391445606946945, | |
| "rewards/format_reward": 0.0, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.2440490722656, | |
| "epoch": 1.156, | |
| "grad_norm": 2.5038888454437256, | |
| "kl": 0.93896484375, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0354, | |
| "reward": -0.34110401570796967, | |
| "reward_std": 0.3122602626681328, | |
| "rewards/cosine_scaled_reward": -0.17055201157927513, | |
| "rewards/format_reward": 0.0, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.16, | |
| "grad_norm": 2.39719557762146, | |
| "kl": 1.8583984375, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0744, | |
| "reward": -0.36911261081695557, | |
| "reward_std": 0.3288589343428612, | |
| "rewards/cosine_scaled_reward": -0.1845562942326069, | |
| "rewards/format_reward": 0.0, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.164, | |
| "grad_norm": 2.758849620819092, | |
| "kl": 1.626953125, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0651, | |
| "reward": -0.3935117796063423, | |
| "reward_std": 0.3461349532008171, | |
| "rewards/cosine_scaled_reward": -0.19675587862730026, | |
| "rewards/format_reward": 0.0, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.168, | |
| "grad_norm": 2.310575246810913, | |
| "kl": 1.455078125, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0583, | |
| "reward": -0.34184807538986206, | |
| "reward_std": 0.3021695464849472, | |
| "rewards/cosine_scaled_reward": -0.17092403396964073, | |
| "rewards/format_reward": 0.0, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.172, | |
| "grad_norm": 2.8417394161224365, | |
| "kl": 1.861328125, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0744, | |
| "reward": -0.3772461339831352, | |
| "reward_std": 0.3044436201453209, | |
| "rewards/cosine_scaled_reward": -0.18862305954098701, | |
| "rewards/format_reward": 0.0, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.176, | |
| "grad_norm": 2.347404956817627, | |
| "kl": 1.28125, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0513, | |
| "reward": -0.3517310842871666, | |
| "reward_std": 0.3094722405076027, | |
| "rewards/cosine_scaled_reward": -0.1758655458688736, | |
| "rewards/format_reward": 0.0, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1535.6130981445312, | |
| "epoch": 1.18, | |
| "grad_norm": 2.7739925384521484, | |
| "kl": 1.833984375, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0731, | |
| "reward": -0.4288819953799248, | |
| "reward_std": 0.3247087821364403, | |
| "rewards/cosine_scaled_reward": -0.2144409976899624, | |
| "rewards/format_reward": 0.0, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.184, | |
| "grad_norm": 2.1470892429351807, | |
| "kl": 1.296875, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0519, | |
| "reward": -0.35219819098711014, | |
| "reward_std": 0.3056294918060303, | |
| "rewards/cosine_scaled_reward": -0.17609910294413567, | |
| "rewards/format_reward": 0.0, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.188, | |
| "grad_norm": 3.177232503890991, | |
| "kl": 1.677734375, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0671, | |
| "reward": -0.3717339485883713, | |
| "reward_std": 0.29695921391248703, | |
| "rewards/cosine_scaled_reward": -0.18586697429418564, | |
| "rewards/format_reward": 0.0, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.192, | |
| "grad_norm": 3.3333382606506348, | |
| "kl": 2.322265625, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.093, | |
| "reward": -0.3814833015203476, | |
| "reward_std": 0.28608307987451553, | |
| "rewards/cosine_scaled_reward": -0.1907416470348835, | |
| "rewards/format_reward": 0.0, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.196, | |
| "grad_norm": 2.842420816421509, | |
| "kl": 1.45703125, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0583, | |
| "reward": -0.3840809538960457, | |
| "reward_std": 0.31393957883119583, | |
| "rewards/cosine_scaled_reward": -0.19204047322273254, | |
| "rewards/format_reward": 0.0, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.2, | |
| "grad_norm": 2.9220309257507324, | |
| "kl": 1.681640625, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0672, | |
| "reward": -0.39588408917188644, | |
| "reward_std": 0.33600132539868355, | |
| "rewards/cosine_scaled_reward": -0.19794204831123352, | |
| "rewards/format_reward": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.204, | |
| "grad_norm": 3.4091219902038574, | |
| "kl": 1.44140625, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0576, | |
| "reward": -0.2894315180601552, | |
| "reward_std": 0.30969203263521194, | |
| "rewards/cosine_scaled_reward": -0.14471576345385984, | |
| "rewards/format_reward": 0.0, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.208, | |
| "grad_norm": 2.0488741397857666, | |
| "kl": 1.5576171875, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0624, | |
| "reward": -0.32318826019763947, | |
| "reward_std": 0.3031533695757389, | |
| "rewards/cosine_scaled_reward": -0.16159413009881973, | |
| "rewards/format_reward": 0.0, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.212, | |
| "grad_norm": 2.6755242347717285, | |
| "kl": 1.34765625, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0539, | |
| "reward": -0.37002843618392944, | |
| "reward_std": 0.31058184802532196, | |
| "rewards/cosine_scaled_reward": -0.18501422181725502, | |
| "rewards/format_reward": 0.0, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.216, | |
| "grad_norm": 6.160266399383545, | |
| "kl": 1.734375, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0694, | |
| "reward": -0.38714154064655304, | |
| "reward_std": 0.3265160173177719, | |
| "rewards/cosine_scaled_reward": -0.19357078149914742, | |
| "rewards/format_reward": 0.0, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.22, | |
| "grad_norm": 2.3529880046844482, | |
| "kl": 1.2138671875, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0486, | |
| "reward": -0.3460870534181595, | |
| "reward_std": 0.3087117671966553, | |
| "rewards/cosine_scaled_reward": -0.17304353043437004, | |
| "rewards/format_reward": 0.0, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.224, | |
| "grad_norm": 2.48714280128479, | |
| "kl": 1.9453125, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0779, | |
| "reward": -0.3756335750222206, | |
| "reward_std": 0.32805445045232773, | |
| "rewards/cosine_scaled_reward": -0.1878167800605297, | |
| "rewards/format_reward": 0.0, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.228, | |
| "grad_norm": 8.46654987335205, | |
| "kl": 2.5107421875, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.1004, | |
| "reward": -0.385331392288208, | |
| "reward_std": 0.31344960629940033, | |
| "rewards/cosine_scaled_reward": -0.1926657035946846, | |
| "rewards/format_reward": 0.0, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.232, | |
| "grad_norm": 3.198944568634033, | |
| "kl": 2.140625, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0857, | |
| "reward": -0.36118319630622864, | |
| "reward_std": 0.3010380119085312, | |
| "rewards/cosine_scaled_reward": -0.18059159815311432, | |
| "rewards/format_reward": 0.0, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.236, | |
| "grad_norm": 2.745668411254883, | |
| "kl": 2.033203125, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0813, | |
| "reward": -0.3596822917461395, | |
| "reward_std": 0.3092067465186119, | |
| "rewards/cosine_scaled_reward": -0.17984114587306976, | |
| "rewards/format_reward": 0.0, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.24, | |
| "grad_norm": 5.614748954772949, | |
| "kl": 2.34375, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.094, | |
| "reward": -0.34773094952106476, | |
| "reward_std": 0.29645886272192, | |
| "rewards/cosine_scaled_reward": -0.17386547103524208, | |
| "rewards/format_reward": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.244, | |
| "grad_norm": 2.089031219482422, | |
| "kl": 1.39453125, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0558, | |
| "reward": -0.33028923720121384, | |
| "reward_std": 0.2886582836508751, | |
| "rewards/cosine_scaled_reward": -0.16514462232589722, | |
| "rewards/format_reward": 0.0, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.248, | |
| "grad_norm": 5.366787433624268, | |
| "kl": 2.9599609375, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.1186, | |
| "reward": -0.4123021811246872, | |
| "reward_std": 0.337029866874218, | |
| "rewards/cosine_scaled_reward": -0.206151083111763, | |
| "rewards/format_reward": 0.0, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.252, | |
| "grad_norm": 8.391505241394043, | |
| "kl": 1.953125, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.078, | |
| "reward": -0.3487403020262718, | |
| "reward_std": 0.3276291638612747, | |
| "rewards/cosine_scaled_reward": -0.1743701510131359, | |
| "rewards/format_reward": 0.0, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.256, | |
| "grad_norm": 2.623786449432373, | |
| "kl": 1.3193359375, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0528, | |
| "reward": -0.32606934756040573, | |
| "reward_std": 0.28208620101213455, | |
| "rewards/cosine_scaled_reward": -0.16303467005491257, | |
| "rewards/format_reward": 0.0, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.26, | |
| "grad_norm": 2.2247447967529297, | |
| "kl": 1.884765625, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0755, | |
| "reward": -0.2273978427692782, | |
| "reward_std": 0.28098014742136, | |
| "rewards/cosine_scaled_reward": -0.11369891960930545, | |
| "rewards/format_reward": 0.0, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.264, | |
| "grad_norm": 2.258469581604004, | |
| "kl": 1.14453125, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0457, | |
| "reward": -0.24764333851635456, | |
| "reward_std": 0.2835834100842476, | |
| "rewards/cosine_scaled_reward": -0.12382166367024183, | |
| "rewards/format_reward": 0.0, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.268, | |
| "grad_norm": 2.884620189666748, | |
| "kl": 1.5986328125, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.064, | |
| "reward": -0.37140634655952454, | |
| "reward_std": 0.36573630571365356, | |
| "rewards/cosine_scaled_reward": -0.18570317327976227, | |
| "rewards/format_reward": 0.0, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.272, | |
| "grad_norm": 2.703934669494629, | |
| "kl": 1.912109375, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0763, | |
| "reward": -0.34411681443452835, | |
| "reward_std": 0.29631946235895157, | |
| "rewards/cosine_scaled_reward": -0.17205841839313507, | |
| "rewards/format_reward": 0.0, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.276, | |
| "grad_norm": 3.717240571975708, | |
| "kl": 2.224609375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0891, | |
| "reward": -0.3324529230594635, | |
| "reward_std": 0.2552623227238655, | |
| "rewards/cosine_scaled_reward": -0.16622646152973175, | |
| "rewards/format_reward": 0.0, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.28, | |
| "grad_norm": 2.4941396713256836, | |
| "kl": 1.384765625, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0555, | |
| "reward": -0.30811577290296555, | |
| "reward_std": 0.2845884971320629, | |
| "rewards/cosine_scaled_reward": -0.15405788272619247, | |
| "rewards/format_reward": 0.0, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.284, | |
| "grad_norm": 3.229072332382202, | |
| "kl": 1.9453125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0778, | |
| "reward": -0.3366442248225212, | |
| "reward_std": 0.301740899682045, | |
| "rewards/cosine_scaled_reward": -0.1683221124112606, | |
| "rewards/format_reward": 0.0, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.288, | |
| "grad_norm": 3.3636343479156494, | |
| "kl": 1.8828125, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0752, | |
| "reward": -0.36845648288726807, | |
| "reward_std": 0.34283190220594406, | |
| "rewards/cosine_scaled_reward": -0.18422825261950493, | |
| "rewards/format_reward": 0.0, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.292, | |
| "grad_norm": 3.507054090499878, | |
| "kl": 1.4130859375, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0566, | |
| "reward": -0.34711746126413345, | |
| "reward_std": 0.2960944324731827, | |
| "rewards/cosine_scaled_reward": -0.17355873063206673, | |
| "rewards/format_reward": 0.0, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.296, | |
| "grad_norm": 2.661647081375122, | |
| "kl": 1.736328125, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0694, | |
| "reward": -0.33277176320552826, | |
| "reward_std": 0.3034566268324852, | |
| "rewards/cosine_scaled_reward": -0.16638587787747383, | |
| "rewards/format_reward": 0.0, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3, | |
| "grad_norm": 3.079672336578369, | |
| "kl": 1.359375, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0544, | |
| "reward": -0.3246685415506363, | |
| "reward_std": 0.27341699600219727, | |
| "rewards/cosine_scaled_reward": -0.16233427450060844, | |
| "rewards/format_reward": 0.0, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.304, | |
| "grad_norm": 3.248324394226074, | |
| "kl": 1.1181640625, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0447, | |
| "reward": -0.3214203119277954, | |
| "reward_std": 0.2835453376173973, | |
| "rewards/cosine_scaled_reward": -0.160710159689188, | |
| "rewards/format_reward": 0.0, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.308, | |
| "grad_norm": 3.676837205886841, | |
| "kl": 1.724609375, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.069, | |
| "reward": -0.32949286699295044, | |
| "reward_std": 0.30344782024621964, | |
| "rewards/cosine_scaled_reward": -0.16474644094705582, | |
| "rewards/format_reward": 0.0, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.312, | |
| "grad_norm": 2.3120462894439697, | |
| "kl": 1.537109375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0615, | |
| "reward": -0.3512613996863365, | |
| "reward_std": 0.3501633331179619, | |
| "rewards/cosine_scaled_reward": -0.17563070356845856, | |
| "rewards/format_reward": 0.0, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.316, | |
| "grad_norm": 2.4828386306762695, | |
| "kl": 1.6953125, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0677, | |
| "reward": -0.31614498794078827, | |
| "reward_std": 0.29276788979768753, | |
| "rewards/cosine_scaled_reward": -0.15807249024510384, | |
| "rewards/format_reward": 0.0, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.32, | |
| "grad_norm": 3.356783151626587, | |
| "kl": 2.453125, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0982, | |
| "reward": -0.4576185494661331, | |
| "reward_std": 0.32832735031843185, | |
| "rewards/cosine_scaled_reward": -0.22880928218364716, | |
| "rewards/format_reward": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.324, | |
| "grad_norm": 2.7885196208953857, | |
| "kl": 2.068359375, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0827, | |
| "reward": -0.2943090833723545, | |
| "reward_std": 0.31652648001909256, | |
| "rewards/cosine_scaled_reward": -0.14715453796088696, | |
| "rewards/format_reward": 0.0, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.328, | |
| "grad_norm": 3.0415380001068115, | |
| "kl": 1.802734375, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0721, | |
| "reward": -0.3697570115327835, | |
| "reward_std": 0.3258262947201729, | |
| "rewards/cosine_scaled_reward": -0.18487850576639175, | |
| "rewards/format_reward": 0.0, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.332, | |
| "grad_norm": 3.139693021774292, | |
| "kl": 1.732421875, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0693, | |
| "reward": -0.33471549302339554, | |
| "reward_std": 0.2794983647763729, | |
| "rewards/cosine_scaled_reward": -0.16735775396227837, | |
| "rewards/format_reward": 0.0, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.336, | |
| "grad_norm": 2.6243162155151367, | |
| "kl": 1.8369140625, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0733, | |
| "reward": -0.3382048085331917, | |
| "reward_std": 0.3457643389701843, | |
| "rewards/cosine_scaled_reward": -0.16910240054130554, | |
| "rewards/format_reward": 0.0, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.34, | |
| "grad_norm": 3.803060293197632, | |
| "kl": 1.8046875, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0723, | |
| "reward": -0.3406166359782219, | |
| "reward_std": 0.29876144975423813, | |
| "rewards/cosine_scaled_reward": -0.17030831426382065, | |
| "rewards/format_reward": 0.0, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 3.948391914367676, | |
| "kl": 1.365234375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0546, | |
| "reward": -0.2908342033624649, | |
| "reward_std": 0.26911235228180885, | |
| "rewards/cosine_scaled_reward": -0.14541710540652275, | |
| "rewards/format_reward": 0.0, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3479999999999999, | |
| "grad_norm": 2.9695639610290527, | |
| "kl": 2.19921875, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0881, | |
| "reward": -0.37160656601190567, | |
| "reward_std": 0.3147331103682518, | |
| "rewards/cosine_scaled_reward": -0.18580328300595284, | |
| "rewards/format_reward": 0.0, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 3.1350209712982178, | |
| "kl": 2.1689453125, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.087, | |
| "reward": -0.3230074942111969, | |
| "reward_std": 0.313438281416893, | |
| "rewards/cosine_scaled_reward": -0.16150375083088875, | |
| "rewards/format_reward": 0.0, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3559999999999999, | |
| "grad_norm": 3.882567882537842, | |
| "kl": 2.0546875, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0821, | |
| "reward": -0.36975327879190445, | |
| "reward_std": 0.31242573261260986, | |
| "rewards/cosine_scaled_reward": -0.18487663567066193, | |
| "rewards/format_reward": 0.0, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 2.6699118614196777, | |
| "kl": 1.689453125, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0676, | |
| "reward": -0.368961863219738, | |
| "reward_std": 0.32627636194229126, | |
| "rewards/cosine_scaled_reward": -0.1844809353351593, | |
| "rewards/format_reward": 0.0, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3639999999999999, | |
| "grad_norm": 3.0782856941223145, | |
| "kl": 1.59765625, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.064, | |
| "reward": -0.3320116475224495, | |
| "reward_std": 0.3151276856660843, | |
| "rewards/cosine_scaled_reward": -0.16600582748651505, | |
| "rewards/format_reward": 0.0, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 2.2419495582580566, | |
| "kl": 1.46484375, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0585, | |
| "reward": -0.2764207161962986, | |
| "reward_std": 0.3390573188662529, | |
| "rewards/cosine_scaled_reward": -0.1382103539071977, | |
| "rewards/format_reward": 0.0, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.3719999999999999, | |
| "grad_norm": 4.397972106933594, | |
| "kl": 2.5, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.1002, | |
| "reward": -0.33926407247781754, | |
| "reward_std": 0.31172922998666763, | |
| "rewards/cosine_scaled_reward": -0.16963203251361847, | |
| "rewards/format_reward": 0.0, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.376, | |
| "grad_norm": 3.441905975341797, | |
| "kl": 2.0234375, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0808, | |
| "reward": -0.3324861600995064, | |
| "reward_std": 0.2958858981728554, | |
| "rewards/cosine_scaled_reward": -0.1662430725991726, | |
| "rewards/format_reward": 0.0, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.38, | |
| "grad_norm": 2.7323975563049316, | |
| "kl": 1.4189453125, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0566, | |
| "reward": -0.3314187452197075, | |
| "reward_std": 0.3164066970348358, | |
| "rewards/cosine_scaled_reward": -0.16570937633514404, | |
| "rewards/format_reward": 0.0, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.384, | |
| "grad_norm": 4.131885528564453, | |
| "kl": 2.45703125, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0983, | |
| "reward": -0.37432391941547394, | |
| "reward_std": 0.33136965334415436, | |
| "rewards/cosine_scaled_reward": -0.18716195970773697, | |
| "rewards/format_reward": 0.0, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.388, | |
| "grad_norm": 2.9907569885253906, | |
| "kl": 1.732421875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0693, | |
| "reward": -0.38256606459617615, | |
| "reward_std": 0.31782740354537964, | |
| "rewards/cosine_scaled_reward": -0.19128303229808807, | |
| "rewards/format_reward": 0.0, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.392, | |
| "grad_norm": 2.6049344539642334, | |
| "kl": 1.53515625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0613, | |
| "reward": -0.2997368350625038, | |
| "reward_std": 0.3045838475227356, | |
| "rewards/cosine_scaled_reward": -0.1498684138059616, | |
| "rewards/format_reward": 0.0, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.396, | |
| "grad_norm": 4.5095295906066895, | |
| "kl": 1.5029296875, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0602, | |
| "reward": -0.3363025635480881, | |
| "reward_std": 0.30865515023469925, | |
| "rewards/cosine_scaled_reward": -0.16815128177404404, | |
| "rewards/format_reward": 0.0, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.4, | |
| "grad_norm": 3.3342795372009277, | |
| "kl": 1.908203125, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0762, | |
| "reward": -0.3770889565348625, | |
| "reward_std": 0.30710920691490173, | |
| "rewards/cosine_scaled_reward": -0.18854447081685066, | |
| "rewards/format_reward": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.404, | |
| "grad_norm": 2.795259714126587, | |
| "kl": 2.048828125, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.082, | |
| "reward": -0.3462035730481148, | |
| "reward_std": 0.32692621648311615, | |
| "rewards/cosine_scaled_reward": -0.1731017865240574, | |
| "rewards/format_reward": 0.0, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.408, | |
| "grad_norm": 2.563765287399292, | |
| "kl": 1.462890625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0586, | |
| "reward": -0.37373943626880646, | |
| "reward_std": 0.3041759356856346, | |
| "rewards/cosine_scaled_reward": -0.18686972558498383, | |
| "rewards/format_reward": 0.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.412, | |
| "grad_norm": 2.6194751262664795, | |
| "kl": 1.24609375, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0498, | |
| "reward": -0.3196728527545929, | |
| "reward_std": 0.2953634150326252, | |
| "rewards/cosine_scaled_reward": -0.15983642637729645, | |
| "rewards/format_reward": 0.0, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.416, | |
| "grad_norm": 2.8382420539855957, | |
| "kl": 1.650390625, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0662, | |
| "reward": -0.33513225615024567, | |
| "reward_std": 0.30527665093541145, | |
| "rewards/cosine_scaled_reward": -0.16756613552570343, | |
| "rewards/format_reward": 0.0, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.42, | |
| "grad_norm": 2.6078808307647705, | |
| "kl": 2.15234375, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.086, | |
| "reward": -0.3992829695343971, | |
| "reward_std": 0.31726495921611786, | |
| "rewards/cosine_scaled_reward": -0.19964147731661797, | |
| "rewards/format_reward": 0.0, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.424, | |
| "grad_norm": 4.192615985870361, | |
| "kl": 2.142578125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0858, | |
| "reward": -0.39319509267807007, | |
| "reward_std": 0.3372880816459656, | |
| "rewards/cosine_scaled_reward": -0.19659754261374474, | |
| "rewards/format_reward": 0.0, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.428, | |
| "grad_norm": 3.196894407272339, | |
| "kl": 2.509765625, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.1006, | |
| "reward": -0.3694089204072952, | |
| "reward_std": 0.323252871632576, | |
| "rewards/cosine_scaled_reward": -0.1847044676542282, | |
| "rewards/format_reward": 0.0, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.432, | |
| "grad_norm": 3.348161458969116, | |
| "kl": 1.1142578125, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0447, | |
| "reward": -0.36088229715824127, | |
| "reward_std": 0.31483449041843414, | |
| "rewards/cosine_scaled_reward": -0.18044114857912064, | |
| "rewards/format_reward": 0.0, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.436, | |
| "grad_norm": 3.457472324371338, | |
| "kl": 2.2265625, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.089, | |
| "reward": -0.3612442761659622, | |
| "reward_std": 0.28438059240579605, | |
| "rewards/cosine_scaled_reward": -0.1806221418082714, | |
| "rewards/format_reward": 0.0, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.44, | |
| "grad_norm": 3.285405397415161, | |
| "kl": 2.076171875, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0831, | |
| "reward": -0.32887883111834526, | |
| "reward_std": 0.3107897564768791, | |
| "rewards/cosine_scaled_reward": -0.16443941928446293, | |
| "rewards/format_reward": 0.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.444, | |
| "grad_norm": 2.9156711101531982, | |
| "kl": 1.7646484375, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0706, | |
| "reward": -0.3512116149067879, | |
| "reward_std": 0.32886873185634613, | |
| "rewards/cosine_scaled_reward": -0.17560580000281334, | |
| "rewards/format_reward": 0.0, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.448, | |
| "grad_norm": 2.42704439163208, | |
| "kl": 1.697265625, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0679, | |
| "reward": -0.3639722764492035, | |
| "reward_std": 0.2881170064210892, | |
| "rewards/cosine_scaled_reward": -0.18198613449931145, | |
| "rewards/format_reward": 0.0, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.452, | |
| "grad_norm": 4.5008225440979, | |
| "kl": 2.177734375, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.087, | |
| "reward": -0.3515865206718445, | |
| "reward_std": 0.290123887360096, | |
| "rewards/cosine_scaled_reward": -0.17579325661063194, | |
| "rewards/format_reward": 0.0, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.456, | |
| "grad_norm": 2.7479496002197266, | |
| "kl": 1.578125, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0632, | |
| "reward": -0.26583924936130643, | |
| "reward_std": 0.29539088532328606, | |
| "rewards/cosine_scaled_reward": -0.13291961723007262, | |
| "rewards/format_reward": 0.0, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.46, | |
| "grad_norm": 2.6749367713928223, | |
| "kl": 2.1796875, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.087, | |
| "reward": -0.36106909811496735, | |
| "reward_std": 0.2982637956738472, | |
| "rewards/cosine_scaled_reward": -0.18053454905748367, | |
| "rewards/format_reward": 0.0, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.464, | |
| "grad_norm": 3.6434812545776367, | |
| "kl": 1.4482421875, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.058, | |
| "reward": -0.35805001854896545, | |
| "reward_std": 0.31588251888751984, | |
| "rewards/cosine_scaled_reward": -0.17902500554919243, | |
| "rewards/format_reward": 0.0, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.468, | |
| "grad_norm": 2.877927780151367, | |
| "kl": 1.779296875, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0712, | |
| "reward": -0.35267870873212814, | |
| "reward_std": 0.3029713034629822, | |
| "rewards/cosine_scaled_reward": -0.17633935809135437, | |
| "rewards/format_reward": 0.0, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.472, | |
| "grad_norm": 2.9547438621520996, | |
| "kl": 1.3583984375, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0542, | |
| "reward": -0.34842824190855026, | |
| "reward_std": 0.28041965141892433, | |
| "rewards/cosine_scaled_reward": -0.17421411722898483, | |
| "rewards/format_reward": 0.0, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.476, | |
| "grad_norm": 2.4998183250427246, | |
| "kl": 1.712890625, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0686, | |
| "reward": -0.34311509132385254, | |
| "reward_std": 0.3226206302642822, | |
| "rewards/cosine_scaled_reward": -0.17155754193663597, | |
| "rewards/format_reward": 0.0, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.48, | |
| "grad_norm": 3.5822997093200684, | |
| "kl": 1.2568359375, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0502, | |
| "reward": -0.31581661850214005, | |
| "reward_std": 0.27614113688468933, | |
| "rewards/cosine_scaled_reward": -0.15790832042694092, | |
| "rewards/format_reward": 0.0, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.484, | |
| "grad_norm": 2.638000965118408, | |
| "kl": 1.658203125, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0663, | |
| "reward": -0.3658217638731003, | |
| "reward_std": 0.3533295765519142, | |
| "rewards/cosine_scaled_reward": -0.18291086703538895, | |
| "rewards/format_reward": 0.0, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.488, | |
| "grad_norm": 2.4719886779785156, | |
| "kl": 1.470703125, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0588, | |
| "reward": -0.35377567261457443, | |
| "reward_std": 0.2872357815504074, | |
| "rewards/cosine_scaled_reward": -0.17688783630728722, | |
| "rewards/format_reward": 0.0, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.492, | |
| "grad_norm": 3.820688486099243, | |
| "kl": 1.65673828125, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0662, | |
| "reward": -0.3673105686903, | |
| "reward_std": 0.29224705323576927, | |
| "rewards/cosine_scaled_reward": -0.1836552768945694, | |
| "rewards/format_reward": 0.0, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.496, | |
| "grad_norm": 3.1416916847229004, | |
| "kl": 1.4990234375, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.06, | |
| "reward": -0.3574133738875389, | |
| "reward_std": 0.2663569226861, | |
| "rewards/cosine_scaled_reward": -0.17870669439435005, | |
| "rewards/format_reward": 0.0, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.5, | |
| "grad_norm": 2.3712515830993652, | |
| "kl": 1.900390625, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0761, | |
| "reward": -0.34536080807447433, | |
| "reward_std": 0.3063738942146301, | |
| "rewards/cosine_scaled_reward": -0.17268040403723717, | |
| "rewards/format_reward": 0.0, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.504, | |
| "grad_norm": 2.792006254196167, | |
| "kl": 1.71875, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0688, | |
| "reward": -0.3458981513977051, | |
| "reward_std": 0.3039686158299446, | |
| "rewards/cosine_scaled_reward": -0.17294907197356224, | |
| "rewards/format_reward": 0.0, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.508, | |
| "grad_norm": 2.985948085784912, | |
| "kl": 1.5625, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0625, | |
| "reward": -0.21606629202142358, | |
| "reward_std": 0.2749215438961983, | |
| "rewards/cosine_scaled_reward": -0.10803314973600209, | |
| "rewards/format_reward": 0.0, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1531.952392578125, | |
| "epoch": 1.512, | |
| "grad_norm": 2.396852970123291, | |
| "kl": 1.9921875, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0823, | |
| "reward": -0.38127752393484116, | |
| "reward_std": 0.32172612845897675, | |
| "rewards/cosine_scaled_reward": -0.19063876569271088, | |
| "rewards/format_reward": 0.0, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.516, | |
| "grad_norm": 2.503976345062256, | |
| "kl": 1.794921875, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0718, | |
| "reward": -0.3479606434702873, | |
| "reward_std": 0.29174239560961723, | |
| "rewards/cosine_scaled_reward": -0.17398031428456306, | |
| "rewards/format_reward": 0.0, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1531.8035888671875, | |
| "epoch": 1.52, | |
| "grad_norm": 3.344243049621582, | |
| "kl": 2.080078125, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.083, | |
| "reward": -0.38203170895576477, | |
| "reward_std": 0.3180833086371422, | |
| "rewards/cosine_scaled_reward": -0.19101585447788239, | |
| "rewards/format_reward": 0.0, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.524, | |
| "grad_norm": 3.5073604583740234, | |
| "kl": 2.095703125, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0837, | |
| "reward": -0.33683621138334274, | |
| "reward_std": 0.3141423165798187, | |
| "rewards/cosine_scaled_reward": -0.16841810569167137, | |
| "rewards/format_reward": 0.0, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.528, | |
| "grad_norm": 2.7634477615356445, | |
| "kl": 2.55859375, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.1022, | |
| "reward": -0.3983701467514038, | |
| "reward_std": 0.31766583025455475, | |
| "rewards/cosine_scaled_reward": -0.199185062199831, | |
| "rewards/format_reward": 0.0, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.532, | |
| "grad_norm": 3.1601033210754395, | |
| "kl": 1.486328125, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0594, | |
| "reward": -0.37120404094457626, | |
| "reward_std": 0.3172856420278549, | |
| "rewards/cosine_scaled_reward": -0.18560202419757843, | |
| "rewards/format_reward": 0.0, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.536, | |
| "grad_norm": 2.475311040878296, | |
| "kl": 2.01953125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0807, | |
| "reward": -0.3449181020259857, | |
| "reward_std": 0.3061336353421211, | |
| "rewards/cosine_scaled_reward": -0.17245905846357346, | |
| "rewards/format_reward": 0.0, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.54, | |
| "grad_norm": 3.9638140201568604, | |
| "kl": 1.6806640625, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0673, | |
| "reward": -0.3139965161681175, | |
| "reward_std": 0.303245909512043, | |
| "rewards/cosine_scaled_reward": -0.15699823945760727, | |
| "rewards/format_reward": 0.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.544, | |
| "grad_norm": 3.2407708168029785, | |
| "kl": 1.89453125, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0757, | |
| "reward": -0.3049175813794136, | |
| "reward_std": 0.30845751613378525, | |
| "rewards/cosine_scaled_reward": -0.1524587944149971, | |
| "rewards/format_reward": 0.0, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.548, | |
| "grad_norm": 3.1065189838409424, | |
| "kl": 1.75390625, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0701, | |
| "reward": -0.3369733840227127, | |
| "reward_std": 0.30179525911808014, | |
| "rewards/cosine_scaled_reward": -0.16848668828606606, | |
| "rewards/format_reward": 0.0, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.552, | |
| "grad_norm": 2.6867339611053467, | |
| "kl": 2.06640625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0826, | |
| "reward": -0.3339100852608681, | |
| "reward_std": 0.3043428584933281, | |
| "rewards/cosine_scaled_reward": -0.16695504263043404, | |
| "rewards/format_reward": 0.0, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.556, | |
| "grad_norm": 3.1580567359924316, | |
| "kl": 2.291015625, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0915, | |
| "reward": -0.3744669333100319, | |
| "reward_std": 0.3249610960483551, | |
| "rewards/cosine_scaled_reward": -0.18723345920443535, | |
| "rewards/format_reward": 0.0, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.56, | |
| "grad_norm": 5.407771587371826, | |
| "kl": 1.609375, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0645, | |
| "reward": -0.3420454412698746, | |
| "reward_std": 0.3148321136832237, | |
| "rewards/cosine_scaled_reward": -0.1710227131843567, | |
| "rewards/format_reward": 0.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.564, | |
| "grad_norm": 4.492737770080566, | |
| "kl": 2.275390625, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0909, | |
| "reward": -0.36313918232917786, | |
| "reward_std": 0.29535526037216187, | |
| "rewards/cosine_scaled_reward": -0.18156958371400833, | |
| "rewards/format_reward": 0.0, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.568, | |
| "grad_norm": 3.0125086307525635, | |
| "kl": 2.029296875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0812, | |
| "reward": -0.37769585102796555, | |
| "reward_std": 0.31776873767375946, | |
| "rewards/cosine_scaled_reward": -0.18884791806340218, | |
| "rewards/format_reward": 0.0, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.572, | |
| "grad_norm": 3.134265899658203, | |
| "kl": 2.47265625, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.099, | |
| "reward": -0.38678842037916183, | |
| "reward_std": 0.30557621270418167, | |
| "rewards/cosine_scaled_reward": -0.19339420646429062, | |
| "rewards/format_reward": 0.0, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.576, | |
| "grad_norm": 2.9398727416992188, | |
| "kl": 1.404296875, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0562, | |
| "reward": -0.3609785735607147, | |
| "reward_std": 0.29732464998960495, | |
| "rewards/cosine_scaled_reward": -0.18048929050564766, | |
| "rewards/format_reward": 0.0, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.58, | |
| "grad_norm": 2.3901424407958984, | |
| "kl": 2.3291015625, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.093, | |
| "reward": -0.38430536538362503, | |
| "reward_std": 0.32753758877515793, | |
| "rewards/cosine_scaled_reward": -0.19215268269181252, | |
| "rewards/format_reward": 0.0, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1526.9702453613281, | |
| "epoch": 1.584, | |
| "grad_norm": 3.9775447845458984, | |
| "kl": 2.06640625, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0914, | |
| "reward": -0.33116257190704346, | |
| "reward_std": 0.2928163409233093, | |
| "rewards/cosine_scaled_reward": -0.16558128595352173, | |
| "rewards/format_reward": 0.0, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.588, | |
| "grad_norm": 2.9975955486297607, | |
| "kl": 2.318359375, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0928, | |
| "reward": -0.3710367754101753, | |
| "reward_std": 0.3226532116532326, | |
| "rewards/cosine_scaled_reward": -0.18551838770508766, | |
| "rewards/format_reward": 0.0, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1530.6845397949219, | |
| "epoch": 1.592, | |
| "grad_norm": 3.739922046661377, | |
| "kl": 2.025390625, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0747, | |
| "reward": -0.3954162746667862, | |
| "reward_std": 0.3323783427476883, | |
| "rewards/cosine_scaled_reward": -0.1977081410586834, | |
| "rewards/format_reward": 0.0, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.596, | |
| "grad_norm": 2.7063024044036865, | |
| "kl": 1.0927734375, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0437, | |
| "reward": -0.3006215952336788, | |
| "reward_std": 0.27692657709121704, | |
| "rewards/cosine_scaled_reward": -0.15031079947948456, | |
| "rewards/format_reward": 0.0, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6, | |
| "grad_norm": 2.469496726989746, | |
| "kl": 1.732421875, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0693, | |
| "reward": -0.36928267031908035, | |
| "reward_std": 0.30984392017126083, | |
| "rewards/cosine_scaled_reward": -0.18464133515954018, | |
| "rewards/format_reward": 0.0, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1522.3095397949219, | |
| "epoch": 1.604, | |
| "grad_norm": 2.855372190475464, | |
| "kl": 1.845703125, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0636, | |
| "reward": -0.38443852961063385, | |
| "reward_std": 0.28470365703105927, | |
| "rewards/cosine_scaled_reward": -0.19221926480531693, | |
| "rewards/format_reward": 0.0, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.608, | |
| "grad_norm": 3.3847217559814453, | |
| "kl": 2.0390625, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0814, | |
| "reward": -0.3252910152077675, | |
| "reward_std": 0.2982725724577904, | |
| "rewards/cosine_scaled_reward": -0.16264550015330315, | |
| "rewards/format_reward": 0.0, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.612, | |
| "grad_norm": 3.0226523876190186, | |
| "kl": 1.81640625, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0727, | |
| "reward": -0.3527565225958824, | |
| "reward_std": 0.30437447875738144, | |
| "rewards/cosine_scaled_reward": -0.1763782650232315, | |
| "rewards/format_reward": 0.0, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.616, | |
| "grad_norm": 2.866734743118286, | |
| "kl": 1.7890625, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0716, | |
| "reward": -0.3746185079216957, | |
| "reward_std": 0.3078552633523941, | |
| "rewards/cosine_scaled_reward": -0.18730924278497696, | |
| "rewards/format_reward": 0.0, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.62, | |
| "grad_norm": 3.9170870780944824, | |
| "kl": 1.970703125, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0789, | |
| "reward": -0.41533301770687103, | |
| "reward_std": 0.3027655556797981, | |
| "rewards/cosine_scaled_reward": -0.20766650885343552, | |
| "rewards/format_reward": 0.0, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.624, | |
| "grad_norm": 3.470655679702759, | |
| "kl": 1.845703125, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0738, | |
| "reward": -0.3191938251256943, | |
| "reward_std": 0.28303690254688263, | |
| "rewards/cosine_scaled_reward": -0.15959692373871803, | |
| "rewards/format_reward": 0.0, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6280000000000001, | |
| "grad_norm": 3.623340368270874, | |
| "kl": 1.31640625, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0526, | |
| "reward": -0.3123548626899719, | |
| "reward_std": 0.29499682784080505, | |
| "rewards/cosine_scaled_reward": -0.15617743134498596, | |
| "rewards/format_reward": 0.0, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 2.282514810562134, | |
| "kl": 1.267578125, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0507, | |
| "reward": -0.39642050117254257, | |
| "reward_std": 0.311983872205019, | |
| "rewards/cosine_scaled_reward": -0.19821025803685188, | |
| "rewards/format_reward": 0.0, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6360000000000001, | |
| "grad_norm": 2.5232083797454834, | |
| "kl": 1.681640625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0672, | |
| "reward": -0.33888739347457886, | |
| "reward_std": 0.28087718039751053, | |
| "rewards/cosine_scaled_reward": -0.16944369673728943, | |
| "rewards/format_reward": 0.0, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 3.886439085006714, | |
| "kl": 2.09765625, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0838, | |
| "reward": -0.38627707213163376, | |
| "reward_std": 0.33190976083278656, | |
| "rewards/cosine_scaled_reward": -0.19313853234052658, | |
| "rewards/format_reward": 0.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6440000000000001, | |
| "grad_norm": 3.090627670288086, | |
| "kl": 2.140625, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0857, | |
| "reward": -0.3793156296014786, | |
| "reward_std": 0.30717378109693527, | |
| "rewards/cosine_scaled_reward": -0.1896577998995781, | |
| "rewards/format_reward": 0.0, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 3.867506980895996, | |
| "kl": 1.880859375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0753, | |
| "reward": -0.3565782457590103, | |
| "reward_std": 0.3352038711309433, | |
| "rewards/cosine_scaled_reward": -0.17828912287950516, | |
| "rewards/format_reward": 0.0, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6520000000000001, | |
| "grad_norm": 2.388094902038574, | |
| "kl": 1.751953125, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0701, | |
| "reward": -0.3393707424402237, | |
| "reward_std": 0.3029238283634186, | |
| "rewards/cosine_scaled_reward": -0.16968537122011185, | |
| "rewards/format_reward": 0.0, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 2.5263466835021973, | |
| "kl": 1.748046875, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0698, | |
| "reward": -0.4274343103170395, | |
| "reward_std": 0.3449402078986168, | |
| "rewards/cosine_scaled_reward": -0.21371715888381004, | |
| "rewards/format_reward": 0.0, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6600000000000001, | |
| "grad_norm": 2.3268003463745117, | |
| "kl": 1.400390625, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0559, | |
| "reward": -0.3480057269334793, | |
| "reward_std": 0.29953421652317047, | |
| "rewards/cosine_scaled_reward": -0.17400285601615906, | |
| "rewards/format_reward": 0.0, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 3.2503533363342285, | |
| "kl": 1.9140625, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0767, | |
| "reward": -0.36937638372182846, | |
| "reward_std": 0.31766701489686966, | |
| "rewards/cosine_scaled_reward": -0.18468819558620453, | |
| "rewards/format_reward": 0.0, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6680000000000001, | |
| "grad_norm": 2.9895646572113037, | |
| "kl": 2.1796875, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0871, | |
| "reward": -0.3985458239912987, | |
| "reward_std": 0.33385203033685684, | |
| "rewards/cosine_scaled_reward": -0.19927291199564934, | |
| "rewards/format_reward": 0.0, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 3.2457692623138428, | |
| "kl": 1.71875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0687, | |
| "reward": -0.2603262776392512, | |
| "reward_std": 0.3040950074791908, | |
| "rewards/cosine_scaled_reward": -0.13016314181732014, | |
| "rewards/format_reward": 0.0, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6760000000000002, | |
| "grad_norm": 2.8391411304473877, | |
| "kl": 1.798828125, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.072, | |
| "reward": -0.2663672436028719, | |
| "reward_std": 0.29912005364894867, | |
| "rewards/cosine_scaled_reward": -0.13318362249992788, | |
| "rewards/format_reward": 0.0, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 3.1057238578796387, | |
| "kl": 1.5, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.06, | |
| "reward": -0.34882377088069916, | |
| "reward_std": 0.3601520508527756, | |
| "rewards/cosine_scaled_reward": -0.17441189289093018, | |
| "rewards/format_reward": 0.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.6840000000000002, | |
| "grad_norm": 2.243816375732422, | |
| "kl": 1.541015625, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0616, | |
| "reward": -0.3832622766494751, | |
| "reward_std": 0.3413049802184105, | |
| "rewards/cosine_scaled_reward": -0.19163113832473755, | |
| "rewards/format_reward": 0.0, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.688, | |
| "grad_norm": 3.76218581199646, | |
| "kl": 1.880859375, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0752, | |
| "reward": -0.3700753226876259, | |
| "reward_std": 0.31324099004268646, | |
| "rewards/cosine_scaled_reward": -0.18503766134381294, | |
| "rewards/format_reward": 0.0, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.692, | |
| "grad_norm": 4.034151554107666, | |
| "kl": 1.70703125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0682, | |
| "reward": -0.29791881144046783, | |
| "reward_std": 0.2801155336201191, | |
| "rewards/cosine_scaled_reward": -0.14895940944552422, | |
| "rewards/format_reward": 0.0, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.696, | |
| "grad_norm": 3.041618824005127, | |
| "kl": 1.81640625, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0725, | |
| "reward": -0.32316526770591736, | |
| "reward_std": 0.2970619350671768, | |
| "rewards/cosine_scaled_reward": -0.16158264502882957, | |
| "rewards/format_reward": 0.0, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.7, | |
| "grad_norm": 4.081668376922607, | |
| "kl": 1.4453125, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0576, | |
| "reward": -0.3476375713944435, | |
| "reward_std": 0.294509120285511, | |
| "rewards/cosine_scaled_reward": -0.17381878197193146, | |
| "rewards/format_reward": 0.0, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.704, | |
| "grad_norm": 3.166949510574341, | |
| "kl": 2.1015625, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0841, | |
| "reward": -0.3467593193054199, | |
| "reward_std": 0.30388573557138443, | |
| "rewards/cosine_scaled_reward": -0.17337966337800026, | |
| "rewards/format_reward": 0.0, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.708, | |
| "grad_norm": 4.211978435516357, | |
| "kl": 1.763671875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0705, | |
| "reward": -0.3505774810910225, | |
| "reward_std": 0.30420946329832077, | |
| "rewards/cosine_scaled_reward": -0.17528874799609184, | |
| "rewards/format_reward": 0.0, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.712, | |
| "grad_norm": 4.166502952575684, | |
| "kl": 2.158203125, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0863, | |
| "reward": -0.361857570707798, | |
| "reward_std": 0.30119316279888153, | |
| "rewards/cosine_scaled_reward": -0.1809287928044796, | |
| "rewards/format_reward": 0.0, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.716, | |
| "grad_norm": 2.8889896869659424, | |
| "kl": 1.8671875, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0745, | |
| "reward": -0.32126056402921677, | |
| "reward_std": 0.27691005170345306, | |
| "rewards/cosine_scaled_reward": -0.16063029691576958, | |
| "rewards/format_reward": 0.0, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.72, | |
| "grad_norm": 3.3025801181793213, | |
| "kl": 1.904296875, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0761, | |
| "reward": -0.36847078800201416, | |
| "reward_std": 0.3445659205317497, | |
| "rewards/cosine_scaled_reward": -0.18423539400100708, | |
| "rewards/format_reward": 0.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.724, | |
| "grad_norm": 3.0440969467163086, | |
| "kl": 1.75, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.07, | |
| "reward": -0.36113734543323517, | |
| "reward_std": 0.3412683606147766, | |
| "rewards/cosine_scaled_reward": -0.18056866899132729, | |
| "rewards/format_reward": 0.0, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1530.952392578125, | |
| "epoch": 1.728, | |
| "grad_norm": 2.575627326965332, | |
| "kl": 1.689453125, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0716, | |
| "reward": -0.3095761463046074, | |
| "reward_std": 0.32323335483670235, | |
| "rewards/cosine_scaled_reward": -0.1547880806028843, | |
| "rewards/format_reward": 0.0, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.732, | |
| "grad_norm": 3.186289072036743, | |
| "kl": 1.91015625, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0765, | |
| "reward": -0.39015311002731323, | |
| "reward_std": 0.3067055642604828, | |
| "rewards/cosine_scaled_reward": -0.19507654383778572, | |
| "rewards/format_reward": 0.0, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.736, | |
| "grad_norm": 3.0739073753356934, | |
| "kl": 2.369140625, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0946, | |
| "reward": -0.29091550246812403, | |
| "reward_std": 0.30687109380960464, | |
| "rewards/cosine_scaled_reward": -0.14545774972066283, | |
| "rewards/format_reward": 0.0, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.74, | |
| "grad_norm": 5.0029778480529785, | |
| "kl": 1.759765625, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0703, | |
| "reward": -0.34431006759405136, | |
| "reward_std": 0.27501973509788513, | |
| "rewards/cosine_scaled_reward": -0.17215503007173538, | |
| "rewards/format_reward": 0.0, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.744, | |
| "grad_norm": 5.139548301696777, | |
| "kl": 1.8203125, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0728, | |
| "reward": -0.31318235397338867, | |
| "reward_std": 0.2976163923740387, | |
| "rewards/cosine_scaled_reward": -0.15659117698669434, | |
| "rewards/format_reward": 0.0, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.748, | |
| "grad_norm": 2.881143808364868, | |
| "kl": 1.626953125, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.065, | |
| "reward": -0.3547092080116272, | |
| "reward_std": 0.28170817345380783, | |
| "rewards/cosine_scaled_reward": -0.1773546040058136, | |
| "rewards/format_reward": 0.0, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.752, | |
| "grad_norm": 2.4268362522125244, | |
| "kl": 1.9609375, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0786, | |
| "reward": -0.3464732989668846, | |
| "reward_std": 0.3199189677834511, | |
| "rewards/cosine_scaled_reward": -0.173236645758152, | |
| "rewards/format_reward": 0.0, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.756, | |
| "grad_norm": 2.686417579650879, | |
| "kl": 2.318359375, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0926, | |
| "reward": -0.3557046577334404, | |
| "reward_std": 0.3187018297612667, | |
| "rewards/cosine_scaled_reward": -0.1778523214161396, | |
| "rewards/format_reward": 0.0, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.76, | |
| "grad_norm": 4.9666876792907715, | |
| "kl": 1.4619140625, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0584, | |
| "reward": -0.3234737552702427, | |
| "reward_std": 0.32776766270399094, | |
| "rewards/cosine_scaled_reward": -0.16173688508570194, | |
| "rewards/format_reward": 0.0, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.764, | |
| "grad_norm": 4.106746196746826, | |
| "kl": 2.49609375, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0999, | |
| "reward": -0.4192545562982559, | |
| "reward_std": 0.33375757187604904, | |
| "rewards/cosine_scaled_reward": -0.20962728559970856, | |
| "rewards/format_reward": 0.0, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1534.8690490722656, | |
| "epoch": 1.768, | |
| "grad_norm": 2.842816114425659, | |
| "kl": 2.2578125, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0898, | |
| "reward": -0.3652210012078285, | |
| "reward_std": 0.3345082625746727, | |
| "rewards/cosine_scaled_reward": -0.18261050805449486, | |
| "rewards/format_reward": 0.0, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.772, | |
| "grad_norm": 3.186333179473877, | |
| "kl": 2.375, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0947, | |
| "reward": -0.40324729681015015, | |
| "reward_std": 0.32466883957386017, | |
| "rewards/cosine_scaled_reward": -0.20162366330623627, | |
| "rewards/format_reward": 0.0, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.776, | |
| "grad_norm": 4.4096360206604, | |
| "kl": 2.99609375, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.1197, | |
| "reward": -0.3327697291970253, | |
| "reward_std": 0.3282741829752922, | |
| "rewards/cosine_scaled_reward": -0.16638486459851265, | |
| "rewards/format_reward": 0.0, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.78, | |
| "grad_norm": 2.8214669227600098, | |
| "kl": 1.8623046875, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0746, | |
| "reward": -0.39069636911153793, | |
| "reward_std": 0.33478184044361115, | |
| "rewards/cosine_scaled_reward": -0.19534818828105927, | |
| "rewards/format_reward": 0.0, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.784, | |
| "grad_norm": 2.96333646774292, | |
| "kl": 1.828125, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.073, | |
| "reward": -0.34233053401112556, | |
| "reward_std": 0.30314670503139496, | |
| "rewards/cosine_scaled_reward": -0.17116525955498219, | |
| "rewards/format_reward": 0.0, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.788, | |
| "grad_norm": 2.538837432861328, | |
| "kl": 1.615234375, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0647, | |
| "reward": -0.2668099580332637, | |
| "reward_std": 0.3087245300412178, | |
| "rewards/cosine_scaled_reward": -0.1334049835568294, | |
| "rewards/format_reward": 0.0, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.792, | |
| "grad_norm": 6.922802925109863, | |
| "kl": 1.9208984375, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0768, | |
| "reward": -0.3690221831202507, | |
| "reward_std": 0.3130299560725689, | |
| "rewards/cosine_scaled_reward": -0.18451109528541565, | |
| "rewards/format_reward": 0.0, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.796, | |
| "grad_norm": 3.2286629676818848, | |
| "kl": 1.990234375, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0795, | |
| "reward": -0.32342398166656494, | |
| "reward_std": 0.3065089136362076, | |
| "rewards/cosine_scaled_reward": -0.16171199083328247, | |
| "rewards/format_reward": 0.0, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8, | |
| "grad_norm": 3.7653493881225586, | |
| "kl": 1.904296875, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0763, | |
| "reward": -0.4029879495501518, | |
| "reward_std": 0.31490693986415863, | |
| "rewards/cosine_scaled_reward": -0.2014939747750759, | |
| "rewards/format_reward": 0.0, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.804, | |
| "grad_norm": 3.4150803089141846, | |
| "kl": 2.150390625, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.086, | |
| "reward": -0.3429009020328522, | |
| "reward_std": 0.29108157753944397, | |
| "rewards/cosine_scaled_reward": -0.1714504510164261, | |
| "rewards/format_reward": 0.0, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.808, | |
| "grad_norm": 4.145492076873779, | |
| "kl": 2.2421875, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0897, | |
| "reward": -0.42198269814252853, | |
| "reward_std": 0.3363164961338043, | |
| "rewards/cosine_scaled_reward": -0.21099134907126427, | |
| "rewards/format_reward": 0.0, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.812, | |
| "grad_norm": 4.779297351837158, | |
| "kl": 2.228515625, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0891, | |
| "reward": -0.3492959663271904, | |
| "reward_std": 0.2949202358722687, | |
| "rewards/cosine_scaled_reward": -0.1746479757130146, | |
| "rewards/format_reward": 0.0, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 2.905301570892334, | |
| "kl": 1.265625, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0506, | |
| "reward": -0.2935212664306164, | |
| "reward_std": 0.26374514773488045, | |
| "rewards/cosine_scaled_reward": -0.1467606294900179, | |
| "rewards/format_reward": 0.0, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8199999999999998, | |
| "grad_norm": 2.7079851627349854, | |
| "kl": 2.1337890625, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0853, | |
| "reward": -0.3475092798471451, | |
| "reward_std": 0.30007384717464447, | |
| "rewards/cosine_scaled_reward": -0.17375463247299194, | |
| "rewards/format_reward": 0.0, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 2.6113271713256836, | |
| "kl": 1.6376953125, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0654, | |
| "reward": -0.330677293241024, | |
| "reward_std": 0.3133997842669487, | |
| "rewards/cosine_scaled_reward": -0.1653386428952217, | |
| "rewards/format_reward": 0.0, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8279999999999998, | |
| "grad_norm": 2.7393922805786133, | |
| "kl": 1.666015625, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0668, | |
| "reward": -0.3434924744069576, | |
| "reward_std": 0.3196050524711609, | |
| "rewards/cosine_scaled_reward": -0.17174622975289822, | |
| "rewards/format_reward": 0.0, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 4.49023962020874, | |
| "kl": 2.34375, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0938, | |
| "reward": -0.34944383054971695, | |
| "reward_std": 0.3238733857870102, | |
| "rewards/cosine_scaled_reward": -0.17472190782427788, | |
| "rewards/format_reward": 0.0, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8359999999999999, | |
| "grad_norm": 2.3561832904815674, | |
| "kl": 1.4501953125, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0579, | |
| "reward": -0.3565739244222641, | |
| "reward_std": 0.3099294453859329, | |
| "rewards/cosine_scaled_reward": -0.17828696221113205, | |
| "rewards/format_reward": 0.0, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 3.1239490509033203, | |
| "kl": 1.8984375, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0759, | |
| "reward": -0.3478566035628319, | |
| "reward_std": 0.28794750943779945, | |
| "rewards/cosine_scaled_reward": -0.17392829060554504, | |
| "rewards/format_reward": 0.0, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8439999999999999, | |
| "grad_norm": 2.673818826675415, | |
| "kl": 1.740234375, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0695, | |
| "reward": -0.339593730866909, | |
| "reward_std": 0.3045819625258446, | |
| "rewards/cosine_scaled_reward": -0.1697968691587448, | |
| "rewards/format_reward": 0.0, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 3.220402479171753, | |
| "kl": 1.626953125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0651, | |
| "reward": -0.36886321753263474, | |
| "reward_std": 0.26985886320471764, | |
| "rewards/cosine_scaled_reward": -0.18443159759044647, | |
| "rewards/format_reward": 0.0, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1530.4642944335938, | |
| "epoch": 1.8519999999999999, | |
| "grad_norm": 2.8002877235412598, | |
| "kl": 2.23828125, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0941, | |
| "reward": -0.31243710219860077, | |
| "reward_std": 0.3104839473962784, | |
| "rewards/cosine_scaled_reward": -0.15621854737401009, | |
| "rewards/format_reward": 0.0, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 3.3076934814453125, | |
| "kl": 2.455078125, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.098, | |
| "reward": -0.3408735916018486, | |
| "reward_std": 0.3259742558002472, | |
| "rewards/cosine_scaled_reward": -0.1704367958009243, | |
| "rewards/format_reward": 0.0, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8599999999999999, | |
| "grad_norm": 4.302088737487793, | |
| "kl": 2.005859375, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0802, | |
| "reward": -0.3594956621527672, | |
| "reward_std": 0.32260415703058243, | |
| "rewards/cosine_scaled_reward": -0.1797478273510933, | |
| "rewards/format_reward": 0.0, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 4.171574115753174, | |
| "kl": 2.490234375, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0995, | |
| "reward": -0.3996199369430542, | |
| "reward_std": 0.30815524607896805, | |
| "rewards/cosine_scaled_reward": -0.1998099721968174, | |
| "rewards/format_reward": 0.0, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8679999999999999, | |
| "grad_norm": 3.7009289264678955, | |
| "kl": 1.841796875, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0738, | |
| "reward": -0.3371664360165596, | |
| "reward_std": 0.3329595774412155, | |
| "rewards/cosine_scaled_reward": -0.1685832180082798, | |
| "rewards/format_reward": 0.0, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 2.592533826828003, | |
| "kl": 2.251953125, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0901, | |
| "reward": -0.34844203293323517, | |
| "reward_std": 0.322611540555954, | |
| "rewards/cosine_scaled_reward": -0.1742210052907467, | |
| "rewards/format_reward": 0.0, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.876, | |
| "grad_norm": 4.633761405944824, | |
| "kl": 1.64013671875, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0656, | |
| "reward": -0.3193807154893875, | |
| "reward_std": 0.26448768377304077, | |
| "rewards/cosine_scaled_reward": -0.15969035774469376, | |
| "rewards/format_reward": 0.0, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.88, | |
| "grad_norm": 3.101719617843628, | |
| "kl": 2.033203125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0812, | |
| "reward": -0.3662792518734932, | |
| "reward_std": 0.32248761504888535, | |
| "rewards/cosine_scaled_reward": -0.1831396110355854, | |
| "rewards/format_reward": 0.0, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.884, | |
| "grad_norm": 2.580354690551758, | |
| "kl": 1.607421875, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0643, | |
| "reward": -0.34900667518377304, | |
| "reward_std": 0.31430666893720627, | |
| "rewards/cosine_scaled_reward": -0.17450333759188652, | |
| "rewards/format_reward": 0.0, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.888, | |
| "grad_norm": 2.7384796142578125, | |
| "kl": 1.8046875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0722, | |
| "reward": -0.32778534665703773, | |
| "reward_std": 0.3321828171610832, | |
| "rewards/cosine_scaled_reward": -0.16389267705380917, | |
| "rewards/format_reward": 0.0, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.892, | |
| "grad_norm": 3.759181499481201, | |
| "kl": 2.017578125, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0807, | |
| "reward": -0.32047825306653976, | |
| "reward_std": 0.28816820681095123, | |
| "rewards/cosine_scaled_reward": -0.16023912653326988, | |
| "rewards/format_reward": 0.0, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.896, | |
| "grad_norm": 2.8909876346588135, | |
| "kl": 1.603515625, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0642, | |
| "reward": -0.3390325605869293, | |
| "reward_std": 0.3011201545596123, | |
| "rewards/cosine_scaled_reward": -0.16951627284288406, | |
| "rewards/format_reward": 0.0, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.9, | |
| "grad_norm": 2.3281497955322266, | |
| "kl": 2.0234375, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.081, | |
| "reward": -0.36704741418361664, | |
| "reward_std": 0.3062589168548584, | |
| "rewards/cosine_scaled_reward": -0.18352371081709862, | |
| "rewards/format_reward": 0.0, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.904, | |
| "grad_norm": 3.56882643699646, | |
| "kl": 2.515625, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.1008, | |
| "reward": -0.39511261135339737, | |
| "reward_std": 0.3128170743584633, | |
| "rewards/cosine_scaled_reward": -0.19755630940198898, | |
| "rewards/format_reward": 0.0, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.908, | |
| "grad_norm": 2.958406925201416, | |
| "kl": 1.755859375, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0702, | |
| "reward": -0.3462023660540581, | |
| "reward_std": 0.322578527033329, | |
| "rewards/cosine_scaled_reward": -0.17310118675231934, | |
| "rewards/format_reward": 0.0, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.912, | |
| "grad_norm": 3.044797897338867, | |
| "kl": 2.375, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.095, | |
| "reward": -0.3144143670797348, | |
| "reward_std": 0.29915551096200943, | |
| "rewards/cosine_scaled_reward": -0.1572071835398674, | |
| "rewards/format_reward": 0.0, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.916, | |
| "grad_norm": 4.031872272491455, | |
| "kl": 2.640625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.1057, | |
| "reward": -0.3763216808438301, | |
| "reward_std": 0.3211255893111229, | |
| "rewards/cosine_scaled_reward": -0.18816084042191505, | |
| "rewards/format_reward": 0.0, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.92, | |
| "grad_norm": 2.3054392337799072, | |
| "kl": 1.3173828125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0528, | |
| "reward": -0.2678487957455218, | |
| "reward_std": 0.2627658285200596, | |
| "rewards/cosine_scaled_reward": -0.13392440509051085, | |
| "rewards/format_reward": 0.0, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.924, | |
| "grad_norm": 3.41572642326355, | |
| "kl": 1.353515625, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0541, | |
| "reward": -0.35157452523708344, | |
| "reward_std": 0.3239835053682327, | |
| "rewards/cosine_scaled_reward": -0.17578726634383202, | |
| "rewards/format_reward": 0.0, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.928, | |
| "grad_norm": 2.691436290740967, | |
| "kl": 2.041015625, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0816, | |
| "reward": -0.39503272622823715, | |
| "reward_std": 0.3050593361258507, | |
| "rewards/cosine_scaled_reward": -0.19751636311411858, | |
| "rewards/format_reward": 0.0, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.932, | |
| "grad_norm": 2.859536647796631, | |
| "kl": 1.494140625, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0599, | |
| "reward": -0.395970955491066, | |
| "reward_std": 0.27583859115839005, | |
| "rewards/cosine_scaled_reward": -0.197985477745533, | |
| "rewards/format_reward": 0.0, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.936, | |
| "grad_norm": 2.9280340671539307, | |
| "kl": 1.765625, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0708, | |
| "reward": -0.2913724035024643, | |
| "reward_std": 0.2617946192622185, | |
| "rewards/cosine_scaled_reward": -0.14568619430065155, | |
| "rewards/format_reward": 0.0, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.94, | |
| "grad_norm": 2.2830445766448975, | |
| "kl": 1.2158203125, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0487, | |
| "reward": -0.3095552623271942, | |
| "reward_std": 0.292842835187912, | |
| "rewards/cosine_scaled_reward": -0.1547776274383068, | |
| "rewards/format_reward": 0.0, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.944, | |
| "grad_norm": 3.141052007675171, | |
| "kl": 1.3427734375, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0537, | |
| "reward": -0.32299425452947617, | |
| "reward_std": 0.29863065481185913, | |
| "rewards/cosine_scaled_reward": -0.16149712353944778, | |
| "rewards/format_reward": 0.0, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.948, | |
| "grad_norm": 3.97387433052063, | |
| "kl": 1.931640625, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0773, | |
| "reward": -0.3765959292650223, | |
| "reward_std": 0.3192542716860771, | |
| "rewards/cosine_scaled_reward": -0.18829796463251114, | |
| "rewards/format_reward": 0.0, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.952, | |
| "grad_norm": 2.656202554702759, | |
| "kl": 1.578125, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0631, | |
| "reward": -0.31205643340945244, | |
| "reward_std": 0.31670553237199783, | |
| "rewards/cosine_scaled_reward": -0.15602822043001652, | |
| "rewards/format_reward": 0.0, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.956, | |
| "grad_norm": 3.296848773956299, | |
| "kl": 1.16796875, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0468, | |
| "reward": -0.3039631359279156, | |
| "reward_std": 0.27847766503691673, | |
| "rewards/cosine_scaled_reward": -0.15198157727718353, | |
| "rewards/format_reward": 0.0, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.96, | |
| "grad_norm": 4.522839546203613, | |
| "kl": 1.8203125, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0728, | |
| "reward": -0.34008362144231796, | |
| "reward_std": 0.29262910783290863, | |
| "rewards/cosine_scaled_reward": -0.17004182189702988, | |
| "rewards/format_reward": 0.0, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.964, | |
| "grad_norm": 2.311014175415039, | |
| "kl": 2.244140625, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0898, | |
| "reward": -0.34849604219198227, | |
| "reward_std": 0.3044138178229332, | |
| "rewards/cosine_scaled_reward": -0.17424802854657173, | |
| "rewards/format_reward": 0.0, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.968, | |
| "grad_norm": 2.6442465782165527, | |
| "kl": 1.998046875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0799, | |
| "reward": -0.34308964014053345, | |
| "reward_std": 0.3727850690484047, | |
| "rewards/cosine_scaled_reward": -0.17154482379555702, | |
| "rewards/format_reward": 0.0, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.972, | |
| "grad_norm": 2.6985509395599365, | |
| "kl": 1.41796875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0567, | |
| "reward": -0.3521110415458679, | |
| "reward_std": 0.30227896198630333, | |
| "rewards/cosine_scaled_reward": -0.17605552449822426, | |
| "rewards/format_reward": 0.0, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.976, | |
| "grad_norm": 3.240550994873047, | |
| "kl": 1.904296875, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.076, | |
| "reward": -0.3422994837164879, | |
| "reward_std": 0.3251089081168175, | |
| "rewards/cosine_scaled_reward": -0.17114974185824394, | |
| "rewards/format_reward": 0.0, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.98, | |
| "grad_norm": 4.803572177886963, | |
| "kl": 3.177734375, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.1272, | |
| "reward": -0.3737839311361313, | |
| "reward_std": 0.3232840970158577, | |
| "rewards/cosine_scaled_reward": -0.18689196929335594, | |
| "rewards/format_reward": 0.0, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.984, | |
| "grad_norm": 2.532582998275757, | |
| "kl": 1.9375, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0776, | |
| "reward": -0.374487929046154, | |
| "reward_std": 0.32537975162267685, | |
| "rewards/cosine_scaled_reward": -0.187243964523077, | |
| "rewards/format_reward": 0.0, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.988, | |
| "grad_norm": 2.6129701137542725, | |
| "kl": 2.2734375, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.091, | |
| "reward": -0.3434004709124565, | |
| "reward_std": 0.32708871364593506, | |
| "rewards/cosine_scaled_reward": -0.17170023545622826, | |
| "rewards/format_reward": 0.0, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.992, | |
| "grad_norm": 4.10455322265625, | |
| "kl": 1.595703125, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0638, | |
| "reward": -0.3211556486785412, | |
| "reward_std": 0.2905324958264828, | |
| "rewards/cosine_scaled_reward": -0.1605778243392706, | |
| "rewards/format_reward": 0.0, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0, | |
| "epoch": 1.996, | |
| "grad_norm": 2.7520267963409424, | |
| "kl": 1.892578125, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0756, | |
| "reward": -0.35309676826000214, | |
| "reward_std": 0.31401190161705017, | |
| "rewards/cosine_scaled_reward": -0.17654838413000107, | |
| "rewards/format_reward": 0.0, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1536.0001220703125, | |
| "epoch": 2.0, | |
| "grad_norm": 2.9658398628234863, | |
| "kl": 1.7763671875, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0711, | |
| "reward": -0.343311108648777, | |
| "reward_std": 0.28952478244900703, | |
| "rewards/cosine_scaled_reward": -0.1716555580496788, | |
| "rewards/format_reward": 0.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.05846181693652478, | |
| "train_runtime": 107214.2293, | |
| "train_samples_per_second": 0.783, | |
| "train_steps_per_second": 0.005 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |