{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5154639175257731, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 143.8359375, "epoch": 0.000859106529209622, "grad_norm": 3.728695289308407, "kl": 0.0, "learning_rate": 9.995704467353951e-07, "loss": -0.0, "reward": 1.225318193435669, "reward_std": 0.5892010927200317, "rewards/format_reward_gen": 0.609375, "rewards/llm_reward": 0.6159431040287018, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 146.78125, "epoch": 0.001718213058419244, "grad_norm": 2.510726404973143, "kl": 0.0007686614990234375, "learning_rate": 9.991408934707903e-07, "loss": 0.0, "reward": 1.148492693901062, "reward_std": 0.5800909698009491, "rewards/format_reward_gen": 0.5390625, "rewards/llm_reward": 0.6094301640987396, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 127.578125, "epoch": 0.002577319587628866, "grad_norm": 5.237708678465742, "kl": 0.001880645751953125, "learning_rate": 9.987113402061855e-07, "loss": 0.0001, "reward": 1.2333476543426514, "reward_std": 0.5131954550743103, "rewards/format_reward_gen": 0.7109375, "rewards/llm_reward": 0.5224101841449738, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 140.265625, "epoch": 0.003436426116838488, "grad_norm": 4.430606101451457, "kl": 0.0068817138671875, "learning_rate": 9.982817869415807e-07, "loss": 0.0003, "reward": 1.3201585412025452, "reward_std": 0.5908865928649902, "rewards/format_reward_gen": 0.6640625, "rewards/llm_reward": 0.6560960710048676, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 140.34375, "epoch": 0.00429553264604811, "grad_norm": 2.966410650688808, "kl": 0.00145721435546875, "learning_rate": 9.97852233676976e-07, "loss": 0.0001, "reward": 1.5005085468292236, "reward_std": 0.446560800075531, "rewards/format_reward_gen": 0.8046875, "rewards/llm_reward": 0.6958210468292236, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 149.0234375, "epoch": 0.005154639175257732, "grad_norm": 18.73054467782952, "kl": 0.00228118896484375, "learning_rate": 9.97422680412371e-07, "loss": 0.0001, "reward": 1.379498541355133, "reward_std": 0.3884614408016205, "rewards/format_reward_gen": 0.796875, "rewards/llm_reward": 0.5826235115528107, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 130.1796875, "epoch": 0.006013745704467354, "grad_norm": 2.1996526299389556, "kl": 0.00290679931640625, "learning_rate": 9.969931271477663e-07, "loss": 0.0001, "reward": 1.4183476567268372, "reward_std": 0.46953827142715454, "rewards/format_reward_gen": 0.828125, "rewards/llm_reward": 0.5902226716279984, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 159.2265625, "epoch": 0.006872852233676976, "grad_norm": 1.5763458045998808, "kl": 0.00386810302734375, "learning_rate": 9.965635738831615e-07, "loss": 0.0002, "reward": 1.4628726840019226, "reward_std": 0.3434259593486786, "rewards/format_reward_gen": 0.859375, "rewards/llm_reward": 0.6034976243972778, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 148.8203125, "epoch": 0.007731958762886598, "grad_norm": 1.8925562854231095, "kl": 0.003387451171875, "learning_rate": 9.961340206185566e-07, "loss": 0.0001, "reward": 1.466668725013733, "reward_std": 0.3619833141565323, "rewards/format_reward_gen": 0.828125, "rewards/llm_reward": 0.6385437846183777, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 128.6640625, "epoch": 0.00859106529209622, "grad_norm": 2.30348378416368, "kl": 0.008270263671875, "learning_rate": 9.957044673539518e-07, "loss": 0.0003, "reward": 1.4460369348526, "reward_std": 0.2754833847284317, "rewards/format_reward_gen": 0.8828125, "rewards/llm_reward": 0.5632244646549225, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 152.671875, "epoch": 0.009450171821305841, "grad_norm": 4.5902996335202495, "kl": 0.006683349609375, "learning_rate": 9.95274914089347e-07, "loss": 0.0003, "reward": 1.6509097814559937, "reward_std": 0.25286635756492615, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.6977846920490265, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 129.9140625, "epoch": 0.010309278350515464, "grad_norm": 4.341349059182411, "kl": 0.0069732666015625, "learning_rate": 9.948453608247422e-07, "loss": 0.0003, "reward": 1.63209068775177, "reward_std": 0.20565086603164673, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.6711532175540924, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 129.453125, "epoch": 0.011168384879725086, "grad_norm": 5.165117571002937, "kl": 0.0079345703125, "learning_rate": 9.944158075601374e-07, "loss": 0.0003, "reward": 1.7142389416694641, "reward_std": 0.219488687813282, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7376764714717865, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 121.25, "epoch": 0.012027491408934709, "grad_norm": 1.529830865974652, "kl": 0.017974853515625, "learning_rate": 9.939862542955326e-07, "loss": 0.0007, "reward": 1.6100916266441345, "reward_std": 0.15942473709583282, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.6491541266441345, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 118.9765625, "epoch": 0.01288659793814433, "grad_norm": 1.959441191085793, "kl": 0.01171875, "learning_rate": 9.935567010309278e-07, "loss": 0.0005, "reward": 1.6476787328720093, "reward_std": 0.21745194494724274, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.6867412328720093, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 117.8515625, "epoch": 0.013745704467353952, "grad_norm": 1.8012904974133117, "kl": 0.01544189453125, "learning_rate": 9.93127147766323e-07, "loss": 0.0006, "reward": 1.648992121219635, "reward_std": 0.22647833824157715, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.6802421808242798, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 110.6796875, "epoch": 0.014604810996563574, "grad_norm": 2.0371515238907456, "kl": 0.02178955078125, "learning_rate": 9.926975945017182e-07, "loss": 0.0009, "reward": 1.7388362884521484, "reward_std": 0.23847993463277817, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7544613182544708, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 136.171875, "epoch": 0.015463917525773196, "grad_norm": 2.29872644703533, "kl": 0.02154541015625, "learning_rate": 9.922680412371133e-07, "loss": 0.0009, "reward": 1.698328673839569, "reward_std": 0.21018245071172714, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7139536440372467, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 115.8203125, "epoch": 0.01632302405498282, "grad_norm": 1.642185067375333, "kl": 0.026123046875, "learning_rate": 9.918384879725085e-07, "loss": 0.001, "reward": 1.5711023807525635, "reward_std": 0.24714628607034683, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.5945399105548859, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 116.0859375, "epoch": 0.01718213058419244, "grad_norm": 1.8301018336586894, "kl": 0.04449462890625, "learning_rate": 9.914089347079037e-07, "loss": 0.0018, "reward": 1.7036264538764954, "reward_std": 0.2401985377073288, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7348764538764954, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 105.5859375, "epoch": 0.01804123711340206, "grad_norm": 2.4502501155105696, "kl": 0.02874755859375, "learning_rate": 9.90979381443299e-07, "loss": 0.0011, "reward": 1.5346225500106812, "reward_std": 0.18206386268138885, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.5502475053071976, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 117.828125, "epoch": 0.018900343642611683, "grad_norm": 4.579990678131132, "kl": 0.020751953125, "learning_rate": 9.90549828178694e-07, "loss": 0.0008, "reward": 1.5973477363586426, "reward_std": 0.21275992691516876, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.605160266160965, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 127.1171875, "epoch": 0.019759450171821305, "grad_norm": 1.9133262074405595, "kl": 0.025390625, "learning_rate": 9.901202749140893e-07, "loss": 0.001, "reward": 1.6978419423103333, "reward_std": 0.1712438389658928, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7056544423103333, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 118.3515625, "epoch": 0.020618556701030927, "grad_norm": 1.6795653310334677, "kl": 0.0228271484375, "learning_rate": 9.896907216494845e-07, "loss": 0.0009, "reward": 1.5728907585144043, "reward_std": 0.23567190766334534, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.5885157585144043, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 114.4609375, "epoch": 0.02147766323024055, "grad_norm": 1.8945583882920514, "kl": 0.02520751953125, "learning_rate": 9.892611683848797e-07, "loss": 0.001, "reward": 1.6186752319335938, "reward_std": 0.2236367017030716, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6343001425266266, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 116.0, "epoch": 0.022336769759450172, "grad_norm": 1.6471025834878852, "kl": 0.02325439453125, "learning_rate": 9.888316151202748e-07, "loss": 0.0009, "reward": 1.7836337089538574, "reward_std": 0.18849463760852814, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.791446179151535, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 114.0546875, "epoch": 0.023195876288659795, "grad_norm": 1.8224790908918138, "kl": 0.01788330078125, "learning_rate": 9.8840206185567e-07, "loss": 0.0007, "reward": 1.6910547018051147, "reward_std": 0.177225723862648, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6988671720027924, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 110.984375, "epoch": 0.024054982817869417, "grad_norm": 1.7847962406057651, "kl": 0.0181884765625, "learning_rate": 9.879725085910652e-07, "loss": 0.0007, "reward": 1.7889049053192139, "reward_std": 0.17536725103855133, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7967174351215363, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 113.0234375, "epoch": 0.02491408934707904, "grad_norm": 3.1801733940089547, "kl": 0.02703857421875, "learning_rate": 9.875429553264604e-07, "loss": 0.0011, "reward": 1.7557846307754517, "reward_std": 0.17591644823551178, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7714096307754517, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 109.265625, "epoch": 0.02577319587628866, "grad_norm": 3.160227437825882, "kl": 0.0194091796875, "learning_rate": 9.871134020618556e-07, "loss": 0.0008, "reward": 1.6949102878570557, "reward_std": 0.1834886148571968, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.710535317659378, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 112.265625, "epoch": 0.02663230240549828, "grad_norm": 1.4868195880486625, "kl": 0.0224609375, "learning_rate": 9.866838487972508e-07, "loss": 0.0009, "reward": 1.6780639290809631, "reward_std": 0.16649843007326126, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7015013694763184, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 105.984375, "epoch": 0.027491408934707903, "grad_norm": 22.472420255137777, "kl": 0.017974853515625, "learning_rate": 9.86254295532646e-07, "loss": 0.0007, "reward": 1.64022696018219, "reward_std": 0.19799882918596268, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6558520197868347, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 110.46875, "epoch": 0.028350515463917526, "grad_norm": 1.8825542488707594, "kl": 0.027587890625, "learning_rate": 9.858247422680412e-07, "loss": 0.0011, "reward": 1.686434805393219, "reward_std": 0.1929420754313469, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.694247305393219, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 111.7109375, "epoch": 0.029209621993127148, "grad_norm": 1.318783449188221, "kl": 0.011474609375, "learning_rate": 9.853951890034363e-07, "loss": 0.0005, "reward": 1.7323167324066162, "reward_std": 0.14837764203548431, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7323167026042938, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 117.7421875, "epoch": 0.03006872852233677, "grad_norm": 1.6922286395862565, "kl": 0.016632080078125, "learning_rate": 9.849656357388315e-07, "loss": 0.0007, "reward": 1.6450969576835632, "reward_std": 0.18919343501329422, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6450969874858856, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 107.8828125, "epoch": 0.030927835051546393, "grad_norm": 1.8376143817871085, "kl": 0.017822265625, "learning_rate": 9.845360824742267e-07, "loss": 0.0007, "reward": 1.6301739811897278, "reward_std": 0.20516958832740784, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6379865407943726, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 98.4921875, "epoch": 0.03178694158075601, "grad_norm": 2.8326207376956565, "kl": 0.009918212890625, "learning_rate": 9.84106529209622e-07, "loss": 0.0004, "reward": 1.709650695323944, "reward_std": 0.1534641981124878, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7096506357192993, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 101.6796875, "epoch": 0.03264604810996564, "grad_norm": 2.0347285959045536, "kl": 0.009307861328125, "learning_rate": 9.83676975945017e-07, "loss": 0.0004, "reward": 1.6739385724067688, "reward_std": 0.13153361529111862, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6739385724067688, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 112.5625, "epoch": 0.03350515463917526, "grad_norm": 1.9197526254739072, "kl": 0.01287841796875, "learning_rate": 9.832474226804123e-07, "loss": 0.0005, "reward": 1.7046000361442566, "reward_std": 0.19367831200361252, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7124125361442566, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 108.1328125, "epoch": 0.03436426116838488, "grad_norm": 3.1475346486430174, "kl": 0.011077880859375, "learning_rate": 9.828178694158075e-07, "loss": 0.0004, "reward": 1.714475929737091, "reward_std": 0.11251556500792503, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7144758999347687, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 118.6484375, "epoch": 0.0352233676975945, "grad_norm": 1.9386038530957954, "kl": 0.013763427734375, "learning_rate": 9.823883161512027e-07, "loss": 0.0006, "reward": 1.7007797360420227, "reward_std": 0.22316401451826096, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7085922360420227, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 111.2734375, "epoch": 0.03608247422680412, "grad_norm": 1.6026084575263893, "kl": 0.009185791015625, "learning_rate": 9.819587628865979e-07, "loss": 0.0004, "reward": 1.7506839632987976, "reward_std": 0.16962886601686478, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.75068399310112, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 109.265625, "epoch": 0.036941580756013746, "grad_norm": 1.4019849035616754, "kl": 0.015960693359375, "learning_rate": 9.81529209621993e-07, "loss": 0.0006, "reward": 1.6929492950439453, "reward_std": 0.15880103036761284, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6929492652416229, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 108.78125, "epoch": 0.037800687285223365, "grad_norm": 1.874751262052608, "kl": 0.01318359375, "learning_rate": 9.810996563573882e-07, "loss": 0.0005, "reward": 1.7645366191864014, "reward_std": 0.1825566589832306, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7723491191864014, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 104.1171875, "epoch": 0.03865979381443299, "grad_norm": 1.628943635799199, "kl": 0.012939453125, "learning_rate": 9.806701030927834e-07, "loss": 0.0005, "reward": 1.7157459259033203, "reward_std": 0.13008157908916473, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7157459259033203, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 112.3203125, "epoch": 0.03951890034364261, "grad_norm": 1.3333890922786833, "kl": 0.02374267578125, "learning_rate": 9.802405498281786e-07, "loss": 0.001, "reward": 1.7577217817306519, "reward_std": 0.1191495880484581, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7577218413352966, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 117.3359375, "epoch": 0.040378006872852236, "grad_norm": 2.7773278425143313, "kl": 0.014434814453125, "learning_rate": 9.798109965635738e-07, "loss": 0.0006, "reward": 1.6698015332221985, "reward_std": 0.18330103904008865, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6776140332221985, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 123.4296875, "epoch": 0.041237113402061855, "grad_norm": 3.1125459908194886, "kl": 0.0120849609375, "learning_rate": 9.79381443298969e-07, "loss": 0.0005, "reward": 1.7181047797203064, "reward_std": 0.2058340311050415, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.718104749917984, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 118.65625, "epoch": 0.04209621993127148, "grad_norm": 1.7917182356261732, "kl": 0.01312255859375, "learning_rate": 9.789518900343642e-07, "loss": 0.0005, "reward": 1.6312512159347534, "reward_std": 0.1998831108212471, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.631251186132431, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 116.765625, "epoch": 0.0429553264604811, "grad_norm": 2.715958501901883, "kl": 0.01019287109375, "learning_rate": 9.785223367697594e-07, "loss": 0.0004, "reward": 1.758800745010376, "reward_std": 0.17326835542917252, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.758800745010376, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 113.609375, "epoch": 0.04381443298969072, "grad_norm": 1.6233811186388696, "kl": 0.0115966796875, "learning_rate": 9.780927835051545e-07, "loss": 0.0005, "reward": 1.693307340145111, "reward_std": 0.16266920417547226, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6933073401451111, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 108.203125, "epoch": 0.044673539518900345, "grad_norm": 1.655879971578571, "kl": 0.024871826171875, "learning_rate": 9.776632302405497e-07, "loss": 0.001, "reward": 1.6892904043197632, "reward_std": 0.17997007817029953, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7049154043197632, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 121.65625, "epoch": 0.04553264604810996, "grad_norm": 1.6681145967268827, "kl": 0.010528564453125, "learning_rate": 9.77233676975945e-07, "loss": 0.0004, "reward": 1.7307260036468506, "reward_std": 0.18761465698480606, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7307261228561401, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 125.5078125, "epoch": 0.04639175257731959, "grad_norm": 1.9783878004259334, "kl": 0.00994873046875, "learning_rate": 9.768041237113401e-07, "loss": 0.0004, "reward": 1.6929954290390015, "reward_std": 0.15189384669065475, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6929953396320343, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 135.3828125, "epoch": 0.04725085910652921, "grad_norm": 1.6703845988223984, "kl": 0.014129638671875, "learning_rate": 9.763745704467353e-07, "loss": 0.0006, "reward": 1.616851270198822, "reward_std": 0.18548668175935745, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6168512403964996, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 117.6015625, "epoch": 0.048109965635738834, "grad_norm": 1.5488291217502541, "kl": 0.013580322265625, "learning_rate": 9.759450171821305e-07, "loss": 0.0005, "reward": 1.7046599388122559, "reward_std": 0.22010766714811325, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7124724388122559, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 138.9453125, "epoch": 0.04896907216494845, "grad_norm": 1.4634966134639982, "kl": 0.0120849609375, "learning_rate": 9.755154639175257e-07, "loss": 0.0005, "reward": 1.6978161334991455, "reward_std": 0.17123103514313698, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7134410738945007, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 104.203125, "epoch": 0.04982817869415808, "grad_norm": 2.773139620924608, "kl": 0.01214599609375, "learning_rate": 9.750859106529209e-07, "loss": 0.0005, "reward": 1.7033132314682007, "reward_std": 0.17858785390853882, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7033132314682007, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 127.6484375, "epoch": 0.0506872852233677, "grad_norm": 2.112316846879123, "kl": 0.015411376953125, "learning_rate": 9.74656357388316e-07, "loss": 0.0006, "reward": 1.7434183359146118, "reward_std": 0.1504114419221878, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7434183359146118, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 116.8203125, "epoch": 0.05154639175257732, "grad_norm": 4.733899218910252, "kl": 0.0123291015625, "learning_rate": 9.742268041237112e-07, "loss": 0.0005, "reward": 1.7861030101776123, "reward_std": 0.21234874427318573, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7939155101776123, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 110.7734375, "epoch": 0.05240549828178694, "grad_norm": 2.410724757522067, "kl": 0.01287841796875, "learning_rate": 9.737972508591064e-07, "loss": 0.0005, "reward": 1.7895718216896057, "reward_std": 0.1751849427819252, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7973843216896057, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 114.0, "epoch": 0.05326460481099656, "grad_norm": 1.3683631722816978, "kl": 0.0125732421875, "learning_rate": 9.733676975945016e-07, "loss": 0.0005, "reward": 1.6282992959022522, "reward_std": 0.1704944670200348, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6439243257045746, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 119.2421875, "epoch": 0.05412371134020619, "grad_norm": 1.6450659007412214, "kl": 0.01019287109375, "learning_rate": 9.729381443298968e-07, "loss": 0.0004, "reward": 1.6509077548980713, "reward_std": 0.17651135474443436, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6509077548980713, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 132.171875, "epoch": 0.054982817869415807, "grad_norm": 1.6294625649527108, "kl": 0.01202392578125, "learning_rate": 9.72508591065292e-07, "loss": 0.0005, "reward": 1.7486688494682312, "reward_std": 0.2049519494175911, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7564813494682312, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 112.4375, "epoch": 0.05584192439862543, "grad_norm": 1.3207489104187833, "kl": 0.014923095703125, "learning_rate": 9.720790378006872e-07, "loss": 0.0006, "reward": 1.686493158340454, "reward_std": 0.1424517035484314, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7021180987358093, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 107.4453125, "epoch": 0.05670103092783505, "grad_norm": 1.8194250939249632, "kl": 0.01617431640625, "learning_rate": 9.716494845360824e-07, "loss": 0.0006, "reward": 1.7264857292175293, "reward_std": 0.16944430023431778, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7342982888221741, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 122.1953125, "epoch": 0.05756013745704467, "grad_norm": 1.8454925104090358, "kl": 0.013275146484375, "learning_rate": 9.712199312714776e-07, "loss": 0.0005, "reward": 1.7975256443023682, "reward_std": 0.18923871964216232, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7975256443023682, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 110.2265625, "epoch": 0.058419243986254296, "grad_norm": 1.8448295930855816, "kl": 0.012298583984375, "learning_rate": 9.707903780068727e-07, "loss": 0.0005, "reward": 1.7086012363433838, "reward_std": 0.14006467163562775, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7086012661457062, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 111.421875, "epoch": 0.059278350515463915, "grad_norm": 1.5791228430586928, "kl": 0.0155029296875, "learning_rate": 9.70360824742268e-07, "loss": 0.0006, "reward": 1.799054503440857, "reward_std": 0.14379648119211197, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8068670034408569, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 132.359375, "epoch": 0.06013745704467354, "grad_norm": 1.3943612865559436, "kl": 0.010162353515625, "learning_rate": 9.699312714776631e-07, "loss": 0.0004, "reward": 1.8229413032531738, "reward_std": 0.15740595757961273, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8229413330554962, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 133.6953125, "epoch": 0.06099656357388316, "grad_norm": 2.8156538308984262, "kl": 0.009796142578125, "learning_rate": 9.695017182130583e-07, "loss": 0.0004, "reward": 1.7003414630889893, "reward_std": 0.19474071264266968, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7159664928913116, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 125.2265625, "epoch": 0.061855670103092786, "grad_norm": 1.866123797516234, "kl": 0.0084228515625, "learning_rate": 9.690721649484535e-07, "loss": 0.0003, "reward": 1.7515292763710022, "reward_std": 0.1583714485168457, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7515292763710022, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 143.3671875, "epoch": 0.0627147766323024, "grad_norm": 1.394074930738141, "kl": 0.011199951171875, "learning_rate": 9.686426116838487e-07, "loss": 0.0004, "reward": 1.679770588874817, "reward_std": 0.21282349526882172, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7110206186771393, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 118.3984375, "epoch": 0.06357388316151202, "grad_norm": 4.5617894751936285, "kl": 0.019927978515625, "learning_rate": 9.682130584192439e-07, "loss": 0.0008, "reward": 1.6840202808380127, "reward_std": 0.18173471838235855, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6918327808380127, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 131.40625, "epoch": 0.06443298969072164, "grad_norm": 20.644470378771178, "kl": 0.0096435546875, "learning_rate": 9.67783505154639e-07, "loss": 0.0004, "reward": 1.6653663516044617, "reward_std": 0.15374305844306946, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6653663218021393, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 129.75, "epoch": 0.06529209621993128, "grad_norm": 1.3520802500043023, "kl": 0.011871337890625, "learning_rate": 9.673539518900342e-07, "loss": 0.0005, "reward": 1.7905267477035522, "reward_std": 0.11402217298746109, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.790526807308197, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 130.0703125, "epoch": 0.0661512027491409, "grad_norm": 2.1828273386945662, "kl": 0.008880615234375, "learning_rate": 9.669243986254294e-07, "loss": 0.0004, "reward": 1.6396149396896362, "reward_std": 0.22704153507947922, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6552398800849915, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 136.5625, "epoch": 0.06701030927835051, "grad_norm": 1.9960711680675047, "kl": 0.01043701171875, "learning_rate": 9.664948453608246e-07, "loss": 0.0004, "reward": 1.717953085899353, "reward_std": 0.1279629021883011, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7179530560970306, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 118.375, "epoch": 0.06786941580756013, "grad_norm": 1.573275119243184, "kl": 0.009918212890625, "learning_rate": 9.660652920962198e-07, "loss": 0.0004, "reward": 1.7117525339126587, "reward_std": 0.17472369223833084, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7117525339126587, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 141.6953125, "epoch": 0.06872852233676977, "grad_norm": 5.858171715596188, "kl": 0.0074462890625, "learning_rate": 9.65635738831615e-07, "loss": 0.0003, "reward": 1.7692703604698181, "reward_std": 0.18383784592151642, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7692703902721405, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 121.8203125, "epoch": 0.06958762886597938, "grad_norm": 2.153346346698614, "kl": 0.0079345703125, "learning_rate": 9.652061855670102e-07, "loss": 0.0003, "reward": 1.7922507524490356, "reward_std": 0.16792288422584534, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7922507226467133, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 128.2890625, "epoch": 0.070446735395189, "grad_norm": 1.5730677509411617, "kl": 0.009002685546875, "learning_rate": 9.647766323024054e-07, "loss": 0.0004, "reward": 1.7323782444000244, "reward_std": 0.17609868198633194, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7480032444000244, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 134.578125, "epoch": 0.07130584192439862, "grad_norm": 1.374124074188527, "kl": 0.011138916015625, "learning_rate": 9.643470790378006e-07, "loss": 0.0004, "reward": 1.6585241556167603, "reward_std": 0.19784274697303772, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.6819616854190826, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 123.9296875, "epoch": 0.07216494845360824, "grad_norm": 2.383137509998003, "kl": 0.01141357421875, "learning_rate": 9.639175257731957e-07, "loss": 0.0005, "reward": 1.6670758724212646, "reward_std": 0.19278159737586975, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6670758724212646, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 128.578125, "epoch": 0.07302405498281787, "grad_norm": 1.7362778804071193, "kl": 0.011749267578125, "learning_rate": 9.63487972508591e-07, "loss": 0.0005, "reward": 1.7598541378974915, "reward_std": 0.1449267938733101, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7754791080951691, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 164.609375, "epoch": 0.07388316151202749, "grad_norm": 2.8650428143418325, "kl": 0.012176513671875, "learning_rate": 9.630584192439863e-07, "loss": 0.0005, "reward": 1.6261486411094666, "reward_std": 0.29106324911117554, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.6573987007141113, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 134.90625, "epoch": 0.07474226804123711, "grad_norm": 1.9633221241492278, "kl": 0.0074920654296875, "learning_rate": 9.626288659793815e-07, "loss": 0.0003, "reward": 1.7269136905670166, "reward_std": 0.1622983068227768, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7347261309623718, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 137.75, "epoch": 0.07560137457044673, "grad_norm": 2.0681449704464954, "kl": 0.0077056884765625, "learning_rate": 9.621993127147767e-07, "loss": 0.0003, "reward": 1.7325759530067444, "reward_std": 0.14999449253082275, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7325759530067444, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 135.3984375, "epoch": 0.07646048109965636, "grad_norm": 1.1844639755913142, "kl": 0.007354736328125, "learning_rate": 9.61769759450172e-07, "loss": 0.0003, "reward": 1.7702937126159668, "reward_std": 0.12665041163563728, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7702936828136444, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 125.25, "epoch": 0.07731958762886598, "grad_norm": 1.3413729169105462, "kl": 0.0083465576171875, "learning_rate": 9.61340206185567e-07, "loss": 0.0003, "reward": 1.6900493502616882, "reward_std": 0.08954241871833801, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6978618502616882, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 128.421875, "epoch": 0.0781786941580756, "grad_norm": 1.9584992131927317, "kl": 0.015350341796875, "learning_rate": 9.609106529209623e-07, "loss": 0.0006, "reward": 1.6389594078063965, "reward_std": 0.231736421585083, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6545844078063965, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 132.6796875, "epoch": 0.07903780068728522, "grad_norm": 1.4808391695134289, "kl": 0.015228271484375, "learning_rate": 9.604810996563575e-07, "loss": 0.0006, "reward": 1.7157284617424011, "reward_std": 0.14664900302886963, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7313534915447235, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 145.8515625, "epoch": 0.07989690721649484, "grad_norm": 1.3560060082947887, "kl": 0.011688232421875, "learning_rate": 9.600515463917527e-07, "loss": 0.0005, "reward": 1.6663364171981812, "reward_std": 0.17353109270334244, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.6897739470005035, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 126.78125, "epoch": 0.08075601374570447, "grad_norm": 1.4417733286684913, "kl": 0.009521484375, "learning_rate": 9.596219931271478e-07, "loss": 0.0004, "reward": 1.695834457874298, "reward_std": 0.15896284580230713, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7270845174789429, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 125.7421875, "epoch": 0.08161512027491409, "grad_norm": 1.81422275990836, "kl": 0.008056640625, "learning_rate": 9.59192439862543e-07, "loss": 0.0003, "reward": 1.7134593725204468, "reward_std": 0.16323717311024666, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7134594321250916, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 127.21875, "epoch": 0.08247422680412371, "grad_norm": 1.2278195202315274, "kl": 0.015380859375, "learning_rate": 9.587628865979382e-07, "loss": 0.0006, "reward": 1.833047330379486, "reward_std": 0.15521731600165367, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8486724197864532, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 124.7890625, "epoch": 0.08333333333333333, "grad_norm": 1.7371338217464498, "kl": 0.012603759765625, "learning_rate": 9.583333333333334e-07, "loss": 0.0005, "reward": 1.6824183464050293, "reward_std": 0.1946360394358635, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6824184060096741, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 127.09375, "epoch": 0.08419243986254296, "grad_norm": 2.9141007581916285, "kl": 0.01483154296875, "learning_rate": 9.579037800687286e-07, "loss": 0.0006, "reward": 1.7687426805496216, "reward_std": 0.16979511827230453, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7687426805496216, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 129.703125, "epoch": 0.08505154639175258, "grad_norm": 3.157985367093318, "kl": 0.012939453125, "learning_rate": 9.574742268041238e-07, "loss": 0.0005, "reward": 1.8471919298171997, "reward_std": 0.08835843577980995, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8471919894218445, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 124.96875, "epoch": 0.0859106529209622, "grad_norm": 1.791601113718697, "kl": 0.01171875, "learning_rate": 9.57044673539519e-07, "loss": 0.0005, "reward": 1.7132195234298706, "reward_std": 0.1518014371395111, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7132195234298706, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 116.640625, "epoch": 0.08676975945017182, "grad_norm": 1.6750678433197985, "kl": 0.01409912109375, "learning_rate": 9.566151202749142e-07, "loss": 0.0006, "reward": 1.6660110354423523, "reward_std": 0.16518359631299973, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6660110354423523, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 131.453125, "epoch": 0.08762886597938144, "grad_norm": 2.100280147210653, "kl": 0.014434814453125, "learning_rate": 9.561855670103093e-07, "loss": 0.0006, "reward": 1.7369045615196228, "reward_std": 0.11925114691257477, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.736904501914978, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 118.5703125, "epoch": 0.08848797250859107, "grad_norm": 2.48977370571022, "kl": 0.012603759765625, "learning_rate": 9.557560137457045e-07, "loss": 0.0005, "reward": 1.7730196118354797, "reward_std": 0.18081708252429962, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7808321118354797, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 115.546875, "epoch": 0.08934707903780069, "grad_norm": 2.083788782587846, "kl": 0.0164794921875, "learning_rate": 9.553264604810997e-07, "loss": 0.0007, "reward": 1.668508231639862, "reward_std": 0.1277586668729782, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6763207614421844, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 118.1953125, "epoch": 0.09020618556701031, "grad_norm": 3.157040349398891, "kl": 0.015838623046875, "learning_rate": 9.54896907216495e-07, "loss": 0.0006, "reward": 1.792285680770874, "reward_std": 0.1403278186917305, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.800098180770874, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 123.53125, "epoch": 0.09106529209621993, "grad_norm": 1.9043694578355221, "kl": 0.01544189453125, "learning_rate": 9.5446735395189e-07, "loss": 0.0006, "reward": 1.7426919341087341, "reward_std": 0.17454702407121658, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7426919043064117, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 116.0234375, "epoch": 0.09192439862542956, "grad_norm": 1.1762483491043985, "kl": 0.013885498046875, "learning_rate": 9.540378006872853e-07, "loss": 0.0006, "reward": 1.76499605178833, "reward_std": 0.12454288452863693, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7649960517883301, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 125.375, "epoch": 0.09278350515463918, "grad_norm": 2.4203700264289822, "kl": 0.014434814453125, "learning_rate": 9.536082474226805e-07, "loss": 0.0006, "reward": 1.7665973901748657, "reward_std": 0.16096310317516327, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7900348901748657, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 122.6796875, "epoch": 0.0936426116838488, "grad_norm": 1.1516404581818234, "kl": 0.013824462890625, "learning_rate": 9.531786941580757e-07, "loss": 0.0006, "reward": 1.7484302520751953, "reward_std": 0.11297959834337234, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7562427520751953, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 132.765625, "epoch": 0.09450171821305842, "grad_norm": 1.8664322686617876, "kl": 0.013092041015625, "learning_rate": 9.527491408934707e-07, "loss": 0.0005, "reward": 1.7427197694778442, "reward_std": 0.1969207152724266, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7505322694778442, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 134.4140625, "epoch": 0.09536082474226804, "grad_norm": 2.8553637446748055, "kl": 0.01678466796875, "learning_rate": 9.523195876288659e-07, "loss": 0.0007, "reward": 1.6789370775222778, "reward_std": 0.14484084397554398, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6867496371269226, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 110.9765625, "epoch": 0.09621993127147767, "grad_norm": 2.8547235701713527, "kl": 0.01629638671875, "learning_rate": 9.518900343642611e-07, "loss": 0.0007, "reward": 1.7494203448295593, "reward_std": 0.17836876958608627, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7572328150272369, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 122.1328125, "epoch": 0.09707903780068729, "grad_norm": 1.1274603410680148, "kl": 0.014923095703125, "learning_rate": 9.514604810996563e-07, "loss": 0.0006, "reward": 1.775137186050415, "reward_std": 0.12154169753193855, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7829497158527374, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 107.5625, "epoch": 0.0979381443298969, "grad_norm": 2.4835835333144347, "kl": 0.025482177734375, "learning_rate": 9.510309278350515e-07, "loss": 0.001, "reward": 1.6830406785011292, "reward_std": 0.07851138710975647, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6986656486988068, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 117.2578125, "epoch": 0.09879725085910653, "grad_norm": 1.457424888302412, "kl": 0.01715087890625, "learning_rate": 9.506013745704467e-07, "loss": 0.0007, "reward": 1.8288711309432983, "reward_std": 0.1552474945783615, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8366836309432983, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 146.15625, "epoch": 0.09965635738831616, "grad_norm": 5.261633631939683, "kl": 0.012969970703125, "learning_rate": 9.501718213058419e-07, "loss": 0.0005, "reward": 1.7097431421279907, "reward_std": 0.1742759346961975, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7097431421279907, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 142.25, "epoch": 0.10051546391752578, "grad_norm": 2.0866973884427784, "kl": 0.01885986328125, "learning_rate": 9.497422680412371e-07, "loss": 0.0008, "reward": 1.7714307308197021, "reward_std": 0.2078239470720291, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7714307308197021, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 122.0546875, "epoch": 0.1013745704467354, "grad_norm": 1.9401421255992048, "kl": 0.0369873046875, "learning_rate": 9.493127147766322e-07, "loss": 0.0015, "reward": 1.7130677700042725, "reward_std": 0.19828316569328308, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7208802998065948, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 120.390625, "epoch": 0.10223367697594501, "grad_norm": 2.5355319382822827, "kl": 0.0181884765625, "learning_rate": 9.488831615120274e-07, "loss": 0.0007, "reward": 1.737857460975647, "reward_std": 0.1541828289628029, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7378574907779694, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 120.7265625, "epoch": 0.10309278350515463, "grad_norm": 1.4772557885572284, "kl": 0.02142333984375, "learning_rate": 9.484536082474226e-07, "loss": 0.0009, "reward": 1.7606690526008606, "reward_std": 0.18398111313581467, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7606690227985382, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 136.7578125, "epoch": 0.10395189003436427, "grad_norm": 2.4929887339743244, "kl": 0.01837158203125, "learning_rate": 9.480240549828178e-07, "loss": 0.0007, "reward": 1.727208137512207, "reward_std": 0.13875007256865501, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.727208137512207, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 114.640625, "epoch": 0.10481099656357389, "grad_norm": 3.0006792162829257, "kl": 0.02630615234375, "learning_rate": 9.47594501718213e-07, "loss": 0.001, "reward": 1.8001123070716858, "reward_std": 0.20054399222135544, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8157372772693634, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 113.046875, "epoch": 0.1056701030927835, "grad_norm": 2.6748232943569827, "kl": 0.0206298828125, "learning_rate": 9.471649484536082e-07, "loss": 0.0008, "reward": 1.7969177961349487, "reward_std": 0.10830671712756157, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8047303557395935, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 130.953125, "epoch": 0.10652920962199312, "grad_norm": 1.4841628651180314, "kl": 0.024658203125, "learning_rate": 9.467353951890034e-07, "loss": 0.001, "reward": 1.7056427001953125, "reward_std": 0.1875780150294304, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7368926405906677, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 130.203125, "epoch": 0.10738831615120274, "grad_norm": 1.874212030480472, "kl": 0.023193359375, "learning_rate": 9.463058419243986e-07, "loss": 0.0009, "reward": 1.6158097386360168, "reward_std": 0.1327304244041443, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6236222088336945, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 135.75, "epoch": 0.10824742268041238, "grad_norm": 1.0408020027656946, "kl": 0.014923095703125, "learning_rate": 9.458762886597938e-07, "loss": 0.0006, "reward": 1.7203240394592285, "reward_std": 0.1105212289839983, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7281366288661957, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 116.359375, "epoch": 0.109106529209622, "grad_norm": 1.2726802820456735, "kl": 0.0220947265625, "learning_rate": 9.454467353951889e-07, "loss": 0.0009, "reward": 1.6998094320297241, "reward_std": 0.16664542257785797, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7076218724250793, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 118.765625, "epoch": 0.10996563573883161, "grad_norm": 1.6961988401912567, "kl": 0.015594482421875, "learning_rate": 9.450171821305841e-07, "loss": 0.0006, "reward": 1.6439663171768188, "reward_std": 0.16561322659254074, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6517787873744965, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 127.390625, "epoch": 0.11082474226804123, "grad_norm": 2.510071105917193, "kl": 0.015350341796875, "learning_rate": 9.445876288659793e-07, "loss": 0.0006, "reward": 1.66585111618042, "reward_std": 0.1521969810128212, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6736635565757751, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 117.484375, "epoch": 0.11168384879725086, "grad_norm": 5.638902477820217, "kl": 0.01513671875, "learning_rate": 9.441580756013745e-07, "loss": 0.0006, "reward": 1.7457653284072876, "reward_std": 0.15448395907878876, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7457653880119324, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 124.1484375, "epoch": 0.11254295532646048, "grad_norm": 1.513851686329793, "kl": 0.015289306640625, "learning_rate": 9.437285223367697e-07, "loss": 0.0006, "reward": 1.7635972499847412, "reward_std": 0.17401929199695587, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7714097797870636, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 119.3359375, "epoch": 0.1134020618556701, "grad_norm": 1.322125994104397, "kl": 0.012359619140625, "learning_rate": 9.432989690721649e-07, "loss": 0.0005, "reward": 1.823841154575348, "reward_std": 0.14871416985988617, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8238411545753479, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 130.0, "epoch": 0.11426116838487972, "grad_norm": 1.355698173704868, "kl": 0.023406982421875, "learning_rate": 9.428694158075601e-07, "loss": 0.0009, "reward": 1.766915202140808, "reward_std": 0.14532212167978287, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7825402021408081, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 116.4140625, "epoch": 0.11512027491408934, "grad_norm": 1.4739462448440241, "kl": 0.0181884765625, "learning_rate": 9.424398625429553e-07, "loss": 0.0007, "reward": 1.7099063992500305, "reward_std": 0.1308385580778122, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7099063992500305, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 116.9296875, "epoch": 0.11597938144329897, "grad_norm": 1.3274707295914516, "kl": 0.014129638671875, "learning_rate": 9.420103092783504e-07, "loss": 0.0006, "reward": 1.7741219997406006, "reward_std": 0.16581101715564728, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7819344997406006, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 129.140625, "epoch": 0.11683848797250859, "grad_norm": 1.5843650427627012, "kl": 0.01092529296875, "learning_rate": 9.415807560137456e-07, "loss": 0.0004, "reward": 1.6576457023620605, "reward_std": 0.1768977865576744, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6654582023620605, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 140.8984375, "epoch": 0.11769759450171821, "grad_norm": 1.670051625687803, "kl": 0.012542724609375, "learning_rate": 9.411512027491408e-07, "loss": 0.0005, "reward": 1.7060455083847046, "reward_std": 0.14821957796812057, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.706045538187027, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 123.953125, "epoch": 0.11855670103092783, "grad_norm": 1.64228564707855, "kl": 0.0146484375, "learning_rate": 9.40721649484536e-07, "loss": 0.0006, "reward": 1.698437750339508, "reward_std": 0.1564936824142933, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6984376907348633, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 128.15625, "epoch": 0.11941580756013746, "grad_norm": 1.376863304210578, "kl": 0.010498046875, "learning_rate": 9.402920962199312e-07, "loss": 0.0004, "reward": 1.8016321063041687, "reward_std": 0.15034572780132294, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8172571659088135, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 123.8359375, "epoch": 0.12027491408934708, "grad_norm": 6.099834295001363, "kl": 0.013580322265625, "learning_rate": 9.398625429553264e-07, "loss": 0.0005, "reward": 1.6821348667144775, "reward_std": 0.13406795635819435, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6821348965167999, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 125.953125, "epoch": 0.1211340206185567, "grad_norm": 1.6651173493841875, "kl": 0.01617431640625, "learning_rate": 9.394329896907216e-07, "loss": 0.0006, "reward": 1.7098830938339233, "reward_std": 0.17125768959522247, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7333205342292786, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 118.2265625, "epoch": 0.12199312714776632, "grad_norm": 2.528179883736634, "kl": 0.013946533203125, "learning_rate": 9.390034364261168e-07, "loss": 0.0006, "reward": 1.7531933784484863, "reward_std": 0.15759651362895966, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7610058188438416, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 130.453125, "epoch": 0.12285223367697594, "grad_norm": 3.1296650163459088, "kl": 0.01806640625, "learning_rate": 9.38573883161512e-07, "loss": 0.0007, "reward": 1.8122310042381287, "reward_std": 0.17129109799861908, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8122310042381287, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 132.421875, "epoch": 0.12371134020618557, "grad_norm": 2.002669092661699, "kl": 0.01312255859375, "learning_rate": 9.381443298969071e-07, "loss": 0.0005, "reward": 1.7977581024169922, "reward_std": 0.145246010273695, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8055705428123474, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 126.7421875, "epoch": 0.12457044673539519, "grad_norm": 1.2100201264770116, "kl": 0.0140380859375, "learning_rate": 9.377147766323023e-07, "loss": 0.0006, "reward": 1.829803466796875, "reward_std": 0.1415480300784111, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.837615966796875, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 129.3203125, "epoch": 0.1254295532646048, "grad_norm": 2.180495745625869, "kl": 0.01171875, "learning_rate": 9.372852233676975e-07, "loss": 0.0005, "reward": 1.7289317846298218, "reward_std": 0.18080250918865204, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7445567846298218, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 126.84375, "epoch": 0.12628865979381443, "grad_norm": 2.9127570085332892, "kl": 0.01275634765625, "learning_rate": 9.368556701030927e-07, "loss": 0.0005, "reward": 1.8339774012565613, "reward_std": 0.09063693135976791, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8339774012565613, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 141.8125, "epoch": 0.12714776632302405, "grad_norm": 1.5988443873795521, "kl": 0.015777587890625, "learning_rate": 9.364261168384879e-07, "loss": 0.0006, "reward": 1.7010595202445984, "reward_std": 0.21928820759058, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7010595202445984, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 125.015625, "epoch": 0.12800687285223367, "grad_norm": 2.3035156258187697, "kl": 0.016021728515625, "learning_rate": 9.359965635738831e-07, "loss": 0.0006, "reward": 1.7444458603858948, "reward_std": 0.21774866431951523, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7444458901882172, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 124.453125, "epoch": 0.12886597938144329, "grad_norm": 1.335338188075339, "kl": 0.014312744140625, "learning_rate": 9.355670103092783e-07, "loss": 0.0006, "reward": 1.8212101459503174, "reward_std": 0.08616838604211807, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8212102353572845, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 126.2734375, "epoch": 0.12972508591065293, "grad_norm": 1.4631690042303354, "kl": 0.01123046875, "learning_rate": 9.351374570446736e-07, "loss": 0.0004, "reward": 1.7321330308914185, "reward_std": 0.1448444351553917, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7321330606937408, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 128.3515625, "epoch": 0.13058419243986255, "grad_norm": 0.9273287057028194, "kl": 0.009765625, "learning_rate": 9.347079037800687e-07, "loss": 0.0004, "reward": 1.7624321579933167, "reward_std": 0.09383735246956348, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7624321281909943, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 146.8515625, "epoch": 0.13144329896907217, "grad_norm": 1.47382353081262, "kl": 0.0152587890625, "learning_rate": 9.342783505154639e-07, "loss": 0.0006, "reward": 1.6556521654129028, "reward_std": 0.15647713094949722, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6634646654129028, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 131.5859375, "epoch": 0.1323024054982818, "grad_norm": 7.187735004337097, "kl": 0.01318359375, "learning_rate": 9.338487972508591e-07, "loss": 0.0005, "reward": 1.781952142715454, "reward_std": 0.20670166611671448, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7897646427154541, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 117.8828125, "epoch": 0.1331615120274914, "grad_norm": 1.5089418750844499, "kl": 0.016754150390625, "learning_rate": 9.334192439862543e-07, "loss": 0.0007, "reward": 1.763621985912323, "reward_std": 0.18807729333639145, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7792469561100006, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 116.15625, "epoch": 0.13402061855670103, "grad_norm": 14.193518545363872, "kl": 0.012908935546875, "learning_rate": 9.329896907216495e-07, "loss": 0.0005, "reward": 1.8192681670188904, "reward_std": 0.11732286959886551, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8192681968212128, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 124.5859375, "epoch": 0.13487972508591065, "grad_norm": 1.5467385079039138, "kl": 0.012451171875, "learning_rate": 9.325601374570447e-07, "loss": 0.0005, "reward": 1.710074007511139, "reward_std": 0.2205553948879242, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7178865075111389, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 135.3671875, "epoch": 0.13573883161512026, "grad_norm": 1.4389192395608355, "kl": 0.0135498046875, "learning_rate": 9.321305841924399e-07, "loss": 0.0005, "reward": 1.6988090872764587, "reward_std": 0.10531151667237282, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6988090872764587, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 137.109375, "epoch": 0.13659793814432988, "grad_norm": 1.6050785903093132, "kl": 0.016448974609375, "learning_rate": 9.317010309278351e-07, "loss": 0.0007, "reward": 1.7135010957717896, "reward_std": 0.21779370307922363, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7213136553764343, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 122.8828125, "epoch": 0.13745704467353953, "grad_norm": 1.4854987369310133, "kl": 0.016021728515625, "learning_rate": 9.312714776632303e-07, "loss": 0.0006, "reward": 1.7385917901992798, "reward_std": 0.15268366783857346, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7464043200016022, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 135.8203125, "epoch": 0.13831615120274915, "grad_norm": 3.524052752645729, "kl": 0.0146484375, "learning_rate": 9.308419243986254e-07, "loss": 0.0006, "reward": 1.7990399599075317, "reward_std": 0.15285367518663406, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.799039900302887, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 136.203125, "epoch": 0.13917525773195877, "grad_norm": 2.86620135877599, "kl": 0.014190673828125, "learning_rate": 9.304123711340206e-07, "loss": 0.0006, "reward": 1.6504857540130615, "reward_std": 0.1360258013010025, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6504857540130615, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 129.4609375, "epoch": 0.1400343642611684, "grad_norm": 1.4169335821055606, "kl": 0.014739990234375, "learning_rate": 9.299828178694158e-07, "loss": 0.0006, "reward": 1.7095910906791687, "reward_std": 0.08373668044805527, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7174035906791687, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 126.8515625, "epoch": 0.140893470790378, "grad_norm": 1.3575110989008008, "kl": 0.01556396484375, "learning_rate": 9.29553264604811e-07, "loss": 0.0006, "reward": 1.7448028326034546, "reward_std": 0.14756760746240616, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7448029220104218, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 116.1171875, "epoch": 0.14175257731958762, "grad_norm": 1.8530325477228802, "kl": 0.01324462890625, "learning_rate": 9.291237113402062e-07, "loss": 0.0005, "reward": 1.742842435836792, "reward_std": 0.16235359013080597, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7428424954414368, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 132.75, "epoch": 0.14261168384879724, "grad_norm": 1.465950210564534, "kl": 0.01416015625, "learning_rate": 9.286941580756014e-07, "loss": 0.0006, "reward": 1.813442349433899, "reward_std": 0.1405864767730236, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8134423494338989, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 130.2265625, "epoch": 0.14347079037800686, "grad_norm": 1.7221592259263123, "kl": 0.015533447265625, "learning_rate": 9.282646048109966e-07, "loss": 0.0006, "reward": 1.6789227724075317, "reward_std": 0.198103629052639, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6867353320121765, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 146.0, "epoch": 0.14432989690721648, "grad_norm": 2.1528629212609407, "kl": 0.013214111328125, "learning_rate": 9.278350515463918e-07, "loss": 0.0005, "reward": 1.7893621921539307, "reward_std": 0.13996141962707043, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7893621623516083, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 129.7265625, "epoch": 0.14518900343642613, "grad_norm": 1.3004655134925518, "kl": 0.0184326171875, "learning_rate": 9.274054982817869e-07, "loss": 0.0007, "reward": 1.7796956300735474, "reward_std": 0.14173657447099686, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7875081300735474, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 145.7578125, "epoch": 0.14604810996563575, "grad_norm": 1.4101337237690597, "kl": 0.01959228515625, "learning_rate": 9.269759450171821e-07, "loss": 0.0008, "reward": 1.8010075688362122, "reward_std": 0.1661326214671135, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8010075390338898, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 140.7265625, "epoch": 0.14690721649484537, "grad_norm": 1.4640575230347574, "kl": 0.02215576171875, "learning_rate": 9.265463917525773e-07, "loss": 0.0009, "reward": 1.6987681984901428, "reward_std": 0.21677575260400772, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6987681686878204, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 133.1171875, "epoch": 0.14776632302405499, "grad_norm": 1.6395435895829094, "kl": 0.017822265625, "learning_rate": 9.261168384879725e-07, "loss": 0.0007, "reward": 1.7570144534111023, "reward_std": 0.14532551914453506, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7570143938064575, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 138.1796875, "epoch": 0.1486254295532646, "grad_norm": 1.696240174354143, "kl": 0.0164794921875, "learning_rate": 9.256872852233677e-07, "loss": 0.0007, "reward": 1.8731423616409302, "reward_std": 0.14208180457353592, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8731423616409302, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 126.1875, "epoch": 0.14948453608247422, "grad_norm": 1.3562918586715518, "kl": 0.01910400390625, "learning_rate": 9.252577319587629e-07, "loss": 0.0008, "reward": 1.7502126693725586, "reward_std": 0.1445324867963791, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.758025199174881, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 140.34375, "epoch": 0.15034364261168384, "grad_norm": 1.8285091498658437, "kl": 0.01953125, "learning_rate": 9.248281786941581e-07, "loss": 0.0008, "reward": 1.7030513286590576, "reward_std": 0.20504559576511383, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7108637988567352, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 126.4375, "epoch": 0.15120274914089346, "grad_norm": 2.1424066584845747, "kl": 0.0216064453125, "learning_rate": 9.243986254295533e-07, "loss": 0.0009, "reward": 1.6664610505104065, "reward_std": 0.07924733310937881, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6664610505104065, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 118.1484375, "epoch": 0.15206185567010308, "grad_norm": 1.8341695343755187, "kl": 0.021728515625, "learning_rate": 9.239690721649484e-07, "loss": 0.0009, "reward": 1.701620638370514, "reward_std": 0.135139562189579, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7016206681728363, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 124.5703125, "epoch": 0.15292096219931273, "grad_norm": 1.7560417473294632, "kl": 0.02288818359375, "learning_rate": 9.235395189003436e-07, "loss": 0.0009, "reward": 1.6936078667640686, "reward_std": 0.20969898253679276, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.717045396566391, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 129.453125, "epoch": 0.15378006872852235, "grad_norm": 1.66239183509703, "kl": 0.02362060546875, "learning_rate": 9.231099656357388e-07, "loss": 0.0009, "reward": 1.72810560464859, "reward_std": 0.174056738615036, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7515431046485901, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 141.2265625, "epoch": 0.15463917525773196, "grad_norm": 1.3638203436692424, "kl": 0.0224609375, "learning_rate": 9.22680412371134e-07, "loss": 0.0009, "reward": 1.7772685289382935, "reward_std": 0.09648095071315765, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7850809693336487, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 127.546875, "epoch": 0.15549828178694158, "grad_norm": 1.2134969783708147, "kl": 0.02392578125, "learning_rate": 9.222508591065292e-07, "loss": 0.001, "reward": 1.826979637145996, "reward_std": 0.1237468272447586, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8347920775413513, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 135.7890625, "epoch": 0.1563573883161512, "grad_norm": 1.3458969430241254, "kl": 0.02227783203125, "learning_rate": 9.218213058419243e-07, "loss": 0.0009, "reward": 1.7129201889038086, "reward_std": 0.18879132717847824, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7285451889038086, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 137.3359375, "epoch": 0.15721649484536082, "grad_norm": 1.750103363285581, "kl": 0.06610107421875, "learning_rate": 9.213917525773195e-07, "loss": 0.0026, "reward": 1.6476903557777405, "reward_std": 0.265672467648983, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.67894047498703, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 130.0078125, "epoch": 0.15807560137457044, "grad_norm": 1.820111828776988, "kl": 0.02685546875, "learning_rate": 9.209621993127147e-07, "loss": 0.0011, "reward": 1.774660885334015, "reward_std": 0.15566430985927582, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7824733853340149, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 133.703125, "epoch": 0.15893470790378006, "grad_norm": 1.6209110696010172, "kl": 0.0299072265625, "learning_rate": 9.205326460481098e-07, "loss": 0.0012, "reward": 1.7323817014694214, "reward_std": 0.20174291729927063, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7636317014694214, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 141.859375, "epoch": 0.15979381443298968, "grad_norm": 1.1877258799941435, "kl": 0.02227783203125, "learning_rate": 9.20103092783505e-07, "loss": 0.0009, "reward": 1.7020468711853027, "reward_std": 0.14912591874599457, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7098594009876251, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 137.25, "epoch": 0.16065292096219932, "grad_norm": 1.7483290111833802, "kl": 0.02362060546875, "learning_rate": 9.196735395189002e-07, "loss": 0.0009, "reward": 1.7843592166900635, "reward_std": 0.18126945197582245, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7999841570854187, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 141.625, "epoch": 0.16151202749140894, "grad_norm": 1.6918407108780025, "kl": 0.02398681640625, "learning_rate": 9.192439862542954e-07, "loss": 0.001, "reward": 1.70003342628479, "reward_std": 0.15035928413271904, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7156584560871124, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 162.4609375, "epoch": 0.16237113402061856, "grad_norm": 1.3068095870355032, "kl": 0.0291748046875, "learning_rate": 9.188144329896906e-07, "loss": 0.0012, "reward": 1.6980275511741638, "reward_std": 0.14009612798690796, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7058400511741638, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 149.5625, "epoch": 0.16323024054982818, "grad_norm": 5.9784870187151045, "kl": 0.2835693359375, "learning_rate": 9.183848797250858e-07, "loss": 0.0113, "reward": 1.5892316102981567, "reward_std": 0.09760742634534836, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.604856550693512, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 132.7734375, "epoch": 0.1640893470790378, "grad_norm": 1.2813901114072874, "kl": 0.0228271484375, "learning_rate": 9.17955326460481e-07, "loss": 0.0009, "reward": 1.6868805289268494, "reward_std": 0.18260416388511658, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7025054693222046, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 142.046875, "epoch": 0.16494845360824742, "grad_norm": 6.416179300212472, "kl": 0.02294921875, "learning_rate": 9.175257731958762e-07, "loss": 0.0009, "reward": 1.6928731799125671, "reward_std": 0.2074473798274994, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7241232395172119, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 157.703125, "epoch": 0.16580756013745704, "grad_norm": 2.6639805313439546, "kl": 0.0198974609375, "learning_rate": 9.170962199312713e-07, "loss": 0.0008, "reward": 1.7725132703781128, "reward_std": 0.16904744878411293, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7881382405757904, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 150.21875, "epoch": 0.16666666666666666, "grad_norm": 2.6627936333789113, "kl": 0.0616455078125, "learning_rate": 9.166666666666665e-07, "loss": 0.0025, "reward": 1.726097822189331, "reward_std": 0.17287509143352509, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7260977625846863, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 134.6171875, "epoch": 0.16752577319587628, "grad_norm": 2.3427841091043904, "kl": 0.0228271484375, "learning_rate": 9.162371134020618e-07, "loss": 0.0009, "reward": 1.6643958687782288, "reward_std": 0.16956821829080582, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6722084581851959, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 131.9765625, "epoch": 0.16838487972508592, "grad_norm": 1.2879832569815648, "kl": 0.02276611328125, "learning_rate": 9.15807560137457e-07, "loss": 0.0009, "reward": 1.7251017093658447, "reward_std": 0.19944577664136887, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7641642093658447, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 131.2890625, "epoch": 0.16924398625429554, "grad_norm": 1.302637581591733, "kl": 0.0181884765625, "learning_rate": 9.153780068728522e-07, "loss": 0.0007, "reward": 1.7328863739967346, "reward_std": 0.170260988175869, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7485113739967346, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 130.390625, "epoch": 0.17010309278350516, "grad_norm": 1.185620411258173, "kl": 0.0184326171875, "learning_rate": 9.149484536082474e-07, "loss": 0.0007, "reward": 1.7054132223129272, "reward_std": 0.14447502046823502, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7054132521152496, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 150.4296875, "epoch": 0.17096219931271478, "grad_norm": 1.7985827377480441, "kl": 0.02508544921875, "learning_rate": 9.145189003436426e-07, "loss": 0.001, "reward": 1.7419326901435852, "reward_std": 0.10603855177760124, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7497451901435852, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 129.734375, "epoch": 0.1718213058419244, "grad_norm": 1.4591627868559685, "kl": 0.01800537109375, "learning_rate": 9.140893470790378e-07, "loss": 0.0007, "reward": 1.7669562697410583, "reward_std": 0.16979996114969254, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7747687101364136, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 136.765625, "epoch": 0.17268041237113402, "grad_norm": 1.7341999103239758, "kl": 0.02862548828125, "learning_rate": 9.13659793814433e-07, "loss": 0.0011, "reward": 1.6440722942352295, "reward_std": 0.13084383308887482, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6518846750259399, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 133.2734375, "epoch": 0.17353951890034364, "grad_norm": 1.6909406413369592, "kl": 0.01934814453125, "learning_rate": 9.132302405498281e-07, "loss": 0.0008, "reward": 1.8257556557655334, "reward_std": 0.1285765990614891, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8257556557655334, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 137.515625, "epoch": 0.17439862542955326, "grad_norm": 0.9710832816221044, "kl": 0.01922607421875, "learning_rate": 9.128006872852233e-07, "loss": 0.0008, "reward": 1.7562236189842224, "reward_std": 0.1861056238412857, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7796610593795776, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 125.4609375, "epoch": 0.17525773195876287, "grad_norm": 1.5930942457393673, "kl": 0.01959228515625, "learning_rate": 9.123711340206185e-07, "loss": 0.0008, "reward": 1.7354264855384827, "reward_std": 0.14093422889709473, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.743239015340805, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 133.8828125, "epoch": 0.17611683848797252, "grad_norm": 1.1786198725658004, "kl": 0.02020263671875, "learning_rate": 9.119415807560137e-07, "loss": 0.0008, "reward": 1.8254988193511963, "reward_std": 0.13964856415987015, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8254987895488739, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 140.5, "epoch": 0.17697594501718214, "grad_norm": 1.561736876348039, "kl": 0.03765869140625, "learning_rate": 9.115120274914089e-07, "loss": 0.0015, "reward": 1.7375341653823853, "reward_std": 0.16468166559934616, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7531591653823853, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 147.421875, "epoch": 0.17783505154639176, "grad_norm": 1.673938587392466, "kl": 0.02197265625, "learning_rate": 9.110824742268041e-07, "loss": 0.0009, "reward": 1.828542172908783, "reward_std": 0.12264525145292282, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.844167172908783, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 137.1171875, "epoch": 0.17869415807560138, "grad_norm": 1.7407490513436423, "kl": 0.02032470703125, "learning_rate": 9.106529209621993e-07, "loss": 0.0008, "reward": 1.7809680104255676, "reward_std": 0.13108700513839722, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7809680104255676, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 120.921875, "epoch": 0.179553264604811, "grad_norm": 1.2574262300280894, "kl": 0.028564453125, "learning_rate": 9.102233676975945e-07, "loss": 0.0011, "reward": 1.7650489807128906, "reward_std": 0.11492755264043808, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7728614807128906, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 123.7578125, "epoch": 0.18041237113402062, "grad_norm": 1.803569017181721, "kl": 0.01959228515625, "learning_rate": 9.097938144329897e-07, "loss": 0.0008, "reward": 1.685543417930603, "reward_std": 0.13148176670074463, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.693355917930603, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 130.7265625, "epoch": 0.18127147766323023, "grad_norm": 1.8750450940771943, "kl": 0.02978515625, "learning_rate": 9.093642611683848e-07, "loss": 0.0012, "reward": 1.723173975944519, "reward_std": 0.21225081384181976, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7309865355491638, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 125.6640625, "epoch": 0.18213058419243985, "grad_norm": 1.8697489802004137, "kl": 0.0194091796875, "learning_rate": 9.0893470790378e-07, "loss": 0.0008, "reward": 1.773992896080017, "reward_std": 0.10730047896504402, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7818053960800171, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 140.6328125, "epoch": 0.18298969072164947, "grad_norm": 1.1778709789513797, "kl": 0.0174560546875, "learning_rate": 9.085051546391752e-07, "loss": 0.0007, "reward": 1.7802048325538635, "reward_std": 0.1343718022108078, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7802048325538635, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 129.171875, "epoch": 0.18384879725085912, "grad_norm": 1.6555684252295757, "kl": 0.0272216796875, "learning_rate": 9.080756013745704e-07, "loss": 0.0011, "reward": 1.6995335221290588, "reward_std": 0.26308566331863403, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7229710221290588, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 139.921875, "epoch": 0.18470790378006874, "grad_norm": 1.2357582827778086, "kl": 0.0206298828125, "learning_rate": 9.076460481099656e-07, "loss": 0.0008, "reward": 1.6951950788497925, "reward_std": 0.1771727241575718, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7186326384544373, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 128.15625, "epoch": 0.18556701030927836, "grad_norm": 1.7499046552420514, "kl": 0.02349853515625, "learning_rate": 9.072164948453608e-07, "loss": 0.0009, "reward": 1.826527178287506, "reward_std": 0.10365994274616241, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8343397080898285, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 126.9609375, "epoch": 0.18642611683848798, "grad_norm": 1.8005132756117428, "kl": 0.0301513671875, "learning_rate": 9.06786941580756e-07, "loss": 0.0012, "reward": 1.746228814125061, "reward_std": 0.20855771750211716, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7540413737297058, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 130.3359375, "epoch": 0.1872852233676976, "grad_norm": 1.9030503926874742, "kl": 0.0220947265625, "learning_rate": 9.063573883161512e-07, "loss": 0.0009, "reward": 1.7833629846572876, "reward_std": 0.12160376086831093, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7833629846572876, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 124.546875, "epoch": 0.18814432989690721, "grad_norm": 2.346663613038113, "kl": 0.02880859375, "learning_rate": 9.059278350515463e-07, "loss": 0.0012, "reward": 1.810929536819458, "reward_std": 0.12845928221940994, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8187420070171356, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 146.4140625, "epoch": 0.18900343642611683, "grad_norm": 1.951232732702119, "kl": 0.0296630859375, "learning_rate": 9.054982817869415e-07, "loss": 0.0012, "reward": 1.7572931051254272, "reward_std": 0.2043527588248253, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7807306349277496, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 131.3671875, "epoch": 0.18986254295532645, "grad_norm": 1.4773283463402496, "kl": 0.02398681640625, "learning_rate": 9.050687285223367e-07, "loss": 0.001, "reward": 1.8425282835960388, "reward_std": 0.09052041545510292, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8503407537937164, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 137.546875, "epoch": 0.19072164948453607, "grad_norm": 1.6430837871831834, "kl": 0.0224609375, "learning_rate": 9.046391752577319e-07, "loss": 0.0009, "reward": 1.6133168935775757, "reward_std": 0.1693805679678917, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.6367543637752533, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 123.3359375, "epoch": 0.19158075601374572, "grad_norm": 1.5498241580052614, "kl": 0.028076171875, "learning_rate": 9.042096219931271e-07, "loss": 0.0011, "reward": 1.832928478717804, "reward_std": 0.10157027468085289, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8407409191131592, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 127.3828125, "epoch": 0.19243986254295534, "grad_norm": 1.427575606765542, "kl": 0.02471923828125, "learning_rate": 9.037800687285223e-07, "loss": 0.001, "reward": 1.7951682806015015, "reward_std": 0.11901043727993965, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7951683104038239, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 133.5234375, "epoch": 0.19329896907216496, "grad_norm": 1.745841693534615, "kl": 0.05950927734375, "learning_rate": 9.033505154639175e-07, "loss": 0.0024, "reward": 1.7870480418205261, "reward_std": 0.1087818406522274, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8104856014251709, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 124.015625, "epoch": 0.19415807560137457, "grad_norm": 1.0774096537668199, "kl": 0.022216796875, "learning_rate": 9.029209621993127e-07, "loss": 0.0009, "reward": 1.8171106576919556, "reward_std": 0.10428809374570847, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.824923187494278, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 136.5234375, "epoch": 0.1950171821305842, "grad_norm": 2.0901239612312854, "kl": 0.02642822265625, "learning_rate": 9.024914089347078e-07, "loss": 0.0011, "reward": 1.7465097308158875, "reward_std": 0.14316842332482338, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7465097308158875, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 138.8125, "epoch": 0.1958762886597938, "grad_norm": 1.3420343738640794, "kl": 0.02850341796875, "learning_rate": 9.02061855670103e-07, "loss": 0.0011, "reward": 1.7858530282974243, "reward_std": 0.10892975702881813, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7936655879020691, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 132.6484375, "epoch": 0.19673539518900343, "grad_norm": 1.2697191544586062, "kl": 0.03155517578125, "learning_rate": 9.016323024054982e-07, "loss": 0.0013, "reward": 1.7829349040985107, "reward_std": 0.17472659796476364, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7985599339008331, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 148.53125, "epoch": 0.19759450171821305, "grad_norm": 1.2098132776705879, "kl": 0.02154541015625, "learning_rate": 9.012027491408934e-07, "loss": 0.0009, "reward": 1.8274175524711609, "reward_std": 0.1526181548833847, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8430425822734833, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 132.6015625, "epoch": 0.19845360824742267, "grad_norm": 1.3150045930691368, "kl": 0.0223388671875, "learning_rate": 9.007731958762886e-07, "loss": 0.0009, "reward": 1.7271833419799805, "reward_std": 0.1835765838623047, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7428083121776581, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 131.7578125, "epoch": 0.19931271477663232, "grad_norm": 1.1993647390638915, "kl": 0.031494140625, "learning_rate": 9.003436426116838e-07, "loss": 0.0013, "reward": 1.7216781377792358, "reward_std": 0.14081217721104622, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7373031079769135, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 129.8984375, "epoch": 0.20017182130584193, "grad_norm": 0.731243698465931, "kl": 0.02880859375, "learning_rate": 8.99914089347079e-07, "loss": 0.0012, "reward": 1.7585216164588928, "reward_std": 0.07797488383948803, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7663341164588928, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 135.171875, "epoch": 0.20103092783505155, "grad_norm": 1.4749482594585495, "kl": 0.021240234375, "learning_rate": 8.994845360824742e-07, "loss": 0.0009, "reward": 1.7960580587387085, "reward_std": 0.15863198041915894, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8116830587387085, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 141.21875, "epoch": 0.20189003436426117, "grad_norm": 1.773680287142342, "kl": 0.0177001953125, "learning_rate": 8.990549828178694e-07, "loss": 0.0007, "reward": 1.7810790538787842, "reward_std": 0.13028161227703094, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7810790240764618, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 128.6328125, "epoch": 0.2027491408934708, "grad_norm": 8.259511454922954, "kl": 0.0208740234375, "learning_rate": 8.986254295532645e-07, "loss": 0.0008, "reward": 1.8064086437225342, "reward_std": 0.12397770583629608, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8064086735248566, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 147.7890625, "epoch": 0.2036082474226804, "grad_norm": 1.308723092707521, "kl": 0.02813720703125, "learning_rate": 8.981958762886598e-07, "loss": 0.0011, "reward": 1.7656047344207764, "reward_std": 0.11740110442042351, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7656047642230988, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 126.9296875, "epoch": 0.20446735395189003, "grad_norm": 1.4015170970241377, "kl": 0.02276611328125, "learning_rate": 8.97766323024055e-07, "loss": 0.0009, "reward": 1.7170850038528442, "reward_std": 0.08975991420447826, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7170850038528442, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 133.921875, "epoch": 0.20532646048109965, "grad_norm": 1.7165031378836524, "kl": 0.031982421875, "learning_rate": 8.973367697594502e-07, "loss": 0.0013, "reward": 1.714181363582611, "reward_std": 0.1387975737452507, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7298063337802887, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 137.7734375, "epoch": 0.20618556701030927, "grad_norm": 1.1986523220206222, "kl": 0.021484375, "learning_rate": 8.969072164948454e-07, "loss": 0.0009, "reward": 1.70902019739151, "reward_std": 0.13189618289470673, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7090202271938324, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 148.0859375, "epoch": 0.20704467353951891, "grad_norm": 1.4338358835989133, "kl": 0.029052734375, "learning_rate": 8.964776632302406e-07, "loss": 0.0012, "reward": 1.7179991602897644, "reward_std": 0.16914209723472595, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7492491602897644, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 142.640625, "epoch": 0.20790378006872853, "grad_norm": 0.8698039683331652, "kl": 0.03466796875, "learning_rate": 8.960481099656358e-07, "loss": 0.0014, "reward": 1.817783772945404, "reward_std": 0.08906146325170994, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.825596272945404, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 136.46875, "epoch": 0.20876288659793815, "grad_norm": 3.260349200890088, "kl": 0.02471923828125, "learning_rate": 8.95618556701031e-07, "loss": 0.001, "reward": 1.698820948600769, "reward_std": 0.1781657487154007, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7066334784030914, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 137.1796875, "epoch": 0.20962199312714777, "grad_norm": 1.0734369880505024, "kl": 0.02301025390625, "learning_rate": 8.951890034364261e-07, "loss": 0.0009, "reward": 1.7488206624984741, "reward_std": 0.1078120581805706, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7566331923007965, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 140.4609375, "epoch": 0.2104810996563574, "grad_norm": 1.230920345198539, "kl": 0.03179931640625, "learning_rate": 8.947594501718213e-07, "loss": 0.0013, "reward": 1.7286466360092163, "reward_std": 0.16450222581624985, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7598966360092163, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 147.265625, "epoch": 0.211340206185567, "grad_norm": 4.523859706146446, "kl": 0.0262451171875, "learning_rate": 8.943298969072165e-07, "loss": 0.0011, "reward": 1.763322114944458, "reward_std": 0.1752321869134903, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7867595553398132, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 129.8359375, "epoch": 0.21219931271477663, "grad_norm": 1.363046353001964, "kl": 0.02996826171875, "learning_rate": 8.939003436426117e-07, "loss": 0.0012, "reward": 1.734560787677765, "reward_std": 0.17826475948095322, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7423732876777649, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 118.1015625, "epoch": 0.21305841924398625, "grad_norm": 1.2016846191079753, "kl": 0.02099609375, "learning_rate": 8.934707903780069e-07, "loss": 0.0008, "reward": 1.8029004335403442, "reward_std": 0.15353290364146233, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8107129037380219, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 134.953125, "epoch": 0.21391752577319587, "grad_norm": 1.6472740449517276, "kl": 0.03009033203125, "learning_rate": 8.930412371134021e-07, "loss": 0.0012, "reward": 1.728321135044098, "reward_std": 0.17561784386634827, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7439461648464203, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 114.2578125, "epoch": 0.21477663230240548, "grad_norm": 1.3208424429966092, "kl": 0.0274658203125, "learning_rate": 8.926116838487973e-07, "loss": 0.0011, "reward": 1.7464920282363892, "reward_std": 0.1454971358180046, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7543044984340668, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 119.8984375, "epoch": 0.21563573883161513, "grad_norm": 1.7163264390417992, "kl": 0.02838134765625, "learning_rate": 8.921821305841925e-07, "loss": 0.0011, "reward": 1.8135504722595215, "reward_std": 0.14255821704864502, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8135504722595215, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 127.46875, "epoch": 0.21649484536082475, "grad_norm": 1.4660984975320737, "kl": 0.0269775390625, "learning_rate": 8.917525773195877e-07, "loss": 0.0011, "reward": 1.6587898135185242, "reward_std": 0.16928455978631973, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6666022539138794, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 138.2890625, "epoch": 0.21735395189003437, "grad_norm": 1.4026200377012226, "kl": 0.02838134765625, "learning_rate": 8.913230240549828e-07, "loss": 0.0011, "reward": 1.7479596138000488, "reward_std": 0.09517625346779823, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7479596734046936, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 116.734375, "epoch": 0.218213058419244, "grad_norm": 1.8900236317096133, "kl": 0.03192138671875, "learning_rate": 8.90893470790378e-07, "loss": 0.0013, "reward": 1.7791557312011719, "reward_std": 0.08186223357915878, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7791557610034943, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 121.0234375, "epoch": 0.2190721649484536, "grad_norm": 1.348548755398183, "kl": 0.035400390625, "learning_rate": 8.904639175257731e-07, "loss": 0.0014, "reward": 1.762001395225525, "reward_std": 0.10934092476963997, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7698139548301697, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 157.3359375, "epoch": 0.21993127147766323, "grad_norm": 1.4276465496185335, "kl": 0.0269775390625, "learning_rate": 8.900343642611683e-07, "loss": 0.0011, "reward": 1.675103783607483, "reward_std": 0.21139457076787949, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7063537836074829, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 120.515625, "epoch": 0.22079037800687284, "grad_norm": 1.133588870905801, "kl": 0.03448486328125, "learning_rate": 8.896048109965635e-07, "loss": 0.0014, "reward": 1.71183180809021, "reward_std": 0.10940879955887794, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7118318676948547, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 127.203125, "epoch": 0.22164948453608246, "grad_norm": 0.8368892953167869, "kl": 0.0330810546875, "learning_rate": 8.891752577319587e-07, "loss": 0.0013, "reward": 1.8508310914039612, "reward_std": 0.07327214255928993, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8508311212062836, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 144.3359375, "epoch": 0.22250859106529208, "grad_norm": 1.006775443305009, "kl": 0.0328369140625, "learning_rate": 8.887457044673539e-07, "loss": 0.0013, "reward": 1.8622272610664368, "reward_std": 0.1266963928937912, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8778522610664368, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 120.421875, "epoch": 0.22336769759450173, "grad_norm": 0.9631829361222369, "kl": 0.0396728515625, "learning_rate": 8.88316151202749e-07, "loss": 0.0016, "reward": 1.8064301013946533, "reward_std": 0.11555318906903267, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8142426609992981, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 132.375, "epoch": 0.22422680412371135, "grad_norm": 1.5467830446043906, "kl": 0.03759765625, "learning_rate": 8.878865979381442e-07, "loss": 0.0015, "reward": 1.7076144218444824, "reward_std": 0.16983914375305176, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7154269218444824, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 126.078125, "epoch": 0.22508591065292097, "grad_norm": 1.385570288599824, "kl": 0.0396728515625, "learning_rate": 8.874570446735394e-07, "loss": 0.0016, "reward": 1.727626919746399, "reward_std": 0.15141097083687782, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7432518899440765, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 144.109375, "epoch": 0.2259450171821306, "grad_norm": 1.0781029500650152, "kl": 0.03558349609375, "learning_rate": 8.870274914089346e-07, "loss": 0.0014, "reward": 1.8266533613204956, "reward_std": 0.13985048979520798, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8344658315181732, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 122.7109375, "epoch": 0.2268041237113402, "grad_norm": 1.5051081251639746, "kl": 0.04119873046875, "learning_rate": 8.865979381443298e-07, "loss": 0.0016, "reward": 1.9263587594032288, "reward_std": 0.06610674690455198, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.9263588190078735, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 114.953125, "epoch": 0.22766323024054982, "grad_norm": 1.2568559492969402, "kl": 0.043212890625, "learning_rate": 8.86168384879725e-07, "loss": 0.0017, "reward": 1.7441568970680237, "reward_std": 0.052992574870586395, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7441569268703461, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 128.8203125, "epoch": 0.22852233676975944, "grad_norm": 12.957475206253665, "kl": 0.03289794921875, "learning_rate": 8.857388316151202e-07, "loss": 0.0013, "reward": 1.7724300026893616, "reward_std": 0.12845268845558167, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.780242532491684, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 132.46875, "epoch": 0.22938144329896906, "grad_norm": 4.112320049307154, "kl": 0.030517578125, "learning_rate": 8.853092783505154e-07, "loss": 0.0012, "reward": 1.7406333684921265, "reward_std": 0.16561638191342354, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7484458982944489, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 143.0859375, "epoch": 0.23024054982817868, "grad_norm": 1.3042211470111829, "kl": 0.039306640625, "learning_rate": 8.848797250859106e-07, "loss": 0.0016, "reward": 1.7143108248710632, "reward_std": 0.13581188768148422, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7377482950687408, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 150.8828125, "epoch": 0.23109965635738833, "grad_norm": 1.928604268090887, "kl": 0.037353515625, "learning_rate": 8.844501718213057e-07, "loss": 0.0015, "reward": 1.8002280592918396, "reward_std": 0.15524645149707794, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8080405294895172, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 125.171875, "epoch": 0.23195876288659795, "grad_norm": 5.27894758100115, "kl": 0.02862548828125, "learning_rate": 8.840206185567009e-07, "loss": 0.0011, "reward": 1.8452274799346924, "reward_std": 0.15420031547546387, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8530399203300476, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 133.671875, "epoch": 0.23281786941580757, "grad_norm": 1.306834060633908, "kl": 0.0325927734375, "learning_rate": 8.835910652920961e-07, "loss": 0.0013, "reward": 1.6935831308364868, "reward_std": 0.154945969581604, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7092081606388092, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 130.8125, "epoch": 0.23367697594501718, "grad_norm": 1.2910579810108835, "kl": 0.03216552734375, "learning_rate": 8.831615120274913e-07, "loss": 0.0013, "reward": 1.785846471786499, "reward_std": 0.08592062070965767, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7936590611934662, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 137.4765625, "epoch": 0.2345360824742268, "grad_norm": 1.6016973929619667, "kl": 0.0333251953125, "learning_rate": 8.827319587628865e-07, "loss": 0.0013, "reward": 1.6971846222877502, "reward_std": 0.1201891340315342, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7049971222877502, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 136.0703125, "epoch": 0.23539518900343642, "grad_norm": 1.357010665541966, "kl": 0.03564453125, "learning_rate": 8.823024054982817e-07, "loss": 0.0014, "reward": 1.7180689573287964, "reward_std": 0.19224438071250916, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7415064871311188, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 130.0859375, "epoch": 0.23625429553264604, "grad_norm": 1.1159550961761515, "kl": 0.03515625, "learning_rate": 8.818728522336769e-07, "loss": 0.0014, "reward": 1.7949897646903992, "reward_std": 0.14928951114416122, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8184272944927216, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 137.6328125, "epoch": 0.23711340206185566, "grad_norm": 3.5515611265357485, "kl": 0.02630615234375, "learning_rate": 8.814432989690721e-07, "loss": 0.0011, "reward": 1.7773825526237488, "reward_std": 0.10580384358763695, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7773825824260712, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 137.4140625, "epoch": 0.23797250859106528, "grad_norm": 1.5580462509961617, "kl": 0.03564453125, "learning_rate": 8.810137457044672e-07, "loss": 0.0014, "reward": 1.7888085842132568, "reward_std": 0.21199411898851395, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8122460544109344, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 154.171875, "epoch": 0.23883161512027493, "grad_norm": 1.0543303085586277, "kl": 0.02984619140625, "learning_rate": 8.805841924398624e-07, "loss": 0.0012, "reward": 1.8188990950584412, "reward_std": 0.13441307097673416, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8423365950584412, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 146.8984375, "epoch": 0.23969072164948454, "grad_norm": 1.5265695805128905, "kl": 0.02557373046875, "learning_rate": 8.801546391752576e-07, "loss": 0.001, "reward": 1.7924699783325195, "reward_std": 0.16526872664690018, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8002825081348419, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 137.8671875, "epoch": 0.24054982817869416, "grad_norm": 1.1812679545358764, "kl": 0.0225830078125, "learning_rate": 8.797250859106528e-07, "loss": 0.0009, "reward": 1.827331304550171, "reward_std": 0.10759979486465454, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8273313045501709, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 129.28125, "epoch": 0.24140893470790378, "grad_norm": 1.5991468981516737, "kl": 0.0291748046875, "learning_rate": 8.792955326460481e-07, "loss": 0.0012, "reward": 1.7398195266723633, "reward_std": 0.19154010713100433, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7476319670677185, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 126.546875, "epoch": 0.2422680412371134, "grad_norm": 0.9116875920398242, "kl": 0.0299072265625, "learning_rate": 8.788659793814433e-07, "loss": 0.0012, "reward": 1.7467403411865234, "reward_std": 0.08945905789732933, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7545528709888458, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 134.46875, "epoch": 0.24312714776632302, "grad_norm": 0.8380935360171634, "kl": 0.0316162109375, "learning_rate": 8.784364261168385e-07, "loss": 0.0013, "reward": 1.8638163208961487, "reward_std": 0.09008487407118082, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8638162612915039, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 119.296875, "epoch": 0.24398625429553264, "grad_norm": 0.9161141195578109, "kl": 0.027587890625, "learning_rate": 8.780068728522337e-07, "loss": 0.0011, "reward": 1.8011575937271118, "reward_std": 0.0535787851549685, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8011575937271118, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 132.3203125, "epoch": 0.24484536082474226, "grad_norm": 1.9166929431895137, "kl": 0.024658203125, "learning_rate": 8.775773195876289e-07, "loss": 0.001, "reward": 1.7418934106826782, "reward_std": 0.14604221284389496, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7497059106826782, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 124.2890625, "epoch": 0.24570446735395188, "grad_norm": 1.1252522693717517, "kl": 0.02996826171875, "learning_rate": 8.77147766323024e-07, "loss": 0.0012, "reward": 1.7983113527297974, "reward_std": 0.09610910713672638, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.806123822927475, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 135.921875, "epoch": 0.24656357388316152, "grad_norm": 1.3367048117093618, "kl": 0.034912109375, "learning_rate": 8.767182130584192e-07, "loss": 0.0014, "reward": 1.7578275799751282, "reward_std": 0.14203480631113052, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.765640139579773, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 133.625, "epoch": 0.24742268041237114, "grad_norm": 0.724502338816319, "kl": 0.03143310546875, "learning_rate": 8.762886597938144e-07, "loss": 0.0013, "reward": 1.7635814547538757, "reward_std": 0.0678582051768899, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7635815143585205, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 163.6953125, "epoch": 0.24828178694158076, "grad_norm": 1.1733316115040695, "kl": 0.0283203125, "learning_rate": 8.758591065292096e-07, "loss": 0.0011, "reward": 1.7535958290100098, "reward_std": 0.16154304146766663, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7614083290100098, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 129.8046875, "epoch": 0.24914089347079038, "grad_norm": 1.4167971747096428, "kl": 0.0279541015625, "learning_rate": 8.754295532646048e-07, "loss": 0.0011, "reward": 1.795049250125885, "reward_std": 0.07856442406773567, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7950493097305298, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 136.734375, "epoch": 0.25, "grad_norm": 3.1009628872255646, "kl": 0.03045654296875, "learning_rate": 8.75e-07, "loss": 0.0012, "reward": 1.8120030760765076, "reward_std": 0.16107841953635216, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8276280760765076, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 134.7265625, "epoch": 0.2508591065292096, "grad_norm": 1.1395121387903864, "kl": 0.02764892578125, "learning_rate": 8.745704467353952e-07, "loss": 0.0011, "reward": 1.7605109214782715, "reward_std": 0.093373142182827, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7683233916759491, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 129.34375, "epoch": 0.25171821305841924, "grad_norm": 1.0703218519168713, "kl": 0.03216552734375, "learning_rate": 8.741408934707904e-07, "loss": 0.0013, "reward": 1.880930781364441, "reward_std": 0.11160749942064285, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8887432813644409, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 133.1484375, "epoch": 0.25257731958762886, "grad_norm": 3.1378356819250217, "kl": 0.0274658203125, "learning_rate": 8.737113402061856e-07, "loss": 0.0011, "reward": 1.8660714626312256, "reward_std": 0.1355828121304512, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8738839626312256, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 121.90625, "epoch": 0.2534364261168385, "grad_norm": 0.9752935666515755, "kl": 0.02984619140625, "learning_rate": 8.732817869415807e-07, "loss": 0.0012, "reward": 1.8270512223243713, "reward_std": 0.09455467015504837, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8348636627197266, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 129.4140625, "epoch": 0.2542955326460481, "grad_norm": 2.5028271019973256, "kl": 0.02728271484375, "learning_rate": 8.728522336769759e-07, "loss": 0.0011, "reward": 1.81145840883255, "reward_std": 0.1066664457321167, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8114584386348724, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 115.6171875, "epoch": 0.2551546391752577, "grad_norm": 1.9066759357991514, "kl": 0.03387451171875, "learning_rate": 8.724226804123711e-07, "loss": 0.0014, "reward": 1.7507621049880981, "reward_std": 0.1560019999742508, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7585746049880981, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 119.296875, "epoch": 0.25601374570446733, "grad_norm": 1.659392268817117, "kl": 0.03125, "learning_rate": 8.719931271477663e-07, "loss": 0.0012, "reward": 1.8714821338653564, "reward_std": 0.10832902416586876, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8792945742607117, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 118.0234375, "epoch": 0.25687285223367695, "grad_norm": 1.109700712202529, "kl": 0.02984619140625, "learning_rate": 8.715635738831615e-07, "loss": 0.0012, "reward": 1.8865798711776733, "reward_std": 0.08204978704452515, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8865799009799957, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 121.015625, "epoch": 0.25773195876288657, "grad_norm": 1.0487633124419409, "kl": 0.0244140625, "learning_rate": 8.711340206185567e-07, "loss": 0.001, "reward": 1.809591829776764, "reward_std": 0.15175417065620422, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8330293297767639, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 135.25, "epoch": 0.25859106529209624, "grad_norm": 1.5750530363933386, "kl": 0.04327392578125, "learning_rate": 8.707044673539519e-07, "loss": 0.0017, "reward": 1.792675495147705, "reward_std": 0.08446579799056053, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8004880249500275, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 133.0546875, "epoch": 0.25945017182130586, "grad_norm": 1.9949049568284047, "kl": 0.026123046875, "learning_rate": 8.702749140893471e-07, "loss": 0.001, "reward": 1.8067399859428406, "reward_std": 0.14312193542718887, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8145524859428406, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 126.796875, "epoch": 0.2603092783505155, "grad_norm": 1.7326491889574818, "kl": 0.02447509765625, "learning_rate": 8.698453608247422e-07, "loss": 0.001, "reward": 1.8493663668632507, "reward_std": 0.06654036790132523, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8493663370609283, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 119.4296875, "epoch": 0.2611683848797251, "grad_norm": 1.9340178331586866, "kl": 0.0367431640625, "learning_rate": 8.694158075601374e-07, "loss": 0.0015, "reward": 1.7990782260894775, "reward_std": 0.14300024509429932, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7990781962871552, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 138.3046875, "epoch": 0.2620274914089347, "grad_norm": 1.6168654993721436, "kl": 0.0328369140625, "learning_rate": 8.689862542955326e-07, "loss": 0.0013, "reward": 1.7844675183296204, "reward_std": 0.10683033242821693, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.784467488527298, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 131.4921875, "epoch": 0.26288659793814434, "grad_norm": 1.3435959118517327, "kl": 0.0362548828125, "learning_rate": 8.685567010309278e-07, "loss": 0.0015, "reward": 1.7007364630699158, "reward_std": 0.10049042850732803, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7007364630699158, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 129.8359375, "epoch": 0.26374570446735396, "grad_norm": 1.7041209629618606, "kl": 0.02471923828125, "learning_rate": 8.68127147766323e-07, "loss": 0.001, "reward": 1.7297524213790894, "reward_std": 0.11169169098138809, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7297524213790894, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 133.1171875, "epoch": 0.2646048109965636, "grad_norm": 1.0611383574670885, "kl": 0.0352783203125, "learning_rate": 8.676975945017182e-07, "loss": 0.0014, "reward": 1.765690803527832, "reward_std": 0.07691102847456932, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7656907737255096, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 133.5, "epoch": 0.2654639175257732, "grad_norm": 1.810412989768572, "kl": 0.061279296875, "learning_rate": 8.672680412371134e-07, "loss": 0.0024, "reward": 1.6709439158439636, "reward_std": 0.11246037483215332, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6709439158439636, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 138.1796875, "epoch": 0.2663230240549828, "grad_norm": 3.183250415302431, "kl": 0.03106689453125, "learning_rate": 8.668384879725086e-07, "loss": 0.0012, "reward": 1.7835421562194824, "reward_std": 0.08028195053339005, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7835421562194824, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 148.3515625, "epoch": 0.26718213058419243, "grad_norm": 1.290985809671743, "kl": 0.033203125, "learning_rate": 8.664089347079037e-07, "loss": 0.0013, "reward": 1.7408164143562317, "reward_std": 0.15223591029644012, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7564414143562317, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 130.75, "epoch": 0.26804123711340205, "grad_norm": 1.8077145483203556, "kl": 0.040283203125, "learning_rate": 8.659793814432989e-07, "loss": 0.0016, "reward": 1.7184379696846008, "reward_std": 0.12002924084663391, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7340629696846008, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 134.3828125, "epoch": 0.26890034364261167, "grad_norm": 2.7429724901343433, "kl": 0.048583984375, "learning_rate": 8.655498281786941e-07, "loss": 0.0019, "reward": 1.731776773929596, "reward_std": 0.1657547503709793, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.747401773929596, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 138.0234375, "epoch": 0.2697594501718213, "grad_norm": 0.9847082773950215, "kl": 0.0380859375, "learning_rate": 8.651202749140893e-07, "loss": 0.0015, "reward": 1.701866626739502, "reward_std": 0.12148784846067429, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7018666565418243, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 156.46875, "epoch": 0.2706185567010309, "grad_norm": 1.5399880460978463, "kl": 0.03057861328125, "learning_rate": 8.646907216494845e-07, "loss": 0.0012, "reward": 1.6783122420310974, "reward_std": 0.1540703848004341, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.686124712228775, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 146.9921875, "epoch": 0.27147766323024053, "grad_norm": 1.5313679692857116, "kl": 0.0394287109375, "learning_rate": 8.642611683848797e-07, "loss": 0.0016, "reward": 1.7149747014045715, "reward_std": 0.2113272026181221, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7462246716022491, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 153.3984375, "epoch": 0.27233676975945015, "grad_norm": 1.3181550613878923, "kl": 0.0369873046875, "learning_rate": 8.638316151202749e-07, "loss": 0.0015, "reward": 1.8070083856582642, "reward_std": 0.18914027512073517, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8226334154605865, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 147.4453125, "epoch": 0.27319587628865977, "grad_norm": 1.1473725643237571, "kl": 0.035400390625, "learning_rate": 8.634020618556701e-07, "loss": 0.0014, "reward": 1.8190871477127075, "reward_std": 0.15289827063679695, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8503372073173523, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 157.6015625, "epoch": 0.27405498281786944, "grad_norm": 0.7143585607947084, "kl": 0.03167724609375, "learning_rate": 8.629725085910653e-07, "loss": 0.0013, "reward": 1.8210635781288147, "reward_std": 0.07932163029909134, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8210635781288147, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 143.28125, "epoch": 0.27491408934707906, "grad_norm": 1.3665635736893489, "kl": 0.02557373046875, "learning_rate": 8.625429553264604e-07, "loss": 0.001, "reward": 1.7617181539535522, "reward_std": 0.1237938292324543, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7617180943489075, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 140.1953125, "epoch": 0.2757731958762887, "grad_norm": 1.1585944459835054, "kl": 0.0296630859375, "learning_rate": 8.621134020618556e-07, "loss": 0.0012, "reward": 1.7693681120872498, "reward_std": 0.15534278005361557, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7771806418895721, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 132.359375, "epoch": 0.2766323024054983, "grad_norm": 1.3885088694278251, "kl": 0.03216552734375, "learning_rate": 8.616838487972508e-07, "loss": 0.0013, "reward": 1.6785261631011963, "reward_std": 0.11299961805343628, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6785261631011963, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 147.0234375, "epoch": 0.2774914089347079, "grad_norm": 1.0963555730545884, "kl": 0.035888671875, "learning_rate": 8.612542955326461e-07, "loss": 0.0014, "reward": 1.724756121635437, "reward_std": 0.15419768542051315, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7325686514377594, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 141.734375, "epoch": 0.27835051546391754, "grad_norm": 1.9036632085091891, "kl": 0.029296875, "learning_rate": 8.608247422680413e-07, "loss": 0.0012, "reward": 1.7815952897071838, "reward_std": 0.07885266095399857, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7815952897071838, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 128.078125, "epoch": 0.27920962199312716, "grad_norm": 0.8583001841176572, "kl": 0.03070068359375, "learning_rate": 8.603951890034365e-07, "loss": 0.0012, "reward": 1.805925726890564, "reward_std": 0.08024111017584801, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8059256374835968, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 136.4921875, "epoch": 0.2800687285223368, "grad_norm": 1.6115188524753052, "kl": 0.0347900390625, "learning_rate": 8.599656357388317e-07, "loss": 0.0014, "reward": 1.6898165941238403, "reward_std": 0.17968852818012238, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7132540941238403, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 136.09375, "epoch": 0.2809278350515464, "grad_norm": 3.0925571712058755, "kl": 0.0787353515625, "learning_rate": 8.595360824742269e-07, "loss": 0.0032, "reward": 1.7974568605422974, "reward_std": 0.1517610400915146, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8208943903446198, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 171.8125, "epoch": 0.281786941580756, "grad_norm": 1.0479920039978226, "kl": 0.02923583984375, "learning_rate": 8.591065292096219e-07, "loss": 0.0012, "reward": 1.8526938557624817, "reward_std": 0.07072590291500092, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8526938557624817, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 148.2734375, "epoch": 0.28264604810996563, "grad_norm": 2.741217901691856, "kl": 0.03240966796875, "learning_rate": 8.586769759450171e-07, "loss": 0.0013, "reward": 1.7304821014404297, "reward_std": 0.10214803740382195, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7461071014404297, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 161.578125, "epoch": 0.28350515463917525, "grad_norm": 1.1171189004763298, "kl": 0.0263671875, "learning_rate": 8.582474226804123e-07, "loss": 0.0011, "reward": 1.7979252934455872, "reward_std": 0.12719742208719254, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8057378232479095, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 151.4765625, "epoch": 0.28436426116838487, "grad_norm": 2.031628516822189, "kl": 0.05181884765625, "learning_rate": 8.578178694158075e-07, "loss": 0.0021, "reward": 1.6497213244438171, "reward_std": 0.1195969358086586, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.6731588840484619, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 134.4453125, "epoch": 0.2852233676975945, "grad_norm": 2.801651555129562, "kl": 0.02777099609375, "learning_rate": 8.573883161512027e-07, "loss": 0.0011, "reward": 1.8582192659378052, "reward_std": 0.10834803432226181, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8738442659378052, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 140.6640625, "epoch": 0.2860824742268041, "grad_norm": 11.067746215654495, "kl": 0.02642822265625, "learning_rate": 8.569587628865979e-07, "loss": 0.0011, "reward": 1.7104763984680176, "reward_std": 0.19199497997760773, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.74953892827034, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 125.6796875, "epoch": 0.2869415807560137, "grad_norm": 1.2489715002859365, "kl": 0.0211181640625, "learning_rate": 8.565292096219931e-07, "loss": 0.0008, "reward": 1.6395170092582703, "reward_std": 0.1772407442331314, "rewards/format_reward_gen": 0.9296875, "rewards/llm_reward": 0.7098294794559479, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 146.953125, "epoch": 0.28780068728522334, "grad_norm": 1.3905135992782467, "kl": 0.02386474609375, "learning_rate": 8.560996563573883e-07, "loss": 0.001, "reward": 1.6524567604064941, "reward_std": 0.22227772325277328, "rewards/format_reward_gen": 0.921875, "rewards/llm_reward": 0.7305817306041718, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 154.75, "epoch": 0.28865979381443296, "grad_norm": 6.054324722106658, "kl": 0.02081298828125, "learning_rate": 8.556701030927834e-07, "loss": 0.0008, "reward": 1.7611451148986816, "reward_std": 0.2188207283616066, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7923950850963593, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 136.796875, "epoch": 0.28951890034364264, "grad_norm": 3.7876750335169507, "kl": 0.021484375, "learning_rate": 8.552405498281786e-07, "loss": 0.0009, "reward": 1.681132197380066, "reward_std": 0.23194558173418045, "rewards/format_reward_gen": 0.9296875, "rewards/llm_reward": 0.7514447271823883, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 139.84375, "epoch": 0.29037800687285226, "grad_norm": 1.4224659056279383, "kl": 0.02587890625, "learning_rate": 8.548109965635738e-07, "loss": 0.001, "reward": 1.78468519449234, "reward_std": 0.2202548086643219, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.8393726646900177, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 142.328125, "epoch": 0.2912371134020619, "grad_norm": 1.811194222155832, "kl": 0.024658203125, "learning_rate": 8.54381443298969e-07, "loss": 0.001, "reward": 1.8332806825637817, "reward_std": 0.09518006816506386, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8567183017730713, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 148.078125, "epoch": 0.2920962199312715, "grad_norm": 1.45115586999827, "kl": 0.02301025390625, "learning_rate": 8.539518900343642e-07, "loss": 0.0009, "reward": 1.7859101295471191, "reward_std": 0.14324617385864258, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.8249726593494415, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 131.6015625, "epoch": 0.2929553264604811, "grad_norm": 1.3891780059241028, "kl": 0.0213623046875, "learning_rate": 8.535223367697594e-07, "loss": 0.0009, "reward": 1.7494396567344666, "reward_std": 0.14654994010925293, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7572520971298218, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 139.5, "epoch": 0.29381443298969073, "grad_norm": 1.0515730604032878, "kl": 0.0264892578125, "learning_rate": 8.530927835051546e-07, "loss": 0.0011, "reward": 1.7149075269699097, "reward_std": 0.12536488845944405, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.7695950269699097, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 152.171875, "epoch": 0.29467353951890035, "grad_norm": 1.884237347147452, "kl": 0.0213623046875, "learning_rate": 8.526632302405498e-07, "loss": 0.0009, "reward": 1.7150232791900635, "reward_std": 0.17639710754156113, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7306482791900635, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 137.0390625, "epoch": 0.29553264604810997, "grad_norm": 1.0301696531429079, "kl": 0.02069091796875, "learning_rate": 8.52233676975945e-07, "loss": 0.0008, "reward": 1.8793914318084717, "reward_std": 0.04007810167968273, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8793914318084717, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 130.0859375, "epoch": 0.2963917525773196, "grad_norm": 1.6887023989278636, "kl": 0.022216796875, "learning_rate": 8.518041237113401e-07, "loss": 0.0009, "reward": 1.7447214126586914, "reward_std": 0.16757074743509293, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7447213530540466, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 121.859375, "epoch": 0.2972508591065292, "grad_norm": 0.9804680400847956, "kl": 0.026123046875, "learning_rate": 8.513745704467353e-07, "loss": 0.001, "reward": 1.7684549689292908, "reward_std": 0.06802868098020554, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7762674689292908, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 131.203125, "epoch": 0.2981099656357388, "grad_norm": 1.4132838829734102, "kl": 0.02655029296875, "learning_rate": 8.509450171821305e-07, "loss": 0.0011, "reward": 1.7236329913139343, "reward_std": 0.15013974159955978, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7392580211162567, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 135.34375, "epoch": 0.29896907216494845, "grad_norm": 3.798539075577766, "kl": 0.03009033203125, "learning_rate": 8.505154639175257e-07, "loss": 0.0012, "reward": 1.7059985995292664, "reward_std": 0.13966944441199303, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7059986293315887, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 118.7578125, "epoch": 0.29982817869415807, "grad_norm": 1.4928606296691562, "kl": 0.05828857421875, "learning_rate": 8.500859106529209e-07, "loss": 0.0023, "reward": 1.7457636594772339, "reward_std": 0.16971006989479065, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7770136296749115, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 121.828125, "epoch": 0.3006872852233677, "grad_norm": 8.775228795942514, "kl": 0.02349853515625, "learning_rate": 8.496563573883161e-07, "loss": 0.0009, "reward": 1.75225168466568, "reward_std": 0.11466844379901886, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7522516846656799, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 128.2578125, "epoch": 0.3015463917525773, "grad_norm": 1.2582262027913427, "kl": 0.02496337890625, "learning_rate": 8.492268041237113e-07, "loss": 0.001, "reward": 1.8221701383590698, "reward_std": 0.09425491839647293, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8299825489521027, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 115.328125, "epoch": 0.3024054982817869, "grad_norm": 8.10337252047317, "kl": 0.029541015625, "learning_rate": 8.487972508591065e-07, "loss": 0.0012, "reward": 1.7997434735298157, "reward_std": 0.07890489138662815, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7997434437274933, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 121.4375, "epoch": 0.30326460481099654, "grad_norm": 0.8630573702061485, "kl": 0.025146484375, "learning_rate": 8.483676975945016e-07, "loss": 0.001, "reward": 1.735402524471283, "reward_std": 0.1313318181782961, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7510275840759277, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 116.671875, "epoch": 0.30412371134020616, "grad_norm": 2.294001840527455, "kl": 0.021240234375, "learning_rate": 8.479381443298968e-07, "loss": 0.0008, "reward": 1.8022547364234924, "reward_std": 0.14323440939188004, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.81006720662117, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 116.0625, "epoch": 0.30498281786941583, "grad_norm": 1.3365205015914472, "kl": 0.0291748046875, "learning_rate": 8.47508591065292e-07, "loss": 0.0012, "reward": 1.798896074295044, "reward_std": 0.14688794314861298, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.806708574295044, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 116.65625, "epoch": 0.30584192439862545, "grad_norm": 1.3467405037883613, "kl": 0.03033447265625, "learning_rate": 8.470790378006872e-07, "loss": 0.0012, "reward": 1.7370511293411255, "reward_std": 0.1430470496416092, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7448635101318359, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 119.3125, "epoch": 0.30670103092783507, "grad_norm": 1.4250852392657183, "kl": 0.02288818359375, "learning_rate": 8.466494845360824e-07, "loss": 0.0009, "reward": 1.7630817294120789, "reward_std": 0.099862240254879, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7630816698074341, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 114.484375, "epoch": 0.3075601374570447, "grad_norm": 1.001564962464479, "kl": 0.0289306640625, "learning_rate": 8.462199312714776e-07, "loss": 0.0012, "reward": 1.7671634554862976, "reward_std": 0.07000754773616791, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7671634256839752, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 110.234375, "epoch": 0.3084192439862543, "grad_norm": 1.7447567831684878, "kl": 0.02752685546875, "learning_rate": 8.457903780068728e-07, "loss": 0.0011, "reward": 1.6856686472892761, "reward_std": 0.12096000462770462, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6856685876846313, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 122.0078125, "epoch": 0.30927835051546393, "grad_norm": 1.2242293521408898, "kl": 0.02239990234375, "learning_rate": 8.45360824742268e-07, "loss": 0.0009, "reward": 1.8242759108543396, "reward_std": 0.08643890917301178, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8242759108543396, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 118.65625, "epoch": 0.31013745704467355, "grad_norm": 1.5243424904852039, "kl": 0.0262451171875, "learning_rate": 8.449312714776631e-07, "loss": 0.001, "reward": 1.7807704210281372, "reward_std": 0.0936017706990242, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7885829210281372, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 122.734375, "epoch": 0.31099656357388317, "grad_norm": 1.4884814724042141, "kl": 0.022216796875, "learning_rate": 8.445017182130583e-07, "loss": 0.0009, "reward": 1.8066073656082153, "reward_std": 0.09719736129045486, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8144198656082153, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 123.359375, "epoch": 0.3118556701030928, "grad_norm": 1.4940528435261182, "kl": 0.02490234375, "learning_rate": 8.440721649484535e-07, "loss": 0.001, "reward": 1.6919100880622864, "reward_std": 0.08997226506471634, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6919100880622864, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 132.2265625, "epoch": 0.3127147766323024, "grad_norm": 1.0097324218360122, "kl": 0.023681640625, "learning_rate": 8.436426116838487e-07, "loss": 0.0009, "reward": 1.7592207789421082, "reward_std": 0.12093230336904526, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7592207789421082, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 120.3984375, "epoch": 0.313573883161512, "grad_norm": 1.1761577149895857, "kl": 0.01922607421875, "learning_rate": 8.432130584192439e-07, "loss": 0.0008, "reward": 1.7229613065719604, "reward_std": 0.1399160847067833, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7229613661766052, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 129.0390625, "epoch": 0.31443298969072164, "grad_norm": 2.443207308672, "kl": 0.028076171875, "learning_rate": 8.427835051546391e-07, "loss": 0.0011, "reward": 1.78279447555542, "reward_std": 0.08378404378890991, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7827944755554199, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 127.15625, "epoch": 0.31529209621993126, "grad_norm": 1.3122123216803352, "kl": 0.0269775390625, "learning_rate": 8.423539518900344e-07, "loss": 0.0011, "reward": 1.7783340811729431, "reward_std": 0.13799217343330383, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7783340811729431, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 133.8125, "epoch": 0.3161512027491409, "grad_norm": 1.6680208275489512, "kl": 0.0198974609375, "learning_rate": 8.419243986254296e-07, "loss": 0.0008, "reward": 1.7162877321243286, "reward_std": 0.07326348312199116, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7162877917289734, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 119.3125, "epoch": 0.3170103092783505, "grad_norm": 1.382745000875992, "kl": 0.02130126953125, "learning_rate": 8.414948453608248e-07, "loss": 0.0009, "reward": 1.79585862159729, "reward_std": 0.13986750692129135, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.79585862159729, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 120.625, "epoch": 0.3178694158075601, "grad_norm": 1.1013774115401274, "kl": 0.02069091796875, "learning_rate": 8.410652920962199e-07, "loss": 0.0008, "reward": 1.753614366054535, "reward_std": 0.10165842995047569, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7536143660545349, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 137.4765625, "epoch": 0.31872852233676974, "grad_norm": 1.7113488988118148, "kl": 0.0211181640625, "learning_rate": 8.406357388316151e-07, "loss": 0.0008, "reward": 1.861440122127533, "reward_std": 0.06895541399717331, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8692527115345001, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 136.9765625, "epoch": 0.31958762886597936, "grad_norm": 1.103897326474956, "kl": 0.021484375, "learning_rate": 8.402061855670103e-07, "loss": 0.0009, "reward": 1.7671774625778198, "reward_std": 0.09223662130534649, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7671774327754974, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 131.59375, "epoch": 0.32044673539518903, "grad_norm": 1.1271147620986939, "kl": 0.018157958984375, "learning_rate": 8.397766323024055e-07, "loss": 0.0007, "reward": 1.7452142238616943, "reward_std": 0.12823403999209404, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7530267238616943, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 129.59375, "epoch": 0.32130584192439865, "grad_norm": 1.1312273949872451, "kl": 0.0181884765625, "learning_rate": 8.393470790378007e-07, "loss": 0.0007, "reward": 1.76444673538208, "reward_std": 0.08396412804722786, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7644466161727905, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 123.3359375, "epoch": 0.32216494845360827, "grad_norm": 1.1834607042303573, "kl": 0.02215576171875, "learning_rate": 8.389175257731959e-07, "loss": 0.0009, "reward": 1.7634702920913696, "reward_std": 0.07862638682126999, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7712827622890472, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 128.6796875, "epoch": 0.3230240549828179, "grad_norm": 1.218745212804485, "kl": 0.027587890625, "learning_rate": 8.384879725085911e-07, "loss": 0.0011, "reward": 1.773351788520813, "reward_std": 0.10650411248207092, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7889767587184906, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 142.5625, "epoch": 0.3238831615120275, "grad_norm": 2.9765457354255247, "kl": 0.0250244140625, "learning_rate": 8.380584192439863e-07, "loss": 0.001, "reward": 1.8011161088943481, "reward_std": 0.14274968206882477, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8089286386966705, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 137.703125, "epoch": 0.3247422680412371, "grad_norm": 4.935642518044648, "kl": 0.01898193359375, "learning_rate": 8.376288659793815e-07, "loss": 0.0008, "reward": 1.8446325659751892, "reward_std": 0.12704916298389435, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8524451553821564, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 125.2109375, "epoch": 0.32560137457044674, "grad_norm": 0.8982810314616194, "kl": 0.0203857421875, "learning_rate": 8.371993127147766e-07, "loss": 0.0008, "reward": 1.8438200950622559, "reward_std": 0.07897239178419113, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8438200354576111, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 125.6328125, "epoch": 0.32646048109965636, "grad_norm": 0.8997947654002385, "kl": 0.02166748046875, "learning_rate": 8.367697594501718e-07, "loss": 0.0009, "reward": 1.8457976579666138, "reward_std": 0.05379333579912782, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.845797598361969, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 123.171875, "epoch": 0.327319587628866, "grad_norm": 1.221630816341642, "kl": 0.0252685546875, "learning_rate": 8.36340206185567e-07, "loss": 0.001, "reward": 1.755448818206787, "reward_std": 0.12511924654245377, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7710737586021423, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 124.0390625, "epoch": 0.3281786941580756, "grad_norm": 1.7175446698553867, "kl": 0.0228271484375, "learning_rate": 8.359106529209622e-07, "loss": 0.0009, "reward": 1.8109247088432312, "reward_std": 0.10409371182322502, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8187372088432312, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 124.4296875, "epoch": 0.3290378006872852, "grad_norm": 1.4545443289160316, "kl": 0.0201416015625, "learning_rate": 8.354810996563574e-07, "loss": 0.0008, "reward": 1.8992087244987488, "reward_std": 0.07725157961249352, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8992086946964264, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 131.6015625, "epoch": 0.32989690721649484, "grad_norm": 2.1100358243584743, "kl": 0.0252685546875, "learning_rate": 8.350515463917526e-07, "loss": 0.001, "reward": 1.8205956816673279, "reward_std": 0.12146460637450218, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8284082114696503, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 119.9140625, "epoch": 0.33075601374570446, "grad_norm": 1.3804484761106177, "kl": 0.02508544921875, "learning_rate": 8.346219931271478e-07, "loss": 0.001, "reward": 1.7877458333969116, "reward_std": 0.13950634747743607, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7955583333969116, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 130.703125, "epoch": 0.3316151202749141, "grad_norm": 1.0084199177905135, "kl": 0.0240478515625, "learning_rate": 8.34192439862543e-07, "loss": 0.001, "reward": 1.7355512380599976, "reward_std": 0.11318296939134598, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7355512380599976, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 149.8125, "epoch": 0.3324742268041237, "grad_norm": 0.8344574864466167, "kl": 0.02178955078125, "learning_rate": 8.337628865979381e-07, "loss": 0.0009, "reward": 1.8301225900650024, "reward_std": 0.08988046087324619, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.83012256026268, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 121.3203125, "epoch": 0.3333333333333333, "grad_norm": 0.835913485605179, "kl": 0.0225830078125, "learning_rate": 8.333333333333333e-07, "loss": 0.0009, "reward": 1.7770044803619385, "reward_std": 0.11324162408709526, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7770044505596161, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 151.6640625, "epoch": 0.33419243986254293, "grad_norm": 1.0924953323879845, "kl": 0.02471923828125, "learning_rate": 8.329037800687285e-07, "loss": 0.001, "reward": 1.7987311482429504, "reward_std": 0.08528102748095989, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.806543618440628, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 137.8984375, "epoch": 0.33505154639175255, "grad_norm": 0.7425828858782676, "kl": 0.02239990234375, "learning_rate": 8.324742268041237e-07, "loss": 0.0009, "reward": 1.8864429593086243, "reward_std": 0.09466363862156868, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8942554891109467, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 137.359375, "epoch": 0.33591065292096217, "grad_norm": 1.4256854867993622, "kl": 0.020355224609375, "learning_rate": 8.320446735395189e-07, "loss": 0.0008, "reward": 1.805371105670929, "reward_std": 0.08615696430206299, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8053711354732513, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 143.6796875, "epoch": 0.33676975945017185, "grad_norm": 1.2154506440783912, "kl": 0.02349853515625, "learning_rate": 8.316151202749141e-07, "loss": 0.0009, "reward": 1.7635123133659363, "reward_std": 0.125723734498024, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7713248431682587, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 141.3203125, "epoch": 0.33762886597938147, "grad_norm": 1.063545112594486, "kl": 0.023193359375, "learning_rate": 8.311855670103093e-07, "loss": 0.0009, "reward": 1.7525449395179749, "reward_std": 0.09935981594026089, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7603574395179749, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 136.2734375, "epoch": 0.3384879725085911, "grad_norm": 1.1910010378396947, "kl": 0.0213623046875, "learning_rate": 8.307560137457045e-07, "loss": 0.0009, "reward": 1.8792944550514221, "reward_std": 0.07655412331223488, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8792945444583893, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 138.4453125, "epoch": 0.3393470790378007, "grad_norm": 1.0352744087470642, "kl": 0.0286865234375, "learning_rate": 8.303264604810996e-07, "loss": 0.0011, "reward": 1.817804753780365, "reward_std": 0.09332024306058884, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.825617253780365, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 129.984375, "epoch": 0.3402061855670103, "grad_norm": 3.4370614566686366, "kl": 0.02947998046875, "learning_rate": 8.298969072164948e-07, "loss": 0.0012, "reward": 1.7064816355705261, "reward_std": 0.11401160806417465, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7064816355705261, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 137.5546875, "epoch": 0.34106529209621994, "grad_norm": 1.8575760022730172, "kl": 0.02227783203125, "learning_rate": 8.2946735395189e-07, "loss": 0.0009, "reward": 1.7460897564888, "reward_std": 0.1139952689409256, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7539022862911224, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 128.09375, "epoch": 0.34192439862542956, "grad_norm": 1.2996032361936032, "kl": 0.02801513671875, "learning_rate": 8.290378006872852e-07, "loss": 0.0011, "reward": 1.8059646487236023, "reward_std": 0.1373230665922165, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8215896785259247, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 133.3984375, "epoch": 0.3427835051546392, "grad_norm": 1.6501841523869047, "kl": 0.0233154296875, "learning_rate": 8.286082474226804e-07, "loss": 0.0009, "reward": 1.809871256351471, "reward_std": 0.12983518466353416, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8098713159561157, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 141.34375, "epoch": 0.3436426116838488, "grad_norm": 1.111398957842247, "kl": 0.02349853515625, "learning_rate": 8.281786941580756e-07, "loss": 0.0009, "reward": 1.8126710057258606, "reward_std": 0.0854789987206459, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8204834461212158, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 150.875, "epoch": 0.3445017182130584, "grad_norm": 1.5863639489406471, "kl": 0.02459716796875, "learning_rate": 8.277491408934707e-07, "loss": 0.001, "reward": 1.7722662687301636, "reward_std": 0.0587652251124382, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.772266298532486, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 134.875, "epoch": 0.34536082474226804, "grad_norm": 1.1575507587682292, "kl": 0.04693603515625, "learning_rate": 8.273195876288659e-07, "loss": 0.0019, "reward": 1.8038502931594849, "reward_std": 0.11479467153549194, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8116627931594849, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 125.015625, "epoch": 0.34621993127147765, "grad_norm": 1.3175894104086836, "kl": 0.02630615234375, "learning_rate": 8.26890034364261e-07, "loss": 0.0011, "reward": 1.8177732825279236, "reward_std": 0.061465613543987274, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8177732825279236, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 125.4140625, "epoch": 0.3470790378006873, "grad_norm": 0.9613495919689865, "kl": 0.03057861328125, "learning_rate": 8.264604810996562e-07, "loss": 0.0012, "reward": 1.856289029121399, "reward_std": 0.08307300135493279, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8562889695167542, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 155.5546875, "epoch": 0.3479381443298969, "grad_norm": 1.2236294083563708, "kl": 0.02239990234375, "learning_rate": 8.260309278350514e-07, "loss": 0.0009, "reward": 1.86646169424057, "reward_std": 0.05629648268222809, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8664617240428925, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 135.40625, "epoch": 0.3487972508591065, "grad_norm": 4.196708925682456, "kl": 0.0263671875, "learning_rate": 8.256013745704466e-07, "loss": 0.0011, "reward": 1.6891456842422485, "reward_std": 0.13314834237098694, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6969581544399261, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 141.9765625, "epoch": 0.34965635738831613, "grad_norm": 3.3390151553242826, "kl": 0.02838134765625, "learning_rate": 8.251718213058418e-07, "loss": 0.0011, "reward": 1.7792019844055176, "reward_std": 0.08469441719353199, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7792020440101624, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 123.796875, "epoch": 0.35051546391752575, "grad_norm": 1.0417856723793362, "kl": 0.0313720703125, "learning_rate": 8.24742268041237e-07, "loss": 0.0013, "reward": 1.745001196861267, "reward_std": 0.08887993916869164, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7528136670589447, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 136.2421875, "epoch": 0.35137457044673537, "grad_norm": 1.5185692601329688, "kl": 0.02972412109375, "learning_rate": 8.243127147766322e-07, "loss": 0.0012, "reward": 1.743471622467041, "reward_std": 0.12599998712539673, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7434715926647186, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 133.3828125, "epoch": 0.35223367697594504, "grad_norm": 3.698150098713745, "kl": 0.037353515625, "learning_rate": 8.238831615120274e-07, "loss": 0.0015, "reward": 1.7748391032218933, "reward_std": 0.109963808208704, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7826516330242157, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 132.84375, "epoch": 0.35309278350515466, "grad_norm": 0.7340273563755917, "kl": 0.0301513671875, "learning_rate": 8.234536082474227e-07, "loss": 0.0012, "reward": 1.747934103012085, "reward_std": 0.029868584126234055, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7557465732097626, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 137.140625, "epoch": 0.3539518900343643, "grad_norm": 6.5869920419752885, "kl": 0.03033447265625, "learning_rate": 8.230240549828178e-07, "loss": 0.0012, "reward": 1.803978145122528, "reward_std": 0.11175262182950974, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8196031451225281, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 133.0625, "epoch": 0.3548109965635739, "grad_norm": 1.2110163382375696, "kl": 0.03582763671875, "learning_rate": 8.22594501718213e-07, "loss": 0.0014, "reward": 1.7075645923614502, "reward_std": 0.09112687408924103, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7075645625591278, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 137.2109375, "epoch": 0.3556701030927835, "grad_norm": 0.9440184144682974, "kl": 0.02886962890625, "learning_rate": 8.221649484536082e-07, "loss": 0.0012, "reward": 1.879349708557129, "reward_std": 0.077772606164217, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8871622085571289, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 158.203125, "epoch": 0.35652920962199314, "grad_norm": 1.1844513928260747, "kl": 0.024658203125, "learning_rate": 8.217353951890034e-07, "loss": 0.001, "reward": 1.8242889642715454, "reward_std": 0.11754781007766724, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8321015238761902, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 128.9453125, "epoch": 0.35738831615120276, "grad_norm": 1.1572419610665239, "kl": 0.02496337890625, "learning_rate": 8.213058419243986e-07, "loss": 0.001, "reward": 1.7361584305763245, "reward_std": 0.0901523232460022, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7439709305763245, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 150.4140625, "epoch": 0.3582474226804124, "grad_norm": 1.1187566144341665, "kl": 0.02642822265625, "learning_rate": 8.208762886597938e-07, "loss": 0.0011, "reward": 1.787147879600525, "reward_std": 0.08959689736366272, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7871478796005249, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 131.8515625, "epoch": 0.359106529209622, "grad_norm": 1.469623021469994, "kl": 0.0328369140625, "learning_rate": 8.20446735395189e-07, "loss": 0.0013, "reward": 1.7043437957763672, "reward_std": 0.13283608853816986, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7043437361717224, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 135.7890625, "epoch": 0.3599656357388316, "grad_norm": 1.7756842310117262, "kl": 0.03424072265625, "learning_rate": 8.200171821305842e-07, "loss": 0.0014, "reward": 1.8168611526489258, "reward_std": 0.09070071205496788, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8168611526489258, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 124.0078125, "epoch": 0.36082474226804123, "grad_norm": 1.9241728030153786, "kl": 0.03515625, "learning_rate": 8.195876288659793e-07, "loss": 0.0014, "reward": 1.7493263483047485, "reward_std": 0.12105973809957504, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7493264377117157, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 140.203125, "epoch": 0.36168384879725085, "grad_norm": 2.3633953737617848, "kl": 0.0306396484375, "learning_rate": 8.191580756013745e-07, "loss": 0.0012, "reward": 1.7810669541358948, "reward_std": 0.09037407860159874, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7810669541358948, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 142.0078125, "epoch": 0.36254295532646047, "grad_norm": 1.1965175464927247, "kl": 0.0252685546875, "learning_rate": 8.187285223367697e-07, "loss": 0.001, "reward": 1.82912939786911, "reward_std": 0.13231520354747772, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8369418382644653, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 140.6484375, "epoch": 0.3634020618556701, "grad_norm": 0.9630157435896298, "kl": 0.025634765625, "learning_rate": 8.182989690721649e-07, "loss": 0.001, "reward": 1.8102641105651855, "reward_std": 0.10265975818037987, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8102641105651855, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 135.0234375, "epoch": 0.3642611683848797, "grad_norm": 1.4555619232603647, "kl": 0.0316162109375, "learning_rate": 8.178694158075601e-07, "loss": 0.0013, "reward": 1.763112723827362, "reward_std": 0.15452294051647186, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7709251940250397, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 136.6953125, "epoch": 0.3651202749140893, "grad_norm": 1.6695763498386311, "kl": 0.02679443359375, "learning_rate": 8.174398625429553e-07, "loss": 0.0011, "reward": 1.8538659811019897, "reward_std": 0.06878830399364233, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8538659512996674, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 138.7734375, "epoch": 0.36597938144329895, "grad_norm": 1.714285388160138, "kl": 0.02813720703125, "learning_rate": 8.170103092783505e-07, "loss": 0.0011, "reward": 1.7433908581733704, "reward_std": 0.11577881872653961, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7512032985687256, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 128.375, "epoch": 0.36683848797250856, "grad_norm": 2.0652235283446543, "kl": 0.03082275390625, "learning_rate": 8.165807560137457e-07, "loss": 0.0012, "reward": 1.7070344686508179, "reward_std": 0.18474984169006348, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7226594686508179, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 123.4609375, "epoch": 0.36769759450171824, "grad_norm": 0.952329796803498, "kl": 0.03497314453125, "learning_rate": 8.161512027491409e-07, "loss": 0.0014, "reward": 1.8179508447647095, "reward_std": 0.06837813928723335, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8179508149623871, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 127.7578125, "epoch": 0.36855670103092786, "grad_norm": 1.6252786551878542, "kl": 0.0391845703125, "learning_rate": 8.15721649484536e-07, "loss": 0.0016, "reward": 1.7616538405418396, "reward_std": 0.0948004387319088, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7694664299488068, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 133.1328125, "epoch": 0.3694158075601375, "grad_norm": 4.575833798673977, "kl": 0.02618408203125, "learning_rate": 8.152920962199312e-07, "loss": 0.001, "reward": 1.7446335554122925, "reward_std": 0.14209628105163574, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7446335852146149, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 149.5078125, "epoch": 0.3702749140893471, "grad_norm": 1.5029038984828853, "kl": 0.027099609375, "learning_rate": 8.148625429553264e-07, "loss": 0.0011, "reward": 1.7977319955825806, "reward_std": 0.12650492042303085, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7977319657802582, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 141.9609375, "epoch": 0.3711340206185567, "grad_norm": 3.124108570449623, "kl": 0.030517578125, "learning_rate": 8.144329896907216e-07, "loss": 0.0012, "reward": 1.763170838356018, "reward_std": 0.13480468839406967, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7631707787513733, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 140.1328125, "epoch": 0.37199312714776633, "grad_norm": 0.9956952014358291, "kl": 0.0367431640625, "learning_rate": 8.140034364261168e-07, "loss": 0.0015, "reward": 1.8633191585540771, "reward_std": 0.09405502304434776, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8633190989494324, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 143.8203125, "epoch": 0.37285223367697595, "grad_norm": 1.1621034752647021, "kl": 0.02874755859375, "learning_rate": 8.13573883161512e-07, "loss": 0.0012, "reward": 1.7654351592063904, "reward_std": 0.10451111197471619, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7654351592063904, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 135.84375, "epoch": 0.37371134020618557, "grad_norm": 1.0937552701332574, "kl": 0.02410888671875, "learning_rate": 8.131443298969072e-07, "loss": 0.001, "reward": 1.7504616975784302, "reward_std": 0.13275029510259628, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7582742869853973, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 143.40625, "epoch": 0.3745704467353952, "grad_norm": 1.173542025190629, "kl": 0.0269775390625, "learning_rate": 8.127147766323024e-07, "loss": 0.0011, "reward": 1.7958321571350098, "reward_std": 0.13957487791776657, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7958321869373322, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 155.5859375, "epoch": 0.3754295532646048, "grad_norm": 1.0809358432591143, "kl": 0.0223388671875, "learning_rate": 8.122852233676975e-07, "loss": 0.0009, "reward": 1.8025323748588562, "reward_std": 0.10576809197664261, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8103449046611786, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 140.296875, "epoch": 0.37628865979381443, "grad_norm": 0.9307591226779286, "kl": 0.0291748046875, "learning_rate": 8.118556701030927e-07, "loss": 0.0012, "reward": 1.745145320892334, "reward_std": 0.09199497848749161, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7607703506946564, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 132.3046875, "epoch": 0.37714776632302405, "grad_norm": 1.599730206934407, "kl": 0.025634765625, "learning_rate": 8.114261168384879e-07, "loss": 0.001, "reward": 1.7341384291648865, "reward_std": 0.08559693396091461, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7341383695602417, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 150.046875, "epoch": 0.37800687285223367, "grad_norm": 1.1302132744799622, "kl": 0.0223388671875, "learning_rate": 8.109965635738831e-07, "loss": 0.0009, "reward": 1.7777530550956726, "reward_std": 0.11446893960237503, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7855655252933502, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 158.390625, "epoch": 0.3788659793814433, "grad_norm": 0.8081005861278157, "kl": 0.03192138671875, "learning_rate": 8.105670103092783e-07, "loss": 0.0013, "reward": 1.7475718259811401, "reward_std": 0.12176471576094627, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7475718557834625, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 131.1640625, "epoch": 0.3797250859106529, "grad_norm": 1.4982803588386655, "kl": 0.03009033203125, "learning_rate": 8.101374570446735e-07, "loss": 0.0012, "reward": 1.7991742491722107, "reward_std": 0.09572646208107471, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8069867491722107, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 139.40625, "epoch": 0.3805841924398625, "grad_norm": 4.788054491524054, "kl": 0.02618408203125, "learning_rate": 8.097079037800687e-07, "loss": 0.001, "reward": 1.7099648118019104, "reward_std": 0.13577891886234283, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.725589781999588, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 146.9140625, "epoch": 0.38144329896907214, "grad_norm": 1.0187295128992484, "kl": 0.028564453125, "learning_rate": 8.092783505154639e-07, "loss": 0.0011, "reward": 1.7704007029533386, "reward_std": 0.07720200531184673, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.770400732755661, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 150.875, "epoch": 0.38230240549828176, "grad_norm": 1.617440205079106, "kl": 0.0537109375, "learning_rate": 8.08848797250859e-07, "loss": 0.0022, "reward": 1.745563805103302, "reward_std": 0.12642088159918785, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7533764243125916, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 154.46875, "epoch": 0.38316151202749144, "grad_norm": 1.047930515930632, "kl": 0.0303955078125, "learning_rate": 8.084192439862542e-07, "loss": 0.0012, "reward": 1.8345497846603394, "reward_std": 0.09045657515525818, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8423622250556946, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 146.21875, "epoch": 0.38402061855670105, "grad_norm": 1.4220569982461002, "kl": 0.0233154296875, "learning_rate": 8.079896907216494e-07, "loss": 0.0009, "reward": 1.7613440155982971, "reward_std": 0.11786005645990372, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7613440155982971, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 153.1875, "epoch": 0.3848797250859107, "grad_norm": 1.1920009174782409, "kl": 0.02923583984375, "learning_rate": 8.075601374570446e-07, "loss": 0.0012, "reward": 1.7767443656921387, "reward_std": 0.1390495002269745, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7845568954944611, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 145.0703125, "epoch": 0.3857388316151203, "grad_norm": 0.8285644733256662, "kl": 0.0247802734375, "learning_rate": 8.071305841924398e-07, "loss": 0.001, "reward": 1.8258004784584045, "reward_std": 0.06260296143591404, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8258004486560822, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 133.75, "epoch": 0.3865979381443299, "grad_norm": 1.7038040697442591, "kl": 0.026123046875, "learning_rate": 8.06701030927835e-07, "loss": 0.001, "reward": 1.8847057819366455, "reward_std": 0.1375606805086136, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.9081432819366455, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 158.8359375, "epoch": 0.38745704467353953, "grad_norm": 1.4889637606593127, "kl": 0.0345458984375, "learning_rate": 8.062714776632302e-07, "loss": 0.0014, "reward": 1.7658740282058716, "reward_std": 0.18101423978805542, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7893114984035492, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 135.59375, "epoch": 0.38831615120274915, "grad_norm": 1.046812904427794, "kl": 0.0262451171875, "learning_rate": 8.058419243986254e-07, "loss": 0.001, "reward": 1.7848870158195496, "reward_std": 0.2099006325006485, "rewards/format_reward_gen": 0.9375, "rewards/llm_reward": 0.8473870158195496, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 156.5625, "epoch": 0.38917525773195877, "grad_norm": 1.4677940684663608, "kl": 0.023193359375, "learning_rate": 8.054123711340207e-07, "loss": 0.0009, "reward": 1.597209393978119, "reward_std": 0.26560014486312866, "rewards/format_reward_gen": 0.921875, "rewards/llm_reward": 0.6753344535827637, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 142.2265625, "epoch": 0.3900343642611684, "grad_norm": 2.5890905532205957, "kl": 0.02532958984375, "learning_rate": 8.049828178694158e-07, "loss": 0.001, "reward": 1.741255521774292, "reward_std": 0.1423398032784462, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.772505521774292, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 127.7578125, "epoch": 0.390893470790378, "grad_norm": 1.877091810608803, "kl": 0.021728515625, "learning_rate": 8.04553264604811e-07, "loss": 0.0009, "reward": 1.746235966682434, "reward_std": 0.1634746640920639, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7852984368801117, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 143.0625, "epoch": 0.3917525773195876, "grad_norm": 1.9322462103510145, "kl": 0.030029296875, "learning_rate": 8.041237113402062e-07, "loss": 0.0012, "reward": 1.7314363718032837, "reward_std": 0.19341952353715897, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.7861238718032837, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 141.34375, "epoch": 0.39261168384879724, "grad_norm": 1.761668192474341, "kl": 0.02813720703125, "learning_rate": 8.036941580756014e-07, "loss": 0.0011, "reward": 1.6722000241279602, "reward_std": 0.22043447196483612, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7034500241279602, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 177.1484375, "epoch": 0.39347079037800686, "grad_norm": 1.103623434836374, "kl": 0.021484375, "learning_rate": 8.032646048109966e-07, "loss": 0.0009, "reward": 1.7857837677001953, "reward_std": 0.16772552579641342, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.8326588273048401, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 149.78125, "epoch": 0.3943298969072165, "grad_norm": 2.9558490613021475, "kl": 0.02484130859375, "learning_rate": 8.028350515463918e-07, "loss": 0.001, "reward": 1.8172647356987, "reward_std": 0.1397656500339508, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8172646760940552, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 128.109375, "epoch": 0.3951890034364261, "grad_norm": 1.0942493174264873, "kl": 0.02008056640625, "learning_rate": 8.02405498281787e-07, "loss": 0.0008, "reward": 1.7567135095596313, "reward_std": 0.08268255367875099, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.764525979757309, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 123.4140625, "epoch": 0.3960481099656357, "grad_norm": 1.724231638784223, "kl": 0.02337646484375, "learning_rate": 8.019759450171822e-07, "loss": 0.0009, "reward": 1.812518298625946, "reward_std": 0.1107301339507103, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8281432688236237, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 131.875, "epoch": 0.39690721649484534, "grad_norm": 0.8421480615888612, "kl": 0.02001953125, "learning_rate": 8.015463917525774e-07, "loss": 0.0008, "reward": 1.8146081566810608, "reward_std": 0.09295319765806198, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8224206566810608, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 135.2265625, "epoch": 0.39776632302405496, "grad_norm": 0.9758819512120601, "kl": 0.017822265625, "learning_rate": 8.011168384879725e-07, "loss": 0.0007, "reward": 1.7702763676643372, "reward_std": 0.10476777702569962, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7780888676643372, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 143.234375, "epoch": 0.39862542955326463, "grad_norm": 0.9223788317682841, "kl": 0.02032470703125, "learning_rate": 8.006872852233677e-07, "loss": 0.0008, "reward": 1.8337781429290771, "reward_std": 0.10091580078005791, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8337781429290771, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 118.0625, "epoch": 0.39948453608247425, "grad_norm": 1.1306004675623413, "kl": 0.02081298828125, "learning_rate": 8.002577319587629e-07, "loss": 0.0008, "reward": 1.6799623370170593, "reward_std": 0.1472223922610283, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6955873966217041, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 126.28125, "epoch": 0.40034364261168387, "grad_norm": 2.7421308952943497, "kl": 0.02642822265625, "learning_rate": 7.998281786941581e-07, "loss": 0.0011, "reward": 1.7180203199386597, "reward_std": 0.16847053170204163, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7336452901363373, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 136.8984375, "epoch": 0.4012027491408935, "grad_norm": 1.1905386765274455, "kl": 0.02166748046875, "learning_rate": 7.993986254295533e-07, "loss": 0.0009, "reward": 1.824146568775177, "reward_std": 0.09475822001695633, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8319591283798218, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 126.828125, "epoch": 0.4020618556701031, "grad_norm": 1.0030306949143268, "kl": 0.02276611328125, "learning_rate": 7.989690721649485e-07, "loss": 0.0009, "reward": 1.6890740394592285, "reward_std": 0.08577071130275726, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7046990394592285, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 129.390625, "epoch": 0.4029209621993127, "grad_norm": 0.6829732849279407, "kl": 0.020751953125, "learning_rate": 7.985395189003437e-07, "loss": 0.0008, "reward": 1.8722723126411438, "reward_std": 0.03336894512176514, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8722723126411438, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 118.703125, "epoch": 0.40378006872852235, "grad_norm": 3.6315379563427315, "kl": 0.08935546875, "learning_rate": 7.981099656357389e-07, "loss": 0.0036, "reward": 1.669398546218872, "reward_std": 0.16622114926576614, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6772109866142273, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 125.3828125, "epoch": 0.40463917525773196, "grad_norm": 2.201255320619489, "kl": 0.02001953125, "learning_rate": 7.97680412371134e-07, "loss": 0.0008, "reward": 1.689961850643158, "reward_std": 0.21605655923485756, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6977743208408356, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 133.15625, "epoch": 0.4054982817869416, "grad_norm": 7.197996250278638, "kl": 0.02166748046875, "learning_rate": 7.972508591065292e-07, "loss": 0.0009, "reward": 1.8866642117500305, "reward_std": 0.0431262981146574, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8866641819477081, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 135.5625, "epoch": 0.4063573883161512, "grad_norm": 0.9395439850082784, "kl": 0.02435302734375, "learning_rate": 7.968213058419243e-07, "loss": 0.001, "reward": 1.7944100499153137, "reward_std": 0.07158202305436134, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7944100797176361, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 127.46875, "epoch": 0.4072164948453608, "grad_norm": 10.764240409904978, "kl": 0.02557373046875, "learning_rate": 7.963917525773195e-07, "loss": 0.001, "reward": 1.7304428219795227, "reward_std": 0.19595059752464294, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7382553517818451, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 123.9140625, "epoch": 0.40807560137457044, "grad_norm": 1.4034901060673528, "kl": 0.02447509765625, "learning_rate": 7.959621993127147e-07, "loss": 0.001, "reward": 1.7872547507286072, "reward_std": 0.12548959627747536, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7950672507286072, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 118.5546875, "epoch": 0.40893470790378006, "grad_norm": 1.8853927915690178, "kl": 0.0291748046875, "learning_rate": 7.955326460481099e-07, "loss": 0.0012, "reward": 1.6977566480636597, "reward_std": 0.14235452935099602, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7055690884590149, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 126.859375, "epoch": 0.4097938144329897, "grad_norm": 2.0359538756787243, "kl": 0.02093505859375, "learning_rate": 7.951030927835051e-07, "loss": 0.0008, "reward": 1.8194365501403809, "reward_std": 0.14543933421373367, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8194365203380585, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 126.0625, "epoch": 0.4106529209621993, "grad_norm": 2.3120838651943827, "kl": 0.02349853515625, "learning_rate": 7.946735395189003e-07, "loss": 0.0009, "reward": 1.7549501657485962, "reward_std": 0.17499929666519165, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7627626061439514, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 119.671875, "epoch": 0.4115120274914089, "grad_norm": 0.846611479751614, "kl": 0.01953125, "learning_rate": 7.942439862542954e-07, "loss": 0.0008, "reward": 1.761724591255188, "reward_std": 0.1066037192940712, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7695370316505432, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 129.7421875, "epoch": 0.41237113402061853, "grad_norm": 1.0153354601072486, "kl": 0.021728515625, "learning_rate": 7.938144329896906e-07, "loss": 0.0009, "reward": 1.8040355443954468, "reward_std": 0.09103575721383095, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8118480145931244, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 132.34375, "epoch": 0.41323024054982815, "grad_norm": 1.3316805075661329, "kl": 0.03472900390625, "learning_rate": 7.933848797250858e-07, "loss": 0.0014, "reward": 1.8269373178482056, "reward_std": 0.0393183808773756, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8269373178482056, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 136.4453125, "epoch": 0.41408934707903783, "grad_norm": 1.2169011867171224, "kl": 0.0244140625, "learning_rate": 7.92955326460481e-07, "loss": 0.001, "reward": 1.808565378189087, "reward_std": 0.10053881630301476, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8163778781890869, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 119.3671875, "epoch": 0.41494845360824745, "grad_norm": 1.9556380092566035, "kl": 0.02239990234375, "learning_rate": 7.925257731958762e-07, "loss": 0.0009, "reward": 1.7945858240127563, "reward_std": 0.1085839495062828, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7945858240127563, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 132.2890625, "epoch": 0.41580756013745707, "grad_norm": 1.4377206374386469, "kl": 0.0299072265625, "learning_rate": 7.920962199312714e-07, "loss": 0.0012, "reward": 1.7909988164901733, "reward_std": 0.10544274002313614, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8066238462924957, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 126.484375, "epoch": 0.4166666666666667, "grad_norm": 1.1637028605621145, "kl": 0.0244140625, "learning_rate": 7.916666666666666e-07, "loss": 0.001, "reward": 1.8535531759262085, "reward_std": 0.09841496497392654, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8535531163215637, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 145.296875, "epoch": 0.4175257731958763, "grad_norm": 0.9089860152674513, "kl": 0.0196533203125, "learning_rate": 7.912371134020618e-07, "loss": 0.0008, "reward": 1.8009296655654907, "reward_std": 0.08241390809416771, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.800929605960846, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 147.875, "epoch": 0.4183848797250859, "grad_norm": 1.1175991581337885, "kl": 0.02105712890625, "learning_rate": 7.908075601374569e-07, "loss": 0.0008, "reward": 1.7471742033958435, "reward_std": 0.09226182475686073, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7471742033958435, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 138.65625, "epoch": 0.41924398625429554, "grad_norm": 1.5782041969321439, "kl": 0.0245361328125, "learning_rate": 7.903780068728521e-07, "loss": 0.001, "reward": 1.7752417922019958, "reward_std": 0.07173944637179375, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7752417623996735, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 126.328125, "epoch": 0.42010309278350516, "grad_norm": 1.712440072733616, "kl": 0.02490234375, "learning_rate": 7.899484536082473e-07, "loss": 0.001, "reward": 1.7801220417022705, "reward_std": 0.1850130558013916, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7957470417022705, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 136.96875, "epoch": 0.4209621993127148, "grad_norm": 1.1903287634500803, "kl": 0.021484375, "learning_rate": 7.895189003436425e-07, "loss": 0.0009, "reward": 1.797283411026001, "reward_std": 0.15069201588630676, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.797283411026001, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 133.5390625, "epoch": 0.4218213058419244, "grad_norm": 1.3823311832547454, "kl": 0.02471923828125, "learning_rate": 7.890893470790377e-07, "loss": 0.001, "reward": 1.6768372654914856, "reward_std": 0.14700639992952347, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6846497654914856, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 142.8828125, "epoch": 0.422680412371134, "grad_norm": 1.052318496217373, "kl": 0.02825927734375, "learning_rate": 7.886597938144329e-07, "loss": 0.0011, "reward": 1.7353659868240356, "reward_std": 0.10273649916052818, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.735366016626358, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 133.234375, "epoch": 0.42353951890034364, "grad_norm": 0.9659208234974426, "kl": 0.0198974609375, "learning_rate": 7.882302405498281e-07, "loss": 0.0008, "reward": 1.6551663875579834, "reward_std": 0.10088075697422028, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.670791357755661, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 141.6796875, "epoch": 0.42439862542955326, "grad_norm": 3.2851303393945175, "kl": 0.0211181640625, "learning_rate": 7.878006872852233e-07, "loss": 0.0008, "reward": 1.7861362099647522, "reward_std": 0.1402701437473297, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.786136269569397, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 145.0390625, "epoch": 0.4252577319587629, "grad_norm": 1.3709194983778912, "kl": 0.02056884765625, "learning_rate": 7.873711340206184e-07, "loss": 0.0008, "reward": 1.7891325950622559, "reward_std": 0.15913131088018417, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7891325950622559, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 153.5078125, "epoch": 0.4261168384879725, "grad_norm": 1.6084405058813327, "kl": 0.02093505859375, "learning_rate": 7.869415807560136e-07, "loss": 0.0008, "reward": 1.843230962753296, "reward_std": 0.11450556665658951, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8432309925556183, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 142.6015625, "epoch": 0.4269759450171821, "grad_norm": 0.9098919677048347, "kl": 0.0244140625, "learning_rate": 7.865120274914089e-07, "loss": 0.001, "reward": 1.8330826163291931, "reward_std": 0.13003884255886078, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8330826163291931, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 139.03125, "epoch": 0.42783505154639173, "grad_norm": 1.1077984476897802, "kl": 0.0185546875, "learning_rate": 7.860824742268041e-07, "loss": 0.0007, "reward": 1.7646691799163818, "reward_std": 0.09061707556247711, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7646692395210266, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 141.5, "epoch": 0.42869415807560135, "grad_norm": 1.5301258115042293, "kl": 0.021728515625, "learning_rate": 7.856529209621993e-07, "loss": 0.0009, "reward": 1.8485521078109741, "reward_std": 0.14281748235225677, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8485520780086517, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 146.15625, "epoch": 0.42955326460481097, "grad_norm": 0.6854626662078912, "kl": 0.0198974609375, "learning_rate": 7.852233676975945e-07, "loss": 0.0008, "reward": 1.8705613613128662, "reward_std": 0.057761300355196, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8705613017082214, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 156.5, "epoch": 0.43041237113402064, "grad_norm": 4.684754333390039, "kl": 0.02490234375, "learning_rate": 7.847938144329897e-07, "loss": 0.001, "reward": 1.7904819250106812, "reward_std": 0.11770961433649063, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.790481835603714, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 137.1015625, "epoch": 0.43127147766323026, "grad_norm": 1.3603984953615473, "kl": 0.0272216796875, "learning_rate": 7.843642611683849e-07, "loss": 0.0011, "reward": 1.8305404782295227, "reward_std": 0.08736425265669823, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8305404186248779, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 154.171875, "epoch": 0.4321305841924399, "grad_norm": 1.1418674217309672, "kl": 0.026123046875, "learning_rate": 7.839347079037801e-07, "loss": 0.001, "reward": 1.7259817123413086, "reward_std": 0.16148880869150162, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7416067719459534, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 154.3359375, "epoch": 0.4329896907216495, "grad_norm": 1.1135192490403159, "kl": 0.0301513671875, "learning_rate": 7.835051546391752e-07, "loss": 0.0012, "reward": 1.675801932811737, "reward_std": 0.10446542128920555, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.6758019924163818, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 158.8984375, "epoch": 0.4338487972508591, "grad_norm": 1.0026442135683145, "kl": 0.02264404296875, "learning_rate": 7.830756013745704e-07, "loss": 0.0009, "reward": 1.753090739250183, "reward_std": 0.11342515423893929, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7687157094478607, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 156.4609375, "epoch": 0.43470790378006874, "grad_norm": 0.63327553743734, "kl": 0.0238037109375, "learning_rate": 7.826460481099656e-07, "loss": 0.001, "reward": 1.7929630875587463, "reward_std": 0.06263483129441738, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8007755875587463, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 145.015625, "epoch": 0.43556701030927836, "grad_norm": 1.1405571467224787, "kl": 0.02972412109375, "learning_rate": 7.822164948453608e-07, "loss": 0.0012, "reward": 1.8316816091537476, "reward_std": 0.13786949962377548, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8394941091537476, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 137.7109375, "epoch": 0.436426116838488, "grad_norm": 2.491831127237855, "kl": 0.0272216796875, "learning_rate": 7.81786941580756e-07, "loss": 0.0011, "reward": 1.8621906638145447, "reward_std": 0.08054150827229023, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8621906936168671, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 145.6640625, "epoch": 0.4372852233676976, "grad_norm": 5.202435035199377, "kl": 0.02520751953125, "learning_rate": 7.813573883161512e-07, "loss": 0.001, "reward": 1.6291198134422302, "reward_std": 0.2496250867843628, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6447448432445526, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 168.59375, "epoch": 0.4381443298969072, "grad_norm": 1.6848389247112312, "kl": 0.02227783203125, "learning_rate": 7.809278350515464e-07, "loss": 0.0009, "reward": 1.8260024189949036, "reward_std": 0.0773128978908062, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8260024785995483, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 153.8671875, "epoch": 0.43900343642611683, "grad_norm": 1.2536511830670496, "kl": 0.04034423828125, "learning_rate": 7.804982817869416e-07, "loss": 0.0016, "reward": 1.8234487175941467, "reward_std": 0.12906832993030548, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8390736877918243, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 162.9609375, "epoch": 0.43986254295532645, "grad_norm": 0.9056562066002685, "kl": 0.0228271484375, "learning_rate": 7.800687285223368e-07, "loss": 0.0009, "reward": 1.8035598993301392, "reward_std": 0.10758433863520622, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.819184809923172, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 160.625, "epoch": 0.44072164948453607, "grad_norm": 1.9749807757861202, "kl": 0.0284423828125, "learning_rate": 7.796391752577319e-07, "loss": 0.0011, "reward": 1.7375337481498718, "reward_std": 0.12667454779148102, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7375337779521942, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 177.203125, "epoch": 0.4415807560137457, "grad_norm": 2.7049577625952863, "kl": 0.025634765625, "learning_rate": 7.792096219931271e-07, "loss": 0.001, "reward": 1.665591835975647, "reward_std": 0.1844305470585823, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.6890293061733246, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 176.421875, "epoch": 0.4424398625429553, "grad_norm": 1.1503646554833182, "kl": 0.020751953125, "learning_rate": 7.787800687285223e-07, "loss": 0.0008, "reward": 1.7526438236236572, "reward_std": 0.15297304093837738, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7526438236236572, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 174.8203125, "epoch": 0.44329896907216493, "grad_norm": 2.528762349405692, "kl": 0.0223388671875, "learning_rate": 7.783505154639175e-07, "loss": 0.0009, "reward": 1.7877247333526611, "reward_std": 0.1644352711737156, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7877247333526611, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 161.5859375, "epoch": 0.44415807560137455, "grad_norm": 1.0883366488533088, "kl": 0.02557373046875, "learning_rate": 7.779209621993127e-07, "loss": 0.001, "reward": 1.9020284414291382, "reward_std": 0.0875653550028801, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.9098409414291382, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 168.234375, "epoch": 0.44501718213058417, "grad_norm": 1.238117408635782, "kl": 0.02325439453125, "learning_rate": 7.774914089347079e-07, "loss": 0.0009, "reward": 1.7550345659255981, "reward_std": 0.11340263113379478, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7706595659255981, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 161.953125, "epoch": 0.44587628865979384, "grad_norm": 3.2324811204812818, "kl": 0.02435302734375, "learning_rate": 7.770618556701031e-07, "loss": 0.001, "reward": 1.7546863555908203, "reward_std": 0.15567786246538162, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7624987959861755, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 159.5390625, "epoch": 0.44673539518900346, "grad_norm": 1.1906445700635893, "kl": 0.0238037109375, "learning_rate": 7.766323024054983e-07, "loss": 0.001, "reward": 1.770099401473999, "reward_std": 0.15499432384967804, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7700993120670319, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 155.3359375, "epoch": 0.4475945017182131, "grad_norm": 1.4289037451333348, "kl": 0.02899169921875, "learning_rate": 7.762027491408934e-07, "loss": 0.0012, "reward": 1.7591895461082458, "reward_std": 0.20220742374658585, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7826270163059235, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 153.859375, "epoch": 0.4484536082474227, "grad_norm": 0.9798567354541009, "kl": 0.0318603515625, "learning_rate": 7.757731958762886e-07, "loss": 0.0013, "reward": 1.8011258244514465, "reward_std": 0.1115933284163475, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8089383244514465, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 162.453125, "epoch": 0.4493127147766323, "grad_norm": 0.9870529145672341, "kl": 0.0245361328125, "learning_rate": 7.753436426116838e-07, "loss": 0.001, "reward": 1.7835025191307068, "reward_std": 0.10335457697510719, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7913150489330292, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 149.1796875, "epoch": 0.45017182130584193, "grad_norm": 1.590053867640443, "kl": 0.02679443359375, "learning_rate": 7.74914089347079e-07, "loss": 0.0011, "reward": 1.6806529760360718, "reward_std": 0.12486770376563072, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6962779760360718, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 150.203125, "epoch": 0.45103092783505155, "grad_norm": 1.288976196272062, "kl": 0.0467529296875, "learning_rate": 7.744845360824742e-07, "loss": 0.0019, "reward": 1.7338435053825378, "reward_std": 0.21027009189128876, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7494684457778931, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 135.875, "epoch": 0.4518900343642612, "grad_norm": 1.3796315870979017, "kl": 0.03314208984375, "learning_rate": 7.740549828178694e-07, "loss": 0.0013, "reward": 1.813571572303772, "reward_std": 0.11932007595896721, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8135715425014496, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 155.8359375, "epoch": 0.4527491408934708, "grad_norm": 1.0856638820035733, "kl": 0.02392578125, "learning_rate": 7.736254295532646e-07, "loss": 0.001, "reward": 1.7808946371078491, "reward_std": 0.15631069988012314, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7887071967124939, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 139.53125, "epoch": 0.4536082474226804, "grad_norm": 1.2741002088470732, "kl": 0.0306396484375, "learning_rate": 7.731958762886598e-07, "loss": 0.0012, "reward": 1.7552229762077332, "reward_std": 0.16508981585502625, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7708480358123779, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 163.84375, "epoch": 0.45446735395189003, "grad_norm": 0.8166293208289542, "kl": 0.0262451171875, "learning_rate": 7.72766323024055e-07, "loss": 0.0011, "reward": 1.763866662979126, "reward_std": 0.11001107096672058, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.771679162979126, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 133.6171875, "epoch": 0.45532646048109965, "grad_norm": 1.0906821881418176, "kl": 0.0263671875, "learning_rate": 7.723367697594501e-07, "loss": 0.0011, "reward": 1.7570544481277466, "reward_std": 0.09293561428785324, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7570544183254242, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 152.1328125, "epoch": 0.45618556701030927, "grad_norm": 0.6788651581622658, "kl": 0.0257568359375, "learning_rate": 7.719072164948453e-07, "loss": 0.001, "reward": 1.8436721563339233, "reward_std": 0.05900268629193306, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8514846563339233, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 153.203125, "epoch": 0.4570446735395189, "grad_norm": 1.3523482508260602, "kl": 0.02862548828125, "learning_rate": 7.714776632302405e-07, "loss": 0.0011, "reward": 1.7728696465492249, "reward_std": 0.0960187017917633, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7728695869445801, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 157.2578125, "epoch": 0.4579037800687285, "grad_norm": 2.080449621498438, "kl": 0.02349853515625, "learning_rate": 7.710481099656357e-07, "loss": 0.0009, "reward": 1.7926985025405884, "reward_std": 0.10765637829899788, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7926985621452332, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 157.546875, "epoch": 0.4587628865979381, "grad_norm": 0.9990312365514723, "kl": 0.030029296875, "learning_rate": 7.706185567010309e-07, "loss": 0.0012, "reward": 1.6698935627937317, "reward_std": 0.1365356780588627, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6777060627937317, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 147.921875, "epoch": 0.45962199312714774, "grad_norm": 1.423339665969349, "kl": 0.02587890625, "learning_rate": 7.701890034364261e-07, "loss": 0.001, "reward": 1.7801197171211243, "reward_std": 0.12286467850208282, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.780119776725769, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 155.1328125, "epoch": 0.46048109965635736, "grad_norm": 2.0644152976523924, "kl": 0.0234375, "learning_rate": 7.697594501718213e-07, "loss": 0.0009, "reward": 1.8145374059677124, "reward_std": 0.15781568735837936, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8223499655723572, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 153.828125, "epoch": 0.46134020618556704, "grad_norm": 2.419979165046296, "kl": 0.03106689453125, "learning_rate": 7.693298969072165e-07, "loss": 0.0012, "reward": 1.7789837718009949, "reward_std": 0.1027519591152668, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7789837718009949, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 153.5859375, "epoch": 0.46219931271477666, "grad_norm": 0.9119452557469206, "kl": 0.02142333984375, "learning_rate": 7.689003436426116e-07, "loss": 0.0009, "reward": 1.8047268986701965, "reward_std": 0.0921391174197197, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8047268688678741, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 133.75, "epoch": 0.4630584192439863, "grad_norm": 1.0752489048608882, "kl": 0.02978515625, "learning_rate": 7.684707903780069e-07, "loss": 0.0012, "reward": 1.7013397812843323, "reward_std": 0.10420958697795868, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7013397514820099, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 142.21875, "epoch": 0.4639175257731959, "grad_norm": 1.879235291737055, "kl": 0.03424072265625, "learning_rate": 7.680412371134021e-07, "loss": 0.0014, "reward": 1.7587386965751648, "reward_std": 0.12074919883161783, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7665511965751648, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 144.1640625, "epoch": 0.4647766323024055, "grad_norm": 1.0694029027143328, "kl": 0.02728271484375, "learning_rate": 7.676116838487973e-07, "loss": 0.0011, "reward": 1.7316040396690369, "reward_std": 0.1069042906165123, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7316040396690369, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 137.390625, "epoch": 0.46563573883161513, "grad_norm": 1.0560010325147446, "kl": 0.027099609375, "learning_rate": 7.671821305841925e-07, "loss": 0.0011, "reward": 1.7262669205665588, "reward_std": 0.1186705082654953, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7340793907642365, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 153.375, "epoch": 0.46649484536082475, "grad_norm": 0.8831242345243586, "kl": 0.02362060546875, "learning_rate": 7.667525773195877e-07, "loss": 0.0009, "reward": 1.8377864360809326, "reward_std": 0.07232390902936459, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8455990254878998, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 150.5625, "epoch": 0.46735395189003437, "grad_norm": 0.808526359455059, "kl": 0.025634765625, "learning_rate": 7.663230240549829e-07, "loss": 0.001, "reward": 1.8943662643432617, "reward_std": 0.08304097317159176, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8943662643432617, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 143.8359375, "epoch": 0.468213058419244, "grad_norm": 1.2364679060956676, "kl": 0.03729248046875, "learning_rate": 7.658934707903781e-07, "loss": 0.0015, "reward": 1.820011556148529, "reward_std": 0.10274770110845566, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8278240263462067, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 165.7421875, "epoch": 0.4690721649484536, "grad_norm": 1.4203606446988248, "kl": 0.02764892578125, "learning_rate": 7.654639175257731e-07, "loss": 0.0011, "reward": 1.7455325722694397, "reward_std": 0.08128118142485619, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7455325424671173, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 141.140625, "epoch": 0.4699312714776632, "grad_norm": 2.804119845115769, "kl": 0.03338623046875, "learning_rate": 7.650343642611683e-07, "loss": 0.0013, "reward": 1.6912947297096252, "reward_std": 0.1474270112812519, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.71473228931427, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 149.09375, "epoch": 0.47079037800687284, "grad_norm": 1.9971642189793117, "kl": 0.07635498046875, "learning_rate": 7.646048109965635e-07, "loss": 0.0031, "reward": 1.7125971913337708, "reward_std": 0.17911146581172943, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7360346615314484, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 142.2734375, "epoch": 0.47164948453608246, "grad_norm": 2.745551651788921, "kl": 0.02740478515625, "learning_rate": 7.641752577319587e-07, "loss": 0.0011, "reward": 1.8493961691856384, "reward_std": 0.10139979794621468, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8493961095809937, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 125.359375, "epoch": 0.4725085910652921, "grad_norm": 3.8143164658012823, "kl": 0.02508544921875, "learning_rate": 7.637457044673539e-07, "loss": 0.001, "reward": 1.717983603477478, "reward_std": 0.11418138444423676, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7336086332798004, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 142.8671875, "epoch": 0.4733676975945017, "grad_norm": 1.0145675064339164, "kl": 0.02392578125, "learning_rate": 7.633161512027491e-07, "loss": 0.001, "reward": 1.7650344371795654, "reward_std": 0.07981048338115215, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7728469967842102, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 146.8515625, "epoch": 0.4742268041237113, "grad_norm": 18.881776716124634, "kl": 0.0201416015625, "learning_rate": 7.628865979381443e-07, "loss": 0.0008, "reward": 1.7701881527900696, "reward_std": 0.17208566516637802, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.817063182592392, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 148.9765625, "epoch": 0.47508591065292094, "grad_norm": 2.102100470034523, "kl": 0.02630615234375, "learning_rate": 7.624570446735395e-07, "loss": 0.0011, "reward": 1.8304003477096558, "reward_std": 0.1711219921708107, "rewards/format_reward_gen": 0.921875, "rewards/llm_reward": 0.908525288105011, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 152.96875, "epoch": 0.47594501718213056, "grad_norm": 1.444717715857585, "kl": 0.021728515625, "learning_rate": 7.620274914089346e-07, "loss": 0.0009, "reward": 1.806252360343933, "reward_std": 0.1447359174489975, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.8531273901462555, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 140.8828125, "epoch": 0.47680412371134023, "grad_norm": 2.156085264005566, "kl": 0.02764892578125, "learning_rate": 7.615979381443298e-07, "loss": 0.0011, "reward": 1.6459792852401733, "reward_std": 0.2492372989654541, "rewards/format_reward_gen": 0.921875, "rewards/llm_reward": 0.7241042256355286, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 139.4453125, "epoch": 0.47766323024054985, "grad_norm": 2.1944379120092123, "kl": 0.0213623046875, "learning_rate": 7.61168384879725e-07, "loss": 0.0009, "reward": 1.7091248035430908, "reward_std": 0.16284260153770447, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7559998035430908, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 139.359375, "epoch": 0.47852233676975947, "grad_norm": 1.607470492713128, "kl": 0.0281982421875, "learning_rate": 7.607388316151202e-07, "loss": 0.0011, "reward": 1.687252402305603, "reward_std": 0.16054657474160194, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.710689902305603, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 125.53125, "epoch": 0.4793814432989691, "grad_norm": 0.9757519931508962, "kl": 0.0283203125, "learning_rate": 7.603092783505154e-07, "loss": 0.0011, "reward": 1.8052751421928406, "reward_std": 0.05359087511897087, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8209001123905182, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 136.125, "epoch": 0.4802405498281787, "grad_norm": 1.657372250071319, "kl": 0.03369140625, "learning_rate": 7.598797250859106e-07, "loss": 0.0013, "reward": 1.6079121828079224, "reward_std": 0.22965000569820404, "rewards/format_reward_gen": 0.9140625, "rewards/llm_reward": 0.6938497424125671, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 129.34375, "epoch": 0.48109965635738833, "grad_norm": 1.3334146432657494, "kl": 0.0301513671875, "learning_rate": 7.594501718213058e-07, "loss": 0.0012, "reward": 1.724092185497284, "reward_std": 0.12364434078335762, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7631546556949615, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 107.3515625, "epoch": 0.48195876288659795, "grad_norm": 1.3103330552745027, "kl": 0.03033447265625, "learning_rate": 7.59020618556701e-07, "loss": 0.0012, "reward": 1.7695825695991516, "reward_std": 0.16746646538376808, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.8086450695991516, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 119.21875, "epoch": 0.48281786941580757, "grad_norm": 3.4007488158636634, "kl": 0.0413818359375, "learning_rate": 7.585910652920962e-07, "loss": 0.0017, "reward": 1.8109328150749207, "reward_std": 0.17322580516338348, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.8499953150749207, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 135.859375, "epoch": 0.4836769759450172, "grad_norm": 4.0862178779047085, "kl": 0.02703857421875, "learning_rate": 7.581615120274913e-07, "loss": 0.0011, "reward": 1.7667149305343628, "reward_std": 0.08758777007460594, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7667149305343628, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 122.5859375, "epoch": 0.4845360824742268, "grad_norm": 1.073396750076447, "kl": 0.0284423828125, "learning_rate": 7.577319587628865e-07, "loss": 0.0011, "reward": 1.7671363949775696, "reward_std": 0.10686640068888664, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.774948924779892, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 130.7109375, "epoch": 0.4853951890034364, "grad_norm": 1.6082673896426019, "kl": 0.02496337890625, "learning_rate": 7.573024054982817e-07, "loss": 0.001, "reward": 1.7323017120361328, "reward_std": 0.11195755004882812, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7479267120361328, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 122.03125, "epoch": 0.48625429553264604, "grad_norm": 1.3078742427842147, "kl": 0.02801513671875, "learning_rate": 7.568728522336769e-07, "loss": 0.0011, "reward": 1.7409408688545227, "reward_std": 0.11844996362924576, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7487533390522003, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 118.53125, "epoch": 0.48711340206185566, "grad_norm": 1.0128900337930118, "kl": 0.0277099609375, "learning_rate": 7.564432989690721e-07, "loss": 0.0011, "reward": 1.7667198181152344, "reward_std": 0.09644139185547829, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7745323181152344, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 101.28125, "epoch": 0.4879725085910653, "grad_norm": 1.2158578490452436, "kl": 0.0233154296875, "learning_rate": 7.560137457044673e-07, "loss": 0.0009, "reward": 1.8185061812400818, "reward_std": 0.07172945514321327, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8185062408447266, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 119.78125, "epoch": 0.4888316151202749, "grad_norm": 1.1584871999787036, "kl": 0.0294189453125, "learning_rate": 7.555841924398625e-07, "loss": 0.0012, "reward": 1.6857864260673523, "reward_std": 0.0634692870080471, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6935989856719971, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 119.8984375, "epoch": 0.4896907216494845, "grad_norm": 1.5831208673085706, "kl": 0.02850341796875, "learning_rate": 7.551546391752577e-07, "loss": 0.0011, "reward": 1.7977246046066284, "reward_std": 0.17054561525583267, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.805537074804306, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 120.5078125, "epoch": 0.49054982817869414, "grad_norm": 0.7928407917060947, "kl": 0.0238037109375, "learning_rate": 7.547250859106528e-07, "loss": 0.001, "reward": 1.7591076493263245, "reward_std": 0.0366393206641078, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7669200897216797, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 134.4609375, "epoch": 0.49140893470790376, "grad_norm": 0.8346365718982691, "kl": 0.02227783203125, "learning_rate": 7.54295532646048e-07, "loss": 0.0009, "reward": 1.826495349407196, "reward_std": 0.05793471448123455, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8264953792095184, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 114.6484375, "epoch": 0.49226804123711343, "grad_norm": 1.852024426080103, "kl": 0.021484375, "learning_rate": 7.538659793814432e-07, "loss": 0.0009, "reward": 1.8043755292892456, "reward_std": 0.18020495027303696, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.804375559091568, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 124.8203125, "epoch": 0.49312714776632305, "grad_norm": 26.47121113659565, "kl": 0.02374267578125, "learning_rate": 7.534364261168384e-07, "loss": 0.0009, "reward": 1.827117145061493, "reward_std": 0.10837876796722412, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8271171748638153, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 124.234375, "epoch": 0.49398625429553267, "grad_norm": 1.1310521518717915, "kl": 0.02288818359375, "learning_rate": 7.530068728522336e-07, "loss": 0.0009, "reward": 1.797199308872223, "reward_std": 0.10720982775092125, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8128242492675781, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 120.9453125, "epoch": 0.4948453608247423, "grad_norm": 1.8553971162459708, "kl": 0.02252197265625, "learning_rate": 7.525773195876288e-07, "loss": 0.0009, "reward": 1.79475736618042, "reward_std": 0.13686570525169373, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7947573065757751, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 135.5703125, "epoch": 0.4957044673539519, "grad_norm": 1.1152662401016404, "kl": 0.026611328125, "learning_rate": 7.52147766323024e-07, "loss": 0.0011, "reward": 1.858827292919159, "reward_std": 0.15026213228702545, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8666398227214813, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 129.15625, "epoch": 0.4965635738831615, "grad_norm": 0.7945279843022707, "kl": 0.0247802734375, "learning_rate": 7.517182130584192e-07, "loss": 0.001, "reward": 1.8583158254623413, "reward_std": 0.08704648166894913, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8739407956600189, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 128.015625, "epoch": 0.49742268041237114, "grad_norm": 0.8545181390908309, "kl": 0.02587890625, "learning_rate": 7.512886597938143e-07, "loss": 0.001, "reward": 1.794776201248169, "reward_std": 0.07284862734377384, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8025886416435242, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 130.828125, "epoch": 0.49828178694158076, "grad_norm": 5.702979853199694, "kl": 0.02679443359375, "learning_rate": 7.508591065292095e-07, "loss": 0.0011, "reward": 1.7567557096481323, "reward_std": 0.10015341639518738, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7567557394504547, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 143.1796875, "epoch": 0.4991408934707904, "grad_norm": 3.2034971522784796, "kl": 0.0230712890625, "learning_rate": 7.504295532646047e-07, "loss": 0.0009, "reward": 1.8063377737998962, "reward_std": 0.11063193529844284, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8063377737998962, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 129.734375, "epoch": 0.5, "grad_norm": 0.7129779180772821, "kl": 0.0211181640625, "learning_rate": 7.5e-07, "loss": 0.0008, "reward": 1.8224888443946838, "reward_std": 0.056552507914602757, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8303013443946838, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 133.0625, "epoch": 0.5008591065292096, "grad_norm": 2.78986309606567, "kl": 0.0185546875, "learning_rate": 7.495704467353952e-07, "loss": 0.0007, "reward": 1.854720413684845, "reward_std": 0.10461413115262985, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8547204732894897, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 133.1484375, "epoch": 0.5017182130584192, "grad_norm": 1.0824827966215953, "kl": 0.02435302734375, "learning_rate": 7.491408934707904e-07, "loss": 0.001, "reward": 1.7764517068862915, "reward_std": 0.10467999801039696, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7920766472816467, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 128.765625, "epoch": 0.5025773195876289, "grad_norm": 1.252463130344048, "kl": 0.02105712890625, "learning_rate": 7.487113402061856e-07, "loss": 0.0008, "reward": 1.8154524564743042, "reward_std": 0.1660311445593834, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8310775756835938, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 119.734375, "epoch": 0.5034364261168385, "grad_norm": 0.9304730717779914, "kl": 0.023681640625, "learning_rate": 7.482817869415808e-07, "loss": 0.0009, "reward": 1.8320194482803345, "reward_std": 0.07787203788757324, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8398319780826569, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 151.5703125, "epoch": 0.5042955326460481, "grad_norm": 5.260833868032736, "kl": 0.0369873046875, "learning_rate": 7.47852233676976e-07, "loss": 0.0015, "reward": 1.7589982748031616, "reward_std": 0.14544661343097687, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.774623304605484, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 132.953125, "epoch": 0.5051546391752577, "grad_norm": 1.2660610390516958, "kl": 0.031005859375, "learning_rate": 7.474226804123711e-07, "loss": 0.0012, "reward": 1.822842538356781, "reward_std": 0.13162581250071526, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8306550085544586, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 158.7578125, "epoch": 0.5060137457044673, "grad_norm": 0.9382817085600501, "kl": 0.0223388671875, "learning_rate": 7.469931271477663e-07, "loss": 0.0009, "reward": 1.7842236161231995, "reward_std": 0.11218691617250443, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7842236161231995, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 141.609375, "epoch": 0.506872852233677, "grad_norm": 1.5108030501592749, "kl": 0.0247802734375, "learning_rate": 7.465635738831615e-07, "loss": 0.001, "reward": 1.8379915952682495, "reward_std": 0.11418896913528442, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8458041250705719, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 131.6640625, "epoch": 0.5077319587628866, "grad_norm": 0.9756789130203551, "kl": 0.02655029296875, "learning_rate": 7.461340206185567e-07, "loss": 0.0011, "reward": 1.8145712018013, "reward_std": 0.09588093496859074, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8145712018013, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 136.1953125, "epoch": 0.5085910652920962, "grad_norm": 1.78737635682799, "kl": 0.02001953125, "learning_rate": 7.457044673539519e-07, "loss": 0.0008, "reward": 1.7849004864692688, "reward_std": 0.09064675122499466, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7927128970623016, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 130.0234375, "epoch": 0.5094501718213058, "grad_norm": 1.5323474545376934, "kl": 0.0244140625, "learning_rate": 7.452749140893471e-07, "loss": 0.001, "reward": 1.8163361549377441, "reward_std": 0.06137897726148367, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8163362145423889, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 146.65625, "epoch": 0.5103092783505154, "grad_norm": 1.167714380864763, "kl": 0.02984619140625, "learning_rate": 7.448453608247423e-07, "loss": 0.0012, "reward": 1.7876654863357544, "reward_std": 0.1253710240125656, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7954780161380768, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 133.375, "epoch": 0.511168384879725, "grad_norm": 1.2346413570984562, "kl": 0.0233154296875, "learning_rate": 7.444158075601375e-07, "loss": 0.0009, "reward": 1.7417802214622498, "reward_std": 0.1481439284980297, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7417803108692169, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 140.828125, "epoch": 0.5120274914089347, "grad_norm": 1.4208055330522806, "kl": 0.0283203125, "learning_rate": 7.439862542955327e-07, "loss": 0.0011, "reward": 1.7719964981079102, "reward_std": 0.15724200010299683, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7798090577125549, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 144.71875, "epoch": 0.5128865979381443, "grad_norm": 3.920082332866134, "kl": 0.02557373046875, "learning_rate": 7.435567010309278e-07, "loss": 0.001, "reward": 1.7663773894309998, "reward_std": 0.08243504166603088, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7663773894309998, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 137.40625, "epoch": 0.5137457044673539, "grad_norm": 1.2860662352628833, "kl": 0.02374267578125, "learning_rate": 7.43127147766323e-07, "loss": 0.0009, "reward": 1.7066620588302612, "reward_std": 0.06952378898859024, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7144745290279388, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 140.2265625, "epoch": 0.5146048109965635, "grad_norm": 0.9580097497182469, "kl": 0.021728515625, "learning_rate": 7.426975945017182e-07, "loss": 0.0009, "reward": 1.7517217993736267, "reward_std": 0.09551467373967171, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7517217695713043, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 153.5625, "epoch": 0.5154639175257731, "grad_norm": 1.8465816930490027, "kl": 0.0545654296875, "learning_rate": 7.422680412371134e-07, "loss": 0.0022, "reward": 1.711679220199585, "reward_std": 0.10108717158436775, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7194916903972626, "step": 600 } ], "logging_steps": 1.0, "max_steps": 2328, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }