{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.218430034129693, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 21.971354484558105, "epoch": 0.0034129692832764505, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.997155858930603e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward": 0.0, "rewards/score_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 21.5390625, "epoch": 0.006825938566552901, "grad_norm": 0.2198824607387467, "kl": 0.0, "learning_rate": 9.994311717861205e-07, "loss": -0.0, "reward": 0.0026041667442768812, "reward_std": 0.0073656952008605, "rewards/format_reward": 0.0, "rewards/score_reward": 0.0026041667442768812, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 28.151042938232422, "epoch": 0.010238907849829351, "grad_norm": 0.787836483544367, "kl": 0.00017499923706054688, "learning_rate": 9.991467576791809e-07, "loss": 0.0, "reward": 0.015625, "reward_std": 0.025392480194568634, "rewards/format_reward": 0.0026041667442768812, "rewards/score_reward": 0.013020833022892475, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 29.054688453674316, "epoch": 0.013651877133105802, "grad_norm": 1.6367635058055439, "kl": 0.00067138671875, "learning_rate": 9.98862343572241e-07, "loss": 0.0, "reward": 0.026041666977107525, "reward_std": 0.03629593923687935, "rewards/format_reward": 0.0, "rewards/score_reward": 0.026041666977107525, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 53.838544845581055, "epoch": 0.017064846416382253, "grad_norm": 2.131864681112454, "kl": 0.00830078125, "learning_rate": 9.985779294653014e-07, "loss": 0.0, "reward": 0.0572916679084301, "reward_std": 0.10535168647766113, "rewards/format_reward": 0.0052083334885537624, "rewards/score_reward": 0.05208333395421505, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 95.69271087646484, "epoch": 0.020477815699658702, "grad_norm": 1.447709833366231, "kl": 0.0274658203125, "learning_rate": 9.982935153583618e-07, "loss": 0.0, "reward": 0.1328124962747097, "reward_std": 0.22811448574066162, "rewards/format_reward": 0.0078125, "rewards/score_reward": 0.1249999962747097, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 153.60678100585938, "epoch": 0.023890784982935155, "grad_norm": 2.101485582698274, "kl": 0.0555419921875, "learning_rate": 9.98009101251422e-07, "loss": 0.0001, "reward": 0.486328125, "reward_std": 0.48113366961479187, "rewards/format_reward": 0.2942708432674408, "rewards/score_reward": 0.1920572966337204, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 159.91928100585938, "epoch": 0.027303754266211604, "grad_norm": 1.5641788133949739, "kl": 0.0467529296875, "learning_rate": 9.977246871444823e-07, "loss": 0.0, "reward": 0.5533854067325592, "reward_std": 0.48387788236141205, "rewards/format_reward": 0.296875, "rewards/score_reward": 0.2565104141831398, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 153.76563262939453, "epoch": 0.030716723549488054, "grad_norm": 2.0837768296761614, "kl": 0.052490234375, "learning_rate": 9.974402730375427e-07, "loss": 0.0001, "reward": 0.7272135317325592, "reward_std": 0.5450011193752289, "rewards/format_reward": 0.4609375, "rewards/score_reward": 0.2662760466337204, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 159.0572967529297, "epoch": 0.034129692832764506, "grad_norm": 2.079138271764708, "kl": 0.0654296875, "learning_rate": 9.97155858930603e-07, "loss": 0.0001, "reward": 0.9049479067325592, "reward_std": 0.536494106054306, "rewards/format_reward": 0.6588541865348816, "rewards/score_reward": 0.24609375, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 169.71615600585938, "epoch": 0.03754266211604096, "grad_norm": 2.196380604681797, "kl": 0.06640625, "learning_rate": 9.968714448236632e-07, "loss": 0.0001, "reward": 1.091796875, "reward_std": 0.5734454393386841, "rewards/format_reward": 0.796875, "rewards/score_reward": 0.294921875, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 175.4166717529297, "epoch": 0.040955631399317405, "grad_norm": 1.5751945059420034, "kl": 0.06884765625, "learning_rate": 9.965870307167234e-07, "loss": 0.0001, "reward": 1.1197916269302368, "reward_std": 0.49553531408309937, "rewards/format_reward": 0.8619791865348816, "rewards/score_reward": 0.2578125, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 180.9765625, "epoch": 0.04436860068259386, "grad_norm": 1.3356831569538974, "kl": 0.066650390625, "learning_rate": 9.963026166097837e-07, "loss": 0.0001, "reward": 1.0397135615348816, "reward_std": 0.47830362617969513, "rewards/format_reward": 0.8489583432674408, "rewards/score_reward": 0.1907552108168602, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 178.87500762939453, "epoch": 0.04778156996587031, "grad_norm": 1.8722112490609648, "kl": 0.07470703125, "learning_rate": 9.960182025028441e-07, "loss": 0.0001, "reward": 1.201171875, "reward_std": 0.448482409119606, "rewards/format_reward": 0.9036458432674408, "rewards/score_reward": 0.2975260466337204, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 174.02344512939453, "epoch": 0.051194539249146756, "grad_norm": 1.137542954341904, "kl": 0.07177734375, "learning_rate": 9.957337883959045e-07, "loss": 0.0001, "reward": 1.19921875, "reward_std": 0.40944164991378784, "rewards/format_reward": 0.9244791865348816, "rewards/score_reward": 0.2747395858168602, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 158.71875, "epoch": 0.05460750853242321, "grad_norm": 1.7382890655037977, "kl": 0.0849609375, "learning_rate": 9.954493742889646e-07, "loss": 0.0001, "reward": 1.30078125, "reward_std": 0.38683652877807617, "rewards/format_reward": 0.953125, "rewards/score_reward": 0.34765625, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 146.5572967529297, "epoch": 0.05802047781569966, "grad_norm": 1.1691379786391385, "kl": 0.087646484375, "learning_rate": 9.95164960182025e-07, "loss": 0.0001, "reward": 1.3307291865348816, "reward_std": 0.3555728495121002, "rewards/format_reward": 0.9765625, "rewards/score_reward": 0.3541666716337204, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 148.00782012939453, "epoch": 0.06143344709897611, "grad_norm": 1.2111523809976827, "kl": 0.0869140625, "learning_rate": 9.948805460750854e-07, "loss": 0.0001, "reward": 1.3190103769302368, "reward_std": 0.36138303577899933, "rewards/format_reward": 0.9661458432674408, "rewards/score_reward": 0.3528645783662796, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 138.6171875, "epoch": 0.06484641638225255, "grad_norm": 1.1521581757387094, "kl": 0.091552734375, "learning_rate": 9.945961319681455e-07, "loss": 0.0001, "reward": 1.2825520634651184, "reward_std": 0.32621656358242035, "rewards/format_reward": 0.9791666567325592, "rewards/score_reward": 0.3033854216337204, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 130.3072967529297, "epoch": 0.06825938566552901, "grad_norm": 1.646785373167282, "kl": 0.09619140625, "learning_rate": 9.94311717861206e-07, "loss": 0.0001, "reward": 1.3118489384651184, "reward_std": 0.3204665184020996, "rewards/format_reward": 0.9921875, "rewards/score_reward": 0.3196614682674408, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 123.89323043823242, "epoch": 0.07167235494880546, "grad_norm": 1.4562297536059283, "kl": 0.101806640625, "learning_rate": 9.94027303754266e-07, "loss": 0.0001, "reward": 1.2877604365348816, "reward_std": 0.25049570947885513, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.2929687425494194, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 116.96875, "epoch": 0.07508532423208192, "grad_norm": 1.518881868528844, "kl": 0.1142578125, "learning_rate": 9.937428896473265e-07, "loss": 0.0001, "reward": 1.3932291269302368, "reward_std": 0.3011336326599121, "rewards/format_reward": 0.9921875, "rewards/score_reward": 0.4010416716337204, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 111.80208587646484, "epoch": 0.07849829351535836, "grad_norm": 1.2084754088310832, "kl": 0.120361328125, "learning_rate": 9.934584755403868e-07, "loss": 0.0001, "reward": 1.3359375, "reward_std": 0.2880899906158447, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.3411458283662796, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 107.60937881469727, "epoch": 0.08191126279863481, "grad_norm": 1.469309030422537, "kl": 0.125244140625, "learning_rate": 9.93174061433447e-07, "loss": 0.0001, "reward": 1.4876301884651184, "reward_std": 0.33191148936748505, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.4902343899011612, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 107.38021087646484, "epoch": 0.08532423208191127, "grad_norm": 1.2610045699912085, "kl": 0.1259765625, "learning_rate": 9.928896473265074e-07, "loss": 0.0001, "reward": 1.349609375, "reward_std": 0.3074036240577698, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.3522135466337204, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 104.21354293823242, "epoch": 0.08873720136518772, "grad_norm": 2.3296176309100676, "kl": 0.12939453125, "learning_rate": 9.926052332195677e-07, "loss": 0.0001, "reward": 1.408203125, "reward_std": 0.30291494727134705, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.4108072966337204, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 102.27083587646484, "epoch": 0.09215017064846416, "grad_norm": 1.3840211312213955, "kl": 0.1396484375, "learning_rate": 9.923208191126279e-07, "loss": 0.0001, "reward": 1.470703125, "reward_std": 0.2888399064540863, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.4733072966337204, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 101.04166793823242, "epoch": 0.09556313993174062, "grad_norm": 2.2728847711048825, "kl": 0.1416015625, "learning_rate": 9.920364050056883e-07, "loss": 0.0001, "reward": 1.3743489384651184, "reward_std": 0.25145185738801956, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.3795572966337204, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 102.47916793823242, "epoch": 0.09897610921501707, "grad_norm": 1.6317672882764709, "kl": 0.17529296875, "learning_rate": 9.917519908987484e-07, "loss": 0.0002, "reward": 1.4661458134651184, "reward_std": 0.3163784444332123, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4661458283662796, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 102.90625, "epoch": 0.10238907849829351, "grad_norm": 1.5008022569026451, "kl": 0.14697265625, "learning_rate": 9.914675767918088e-07, "loss": 0.0001, "reward": 1.4075521230697632, "reward_std": 0.2981177270412445, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.41015625, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 101.65364837646484, "epoch": 0.10580204778156997, "grad_norm": 1.4348573370987756, "kl": 0.15673828125, "learning_rate": 9.911831626848692e-07, "loss": 0.0002, "reward": 1.5143229365348816, "reward_std": 0.28444378823041916, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.5195312649011612, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 100.58854675292969, "epoch": 0.10921501706484642, "grad_norm": 2.533004174069148, "kl": 0.14599609375, "learning_rate": 9.908987485779293e-07, "loss": 0.0001, "reward": 1.5143229365348816, "reward_std": 0.31976868212223053, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5169270783662796, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 99.74479293823242, "epoch": 0.11262798634812286, "grad_norm": 1.5439128748817215, "kl": 0.14990234375, "learning_rate": 9.906143344709897e-07, "loss": 0.0001, "reward": 1.3411458134651184, "reward_std": 0.23153312504291534, "rewards/format_reward": 1.0, "rewards/score_reward": 0.3411458283662796, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 100.52344131469727, "epoch": 0.11604095563139932, "grad_norm": 1.0615367399747022, "kl": 0.14306640625, "learning_rate": 9.9032992036405e-07, "loss": 0.0001, "reward": 1.4010416269302368, "reward_std": 0.2429962158203125, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4010416567325592, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 98.85677337646484, "epoch": 0.11945392491467577, "grad_norm": 1.5958807232745575, "kl": 0.14697265625, "learning_rate": 9.900455062571105e-07, "loss": 0.0001, "reward": 1.4596353769302368, "reward_std": 0.25090838968753815, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.4622395783662796, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 96.08594131469727, "epoch": 0.12286689419795221, "grad_norm": 1.187241518222232, "kl": 0.16552734375, "learning_rate": 9.897610921501706e-07, "loss": 0.0002, "reward": 1.396484375, "reward_std": 0.23251446336507797, "rewards/format_reward": 1.0, "rewards/score_reward": 0.396484375, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 97.07812881469727, "epoch": 0.12627986348122866, "grad_norm": 1.1430310787136027, "kl": 0.1728515625, "learning_rate": 9.894766780432308e-07, "loss": 0.0002, "reward": 1.5149739384651184, "reward_std": 0.271073117852211, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5175781100988388, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 97.89583587646484, "epoch": 0.1296928327645051, "grad_norm": 1.8198382482141502, "kl": 0.15087890625, "learning_rate": 9.891922639362911e-07, "loss": 0.0002, "reward": 1.353515625, "reward_std": 0.23891131579875946, "rewards/format_reward": 1.0, "rewards/score_reward": 0.353515625, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 96.8125, "epoch": 0.13310580204778158, "grad_norm": 6.498361170046529, "kl": 0.16259765625, "learning_rate": 9.889078498293515e-07, "loss": 0.0002, "reward": 1.3470051884651184, "reward_std": 0.28532542288303375, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.349609375, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 95.54427337646484, "epoch": 0.13651877133105803, "grad_norm": 1.4265999966206027, "kl": 0.16748046875, "learning_rate": 9.886234357224119e-07, "loss": 0.0002, "reward": 1.41796875, "reward_std": 0.2189825400710106, "rewards/format_reward": 1.0, "rewards/score_reward": 0.41796875, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 97.35937881469727, "epoch": 0.13993174061433447, "grad_norm": 2.6340856151833765, "kl": 0.1787109375, "learning_rate": 9.88339021615472e-07, "loss": 0.0002, "reward": 1.439453125, "reward_std": 0.2565288618206978, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4394531399011612, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 95.61198425292969, "epoch": 0.14334470989761092, "grad_norm": 7.407382448607371, "kl": 0.16259765625, "learning_rate": 9.880546075085324e-07, "loss": 0.0002, "reward": 1.4641926884651184, "reward_std": 0.21369559317827225, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4641927033662796, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 94.28125381469727, "epoch": 0.14675767918088736, "grad_norm": 2.128461007299376, "kl": 0.166015625, "learning_rate": 9.877701934015926e-07, "loss": 0.0002, "reward": 1.478515625, "reward_std": 0.18956902623176575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4785156100988388, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 92.77864837646484, "epoch": 0.15017064846416384, "grad_norm": 1.1602637332040315, "kl": 0.17529296875, "learning_rate": 9.87485779294653e-07, "loss": 0.0002, "reward": 1.337890625, "reward_std": 0.1985834315419197, "rewards/format_reward": 1.0, "rewards/score_reward": 0.337890625, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 93.17448043823242, "epoch": 0.15358361774744028, "grad_norm": 1.6993816231587195, "kl": 0.18359375, "learning_rate": 9.872013651877133e-07, "loss": 0.0002, "reward": 1.41796875, "reward_std": 0.1657150462269783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.41796875, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 94.15104675292969, "epoch": 0.15699658703071673, "grad_norm": 1.4198105304826467, "kl": 0.18603515625, "learning_rate": 9.869169510807735e-07, "loss": 0.0002, "reward": 1.408203125, "reward_std": 0.21997249871492386, "rewards/format_reward": 1.0, "rewards/score_reward": 0.408203125, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 90.890625, "epoch": 0.16040955631399317, "grad_norm": 1.3926919501646233, "kl": 0.18408203125, "learning_rate": 9.866325369738339e-07, "loss": 0.0002, "reward": 1.33203125, "reward_std": 0.18587756156921387, "rewards/format_reward": 1.0, "rewards/score_reward": 0.33203125, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 89.61458587646484, "epoch": 0.16382252559726962, "grad_norm": 1.0171234746158861, "kl": 0.1826171875, "learning_rate": 9.863481228668942e-07, "loss": 0.0002, "reward": 1.419921875, "reward_std": 0.21101392805576324, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.4225260317325592, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 86.98177337646484, "epoch": 0.16723549488054607, "grad_norm": 1.8923475810855548, "kl": 0.1904296875, "learning_rate": 9.860637087599544e-07, "loss": 0.0002, "reward": 1.431640625, "reward_std": 0.2544175982475281, "rewards/format_reward": 1.0, "rewards/score_reward": 0.431640625, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 88.98437881469727, "epoch": 0.17064846416382254, "grad_norm": 0.9638448434064062, "kl": 0.18798828125, "learning_rate": 9.857792946530148e-07, "loss": 0.0002, "reward": 1.5110676884651184, "reward_std": 0.21133490651845932, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5110677182674408, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 88.40364837646484, "epoch": 0.17406143344709898, "grad_norm": 1.459330588937527, "kl": 0.193359375, "learning_rate": 9.85494880546075e-07, "loss": 0.0002, "reward": 1.4765625, "reward_std": 0.27866625785827637, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4765625, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 87.35937881469727, "epoch": 0.17747440273037543, "grad_norm": 1.3781789319630122, "kl": 0.19189453125, "learning_rate": 9.852104664391353e-07, "loss": 0.0002, "reward": 1.41796875, "reward_std": 0.1655370444059372, "rewards/format_reward": 1.0, "rewards/score_reward": 0.41796875, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 88.72135543823242, "epoch": 0.18088737201365188, "grad_norm": 1.258241309353634, "kl": 0.1953125, "learning_rate": 9.849260523321957e-07, "loss": 0.0002, "reward": 1.4889323115348816, "reward_std": 0.1946164071559906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4889322817325592, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 85.86719131469727, "epoch": 0.18430034129692832, "grad_norm": 1.0001485037285975, "kl": 0.18896484375, "learning_rate": 9.846416382252558e-07, "loss": 0.0002, "reward": 1.408203125, "reward_std": 0.20008864998817444, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4082031100988388, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 87.39062881469727, "epoch": 0.18771331058020477, "grad_norm": 1.510360886565899, "kl": 0.1875, "learning_rate": 9.843572241183162e-07, "loss": 0.0002, "reward": 1.5013020634651184, "reward_std": 0.2293265089392662, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5013020783662796, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 88.15104293823242, "epoch": 0.19112627986348124, "grad_norm": 0.9878606818293397, "kl": 0.19384765625, "learning_rate": 9.840728100113766e-07, "loss": 0.0002, "reward": 1.5104166865348816, "reward_std": 0.18821671605110168, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5104166716337204, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 82.29166793823242, "epoch": 0.1945392491467577, "grad_norm": 1.2736010095046053, "kl": 0.20654296875, "learning_rate": 9.83788395904437e-07, "loss": 0.0002, "reward": 1.4674479365348816, "reward_std": 0.21555245667696, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4674479216337204, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 85.27604293823242, "epoch": 0.19795221843003413, "grad_norm": 4.423957365231371, "kl": 0.2509765625, "learning_rate": 9.835039817974971e-07, "loss": 0.0003, "reward": 1.3997395634651184, "reward_std": 0.29705318808555603, "rewards/format_reward": 1.0, "rewards/score_reward": 0.3997395783662796, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 83.05469131469727, "epoch": 0.20136518771331058, "grad_norm": 1.2457838962541887, "kl": 0.21337890625, "learning_rate": 9.832195676905573e-07, "loss": 0.0002, "reward": 1.4772135615348816, "reward_std": 0.2145705297589302, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4772135466337204, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 84.57812881469727, "epoch": 0.20477815699658702, "grad_norm": 1.6942530277320655, "kl": 0.208984375, "learning_rate": 9.829351535836176e-07, "loss": 0.0002, "reward": 1.404296875, "reward_std": 0.21944832801818848, "rewards/format_reward": 1.0, "rewards/score_reward": 0.404296875, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 83.76302337646484, "epoch": 0.20819112627986347, "grad_norm": 1.3401680767486719, "kl": 0.291015625, "learning_rate": 9.82650739476678e-07, "loss": 0.0003, "reward": 1.3912760615348816, "reward_std": 0.25876037776470184, "rewards/format_reward": 1.0, "rewards/score_reward": 0.3912760466337204, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 83.80208587646484, "epoch": 0.21160409556313994, "grad_norm": 1.2715920520394912, "kl": 0.21240234375, "learning_rate": 9.823663253697384e-07, "loss": 0.0002, "reward": 1.4296875, "reward_std": 0.23293594270944595, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4296875, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 81.88542175292969, "epoch": 0.2150170648464164, "grad_norm": 1.4414384221346033, "kl": 0.21728515625, "learning_rate": 9.820819112627986e-07, "loss": 0.0002, "reward": 1.421875, "reward_std": 0.2545352503657341, "rewards/format_reward": 1.0, "rewards/score_reward": 0.421875, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 85.46614837646484, "epoch": 0.21843003412969283, "grad_norm": 3.543996299314846, "kl": 0.2080078125, "learning_rate": 9.81797497155859e-07, "loss": 0.0002, "reward": 1.43359375, "reward_std": 0.22249317914247513, "rewards/format_reward": 1.0, "rewards/score_reward": 0.43359375, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 83.62760543823242, "epoch": 0.22184300341296928, "grad_norm": 8.657662832189583, "kl": 0.21923828125, "learning_rate": 9.815130830489193e-07, "loss": 0.0002, "reward": 1.4602864384651184, "reward_std": 0.19201908260583878, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4602864533662796, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 87.90364837646484, "epoch": 0.22525597269624573, "grad_norm": 1.4939361587139874, "kl": 0.19921875, "learning_rate": 9.812286689419795e-07, "loss": 0.0002, "reward": 1.4811198115348816, "reward_std": 0.2641155272722244, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4811197966337204, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 86.35677337646484, "epoch": 0.22866894197952217, "grad_norm": 1.230410078832553, "kl": 0.2294921875, "learning_rate": 9.809442548350398e-07, "loss": 0.0002, "reward": 1.455078125, "reward_std": 0.2112826257944107, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4550781399011612, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 86.68229293823242, "epoch": 0.23208191126279865, "grad_norm": 2.1075016311206554, "kl": 0.21484375, "learning_rate": 9.806598407281e-07, "loss": 0.0002, "reward": 1.4544270634651184, "reward_std": 0.24234450608491898, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4544270783662796, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 84.5859375, "epoch": 0.2354948805460751, "grad_norm": 2.1278949645354968, "kl": 0.2216796875, "learning_rate": 9.803754266211604e-07, "loss": 0.0002, "reward": 1.47265625, "reward_std": 0.2958555817604065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.47265625, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 87.61458587646484, "epoch": 0.23890784982935154, "grad_norm": 1.3053415329457876, "kl": 0.22412109375, "learning_rate": 9.800910125142207e-07, "loss": 0.0002, "reward": 1.4817708134651184, "reward_std": 0.2614116221666336, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4817708283662796, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 90.40885925292969, "epoch": 0.24232081911262798, "grad_norm": 5.618126794000786, "kl": 0.2001953125, "learning_rate": 9.79806598407281e-07, "loss": 0.0002, "reward": 1.556640625, "reward_std": 0.26630353927612305, "rewards/format_reward": 1.0, "rewards/score_reward": 0.556640625, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 91.76041793823242, "epoch": 0.24573378839590443, "grad_norm": 1.3325557145608604, "kl": 0.1953125, "learning_rate": 9.795221843003413e-07, "loss": 0.0002, "reward": 1.486328125, "reward_std": 0.2584661543369293, "rewards/format_reward": 1.0, "rewards/score_reward": 0.486328125, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 94.73958587646484, "epoch": 0.24914675767918087, "grad_norm": 1.2964609325578282, "kl": 0.208984375, "learning_rate": 9.792377701934016e-07, "loss": 0.0002, "reward": 1.4752604365348816, "reward_std": 0.22054015845060349, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4752604067325592, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 92.84375381469727, "epoch": 0.2525597269624573, "grad_norm": 1.4426874016460878, "kl": 0.2119140625, "learning_rate": 9.789533560864618e-07, "loss": 0.0002, "reward": 1.4329426884651184, "reward_std": 0.27699775248765945, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4329427033662796, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 95.60417175292969, "epoch": 0.25597269624573377, "grad_norm": 1.9055725919946906, "kl": 0.2490234375, "learning_rate": 9.786689419795222e-07, "loss": 0.0002, "reward": 1.4557291865348816, "reward_std": 0.18950803577899933, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4557291716337204, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 96.62500381469727, "epoch": 0.2593856655290102, "grad_norm": 1.4716516940149593, "kl": 0.203125, "learning_rate": 9.783845278725823e-07, "loss": 0.0002, "reward": 1.4654948115348816, "reward_std": 0.26829979568719864, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4654947817325592, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 94.51823043823242, "epoch": 0.2627986348122867, "grad_norm": 1.3012314238140408, "kl": 0.212890625, "learning_rate": 9.781001137656427e-07, "loss": 0.0002, "reward": 1.4934895634651184, "reward_std": 0.22228141874074936, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4934895932674408, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 96.21875381469727, "epoch": 0.26621160409556316, "grad_norm": 1.4233240440877817, "kl": 0.2392578125, "learning_rate": 9.77815699658703e-07, "loss": 0.0002, "reward": 1.39453125, "reward_std": 0.23518333584070206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.39453125, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 97.16406631469727, "epoch": 0.2696245733788396, "grad_norm": 1.1910030740669966, "kl": 0.21240234375, "learning_rate": 9.775312855517632e-07, "loss": 0.0002, "reward": 1.4518228769302368, "reward_std": 0.2141050398349762, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4518229067325592, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 94.85677337646484, "epoch": 0.27303754266211605, "grad_norm": 6.7239694547329565, "kl": 0.21337890625, "learning_rate": 9.772468714448236e-07, "loss": 0.0002, "reward": 1.4889323115348816, "reward_std": 0.22330933809280396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4889322966337204, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 93.41666793823242, "epoch": 0.2764505119453925, "grad_norm": 1.4918052682533274, "kl": 0.232421875, "learning_rate": 9.76962457337884e-07, "loss": 0.0002, "reward": 1.40234375, "reward_std": 0.27006910741329193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.40234375, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 93.98698425292969, "epoch": 0.27986348122866894, "grad_norm": 1.054646710437264, "kl": 0.2197265625, "learning_rate": 9.766780432309441e-07, "loss": 0.0002, "reward": 1.3736979365348816, "reward_std": 0.18724806606769562, "rewards/format_reward": 1.0, "rewards/score_reward": 0.3736979216337204, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 96.83594131469727, "epoch": 0.2832764505119454, "grad_norm": 1.3098939605528006, "kl": 0.2294921875, "learning_rate": 9.763936291240045e-07, "loss": 0.0002, "reward": 1.5169270634651184, "reward_std": 0.22978021204471588, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5169270783662796, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 97.91927337646484, "epoch": 0.28668941979522183, "grad_norm": 1.0594563297083222, "kl": 0.20751953125, "learning_rate": 9.761092150170647e-07, "loss": 0.0002, "reward": 1.5143229365348816, "reward_std": 0.19752653688192368, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5143229067325592, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 88.18489837646484, "epoch": 0.2901023890784983, "grad_norm": 1.0567447286929676, "kl": 0.24169921875, "learning_rate": 9.75824800910125e-07, "loss": 0.0002, "reward": 1.51171875, "reward_std": 0.2275291010737419, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5117187350988388, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 92.42448043823242, "epoch": 0.2935153583617747, "grad_norm": 1.3367332783575792, "kl": 0.22607421875, "learning_rate": 9.755403868031854e-07, "loss": 0.0002, "reward": 1.451171875, "reward_std": 0.23949608951807022, "rewards/format_reward": 1.0, "rewards/score_reward": 0.451171875, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 90.80208587646484, "epoch": 0.29692832764505117, "grad_norm": 1.0711138224113856, "kl": 0.2607421875, "learning_rate": 9.752559726962458e-07, "loss": 0.0003, "reward": 1.4498698115348816, "reward_std": 0.20927658677101135, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4498697966337204, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 89.32552337646484, "epoch": 0.3003412969283277, "grad_norm": 2.215100584913107, "kl": 0.24169921875, "learning_rate": 9.74971558589306e-07, "loss": 0.0002, "reward": 1.4147135615348816, "reward_std": 0.15555834025144577, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4147135466337204, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 88.69791793823242, "epoch": 0.3037542662116041, "grad_norm": 1.1059504035789098, "kl": 0.267578125, "learning_rate": 9.746871444823663e-07, "loss": 0.0003, "reward": 1.4270833134651184, "reward_std": 0.17468806356191635, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4270833283662796, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 87.40885925292969, "epoch": 0.30716723549488056, "grad_norm": 1.161549567378809, "kl": 0.2568359375, "learning_rate": 9.744027303754265e-07, "loss": 0.0003, "reward": 1.5260416865348816, "reward_std": 0.20608416944742203, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5260416716337204, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 86.98698043823242, "epoch": 0.310580204778157, "grad_norm": 1.9261382164783556, "kl": 0.24267578125, "learning_rate": 9.741183162684869e-07, "loss": 0.0002, "reward": 1.5462239384651184, "reward_std": 0.227963425219059, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5462239682674408, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 85.35416793823242, "epoch": 0.31399317406143346, "grad_norm": 1.3364913480642067, "kl": 0.25537109375, "learning_rate": 9.738339021615472e-07, "loss": 0.0003, "reward": 1.455078125, "reward_std": 0.2189953550696373, "rewards/format_reward": 1.0, "rewards/score_reward": 0.455078125, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 89.14062881469727, "epoch": 0.3174061433447099, "grad_norm": 1.112317167539996, "kl": 0.255859375, "learning_rate": 9.735494880546074e-07, "loss": 0.0003, "reward": 1.4479166865348816, "reward_std": 0.17169290035963058, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4479166716337204, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 87.92448043823242, "epoch": 0.32081911262798635, "grad_norm": 2.737827037902838, "kl": 0.2734375, "learning_rate": 9.732650739476678e-07, "loss": 0.0003, "reward": 1.5377603769302368, "reward_std": 0.15655027329921722, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5377604216337204, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 90.6640625, "epoch": 0.3242320819112628, "grad_norm": 2.011022437795498, "kl": 0.2509765625, "learning_rate": 9.729806598407281e-07, "loss": 0.0003, "reward": 1.447265625, "reward_std": 0.21784657984972, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4472656100988388, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 91.98958587646484, "epoch": 0.32764505119453924, "grad_norm": 1.4132733860642261, "kl": 0.22998046875, "learning_rate": 9.726962457337883e-07, "loss": 0.0002, "reward": 1.578125, "reward_std": 0.20498556643724442, "rewards/format_reward": 1.0, "rewards/score_reward": 0.578125, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 88.40625381469727, "epoch": 0.3310580204778157, "grad_norm": 1.0673613453486381, "kl": 0.26953125, "learning_rate": 9.724118316268487e-07, "loss": 0.0003, "reward": 1.4583333134651184, "reward_std": 0.21532084792852402, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4583333283662796, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 92.02604293823242, "epoch": 0.33447098976109213, "grad_norm": 1.2641804890901798, "kl": 0.23583984375, "learning_rate": 9.721274175199088e-07, "loss": 0.0002, "reward": 1.4954426884651184, "reward_std": 0.2019774094223976, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4954427182674408, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 92.20833587646484, "epoch": 0.3378839590443686, "grad_norm": 1.4367368900005992, "kl": 0.24951171875, "learning_rate": 9.718430034129692e-07, "loss": 0.0002, "reward": 1.5045573115348816, "reward_std": 0.20538997650146484, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5045572966337204, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 92.55989837646484, "epoch": 0.3412969283276451, "grad_norm": 3.3739020676975233, "kl": 0.251953125, "learning_rate": 9.715585893060296e-07, "loss": 0.0003, "reward": 1.37890625, "reward_std": 0.19987022131681442, "rewards/format_reward": 1.0, "rewards/score_reward": 0.37890625, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 96.02604293823242, "epoch": 0.3447098976109215, "grad_norm": 1.1489137303721118, "kl": 0.21728515625, "learning_rate": 9.712741751990897e-07, "loss": 0.0002, "reward": 1.4609375, "reward_std": 0.18004252761602402, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4609375, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 95.23177337646484, "epoch": 0.34812286689419797, "grad_norm": 1.3294360980615194, "kl": 0.22216796875, "learning_rate": 9.709897610921501e-07, "loss": 0.0002, "reward": 1.408203125, "reward_std": 0.2031831070780754, "rewards/format_reward": 1.0, "rewards/score_reward": 0.408203125, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 95.85156631469727, "epoch": 0.3515358361774744, "grad_norm": 2.715785100356974, "kl": 0.21728515625, "learning_rate": 9.707053469852105e-07, "loss": 0.0002, "reward": 1.46484375, "reward_std": 0.15291283279657364, "rewards/format_reward": 1.0, "rewards/score_reward": 0.46484375, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 97.30208587646484, "epoch": 0.35494880546075086, "grad_norm": 1.4623107562310147, "kl": 0.228515625, "learning_rate": 9.704209328782709e-07, "loss": 0.0002, "reward": 1.5208333134651184, "reward_std": 0.22833144664764404, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5234375, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 97.16406631469727, "epoch": 0.3583617747440273, "grad_norm": 1.316911040927373, "kl": 0.21923828125, "learning_rate": 9.70136518771331e-07, "loss": 0.0002, "reward": 1.458984375, "reward_std": 0.22437740117311478, "rewards/format_reward": 1.0, "rewards/score_reward": 0.458984375, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 98.71354293823242, "epoch": 0.36177474402730375, "grad_norm": 7.419133469789789, "kl": 0.22021484375, "learning_rate": 9.698521046643912e-07, "loss": 0.0002, "reward": 1.6022135019302368, "reward_std": 0.24919532239437103, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6022135615348816, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 96.01041793823242, "epoch": 0.3651877133105802, "grad_norm": 1.8076901791193822, "kl": 0.23388671875, "learning_rate": 9.695676905574516e-07, "loss": 0.0002, "reward": 1.5188802480697632, "reward_std": 0.22146914899349213, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5188802033662796, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 97.41667175292969, "epoch": 0.36860068259385664, "grad_norm": 1.3278335988261762, "kl": 0.2275390625, "learning_rate": 9.69283276450512e-07, "loss": 0.0002, "reward": 1.51171875, "reward_std": 0.2307361215353012, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5117187649011612, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 96.59375381469727, "epoch": 0.3720136518771331, "grad_norm": 4.891447889612288, "kl": 0.2109375, "learning_rate": 9.689988623435723e-07, "loss": 0.0002, "reward": 1.583984375, "reward_std": 0.19887910038232803, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 97.56771087646484, "epoch": 0.37542662116040953, "grad_norm": 1.386319129730727, "kl": 0.21923828125, "learning_rate": 9.687144482366325e-07, "loss": 0.0002, "reward": 1.5481771230697632, "reward_std": 0.19203244149684906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5481770783662796, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 94.17708587646484, "epoch": 0.378839590443686, "grad_norm": 1.888229314286435, "kl": 0.28955078125, "learning_rate": 9.684300341296928e-07, "loss": 0.0003, "reward": 1.4680989384651184, "reward_std": 0.2418600544333458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4680989533662796, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 96.57031631469727, "epoch": 0.3822525597269625, "grad_norm": 1.3522710612139173, "kl": 0.2255859375, "learning_rate": 9.681456200227532e-07, "loss": 0.0002, "reward": 1.4837239384651184, "reward_std": 0.18956978619098663, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4837239682674408, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 98.0390625, "epoch": 0.3856655290102389, "grad_norm": 1.3470575104856732, "kl": 0.22802734375, "learning_rate": 9.678612059158134e-07, "loss": 0.0002, "reward": 1.5279947519302368, "reward_std": 0.15205243229866028, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5305989682674408, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 99.11458587646484, "epoch": 0.3890784982935154, "grad_norm": 1.7593393131119803, "kl": 0.21533203125, "learning_rate": 9.675767918088737e-07, "loss": 0.0002, "reward": 1.4889323115348816, "reward_std": 0.14162280037999153, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4889322966337204, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 95.30989837646484, "epoch": 0.3924914675767918, "grad_norm": 1.296461052105745, "kl": 0.228515625, "learning_rate": 9.67292377701934e-07, "loss": 0.0002, "reward": 1.48046875, "reward_std": 0.15420109778642654, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4804687649011612, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 97.85417175292969, "epoch": 0.39590443686006827, "grad_norm": 1.2042611842802597, "kl": 0.22705078125, "learning_rate": 9.670079635949943e-07, "loss": 0.0002, "reward": 1.423828125, "reward_std": 0.16303545981645584, "rewards/format_reward": 1.0, "rewards/score_reward": 0.423828125, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 98.82552337646484, "epoch": 0.3993174061433447, "grad_norm": 1.255025677375449, "kl": 0.24169921875, "learning_rate": 9.667235494880546e-07, "loss": 0.0002, "reward": 1.5078125, "reward_std": 0.23359986394643784, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5078125149011612, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 96.8359375, "epoch": 0.40273037542662116, "grad_norm": 1.4326690475107122, "kl": 0.22412109375, "learning_rate": 9.664391353811148e-07, "loss": 0.0002, "reward": 1.4296875, "reward_std": 0.15335293114185333, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4296875, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 104.75000381469727, "epoch": 0.4061433447098976, "grad_norm": 1.0087638202735194, "kl": 0.21337890625, "learning_rate": 9.661547212741752e-07, "loss": 0.0002, "reward": 1.521484375, "reward_std": 0.19533314555883408, "rewards/format_reward": 1.0, "rewards/score_reward": 0.521484375, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 102.13021087646484, "epoch": 0.40955631399317405, "grad_norm": 1.6711845266439247, "kl": 0.23193359375, "learning_rate": 9.658703071672355e-07, "loss": 0.0002, "reward": 1.4921875, "reward_std": 0.22448322176933289, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4921874850988388, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 103.61979675292969, "epoch": 0.4129692832764505, "grad_norm": 1.3506611838520175, "kl": 0.2373046875, "learning_rate": 9.655858930602957e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.24163929373025894, "rewards/format_reward": 1.0, "rewards/score_reward": 0.53125, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 99.10677337646484, "epoch": 0.41638225255972694, "grad_norm": 1.651256545657348, "kl": 0.2568359375, "learning_rate": 9.65301478953356e-07, "loss": 0.0003, "reward": 1.3782552480697632, "reward_std": 0.16048663854599, "rewards/format_reward": 1.0, "rewards/score_reward": 0.3782552182674408, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 99.54948043823242, "epoch": 0.4197952218430034, "grad_norm": 1.0970403644597289, "kl": 0.23193359375, "learning_rate": 9.650170648464162e-07, "loss": 0.0002, "reward": 1.494140625, "reward_std": 0.2039298415184021, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4941406399011612, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 102.5625, "epoch": 0.4232081911262799, "grad_norm": 5.6808782896487715, "kl": 0.24365234375, "learning_rate": 9.647326507394766e-07, "loss": 0.0002, "reward": 1.4654947519302368, "reward_std": 0.26918137073516846, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4654947966337204, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 99.47656631469727, "epoch": 0.42662116040955633, "grad_norm": 1.2308185985206765, "kl": 0.2412109375, "learning_rate": 9.64448236632537e-07, "loss": 0.0002, "reward": 1.4615885615348816, "reward_std": 0.19158507883548737, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4615885466337204, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 98.61198425292969, "epoch": 0.4300341296928328, "grad_norm": 1.5158750490523494, "kl": 0.23046875, "learning_rate": 9.641638225255971e-07, "loss": 0.0002, "reward": 1.5670573115348816, "reward_std": 0.20640700310468674, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5670572817325592, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 100.74479293823242, "epoch": 0.4334470989761092, "grad_norm": 1.110651781417454, "kl": 0.28466796875, "learning_rate": 9.638794084186575e-07, "loss": 0.0003, "reward": 1.515625, "reward_std": 0.13477708026766777, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5156249850988388, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 96.08854293823242, "epoch": 0.43686006825938567, "grad_norm": 1.5884552606093345, "kl": 0.23486328125, "learning_rate": 9.635949943117179e-07, "loss": 0.0002, "reward": 1.4928385019302368, "reward_std": 0.21951977908611298, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4928385466337204, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 94.81771087646484, "epoch": 0.4402730375426621, "grad_norm": 1.3517684631926223, "kl": 0.255859375, "learning_rate": 9.63310580204778e-07, "loss": 0.0003, "reward": 1.53515625, "reward_std": 0.18520070612430573, "rewards/format_reward": 1.0, "rewards/score_reward": 0.53515625, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 96.03385543823242, "epoch": 0.44368600682593856, "grad_norm": 1.0413543207579345, "kl": 0.263671875, "learning_rate": 9.630261660978384e-07, "loss": 0.0003, "reward": 1.5859375, "reward_std": 0.17556622624397278, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5885416865348816, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 93.9765625, "epoch": 0.447098976109215, "grad_norm": 2.445101092491985, "kl": 0.25, "learning_rate": 9.627417519908986e-07, "loss": 0.0002, "reward": 1.5319010615348816, "reward_std": 0.25744544714689255, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5319010317325592, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 97.04166793823242, "epoch": 0.45051194539249145, "grad_norm": 1.1666788609189316, "kl": 0.2529296875, "learning_rate": 9.62457337883959e-07, "loss": 0.0003, "reward": 1.546875, "reward_std": 0.22364410012960434, "rewards/format_reward": 1.0, "rewards/score_reward": 0.546875, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 93.19271087646484, "epoch": 0.4539249146757679, "grad_norm": 2.3293193535188155, "kl": 0.25048828125, "learning_rate": 9.621729237770193e-07, "loss": 0.0003, "reward": 1.45703125, "reward_std": 0.190684013068676, "rewards/format_reward": 1.0, "rewards/score_reward": 0.45703125, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 91.55989837646484, "epoch": 0.45733788395904434, "grad_norm": 1.3665629858555748, "kl": 0.2802734375, "learning_rate": 9.618885096700797e-07, "loss": 0.0003, "reward": 1.568359375, "reward_std": 0.19531019032001495, "rewards/format_reward": 1.0, "rewards/score_reward": 0.568359375, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 96.09114837646484, "epoch": 0.46075085324232085, "grad_norm": 1.456166394491346, "kl": 0.244140625, "learning_rate": 9.616040955631399e-07, "loss": 0.0002, "reward": 1.4583333730697632, "reward_std": 0.20917189121246338, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4583333283662796, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 99.34635543823242, "epoch": 0.4641638225255973, "grad_norm": 1.6175766825336322, "kl": 0.23583984375, "learning_rate": 9.613196814562002e-07, "loss": 0.0002, "reward": 1.583984375, "reward_std": 0.22017093747854233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 94.66146087646484, "epoch": 0.46757679180887374, "grad_norm": 1.1060563704002164, "kl": 0.25390625, "learning_rate": 9.610352673492604e-07, "loss": 0.0003, "reward": 1.5546875, "reward_std": 0.16684557497501373, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5546875149011612, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 93.88802337646484, "epoch": 0.4709897610921502, "grad_norm": 7.838690133114509, "kl": 0.25048828125, "learning_rate": 9.607508532423208e-07, "loss": 0.0003, "reward": 1.556640625, "reward_std": 0.20170941948890686, "rewards/format_reward": 1.0, "rewards/score_reward": 0.556640625, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 96.22396087646484, "epoch": 0.47440273037542663, "grad_norm": 2.6419561330451224, "kl": 0.25341796875, "learning_rate": 9.604664391353811e-07, "loss": 0.0003, "reward": 1.53125, "reward_std": 0.1628839075565338, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5312500149011612, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 91.09114837646484, "epoch": 0.4778156996587031, "grad_norm": 1.7010500016534529, "kl": 0.267578125, "learning_rate": 9.601820250284413e-07, "loss": 0.0003, "reward": 1.4817708730697632, "reward_std": 0.15764238685369492, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4817708283662796, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 94.28646087646484, "epoch": 0.4812286689419795, "grad_norm": 1.1711356218533107, "kl": 0.2529296875, "learning_rate": 9.598976109215017e-07, "loss": 0.0003, "reward": 1.4928385615348816, "reward_std": 0.17302485927939415, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4928385466337204, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 95.92187881469727, "epoch": 0.48464163822525597, "grad_norm": 3.2882284812540634, "kl": 0.25732421875, "learning_rate": 9.59613196814562e-07, "loss": 0.0003, "reward": 1.5911458134651184, "reward_std": 0.19832035899162292, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5911458432674408, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 91.56771087646484, "epoch": 0.4880546075085324, "grad_norm": 5.27157839840722, "kl": 0.26904296875, "learning_rate": 9.593287827076222e-07, "loss": 0.0003, "reward": 1.5423176884651184, "reward_std": 0.18472765386104584, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5423177182674408, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 93.64583587646484, "epoch": 0.49146757679180886, "grad_norm": 1.27191209672486, "kl": 0.2744140625, "learning_rate": 9.590443686006826e-07, "loss": 0.0003, "reward": 1.5305989980697632, "reward_std": 0.17271071672439575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5305989533662796, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 95.93229675292969, "epoch": 0.4948805460750853, "grad_norm": 3.624397234387372, "kl": 0.2734375, "learning_rate": 9.587599544937427e-07, "loss": 0.0003, "reward": 1.4641926884651184, "reward_std": 0.18525615707039833, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4641927033662796, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 95.16406631469727, "epoch": 0.49829351535836175, "grad_norm": 2.0220225687030773, "kl": 0.265625, "learning_rate": 9.584755403868031e-07, "loss": 0.0003, "reward": 1.595703125, "reward_std": 0.19919829815626144, "rewards/format_reward": 1.0, "rewards/score_reward": 0.595703125, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 93.44271087646484, "epoch": 0.5017064846416383, "grad_norm": 1.520442121751635, "kl": 0.2568359375, "learning_rate": 9.581911262798635e-07, "loss": 0.0003, "reward": 1.5729166269302368, "reward_std": 0.18299250304698944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5729166865348816, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 91.9296875, "epoch": 0.5051194539249146, "grad_norm": 1.159001368862851, "kl": 0.263671875, "learning_rate": 9.579067121729236e-07, "loss": 0.0003, "reward": 1.5670573115348816, "reward_std": 0.18750882893800735, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5696614384651184, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 94.88802337646484, "epoch": 0.5085324232081911, "grad_norm": 2.0419104444961356, "kl": 0.2734375, "learning_rate": 9.57622298065984e-07, "loss": 0.0003, "reward": 1.5065103769302368, "reward_std": 0.22661717981100082, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5065104216337204, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 100.49479293823242, "epoch": 0.5119453924914675, "grad_norm": 1.7657329678814704, "kl": 0.2568359375, "learning_rate": 9.573378839590444e-07, "loss": 0.0003, "reward": 1.6731770634651184, "reward_std": 0.15461134910583496, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6731770932674408, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 92.421875, "epoch": 0.515358361774744, "grad_norm": 1.5539494053633505, "kl": 0.2783203125, "learning_rate": 9.570534698521048e-07, "loss": 0.0003, "reward": 1.6158854365348816, "reward_std": 0.2618384212255478, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6158854067325592, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 88.89062881469727, "epoch": 0.5187713310580204, "grad_norm": 1.529062819114615, "kl": 0.287109375, "learning_rate": 9.56769055745165e-07, "loss": 0.0003, "reward": 1.4401041865348816, "reward_std": 0.12989095598459244, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4401041567325592, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 92.02864837646484, "epoch": 0.5221843003412969, "grad_norm": 1.0486975450096105, "kl": 0.26953125, "learning_rate": 9.56484641638225e-07, "loss": 0.0003, "reward": 1.546875, "reward_std": 0.18862713873386383, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5468750149011612, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 92.24479293823242, "epoch": 0.5255972696245734, "grad_norm": 1.2705712237011493, "kl": 0.2626953125, "learning_rate": 9.562002275312855e-07, "loss": 0.0003, "reward": 1.5533854365348816, "reward_std": 0.19555211812257767, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5533854067325592, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 90.37500381469727, "epoch": 0.5290102389078498, "grad_norm": 1.4370619631988224, "kl": 0.271484375, "learning_rate": 9.559158134243458e-07, "loss": 0.0003, "reward": 1.45703125, "reward_std": 0.1917545050382614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.45703125, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 91.57031631469727, "epoch": 0.5324232081911263, "grad_norm": 1.0276438368066685, "kl": 0.2822265625, "learning_rate": 9.556313993174062e-07, "loss": 0.0003, "reward": 1.482421875, "reward_std": 0.17743125557899475, "rewards/format_reward": 1.0, "rewards/score_reward": 0.482421875, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 93.09375381469727, "epoch": 0.5358361774744027, "grad_norm": 2.986916019209931, "kl": 0.2880859375, "learning_rate": 9.553469852104664e-07, "loss": 0.0003, "reward": 1.54296875, "reward_std": 0.20025206357240677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5429687350988388, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 92.71094131469727, "epoch": 0.5392491467576792, "grad_norm": 0.9700204558913682, "kl": 0.27734375, "learning_rate": 9.550625711035267e-07, "loss": 0.0003, "reward": 1.533203125, "reward_std": 0.10485843569040298, "rewards/format_reward": 1.0, "rewards/score_reward": 0.533203125, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 92.36198425292969, "epoch": 0.5426621160409556, "grad_norm": 1.4529763156934812, "kl": 0.2724609375, "learning_rate": 9.547781569965871e-07, "loss": 0.0003, "reward": 1.619140625, "reward_std": 0.17972856014966965, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6217447817325592, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 90.13281631469727, "epoch": 0.5460750853242321, "grad_norm": 1.2968807771420612, "kl": 0.30078125, "learning_rate": 9.544937428896473e-07, "loss": 0.0003, "reward": 1.603515625, "reward_std": 0.13654065132141113, "rewards/format_reward": 1.0, "rewards/score_reward": 0.603515625, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 88.67448425292969, "epoch": 0.5494880546075085, "grad_norm": 1.7652192577986647, "kl": 0.3427734375, "learning_rate": 9.542093287827076e-07, "loss": 0.0003, "reward": 1.5735676884651184, "reward_std": 0.1647915244102478, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5735677182674408, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 87.06250381469727, "epoch": 0.552901023890785, "grad_norm": 4.1732957983673495, "kl": 0.341796875, "learning_rate": 9.539249146757678e-07, "loss": 0.0003, "reward": 1.5657551884651184, "reward_std": 0.1900365948677063, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.568359375, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 88.45833587646484, "epoch": 0.5563139931740614, "grad_norm": 0.8302712564708642, "kl": 0.3154296875, "learning_rate": 9.536405005688282e-07, "loss": 0.0003, "reward": 1.6106770634651184, "reward_std": 0.12565238028764725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6106770932674408, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 90.05729293823242, "epoch": 0.5597269624573379, "grad_norm": 1.7998791231474587, "kl": 0.3037109375, "learning_rate": 9.533560864618884e-07, "loss": 0.0003, "reward": 1.5384114384651184, "reward_std": 0.14410103857517242, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5384114682674408, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 87.28385925292969, "epoch": 0.5631399317406144, "grad_norm": 1.7153509392554396, "kl": 0.33203125, "learning_rate": 9.530716723549488e-07, "loss": 0.0003, "reward": 1.5208333134651184, "reward_std": 0.16491220146417618, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5208333432674408, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 83.67448425292969, "epoch": 0.5665529010238908, "grad_norm": 1.1869266491540977, "kl": 0.3037109375, "learning_rate": 9.527872582480091e-07, "loss": 0.0003, "reward": 1.58203125, "reward_std": 0.14064710959792137, "rewards/format_reward": 1.0, "rewards/score_reward": 0.58203125, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 84.66406631469727, "epoch": 0.5699658703071673, "grad_norm": 2.391123443739861, "kl": 0.2958984375, "learning_rate": 9.525028441410695e-07, "loss": 0.0003, "reward": 1.5768228769302368, "reward_std": 0.1652406081557274, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5768229365348816, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 83.14583587646484, "epoch": 0.5733788395904437, "grad_norm": 1.7226088169577431, "kl": 0.298828125, "learning_rate": 9.522184300341296e-07, "loss": 0.0003, "reward": 1.5364583134651184, "reward_std": 0.20410507172346115, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.5416666567325592, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 83.80729675292969, "epoch": 0.5767918088737202, "grad_norm": 9.173547905723849, "kl": 0.3134765625, "learning_rate": 9.519340159271899e-07, "loss": 0.0003, "reward": 1.583984375, "reward_std": 0.1642392873764038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 84.08073043823242, "epoch": 0.5802047781569966, "grad_norm": 1.2544316883846751, "kl": 0.30078125, "learning_rate": 9.516496018202503e-07, "loss": 0.0003, "reward": 1.5846354365348816, "reward_std": 0.19074121117591858, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5846354067325592, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 79.56510543823242, "epoch": 0.5836177474402731, "grad_norm": 3.4246937880621102, "kl": 0.3017578125, "learning_rate": 9.513651877133105e-07, "loss": 0.0003, "reward": 1.5221354365348816, "reward_std": 0.18158544600009918, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5247395932674408, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 87.09635543823242, "epoch": 0.5870307167235495, "grad_norm": 1.3937207555272406, "kl": 0.3017578125, "learning_rate": 9.510807736063709e-07, "loss": 0.0003, "reward": 1.58984375, "reward_std": 0.13702688366174698, "rewards/format_reward": 1.0, "rewards/score_reward": 0.58984375, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 84.92187881469727, "epoch": 0.590443686006826, "grad_norm": 1.5920239869983628, "kl": 0.31640625, "learning_rate": 9.507963594994312e-07, "loss": 0.0003, "reward": 1.5078125, "reward_std": 0.16781241446733475, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5078124850988388, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 86.203125, "epoch": 0.5938566552901023, "grad_norm": 1.1354945896486925, "kl": 0.2822265625, "learning_rate": 9.505119453924914e-07, "loss": 0.0003, "reward": 1.5696614980697632, "reward_std": 0.142689049243927, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5696614384651184, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 86.17187881469727, "epoch": 0.5972696245733788, "grad_norm": 2.4181350240007706, "kl": 0.28125, "learning_rate": 9.502275312855518e-07, "loss": 0.0003, "reward": 1.7643228769302368, "reward_std": 0.1672179102897644, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7643229365348816, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 81.36458587646484, "epoch": 0.6006825938566553, "grad_norm": 1.3883457985729364, "kl": 0.2919921875, "learning_rate": 9.49943117178612e-07, "loss": 0.0003, "reward": 1.4401041865348816, "reward_std": 0.13375820219516754, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4401041716337204, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 84.83594131469727, "epoch": 0.6040955631399317, "grad_norm": 1.1051465911538918, "kl": 0.2978515625, "learning_rate": 9.496587030716723e-07, "loss": 0.0003, "reward": 1.51171875, "reward_std": 0.11327887326478958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.51171875, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 88.04948425292969, "epoch": 0.6075085324232082, "grad_norm": 1.202345193677071, "kl": 0.2802734375, "learning_rate": 9.493742889647326e-07, "loss": 0.0003, "reward": 1.59765625, "reward_std": 0.11354753002524376, "rewards/format_reward": 1.0, "rewards/score_reward": 0.59765625, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 83.625, "epoch": 0.6109215017064846, "grad_norm": 0.8831036408258776, "kl": 0.291015625, "learning_rate": 9.490898748577929e-07, "loss": 0.0003, "reward": 1.5201823115348816, "reward_std": 0.09265818819403648, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5201822817325592, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 82.41927337646484, "epoch": 0.6143344709897611, "grad_norm": 8.354013350572941, "kl": 0.3056640625, "learning_rate": 9.488054607508532e-07, "loss": 0.0003, "reward": 1.4361978769302368, "reward_std": 0.1067906878888607, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4361979216337204, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 86.23177337646484, "epoch": 0.6177474402730375, "grad_norm": 1.194123508290952, "kl": 0.3076171875, "learning_rate": 9.485210466439135e-07, "loss": 0.0003, "reward": 1.4954427480697632, "reward_std": 0.12360484525561333, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4954427033662796, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 81.41146087646484, "epoch": 0.621160409556314, "grad_norm": 1.1087760506121667, "kl": 0.314453125, "learning_rate": 9.482366325369739e-07, "loss": 0.0003, "reward": 1.5872395634651184, "reward_std": 0.15423013269901276, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5872395932674408, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 83.53385543823242, "epoch": 0.6245733788395904, "grad_norm": 1.3068440162606456, "kl": 0.30078125, "learning_rate": 9.47952218430034e-07, "loss": 0.0003, "reward": 1.505859375, "reward_std": 0.14192763715982437, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5058593600988388, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 83.30208587646484, "epoch": 0.6279863481228669, "grad_norm": 1.8083149557814298, "kl": 0.283203125, "learning_rate": 9.476678043230943e-07, "loss": 0.0003, "reward": 1.505859375, "reward_std": 0.1389032043516636, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5058593600988388, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 84.94271087646484, "epoch": 0.6313993174061433, "grad_norm": 1.1536596850035479, "kl": 0.3115234375, "learning_rate": 9.473833902161547e-07, "loss": 0.0003, "reward": 1.5904947519302368, "reward_std": 0.12341071292757988, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5904948115348816, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 84.30208587646484, "epoch": 0.6348122866894198, "grad_norm": 1.8131764031157886, "kl": 0.2890625, "learning_rate": 9.470989761092149e-07, "loss": 0.0003, "reward": 1.6861979365348816, "reward_std": 0.11571146547794342, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6861979067325592, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 85.33854293823242, "epoch": 0.6382252559726962, "grad_norm": 1.9402272283243727, "kl": 0.2919921875, "learning_rate": 9.468145620022753e-07, "loss": 0.0003, "reward": 1.6673176884651184, "reward_std": 0.13649199157953262, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.669921875, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 89.26042175292969, "epoch": 0.6416382252559727, "grad_norm": 1.225544353186395, "kl": 0.298828125, "learning_rate": 9.465301478953356e-07, "loss": 0.0003, "reward": 1.6419270634651184, "reward_std": 0.1162111833691597, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6419270932674408, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 88.84635543823242, "epoch": 0.6450511945392492, "grad_norm": 1.7174189615627482, "kl": 0.314453125, "learning_rate": 9.462457337883958e-07, "loss": 0.0003, "reward": 1.5690104365348816, "reward_std": 0.10404875874519348, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5690104067325592, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 84.55729675292969, "epoch": 0.6484641638225256, "grad_norm": 2.6994189782588225, "kl": 0.3232421875, "learning_rate": 9.459613196814562e-07, "loss": 0.0003, "reward": 1.5520833134651184, "reward_std": 0.133047953248024, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5520833283662796, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 91.48437881469727, "epoch": 0.6518771331058021, "grad_norm": 1.9687778724457192, "kl": 0.296875, "learning_rate": 9.456769055745164e-07, "loss": 0.0003, "reward": 1.5859375, "reward_std": 0.12429952993988991, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5859375, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 89.77604293823242, "epoch": 0.6552901023890785, "grad_norm": 2.834577198797363, "kl": 0.29296875, "learning_rate": 9.453924914675768e-07, "loss": 0.0003, "reward": 1.5579426884651184, "reward_std": 0.12464326992630959, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5579427182674408, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 90.16927337646484, "epoch": 0.658703071672355, "grad_norm": 1.4395741123538752, "kl": 0.3134765625, "learning_rate": 9.45108077360637e-07, "loss": 0.0003, "reward": 1.5032551884651184, "reward_std": 0.1261390894651413, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5032552182674408, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 93.68750381469727, "epoch": 0.6621160409556314, "grad_norm": 1.4756276212573352, "kl": 0.2861328125, "learning_rate": 9.448236632536973e-07, "loss": 0.0003, "reward": 1.6022135615348816, "reward_std": 0.14312393963336945, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6022135317325592, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 91.0078125, "epoch": 0.6655290102389079, "grad_norm": 1.32296549239664, "kl": 0.2802734375, "learning_rate": 9.445392491467577e-07, "loss": 0.0003, "reward": 1.6634114384651184, "reward_std": 0.14198945835232735, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6634114682674408, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 87.67187881469727, "epoch": 0.6689419795221843, "grad_norm": 0.9124892502177, "kl": 0.296875, "learning_rate": 9.442548350398179e-07, "loss": 0.0003, "reward": 1.5540364384651184, "reward_std": 0.14297698065638542, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.556640625, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 89.84635925292969, "epoch": 0.6723549488054608, "grad_norm": 1.3216503381798976, "kl": 0.27734375, "learning_rate": 9.439704209328783e-07, "loss": 0.0003, "reward": 1.5572916865348816, "reward_std": 0.11042757704854012, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5572916567325592, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 98.51562881469727, "epoch": 0.6757679180887372, "grad_norm": 1.3713969321053203, "kl": 0.2802734375, "learning_rate": 9.436860068259386e-07, "loss": 0.0003, "reward": 1.515625, "reward_std": 0.08309678733348846, "rewards/format_reward": 1.0, "rewards/score_reward": 0.515625, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 87.72656631469727, "epoch": 0.6791808873720137, "grad_norm": 1.5494125794697398, "kl": 0.2890625, "learning_rate": 9.434015927189987e-07, "loss": 0.0003, "reward": 1.4694010615348816, "reward_std": 0.09256935492157936, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.4720052033662796, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 96.50260543823242, "epoch": 0.6825938566552902, "grad_norm": 50.872469080274, "kl": 0.30859375, "learning_rate": 9.431171786120591e-07, "loss": 0.0003, "reward": 1.5807291865348816, "reward_std": 0.12038127705454826, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5807291567325592, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 93.32291793823242, "epoch": 0.6860068259385665, "grad_norm": 1.5463578752836844, "kl": 0.306640625, "learning_rate": 9.428327645051194e-07, "loss": 0.0003, "reward": 1.52734375, "reward_std": 0.165326826274395, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5273437649011612, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 92.40885543823242, "epoch": 0.689419795221843, "grad_norm": 2.3461998982011925, "kl": 0.291015625, "learning_rate": 9.425483503981797e-07, "loss": 0.0003, "reward": 1.58984375, "reward_std": 0.1601010486483574, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5898437350988388, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 91.63281631469727, "epoch": 0.6928327645051194, "grad_norm": 1.8332848976095586, "kl": 0.314453125, "learning_rate": 9.4226393629124e-07, "loss": 0.0003, "reward": 1.5846354365348816, "reward_std": 0.1462932452559471, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5846354067325592, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 95.60677337646484, "epoch": 0.6962457337883959, "grad_norm": 1.0439426459058514, "kl": 0.28515625, "learning_rate": 9.419795221843004e-07, "loss": 0.0003, "reward": 1.556640625, "reward_std": 0.1143002137541771, "rewards/format_reward": 1.0, "rewards/score_reward": 0.556640625, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 88.359375, "epoch": 0.6996587030716723, "grad_norm": 0.6942903718055942, "kl": 0.2939453125, "learning_rate": 9.416951080773606e-07, "loss": 0.0003, "reward": 1.6549479365348816, "reward_std": 0.0985458455979824, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6549479067325592, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 87.71354293823242, "epoch": 0.7030716723549488, "grad_norm": 2.383087103555715, "kl": 0.2939453125, "learning_rate": 9.414106939704209e-07, "loss": 0.0003, "reward": 1.6002603769302368, "reward_std": 0.14820802211761475, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6002604365348816, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 94.52083587646484, "epoch": 0.7064846416382252, "grad_norm": 1.4419484700412601, "kl": 0.310546875, "learning_rate": 9.411262798634812e-07, "loss": 0.0003, "reward": 1.546875, "reward_std": 0.14163772761821747, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5468749850988388, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 91.65625381469727, "epoch": 0.7098976109215017, "grad_norm": 1.3148372017644039, "kl": 0.2919921875, "learning_rate": 9.408418657565414e-07, "loss": 0.0003, "reward": 1.60546875, "reward_std": 0.13645726814866066, "rewards/format_reward": 1.0, "rewards/score_reward": 0.60546875, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 91.39323043823242, "epoch": 0.7133105802047781, "grad_norm": 1.3522448490361099, "kl": 0.2880859375, "learning_rate": 9.405574516496018e-07, "loss": 0.0003, "reward": 1.5364583134651184, "reward_std": 0.15635421872138977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5364583432674408, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 86.07292175292969, "epoch": 0.7167235494880546, "grad_norm": 1.2694344161218785, "kl": 0.3046875, "learning_rate": 9.402730375426621e-07, "loss": 0.0003, "reward": 1.4596354365348816, "reward_std": 0.13681117445230484, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4596354216337204, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 87.27864837646484, "epoch": 0.7201365187713311, "grad_norm": 3.2663004890688216, "kl": 0.298828125, "learning_rate": 9.399886234357223e-07, "loss": 0.0003, "reward": 1.5924479365348816, "reward_std": 0.1270042061805725, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5950520932674408, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 91.61198043823242, "epoch": 0.7235494880546075, "grad_norm": 2.0209049927144025, "kl": 0.28515625, "learning_rate": 9.397042093287827e-07, "loss": 0.0003, "reward": 1.5221354365348816, "reward_std": 0.1338191144168377, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5221354067325592, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 88.07292175292969, "epoch": 0.726962457337884, "grad_norm": 1.5461232652893386, "kl": 0.2900390625, "learning_rate": 9.39419795221843e-07, "loss": 0.0003, "reward": 1.5091145634651184, "reward_std": 0.13311340287327766, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5091145932674408, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 95.43229293823242, "epoch": 0.7303754266211604, "grad_norm": 0.9320018936544767, "kl": 0.265625, "learning_rate": 9.391353811149033e-07, "loss": 0.0003, "reward": 1.5494791865348816, "reward_std": 0.09421789273619652, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5494791716337204, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 90.54166793823242, "epoch": 0.7337883959044369, "grad_norm": 0.9223453730190508, "kl": 0.291015625, "learning_rate": 9.388509670079635e-07, "loss": 0.0003, "reward": 1.537109375, "reward_std": 0.09795532375574112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5371093899011612, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 84.89583587646484, "epoch": 0.7372013651877133, "grad_norm": 0.957847726923714, "kl": 0.3154296875, "learning_rate": 9.385665529010238e-07, "loss": 0.0003, "reward": 1.5787760615348816, "reward_std": 0.11143211275339127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5787760466337204, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 91.41927337646484, "epoch": 0.7406143344709898, "grad_norm": 0.9390436097763094, "kl": 0.302734375, "learning_rate": 9.382821387940842e-07, "loss": 0.0003, "reward": 1.62109375, "reward_std": 0.06584687903523445, "rewards/format_reward": 1.0, "rewards/score_reward": 0.62109375, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 89.86198043823242, "epoch": 0.7440273037542662, "grad_norm": 1.6157195576822085, "kl": 0.302734375, "learning_rate": 9.379977246871444e-07, "loss": 0.0003, "reward": 1.5481770634651184, "reward_std": 0.1149645671248436, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5481770932674408, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 89.68229675292969, "epoch": 0.7474402730375427, "grad_norm": 1.6473071075202543, "kl": 0.2890625, "learning_rate": 9.377133105802048e-07, "loss": 0.0003, "reward": 1.6178385615348816, "reward_std": 0.07969107292592525, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6178385317325592, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 90.90885925292969, "epoch": 0.7508532423208191, "grad_norm": 0.7928567116679044, "kl": 0.318359375, "learning_rate": 9.374288964732651e-07, "loss": 0.0003, "reward": 1.6380208134651184, "reward_std": 0.09222095459699631, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6380208432674408, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 90.11198043823242, "epoch": 0.7542662116040956, "grad_norm": 1.0886039176086961, "kl": 0.3076171875, "learning_rate": 9.371444823663253e-07, "loss": 0.0003, "reward": 1.638671875, "reward_std": 0.10493001714348793, "rewards/format_reward": 1.0, "rewards/score_reward": 0.638671875, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 90.43489837646484, "epoch": 0.757679180887372, "grad_norm": 1.4597608974852134, "kl": 0.326171875, "learning_rate": 9.368600682593856e-07, "loss": 0.0003, "reward": 1.6184895634651184, "reward_std": 0.11143399029970169, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6184895932674408, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 91.57552337646484, "epoch": 0.7610921501706485, "grad_norm": 1.1850741994478704, "kl": 0.287109375, "learning_rate": 9.365756541524459e-07, "loss": 0.0003, "reward": 1.5221354365348816, "reward_std": 0.11109277233481407, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5221354067325592, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 92.03125381469727, "epoch": 0.764505119453925, "grad_norm": 1.65225933273493, "kl": 0.30078125, "learning_rate": 9.362912400455062e-07, "loss": 0.0003, "reward": 1.5677083134651184, "reward_std": 0.16273781657218933, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5677083432674408, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 90.03906631469727, "epoch": 0.7679180887372014, "grad_norm": 1.2751122094322596, "kl": 0.3017578125, "learning_rate": 9.360068259385665e-07, "loss": 0.0003, "reward": 1.619140625, "reward_std": 0.11990096792578697, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6217447817325592, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 86.11719131469727, "epoch": 0.7713310580204779, "grad_norm": 1.2138226397882341, "kl": 0.3154296875, "learning_rate": 9.357224118316268e-07, "loss": 0.0003, "reward": 1.5540364384651184, "reward_std": 0.11548810824751854, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5540364682674408, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 88.9921875, "epoch": 0.7747440273037542, "grad_norm": 2.1674584283857676, "kl": 0.3173828125, "learning_rate": 9.354379977246871e-07, "loss": 0.0003, "reward": 1.66015625, "reward_std": 0.1127990074455738, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 84.45312881469727, "epoch": 0.7781569965870307, "grad_norm": 1.3887715216541883, "kl": 0.34765625, "learning_rate": 9.351535836177474e-07, "loss": 0.0003, "reward": 1.5677083134651184, "reward_std": 0.13341303169727325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5677083432674408, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 82.04427337646484, "epoch": 0.7815699658703071, "grad_norm": 0.8459342546532318, "kl": 0.3291015625, "learning_rate": 9.348691695108078e-07, "loss": 0.0003, "reward": 1.5846354365348816, "reward_std": 0.08973606117069721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5846354067325592, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 76.96354293823242, "epoch": 0.7849829351535836, "grad_norm": 0.995898852347522, "kl": 0.3525390625, "learning_rate": 9.345847554038679e-07, "loss": 0.0004, "reward": 1.548828125, "reward_std": 0.10883224010467529, "rewards/format_reward": 1.0, "rewards/score_reward": 0.548828125, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 77.15625381469727, "epoch": 0.78839590443686, "grad_norm": 2.9128525682768522, "kl": 0.33984375, "learning_rate": 9.343003412969282e-07, "loss": 0.0003, "reward": 1.5338541269302368, "reward_std": 0.10578823462128639, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5338541716337204, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 78.48177337646484, "epoch": 0.7918088737201365, "grad_norm": 1.420679788356626, "kl": 0.373046875, "learning_rate": 9.340159271899886e-07, "loss": 0.0004, "reward": 1.6477864980697632, "reward_std": 0.16396937519311905, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6477864384651184, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 74.609375, "epoch": 0.7952218430034129, "grad_norm": 2.076207749862506, "kl": 0.37109375, "learning_rate": 9.337315130830488e-07, "loss": 0.0004, "reward": 1.57421875, "reward_std": 0.11615165323019028, "rewards/format_reward": 1.0, "rewards/score_reward": 0.57421875, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 74.17708587646484, "epoch": 0.7986348122866894, "grad_norm": 1.6185445325132897, "kl": 0.37109375, "learning_rate": 9.334470989761092e-07, "loss": 0.0004, "reward": 1.62890625, "reward_std": 0.13974132016301155, "rewards/format_reward": 1.0, "rewards/score_reward": 0.62890625, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 72.41146087646484, "epoch": 0.8020477815699659, "grad_norm": 2.0886392523919404, "kl": 0.3779296875, "learning_rate": 9.331626848691695e-07, "loss": 0.0004, "reward": 1.5364583134651184, "reward_std": 0.1474272646009922, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5364583283662796, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 67.41927337646484, "epoch": 0.8054607508532423, "grad_norm": 1.2975295323519407, "kl": 0.4033203125, "learning_rate": 9.328782707622298e-07, "loss": 0.0004, "reward": 1.6139322519302368, "reward_std": 0.15034055709838867, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6139323115348816, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 73.63021087646484, "epoch": 0.8088737201365188, "grad_norm": 4.676220265412339, "kl": 0.3779296875, "learning_rate": 9.325938566552901e-07, "loss": 0.0004, "reward": 1.5865885615348816, "reward_std": 0.13271766155958176, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5865885317325592, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 70.19792175292969, "epoch": 0.8122866894197952, "grad_norm": 1.5423093127104015, "kl": 0.3994140625, "learning_rate": 9.323094425483503e-07, "loss": 0.0004, "reward": 1.58984375, "reward_std": 0.11641987785696983, "rewards/format_reward": 1.0, "rewards/score_reward": 0.58984375, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 70.82552337646484, "epoch": 0.8156996587030717, "grad_norm": 1.956298340466015, "kl": 0.4150390625, "learning_rate": 9.320250284414107e-07, "loss": 0.0004, "reward": 1.5813801884651184, "reward_std": 0.09505849704146385, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5813802182674408, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 65.36458587646484, "epoch": 0.8191126279863481, "grad_norm": 2.7227566229481956, "kl": 0.451171875, "learning_rate": 9.317406143344709e-07, "loss": 0.0005, "reward": 1.61328125, "reward_std": 0.15649890154600143, "rewards/format_reward": 1.0, "rewards/score_reward": 0.61328125, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 67.49479484558105, "epoch": 0.8225255972696246, "grad_norm": 1.9665616403099118, "kl": 0.4150390625, "learning_rate": 9.314562002275312e-07, "loss": 0.0004, "reward": 1.4876302480697632, "reward_std": 0.09193010814487934, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4876302033662796, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 67.95833587646484, "epoch": 0.825938566552901, "grad_norm": 1.7606733541366977, "kl": 0.427734375, "learning_rate": 9.311717861205916e-07, "loss": 0.0004, "reward": 1.5904948115348816, "reward_std": 0.14968551695346832, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5904947817325592, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 62.60937690734863, "epoch": 0.8293515358361775, "grad_norm": 1.6580879748989206, "kl": 0.451171875, "learning_rate": 9.308873720136518e-07, "loss": 0.0005, "reward": 1.490234375, "reward_std": 0.161702211946249, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4902343899011612, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 65.79948043823242, "epoch": 0.8327645051194539, "grad_norm": 1.1862939614813994, "kl": 0.404296875, "learning_rate": 9.306029579067122e-07, "loss": 0.0004, "reward": 1.5774739980697632, "reward_std": 0.11055941134691238, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5774739384651184, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 67.69271087646484, "epoch": 0.8361774744027304, "grad_norm": 2.146947240168559, "kl": 0.4013671875, "learning_rate": 9.303185437997725e-07, "loss": 0.0004, "reward": 1.62890625, "reward_std": 0.11054257676005363, "rewards/format_reward": 1.0, "rewards/score_reward": 0.62890625, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 64.11198043823242, "epoch": 0.8395904436860068, "grad_norm": 1.528693082161764, "kl": 0.4658203125, "learning_rate": 9.300341296928326e-07, "loss": 0.0005, "reward": 1.5729166865348816, "reward_std": 0.10883518308401108, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5729166567325592, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 68.69010925292969, "epoch": 0.8430034129692833, "grad_norm": 2.0187179966568083, "kl": 0.3955078125, "learning_rate": 9.29749715585893e-07, "loss": 0.0004, "reward": 1.53515625, "reward_std": 0.12294026091694832, "rewards/format_reward": 1.0, "rewards/score_reward": 0.53515625, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 62.9375, "epoch": 0.8464163822525598, "grad_norm": 7.4032273497852525, "kl": 0.4306640625, "learning_rate": 9.294653014789533e-07, "loss": 0.0004, "reward": 1.6321614384651184, "reward_std": 0.10955382138490677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6321614682674408, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 65.88281631469727, "epoch": 0.8498293515358362, "grad_norm": 1.3277090196464292, "kl": 0.4140625, "learning_rate": 9.291808873720136e-07, "loss": 0.0004, "reward": 1.6328125, "reward_std": 0.050040462985634804, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6328125, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 65.71094131469727, "epoch": 0.8532423208191127, "grad_norm": 2.3023144726342974, "kl": 0.44921875, "learning_rate": 9.288964732650739e-07, "loss": 0.0004, "reward": 1.548828125, "reward_std": 0.16994865238666534, "rewards/format_reward": 1.0, "rewards/score_reward": 0.548828125, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 63.32291793823242, "epoch": 0.856655290102389, "grad_norm": 1.2882733431831468, "kl": 0.427734375, "learning_rate": 9.286120591581343e-07, "loss": 0.0004, "reward": 1.6119791865348816, "reward_std": 0.12297629565000534, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6145833432674408, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 65.58333587646484, "epoch": 0.8600682593856656, "grad_norm": 1.5502946250752767, "kl": 0.435546875, "learning_rate": 9.283276450511945e-07, "loss": 0.0004, "reward": 1.611328125, "reward_std": 0.11104883998632431, "rewards/format_reward": 1.0, "rewards/score_reward": 0.611328125, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 67.11979675292969, "epoch": 0.863481228668942, "grad_norm": 1.4306723346241343, "kl": 0.396484375, "learning_rate": 9.280432309442547e-07, "loss": 0.0004, "reward": 1.603515625, "reward_std": 0.11739904060959816, "rewards/format_reward": 1.0, "rewards/score_reward": 0.603515625, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 67.01562881469727, "epoch": 0.8668941979522184, "grad_norm": 1.1021502050975742, "kl": 0.4267578125, "learning_rate": 9.277588168373151e-07, "loss": 0.0004, "reward": 1.640625, "reward_std": 0.0707939937710762, "rewards/format_reward": 1.0, "rewards/score_reward": 0.640625, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 67.11458587646484, "epoch": 0.8703071672354948, "grad_norm": 1.1781500837030368, "kl": 0.3994140625, "learning_rate": 9.274744027303753e-07, "loss": 0.0004, "reward": 1.65234375, "reward_std": 0.11686968803405762, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65234375, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 64.88541793823242, "epoch": 0.8737201365187713, "grad_norm": 1.37421932212208, "kl": 0.4384765625, "learning_rate": 9.271899886234357e-07, "loss": 0.0004, "reward": 1.5377604365348816, "reward_std": 0.10120938159525394, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5377604067325592, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 70.3359375, "epoch": 0.8771331058020477, "grad_norm": 1.2302428716236145, "kl": 0.3828125, "learning_rate": 9.26905574516496e-07, "loss": 0.0004, "reward": 1.6061198115348816, "reward_std": 0.07681708037853241, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6061197817325592, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 72.14062881469727, "epoch": 0.8805460750853242, "grad_norm": 0.6344453792864901, "kl": 0.3701171875, "learning_rate": 9.266211604095563e-07, "loss": 0.0004, "reward": 1.595703125, "reward_std": 0.04280577879399061, "rewards/format_reward": 1.0, "rewards/score_reward": 0.595703125, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 71.71094131469727, "epoch": 0.8839590443686007, "grad_norm": 1.5561839898778216, "kl": 0.3857421875, "learning_rate": 9.263367463026166e-07, "loss": 0.0004, "reward": 1.5455728769302368, "reward_std": 0.08386293798685074, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5455729365348816, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 69.75260543823242, "epoch": 0.8873720136518771, "grad_norm": 1.0461823348162742, "kl": 0.3818359375, "learning_rate": 9.260523321956769e-07, "loss": 0.0004, "reward": 1.552734375, "reward_std": 0.06634635664522648, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5527343600988388, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 69.80729293823242, "epoch": 0.8907849829351536, "grad_norm": 3.0799404694395527, "kl": 0.4033203125, "learning_rate": 9.257679180887372e-07, "loss": 0.0004, "reward": 1.5611979365348816, "reward_std": 0.1408201977610588, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5611979067325592, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 73.74739837646484, "epoch": 0.89419795221843, "grad_norm": 1.843537613456795, "kl": 0.369140625, "learning_rate": 9.254835039817974e-07, "loss": 0.0004, "reward": 1.58984375, "reward_std": 0.11127864941954613, "rewards/format_reward": 1.0, "rewards/score_reward": 0.58984375, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 73.93489837646484, "epoch": 0.8976109215017065, "grad_norm": 1.066452004948601, "kl": 0.4189453125, "learning_rate": 9.251990898748577e-07, "loss": 0.0004, "reward": 1.68359375, "reward_std": 0.08193148300051689, "rewards/format_reward": 1.0, "rewards/score_reward": 0.68359375, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 73.42969131469727, "epoch": 0.9010238907849829, "grad_norm": 1.9308593206466187, "kl": 0.3935546875, "learning_rate": 9.249146757679181e-07, "loss": 0.0004, "reward": 1.5553385615348816, "reward_std": 0.13180071115493774, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5553385317325592, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 75.27864837646484, "epoch": 0.9044368600682594, "grad_norm": 3.1729153103867827, "kl": 0.37890625, "learning_rate": 9.246302616609783e-07, "loss": 0.0004, "reward": 1.5930989384651184, "reward_std": 0.05155232921242714, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5930989682674408, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 76.85937881469727, "epoch": 0.9078498293515358, "grad_norm": 1.2212841987790444, "kl": 0.376953125, "learning_rate": 9.243458475540387e-07, "loss": 0.0004, "reward": 1.6907552480697632, "reward_std": 0.08413984254002571, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907551884651184, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 74.28646087646484, "epoch": 0.9112627986348123, "grad_norm": 2.9376840355200224, "kl": 0.390625, "learning_rate": 9.24061433447099e-07, "loss": 0.0004, "reward": 1.5703125, "reward_std": 0.0728606041520834, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5703125, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 73.16927337646484, "epoch": 0.9146757679180887, "grad_norm": 4.170046897960386, "kl": 0.4033203125, "learning_rate": 9.237770193401592e-07, "loss": 0.0004, "reward": 1.5651041865348816, "reward_std": 0.05343913845717907, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5651041567325592, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 82.43750381469727, "epoch": 0.9180887372013652, "grad_norm": 2.2759648274541395, "kl": 0.361328125, "learning_rate": 9.234926052332195e-07, "loss": 0.0004, "reward": 1.6477864384651184, "reward_std": 0.06452471576631069, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6477864682674408, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 78.10937881469727, "epoch": 0.9215017064846417, "grad_norm": 1.445806440878731, "kl": 0.390625, "learning_rate": 9.232081911262798e-07, "loss": 0.0004, "reward": 1.6471353769302368, "reward_std": 0.1357623189687729, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6471354365348816, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 75.20052337646484, "epoch": 0.9249146757679181, "grad_norm": 1.5703219327212041, "kl": 0.388671875, "learning_rate": 9.229237770193401e-07, "loss": 0.0004, "reward": 1.5442708730697632, "reward_std": 0.16338147595524788, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5442708134651184, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 74.55989837646484, "epoch": 0.9283276450511946, "grad_norm": 1.3068882986484045, "kl": 0.380859375, "learning_rate": 9.226393629124004e-07, "loss": 0.0004, "reward": 1.5377603769302368, "reward_std": 0.08386553078889847, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5377604216337204, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 79.02864837646484, "epoch": 0.931740614334471, "grad_norm": 1.4206247335629236, "kl": 0.3974609375, "learning_rate": 9.223549488054607e-07, "loss": 0.0004, "reward": 1.6087239980697632, "reward_std": 0.09887825697660446, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6087239384651184, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 75.94531631469727, "epoch": 0.9351535836177475, "grad_norm": 1.0221595669829904, "kl": 0.39453125, "learning_rate": 9.22070534698521e-07, "loss": 0.0004, "reward": 1.6067708730697632, "reward_std": 0.07655988074839115, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708283662796, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 76.07031631469727, "epoch": 0.9385665529010239, "grad_norm": 1.318324676536594, "kl": 0.3935546875, "learning_rate": 9.217861205915813e-07, "loss": 0.0004, "reward": 1.583984375, "reward_std": 0.1487456113100052, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 75.74219131469727, "epoch": 0.9419795221843004, "grad_norm": 1.217934961713591, "kl": 0.3837890625, "learning_rate": 9.215017064846417e-07, "loss": 0.0004, "reward": 1.7018228769302368, "reward_std": 0.09659957885742188, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7018229365348816, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 77.75781631469727, "epoch": 0.9453924914675768, "grad_norm": 1.7516154055040667, "kl": 0.408203125, "learning_rate": 9.212172923777018e-07, "loss": 0.0004, "reward": 1.623046875, "reward_std": 0.09372204169631004, "rewards/format_reward": 1.0, "rewards/score_reward": 0.623046875, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 78.30208587646484, "epoch": 0.9488054607508533, "grad_norm": 1.0031376010123734, "kl": 0.3974609375, "learning_rate": 9.209328782707621e-07, "loss": 0.0004, "reward": 1.623046875, "reward_std": 0.09383310377597809, "rewards/format_reward": 1.0, "rewards/score_reward": 0.623046875, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 82.2265625, "epoch": 0.9522184300341296, "grad_norm": 1.8271071632215812, "kl": 0.34765625, "learning_rate": 9.206484641638225e-07, "loss": 0.0003, "reward": 1.6373698115348816, "reward_std": 0.13646313548088074, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6373697817325592, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 86.17708587646484, "epoch": 0.9556313993174061, "grad_norm": 3.850850667248188, "kl": 0.34765625, "learning_rate": 9.203640500568828e-07, "loss": 0.0003, "reward": 1.583984375, "reward_std": 0.13261324539780617, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 82.32031631469727, "epoch": 0.9590443686006825, "grad_norm": 1.9201635423837125, "kl": 0.36328125, "learning_rate": 9.200796359499431e-07, "loss": 0.0004, "reward": 1.705078125, "reward_std": 0.12342376634478569, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 81.23698043823242, "epoch": 0.962457337883959, "grad_norm": 1.317574439229845, "kl": 0.384765625, "learning_rate": 9.197952218430034e-07, "loss": 0.0004, "reward": 1.572265625, "reward_std": 0.10398440435528755, "rewards/format_reward": 1.0, "rewards/score_reward": 0.572265625, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 86.1875, "epoch": 0.9658703071672355, "grad_norm": 2.120831579834469, "kl": 0.3466796875, "learning_rate": 9.195108077360637e-07, "loss": 0.0003, "reward": 1.578125, "reward_std": 0.06463994458317757, "rewards/format_reward": 1.0, "rewards/score_reward": 0.578125, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 90.36458587646484, "epoch": 0.9692832764505119, "grad_norm": 6.946635369712441, "kl": 0.3173828125, "learning_rate": 9.192263936291239e-07, "loss": 0.0003, "reward": 1.6744791865348816, "reward_std": 0.1112542413175106, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791567325592, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 90.76562881469727, "epoch": 0.9726962457337884, "grad_norm": 0.8582069308415536, "kl": 0.34765625, "learning_rate": 9.189419795221842e-07, "loss": 0.0003, "reward": 1.6809896230697632, "reward_std": 0.056438006460666656, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6809895634651184, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 83.11979293823242, "epoch": 0.9761092150170648, "grad_norm": 1.3741505727951633, "kl": 0.400390625, "learning_rate": 9.186575654152446e-07, "loss": 0.0004, "reward": 1.6360676884651184, "reward_std": 0.082920391112566, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6360677182674408, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 87.41927337646484, "epoch": 0.9795221843003413, "grad_norm": 1.523034070512041, "kl": 0.3369140625, "learning_rate": 9.183731513083048e-07, "loss": 0.0003, "reward": 1.6197916865348816, "reward_std": 0.05102896690368652, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6197916567325592, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 87.015625, "epoch": 0.9829351535836177, "grad_norm": 0.9398206236894897, "kl": 0.3408203125, "learning_rate": 9.180887372013651e-07, "loss": 0.0003, "reward": 1.6223958134651184, "reward_std": 0.093685831874609, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6223958432674408, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 83.59635543823242, "epoch": 0.9863481228668942, "grad_norm": 1.5003853693571851, "kl": 0.3740234375, "learning_rate": 9.178043230944255e-07, "loss": 0.0004, "reward": 1.6712239384651184, "reward_std": 0.09214582294225693, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6712239682674408, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 84.47135543823242, "epoch": 0.9897610921501706, "grad_norm": 0.912239492255273, "kl": 0.3740234375, "learning_rate": 9.175199089874857e-07, "loss": 0.0004, "reward": 1.6770833134651184, "reward_std": 0.07785562612116337, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6770833432674408, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 86.10937881469727, "epoch": 0.9931740614334471, "grad_norm": 1.1500746729452802, "kl": 0.408203125, "learning_rate": 9.172354948805461e-07, "loss": 0.0004, "reward": 1.609375, "reward_std": 0.09400661289691925, "rewards/format_reward": 1.0, "rewards/score_reward": 0.609375, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 82.56666946411133, "epoch": 0.9965870307167235, "grad_norm": 3.12863701826555, "kl": 0.5341796875, "learning_rate": 9.169510807736063e-07, "loss": 0.0005, "reward": 1.4416667819023132, "reward_std": 0.15430336073040962, "rewards/format_reward": 1.0, "rewards/score_reward": 0.44166669249534607, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 90.33594131469727, "epoch": 1.0034129692832765, "grad_norm": 0.743822646754145, "kl": 0.333984375, "learning_rate": 9.166666666666665e-07, "loss": 0.0003, "reward": 1.6236978769302368, "reward_std": 0.036915821488946676, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6236979365348816, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 84.94531631469727, "epoch": 1.006825938566553, "grad_norm": 1.037346260626218, "kl": 0.369140625, "learning_rate": 9.163822525597269e-07, "loss": 0.0004, "reward": 1.6412760615348816, "reward_std": 0.0727266501635313, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6412760317325592, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 89.21354293823242, "epoch": 1.0102389078498293, "grad_norm": 1.1101757537408414, "kl": 0.330078125, "learning_rate": 9.160978384527872e-07, "loss": 0.0003, "reward": 1.6223958730697632, "reward_std": 0.04018030222505331, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6223958134651184, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 88.37760925292969, "epoch": 1.0136518771331058, "grad_norm": 1.545899680455198, "kl": 0.3369140625, "learning_rate": 9.158134243458475e-07, "loss": 0.0003, "reward": 1.6328125, "reward_std": 0.09223341569304466, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6328125, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 90.44010543823242, "epoch": 1.0170648464163823, "grad_norm": 1.8943347579601058, "kl": 0.328125, "learning_rate": 9.155290102389078e-07, "loss": 0.0003, "reward": 1.6360676884651184, "reward_std": 0.0707520842552185, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6360677182674408, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 93.52604293823242, "epoch": 1.0204778156996588, "grad_norm": 4.277089722184778, "kl": 0.3369140625, "learning_rate": 9.152445961319682e-07, "loss": 0.0003, "reward": 1.7096354365348816, "reward_std": 0.10994765162467957, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354067325592, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 85.33854293823242, "epoch": 1.023890784982935, "grad_norm": 1.8900672110895504, "kl": 0.3720703125, "learning_rate": 9.149601820250285e-07, "loss": 0.0004, "reward": 1.6100260019302368, "reward_std": 0.14373011142015457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6100260615348816, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 89.90885925292969, "epoch": 1.0273037542662116, "grad_norm": 0.9308798989188733, "kl": 0.3447265625, "learning_rate": 9.146757679180886e-07, "loss": 0.0003, "reward": 1.623046875, "reward_std": 0.08888068795204163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.623046875, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 91.52864837646484, "epoch": 1.030716723549488, "grad_norm": 1.8876028651942585, "kl": 0.3251953125, "learning_rate": 9.14391353811149e-07, "loss": 0.0003, "reward": 1.7024739980697632, "reward_std": 0.11737678572535515, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7024739384651184, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 85.28385925292969, "epoch": 1.0341296928327646, "grad_norm": 0.9069414333853495, "kl": 0.388671875, "learning_rate": 9.141069397042093e-07, "loss": 0.0004, "reward": 1.6588541865348816, "reward_std": 0.0891011655330658, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541567325592, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 85.00000381469727, "epoch": 1.0375426621160408, "grad_norm": 1.2940322844474055, "kl": 0.357421875, "learning_rate": 9.138225255972696e-07, "loss": 0.0004, "reward": 1.65234375, "reward_std": 0.13364116474986076, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65234375, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 84.03385543823242, "epoch": 1.0409556313993173, "grad_norm": 1.3338650997631307, "kl": 0.34375, "learning_rate": 9.135381114903299e-07, "loss": 0.0003, "reward": 1.669921875, "reward_std": 0.0928155668079853, "rewards/format_reward": 1.0, "rewards/score_reward": 0.669921875, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 81.28385543823242, "epoch": 1.0443686006825939, "grad_norm": 1.8728375381322906, "kl": 0.3759765625, "learning_rate": 9.132536973833902e-07, "loss": 0.0004, "reward": 1.6569010615348816, "reward_std": 0.12476343289017677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6569010317325592, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 84.41666793823242, "epoch": 1.0477815699658704, "grad_norm": 1.2494853125636305, "kl": 0.373046875, "learning_rate": 9.129692832764505e-07, "loss": 0.0004, "reward": 1.626953125, "reward_std": 0.08782971277832985, "rewards/format_reward": 1.0, "rewards/score_reward": 0.626953125, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 84.77083587646484, "epoch": 1.0511945392491469, "grad_norm": 1.3138918539290483, "kl": 0.3701171875, "learning_rate": 9.126848691695108e-07, "loss": 0.0004, "reward": 1.5833333134651184, "reward_std": 0.10683724284172058, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5833333432674408, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 88.21094131469727, "epoch": 1.0546075085324231, "grad_norm": 1.4364814129818928, "kl": 0.37109375, "learning_rate": 9.124004550625711e-07, "loss": 0.0004, "reward": 1.5983073115348816, "reward_std": 0.08592073246836662, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5983072817325592, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 85.671875, "epoch": 1.0580204778156996, "grad_norm": 5.0709331706216165, "kl": 0.3779296875, "learning_rate": 9.121160409556313e-07, "loss": 0.0004, "reward": 1.6555989384651184, "reward_std": 0.06132901646196842, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989682674408, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 84.36198043823242, "epoch": 1.0614334470989761, "grad_norm": 1.5901275148713188, "kl": 0.416015625, "learning_rate": 9.118316268486916e-07, "loss": 0.0004, "reward": 1.7506510019302368, "reward_std": 0.08971758186817169, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510615348816, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 82.39844131469727, "epoch": 1.0648464163822526, "grad_norm": 3.775664004021408, "kl": 0.375, "learning_rate": 9.11547212741752e-07, "loss": 0.0004, "reward": 1.6341145634651184, "reward_std": 0.06413274444639683, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6341145932674408, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 84.06250381469727, "epoch": 1.068259385665529, "grad_norm": 0.47495635735053743, "kl": 0.365234375, "learning_rate": 9.112627986348122e-07, "loss": 0.0004, "reward": 1.6399739980697632, "reward_std": 0.027355906553566456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6399739384651184, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 86.01823043823242, "epoch": 1.0716723549488054, "grad_norm": 3.668071761141221, "kl": 0.439453125, "learning_rate": 9.109783845278726e-07, "loss": 0.0004, "reward": 1.595703125, "reward_std": 0.08690677955746651, "rewards/format_reward": 1.0, "rewards/score_reward": 0.595703125, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 87.67187881469727, "epoch": 1.075085324232082, "grad_norm": 0.8858134378733757, "kl": 0.3681640625, "learning_rate": 9.106939704209329e-07, "loss": 0.0004, "reward": 1.693359375, "reward_std": 0.05881131440401077, "rewards/format_reward": 1.0, "rewards/score_reward": 0.693359375, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 87.14323043823242, "epoch": 1.0784982935153584, "grad_norm": 5.783534127359017, "kl": 0.37890625, "learning_rate": 9.104095563139931e-07, "loss": 0.0004, "reward": 1.6171875, "reward_std": 0.16154177486896515, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6171875, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 84.32812881469727, "epoch": 1.0819112627986347, "grad_norm": 0.4179868263069957, "kl": 0.3515625, "learning_rate": 9.101251422070534e-07, "loss": 0.0004, "reward": 1.75, "reward_std": 0.026766151189804077, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 88.33073425292969, "epoch": 1.0853242320819112, "grad_norm": 1.888968215865612, "kl": 0.345703125, "learning_rate": 9.098407281001137e-07, "loss": 0.0003, "reward": 1.6184895634651184, "reward_std": 0.10234515368938446, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6184895932674408, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 89.26823043823242, "epoch": 1.0887372013651877, "grad_norm": 1.1774300236330038, "kl": 0.3505859375, "learning_rate": 9.09556313993174e-07, "loss": 0.0004, "reward": 1.7395833134651184, "reward_std": 0.07180734444409609, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833432674408, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 88.27864837646484, "epoch": 1.0921501706484642, "grad_norm": 10.345832997264825, "kl": 0.3564453125, "learning_rate": 9.092718998862343e-07, "loss": 0.0004, "reward": 1.7083333730697632, "reward_std": 0.08178085833787918, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7083333134651184, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 84.23698043823242, "epoch": 1.0955631399317407, "grad_norm": 0.854074568699615, "kl": 0.3642578125, "learning_rate": 9.089874857792946e-07, "loss": 0.0004, "reward": 1.6490885019302368, "reward_std": 0.04795813001692295, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6490885615348816, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 83.67187881469727, "epoch": 1.098976109215017, "grad_norm": 2.8409638008195537, "kl": 0.3642578125, "learning_rate": 9.08703071672355e-07, "loss": 0.0004, "reward": 1.6321614980697632, "reward_std": 0.06073768436908722, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6321614384651184, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 87.0078125, "epoch": 1.1023890784982935, "grad_norm": 2.0586947745562414, "kl": 0.345703125, "learning_rate": 9.084186575654152e-07, "loss": 0.0003, "reward": 1.6510416865348816, "reward_std": 0.07754674460738897, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6510416567325592, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 87.50000381469727, "epoch": 1.10580204778157, "grad_norm": 0.8996071684659139, "kl": 0.353515625, "learning_rate": 9.081342434584755e-07, "loss": 0.0004, "reward": 1.6809895634651184, "reward_std": 0.0901708584278822, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6809895932674408, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 88.10677337646484, "epoch": 1.1092150170648465, "grad_norm": 2.3969505059298877, "kl": 0.3427734375, "learning_rate": 9.078498293515358e-07, "loss": 0.0003, "reward": 1.6178385615348816, "reward_std": 0.06308121979236603, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6178385317325592, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 86.59114837646484, "epoch": 1.1126279863481228, "grad_norm": 0.574348486065908, "kl": 0.359375, "learning_rate": 9.07565415244596e-07, "loss": 0.0004, "reward": 1.6451823115348816, "reward_std": 0.04977460205554962, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6451822817325592, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 88.30729293823242, "epoch": 1.1160409556313993, "grad_norm": 0.9574750393817035, "kl": 0.3388671875, "learning_rate": 9.072810011376564e-07, "loss": 0.0003, "reward": 1.7623697519302368, "reward_std": 0.06534223258495331, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623698115348816, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 91.95312881469727, "epoch": 1.1194539249146758, "grad_norm": 1.3334427295701892, "kl": 0.310546875, "learning_rate": 9.069965870307167e-07, "loss": 0.0003, "reward": 1.6640625, "reward_std": 0.064340865239501, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6640625, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 90.51823043823242, "epoch": 1.1228668941979523, "grad_norm": 1.264366071691412, "kl": 0.330078125, "learning_rate": 9.06712172923777e-07, "loss": 0.0003, "reward": 1.705078125, "reward_std": 0.06199164502322674, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7076823115348816, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 89.60677337646484, "epoch": 1.1262798634812285, "grad_norm": 1.6685551976843742, "kl": 0.3544921875, "learning_rate": 9.064277588168373e-07, "loss": 0.0004, "reward": 1.6184896230697632, "reward_std": 0.10630683228373528, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.62109375, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 88.36979293823242, "epoch": 1.129692832764505, "grad_norm": 1.0475313302971008, "kl": 0.34765625, "learning_rate": 9.061433447098976e-07, "loss": 0.0003, "reward": 1.66796875, "reward_std": 0.07847016677260399, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66796875, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 88.47135543823242, "epoch": 1.1331058020477816, "grad_norm": 1.057356280239692, "kl": 0.3388671875, "learning_rate": 9.058589306029578e-07, "loss": 0.0003, "reward": 1.583984375, "reward_std": 0.07702643424272537, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 89.73698043823242, "epoch": 1.136518771331058, "grad_norm": 0.9116386468379839, "kl": 0.3564453125, "learning_rate": 9.055745164960181e-07, "loss": 0.0004, "reward": 1.599609375, "reward_std": 0.0955435186624527, "rewards/format_reward": 1.0, "rewards/score_reward": 0.599609375, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 91.61458587646484, "epoch": 1.1399317406143346, "grad_norm": 7.789995138414796, "kl": 0.33203125, "learning_rate": 9.052901023890785e-07, "loss": 0.0003, "reward": 1.580078125, "reward_std": 0.08064066246151924, "rewards/format_reward": 1.0, "rewards/score_reward": 0.580078125, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 90.10416793823242, "epoch": 1.1433447098976108, "grad_norm": 1.5398170582838697, "kl": 0.341796875, "learning_rate": 9.050056882821387e-07, "loss": 0.0003, "reward": 1.6341145634651184, "reward_std": 0.07193924859166145, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6341145932674408, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 89.75260925292969, "epoch": 1.1467576791808873, "grad_norm": 5.898407975769588, "kl": 0.3447265625, "learning_rate": 9.04721274175199e-07, "loss": 0.0003, "reward": 1.56640625, "reward_std": 0.11290653422474861, "rewards/format_reward": 1.0, "rewards/score_reward": 0.56640625, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 93.59896087646484, "epoch": 1.1501706484641638, "grad_norm": 1.5207039539823812, "kl": 0.333984375, "learning_rate": 9.044368600682594e-07, "loss": 0.0003, "reward": 1.5345052480697632, "reward_std": 0.09670136496424675, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5345052033662796, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 91.44531631469727, "epoch": 1.1535836177474403, "grad_norm": 1.2883446170932515, "kl": 0.361328125, "learning_rate": 9.041524459613196e-07, "loss": 0.0004, "reward": 1.5598958730697632, "reward_std": 0.07441498711705208, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5598958283662796, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 93.60417175292969, "epoch": 1.1569965870307168, "grad_norm": 1.4185591178425403, "kl": 0.3193359375, "learning_rate": 9.0386803185438e-07, "loss": 0.0003, "reward": 1.5631510615348816, "reward_std": 0.08493223041296005, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5631510466337204, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 92.82291793823242, "epoch": 1.1604095563139931, "grad_norm": 2.286145571815387, "kl": 0.3466796875, "learning_rate": 9.035836177474402e-07, "loss": 0.0003, "reward": 1.57421875, "reward_std": 0.06847423501312733, "rewards/format_reward": 1.0, "rewards/score_reward": 0.57421875, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 93.25000381469727, "epoch": 1.1638225255972696, "grad_norm": 1.1164049172689587, "kl": 0.3193359375, "learning_rate": 9.032992036405004e-07, "loss": 0.0003, "reward": 1.6223958730697632, "reward_std": 0.07818534225225449, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6223958134651184, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 94.06510543823242, "epoch": 1.1672354948805461, "grad_norm": 1.2906806372896535, "kl": 0.3291015625, "learning_rate": 9.030147895335608e-07, "loss": 0.0003, "reward": 1.62890625, "reward_std": 0.07450397312641144, "rewards/format_reward": 1.0, "rewards/score_reward": 0.62890625, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 91.64323043823242, "epoch": 1.1706484641638226, "grad_norm": 0.79574839793954, "kl": 0.341796875, "learning_rate": 9.027303754266211e-07, "loss": 0.0003, "reward": 1.533203125, "reward_std": 0.03980143740773201, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5358072966337204, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 91.34635543823242, "epoch": 1.174061433447099, "grad_norm": 0.9377296414246623, "kl": 0.326171875, "learning_rate": 9.024459613196815e-07, "loss": 0.0003, "reward": 1.712890625, "reward_std": 0.08788119815289974, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 93.48437881469727, "epoch": 1.1774744027303754, "grad_norm": 0.6523290118074977, "kl": 0.373046875, "learning_rate": 9.021615472127417e-07, "loss": 0.0004, "reward": 1.6744791865348816, "reward_std": 0.063651442527771, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791567325592, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 95.51562881469727, "epoch": 1.180887372013652, "grad_norm": 1.088043763316254, "kl": 0.3212890625, "learning_rate": 9.018771331058021e-07, "loss": 0.0003, "reward": 1.646484375, "reward_std": 0.06860671192407608, "rewards/format_reward": 1.0, "rewards/score_reward": 0.646484375, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 93.32031631469727, "epoch": 1.1843003412969284, "grad_norm": 1.7800517438781007, "kl": 0.322265625, "learning_rate": 9.015927189988624e-07, "loss": 0.0003, "reward": 1.5143229365348816, "reward_std": 0.10142895951867104, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5143229216337204, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 93.52604675292969, "epoch": 1.1877133105802047, "grad_norm": 1.304108240146414, "kl": 0.3369140625, "learning_rate": 9.013083048919225e-07, "loss": 0.0003, "reward": 1.5924479365348816, "reward_std": 0.10900428518652916, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5924479067325592, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 96.6640625, "epoch": 1.1911262798634812, "grad_norm": 1.6308991932345873, "kl": 0.310546875, "learning_rate": 9.010238907849829e-07, "loss": 0.0003, "reward": 1.548828125, "reward_std": 0.14025410264730453, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5488281399011612, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 95.38281631469727, "epoch": 1.1945392491467577, "grad_norm": 0.7421903759615947, "kl": 0.31640625, "learning_rate": 9.007394766780432e-07, "loss": 0.0003, "reward": 1.7350260019302368, "reward_std": 0.0697450339794159, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260615348816, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 96.14583587646484, "epoch": 1.1979522184300342, "grad_norm": 0.7816870932113442, "kl": 0.3046875, "learning_rate": 9.004550625711035e-07, "loss": 0.0003, "reward": 1.697265625, "reward_std": 0.08189758285880089, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6998697817325592, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 96.796875, "epoch": 1.2013651877133107, "grad_norm": 1.464503637326796, "kl": 0.322265625, "learning_rate": 9.001706484641638e-07, "loss": 0.0003, "reward": 1.5852864980697632, "reward_std": 0.09937797859311104, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5852864384651184, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 98.77864837646484, "epoch": 1.204778156996587, "grad_norm": 1.3512194299638194, "kl": 0.3125, "learning_rate": 8.998862343572241e-07, "loss": 0.0003, "reward": 1.732421875, "reward_std": 0.08107856288552284, "rewards/format_reward": 1.0, "rewards/score_reward": 0.732421875, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 99.74479293823242, "epoch": 1.2081911262798635, "grad_norm": 5.0256861248761915, "kl": 0.2998046875, "learning_rate": 8.996018202502844e-07, "loss": 0.0003, "reward": 1.6516926884651184, "reward_std": 0.11081887409090996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6516927182674408, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 100.01041793823242, "epoch": 1.21160409556314, "grad_norm": 1.460015180461731, "kl": 0.318359375, "learning_rate": 8.993174061433446e-07, "loss": 0.0003, "reward": 1.6575520634651184, "reward_std": 0.10313650593161583, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6575520932674408, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 99.5546875, "epoch": 1.2150170648464165, "grad_norm": 1.1448312974219836, "kl": 0.3330078125, "learning_rate": 8.99032992036405e-07, "loss": 0.0003, "reward": 1.5631510019302368, "reward_std": 0.14101306721568108, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5631510615348816, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 98.85937881469727, "epoch": 1.2184300341296928, "grad_norm": 2.1091494531135684, "kl": 0.33984375, "learning_rate": 8.987485779294652e-07, "loss": 0.0003, "reward": 1.6373698115348816, "reward_std": 0.13265039026737213, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6373697817325592, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 101.90625381469727, "epoch": 1.2218430034129693, "grad_norm": 1.139476204759664, "kl": 0.3154296875, "learning_rate": 8.984641638225255e-07, "loss": 0.0003, "reward": 1.5852864384651184, "reward_std": 0.0712361391633749, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5852864682674408, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 100.125, "epoch": 1.2252559726962458, "grad_norm": 1.6368521383934038, "kl": 0.3076171875, "learning_rate": 8.981797497155859e-07, "loss": 0.0003, "reward": 1.64453125, "reward_std": 0.10514185950160027, "rewards/format_reward": 1.0, "rewards/score_reward": 0.64453125, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 102.796875, "epoch": 1.2286689419795223, "grad_norm": 1.3931892254627536, "kl": 0.302734375, "learning_rate": 8.978953356086461e-07, "loss": 0.0003, "reward": 1.6321614384651184, "reward_std": 0.1321612298488617, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6321614682674408, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 100.30208587646484, "epoch": 1.2320819112627985, "grad_norm": 1.395302200148577, "kl": 0.3466796875, "learning_rate": 8.976109215017065e-07, "loss": 0.0003, "reward": 1.6067708134651184, "reward_std": 0.10510212928056717, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708432674408, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 106.75000381469727, "epoch": 1.235494880546075, "grad_norm": 1.4526377255555443, "kl": 0.28515625, "learning_rate": 8.973265073947668e-07, "loss": 0.0003, "reward": 1.673828125, "reward_std": 0.08640624955296516, "rewards/format_reward": 1.0, "rewards/score_reward": 0.673828125, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 100.28385543823242, "epoch": 1.2389078498293515, "grad_norm": 0.7947405543121799, "kl": 0.3251953125, "learning_rate": 8.970420932878269e-07, "loss": 0.0003, "reward": 1.5911458134651184, "reward_std": 0.07372437790036201, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5911458432674408, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 101.82291793823242, "epoch": 1.242320819112628, "grad_norm": 1.1864292643507481, "kl": 0.3193359375, "learning_rate": 8.967576791808873e-07, "loss": 0.0003, "reward": 1.5807291269302368, "reward_std": 0.15902487933635712, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5807291865348816, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 103.25521087646484, "epoch": 1.2457337883959045, "grad_norm": 0.5698746328530475, "kl": 0.30859375, "learning_rate": 8.964732650739476e-07, "loss": 0.0003, "reward": 1.6803385019302368, "reward_std": 0.06285369955003262, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6803385615348816, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 102.79687881469727, "epoch": 1.2491467576791808, "grad_norm": 1.9970254660975333, "kl": 0.3330078125, "learning_rate": 8.96188850967008e-07, "loss": 0.0003, "reward": 1.689453125, "reward_std": 0.1364504098892212, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 106.21875, "epoch": 1.2525597269624573, "grad_norm": 0.7755135658545669, "kl": 0.3173828125, "learning_rate": 8.959044368600682e-07, "loss": 0.0003, "reward": 1.66796875, "reward_std": 0.07290705293416977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66796875, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 106.02083587646484, "epoch": 1.2559726962457338, "grad_norm": 0.6394973062433026, "kl": 0.3037109375, "learning_rate": 8.956200227531285e-07, "loss": 0.0003, "reward": 1.7376302480697632, "reward_std": 0.05532081238925457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7376301884651184, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 106.96875381469727, "epoch": 1.25938566552901, "grad_norm": 1.518152493441197, "kl": 0.3173828125, "learning_rate": 8.953356086461889e-07, "loss": 0.0003, "reward": 1.6595051884651184, "reward_std": 0.1121549978852272, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 110.45833587646484, "epoch": 1.2627986348122868, "grad_norm": 1.7122090519174835, "kl": 0.3095703125, "learning_rate": 8.950511945392491e-07, "loss": 0.0003, "reward": 1.703125, "reward_std": 0.08931688219308853, "rewards/format_reward": 1.0, "rewards/score_reward": 0.703125, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 108.06510543823242, "epoch": 1.266211604095563, "grad_norm": 1.1610791478202611, "kl": 0.3076171875, "learning_rate": 8.947667804323094e-07, "loss": 0.0003, "reward": 1.6302083134651184, "reward_std": 0.09186108037829399, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6302083432674408, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 109.20052337646484, "epoch": 1.2696245733788396, "grad_norm": 1.265366861875569, "kl": 0.30078125, "learning_rate": 8.944823663253697e-07, "loss": 0.0003, "reward": 1.66015625, "reward_std": 0.08011146634817123, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 110.27083587646484, "epoch": 1.273037542662116, "grad_norm": 0.8689025299668102, "kl": 0.310546875, "learning_rate": 8.941979522184299e-07, "loss": 0.0003, "reward": 1.76953125, "reward_std": 0.08278562501072884, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76953125, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 109.25521087646484, "epoch": 1.2764505119453924, "grad_norm": 2.0567670716813886, "kl": 0.2958984375, "learning_rate": 8.939135381114903e-07, "loss": 0.0003, "reward": 1.5240885615348816, "reward_std": 0.09690434858202934, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5266927182674408, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 106.97917175292969, "epoch": 1.2798634812286689, "grad_norm": 1.3712222085351589, "kl": 0.4013671875, "learning_rate": 8.936291240045506e-07, "loss": 0.0004, "reward": 1.697265625, "reward_std": 0.0807935781776905, "rewards/format_reward": 1.0, "rewards/score_reward": 0.697265625, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 108.08854293823242, "epoch": 1.2832764505119454, "grad_norm": 0.9882147341101891, "kl": 0.3076171875, "learning_rate": 8.933447098976109e-07, "loss": 0.0003, "reward": 1.640625, "reward_std": 0.09115936048328876, "rewards/format_reward": 1.0, "rewards/score_reward": 0.640625, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 107.69010925292969, "epoch": 1.286689419795222, "grad_norm": 0.952102731405316, "kl": 0.3017578125, "learning_rate": 8.930602957906712e-07, "loss": 0.0003, "reward": 1.6048176884651184, "reward_std": 0.0842303428798914, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6048177182674408, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 108.83333587646484, "epoch": 1.2901023890784984, "grad_norm": 1.4769470794909119, "kl": 0.3046875, "learning_rate": 8.927758816837315e-07, "loss": 0.0003, "reward": 1.7669270634651184, "reward_std": 0.11623643338680267, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270932674408, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 109.43229293823242, "epoch": 1.2935153583617747, "grad_norm": 1.5996238264880085, "kl": 0.3173828125, "learning_rate": 8.924914675767917e-07, "loss": 0.0003, "reward": 1.6354166865348816, "reward_std": 0.11213107779622078, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6354166567325592, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 110.13542175292969, "epoch": 1.2969283276450512, "grad_norm": 1.0546129793709467, "kl": 0.310546875, "learning_rate": 8.92207053469852e-07, "loss": 0.0003, "reward": 1.634765625, "reward_std": 0.10719964653253555, "rewards/format_reward": 1.0, "rewards/score_reward": 0.634765625, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 109.37239837646484, "epoch": 1.3003412969283277, "grad_norm": 2.8740670072622274, "kl": 0.3583984375, "learning_rate": 8.919226393629124e-07, "loss": 0.0004, "reward": 1.6197916269302368, "reward_std": 0.0862240307033062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6197916865348816, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 110.94271087646484, "epoch": 1.3037542662116042, "grad_norm": 1.0792941298922691, "kl": 0.3076171875, "learning_rate": 8.916382252559726e-07, "loss": 0.0003, "reward": 1.640625, "reward_std": 0.09019815176725388, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6432291865348816, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 110.65885925292969, "epoch": 1.3071672354948807, "grad_norm": 0.9535194485254701, "kl": 0.3125, "learning_rate": 8.91353811149033e-07, "loss": 0.0003, "reward": 1.583984375, "reward_std": 0.07940917834639549, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5839843600988388, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 109.01823425292969, "epoch": 1.310580204778157, "grad_norm": 2.582584932627284, "kl": 0.3330078125, "learning_rate": 8.910693970420933e-07, "loss": 0.0003, "reward": 1.5169271230697632, "reward_std": 0.11989713460206985, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5169270783662796, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 110.97916793823242, "epoch": 1.3139931740614335, "grad_norm": 1.6412407114675462, "kl": 0.3232421875, "learning_rate": 8.907849829351535e-07, "loss": 0.0003, "reward": 1.6555989980697632, "reward_std": 0.14920351654291153, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989384651184, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 109.86719131469727, "epoch": 1.31740614334471, "grad_norm": 2.8528116804495713, "kl": 0.3046875, "learning_rate": 8.905005688282139e-07, "loss": 0.0003, "reward": 1.5546875, "reward_std": 0.05937940999865532, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5546875, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 112.27344131469727, "epoch": 1.3208191126279862, "grad_norm": 1.807442907735358, "kl": 0.3017578125, "learning_rate": 8.902161547212741e-07, "loss": 0.0003, "reward": 1.6692708730697632, "reward_std": 0.12294073030352592, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6692708134651184, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 112.96094131469727, "epoch": 1.3242320819112627, "grad_norm": 0.9742098157040858, "kl": 0.2900390625, "learning_rate": 8.899317406143345e-07, "loss": 0.0003, "reward": 1.5833333134651184, "reward_std": 0.10687696933746338, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5833333432674408, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 110.12239837646484, "epoch": 1.3276450511945392, "grad_norm": 1.0811882200937308, "kl": 0.310546875, "learning_rate": 8.896473265073947e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.11106470599770546, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 110.72396087646484, "epoch": 1.3310580204778157, "grad_norm": 1.1129741398020574, "kl": 0.291015625, "learning_rate": 8.89362912400455e-07, "loss": 0.0003, "reward": 1.712890625, "reward_std": 0.11653907597064972, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 112.08854675292969, "epoch": 1.3344709897610922, "grad_norm": 4.6635508934562715, "kl": 0.3046875, "learning_rate": 8.890784982935154e-07, "loss": 0.0003, "reward": 1.728515625, "reward_std": 0.0908365286886692, "rewards/format_reward": 1.0, "rewards/score_reward": 0.728515625, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 109.36719131469727, "epoch": 1.3378839590443685, "grad_norm": 0.8702425155899015, "kl": 0.3037109375, "learning_rate": 8.887940841865756e-07, "loss": 0.0003, "reward": 1.6393228769302368, "reward_std": 0.07395935244858265, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6393229365348816, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 109.12239837646484, "epoch": 1.341296928327645, "grad_norm": 1.7286230494669028, "kl": 0.3388671875, "learning_rate": 8.88509670079636e-07, "loss": 0.0003, "reward": 1.62109375, "reward_std": 0.12561412528157234, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6236979067325592, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 111.02604293823242, "epoch": 1.3447098976109215, "grad_norm": 1.6221723660147427, "kl": 0.275390625, "learning_rate": 8.882252559726962e-07, "loss": 0.0003, "reward": 1.6790364980697632, "reward_std": 0.1091359406709671, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364384651184, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 109.58333587646484, "epoch": 1.348122866894198, "grad_norm": 0.9487148878030691, "kl": 0.2958984375, "learning_rate": 8.879408418657564e-07, "loss": 0.0003, "reward": 1.611328125, "reward_std": 0.08767728880047798, "rewards/format_reward": 1.0, "rewards/score_reward": 0.611328125, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 111.08333587646484, "epoch": 1.3515358361774745, "grad_norm": 7.717115217492057, "kl": 0.2861328125, "learning_rate": 8.876564277588168e-07, "loss": 0.0003, "reward": 1.6328125, "reward_std": 0.08776052109897137, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6328125, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 108.66927337646484, "epoch": 1.3549488054607508, "grad_norm": 1.6651789097724212, "kl": 0.2978515625, "learning_rate": 8.873720136518771e-07, "loss": 0.0003, "reward": 1.5657551884651184, "reward_std": 0.0652086939662695, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5657552182674408, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 112.04687881469727, "epoch": 1.3583617747440273, "grad_norm": 1.8997455571178061, "kl": 0.287109375, "learning_rate": 8.870875995449374e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.09427125565707684, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 111.22916793823242, "epoch": 1.3617747440273038, "grad_norm": 0.9949928565148652, "kl": 0.2919921875, "learning_rate": 8.868031854379977e-07, "loss": 0.0003, "reward": 1.748046875, "reward_std": 0.09622831642627716, "rewards/format_reward": 1.0, "rewards/score_reward": 0.748046875, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 111.55729293823242, "epoch": 1.36518771331058, "grad_norm": 0.7957167157318042, "kl": 0.2900390625, "learning_rate": 8.86518771331058e-07, "loss": 0.0003, "reward": 1.7311198115348816, "reward_std": 0.08022425323724747, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 109.70052337646484, "epoch": 1.3686006825938566, "grad_norm": 1.3707076165843584, "kl": 0.318359375, "learning_rate": 8.862343572241183e-07, "loss": 0.0003, "reward": 1.6393228769302368, "reward_std": 0.11868953704833984, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6393229365348816, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 107.40364837646484, "epoch": 1.372013651877133, "grad_norm": 1.1094062047764173, "kl": 0.3037109375, "learning_rate": 8.859499431171785e-07, "loss": 0.0003, "reward": 1.6614583134651184, "reward_std": 0.08151458203792572, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6614583432674408, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 107.32291793823242, "epoch": 1.3754266211604096, "grad_norm": 1.4842318895765294, "kl": 0.3330078125, "learning_rate": 8.856655290102389e-07, "loss": 0.0003, "reward": 1.6119791865348816, "reward_std": 0.1342332512140274, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6119791567325592, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 105.27604675292969, "epoch": 1.378839590443686, "grad_norm": 0.7615450386030306, "kl": 0.3056640625, "learning_rate": 8.853811149032991e-07, "loss": 0.0003, "reward": 1.6399739980697632, "reward_std": 0.06917358562350273, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6399739384651184, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 108.92448425292969, "epoch": 1.3822525597269624, "grad_norm": 0.7036920245988518, "kl": 0.3037109375, "learning_rate": 8.850967007963594e-07, "loss": 0.0003, "reward": 1.6809895634651184, "reward_std": 0.07711337134242058, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6809895932674408, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 108.39583587646484, "epoch": 1.3856655290102389, "grad_norm": 1.26330612755144, "kl": 0.3525390625, "learning_rate": 8.848122866894198e-07, "loss": 0.0004, "reward": 1.6608073115348816, "reward_std": 0.09284770861268044, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6608072817325592, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 107.53906631469727, "epoch": 1.3890784982935154, "grad_norm": 2.9069905071345485, "kl": 0.2998046875, "learning_rate": 8.8452787258248e-07, "loss": 0.0003, "reward": 1.6640625, "reward_std": 0.12088547274470329, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6640625, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 106.86458587646484, "epoch": 1.3924914675767919, "grad_norm": 0.9564060002468044, "kl": 0.333984375, "learning_rate": 8.842434584755404e-07, "loss": 0.0003, "reward": 1.650390625, "reward_std": 0.07424523681402206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.650390625, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 109.5, "epoch": 1.3959044368600684, "grad_norm": 0.9275331537494872, "kl": 0.296875, "learning_rate": 8.839590443686007e-07, "loss": 0.0003, "reward": 1.5651041269302368, "reward_std": 0.07928121648728848, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5651041865348816, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 107.49479293823242, "epoch": 1.3993174061433447, "grad_norm": 0.9673103482672438, "kl": 0.361328125, "learning_rate": 8.836746302616608e-07, "loss": 0.0004, "reward": 1.6529947519302368, "reward_std": 0.0672699362039566, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6529948115348816, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 105.8359375, "epoch": 1.4027303754266212, "grad_norm": 1.2966828674815594, "kl": 0.31640625, "learning_rate": 8.833902161547212e-07, "loss": 0.0003, "reward": 1.6223958134651184, "reward_std": 0.1244407631456852, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6223958432674408, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 100.74219131469727, "epoch": 1.4061433447098977, "grad_norm": 1.3633538914819374, "kl": 0.31640625, "learning_rate": 8.831058020477815e-07, "loss": 0.0003, "reward": 1.6555989384651184, "reward_std": 0.1606348678469658, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989682674408, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 105.40885925292969, "epoch": 1.409556313993174, "grad_norm": 1.258028402558139, "kl": 0.3115234375, "learning_rate": 8.828213879408419e-07, "loss": 0.0003, "reward": 1.5774739384651184, "reward_std": 0.08842276781797409, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5774739682674408, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 105.140625, "epoch": 1.4129692832764504, "grad_norm": 0.7272648971585978, "kl": 0.3291015625, "learning_rate": 8.825369738339021e-07, "loss": 0.0003, "reward": 1.6412760615348816, "reward_std": 0.07299699075520039, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6412760317325592, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 103.81771087646484, "epoch": 1.416382252559727, "grad_norm": 2.119887850909003, "kl": 0.3134765625, "learning_rate": 8.822525597269624e-07, "loss": 0.0003, "reward": 1.6588541269302368, "reward_std": 0.0896218903362751, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541865348816, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 103.45312881469727, "epoch": 1.4197952218430034, "grad_norm": 2.2315366028053325, "kl": 0.3203125, "learning_rate": 8.819681456200228e-07, "loss": 0.0003, "reward": 1.673828125, "reward_std": 0.08890458568930626, "rewards/format_reward": 1.0, "rewards/score_reward": 0.673828125, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 103.51302337646484, "epoch": 1.42320819112628, "grad_norm": 1.1528148299423515, "kl": 0.302734375, "learning_rate": 8.81683731513083e-07, "loss": 0.0003, "reward": 1.6647135615348816, "reward_std": 0.13601423799991608, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6647135317325592, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 103.28125, "epoch": 1.4266211604095562, "grad_norm": 1.216405946754455, "kl": 0.3505859375, "learning_rate": 8.813993174061433e-07, "loss": 0.0004, "reward": 1.6985676884651184, "reward_std": 0.07471911236643791, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6985677182674408, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 101.81510543823242, "epoch": 1.4300341296928327, "grad_norm": 0.8379853821936692, "kl": 0.3154296875, "learning_rate": 8.811149032992036e-07, "loss": 0.0003, "reward": 1.7141927480697632, "reward_std": 0.058109262492507696, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7141926884651184, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 101.66666793823242, "epoch": 1.4334470989761092, "grad_norm": 1.2816355583709123, "kl": 0.30859375, "learning_rate": 8.808304891922638e-07, "loss": 0.0003, "reward": 1.6100260615348816, "reward_std": 0.08390820398926735, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6100260317325592, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 101.44271087646484, "epoch": 1.4368600682593857, "grad_norm": 1.0214765901042817, "kl": 0.3056640625, "learning_rate": 8.805460750853242e-07, "loss": 0.0003, "reward": 1.6477864980697632, "reward_std": 0.08100602775812149, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6477864384651184, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 100.67187881469727, "epoch": 1.4402730375426622, "grad_norm": 1.696061660184675, "kl": 0.34375, "learning_rate": 8.802616609783845e-07, "loss": 0.0003, "reward": 1.6106770634651184, "reward_std": 0.0863872841000557, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6106770932674408, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 100.02083587646484, "epoch": 1.4436860068259385, "grad_norm": 3.2257993740571242, "kl": 0.3115234375, "learning_rate": 8.799772468714448e-07, "loss": 0.0003, "reward": 1.6315104365348816, "reward_std": 0.09658884257078171, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6341145932674408, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 98.23698043823242, "epoch": 1.447098976109215, "grad_norm": 1.739053849853806, "kl": 0.3408203125, "learning_rate": 8.796928327645051e-07, "loss": 0.0003, "reward": 1.5794270634651184, "reward_std": 0.09072646871209145, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5794270932674408, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 100.41927337646484, "epoch": 1.4505119453924915, "grad_norm": 1.079195928719097, "kl": 0.33203125, "learning_rate": 8.794084186575653e-07, "loss": 0.0003, "reward": 1.66015625, "reward_std": 0.11672064289450645, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 97.03125381469727, "epoch": 1.4539249146757678, "grad_norm": 2.512653325961497, "kl": 0.337890625, "learning_rate": 8.791240045506256e-07, "loss": 0.0003, "reward": 1.6744791865348816, "reward_std": 0.1383742317557335, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791567325592, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 100.82291793823242, "epoch": 1.4573378839590443, "grad_norm": 1.7885118249073315, "kl": 0.3369140625, "learning_rate": 8.788395904436859e-07, "loss": 0.0003, "reward": 1.578125, "reward_std": 0.09837279468774796, "rewards/format_reward": 1.0, "rewards/score_reward": 0.578125, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 97.90625, "epoch": 1.4607508532423208, "grad_norm": 1.1751384319288967, "kl": 0.33203125, "learning_rate": 8.785551763367463e-07, "loss": 0.0003, "reward": 1.5891926884651184, "reward_std": 0.0992475338280201, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5891927182674408, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 97.56510543823242, "epoch": 1.4641638225255973, "grad_norm": 0.9357494951912841, "kl": 0.34375, "learning_rate": 8.782707622298066e-07, "loss": 0.0003, "reward": 1.583984375, "reward_std": 0.07640949636697769, "rewards/format_reward": 1.0, "rewards/score_reward": 0.583984375, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 97.07291793823242, "epoch": 1.4675767918088738, "grad_norm": 1.0387693293703266, "kl": 0.359375, "learning_rate": 8.779863481228669e-07, "loss": 0.0004, "reward": 1.6907552480697632, "reward_std": 0.08697622641921043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907551884651184, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 98.84114837646484, "epoch": 1.47098976109215, "grad_norm": 1.3609326320620856, "kl": 0.3505859375, "learning_rate": 8.777019340159272e-07, "loss": 0.0004, "reward": 1.603515625, "reward_std": 0.06718259863555431, "rewards/format_reward": 1.0, "rewards/score_reward": 0.603515625, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 98.12239837646484, "epoch": 1.4744027303754266, "grad_norm": 0.8396345405779416, "kl": 0.375, "learning_rate": 8.774175199089875e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.06503665260970592, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 95.30989837646484, "epoch": 1.477815699658703, "grad_norm": 0.8490022617601793, "kl": 0.396484375, "learning_rate": 8.771331058020477e-07, "loss": 0.0004, "reward": 1.6842448115348816, "reward_std": 0.06424054130911827, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6842447817325592, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 96.51042175292969, "epoch": 1.4812286689419796, "grad_norm": 0.908958424656608, "kl": 0.328125, "learning_rate": 8.76848691695108e-07, "loss": 0.0003, "reward": 1.6595051884651184, "reward_std": 0.0826345905661583, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 93.32031631469727, "epoch": 1.484641638225256, "grad_norm": 1.0107589365877434, "kl": 0.3525390625, "learning_rate": 8.765642775881684e-07, "loss": 0.0004, "reward": 1.6822916269302368, "reward_std": 0.0878087654709816, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6822916865348816, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 93.75781631469727, "epoch": 1.4880546075085324, "grad_norm": 1.539039271270074, "kl": 0.3447265625, "learning_rate": 8.762798634812286e-07, "loss": 0.0003, "reward": 1.6217447519302368, "reward_std": 0.08094730414450169, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6217448115348816, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 94.01302337646484, "epoch": 1.4914675767918089, "grad_norm": 1.3574929234613924, "kl": 0.3408203125, "learning_rate": 8.759954493742889e-07, "loss": 0.0003, "reward": 1.603515625, "reward_std": 0.09326329827308655, "rewards/format_reward": 1.0, "rewards/score_reward": 0.603515625, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 95.01041793823242, "epoch": 1.4948805460750854, "grad_norm": 1.00532468230596, "kl": 0.3291015625, "learning_rate": 8.757110352673493e-07, "loss": 0.0003, "reward": 1.6145833134651184, "reward_std": 0.07846951484680176, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6145833432674408, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 94.16666793823242, "epoch": 1.4982935153583616, "grad_norm": 1.0173549368627994, "kl": 0.3203125, "learning_rate": 8.754266211604095e-07, "loss": 0.0003, "reward": 1.720703125, "reward_std": 0.08181869983673096, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720703125, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 93.39062881469727, "epoch": 1.5017064846416384, "grad_norm": 0.9283570945449268, "kl": 0.330078125, "learning_rate": 8.751422070534699e-07, "loss": 0.0003, "reward": 1.6393229365348816, "reward_std": 0.052566275000572205, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6419270634651184, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 95.2265625, "epoch": 1.5051194539249146, "grad_norm": 1.0196713631893406, "kl": 0.3232421875, "learning_rate": 8.748577929465301e-07, "loss": 0.0003, "reward": 1.7311198115348816, "reward_std": 0.08464887738227844, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 93.4453125, "epoch": 1.5085324232081911, "grad_norm": 1.1315168550060561, "kl": 0.357421875, "learning_rate": 8.745733788395903e-07, "loss": 0.0004, "reward": 1.634765625, "reward_std": 0.08270996063947678, "rewards/format_reward": 1.0, "rewards/score_reward": 0.634765625, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 90.75781631469727, "epoch": 1.5119453924914676, "grad_norm": 1.2422532181707782, "kl": 0.337890625, "learning_rate": 8.742889647326507e-07, "loss": 0.0003, "reward": 1.6907551884651184, "reward_std": 0.0996636152267456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 93.14062881469727, "epoch": 1.515358361774744, "grad_norm": 6.436457912100031, "kl": 0.3193359375, "learning_rate": 8.74004550625711e-07, "loss": 0.0003, "reward": 1.7298177480697632, "reward_std": 0.07599178329110146, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298176884651184, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 90.97135543823242, "epoch": 1.5187713310580204, "grad_norm": 3.3258646584076073, "kl": 0.34375, "learning_rate": 8.737201365187713e-07, "loss": 0.0003, "reward": 1.607421875, "reward_std": 0.12124079465866089, "rewards/format_reward": 1.0, "rewards/score_reward": 0.607421875, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 90.54427337646484, "epoch": 1.522184300341297, "grad_norm": 1.2568407008026043, "kl": 0.33203125, "learning_rate": 8.734357224118316e-07, "loss": 0.0003, "reward": 1.6067708134651184, "reward_std": 0.06372088007628918, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708432674408, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 89.73437881469727, "epoch": 1.5255972696245734, "grad_norm": 1.8485015892885457, "kl": 0.357421875, "learning_rate": 8.731513083048919e-07, "loss": 0.0004, "reward": 1.6549479365348816, "reward_std": 0.1027575209736824, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6575520932674408, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 90.41406631469727, "epoch": 1.52901023890785, "grad_norm": 1.15112383485338, "kl": 0.3388671875, "learning_rate": 8.728668941979523e-07, "loss": 0.0003, "reward": 1.6790364980697632, "reward_std": 0.09449188783764839, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364384651184, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 89.09375, "epoch": 1.5324232081911262, "grad_norm": 0.9661115337179529, "kl": 0.333984375, "learning_rate": 8.725824800910124e-07, "loss": 0.0003, "reward": 1.5865885615348816, "reward_std": 0.06768830306828022, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5865885317325592, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 88.43489837646484, "epoch": 1.5358361774744027, "grad_norm": 1.3294506445247756, "kl": 0.3427734375, "learning_rate": 8.722980659840728e-07, "loss": 0.0003, "reward": 1.619140625, "reward_std": 0.06643434800207615, "rewards/format_reward": 1.0, "rewards/score_reward": 0.619140625, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 87.63542175292969, "epoch": 1.5392491467576792, "grad_norm": 5.666629774548189, "kl": 0.341796875, "learning_rate": 8.72013651877133e-07, "loss": 0.0003, "reward": 1.6067708730697632, "reward_std": 0.08958297967910767, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708134651184, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 87.64583587646484, "epoch": 1.5426621160409555, "grad_norm": 1.335406065956223, "kl": 0.3427734375, "learning_rate": 8.717292377701933e-07, "loss": 0.0003, "reward": 1.66015625, "reward_std": 0.07359911035746336, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 86.63802337646484, "epoch": 1.5460750853242322, "grad_norm": 1.054854585254263, "kl": 0.3447265625, "learning_rate": 8.714448236632537e-07, "loss": 0.0003, "reward": 1.6009114980697632, "reward_std": 0.06845445558428764, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6009114533662796, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 84.82292175292969, "epoch": 1.5494880546075085, "grad_norm": 1.8388508463176552, "kl": 0.3701171875, "learning_rate": 8.71160409556314e-07, "loss": 0.0004, "reward": 1.673828125, "reward_std": 0.06641644425690174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.673828125, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 84.31771087646484, "epoch": 1.552901023890785, "grad_norm": 1.072230324369192, "kl": 0.3447265625, "learning_rate": 8.708759954493743e-07, "loss": 0.0003, "reward": 1.66796875, "reward_std": 0.06862568110227585, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66796875, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 82.83073043823242, "epoch": 1.5563139931740615, "grad_norm": 1.180107697656908, "kl": 0.353515625, "learning_rate": 8.705915813424346e-07, "loss": 0.0004, "reward": 1.6243489980697632, "reward_std": 0.04532465152442455, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6243489384651184, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 82.85677337646484, "epoch": 1.5597269624573378, "grad_norm": 1.7754135433440812, "kl": 0.3505859375, "learning_rate": 8.703071672354948e-07, "loss": 0.0004, "reward": 1.7877604365348816, "reward_std": 0.09606852009892464, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7877604067325592, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 82.76823043823242, "epoch": 1.5631399317406145, "grad_norm": 3.9701751677255737, "kl": 0.3544921875, "learning_rate": 8.700227531285551e-07, "loss": 0.0004, "reward": 1.638671875, "reward_std": 0.06023379787802696, "rewards/format_reward": 1.0, "rewards/score_reward": 0.638671875, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 81.53385543823242, "epoch": 1.5665529010238908, "grad_norm": 2.916363373364193, "kl": 0.3720703125, "learning_rate": 8.697383390216154e-07, "loss": 0.0004, "reward": 1.7259114384651184, "reward_std": 0.038537174463272095, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7259114682674408, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 83.97916793823242, "epoch": 1.5699658703071673, "grad_norm": 3.6744281674192183, "kl": 0.3681640625, "learning_rate": 8.694539249146758e-07, "loss": 0.0004, "reward": 1.6985676884651184, "reward_std": 0.06125957425683737, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6985677182674408, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 85.80469131469727, "epoch": 1.5733788395904438, "grad_norm": 1.6706492021836636, "kl": 0.3466796875, "learning_rate": 8.69169510807736e-07, "loss": 0.0003, "reward": 1.6608073115348816, "reward_std": 0.06667371653020382, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6608072817325592, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 82.35416793823242, "epoch": 1.57679180887372, "grad_norm": 6.502984624084201, "kl": 0.37109375, "learning_rate": 8.688850967007963e-07, "loss": 0.0004, "reward": 1.7740885615348816, "reward_std": 0.07244019024074078, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7740885317325592, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 82.09114837646484, "epoch": 1.5802047781569966, "grad_norm": 0.7182214144674386, "kl": 0.3740234375, "learning_rate": 8.686006825938567e-07, "loss": 0.0004, "reward": 1.634765625, "reward_std": 0.04806549474596977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.634765625, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 82.33594131469727, "epoch": 1.583617747440273, "grad_norm": 1.32829855791013, "kl": 0.3544921875, "learning_rate": 8.683162684869168e-07, "loss": 0.0004, "reward": 1.6966145634651184, "reward_std": 0.025380939245224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145932674408, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 81.80208587646484, "epoch": 1.5870307167235493, "grad_norm": 15.013223562771197, "kl": 0.384765625, "learning_rate": 8.680318543799772e-07, "loss": 0.0004, "reward": 1.5924479365348816, "reward_std": 0.039695436134934425, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5924479067325592, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 82.40104675292969, "epoch": 1.590443686006826, "grad_norm": 1.3308637101097924, "kl": 0.3828125, "learning_rate": 8.677474402730375e-07, "loss": 0.0004, "reward": 1.5865885615348816, "reward_std": 0.09966214373707771, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5865885317325592, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 83.1171875, "epoch": 1.5938566552901023, "grad_norm": 0.9507261132850247, "kl": 0.359375, "learning_rate": 8.674630261660977e-07, "loss": 0.0004, "reward": 1.6920573115348816, "reward_std": 0.04550305753946304, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920572817325592, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 84.10416793823242, "epoch": 1.5972696245733788, "grad_norm": 1.263398755724932, "kl": 0.3779296875, "learning_rate": 8.671786120591581e-07, "loss": 0.0004, "reward": 1.6341145634651184, "reward_std": 0.05995068699121475, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6341145932674408, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 83.75260543823242, "epoch": 1.6006825938566553, "grad_norm": 0.9490079681990795, "kl": 0.4833984375, "learning_rate": 8.668941979522184e-07, "loss": 0.0005, "reward": 1.693359375, "reward_std": 0.06411881186068058, "rewards/format_reward": 1.0, "rewards/score_reward": 0.693359375, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 84.26302337646484, "epoch": 1.6040955631399316, "grad_norm": 2.049697038538439, "kl": 0.373046875, "learning_rate": 8.666097838452788e-07, "loss": 0.0004, "reward": 1.6243489384651184, "reward_std": 0.06728849187493324, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6243489682674408, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 82.67448425292969, "epoch": 1.6075085324232083, "grad_norm": 1.3633076510857585, "kl": 0.373046875, "learning_rate": 8.66325369738339e-07, "loss": 0.0004, "reward": 1.669921875, "reward_std": 0.0521214883774519, "rewards/format_reward": 1.0, "rewards/score_reward": 0.669921875, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 83.67708587646484, "epoch": 1.6109215017064846, "grad_norm": 2.3312934200713054, "kl": 0.373046875, "learning_rate": 8.660409556313992e-07, "loss": 0.0004, "reward": 1.6380208134651184, "reward_std": 0.08995531126856804, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6380208432674408, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 84.12500381469727, "epoch": 1.6143344709897611, "grad_norm": 1.0504251363167936, "kl": 0.357421875, "learning_rate": 8.657565415244596e-07, "loss": 0.0004, "reward": 1.57421875, "reward_std": 0.06437285803258419, "rewards/format_reward": 1.0, "rewards/score_reward": 0.57421875, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 83.38541793823242, "epoch": 1.6177474402730376, "grad_norm": 2.022646171401982, "kl": 0.3759765625, "learning_rate": 8.654721274175198e-07, "loss": 0.0004, "reward": 1.6634114980697632, "reward_std": 0.07706599682569504, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6634114384651184, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 81.60937881469727, "epoch": 1.621160409556314, "grad_norm": 0.7934139816389911, "kl": 0.380859375, "learning_rate": 8.651877133105802e-07, "loss": 0.0004, "reward": 1.7473958730697632, "reward_std": 0.035674863029271364, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958134651184, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 85.07292175292969, "epoch": 1.6245733788395904, "grad_norm": 1.6077990206281476, "kl": 0.359375, "learning_rate": 8.649032992036405e-07, "loss": 0.0004, "reward": 1.6380208730697632, "reward_std": 0.09933283552527428, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6380208134651184, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 84.54166793823242, "epoch": 1.627986348122867, "grad_norm": 1.8064468460532965, "kl": 0.3681640625, "learning_rate": 8.646188850967008e-07, "loss": 0.0004, "reward": 1.6705729365348816, "reward_std": 0.04635761398822069, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6705729067325592, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 84.00000381469727, "epoch": 1.6313993174061432, "grad_norm": 1.062766079869125, "kl": 0.3671875, "learning_rate": 8.643344709897611e-07, "loss": 0.0004, "reward": 1.599609375, "reward_std": 0.06615681946277618, "rewards/format_reward": 1.0, "rewards/score_reward": 0.599609375, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 85.08594131469727, "epoch": 1.63481228668942, "grad_norm": 0.8540821698057083, "kl": 0.37109375, "learning_rate": 8.640500568828214e-07, "loss": 0.0004, "reward": 1.626953125, "reward_std": 0.06704970449209213, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6295573115348816, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 85.70833587646484, "epoch": 1.6382252559726962, "grad_norm": 5.715449199008879e+24, "kl": 5.100155801499217e+23, "learning_rate": 8.637656427758816e-07, "loss": 5.09088133330984e+20, "reward": 1.7180989384651184, "reward_std": 0.0615079440176487, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7180989682674408, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 85.94791793823242, "epoch": 1.6416382252559727, "grad_norm": 2.0294485462234775, "kl": 0.38671875, "learning_rate": 8.634812286689419e-07, "loss": 0.0004, "reward": 1.6197916865348816, "reward_std": 0.08659588918089867, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6223958134651184, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 84.9140625, "epoch": 1.6450511945392492, "grad_norm": 0.9513347789449267, "kl": 0.3662109375, "learning_rate": 8.631968145620023e-07, "loss": 0.0004, "reward": 1.6276041865348816, "reward_std": 0.04024505754932761, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6276041567325592, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 84.56771087646484, "epoch": 1.6484641638225255, "grad_norm": 1.2465373511036542, "kl": 0.37890625, "learning_rate": 8.629124004550625e-07, "loss": 0.0004, "reward": 1.6458333730697632, "reward_std": 0.07158710807561874, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6458333134651184, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 85.45052337646484, "epoch": 1.6518771331058022, "grad_norm": 1.6951465635107268, "kl": 0.3779296875, "learning_rate": 8.626279863481228e-07, "loss": 0.0004, "reward": 1.6100260615348816, "reward_std": 0.08582727238535881, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6100260317325592, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 85.83854293823242, "epoch": 1.6552901023890785, "grad_norm": 0.9683942672572937, "kl": 0.3681640625, "learning_rate": 8.623435722411832e-07, "loss": 0.0004, "reward": 1.5625, "reward_std": 0.06422141008079052, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5625, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 86.375, "epoch": 1.658703071672355, "grad_norm": 1.214951092550408, "kl": 0.3544921875, "learning_rate": 8.620591581342434e-07, "loss": 0.0004, "reward": 1.6217448115348816, "reward_std": 0.09571450389921665, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6217447817325592, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 84.2734375, "epoch": 1.6621160409556315, "grad_norm": 0.8855267726213146, "kl": 0.3681640625, "learning_rate": 8.617747440273038e-07, "loss": 0.0004, "reward": 1.6490885615348816, "reward_std": 0.07367542386054993, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6490885317325592, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 85.71354293823242, "epoch": 1.6655290102389078, "grad_norm": 2.0542604940070084, "kl": 0.3583984375, "learning_rate": 8.61490329920364e-07, "loss": 0.0004, "reward": 1.6412760019302368, "reward_std": 0.09524079784750938, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6412760615348816, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 83.12760543823242, "epoch": 1.6689419795221843, "grad_norm": 0.6003566234888362, "kl": 0.3544921875, "learning_rate": 8.612059158134242e-07, "loss": 0.0004, "reward": 1.6907551884651184, "reward_std": 0.04180018976330757, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 85.1796875, "epoch": 1.6723549488054608, "grad_norm": 1.375233399288869, "kl": 0.359375, "learning_rate": 8.609215017064846e-07, "loss": 0.0004, "reward": 1.708984375, "reward_std": 0.09308796375989914, "rewards/format_reward": 1.0, "rewards/score_reward": 0.708984375, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 84.13541793823242, "epoch": 1.675767918088737, "grad_norm": 1.0096261994829492, "kl": 0.3671875, "learning_rate": 8.606370875995449e-07, "loss": 0.0004, "reward": 1.6178385615348816, "reward_std": 0.06235022470355034, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6178385317325592, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 81.97916793823242, "epoch": 1.6791808873720138, "grad_norm": 1.0104660850949458, "kl": 0.38671875, "learning_rate": 8.603526734926053e-07, "loss": 0.0004, "reward": 1.6263020634651184, "reward_std": 0.06939618289470673, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6263020932674408, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 83.09896087646484, "epoch": 1.68259385665529, "grad_norm": 4.537624643323018, "kl": 0.3837890625, "learning_rate": 8.600682593856655e-07, "loss": 0.0004, "reward": 1.705078125, "reward_std": 0.07749409601092339, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 82.36458587646484, "epoch": 1.6860068259385665, "grad_norm": 1.5109358898426333, "kl": 0.3837890625, "learning_rate": 8.597838452787258e-07, "loss": 0.0004, "reward": 1.70703125, "reward_std": 0.08285653404891491, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 81.81771087646484, "epoch": 1.689419795221843, "grad_norm": 1.935052612538173, "kl": 0.3857421875, "learning_rate": 8.59499431171786e-07, "loss": 0.0004, "reward": 1.630859375, "reward_std": 0.07854790426790714, "rewards/format_reward": 1.0, "rewards/score_reward": 0.630859375, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 84.48177337646484, "epoch": 1.6928327645051193, "grad_norm": 1.0988864750711256, "kl": 0.376953125, "learning_rate": 8.592150170648463e-07, "loss": 0.0004, "reward": 1.7356770634651184, "reward_std": 0.11323807016015053, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.73828125, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 83.69531631469727, "epoch": 1.696245733788396, "grad_norm": 0.788279430797225, "kl": 0.38671875, "learning_rate": 8.589306029579067e-07, "loss": 0.0004, "reward": 1.7115885019302368, "reward_std": 0.05677330307662487, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885615348816, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 82.4765625, "epoch": 1.6996587030716723, "grad_norm": 0.9365771317980552, "kl": 0.361328125, "learning_rate": 8.58646188850967e-07, "loss": 0.0004, "reward": 1.7565104365348816, "reward_std": 0.06571333669126034, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104067325592, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 82.73437881469727, "epoch": 1.7030716723549488, "grad_norm": 1.2461417845516842, "kl": 0.3662109375, "learning_rate": 8.583617747440272e-07, "loss": 0.0004, "reward": 1.703125, "reward_std": 0.0894688032567501, "rewards/format_reward": 1.0, "rewards/score_reward": 0.703125, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 83.07292175292969, "epoch": 1.7064846416382253, "grad_norm": 0.9305866806042947, "kl": 0.3671875, "learning_rate": 8.580773606370876e-07, "loss": 0.0004, "reward": 1.5924479365348816, "reward_std": 0.05827983841300011, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5924479067325592, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 83.93750381469727, "epoch": 1.7098976109215016, "grad_norm": 1.1707213941155872, "kl": 0.35546875, "learning_rate": 8.577929465301479e-07, "loss": 0.0004, "reward": 1.6106770634651184, "reward_std": 0.11536679416894913, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6106770932674408, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 84.1015625, "epoch": 1.713310580204778, "grad_norm": 2.176769772067134, "kl": 0.4384765625, "learning_rate": 8.575085324232082e-07, "loss": 0.0004, "reward": 1.6575521230697632, "reward_std": 0.13990925252437592, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6575520634651184, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 84.56771087646484, "epoch": 1.7167235494880546, "grad_norm": 1.6425725238038196, "kl": 0.376953125, "learning_rate": 8.572241183162684e-07, "loss": 0.0004, "reward": 1.7766926884651184, "reward_std": 0.13000089302659035, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7766927182674408, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 86.63021087646484, "epoch": 1.7201365187713311, "grad_norm": 2.2651321550232786, "kl": 0.3642578125, "learning_rate": 8.569397042093287e-07, "loss": 0.0004, "reward": 1.626953125, "reward_std": 0.09548188745975494, "rewards/format_reward": 1.0, "rewards/score_reward": 0.626953125, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 84.50260543823242, "epoch": 1.7235494880546076, "grad_norm": 3.2494638588501545, "kl": 0.3544921875, "learning_rate": 8.56655290102389e-07, "loss": 0.0004, "reward": 1.6236979365348816, "reward_std": 0.13885827362537384, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6236979067325592, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 85.5078125, "epoch": 1.726962457337884, "grad_norm": 1.551480693152312, "kl": 0.3916015625, "learning_rate": 8.563708759954493e-07, "loss": 0.0004, "reward": 1.6588541865348816, "reward_std": 0.10049732401967049, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541567325592, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 86.4453125, "epoch": 1.7303754266211604, "grad_norm": 1.086264140707891, "kl": 0.3544921875, "learning_rate": 8.560864618885097e-07, "loss": 0.0004, "reward": 1.6432291269302368, "reward_std": 0.09179632365703583, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6432291865348816, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 87.64583587646484, "epoch": 1.733788395904437, "grad_norm": 1.1964290207348702, "kl": 0.375, "learning_rate": 8.558020477815699e-07, "loss": 0.0004, "reward": 1.697265625, "reward_std": 0.09961741417646408, "rewards/format_reward": 1.0, "rewards/score_reward": 0.697265625, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 87.96875381469727, "epoch": 1.7372013651877132, "grad_norm": 1.106545096387213, "kl": 0.36328125, "learning_rate": 8.555176336746302e-07, "loss": 0.0004, "reward": 1.7565104365348816, "reward_std": 0.05493106320500374, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104067325592, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 87.32552337646484, "epoch": 1.74061433447099, "grad_norm": 1.8889283942283681, "kl": 0.359375, "learning_rate": 8.552332195676906e-07, "loss": 0.0004, "reward": 1.6236979365348816, "reward_std": 0.14839252084493637, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6236979067325592, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 86.51823043823242, "epoch": 1.7440273037542662, "grad_norm": 2.3369860297243874, "kl": 0.3447265625, "learning_rate": 8.549488054607507e-07, "loss": 0.0003, "reward": 1.7076822519302368, "reward_std": 0.09767115861177444, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076823115348816, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 86.57552337646484, "epoch": 1.7474402730375427, "grad_norm": 1.784133966733994, "kl": 0.3564453125, "learning_rate": 8.546643913538111e-07, "loss": 0.0004, "reward": 1.6282551884651184, "reward_std": 0.08661074936389923, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6282552182674408, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 87.69010543823242, "epoch": 1.7508532423208192, "grad_norm": 1.192220525648947, "kl": 0.3388671875, "learning_rate": 8.543799772468714e-07, "loss": 0.0003, "reward": 1.671875, "reward_std": 0.061069631949067116, "rewards/format_reward": 1.0, "rewards/score_reward": 0.671875, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 86.12500381469727, "epoch": 1.7542662116040955, "grad_norm": 1.155689219209665, "kl": 0.345703125, "learning_rate": 8.540955631399316e-07, "loss": 0.0003, "reward": 1.568359375, "reward_std": 0.06762049999088049, "rewards/format_reward": 1.0, "rewards/score_reward": 0.568359375, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 85.921875, "epoch": 1.757679180887372, "grad_norm": 1.3643099597258062, "kl": 0.349609375, "learning_rate": 8.53811149032992e-07, "loss": 0.0003, "reward": 1.6686198115348816, "reward_std": 0.07182777673006058, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6686197817325592, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 87.22396087646484, "epoch": 1.7610921501706485, "grad_norm": 1.2445858584152778, "kl": 0.337890625, "learning_rate": 8.535267349260523e-07, "loss": 0.0003, "reward": 1.6705729365348816, "reward_std": 0.08478489331901073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6705729067325592, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 87.83333587646484, "epoch": 1.764505119453925, "grad_norm": 1.1706980455308953, "kl": 0.333984375, "learning_rate": 8.532423208191127e-07, "loss": 0.0003, "reward": 1.671875, "reward_std": 0.0706002451479435, "rewards/format_reward": 1.0, "rewards/score_reward": 0.671875, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 85.90364837646484, "epoch": 1.7679180887372015, "grad_norm": 0.9271271395719267, "kl": 0.3564453125, "learning_rate": 8.529579067121729e-07, "loss": 0.0004, "reward": 1.5638020634651184, "reward_std": 0.05457827262580395, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5638020932674408, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 86.01823043823242, "epoch": 1.7713310580204777, "grad_norm": 0.701994359372261, "kl": 0.345703125, "learning_rate": 8.526734926052331e-07, "loss": 0.0003, "reward": 1.6966146230697632, "reward_std": 0.0477428212761879, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145634651184, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 86.14844131469727, "epoch": 1.7747440273037542, "grad_norm": 2.060487059507345, "kl": 0.3427734375, "learning_rate": 8.523890784982935e-07, "loss": 0.0003, "reward": 1.7233072519302368, "reward_std": 0.0664135180413723, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7233073115348816, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 88.3515625, "epoch": 1.7781569965870307, "grad_norm": 2.08416273780882, "kl": 0.349609375, "learning_rate": 8.521046643913537e-07, "loss": 0.0003, "reward": 1.8020833730697632, "reward_std": 0.06435495242476463, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833134651184, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 84.80208587646484, "epoch": 1.781569965870307, "grad_norm": 1.0951294952473551, "kl": 0.3642578125, "learning_rate": 8.518202502844141e-07, "loss": 0.0004, "reward": 1.5944010019302368, "reward_std": 0.06706923991441727, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5944010615348816, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 85.38802337646484, "epoch": 1.7849829351535837, "grad_norm": 1.4364971173135677, "kl": 0.3642578125, "learning_rate": 8.515358361774744e-07, "loss": 0.0004, "reward": 1.6549479365348816, "reward_std": 0.08269165083765984, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6549479067325592, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 85.54687881469727, "epoch": 1.78839590443686, "grad_norm": 1.4371349275198395, "kl": 0.3623046875, "learning_rate": 8.512514220705347e-07, "loss": 0.0004, "reward": 1.654296875, "reward_std": 0.11360817775130272, "rewards/format_reward": 1.0, "rewards/score_reward": 0.654296875, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 83.76302337646484, "epoch": 1.7918088737201365, "grad_norm": 0.9107330461039155, "kl": 0.373046875, "learning_rate": 8.50967007963595e-07, "loss": 0.0004, "reward": 1.5234375, "reward_std": 0.05017400346696377, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5234375, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 84.40625381469727, "epoch": 1.795221843003413, "grad_norm": 1.4664926217267837, "kl": 0.3798828125, "learning_rate": 8.506825938566553e-07, "loss": 0.0004, "reward": 1.69140625, "reward_std": 0.12141218408942223, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69140625, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 85.13542175292969, "epoch": 1.7986348122866893, "grad_norm": 0.7233495709254039, "kl": 0.359375, "learning_rate": 8.503981797497155e-07, "loss": 0.0004, "reward": 1.6751302480697632, "reward_std": 0.06507596746087074, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6751301884651184, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 87.40104293823242, "epoch": 1.802047781569966, "grad_norm": 2.6919959136015232, "kl": 0.3525390625, "learning_rate": 8.501137656427758e-07, "loss": 0.0004, "reward": 1.6171875, "reward_std": 0.09480248391628265, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6197916567325592, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 86.47917175292969, "epoch": 1.8054607508532423, "grad_norm": 1.3534650990554473, "kl": 0.3984375, "learning_rate": 8.498293515358362e-07, "loss": 0.0004, "reward": 1.6614583134651184, "reward_std": 0.09187979996204376, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6640625, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 87.47135925292969, "epoch": 1.8088737201365188, "grad_norm": 3.9598062719622, "kl": 0.365234375, "learning_rate": 8.495449374288964e-07, "loss": 0.0004, "reward": 1.677734375, "reward_std": 0.09464399144053459, "rewards/format_reward": 1.0, "rewards/score_reward": 0.677734375, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 86.81510543823242, "epoch": 1.8122866894197953, "grad_norm": 1.4888277629643247, "kl": 0.365234375, "learning_rate": 8.492605233219567e-07, "loss": 0.0004, "reward": 1.6471354365348816, "reward_std": 0.09870539978146553, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6471354067325592, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 84.37760925292969, "epoch": 1.8156996587030716, "grad_norm": 3.070469773719469, "kl": 0.36328125, "learning_rate": 8.489761092150171e-07, "loss": 0.0004, "reward": 1.6438801884651184, "reward_std": 0.034829131327569485, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6438802182674408, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 88.73958587646484, "epoch": 1.819112627986348, "grad_norm": 2.983549288073959, "kl": 0.349609375, "learning_rate": 8.486916951080773e-07, "loss": 0.0003, "reward": 1.677734375, "reward_std": 0.11112905293703079, "rewards/format_reward": 1.0, "rewards/score_reward": 0.677734375, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 86.85677337646484, "epoch": 1.8225255972696246, "grad_norm": 3.5322106719331923, "kl": 0.3701171875, "learning_rate": 8.484072810011376e-07, "loss": 0.0004, "reward": 1.5735676884651184, "reward_std": 0.09922734647989273, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5735677182674408, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 91.10416793823242, "epoch": 1.8259385665529009, "grad_norm": 1.4448680257456126, "kl": 0.3271484375, "learning_rate": 8.481228668941979e-07, "loss": 0.0003, "reward": 1.7024739384651184, "reward_std": 0.06773156672716141, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7024739682674408, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 92.16406631469727, "epoch": 1.8293515358361776, "grad_norm": 1.6227411633477402, "kl": 0.3515625, "learning_rate": 8.478384527872581e-07, "loss": 0.0004, "reward": 1.65234375, "reward_std": 0.10220126807689667, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65234375, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 87.546875, "epoch": 1.8327645051194539, "grad_norm": 1.316982950085007, "kl": 0.3583984375, "learning_rate": 8.475540386803185e-07, "loss": 0.0004, "reward": 1.6204426884651184, "reward_std": 0.08460849523544312, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6204427182674408, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 91.25521087646484, "epoch": 1.8361774744027304, "grad_norm": 1.5221211321144743, "kl": 0.3359375, "learning_rate": 8.472696245733788e-07, "loss": 0.0003, "reward": 1.6901041865348816, "reward_std": 0.09611472487449646, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6901041567325592, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 89.39323425292969, "epoch": 1.8395904436860069, "grad_norm": 0.9768783067858587, "kl": 0.3720703125, "learning_rate": 8.469852104664392e-07, "loss": 0.0004, "reward": 1.6686198115348816, "reward_std": 0.04929559864103794, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6686197817325592, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 90.37500381469727, "epoch": 1.8430034129692832, "grad_norm": 13.197955734412927, "kl": 0.36328125, "learning_rate": 8.467007963594994e-07, "loss": 0.0004, "reward": 1.666015625, "reward_std": 0.08138662204146385, "rewards/format_reward": 1.0, "rewards/score_reward": 0.666015625, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 92.98698043823242, "epoch": 1.8464163822525599, "grad_norm": 1.2443096922595813, "kl": 0.322265625, "learning_rate": 8.464163822525597e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.07171917892992496, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 93.40625381469727, "epoch": 1.8498293515358362, "grad_norm": 1.180479236863779, "kl": 0.3369140625, "learning_rate": 8.4613196814562e-07, "loss": 0.0003, "reward": 1.7194010615348816, "reward_std": 0.07686464861035347, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7194010317325592, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 94.18750381469727, "epoch": 1.8532423208191127, "grad_norm": 0.49719699959971514, "kl": 0.3291015625, "learning_rate": 8.458475540386802e-07, "loss": 0.0003, "reward": 1.77734375, "reward_std": 0.03211255744099617, "rewards/format_reward": 1.0, "rewards/score_reward": 0.77734375, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 92.78646087646484, "epoch": 1.8566552901023892, "grad_norm": 1.3215594356002058, "kl": 0.3212890625, "learning_rate": 8.455631399317406e-07, "loss": 0.0003, "reward": 1.6673177480697632, "reward_std": 0.0796097107231617, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6673176884651184, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 95.17187881469727, "epoch": 1.8600682593856654, "grad_norm": 1.998666942718494, "kl": 0.34375, "learning_rate": 8.452787258248009e-07, "loss": 0.0003, "reward": 1.7734375, "reward_std": 0.06535907462239265, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7734375, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 93.92448043823242, "epoch": 1.863481228668942, "grad_norm": 1.5086174241873027, "kl": 0.3359375, "learning_rate": 8.449943117178611e-07, "loss": 0.0003, "reward": 1.640625, "reward_std": 0.10604285076260567, "rewards/format_reward": 1.0, "rewards/score_reward": 0.640625, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 92.47135925292969, "epoch": 1.8668941979522184, "grad_norm": 1.3968797770218468, "kl": 0.3349609375, "learning_rate": 8.447098976109215e-07, "loss": 0.0003, "reward": 1.6640625, "reward_std": 0.09276530146598816, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6640625, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 92.68750381469727, "epoch": 1.8703071672354947, "grad_norm": 6.30197450682208, "kl": 0.349609375, "learning_rate": 8.444254835039818e-07, "loss": 0.0003, "reward": 1.6868489384651184, "reward_std": 0.08846297860145569, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6868489682674408, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 92.5703125, "epoch": 1.8737201365187715, "grad_norm": 1.1701881128360283, "kl": 0.3349609375, "learning_rate": 8.441410693970421e-07, "loss": 0.0003, "reward": 1.6432291865348816, "reward_std": 0.07562291249632835, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6432291567325592, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 91.52344131469727, "epoch": 1.8771331058020477, "grad_norm": 1.3824846106961557, "kl": 0.3330078125, "learning_rate": 8.438566552901023e-07, "loss": 0.0003, "reward": 1.7161458134651184, "reward_std": 0.08724524080753326, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458432674408, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 90.57291793823242, "epoch": 1.8805460750853242, "grad_norm": 1.1787784815526392, "kl": 0.3447265625, "learning_rate": 8.435722411831626e-07, "loss": 0.0003, "reward": 1.7473958134651184, "reward_std": 0.03656220622360706, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 92.69271087646484, "epoch": 1.8839590443686007, "grad_norm": 1.3617753476226184, "kl": 0.34375, "learning_rate": 8.432878270762229e-07, "loss": 0.0003, "reward": 1.6966146230697632, "reward_std": 0.09350526705384254, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145634651184, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 92.09896087646484, "epoch": 1.887372013651877, "grad_norm": 0.9557782181367556, "kl": 0.337890625, "learning_rate": 8.430034129692832e-07, "loss": 0.0003, "reward": 1.7428385615348816, "reward_std": 0.0709981694817543, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385317325592, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 89.19010925292969, "epoch": 1.8907849829351537, "grad_norm": 0.6830925462072321, "kl": 0.3369140625, "learning_rate": 8.427189988623436e-07, "loss": 0.0003, "reward": 1.7141926884651184, "reward_std": 0.0191352479159832, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7141927182674408, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 89.04948425292969, "epoch": 1.89419795221843, "grad_norm": 1.488299163532018, "kl": 0.3505859375, "learning_rate": 8.424345847554038e-07, "loss": 0.0004, "reward": 1.7897135615348816, "reward_std": 0.06674535199999809, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 90.62239837646484, "epoch": 1.8976109215017065, "grad_norm": 1.5440871285139455, "kl": 0.3583984375, "learning_rate": 8.421501706484641e-07, "loss": 0.0004, "reward": 1.6803385019302368, "reward_std": 0.12311924993991852, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6803385615348816, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 93.05469131469727, "epoch": 1.901023890784983, "grad_norm": 1.1530667164021655, "kl": 0.333984375, "learning_rate": 8.418657565415245e-07, "loss": 0.0003, "reward": 1.7513020634651184, "reward_std": 0.06323454715311527, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7513020932674408, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 92.48698043823242, "epoch": 1.9044368600682593, "grad_norm": 2.177025785533933, "kl": 0.3271484375, "learning_rate": 8.415813424345846e-07, "loss": 0.0003, "reward": 1.6217448115348816, "reward_std": 0.08969010412693024, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6217447817325592, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 87.703125, "epoch": 1.9078498293515358, "grad_norm": 1.2219177403472843, "kl": 0.361328125, "learning_rate": 8.41296928327645e-07, "loss": 0.0004, "reward": 1.603515625, "reward_std": 0.060234617441892624, "rewards/format_reward": 1.0, "rewards/score_reward": 0.603515625, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 89.92969131469727, "epoch": 1.9112627986348123, "grad_norm": 36.63011573739111, "kl": 0.35546875, "learning_rate": 8.410125142207053e-07, "loss": 0.0004, "reward": 1.7063801884651184, "reward_std": 0.04699514992535114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7063802182674408, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 91.81771087646484, "epoch": 1.9146757679180886, "grad_norm": 1.1111704277139725, "kl": 0.3388671875, "learning_rate": 8.407281001137656e-07, "loss": 0.0003, "reward": 1.7291666269302368, "reward_std": 0.056855062022805214, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666865348816, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 90.28646087646484, "epoch": 1.9180887372013653, "grad_norm": 2.243650096805872, "kl": 0.3779296875, "learning_rate": 8.404436860068259e-07, "loss": 0.0004, "reward": 1.7298176884651184, "reward_std": 0.11680452898144722, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.732421875, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 89.703125, "epoch": 1.9215017064846416, "grad_norm": 2.2638745625284966, "kl": 0.3623046875, "learning_rate": 8.401592718998862e-07, "loss": 0.0004, "reward": 1.7076823115348816, "reward_std": 0.09525788202881813, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076822817325592, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 87.90885925292969, "epoch": 1.924914675767918, "grad_norm": 6.190096635927977, "kl": 0.333984375, "learning_rate": 8.398748577929466e-07, "loss": 0.0003, "reward": 1.6575520634651184, "reward_std": 0.0732962116599083, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6575520932674408, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 89.09896087646484, "epoch": 1.9283276450511946, "grad_norm": 2.8611982452913547, "kl": 0.357421875, "learning_rate": 8.395904436860067e-07, "loss": 0.0004, "reward": 1.701171875, "reward_std": 0.07369120605289936, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 89.02604293823242, "epoch": 1.9317406143344709, "grad_norm": 0.8727803579649033, "kl": 0.3671875, "learning_rate": 8.39306029579067e-07, "loss": 0.0004, "reward": 1.7076822519302368, "reward_std": 0.04787530563771725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076823115348816, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 88.56771087646484, "epoch": 1.9351535836177476, "grad_norm": 3.7336355632198837, "kl": 0.337890625, "learning_rate": 8.390216154721274e-07, "loss": 0.0003, "reward": 1.73046875, "reward_std": 0.035291168838739395, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73046875, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 88.34896087646484, "epoch": 1.9385665529010239, "grad_norm": 0.9428228702210786, "kl": 0.3408203125, "learning_rate": 8.387372013651876e-07, "loss": 0.0003, "reward": 1.75, "reward_std": 0.06803633458912373, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 88.01823043823242, "epoch": 1.9419795221843004, "grad_norm": 1.1732358212126697, "kl": 0.3505859375, "learning_rate": 8.38452787258248e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.08007191121578217, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 86.703125, "epoch": 1.9453924914675769, "grad_norm": 1.2263061107933104, "kl": 0.341796875, "learning_rate": 8.381683731513083e-07, "loss": 0.0003, "reward": 1.7057291269302368, "reward_std": 0.07669342495501041, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7057291865348816, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 87.5546875, "epoch": 1.9488054607508531, "grad_norm": 1.1504794783896644, "kl": 0.3623046875, "learning_rate": 8.378839590443686e-07, "loss": 0.0004, "reward": 1.6549478769302368, "reward_std": 0.04535055346786976, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6549479365348816, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 87.52864837646484, "epoch": 1.9522184300341296, "grad_norm": 2.365069141137376, "kl": 0.3330078125, "learning_rate": 8.375995449374289e-07, "loss": 0.0003, "reward": 1.7096354365348816, "reward_std": 0.07287785038352013, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354067325592, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 86.88542175292969, "epoch": 1.9556313993174061, "grad_norm": 1.230166866154621, "kl": 0.33984375, "learning_rate": 8.373151308304891e-07, "loss": 0.0003, "reward": 1.7220051884651184, "reward_std": 0.07294442877173424, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 87.13541793823242, "epoch": 1.9590443686006824, "grad_norm": 1.6195449460418831, "kl": 0.3662109375, "learning_rate": 8.370307167235494e-07, "loss": 0.0004, "reward": 1.6569010615348816, "reward_std": 0.0660613626241684, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6569010317325592, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 88.46614837646484, "epoch": 1.9624573378839592, "grad_norm": 2.2523285232088743, "kl": 0.3291015625, "learning_rate": 8.367463026166097e-07, "loss": 0.0003, "reward": 1.7845051884651184, "reward_std": 0.06281723827123642, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 86.38802337646484, "epoch": 1.9658703071672354, "grad_norm": 1.1107167843517167, "kl": 0.3466796875, "learning_rate": 8.364618885096701e-07, "loss": 0.0003, "reward": 1.7239583730697632, "reward_std": 0.04451741650700569, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583134651184, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 89.48958587646484, "epoch": 1.969283276450512, "grad_norm": 6.412672071170684, "kl": 0.31640625, "learning_rate": 8.361774744027303e-07, "loss": 0.0003, "reward": 1.7584635019302368, "reward_std": 0.03474032133817673, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635615348816, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 88.72656631469727, "epoch": 1.9726962457337884, "grad_norm": 3.363856089023588, "kl": 0.3486328125, "learning_rate": 8.358930602957906e-07, "loss": 0.0003, "reward": 1.7473958134651184, "reward_std": 0.08343162760138512, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 89.8125, "epoch": 1.9761092150170647, "grad_norm": 2.053057991675016, "kl": 0.32421875, "learning_rate": 8.35608646188851e-07, "loss": 0.0003, "reward": 1.6295573115348816, "reward_std": 0.08616746217012405, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6295572817325592, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 88.48958587646484, "epoch": 1.9795221843003414, "grad_norm": 0.9567044481423476, "kl": 0.3369140625, "learning_rate": 8.353242320819113e-07, "loss": 0.0003, "reward": 1.6451823115348816, "reward_std": 0.02608323097229004, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6451822817325592, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 88.86458587646484, "epoch": 1.9829351535836177, "grad_norm": 2.282329928214347, "kl": 0.3544921875, "learning_rate": 8.350398179749715e-07, "loss": 0.0004, "reward": 1.7252603769302368, "reward_std": 0.07163488864898682, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604365348816, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 89.97396087646484, "epoch": 1.9863481228668942, "grad_norm": 2.9415117683476386, "kl": 0.3310546875, "learning_rate": 8.347554038680318e-07, "loss": 0.0003, "reward": 1.6295572519302368, "reward_std": 0.11577972769737244, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6295573115348816, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 90.30729293823242, "epoch": 1.9897610921501707, "grad_norm": 1.1593376967365145, "kl": 0.341796875, "learning_rate": 8.34470989761092e-07, "loss": 0.0003, "reward": 1.6946614384651184, "reward_std": 0.08335665799677372, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 87.26041793823242, "epoch": 1.993174061433447, "grad_norm": 1.360936255321753, "kl": 0.349609375, "learning_rate": 8.341865756541524e-07, "loss": 0.0003, "reward": 1.6946614384651184, "reward_std": 0.06871990673244, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 85.76667022705078, "epoch": 1.9965870307167235, "grad_norm": 1.2495065590904861, "kl": 0.3544921875, "learning_rate": 8.339021615472127e-07, "loss": 0.0004, "reward": 1.5791667699813843, "reward_std": 0.07280751317739487, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5791666805744171, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 89.32552337646484, "epoch": 2.0034129692832763, "grad_norm": 1.8075129395897096, "kl": 0.3408203125, "learning_rate": 8.336177474402731e-07, "loss": 0.0003, "reward": 1.5475260615348816, "reward_std": 0.0916825607419014, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5475260317325592, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 86.34896087646484, "epoch": 2.006825938566553, "grad_norm": 0.6776058449686476, "kl": 0.359375, "learning_rate": 8.333333333333333e-07, "loss": 0.0004, "reward": 1.7565104365348816, "reward_std": 0.06108981743454933, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104067325592, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 88.88281631469727, "epoch": 2.0102389078498293, "grad_norm": 1.6145517205141053, "kl": 0.349609375, "learning_rate": 8.330489192263936e-07, "loss": 0.0003, "reward": 1.6803385615348816, "reward_std": 0.09837386384606361, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6803385317325592, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 85.75521087646484, "epoch": 2.013651877133106, "grad_norm": 1.0758695303514663, "kl": 0.3466796875, "learning_rate": 8.327645051194539e-07, "loss": 0.0003, "reward": 1.7233072519302368, "reward_std": 0.04719344712793827, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7233073115348816, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 89.03125381469727, "epoch": 2.0170648464163823, "grad_norm": 1.4632605936917835, "kl": 0.353515625, "learning_rate": 8.324800910125141e-07, "loss": 0.0004, "reward": 1.6419270634651184, "reward_std": 0.09446370229125023, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6419270932674408, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 85.62760925292969, "epoch": 2.0204778156996586, "grad_norm": 1.2103445144395588, "kl": 0.34375, "learning_rate": 8.321956769055745e-07, "loss": 0.0003, "reward": 1.70703125, "reward_std": 0.059134794399142265, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 84.01823425292969, "epoch": 2.0238907849829353, "grad_norm": 1.1440281867956383, "kl": 0.353515625, "learning_rate": 8.319112627986348e-07, "loss": 0.0004, "reward": 1.78125, "reward_std": 0.041669176891446114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 82.15364837646484, "epoch": 2.0273037542662116, "grad_norm": 1.1284099206366063, "kl": 0.357421875, "learning_rate": 8.31626848691695e-07, "loss": 0.0004, "reward": 1.5944010019302368, "reward_std": 0.07880177535116673, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5944010615348816, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 84.41667175292969, "epoch": 2.030716723549488, "grad_norm": 1.3219659980262368, "kl": 0.357421875, "learning_rate": 8.313424345847554e-07, "loss": 0.0004, "reward": 1.69140625, "reward_std": 0.044819485396146774, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69140625, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 84.08073043823242, "epoch": 2.0341296928327646, "grad_norm": 1.872018406996746, "kl": 0.3388671875, "learning_rate": 8.310580204778157e-07, "loss": 0.0003, "reward": 1.78125, "reward_std": 0.05981649272143841, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 83.19010543823242, "epoch": 2.037542662116041, "grad_norm": 1.132111914053262, "kl": 0.373046875, "learning_rate": 8.30773606370876e-07, "loss": 0.0004, "reward": 1.7044270634651184, "reward_std": 0.06972737237811089, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7044270932674408, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 85.15885925292969, "epoch": 2.0409556313993176, "grad_norm": 0.9693468113339988, "kl": 0.3525390625, "learning_rate": 8.304891922639362e-07, "loss": 0.0004, "reward": 1.6966146230697632, "reward_std": 0.07741402089595795, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145634651184, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 82.76562881469727, "epoch": 2.044368600682594, "grad_norm": 1.5189555532228176, "kl": 0.3603515625, "learning_rate": 8.302047781569965e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.05222830828279257, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 83.22135925292969, "epoch": 2.04778156996587, "grad_norm": 1.2004408105354272, "kl": 0.34375, "learning_rate": 8.299203640500568e-07, "loss": 0.0003, "reward": 1.630859375, "reward_std": 0.04208582825958729, "rewards/format_reward": 1.0, "rewards/score_reward": 0.630859375, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 83.33333587646484, "epoch": 2.051194539249147, "grad_norm": 1.5427678442916506, "kl": 0.3359375, "learning_rate": 8.296359499431171e-07, "loss": 0.0003, "reward": 1.7890625, "reward_std": 0.04649033769965172, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7890625, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 83.47135543823242, "epoch": 2.054607508532423, "grad_norm": 1.44394988354348, "kl": 0.337890625, "learning_rate": 8.293515358361775e-07, "loss": 0.0003, "reward": 1.5930989384651184, "reward_std": 0.0857815220952034, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5930989682674408, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 83.45052337646484, "epoch": 2.0580204778157, "grad_norm": 1.7585882267425719, "kl": 0.3369140625, "learning_rate": 8.290671217292378e-07, "loss": 0.0003, "reward": 1.7350260019302368, "reward_std": 0.03346764389425516, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260615348816, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 82.83073043823242, "epoch": 2.061433447098976, "grad_norm": 1.4044724087451594, "kl": 0.3388671875, "learning_rate": 8.28782707622298e-07, "loss": 0.0003, "reward": 1.7669270634651184, "reward_std": 0.07518419250845909, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270932674408, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 82.82552337646484, "epoch": 2.0648464163822524, "grad_norm": 0.8059571672287911, "kl": 0.3447265625, "learning_rate": 8.284982935153583e-07, "loss": 0.0003, "reward": 1.74609375, "reward_std": 0.06345542706549168, "rewards/format_reward": 1.0, "rewards/score_reward": 0.74609375, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 83.37760925292969, "epoch": 2.068259385665529, "grad_norm": 0.8332986094798723, "kl": 0.3505859375, "learning_rate": 8.282138794084186e-07, "loss": 0.0004, "reward": 1.712890625, "reward_std": 0.04144043102860451, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 83.58073043823242, "epoch": 2.0716723549488054, "grad_norm": 9.46026109644391, "kl": 0.388671875, "learning_rate": 8.279294653014789e-07, "loss": 0.0004, "reward": 1.7662760019302368, "reward_std": 0.05690521188080311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7662760615348816, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 82.75, "epoch": 2.0750853242320817, "grad_norm": 0.6693658251862813, "kl": 0.3466796875, "learning_rate": 8.276450511945392e-07, "loss": 0.0003, "reward": 1.6783854365348816, "reward_std": 0.02183163701556623, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 80.56250381469727, "epoch": 2.0784982935153584, "grad_norm": 0.9100892242542346, "kl": 0.3759765625, "learning_rate": 8.273606370875995e-07, "loss": 0.0004, "reward": 1.6536458134651184, "reward_std": 0.01940045692026615, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6536458432674408, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 83.96354293823242, "epoch": 2.0819112627986347, "grad_norm": 1.1797427308740611, "kl": 0.345703125, "learning_rate": 8.270762229806598e-07, "loss": 0.0003, "reward": 1.6080728769302368, "reward_std": 0.05302077392116189, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6080729365348816, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 82.63021087646484, "epoch": 2.0853242320819114, "grad_norm": 0.934979712392823, "kl": 0.3525390625, "learning_rate": 8.267918088737201e-07, "loss": 0.0004, "reward": 1.5501301884651184, "reward_std": 0.06974650174379349, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5501302182674408, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 82.8828125, "epoch": 2.0887372013651877, "grad_norm": 2.4177473278043453, "kl": 0.35546875, "learning_rate": 8.265073947667805e-07, "loss": 0.0004, "reward": 1.6946614384651184, "reward_std": 0.04903462901711464, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 84.95312881469727, "epoch": 2.092150170648464, "grad_norm": 1.0152857065356466, "kl": 0.353515625, "learning_rate": 8.262229806598406e-07, "loss": 0.0004, "reward": 1.6888020634651184, "reward_std": 0.04912221524864435, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 85.42187881469727, "epoch": 2.0955631399317407, "grad_norm": 2.009317612577515, "kl": 0.369140625, "learning_rate": 8.259385665529009e-07, "loss": 0.0004, "reward": 1.6373697519302368, "reward_std": 0.07974325492978096, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6373698115348816, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 85.00521087646484, "epoch": 2.098976109215017, "grad_norm": 1.1282872124966665, "kl": 0.3466796875, "learning_rate": 8.256541524459613e-07, "loss": 0.0003, "reward": 1.66796875, "reward_std": 0.06841013394296169, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66796875, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 85.70833587646484, "epoch": 2.1023890784982937, "grad_norm": 0.8164893847245359, "kl": 0.3505859375, "learning_rate": 8.253697383390215e-07, "loss": 0.0004, "reward": 1.6458333730697632, "reward_std": 0.03764333645813167, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6458333134651184, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 84.78906631469727, "epoch": 2.10580204778157, "grad_norm": 1.4673088027055332, "kl": 0.349609375, "learning_rate": 8.250853242320819e-07, "loss": 0.0003, "reward": 1.609375, "reward_std": 0.03897401690483093, "rewards/format_reward": 1.0, "rewards/score_reward": 0.609375, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 85.36198425292969, "epoch": 2.1092150170648463, "grad_norm": 1.0718250341318485, "kl": 0.3583984375, "learning_rate": 8.248009101251422e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.047610098496079445, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 84.90885543823242, "epoch": 2.112627986348123, "grad_norm": 2.0311459535928096, "kl": 0.3359375, "learning_rate": 8.245164960182025e-07, "loss": 0.0003, "reward": 1.7805989384651184, "reward_std": 0.040681893937289715, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.783203125, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 84.85417175292969, "epoch": 2.1160409556313993, "grad_norm": 1.1053456903938041, "kl": 0.345703125, "learning_rate": 8.242320819112628e-07, "loss": 0.0003, "reward": 1.6276041865348816, "reward_std": 0.04602805757895112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6276041567325592, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 87.21354675292969, "epoch": 2.1194539249146755, "grad_norm": 1.0239029751603592, "kl": 0.328125, "learning_rate": 8.23947667804323e-07, "loss": 0.0003, "reward": 1.7662760019302368, "reward_std": 0.05812716297805309, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7662760615348816, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 85.20052337646484, "epoch": 2.1228668941979523, "grad_norm": 1.1787878028130816, "kl": 0.33984375, "learning_rate": 8.236632536973833e-07, "loss": 0.0003, "reward": 1.6569010615348816, "reward_std": 0.039845652878284454, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6569010317325592, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 85.22656631469727, "epoch": 2.1262798634812285, "grad_norm": 2.8635736722246343, "kl": 0.353515625, "learning_rate": 8.233788395904436e-07, "loss": 0.0004, "reward": 1.6315103769302368, "reward_std": 0.05856482684612274, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6315104365348816, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 87.33594131469727, "epoch": 2.1296928327645053, "grad_norm": 0.7670451764020251, "kl": 0.33984375, "learning_rate": 8.23094425483504e-07, "loss": 0.0003, "reward": 1.6002604365348816, "reward_std": 0.03476380184292793, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6002604067325592, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 87.68489837646484, "epoch": 2.1331058020477816, "grad_norm": 1.4601305993306266, "kl": 0.34375, "learning_rate": 8.228100113765643e-07, "loss": 0.0003, "reward": 1.6529947519302368, "reward_std": 0.04780086129903793, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6555989682674408, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 86.48177337646484, "epoch": 2.136518771331058, "grad_norm": 1.4913063019191277, "kl": 0.3505859375, "learning_rate": 8.225255972696245e-07, "loss": 0.0004, "reward": 1.677734375, "reward_std": 0.10169350728392601, "rewards/format_reward": 1.0, "rewards/score_reward": 0.677734375, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 88.69271087646484, "epoch": 2.1399317406143346, "grad_norm": 0.9218682875545678, "kl": 0.3388671875, "learning_rate": 8.222411831626849e-07, "loss": 0.0003, "reward": 1.591796875, "reward_std": 0.042041096836328506, "rewards/format_reward": 1.0, "rewards/score_reward": 0.591796875, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 89.37500381469727, "epoch": 2.143344709897611, "grad_norm": 1.1883967397100947, "kl": 0.3271484375, "learning_rate": 8.219567690557452e-07, "loss": 0.0003, "reward": 1.7473958134651184, "reward_std": 0.0775490328669548, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.75, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 89.37239837646484, "epoch": 2.1467576791808876, "grad_norm": 1.1623748441538038, "kl": 0.3515625, "learning_rate": 8.216723549488054e-07, "loss": 0.0004, "reward": 1.6959635019302368, "reward_std": 0.08265412226319313, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635615348816, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 90.17708587646484, "epoch": 2.150170648464164, "grad_norm": 1.3988583480410386, "kl": 0.3232421875, "learning_rate": 8.213879408418657e-07, "loss": 0.0003, "reward": 1.6959635019302368, "reward_std": 0.08568483218550682, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635615348816, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 90.33854293823242, "epoch": 2.15358361774744, "grad_norm": 0.9667015892473024, "kl": 0.3388671875, "learning_rate": 8.21103526734926e-07, "loss": 0.0003, "reward": 1.5670573115348816, "reward_std": 0.049736532382667065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5670572817325592, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 91.203125, "epoch": 2.156996587030717, "grad_norm": 7.1339151084231345, "kl": 0.349609375, "learning_rate": 8.208191126279863e-07, "loss": 0.0003, "reward": 1.744140625, "reward_std": 0.10101091861724854, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 89.68750381469727, "epoch": 2.160409556313993, "grad_norm": 2.143669033819859, "kl": 0.3359375, "learning_rate": 8.205346985210466e-07, "loss": 0.0003, "reward": 1.76953125, "reward_std": 0.06575741805136204, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76953125, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 92.01823043823242, "epoch": 2.1638225255972694, "grad_norm": 1.9324818527666652, "kl": 0.3349609375, "learning_rate": 8.20250284414107e-07, "loss": 0.0003, "reward": 1.6595051884651184, "reward_std": 0.02991834143176675, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 91.98698043823242, "epoch": 2.167235494880546, "grad_norm": 3.349827331746939, "kl": 0.3369140625, "learning_rate": 8.199658703071672e-07, "loss": 0.0003, "reward": 1.767578125, "reward_std": 0.08350810408592224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.767578125, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 92.98698425292969, "epoch": 2.1706484641638224, "grad_norm": 1.9225281455898449, "kl": 0.359375, "learning_rate": 8.196814562002274e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 94.22135543823242, "epoch": 2.174061433447099, "grad_norm": 1.6123524773464675, "kl": 0.3466796875, "learning_rate": 8.193970420932878e-07, "loss": 0.0003, "reward": 1.7265625, "reward_std": 0.053507108241319656, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7265625, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 93.71094131469727, "epoch": 2.1774744027303754, "grad_norm": 1.6471458888724577, "kl": 0.3681640625, "learning_rate": 8.19112627986348e-07, "loss": 0.0004, "reward": 1.6009114384651184, "reward_std": 0.10492121055722237, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6009114682674408, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 96.53385543823242, "epoch": 2.1808873720136517, "grad_norm": 0.7334585439641441, "kl": 0.326171875, "learning_rate": 8.188282138794084e-07, "loss": 0.0003, "reward": 1.7486978769302368, "reward_std": 0.05134894140064716, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7486979365348816, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 95.23437881469727, "epoch": 2.1843003412969284, "grad_norm": 1.4350950988425575, "kl": 0.328125, "learning_rate": 8.185437997724687e-07, "loss": 0.0003, "reward": 1.568359375, "reward_std": 0.09607492759823799, "rewards/format_reward": 1.0, "rewards/score_reward": 0.568359375, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 95.42448043823242, "epoch": 2.1877133105802047, "grad_norm": 3.169884728904964, "kl": 0.353515625, "learning_rate": 8.182593856655289e-07, "loss": 0.0004, "reward": 1.6927083134651184, "reward_std": 0.0647718533873558, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6927083432674408, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 94.51823043823242, "epoch": 2.1911262798634814, "grad_norm": 1.3149959195231689, "kl": 0.3369140625, "learning_rate": 8.179749715585893e-07, "loss": 0.0003, "reward": 1.701171875, "reward_std": 0.07599260471761227, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 96.29687881469727, "epoch": 2.1945392491467577, "grad_norm": 2.7355747280556675, "kl": 0.3232421875, "learning_rate": 8.176905574516496e-07, "loss": 0.0003, "reward": 1.71484375, "reward_std": 0.059184540063142776, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71484375, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 95.20833587646484, "epoch": 2.197952218430034, "grad_norm": 0.7240616893523296, "kl": 0.3330078125, "learning_rate": 8.174061433447098e-07, "loss": 0.0003, "reward": 1.7408853769302368, "reward_std": 0.042257053311914206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7408854365348816, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 95.38021087646484, "epoch": 2.2013651877133107, "grad_norm": 0.8023862868358099, "kl": 0.32421875, "learning_rate": 8.171217292377701e-07, "loss": 0.0003, "reward": 1.701171875, "reward_std": 0.03373391553759575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 97.24479675292969, "epoch": 2.204778156996587, "grad_norm": 1.2555658384697228, "kl": 0.3212890625, "learning_rate": 8.168373151308304e-07, "loss": 0.0003, "reward": 1.6588541865348816, "reward_std": 0.09582761116325855, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541567325592, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 95.28646087646484, "epoch": 2.2081911262798632, "grad_norm": 0.8305851264895727, "kl": 0.326171875, "learning_rate": 8.165529010238908e-07, "loss": 0.0003, "reward": 1.7604166269302368, "reward_std": 0.03726425766944885, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7604166865348816, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 96.93229293823242, "epoch": 2.21160409556314, "grad_norm": 1.663452980362655, "kl": 0.33984375, "learning_rate": 8.16268486916951e-07, "loss": 0.0003, "reward": 1.7428385615348816, "reward_std": 0.016438620164990425, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385317325592, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 99.671875, "epoch": 2.2150170648464163, "grad_norm": 0.9402511646183654, "kl": 0.3115234375, "learning_rate": 8.159840728100114e-07, "loss": 0.0003, "reward": 1.7005208134651184, "reward_std": 0.0987062156200409, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 97.9375, "epoch": 2.218430034129693, "grad_norm": 0.8629274188699053, "kl": 0.326171875, "learning_rate": 8.156996587030717e-07, "loss": 0.0003, "reward": 1.6744791865348816, "reward_std": 0.03162622358649969, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791567325592, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 97.98958587646484, "epoch": 2.2218430034129693, "grad_norm": 0.7786650417095966, "kl": 0.333984375, "learning_rate": 8.15415244596132e-07, "loss": 0.0003, "reward": 1.5885416865348816, "reward_std": 0.057270489633083344, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5885416567325592, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 98.57812881469727, "epoch": 2.2252559726962455, "grad_norm": 9.692951799254235, "kl": 0.3193359375, "learning_rate": 8.151308304891922e-07, "loss": 0.0003, "reward": 1.7350260615348816, "reward_std": 0.03956230077892542, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 98.20052337646484, "epoch": 2.2286689419795223, "grad_norm": 1.1101479374893397, "kl": 0.3212890625, "learning_rate": 8.148464163822525e-07, "loss": 0.0003, "reward": 1.7584635615348816, "reward_std": 0.06251206062734127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635317325592, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 97.46094131469727, "epoch": 2.2320819112627985, "grad_norm": 0.7861369783368165, "kl": 0.345703125, "learning_rate": 8.145620022753128e-07, "loss": 0.0003, "reward": 1.6712239384651184, "reward_std": 0.06792191229760647, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6712239682674408, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 99.85156631469727, "epoch": 2.2354948805460753, "grad_norm": 1.1727815871938723, "kl": 0.3271484375, "learning_rate": 8.142775881683731e-07, "loss": 0.0003, "reward": 1.6217448115348816, "reward_std": 0.04309370182454586, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6217447817325592, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 98.99219131469727, "epoch": 2.2389078498293515, "grad_norm": 1.1455602212410103, "kl": 0.318359375, "learning_rate": 8.139931740614335e-07, "loss": 0.0003, "reward": 1.6666666865348816, "reward_std": 0.050260525196790695, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6666666567325592, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 103.10416793823242, "epoch": 2.242320819112628, "grad_norm": 1.3446330286673167, "kl": 0.3212890625, "learning_rate": 8.137087599544937e-07, "loss": 0.0003, "reward": 1.5611978769302368, "reward_std": 0.09627503529191017, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5611979216337204, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 102.21614837646484, "epoch": 2.2457337883959045, "grad_norm": 0.9433084686243766, "kl": 0.3310546875, "learning_rate": 8.13424345847554e-07, "loss": 0.0003, "reward": 1.669921875, "reward_std": 0.08574909716844559, "rewards/format_reward": 1.0, "rewards/score_reward": 0.669921875, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 103.80989837646484, "epoch": 2.249146757679181, "grad_norm": 2.100776672112962, "kl": 0.3310546875, "learning_rate": 8.131399317406144e-07, "loss": 0.0003, "reward": 1.578125, "reward_std": 0.06345460936427116, "rewards/format_reward": 1.0, "rewards/score_reward": 0.578125, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 104.77604293823242, "epoch": 2.252559726962457, "grad_norm": 1.670065528625287, "kl": 0.3076171875, "learning_rate": 8.128555176336745e-07, "loss": 0.0003, "reward": 1.69140625, "reward_std": 0.084031468257308, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69140625, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 104.39323043823242, "epoch": 2.255972696245734, "grad_norm": 0.8123361542863025, "kl": 0.302734375, "learning_rate": 8.125711035267349e-07, "loss": 0.0003, "reward": 1.685546875, "reward_std": 0.06549367867410183, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6881510317325592, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 107.20052337646484, "epoch": 2.25938566552901, "grad_norm": 1.614143483529804, "kl": 0.3056640625, "learning_rate": 8.122866894197952e-07, "loss": 0.0003, "reward": 1.6549478769302368, "reward_std": 0.06661384552717209, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6549479365348816, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 103.95052337646484, "epoch": 2.262798634812287, "grad_norm": 1.0544319309657506, "kl": 0.3193359375, "learning_rate": 8.120022753128554e-07, "loss": 0.0003, "reward": 1.6412760615348816, "reward_std": 0.05087629845365882, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6412760317325592, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 106.890625, "epoch": 2.266211604095563, "grad_norm": 1.5471604323748378, "kl": 0.3203125, "learning_rate": 8.117178612059158e-07, "loss": 0.0003, "reward": 1.6822916865348816, "reward_std": 0.06750042736530304, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6822916567325592, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 105.91146087646484, "epoch": 2.26962457337884, "grad_norm": 1.4078075326179038, "kl": 0.310546875, "learning_rate": 8.114334470989761e-07, "loss": 0.0003, "reward": 1.634765625, "reward_std": 0.04955905303359032, "rewards/format_reward": 1.0, "rewards/score_reward": 0.634765625, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 106.95573043823242, "epoch": 2.273037542662116, "grad_norm": 0.7791352713417106, "kl": 0.2958984375, "learning_rate": 8.111490329920365e-07, "loss": 0.0003, "reward": 1.736328125, "reward_std": 0.058129868004471064, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 107.67187881469727, "epoch": 2.2764505119453924, "grad_norm": 1.2352698168167635, "kl": 0.30859375, "learning_rate": 8.108646188850967e-07, "loss": 0.0003, "reward": 1.759765625, "reward_std": 0.05888826213777065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 105.65625381469727, "epoch": 2.279863481228669, "grad_norm": 1.0263143314158263, "kl": 0.3232421875, "learning_rate": 8.105802047781569e-07, "loss": 0.0003, "reward": 1.5989583730697632, "reward_std": 0.06435495242476463, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5989583134651184, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 106.83073425292969, "epoch": 2.2832764505119454, "grad_norm": 1.2962079497655077, "kl": 0.326171875, "learning_rate": 8.102957906712173e-07, "loss": 0.0003, "reward": 1.697265625, "reward_std": 0.07498325780034065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.697265625, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 107.32552337646484, "epoch": 2.2866894197952217, "grad_norm": 0.750141551567522, "kl": 0.3232421875, "learning_rate": 8.100113765642775e-07, "loss": 0.0003, "reward": 1.7454427480697632, "reward_std": 0.054577043280005455, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.748046875, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 107.13802337646484, "epoch": 2.2901023890784984, "grad_norm": 0.8134543562848078, "kl": 0.3330078125, "learning_rate": 8.097269624573379e-07, "loss": 0.0003, "reward": 1.8346354365348816, "reward_std": 0.03557533724233508, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8346354067325592, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 107.20833587646484, "epoch": 2.2935153583617747, "grad_norm": 1.4514559928742483, "kl": 0.4384765625, "learning_rate": 8.094425483503982e-07, "loss": 0.0004, "reward": 1.7213541865348816, "reward_std": 0.0661938488483429, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 108.32552337646484, "epoch": 2.296928327645051, "grad_norm": 2.745260290944982, "kl": 0.2998046875, "learning_rate": 8.091581342434584e-07, "loss": 0.0003, "reward": 1.6783854365348816, "reward_std": 0.06562534347176552, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 106.32031631469727, "epoch": 2.3003412969283277, "grad_norm": 0.7901836319104333, "kl": 0.326171875, "learning_rate": 8.088737201365188e-07, "loss": 0.0003, "reward": 1.7005208134651184, "reward_std": 0.06820796336978674, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 107.70833587646484, "epoch": 2.303754266211604, "grad_norm": 1.5003219215887484, "kl": 0.3056640625, "learning_rate": 8.08589306029579e-07, "loss": 0.0003, "reward": 1.7760416865348816, "reward_std": 0.05263342522084713, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7760416567325592, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 108.58073043823242, "epoch": 2.3071672354948807, "grad_norm": 0.796966554721583, "kl": 0.3203125, "learning_rate": 8.083048919226393e-07, "loss": 0.0003, "reward": 1.7858072519302368, "reward_std": 0.033733912743628025, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7858073115348816, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 106.67708587646484, "epoch": 2.310580204778157, "grad_norm": 1.4901982758380472, "kl": 0.3115234375, "learning_rate": 8.080204778156996e-07, "loss": 0.0003, "reward": 1.673828125, "reward_std": 0.052278757095336914, "rewards/format_reward": 1.0, "rewards/score_reward": 0.673828125, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 107.57031631469727, "epoch": 2.3139931740614337, "grad_norm": 1.1213293675502485, "kl": 0.2880859375, "learning_rate": 8.077360637087599e-07, "loss": 0.0003, "reward": 1.837890625, "reward_std": 0.058063880540430546, "rewards/format_reward": 1.0, "rewards/score_reward": 0.837890625, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 106.95833587646484, "epoch": 2.31740614334471, "grad_norm": 0.9226266431022252, "kl": 0.318359375, "learning_rate": 8.074516496018202e-07, "loss": 0.0003, "reward": 1.6848958134651184, "reward_std": 0.04280748963356018, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6848958432674408, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 106.02864837646484, "epoch": 2.3208191126279862, "grad_norm": 1.8107564320930347, "kl": 0.3388671875, "learning_rate": 8.071672354948805e-07, "loss": 0.0003, "reward": 1.6217447519302368, "reward_std": 0.09455828368663788, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6217448115348816, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 106.40364837646484, "epoch": 2.324232081911263, "grad_norm": 0.6424346765281235, "kl": 0.32421875, "learning_rate": 8.068828213879409e-07, "loss": 0.0003, "reward": 1.6145833730697632, "reward_std": 0.0388009138405323, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6145833134651184, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 104.98177337646484, "epoch": 2.3276450511945392, "grad_norm": 0.35172989072255395, "kl": 0.3232421875, "learning_rate": 8.065984072810011e-07, "loss": 0.0003, "reward": 1.64453125, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.64453125, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 106.44531631469727, "epoch": 2.3310580204778155, "grad_norm": 0.9724172635299714, "kl": 0.3115234375, "learning_rate": 8.063139931740613e-07, "loss": 0.0003, "reward": 1.7877604365348816, "reward_std": 0.06260046549141407, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7877604067325592, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 105.51041793823242, "epoch": 2.3344709897610922, "grad_norm": 1.0046710103823762, "kl": 0.4033203125, "learning_rate": 8.060295790671217e-07, "loss": 0.0004, "reward": 1.7161458730697632, "reward_std": 0.05626784265041351, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458134651184, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 106.0546875, "epoch": 2.3378839590443685, "grad_norm": 0.7917751856496388, "kl": 0.3212890625, "learning_rate": 8.057451649601819e-07, "loss": 0.0003, "reward": 1.615234375, "reward_std": 0.05398268811404705, "rewards/format_reward": 1.0, "rewards/score_reward": 0.615234375, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 103.82291793823242, "epoch": 2.3412969283276452, "grad_norm": 2.6930042081318892, "kl": 0.345703125, "learning_rate": 8.054607508532423e-07, "loss": 0.0003, "reward": 1.6907551884651184, "reward_std": 0.09885259717702866, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 104.59114837646484, "epoch": 2.3447098976109215, "grad_norm": 0.995430000993366, "kl": 0.330078125, "learning_rate": 8.051763367463026e-07, "loss": 0.0003, "reward": 1.6888020634651184, "reward_std": 0.06803633272647858, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 104.58073043823242, "epoch": 2.348122866894198, "grad_norm": 1.5093616070624039, "kl": 0.3125, "learning_rate": 8.048919226393628e-07, "loss": 0.0003, "reward": 1.6731771230697632, "reward_std": 0.05528467148542404, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6731770634651184, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 105.44010925292969, "epoch": 2.3515358361774745, "grad_norm": 1.2218767484601487, "kl": 0.3251953125, "learning_rate": 8.046075085324232e-07, "loss": 0.0003, "reward": 1.5904948115348816, "reward_std": 0.07272518053650856, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5904947817325592, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 104.45052337646484, "epoch": 2.354948805460751, "grad_norm": 0.9276161614433437, "kl": 0.3203125, "learning_rate": 8.043230944254835e-07, "loss": 0.0003, "reward": 1.8020833134651184, "reward_std": 0.04324844013899565, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833432674408, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 105.19531631469727, "epoch": 2.3583617747440275, "grad_norm": 1.0134223947995153, "kl": 0.3095703125, "learning_rate": 8.040386803185438e-07, "loss": 0.0003, "reward": 1.6119791865348816, "reward_std": 0.025513664819300175, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6119791567325592, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 105.72656631469727, "epoch": 2.361774744027304, "grad_norm": 0.572108844570605, "kl": 0.3544921875, "learning_rate": 8.03754266211604e-07, "loss": 0.0004, "reward": 1.6471354365348816, "reward_std": 0.035309887025505304, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6471354067325592, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 108.046875, "epoch": 2.36518771331058, "grad_norm": 1.2157043857924235, "kl": 0.3271484375, "learning_rate": 8.034698521046643e-07, "loss": 0.0003, "reward": 1.6302083134651184, "reward_std": 0.09137860871851444, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6302083432674408, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 107.67708587646484, "epoch": 2.368600682593857, "grad_norm": 1.1629206900307925, "kl": 0.34375, "learning_rate": 8.031854379977247e-07, "loss": 0.0003, "reward": 1.7864583134651184, "reward_std": 0.03202521428465843, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 108.85677337646484, "epoch": 2.372013651877133, "grad_norm": 0.9893078420112971, "kl": 0.3330078125, "learning_rate": 8.029010238907849e-07, "loss": 0.0003, "reward": 1.5872395634651184, "reward_std": 0.06321435421705246, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5872395932674408, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 109.54427337646484, "epoch": 2.3754266211604094, "grad_norm": 0.2854135782321334, "kl": 0.345703125, "learning_rate": 8.026166097838453e-07, "loss": 0.0003, "reward": 1.6959635615348816, "reward_std": 0.016438620164990425, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 107.08333587646484, "epoch": 2.378839590443686, "grad_norm": 1.0018901952160941, "kl": 0.326171875, "learning_rate": 8.023321956769056e-07, "loss": 0.0003, "reward": 1.6653645634651184, "reward_std": 0.046356143429875374, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6653645932674408, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 109.56771087646484, "epoch": 2.3822525597269624, "grad_norm": 2.072964986069925, "kl": 0.322265625, "learning_rate": 8.020477815699659e-07, "loss": 0.0003, "reward": 1.7421875, "reward_std": 0.06632869690656662, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7421875, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 109.3671875, "epoch": 2.385665529010239, "grad_norm": 1.9817720408171766, "kl": 0.33203125, "learning_rate": 8.017633674630261e-07, "loss": 0.0003, "reward": 1.6516926884651184, "reward_std": 0.05273684300482273, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6516927182674408, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 111.7421875, "epoch": 2.3890784982935154, "grad_norm": 1.2970873636892506, "kl": 0.3310546875, "learning_rate": 8.014789533560864e-07, "loss": 0.0003, "reward": 1.6842448115348816, "reward_std": 0.08203713223338127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6842447817325592, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 110.10417175292969, "epoch": 2.3924914675767917, "grad_norm": 1.2591013624067071, "kl": 0.328125, "learning_rate": 8.011945392491467e-07, "loss": 0.0003, "reward": 1.64453125, "reward_std": 0.05223296396434307, "rewards/format_reward": 1.0, "rewards/score_reward": 0.64453125, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 112.40625381469727, "epoch": 2.3959044368600684, "grad_norm": 1.5080818394926234, "kl": 0.3291015625, "learning_rate": 8.00910125142207e-07, "loss": 0.0003, "reward": 1.6438802480697632, "reward_std": 0.07667412981390953, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6438801884651184, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 108.89323043823242, "epoch": 2.3993174061433447, "grad_norm": 1.9249374628004496, "kl": 0.322265625, "learning_rate": 8.006257110352674e-07, "loss": 0.0003, "reward": 1.689453125, "reward_std": 0.10391949117183685, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 108.5546875, "epoch": 2.4027303754266214, "grad_norm": 0.7432821231451135, "kl": 0.3173828125, "learning_rate": 8.003412969283276e-07, "loss": 0.0003, "reward": 1.6770833134651184, "reward_std": 0.04037859849631786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6770833432674408, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 108.98698043823242, "epoch": 2.4061433447098977, "grad_norm": 0.5159555797072627, "kl": 0.3212890625, "learning_rate": 8.000568828213879e-07, "loss": 0.0003, "reward": 1.7122395634651184, "reward_std": 0.028209642972797155, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395932674408, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 108.17708587646484, "epoch": 2.409556313993174, "grad_norm": 1.1371385894430504, "kl": 0.31640625, "learning_rate": 7.997724687144482e-07, "loss": 0.0003, "reward": 1.720703125, "reward_std": 0.05223549343645573, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720703125, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 107.96875381469727, "epoch": 2.4129692832764507, "grad_norm": 2.18842698304344, "kl": 0.3330078125, "learning_rate": 7.994880546075084e-07, "loss": 0.0003, "reward": 1.6022135019302368, "reward_std": 0.05662039015442133, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6022135615348816, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 106.20052337646484, "epoch": 2.416382252559727, "grad_norm": 1.2193056934347086, "kl": 0.3125, "learning_rate": 7.992036405005688e-07, "loss": 0.0003, "reward": 1.72265625, "reward_std": 0.06179104559123516, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 106.6484375, "epoch": 2.419795221843003, "grad_norm": 1.490637777036968, "kl": 0.32421875, "learning_rate": 7.989192263936291e-07, "loss": 0.0003, "reward": 1.6360676884651184, "reward_std": 0.06678730808198452, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6360677182674408, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 105.76562881469727, "epoch": 2.42320819112628, "grad_norm": 0.7133045749326807, "kl": 0.359375, "learning_rate": 7.986348122866893e-07, "loss": 0.0004, "reward": 1.7591145634651184, "reward_std": 0.02978438977152109, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145932674408, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 107.39323043823242, "epoch": 2.426621160409556, "grad_norm": 0.7452262407002039, "kl": 0.3427734375, "learning_rate": 7.983503981797497e-07, "loss": 0.0003, "reward": 1.6171875, "reward_std": 0.046091342344880104, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6171875, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 106.00521087646484, "epoch": 2.430034129692833, "grad_norm": 1.045735178246977, "kl": 0.333984375, "learning_rate": 7.9806598407281e-07, "loss": 0.0003, "reward": 1.7220051884651184, "reward_std": 0.05155232921242714, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 105.65885925292969, "epoch": 2.4334470989761092, "grad_norm": 1.106428485758298, "kl": 0.3388671875, "learning_rate": 7.977815699658704e-07, "loss": 0.0003, "reward": 1.6751301884651184, "reward_std": 0.07864074409008026, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6751302182674408, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 104.65104293823242, "epoch": 2.4368600682593855, "grad_norm": 1.8821960037603567, "kl": 0.326171875, "learning_rate": 7.974971558589305e-07, "loss": 0.0003, "reward": 1.6979166269302368, "reward_std": 0.04145215987227857, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6979166865348816, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 104.65104293823242, "epoch": 2.4402730375426622, "grad_norm": 0.9085527716880027, "kl": 0.326171875, "learning_rate": 7.972127417519908e-07, "loss": 0.0003, "reward": 1.662109375, "reward_std": 0.057804906740784645, "rewards/format_reward": 1.0, "rewards/score_reward": 0.662109375, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 105.49219131469727, "epoch": 2.4436860068259385, "grad_norm": 0.7000893752872132, "kl": 0.3232421875, "learning_rate": 7.969283276450512e-07, "loss": 0.0003, "reward": 1.6868489980697632, "reward_std": 0.027621357701718807, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6868489384651184, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 104.62500381469727, "epoch": 2.4470989761092152, "grad_norm": 0.8459658341682097, "kl": 0.3515625, "learning_rate": 7.966439135381114e-07, "loss": 0.0004, "reward": 1.572265625, "reward_std": 0.06253306940197945, "rewards/format_reward": 1.0, "rewards/score_reward": 0.572265625, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 102.34635543823242, "epoch": 2.4505119453924915, "grad_norm": 2.462945150460452, "kl": 0.34375, "learning_rate": 7.963594994311718e-07, "loss": 0.0003, "reward": 1.7109375, "reward_std": 0.08515188470482826, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 103.6796875, "epoch": 2.453924914675768, "grad_norm": 0.7415750287938692, "kl": 0.3984375, "learning_rate": 7.960750853242321e-07, "loss": 0.0004, "reward": 1.6920573115348816, "reward_std": 0.044364744797348976, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920572817325592, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 100.69792175292969, "epoch": 2.4573378839590445, "grad_norm": 0.8819928609781024, "kl": 0.3193359375, "learning_rate": 7.957906712172923e-07, "loss": 0.0003, "reward": 1.6829426884651184, "reward_std": 0.04935688525438309, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6829427182674408, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 103.23958587646484, "epoch": 2.460750853242321, "grad_norm": 1.3315555076155827, "kl": 0.3330078125, "learning_rate": 7.955062571103527e-07, "loss": 0.0003, "reward": 1.6067708730697632, "reward_std": 0.06816987879574299, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708134651184, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 101.90625381469727, "epoch": 2.464163822525597, "grad_norm": 1.9866466199782562, "kl": 0.337890625, "learning_rate": 7.952218430034129e-07, "loss": 0.0003, "reward": 1.6666666865348816, "reward_std": 0.03629528731107712, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6666666567325592, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 102.34896087646484, "epoch": 2.467576791808874, "grad_norm": 0.9030601400301952, "kl": 0.3447265625, "learning_rate": 7.949374288964732e-07, "loss": 0.0003, "reward": 1.6393229365348816, "reward_std": 0.05486178770661354, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6393229067325592, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 101.35677337646484, "epoch": 2.47098976109215, "grad_norm": 1.911227822502417, "kl": 0.3447265625, "learning_rate": 7.946530147895335e-07, "loss": 0.0003, "reward": 1.6263021230697632, "reward_std": 0.05493106134235859, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6263020634651184, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 100.47656631469727, "epoch": 2.474402730375427, "grad_norm": 4.715492546063484, "kl": 0.3349609375, "learning_rate": 7.943686006825938e-07, "loss": 0.0003, "reward": 1.650390625, "reward_std": 0.055282142013311386, "rewards/format_reward": 1.0, "rewards/score_reward": 0.650390625, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 101.8671875, "epoch": 2.477815699658703, "grad_norm": 0.9476090344955197, "kl": 0.3408203125, "learning_rate": 7.940841865756541e-07, "loss": 0.0003, "reward": 1.73046875, "reward_std": 0.05645064078271389, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73046875, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 102.76302337646484, "epoch": 2.4812286689419794, "grad_norm": 2.37943622119885, "kl": 0.349609375, "learning_rate": 7.937997724687144e-07, "loss": 0.0003, "reward": 1.6588541865348816, "reward_std": 0.03331579361110926, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541567325592, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 101.54427337646484, "epoch": 2.484641638225256, "grad_norm": 1.1931832298284848, "kl": 0.3720703125, "learning_rate": 7.935153583617748e-07, "loss": 0.0004, "reward": 1.66015625, "reward_std": 0.04486938938498497, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 99.97396087646484, "epoch": 2.4880546075085324, "grad_norm": 0.56096451657982, "kl": 0.3505859375, "learning_rate": 7.93230944254835e-07, "loss": 0.0004, "reward": 1.7630208134651184, "reward_std": 0.027925473637878895, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208432674408, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 100.76823425292969, "epoch": 2.491467576791809, "grad_norm": 1.4315097802880585, "kl": 0.326171875, "learning_rate": 7.929465301478952e-07, "loss": 0.0003, "reward": 1.6608073115348816, "reward_std": 0.054559143260121346, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6608072817325592, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 100.88281631469727, "epoch": 2.4948805460750854, "grad_norm": 1.0702467791499577, "kl": 0.3828125, "learning_rate": 7.926621160409556e-07, "loss": 0.0004, "reward": 1.6692708134651184, "reward_std": 0.05306709371507168, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6692708432674408, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 100.00260925292969, "epoch": 2.4982935153583616, "grad_norm": 0.6515995310139302, "kl": 0.357421875, "learning_rate": 7.923777019340158e-07, "loss": 0.0004, "reward": 1.7506510615348816, "reward_std": 0.031296911649405956, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510317325592, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 100.43489837646484, "epoch": 2.5017064846416384, "grad_norm": 1.2797571462886816, "kl": 0.3408203125, "learning_rate": 7.920932878270762e-07, "loss": 0.0003, "reward": 1.7845051884651184, "reward_std": 0.037416763603687286, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 100.50521087646484, "epoch": 2.5051194539249146, "grad_norm": 0.6097032045317201, "kl": 0.3388671875, "learning_rate": 7.918088737201365e-07, "loss": 0.0003, "reward": 1.6705729365348816, "reward_std": 0.021964360494166613, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6705729067325592, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 100.80729675292969, "epoch": 2.508532423208191, "grad_norm": 1.3053944448473571, "kl": 0.359375, "learning_rate": 7.915244596131968e-07, "loss": 0.0004, "reward": 1.7473958134651184, "reward_std": 0.03642883151769638, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 100.39583587646484, "epoch": 2.5119453924914676, "grad_norm": 0.9244191126241131, "kl": 0.3349609375, "learning_rate": 7.912400455062571e-07, "loss": 0.0003, "reward": 1.689453125, "reward_std": 0.04666091036051512, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 100.38281631469727, "epoch": 2.515358361774744, "grad_norm": 0.82152271442344, "kl": 0.4716796875, "learning_rate": 7.909556313993174e-07, "loss": 0.0005, "reward": 1.6276041865348816, "reward_std": 0.05852656811475754, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6276041567325592, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 99.32291793823242, "epoch": 2.51877133105802, "grad_norm": 0.6062365593166977, "kl": 0.345703125, "learning_rate": 7.906712172923777e-07, "loss": 0.0003, "reward": 1.724609375, "reward_std": 0.024790534749627113, "rewards/format_reward": 1.0, "rewards/score_reward": 0.724609375, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 100.46875, "epoch": 2.522184300341297, "grad_norm": 2.3241131028255886, "kl": 0.3349609375, "learning_rate": 7.903868031854379e-07, "loss": 0.0003, "reward": 1.6796875, "reward_std": 0.04282702878117561, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6796875, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 103.79948043823242, "epoch": 2.5255972696245736, "grad_norm": 1.5823624550187803, "kl": 0.349609375, "learning_rate": 7.901023890784982e-07, "loss": 0.0003, "reward": 1.689453125, "reward_std": 0.06273359898477793, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 103.42448043823242, "epoch": 2.52901023890785, "grad_norm": 1.0280223887246849, "kl": 0.3486328125, "learning_rate": 7.898179749715586e-07, "loss": 0.0003, "reward": 1.6041666865348816, "reward_std": 0.06832115165889263, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6041666567325592, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 103.43229293823242, "epoch": 2.532423208191126, "grad_norm": 3.050889107955995, "kl": 0.3408203125, "learning_rate": 7.895335608646188e-07, "loss": 0.0003, "reward": 1.66796875, "reward_std": 0.06091007869690657, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66796875, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 103.21094131469727, "epoch": 2.535836177474403, "grad_norm": 1.2165806631230587, "kl": 0.3388671875, "learning_rate": 7.892491467576792e-07, "loss": 0.0003, "reward": 1.751953125, "reward_std": 0.07960905879735947, "rewards/format_reward": 1.0, "rewards/score_reward": 0.751953125, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 105.89583587646484, "epoch": 2.539249146757679, "grad_norm": 0.6984082962291642, "kl": 0.3642578125, "learning_rate": 7.889647326507395e-07, "loss": 0.0004, "reward": 1.7389322519302368, "reward_std": 0.04066252522170544, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389323115348816, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 103.14583587646484, "epoch": 2.5426621160409555, "grad_norm": 1.4429788145237028, "kl": 0.3251953125, "learning_rate": 7.886803185437996e-07, "loss": 0.0003, "reward": 1.7076823115348816, "reward_std": 0.06987922638654709, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076822817325592, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 105.5859375, "epoch": 2.546075085324232, "grad_norm": 3.0429666226045806, "kl": 0.3251953125, "learning_rate": 7.8839590443686e-07, "loss": 0.0003, "reward": 1.7838541269302368, "reward_std": 0.04530582204461098, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541865348816, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 103.46614837646484, "epoch": 2.5494880546075085, "grad_norm": 0.913403134480414, "kl": 0.3564453125, "learning_rate": 7.881114903299203e-07, "loss": 0.0004, "reward": 1.5540364384651184, "reward_std": 0.03811880946159363, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5540364682674408, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 103.34114837646484, "epoch": 2.5529010238907848, "grad_norm": 3.392811576917803, "kl": 0.361328125, "learning_rate": 7.878270762229806e-07, "loss": 0.0004, "reward": 1.662109375, "reward_std": 0.07783731445670128, "rewards/format_reward": 1.0, "rewards/score_reward": 0.662109375, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 102.93489837646484, "epoch": 2.5563139931740615, "grad_norm": 1.6082018663147901, "kl": 0.3349609375, "learning_rate": 7.875426621160409e-07, "loss": 0.0003, "reward": 1.6783854365348816, "reward_std": 0.059650855138897896, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 106.84375, "epoch": 2.5597269624573378, "grad_norm": 1.8131378042723276, "kl": 0.3359375, "learning_rate": 7.872582480091013e-07, "loss": 0.0003, "reward": 1.7493489980697632, "reward_std": 0.047915685921907425, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7493489384651184, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 105.38802337646484, "epoch": 2.5631399317406145, "grad_norm": 0.8818103472521271, "kl": 0.341796875, "learning_rate": 7.869738339021616e-07, "loss": 0.0003, "reward": 1.6640625, "reward_std": 0.054030715487897396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6640625, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 103.59114837646484, "epoch": 2.5665529010238908, "grad_norm": 1.2312421617691527, "kl": 0.333984375, "learning_rate": 7.866894197952218e-07, "loss": 0.0003, "reward": 1.646484375, "reward_std": 0.057121580466628075, "rewards/format_reward": 1.0, "rewards/score_reward": 0.646484375, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 103.09114837646484, "epoch": 2.5699658703071675, "grad_norm": 0.6818129015603697, "kl": 0.333984375, "learning_rate": 7.864050056882821e-07, "loss": 0.0003, "reward": 1.6360677480697632, "reward_std": 0.03811880946159363, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6360676884651184, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 102.45312881469727, "epoch": 2.573378839590444, "grad_norm": 3.8642081623445, "kl": 0.3662109375, "learning_rate": 7.861205915813423e-07, "loss": 0.0004, "reward": 1.7721354365348816, "reward_std": 0.06408868916332722, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354067325592, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 104.48698043823242, "epoch": 2.57679180887372, "grad_norm": 0.7704821265489634, "kl": 0.341796875, "learning_rate": 7.858361774744027e-07, "loss": 0.0003, "reward": 1.7063801884651184, "reward_std": 0.04464972950518131, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7063802182674408, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 102.98698425292969, "epoch": 2.580204778156997, "grad_norm": 0.454177449727773, "kl": 0.3857421875, "learning_rate": 7.85551763367463e-07, "loss": 0.0004, "reward": 1.7096354365348816, "reward_std": 0.018015244975686073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354067325592, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 102.7734375, "epoch": 2.583617747440273, "grad_norm": 0.7107844985052753, "kl": 0.345703125, "learning_rate": 7.852673492605233e-07, "loss": 0.0003, "reward": 1.67578125, "reward_std": 0.023387661902233958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.67578125, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 103.09114837646484, "epoch": 2.5870307167235493, "grad_norm": 0.6522841822497077, "kl": 0.3447265625, "learning_rate": 7.849829351535836e-07, "loss": 0.0003, "reward": 1.705078125, "reward_std": 0.04039707500487566, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 105.1796875, "epoch": 2.590443686006826, "grad_norm": 1.1971302487968276, "kl": 0.361328125, "learning_rate": 7.846985210466439e-07, "loss": 0.0004, "reward": 1.6673177480697632, "reward_std": 0.061329009011387825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6673176884651184, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 101.17448043823242, "epoch": 2.5938566552901023, "grad_norm": 0.8938134758085035, "kl": 0.322265625, "learning_rate": 7.844141069397043e-07, "loss": 0.0003, "reward": 1.6888020634651184, "reward_std": 0.04734383150935173, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 102.546875, "epoch": 2.5972696245733786, "grad_norm": 1.424904722919981, "kl": 0.3388671875, "learning_rate": 7.841296928327644e-07, "loss": 0.0003, "reward": 1.7682291865348816, "reward_std": 0.060038029216229916, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 101.16927337646484, "epoch": 2.6006825938566553, "grad_norm": 0.8447114731583383, "kl": 0.341796875, "learning_rate": 7.838452787258247e-07, "loss": 0.0003, "reward": 1.6848958134651184, "reward_std": 0.04662387608550489, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6848958432674408, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 102.44010543823242, "epoch": 2.6040955631399316, "grad_norm": 1.4465770846748844, "kl": 0.33984375, "learning_rate": 7.835608646188851e-07, "loss": 0.0003, "reward": 1.6953125, "reward_std": 0.06568264774978161, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6953125, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 99.34896087646484, "epoch": 2.6075085324232083, "grad_norm": 0.9522056698223524, "kl": 0.3759765625, "learning_rate": 7.832764505119453e-07, "loss": 0.0004, "reward": 1.7298177480697632, "reward_std": 0.034568688832223415, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298176884651184, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 101.99479675292969, "epoch": 2.6109215017064846, "grad_norm": 0.7803640855302345, "kl": 0.337890625, "learning_rate": 7.829920364050057e-07, "loss": 0.0003, "reward": 1.6783853769302368, "reward_std": 0.0327466344460845, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854365348816, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 101.14844131469727, "epoch": 2.6143344709897613, "grad_norm": 1.6133649682448037, "kl": 0.345703125, "learning_rate": 7.82707622298066e-07, "loss": 0.0003, "reward": 1.6770833730697632, "reward_std": 0.04848443064838648, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6770833134651184, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 101.25521087646484, "epoch": 2.6177474402730376, "grad_norm": 0.4714833737627319, "kl": 0.3662109375, "learning_rate": 7.824232081911262e-07, "loss": 0.0004, "reward": 1.6536458134651184, "reward_std": 0.02893024403601885, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6536458432674408, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 97.88541793823242, "epoch": 2.621160409556314, "grad_norm": 0.9192961325631411, "kl": 0.361328125, "learning_rate": 7.821387940841866e-07, "loss": 0.0004, "reward": 1.5930989980697632, "reward_std": 0.03344827424734831, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5930989384651184, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 100.0625, "epoch": 2.6245733788395906, "grad_norm": 0.48268584032629785, "kl": 0.353515625, "learning_rate": 7.818543799772468e-07, "loss": 0.0004, "reward": 1.662109375, "reward_std": 0.01657281443476677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.662109375, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 99.40364837646484, "epoch": 2.627986348122867, "grad_norm": 2.239980021263, "kl": 0.3515625, "learning_rate": 7.815699658703071e-07, "loss": 0.0004, "reward": 1.6529948115348816, "reward_std": 0.05191932059824467, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6529947817325592, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 96.80989837646484, "epoch": 2.631399317406143, "grad_norm": 0.5549066970773882, "kl": 0.36328125, "learning_rate": 7.812855517633674e-07, "loss": 0.0004, "reward": 1.712890625, "reward_std": 0.0372832166031003, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 95.87239837646484, "epoch": 2.63481228668942, "grad_norm": 1.2621707511515132, "kl": 0.359375, "learning_rate": 7.810011376564277e-07, "loss": 0.0004, "reward": 1.708984375, "reward_std": 0.04105422645807266, "rewards/format_reward": 1.0, "rewards/score_reward": 0.708984375, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 99.87239837646484, "epoch": 2.638225255972696, "grad_norm": 1.855768146370001, "kl": 0.34375, "learning_rate": 7.80716723549488e-07, "loss": 0.0003, "reward": 1.7278645634651184, "reward_std": 0.05140047799795866, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7278645932674408, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 95.95052337646484, "epoch": 2.6416382252559725, "grad_norm": 0.7752053442735731, "kl": 0.3681640625, "learning_rate": 7.804323094425483e-07, "loss": 0.0004, "reward": 1.6959635615348816, "reward_std": 0.020255661569535732, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 99.05469131469727, "epoch": 2.645051194539249, "grad_norm": 0.600474585614485, "kl": 0.3544921875, "learning_rate": 7.801478953356087e-07, "loss": 0.0004, "reward": 1.7024739384651184, "reward_std": 0.033449744805693626, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7024739682674408, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 96.421875, "epoch": 2.6484641638225255, "grad_norm": 0.9141065211220303, "kl": 0.3642578125, "learning_rate": 7.798634812286689e-07, "loss": 0.0004, "reward": 1.595703125, "reward_std": 0.036959489807486534, "rewards/format_reward": 1.0, "rewards/score_reward": 0.595703125, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 98.14583587646484, "epoch": 2.651877133105802, "grad_norm": 0.47384560300201595, "kl": 0.361328125, "learning_rate": 7.795790671217291e-07, "loss": 0.0004, "reward": 1.7779948115348816, "reward_std": 0.020255662500858307, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7779947817325592, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 95.51823043823242, "epoch": 2.6552901023890785, "grad_norm": 0.7118436787807713, "kl": 0.3505859375, "learning_rate": 7.792946530147895e-07, "loss": 0.0004, "reward": 1.7044271230697632, "reward_std": 0.04747884348034859, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7044270634651184, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 94.01823043823242, "epoch": 2.658703071672355, "grad_norm": 1.228724449790625, "kl": 0.34765625, "learning_rate": 7.790102389078498e-07, "loss": 0.0003, "reward": 1.681640625, "reward_std": 0.029416336677968502, "rewards/format_reward": 1.0, "rewards/score_reward": 0.681640625, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 93.47135543823242, "epoch": 2.6621160409556315, "grad_norm": 0.8538222258419471, "kl": 0.3349609375, "learning_rate": 7.787258248009101e-07, "loss": 0.0003, "reward": 1.8483072519302368, "reward_std": 0.045370498672127724, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8483073115348816, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 94.62760543823242, "epoch": 2.6655290102389078, "grad_norm": 0.8627800546028974, "kl": 0.333984375, "learning_rate": 7.784414106939704e-07, "loss": 0.0003, "reward": 1.68359375, "reward_std": 0.02141392114572227, "rewards/format_reward": 1.0, "rewards/score_reward": 0.68359375, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 93.74219131469727, "epoch": 2.6689419795221845, "grad_norm": 1.375934928796252, "kl": 0.369140625, "learning_rate": 7.781569965870307e-07, "loss": 0.0004, "reward": 1.6516926884651184, "reward_std": 0.08798100054264069, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6516927182674408, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 93.4296875, "epoch": 2.6723549488054608, "grad_norm": 0.9221321380311083, "kl": 0.3837890625, "learning_rate": 7.77872582480091e-07, "loss": 0.0004, "reward": 1.7291666865348816, "reward_std": 0.04931692034006119, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666567325592, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 91.61719131469727, "epoch": 2.675767918088737, "grad_norm": 1.5943123015760805, "kl": 0.3408203125, "learning_rate": 7.775881683731512e-07, "loss": 0.0003, "reward": 1.6647135615348816, "reward_std": 0.05523600056767464, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6647135317325592, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 93.59114837646484, "epoch": 2.6791808873720138, "grad_norm": 1.3149360799099048, "kl": 0.3349609375, "learning_rate": 7.773037542662116e-07, "loss": 0.0003, "reward": 1.5325520634651184, "reward_std": 0.0272219548933208, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5325520932674408, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 91.08594131469727, "epoch": 2.68259385665529, "grad_norm": 1.9677364802407806, "kl": 0.3798828125, "learning_rate": 7.770193401592718e-07, "loss": 0.0004, "reward": 1.7565104365348816, "reward_std": 0.08690943103283644, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104067325592, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 89.74739837646484, "epoch": 2.6860068259385663, "grad_norm": 0.6192167786166826, "kl": 0.365234375, "learning_rate": 7.767349260523321e-07, "loss": 0.0004, "reward": 1.7083333134651184, "reward_std": 0.020779844373464584, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7083333432674408, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 93.38021087646484, "epoch": 2.689419795221843, "grad_norm": 1.2266256562696947, "kl": 0.3525390625, "learning_rate": 7.764505119453925e-07, "loss": 0.0004, "reward": 1.76171875, "reward_std": 0.03669574949890375, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 93.91927337646484, "epoch": 2.6928327645051193, "grad_norm": 1.1754127562023997, "kl": 0.3623046875, "learning_rate": 7.761660978384527e-07, "loss": 0.0004, "reward": 1.6744791865348816, "reward_std": 0.04504619725048542, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791567325592, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 93.45833587646484, "epoch": 2.696245733788396, "grad_norm": 0.6986798656758498, "kl": 0.3369140625, "learning_rate": 7.758816837315131e-07, "loss": 0.0003, "reward": 1.7623697519302368, "reward_std": 0.02836149651557207, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623698115348816, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 94.625, "epoch": 2.6996587030716723, "grad_norm": 1.1751321969444168, "kl": 0.3525390625, "learning_rate": 7.755972696245734e-07, "loss": 0.0004, "reward": 1.703125, "reward_std": 0.05182955786585808, "rewards/format_reward": 1.0, "rewards/score_reward": 0.703125, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 93.76302337646484, "epoch": 2.703071672354949, "grad_norm": 1.090168264467346, "kl": 0.3408203125, "learning_rate": 7.753128555176335e-07, "loss": 0.0003, "reward": 1.68359375, "reward_std": 0.028058198746293783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.68359375, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 94.33854293823242, "epoch": 2.7064846416382253, "grad_norm": 0.8186405219063854, "kl": 0.361328125, "learning_rate": 7.750284414106939e-07, "loss": 0.0004, "reward": 1.625, "reward_std": 0.03287936095148325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.625, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 94.67448043823242, "epoch": 2.7098976109215016, "grad_norm": 1.7883160274769598, "kl": 0.3603515625, "learning_rate": 7.747440273037542e-07, "loss": 0.0004, "reward": 1.666015625, "reward_std": 0.044319361448287964, "rewards/format_reward": 1.0, "rewards/score_reward": 0.666015625, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 96.06771087646484, "epoch": 2.7133105802047783, "grad_norm": 0.8390586960979516, "kl": 0.33984375, "learning_rate": 7.744596131968146e-07, "loss": 0.0003, "reward": 1.7350260615348816, "reward_std": 0.035878635942935944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 94.61198043823242, "epoch": 2.7167235494880546, "grad_norm": 0.9218119058738101, "kl": 0.37890625, "learning_rate": 7.741751990898748e-07, "loss": 0.0004, "reward": 1.6067708134651184, "reward_std": 0.04159665945917368, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708432674408, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 96.38802337646484, "epoch": 2.720136518771331, "grad_norm": 1.075404975480102, "kl": 0.353515625, "learning_rate": 7.738907849829352e-07, "loss": 0.0004, "reward": 1.7122395634651184, "reward_std": 0.04081439785659313, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395932674408, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 96.80989837646484, "epoch": 2.7235494880546076, "grad_norm": 1.0614182849864948, "kl": 0.33984375, "learning_rate": 7.736063708759955e-07, "loss": 0.0003, "reward": 1.7044270634651184, "reward_std": 0.059771763160824776, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7044270932674408, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 96.703125, "epoch": 2.726962457337884, "grad_norm": 1.3121888770382721, "kl": 0.3564453125, "learning_rate": 7.733219567690557e-07, "loss": 0.0004, "reward": 1.6061198115348816, "reward_std": 0.046795270405709743, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6061197817325592, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 97.29427337646484, "epoch": 2.73037542662116, "grad_norm": 1.5043805445631255, "kl": 0.3505859375, "learning_rate": 7.73037542662116e-07, "loss": 0.0004, "reward": 1.7239583730697632, "reward_std": 0.027791929431259632, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583134651184, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 98.02864837646484, "epoch": 2.733788395904437, "grad_norm": 13.562225077409225, "kl": 0.3408203125, "learning_rate": 7.727531285551763e-07, "loss": 0.0003, "reward": 1.6783854365348816, "reward_std": 0.05225462280213833, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 97.45312881469727, "epoch": 2.737201365187713, "grad_norm": 0.5012313576119091, "kl": 0.376953125, "learning_rate": 7.724687144482366e-07, "loss": 0.0004, "reward": 1.5989583134651184, "reward_std": 0.041683049872517586, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5989583432674408, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 100.24739837646484, "epoch": 2.74061433447099, "grad_norm": 0.5925880272513533, "kl": 0.349609375, "learning_rate": 7.721843003412969e-07, "loss": 0.0003, "reward": 1.7623698115348816, "reward_std": 0.029766894411295652, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623697817325592, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 98.0546875, "epoch": 2.744027303754266, "grad_norm": 0.7446717718782289, "kl": 0.349609375, "learning_rate": 7.718998862343572e-07, "loss": 0.0003, "reward": 1.6920572519302368, "reward_std": 0.04545767419040203, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920573115348816, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 99.03385543823242, "epoch": 2.747440273037543, "grad_norm": 0.7484418233022739, "kl": 0.337890625, "learning_rate": 7.716154721274175e-07, "loss": 0.0003, "reward": 1.6790364384651184, "reward_std": 0.015300956554710865, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364682674408, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 99.09375381469727, "epoch": 2.750853242320819, "grad_norm": 1.1272682817533402, "kl": 0.3447265625, "learning_rate": 7.713310580204778e-07, "loss": 0.0003, "reward": 1.654296875, "reward_std": 0.05503219552338123, "rewards/format_reward": 1.0, "rewards/score_reward": 0.654296875, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 100.84635543823242, "epoch": 2.7542662116040955, "grad_norm": 0.8536286193007765, "kl": 0.3505859375, "learning_rate": 7.710466439135382e-07, "loss": 0.0004, "reward": 1.658203125, "reward_std": 0.03857461176812649, "rewards/format_reward": 1.0, "rewards/score_reward": 0.658203125, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 99.92969131469727, "epoch": 2.757679180887372, "grad_norm": 0.34763367969156544, "kl": 0.3330078125, "learning_rate": 7.707622298065983e-07, "loss": 0.0003, "reward": 1.73046875, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73046875, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 101.95312881469727, "epoch": 2.7610921501706485, "grad_norm": 1.2677214016959548, "kl": 0.328125, "learning_rate": 7.704778156996586e-07, "loss": 0.0003, "reward": 1.7005208134651184, "reward_std": 0.07542221620678902, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 102.96614837646484, "epoch": 2.7645051194539247, "grad_norm": 0.9916567320384283, "kl": 0.3447265625, "learning_rate": 7.70193401592719e-07, "loss": 0.0003, "reward": 1.7213541269302368, "reward_std": 0.04423160990700126, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541865348816, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 101.30729675292969, "epoch": 2.7679180887372015, "grad_norm": 0.8134171544703641, "kl": 0.33203125, "learning_rate": 7.699089874857792e-07, "loss": 0.0003, "reward": 1.6315103769302368, "reward_std": 0.05749349854886532, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6315104365348816, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 104.34114837646484, "epoch": 2.7713310580204777, "grad_norm": 0.7354906192080863, "kl": 0.349609375, "learning_rate": 7.696245733788396e-07, "loss": 0.0003, "reward": 1.7428385019302368, "reward_std": 0.04778132401406765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385615348816, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 104.22135543823242, "epoch": 2.774744027303754, "grad_norm": 1.859782846684756, "kl": 0.3408203125, "learning_rate": 7.693401592718999e-07, "loss": 0.0003, "reward": 1.6790364980697632, "reward_std": 0.06623752415180206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364384651184, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 102.59635543823242, "epoch": 2.7781569965870307, "grad_norm": 0.8989640718572539, "kl": 0.3427734375, "learning_rate": 7.690557451649601e-07, "loss": 0.0003, "reward": 1.6595051884651184, "reward_std": 0.04580758325755596, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 106.27083587646484, "epoch": 2.781569965870307, "grad_norm": 1.166100940266947, "kl": 0.3447265625, "learning_rate": 7.687713310580204e-07, "loss": 0.0003, "reward": 1.7923177480697632, "reward_std": 0.026234676130115986, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923176884651184, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 106.00521087646484, "epoch": 2.7849829351535837, "grad_norm": 0.9521688925923095, "kl": 0.32421875, "learning_rate": 7.684869169510807e-07, "loss": 0.0003, "reward": 1.5963541865348816, "reward_std": 0.05727424472570419, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5963541716337204, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 107.10677337646484, "epoch": 2.78839590443686, "grad_norm": 1.0105341348015664, "kl": 0.3203125, "learning_rate": 7.68202502844141e-07, "loss": 0.0003, "reward": 1.6100260615348816, "reward_std": 0.0753069818019867, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6100260317325592, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 107.52864837646484, "epoch": 2.7918088737201368, "grad_norm": 1.3855436354169501, "kl": 0.359375, "learning_rate": 7.679180887372013e-07, "loss": 0.0004, "reward": 1.6966145634651184, "reward_std": 0.044668858870863914, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145932674408, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 106.98177337646484, "epoch": 2.795221843003413, "grad_norm": 1.378767021809889, "kl": 0.33984375, "learning_rate": 7.676336746302616e-07, "loss": 0.0003, "reward": 1.736328125, "reward_std": 0.06363247521221638, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 108.31510543823242, "epoch": 2.7986348122866893, "grad_norm": 1.0433024326025169, "kl": 0.3251953125, "learning_rate": 7.67349260523322e-07, "loss": 0.0003, "reward": 1.73046875, "reward_std": 0.034086463041603565, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73046875, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 108.11979675292969, "epoch": 2.802047781569966, "grad_norm": 1.0087548989104589, "kl": 0.3505859375, "learning_rate": 7.670648464163822e-07, "loss": 0.0004, "reward": 1.6490885615348816, "reward_std": 0.03813818097114563, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6490885317325592, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 110.31510543823242, "epoch": 2.8054607508532423, "grad_norm": 0.9797680090841479, "kl": 0.359375, "learning_rate": 7.667804323094426e-07, "loss": 0.0004, "reward": 1.701171875, "reward_std": 0.049054816365242004, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 111.98958587646484, "epoch": 2.8088737201365186, "grad_norm": 0.9669836482592344, "kl": 0.3388671875, "learning_rate": 7.664960182025028e-07, "loss": 0.0003, "reward": 1.6848958730697632, "reward_std": 0.04756536707282066, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6848958134651184, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 109.42708587646484, "epoch": 2.8122866894197953, "grad_norm": 1.5980737635881619, "kl": 0.32421875, "learning_rate": 7.66211604095563e-07, "loss": 0.0003, "reward": 1.6432291269302368, "reward_std": 0.05585012398660183, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6432291865348816, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 110.71614837646484, "epoch": 2.8156996587030716, "grad_norm": 1.2687594015345784, "kl": 0.34765625, "learning_rate": 7.659271899886234e-07, "loss": 0.0003, "reward": 1.6243489980697632, "reward_std": 0.06934668868780136, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6243489384651184, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 114.47396087646484, "epoch": 2.819112627986348, "grad_norm": 1.0279380719790983, "kl": 0.3603515625, "learning_rate": 7.656427758816837e-07, "loss": 0.0004, "reward": 1.6966145634651184, "reward_std": 0.053287042304873466, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145932674408, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 114.64062881469727, "epoch": 2.8225255972696246, "grad_norm": 1.2021595163679764, "kl": 0.3291015625, "learning_rate": 7.65358361774744e-07, "loss": 0.0003, "reward": 1.7213541269302368, "reward_std": 0.08083124924451113, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541865348816, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 114.1796875, "epoch": 2.825938566552901, "grad_norm": 4.436490540176609, "kl": 0.380859375, "learning_rate": 7.650739476678043e-07, "loss": 0.0004, "reward": 1.6171875, "reward_std": 0.06507490389049053, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6171875, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 118.01823043823242, "epoch": 2.8293515358361776, "grad_norm": 0.9972147331866373, "kl": 0.33203125, "learning_rate": 7.647895335608646e-07, "loss": 0.0003, "reward": 1.830078125, "reward_std": 0.027088819071650505, "rewards/format_reward": 1.0, "rewards/score_reward": 0.830078125, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 116.44791793823242, "epoch": 2.832764505119454, "grad_norm": 1.5490272653522719, "kl": 0.3125, "learning_rate": 7.645051194539249e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.04734447970986366, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 116.54948043823242, "epoch": 2.8361774744027306, "grad_norm": 1.1426667527188152, "kl": 0.32421875, "learning_rate": 7.642207053469851e-07, "loss": 0.0003, "reward": 1.61328125, "reward_std": 0.08493182063102722, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6158854365348816, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 119.08073425292969, "epoch": 2.839590443686007, "grad_norm": 3.9118534777016776, "kl": 0.30859375, "learning_rate": 7.639362912400455e-07, "loss": 0.0003, "reward": 1.6555989384651184, "reward_std": 0.08367990329861641, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989682674408, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 117.05729293823242, "epoch": 2.843003412969283, "grad_norm": 0.6450559524118008, "kl": 0.490234375, "learning_rate": 7.636518771331057e-07, "loss": 0.0005, "reward": 1.708984375, "reward_std": 0.03202415443956852, "rewards/format_reward": 1.0, "rewards/score_reward": 0.708984375, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 118.32031631469727, "epoch": 2.84641638225256, "grad_norm": 1.6913882678942953, "kl": 2.1357421875, "learning_rate": 7.63367463026166e-07, "loss": 0.0021, "reward": 1.6647135019302368, "reward_std": 0.04324432834982872, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6647135615348816, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 120.296875, "epoch": 2.849829351535836, "grad_norm": 0.912199972723799, "kl": 0.3212890625, "learning_rate": 7.630830489192264e-07, "loss": 0.0003, "reward": 1.6907551884651184, "reward_std": 0.030753114260733128, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 123.28125381469727, "epoch": 2.8532423208191124, "grad_norm": 1.2136526784950472, "kl": 0.3251953125, "learning_rate": 7.627986348122866e-07, "loss": 0.0003, "reward": 1.66796875, "reward_std": 0.06457584071904421, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.6731770932674408, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 122.61198043823242, "epoch": 2.856655290102389, "grad_norm": 0.4266727762019838, "kl": 0.341796875, "learning_rate": 7.62514220705347e-07, "loss": 0.0003, "reward": 1.6263021230697632, "reward_std": 0.033100245986133814, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6263020634651184, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 119.6796875, "epoch": 2.8600682593856654, "grad_norm": 0.512484329745866, "kl": 0.3359375, "learning_rate": 7.622298065984073e-07, "loss": 0.0003, "reward": 1.6959635615348816, "reward_std": 0.022267657332122326, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 123.96094131469727, "epoch": 2.8634812286689417, "grad_norm": 1.3525865132672972, "kl": 0.328125, "learning_rate": 7.619453924914674e-07, "loss": 0.0003, "reward": 1.6783854365348816, "reward_std": 0.06108900532126427, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.68359375, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 120.30729675292969, "epoch": 2.8668941979522184, "grad_norm": 1.2971211716451714, "kl": 0.3447265625, "learning_rate": 7.616609783845278e-07, "loss": 0.0003, "reward": 1.6868489980697632, "reward_std": 0.07833703607320786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6868489384651184, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 121.08333587646484, "epoch": 2.8703071672354947, "grad_norm": 4.069403180489904, "kl": 0.33203125, "learning_rate": 7.613765642775881e-07, "loss": 0.0003, "reward": 1.728515625, "reward_std": 0.10532183200120926, "rewards/format_reward": 0.9895833432674408, "rewards/score_reward": 0.7389323115348816, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 120.17448425292969, "epoch": 2.8737201365187715, "grad_norm": 2.837868346190689, "kl": 0.3369140625, "learning_rate": 7.610921501706485e-07, "loss": 0.0003, "reward": 1.5787760615348816, "reward_std": 0.045636600349098444, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5787760317325592, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 123.71614837646484, "epoch": 2.8771331058020477, "grad_norm": 2.8545011158251645, "kl": 0.3271484375, "learning_rate": 7.608077360637087e-07, "loss": 0.0003, "reward": 1.671875, "reward_std": 0.10229508206248283, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.6770833432674408, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 123.63021087646484, "epoch": 2.8805460750853245, "grad_norm": 1.4961202778972358, "kl": 0.32421875, "learning_rate": 7.605233219567691e-07, "loss": 0.0003, "reward": 1.59375, "reward_std": 0.05710326321423054, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5963541865348816, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 120.15364837646484, "epoch": 2.8839590443686007, "grad_norm": 1.6286674598621518, "kl": 0.3271484375, "learning_rate": 7.602389078498294e-07, "loss": 0.0003, "reward": 1.6555989980697632, "reward_std": 0.05541328527033329, "rewards/format_reward": 0.9921875, "rewards/score_reward": 0.6634114682674408, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 121.59896087646484, "epoch": 2.887372013651877, "grad_norm": 0.9605841144173257, "kl": 0.3349609375, "learning_rate": 7.599544937428895e-07, "loss": 0.0003, "reward": 1.71484375, "reward_std": 0.06462040543556213, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.7200520932674408, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 119.65104293823242, "epoch": 2.8907849829351537, "grad_norm": 1.2311666489810822, "kl": 0.33984375, "learning_rate": 7.596700796359499e-07, "loss": 0.0003, "reward": 1.630859375, "reward_std": 0.10517039522528648, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6334635615348816, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 119.77864837646484, "epoch": 2.89419795221843, "grad_norm": 1.150038222588226, "kl": 0.33203125, "learning_rate": 7.593856655290102e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.0366301778703928, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.7044270932674408, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 122.61198043823242, "epoch": 2.8976109215017063, "grad_norm": 1.327781893077994, "kl": 0.3427734375, "learning_rate": 7.591012514220705e-07, "loss": 0.0003, "reward": 1.697265625, "reward_std": 0.07244166173040867, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6998698115348816, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 120.41667175292969, "epoch": 2.901023890784983, "grad_norm": 1.0408145228844203, "kl": 0.3564453125, "learning_rate": 7.588168373151308e-07, "loss": 0.0004, "reward": 1.685546875, "reward_std": 0.100460946559906, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.6907552182674408, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 118.09375, "epoch": 2.9044368600682593, "grad_norm": 1.077944683274948, "kl": 0.3505859375, "learning_rate": 7.585324232081911e-07, "loss": 0.0004, "reward": 1.7076823115348816, "reward_std": 0.04916122928261757, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076822817325592, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 121.10416793823242, "epoch": 2.9078498293515356, "grad_norm": 0.6022559647601229, "kl": 0.341796875, "learning_rate": 7.582480091012514e-07, "loss": 0.0003, "reward": 1.6477864384651184, "reward_std": 0.05414224602282047, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.650390625, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 120.82292175292969, "epoch": 2.9112627986348123, "grad_norm": 5.768487861686044, "kl": 0.3447265625, "learning_rate": 7.579635949943117e-07, "loss": 0.0003, "reward": 1.7141926884651184, "reward_std": 0.09197631850838661, "rewards/format_reward": 0.9921875, "rewards/score_reward": 0.7220052182674408, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 118.99219131469727, "epoch": 2.9146757679180886, "grad_norm": 0.5389214852684965, "kl": 0.349609375, "learning_rate": 7.57679180887372e-07, "loss": 0.0003, "reward": 1.5696614980697632, "reward_std": 0.027355906553566456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5696614384651184, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 122.68489837646484, "epoch": 2.9180887372013653, "grad_norm": 0.8995813657030552, "kl": 0.33203125, "learning_rate": 7.573947667804322e-07, "loss": 0.0003, "reward": 1.66015625, "reward_std": 0.0512482151389122, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6627604365348816, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 116.87760925292969, "epoch": 2.9215017064846416, "grad_norm": 0.42417724096571674, "kl": 0.349609375, "learning_rate": 7.571103526734925e-07, "loss": 0.0003, "reward": 1.7252603769302368, "reward_std": 0.025380939245224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604365348816, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 117.90885543823242, "epoch": 2.9249146757679183, "grad_norm": 0.9116752329839035, "kl": 0.3359375, "learning_rate": 7.568259385665529e-07, "loss": 0.0003, "reward": 1.6627604365348816, "reward_std": 0.06470775045454502, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.66796875, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 119.59114837646484, "epoch": 2.9283276450511946, "grad_norm": 0.874232582259426, "kl": 0.3330078125, "learning_rate": 7.565415244596131e-07, "loss": 0.0003, "reward": 1.7819010615348816, "reward_std": 0.033164105378091335, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7819010317325592, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 119.08854293823242, "epoch": 2.931740614334471, "grad_norm": 0.8563628050351609, "kl": 0.3359375, "learning_rate": 7.562571103526735e-07, "loss": 0.0003, "reward": 1.6608072519302368, "reward_std": 0.032462057657539845, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6634114682674408, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 118.21614837646484, "epoch": 2.9351535836177476, "grad_norm": 0.8336319162886326, "kl": 0.33203125, "learning_rate": 7.559726962457338e-07, "loss": 0.0003, "reward": 1.7180989980697632, "reward_std": 0.0802936963737011, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.720703125, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 118.37500381469727, "epoch": 2.938566552901024, "grad_norm": 1.6957078186309504, "kl": 0.3447265625, "learning_rate": 7.55688282138794e-07, "loss": 0.0003, "reward": 1.4772135615348816, "reward_std": 0.061657918617129326, "rewards/format_reward": 1.0, "rewards/score_reward": 0.4772135466337204, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 118.50000381469727, "epoch": 2.9419795221843, "grad_norm": 0.45844641247092643, "kl": 0.3388671875, "learning_rate": 7.554038680318543e-07, "loss": 0.0003, "reward": 1.6881510019302368, "reward_std": 0.045368860475718975, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6881510615348816, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 117.34635543823242, "epoch": 2.945392491467577, "grad_norm": 0.34860402742253394, "kl": 0.33203125, "learning_rate": 7.551194539249146e-07, "loss": 0.0003, "reward": 1.7395833134651184, "reward_std": 0.014731390401721, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7421875, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 119.37239837646484, "epoch": 2.948805460750853, "grad_norm": 2.8555029780070917, "kl": 0.330078125, "learning_rate": 7.54835039817975e-07, "loss": 0.0003, "reward": 1.6764323115348816, "reward_std": 0.08537916839122772, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6790364682674408, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 120.86979293823242, "epoch": 2.9522184300341294, "grad_norm": 1.428872098800159, "kl": 0.3466796875, "learning_rate": 7.545506257110352e-07, "loss": 0.0003, "reward": 1.6328125, "reward_std": 0.04366163443773985, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6328125, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 120.11719131469727, "epoch": 2.955631399317406, "grad_norm": 0.7406423110980573, "kl": 0.333984375, "learning_rate": 7.542662116040955e-07, "loss": 0.0003, "reward": 1.65234375, "reward_std": 0.0671195536851883, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65234375, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 123.71875381469727, "epoch": 2.9590443686006824, "grad_norm": 0.8268963946677109, "kl": 0.3193359375, "learning_rate": 7.539817974971559e-07, "loss": 0.0003, "reward": 1.658203125, "reward_std": 0.04892209079116583, "rewards/format_reward": 1.0, "rewards/score_reward": 0.658203125, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 118.3203125, "epoch": 2.962457337883959, "grad_norm": 0.8642313204063196, "kl": 0.412109375, "learning_rate": 7.536973833902161e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.045503877103328705, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998697817325592, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 121.09114837646484, "epoch": 2.9658703071672354, "grad_norm": 1.869259789485949, "kl": 0.328125, "learning_rate": 7.534129692832765e-07, "loss": 0.0003, "reward": 1.7428385019302368, "reward_std": 0.027796040754765272, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385615348816, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 121.50521087646484, "epoch": 2.969283276450512, "grad_norm": 1.0450855057866537, "kl": 0.3388671875, "learning_rate": 7.531285551763367e-07, "loss": 0.0003, "reward": 1.677734375, "reward_std": 0.049451522529125214, "rewards/format_reward": 1.0, "rewards/score_reward": 0.677734375, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 125.95573425292969, "epoch": 2.9726962457337884, "grad_norm": 0.7826126016355441, "kl": 0.3486328125, "learning_rate": 7.528441410693969e-07, "loss": 0.0003, "reward": 1.642578125, "reward_std": 0.053190361708402634, "rewards/format_reward": 1.0, "rewards/score_reward": 0.642578125, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 122.47396087646484, "epoch": 2.9761092150170647, "grad_norm": 2.544460107070754, "kl": 3.5322265625, "learning_rate": 7.525597269624573e-07, "loss": 0.0035, "reward": 1.69140625, "reward_std": 0.10633007064461708, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6940104067325592, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 124.58333587646484, "epoch": 2.9795221843003414, "grad_norm": 0.9449805241529943, "kl": 0.3349609375, "learning_rate": 7.522753128555176e-07, "loss": 0.0003, "reward": 1.7376302480697632, "reward_std": 0.03984565567225218, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.740234375, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 123.79687881469727, "epoch": 2.9829351535836177, "grad_norm": 0.8058079924803631, "kl": 0.3466796875, "learning_rate": 7.519908987485779e-07, "loss": 0.0003, "reward": 1.7760416269302368, "reward_std": 0.03570806421339512, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7760416865348816, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 128.2942771911621, "epoch": 2.986348122866894, "grad_norm": 1.0856987110467229, "kl": 0.328125, "learning_rate": 7.517064846416382e-07, "loss": 0.0003, "reward": 1.767578125, "reward_std": 0.06012207642197609, "rewards/format_reward": 1.0, "rewards/score_reward": 0.767578125, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 128.83333587646484, "epoch": 2.9897610921501707, "grad_norm": 0.7707142832604001, "kl": 0.3369140625, "learning_rate": 7.514220705346985e-07, "loss": 0.0003, "reward": 1.7643228769302368, "reward_std": 0.08544645085930824, "rewards/format_reward": 0.9869791567325592, "rewards/score_reward": 0.77734375, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 129.3125, "epoch": 2.993174061433447, "grad_norm": 0.5703078389332564, "kl": 0.3427734375, "learning_rate": 7.511376564277588e-07, "loss": 0.0003, "reward": 1.7389323115348816, "reward_std": 0.034721603617072105, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389322817325592, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 125.25000762939453, "epoch": 2.9965870307167233, "grad_norm": 1.9673531024193323, "kl": 0.34375, "learning_rate": 7.50853242320819e-07, "loss": 0.0003, "reward": 1.4916667938232422, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.49166668951511383, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 127.38541793823242, "epoch": 3.0034129692832763, "grad_norm": 2.3988669509272476, "kl": 0.3408203125, "learning_rate": 7.505688282138794e-07, "loss": 0.0003, "reward": 1.7688801884651184, "reward_std": 0.051969222724437714, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.771484375, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 126.64583587646484, "epoch": 3.006825938566553, "grad_norm": 0.8160630442104931, "kl": 0.34765625, "learning_rate": 7.502844141069396e-07, "loss": 0.0003, "reward": 1.62890625, "reward_std": 0.09428206458687782, "rewards/format_reward": 0.984375, "rewards/score_reward": 0.64453125, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 129.9947967529297, "epoch": 3.0102389078498293, "grad_norm": 0.9041253065847712, "kl": 0.3251953125, "learning_rate": 7.5e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.09450090490281582, "rewards/format_reward": 0.9895833134651184, "rewards/score_reward": 0.6979166567325592, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 129.66406631469727, "epoch": 3.013651877133106, "grad_norm": 1.1569797989460096, "kl": 0.3291015625, "learning_rate": 7.497155858930603e-07, "loss": 0.0003, "reward": 1.7330729365348816, "reward_std": 0.08349431492388248, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.73828125, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 127.84635925292969, "epoch": 3.0170648464163823, "grad_norm": 1.0073228236661187, "kl": 0.341796875, "learning_rate": 7.494311717861206e-07, "loss": 0.0003, "reward": 1.7122396230697632, "reward_std": 0.08568301796913147, "rewards/format_reward": 0.984375, "rewards/score_reward": 0.7278645932674408, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 125.23958587646484, "epoch": 3.0204778156996586, "grad_norm": 1.0487538641174197, "kl": 0.3310546875, "learning_rate": 7.491467576791809e-07, "loss": 0.0003, "reward": 1.7115885019302368, "reward_std": 0.0840219296514988, "rewards/format_reward": 0.9817708134651184, "rewards/score_reward": 0.7298176884651184, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 126.48177337646484, "epoch": 3.0238907849829353, "grad_norm": 0.9485360093353027, "kl": 0.3291015625, "learning_rate": 7.488623435722411e-07, "loss": 0.0003, "reward": 1.6751301884651184, "reward_std": 0.0926969163119793, "rewards/format_reward": 0.9895833432674408, "rewards/score_reward": 0.685546875, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 125.60937881469727, "epoch": 3.0273037542662116, "grad_norm": 0.752849642797922, "kl": 0.3408203125, "learning_rate": 7.485779294653015e-07, "loss": 0.0003, "reward": 1.6393228769302368, "reward_std": 0.03864026814699173, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.64453125, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 126.78907012939453, "epoch": 3.030716723549488, "grad_norm": 0.765818648092144, "kl": 0.337890625, "learning_rate": 7.482935153583617e-07, "loss": 0.0003, "reward": 1.7591145634651184, "reward_std": 0.06042821519076824, "rewards/format_reward": 0.9895833432674408, "rewards/score_reward": 0.76953125, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 129.28646087646484, "epoch": 3.0341296928327646, "grad_norm": 0.9870836819756575, "kl": 0.330078125, "learning_rate": 7.48009101251422e-07, "loss": 0.0003, "reward": 1.7454426884651184, "reward_std": 0.08710894733667374, "rewards/format_reward": 0.9895833134651184, "rewards/score_reward": 0.755859375, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 121.92187881469727, "epoch": 3.037542662116041, "grad_norm": 0.2876951807388977, "kl": 0.337890625, "learning_rate": 7.477246871444824e-07, "loss": 0.0003, "reward": 1.705078125, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 122.56510543823242, "epoch": 3.0409556313993176, "grad_norm": 3.588866037494204, "kl": 0.3203125, "learning_rate": 7.474402730375426e-07, "loss": 0.0003, "reward": 1.767578125, "reward_std": 0.05918347928673029, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.7727864682674408, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 122.84114837646484, "epoch": 3.044368600682594, "grad_norm": 1.052972077607255, "kl": 0.3369140625, "learning_rate": 7.47155858930603e-07, "loss": 0.0003, "reward": 1.6868489980697632, "reward_std": 0.0716740433126688, "rewards/format_reward": 0.9921875, "rewards/score_reward": 0.6946614682674408, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 122.66146087646484, "epoch": 3.04778156996587, "grad_norm": 10.175969879521812, "kl": 0.337890625, "learning_rate": 7.468714448236633e-07, "loss": 0.0003, "reward": 1.7376302480697632, "reward_std": 0.029899622313678265, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.7428385615348816, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 121.3515625, "epoch": 3.051194539249147, "grad_norm": 0.8907748975985047, "kl": 0.353515625, "learning_rate": 7.465870307167234e-07, "loss": 0.0004, "reward": 1.611328125, "reward_std": 0.05409604497253895, "rewards/format_reward": 1.0, "rewards/score_reward": 0.611328125, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 121.08594131469727, "epoch": 3.054607508532423, "grad_norm": 0.8843339976651706, "kl": 0.3427734375, "learning_rate": 7.463026166097838e-07, "loss": 0.0003, "reward": 1.5416666865348816, "reward_std": 0.0471267169341445, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.5442708283662796, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 119.31250381469727, "epoch": 3.0580204778157, "grad_norm": 1.1398679458233, "kl": 0.3564453125, "learning_rate": 7.460182025028441e-07, "loss": 0.0004, "reward": 1.6341145634651184, "reward_std": 0.06313087698072195, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.63671875, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 118.51302337646484, "epoch": 3.061433447098976, "grad_norm": 0.765184834159464, "kl": 0.32421875, "learning_rate": 7.457337883959044e-07, "loss": 0.0003, "reward": 1.7115885615348816, "reward_std": 0.060215895995497704, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885317325592, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 118.33854293823242, "epoch": 3.0648464163822524, "grad_norm": 1.4524641424027327, "kl": 0.341796875, "learning_rate": 7.454493742889647e-07, "loss": 0.0003, "reward": 1.7298177480697632, "reward_std": 0.05484331212937832, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.732421875, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 117.21614837646484, "epoch": 3.068259385665529, "grad_norm": 0.803816312624113, "kl": 0.3583984375, "learning_rate": 7.45164960182025e-07, "loss": 0.0004, "reward": 1.78125, "reward_std": 0.03912464156746864, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 117.53385543823242, "epoch": 3.0716723549488054, "grad_norm": 1.383194610399521, "kl": 0.3330078125, "learning_rate": 7.448805460750853e-07, "loss": 0.0003, "reward": 1.72265625, "reward_std": 0.02573454938828945, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 117.49739837646484, "epoch": 3.0750853242320817, "grad_norm": 0.4604845283311597, "kl": 0.3505859375, "learning_rate": 7.445961319681456e-07, "loss": 0.0004, "reward": 1.6100260615348816, "reward_std": 0.027355908416211605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6126302182674408, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 117.24739837646484, "epoch": 3.0784982935153584, "grad_norm": 0.5435103446762062, "kl": 0.3369140625, "learning_rate": 7.443117178612059e-07, "loss": 0.0003, "reward": 1.6998697519302368, "reward_std": 0.030905211344361305, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998698115348816, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 118.38802337646484, "epoch": 3.0819112627986347, "grad_norm": 0.5330113883586366, "kl": 0.345703125, "learning_rate": 7.440273037542661e-07, "loss": 0.0003, "reward": 1.65625, "reward_std": 0.029197330586612225, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65625, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 116.04166793823242, "epoch": 3.0853242320819114, "grad_norm": 0.6122298729666796, "kl": 0.357421875, "learning_rate": 7.437428896473264e-07, "loss": 0.0004, "reward": 1.7389322519302368, "reward_std": 0.05414224788546562, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7415364682674408, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 117.953125, "epoch": 3.0887372013651877, "grad_norm": 0.6873699312669506, "kl": 0.337890625, "learning_rate": 7.434584755403868e-07, "loss": 0.0003, "reward": 1.6809895634651184, "reward_std": 0.032132746651768684, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6809895932674408, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 115.92969131469727, "epoch": 3.092150170648464, "grad_norm": 0.5985799846373041, "kl": 0.33203125, "learning_rate": 7.43174061433447e-07, "loss": 0.0003, "reward": 1.6907551884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 115.56250381469727, "epoch": 3.0955631399317407, "grad_norm": 3.3200508175266146, "kl": 0.3447265625, "learning_rate": 7.428896473265074e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.054995983839035034, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7213541567325592, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 117.73177337646484, "epoch": 3.098976109215017, "grad_norm": 1.056312399504638, "kl": 0.462890625, "learning_rate": 7.426052332195677e-07, "loss": 0.0005, "reward": 1.599609375, "reward_std": 0.042147659696638584, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.6048177182674408, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 117.30208587646484, "epoch": 3.1023890784982937, "grad_norm": 0.7081585115727521, "kl": 0.3486328125, "learning_rate": 7.42320819112628e-07, "loss": 0.0003, "reward": 1.833984375, "reward_std": 0.04096606653183699, "rewards/format_reward": 1.0, "rewards/score_reward": 0.833984375, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 116.13021087646484, "epoch": 3.10580204778157, "grad_norm": 1.0346542426724048, "kl": 0.357421875, "learning_rate": 7.420364050056882e-07, "loss": 0.0004, "reward": 1.7356770634651184, "reward_std": 0.05201583541929722, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7356770932674408, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 117.86198043823242, "epoch": 3.1092150170648463, "grad_norm": 0.7530818372340873, "kl": 0.349609375, "learning_rate": 7.417519908987485e-07, "loss": 0.0003, "reward": 1.7200520634651184, "reward_std": 0.040112329646945, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520932674408, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 116.4375, "epoch": 3.112627986348123, "grad_norm": 0.7401872496227557, "kl": 0.353515625, "learning_rate": 7.414675767918089e-07, "loss": 0.0004, "reward": 1.8450520634651184, "reward_std": 0.05598367191851139, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8450520932674408, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 118.55469131469727, "epoch": 3.1160409556313993, "grad_norm": 1.4886506313041654, "kl": 0.3388671875, "learning_rate": 7.411831626848691e-07, "loss": 0.0003, "reward": 1.6959635019302368, "reward_std": 0.08024749159812927, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635615348816, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 116.22135543823242, "epoch": 3.1194539249146755, "grad_norm": 0.7006350654861662, "kl": 0.33984375, "learning_rate": 7.408987485779294e-07, "loss": 0.0003, "reward": 1.7102864384651184, "reward_std": 0.037016949616372585, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7102864682674408, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 117.18750381469727, "epoch": 3.1228668941979523, "grad_norm": 0.4773958338640885, "kl": 0.34375, "learning_rate": 7.406143344709898e-07, "loss": 0.0003, "reward": 1.7428385615348816, "reward_std": 0.030649589374661446, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7454427182674408, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 120.64062881469727, "epoch": 3.1262798634812285, "grad_norm": 1.064732161016139, "kl": 0.328125, "learning_rate": 7.4032992036405e-07, "loss": 0.0003, "reward": 1.7428385615348816, "reward_std": 0.03480812627822161, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385317325592, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 117.95052337646484, "epoch": 3.1296928327645053, "grad_norm": 1.6051250208237804, "kl": 0.34375, "learning_rate": 7.400455062571104e-07, "loss": 0.0003, "reward": 1.767578125, "reward_std": 0.027708697598427534, "rewards/format_reward": 1.0, "rewards/score_reward": 0.767578125, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 118.30208587646484, "epoch": 3.1331058020477816, "grad_norm": 0.8862958703740497, "kl": 0.3427734375, "learning_rate": 7.397610921501706e-07, "loss": 0.0003, "reward": 1.6731770634651184, "reward_std": 0.04252413660287857, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6731770932674408, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 119.26562881469727, "epoch": 3.136518771331058, "grad_norm": 0.5948152564471709, "kl": 0.3359375, "learning_rate": 7.394766780432308e-07, "loss": 0.0003, "reward": 1.7233073115348816, "reward_std": 0.04409847687929869, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7233072817325592, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 116.26042175292969, "epoch": 3.1399317406143346, "grad_norm": 1.7631549199839456, "kl": 0.359375, "learning_rate": 7.391922639362912e-07, "loss": 0.0004, "reward": 1.7252603769302368, "reward_std": 0.035158442333340645, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604365348816, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 118.01302337646484, "epoch": 3.143344709897611, "grad_norm": 0.9787810876482336, "kl": 0.35546875, "learning_rate": 7.389078498293515e-07, "loss": 0.0004, "reward": 1.6888020634651184, "reward_std": 0.05241335928440094, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 115.58333587646484, "epoch": 3.1467576791808876, "grad_norm": 2.339438659754107, "kl": 0.373046875, "learning_rate": 7.386234357224118e-07, "loss": 0.0004, "reward": 1.69921875, "reward_std": 0.0837935023009777, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 119.76562881469727, "epoch": 3.150170648464164, "grad_norm": 0.7804080291782096, "kl": 0.3759765625, "learning_rate": 7.383390216154721e-07, "loss": 0.0004, "reward": 1.6692708730697632, "reward_std": 0.04252413799986243, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6692708134651184, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 118.44010543823242, "epoch": 3.15358361774744, "grad_norm": 1.041558606918766, "kl": 0.3583984375, "learning_rate": 7.380546075085325e-07, "loss": 0.0004, "reward": 1.5885416269302368, "reward_std": 0.032594538293778896, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5885416716337204, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 116.91927337646484, "epoch": 3.156996587030717, "grad_norm": 0.8193192855808231, "kl": 0.3447265625, "learning_rate": 7.377701934015926e-07, "loss": 0.0003, "reward": 1.705078125, "reward_std": 0.034720129799097776, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 120.58854293823242, "epoch": 3.160409556313993, "grad_norm": 1.028125183276714, "kl": 0.3408203125, "learning_rate": 7.374857792946529e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.0781654566526413, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 115.58854293823242, "epoch": 3.1638225255972694, "grad_norm": 6.027028908318045, "kl": 0.365234375, "learning_rate": 7.372013651877133e-07, "loss": 0.0004, "reward": 1.8287760615348816, "reward_std": 0.03246205672621727, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8287760317325592, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 116.24479675292969, "epoch": 3.167235494880546, "grad_norm": 0.672105748693181, "kl": 0.3466796875, "learning_rate": 7.369169510807736e-07, "loss": 0.0003, "reward": 1.7252604365348816, "reward_std": 0.04962274618446827, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604067325592, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 115.47135543823242, "epoch": 3.1706484641638224, "grad_norm": 1.084345937703969, "kl": 0.36328125, "learning_rate": 7.366325369738339e-07, "loss": 0.0004, "reward": 1.6985677480697632, "reward_std": 0.05176788009703159, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6985676884651184, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 115.23698043823242, "epoch": 3.174061433447099, "grad_norm": 0.6714749368748198, "kl": 0.322265625, "learning_rate": 7.363481228668942e-07, "loss": 0.0003, "reward": 1.7845052480697632, "reward_std": 0.05070466175675392, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.787109375, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 116.67448425292969, "epoch": 3.1774744027303754, "grad_norm": 0.4029656174042569, "kl": 0.3505859375, "learning_rate": 7.360637087599545e-07, "loss": 0.0003, "reward": 1.7057291269302368, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7057291865348816, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 115.5703125, "epoch": 3.1808873720136517, "grad_norm": 1.1690272944410935, "kl": 0.361328125, "learning_rate": 7.357792946530148e-07, "loss": 0.0004, "reward": 1.6549479365348816, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6549479067325592, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 114.39844131469727, "epoch": 3.1843003412969284, "grad_norm": 1.7358605565277727, "kl": 0.3525390625, "learning_rate": 7.35494880546075e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.03296751994639635, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 114.98698043823242, "epoch": 3.1877133105802047, "grad_norm": 0.9477039021389219, "kl": 0.33203125, "learning_rate": 7.352104664391354e-07, "loss": 0.0003, "reward": 1.681640625, "reward_std": 0.055413288064301014, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6842447817325592, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 113.74739837646484, "epoch": 3.1911262798634814, "grad_norm": 0.793890613485673, "kl": 0.36328125, "learning_rate": 7.349260523321956e-07, "loss": 0.0004, "reward": 1.708984375, "reward_std": 0.052602482959628105, "rewards/format_reward": 1.0, "rewards/score_reward": 0.708984375, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 115.76041793823242, "epoch": 3.1945392491467577, "grad_norm": 0.825792830869866, "kl": 0.333984375, "learning_rate": 7.346416382252559e-07, "loss": 0.0003, "reward": 1.7213541865348816, "reward_std": 0.028058198746293783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 114.82812881469727, "epoch": 3.197952218430034, "grad_norm": 0.5251641078701894, "kl": 0.3515625, "learning_rate": 7.343572241183163e-07, "loss": 0.0004, "reward": 1.7825520634651184, "reward_std": 0.024242624640464783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520932674408, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 113.65625381469727, "epoch": 3.2013651877133107, "grad_norm": 1.0557444393143869, "kl": 0.345703125, "learning_rate": 7.340728100113765e-07, "loss": 0.0003, "reward": 1.6888020634651184, "reward_std": 0.0327466344460845, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 115.109375, "epoch": 3.204778156996587, "grad_norm": 3.2051728119643905, "kl": 0.326171875, "learning_rate": 7.337883959044369e-07, "loss": 0.0003, "reward": 1.671875, "reward_std": 0.053659205324947834, "rewards/format_reward": 1.0, "rewards/score_reward": 0.671875, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 114.375, "epoch": 3.2081911262798632, "grad_norm": 1.090043410120884, "kl": 0.337890625, "learning_rate": 7.335039817974972e-07, "loss": 0.0003, "reward": 1.7057291865348816, "reward_std": 0.06706801801919937, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7083333134651184, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 115.83073043823242, "epoch": 3.21160409556314, "grad_norm": 0.45345279009745687, "kl": 0.3916015625, "learning_rate": 7.332195676905573e-07, "loss": 0.0004, "reward": 1.6829426884651184, "reward_std": 0.026216774247586727, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6829427182674408, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 117.81510925292969, "epoch": 3.2150170648464163, "grad_norm": 0.5764089301295467, "kl": 0.3359375, "learning_rate": 7.329351535836177e-07, "loss": 0.0003, "reward": 1.7141926884651184, "reward_std": 0.023520144633948803, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7141927182674408, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 118.51302337646484, "epoch": 3.218430034129693, "grad_norm": 0.8220381491176103, "kl": 0.3505859375, "learning_rate": 7.32650739476678e-07, "loss": 0.0004, "reward": 1.6614583134651184, "reward_std": 0.03500552847981453, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6614583432674408, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 118.04167175292969, "epoch": 3.2218430034129693, "grad_norm": 0.7943264552352445, "kl": 0.3603515625, "learning_rate": 7.323663253697383e-07, "loss": 0.0004, "reward": 1.6263020634651184, "reward_std": 0.05884817987680435, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6263020932674408, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 120.72396087646484, "epoch": 3.2252559726962455, "grad_norm": 0.856339198336881, "kl": 0.33984375, "learning_rate": 7.320819112627986e-07, "loss": 0.0003, "reward": 1.7083333134651184, "reward_std": 0.07374972850084305, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7083333432674408, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 121.34896087646484, "epoch": 3.2286689419795223, "grad_norm": 0.6332924519246774, "kl": 0.357421875, "learning_rate": 7.317974971558589e-07, "loss": 0.0004, "reward": 1.6946614384651184, "reward_std": 0.027488631196320057, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 119.02344131469727, "epoch": 3.2320819112627985, "grad_norm": 0.45894440263396524, "kl": 0.345703125, "learning_rate": 7.315130830489193e-07, "loss": 0.0003, "reward": 1.6809895634651184, "reward_std": 0.022184427361935377, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6809895932674408, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 119.16666793823242, "epoch": 3.2354948805460753, "grad_norm": 0.39277253350078156, "kl": 0.3525390625, "learning_rate": 7.312286689419795e-07, "loss": 0.0004, "reward": 1.634765625, "reward_std": 0.016306546051055193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.634765625, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 121.02604293823242, "epoch": 3.2389078498293515, "grad_norm": 0.9581855298787734, "kl": 0.3310546875, "learning_rate": 7.309442548350398e-07, "loss": 0.0003, "reward": 1.7610677480697632, "reward_std": 0.060455333441495895, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610676884651184, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 118.69010543823242, "epoch": 3.242320819112628, "grad_norm": 0.593868030050322, "kl": 0.3310546875, "learning_rate": 7.306598407281001e-07, "loss": 0.0003, "reward": 1.7747396230697632, "reward_std": 0.033100247383117676, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7747395634651184, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 117.62760925292969, "epoch": 3.2457337883959045, "grad_norm": 1.4957059091992615, "kl": 0.345703125, "learning_rate": 7.303754266211603e-07, "loss": 0.0003, "reward": 1.669921875, "reward_std": 0.08121275715529919, "rewards/format_reward": 1.0, "rewards/score_reward": 0.669921875, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 118.87760543823242, "epoch": 3.249146757679181, "grad_norm": 3.1264328562792714, "kl": 0.365234375, "learning_rate": 7.300910125142207e-07, "loss": 0.0004, "reward": 1.634765625, "reward_std": 0.05757672339677811, "rewards/format_reward": 1.0, "rewards/score_reward": 0.634765625, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 118.40625381469727, "epoch": 3.252559726962457, "grad_norm": 3.460482605222328, "kl": 0.5498046875, "learning_rate": 7.29806598407281e-07, "loss": 0.0005, "reward": 1.7408854365348816, "reward_std": 0.03798632696270943, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7434895932674408, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 118.5, "epoch": 3.255972696245734, "grad_norm": 0.7425626638875436, "kl": 0.3505859375, "learning_rate": 7.295221843003413e-07, "loss": 0.0004, "reward": 1.728515625, "reward_std": 0.07378214225172997, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7311197817325592, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 122.69531631469727, "epoch": 3.25938566552901, "grad_norm": 0.89223857056451, "kl": 0.34375, "learning_rate": 7.292377701934016e-07, "loss": 0.0003, "reward": 1.7102864980697632, "reward_std": 0.04776276834309101, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.712890625, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 118.01562881469727, "epoch": 3.262798634812287, "grad_norm": 0.39082799598801704, "kl": 0.3544921875, "learning_rate": 7.289533560864618e-07, "loss": 0.0004, "reward": 1.8098958134651184, "reward_std": 0.020843947771936655, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958432674408, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 120.86719131469727, "epoch": 3.266211604095563, "grad_norm": 1.0528837650173786, "kl": 0.3486328125, "learning_rate": 7.286689419795221e-07, "loss": 0.0003, "reward": 1.708984375, "reward_std": 0.0659501776099205, "rewards/format_reward": 1.0, "rewards/score_reward": 0.708984375, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 119.40104293823242, "epoch": 3.26962457337884, "grad_norm": 1.4371144865647079, "kl": 0.3583984375, "learning_rate": 7.283845278725824e-07, "loss": 0.0004, "reward": 1.76171875, "reward_std": 0.06676594540476799, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 118.71875381469727, "epoch": 3.273037542662116, "grad_norm": 0.4572291602492037, "kl": 0.38671875, "learning_rate": 7.281001137656428e-07, "loss": 0.0004, "reward": 1.7083333134651184, "reward_std": 0.020692503079771996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7083333432674408, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 117.6640625, "epoch": 3.2764505119453924, "grad_norm": 0.9571746271429995, "kl": 0.3642578125, "learning_rate": 7.27815699658703e-07, "loss": 0.0004, "reward": 1.7447916269302368, "reward_std": 0.05740615352988243, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916865348816, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 118.16146087646484, "epoch": 3.279863481228669, "grad_norm": 0.574742716398012, "kl": 0.3515625, "learning_rate": 7.275312855517633e-07, "loss": 0.0004, "reward": 1.640625, "reward_std": 0.03529116790741682, "rewards/format_reward": 1.0, "rewards/score_reward": 0.640625, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 119.28385925292969, "epoch": 3.2832764505119454, "grad_norm": 0.7704612596731175, "kl": 0.3427734375, "learning_rate": 7.272468714448237e-07, "loss": 0.0003, "reward": 1.7923177480697632, "reward_std": 0.0612036008387804, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923176884651184, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 117.85677337646484, "epoch": 3.2866894197952217, "grad_norm": 0.6050361418305508, "kl": 0.357421875, "learning_rate": 7.269624573378839e-07, "loss": 0.0004, "reward": 1.6907552480697632, "reward_std": 0.04460352845489979, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907551884651184, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 119.94791793823242, "epoch": 3.2901023890784984, "grad_norm": 1.1694878595091576, "kl": 0.3759765625, "learning_rate": 7.266780432309442e-07, "loss": 0.0004, "reward": 1.7194010019302368, "reward_std": 0.06606284156441689, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7194010615348816, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 118.98698425292969, "epoch": 3.2935153583617747, "grad_norm": 0.8730057509733369, "kl": 0.33984375, "learning_rate": 7.263936291240045e-07, "loss": 0.0003, "reward": 1.6725260615348816, "reward_std": 0.057252995669841766, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6725260317325592, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 121.20052337646484, "epoch": 3.296928327645051, "grad_norm": 0.23311605568607416, "kl": 0.3232421875, "learning_rate": 7.261092150170647e-07, "loss": 0.0003, "reward": 1.7858073115348816, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7858072817325592, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 119.16146087646484, "epoch": 3.3003412969283277, "grad_norm": 1.803592166699046, "kl": 0.3349609375, "learning_rate": 7.258248009101251e-07, "loss": 0.0003, "reward": 1.6868489384651184, "reward_std": 0.08085688948631287, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6868489682674408, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 119.83333587646484, "epoch": 3.303754266211604, "grad_norm": 0.6457359140548917, "kl": 0.32421875, "learning_rate": 7.255403868031854e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.04493512772023678, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 122.17448425292969, "epoch": 3.3071672354948807, "grad_norm": 0.5900119885212093, "kl": 0.337890625, "learning_rate": 7.252559726962458e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.04997717821970582, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7604166567325592, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 118.65104675292969, "epoch": 3.310580204778157, "grad_norm": 1.7853391669813807, "kl": 0.3330078125, "learning_rate": 7.24971558589306e-07, "loss": 0.0003, "reward": 1.7486979365348816, "reward_std": 0.01361097814515233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7486979067325592, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 116.68229675292969, "epoch": 3.3139931740614337, "grad_norm": 1.098944162994754, "kl": 0.3427734375, "learning_rate": 7.246871444823664e-07, "loss": 0.0003, "reward": 1.6888020634651184, "reward_std": 0.06588350236415863, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 118.17708587646484, "epoch": 3.31740614334471, "grad_norm": 3.9104977878694562, "kl": 0.337890625, "learning_rate": 7.244027303754266e-07, "loss": 0.0003, "reward": 1.6569010615348816, "reward_std": 0.06095603480935097, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6569010317325592, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 119.80729293823242, "epoch": 3.3208191126279862, "grad_norm": 1.3919593447744503, "kl": 0.34375, "learning_rate": 7.241183162684868e-07, "loss": 0.0003, "reward": 1.7220052480697632, "reward_std": 0.05146417021751404, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.7272135317325592, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 120.23698043823242, "epoch": 3.324232081911263, "grad_norm": 1.5257896892935368, "kl": 0.341796875, "learning_rate": 7.238339021615472e-07, "loss": 0.0003, "reward": 1.7213541865348816, "reward_std": 0.051616025157272816, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 122.17448043823242, "epoch": 3.3276450511945392, "grad_norm": 1.4028100001379364, "kl": 0.3291015625, "learning_rate": 7.235494880546075e-07, "loss": 0.0003, "reward": 1.69140625, "reward_std": 0.08048799261450768, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6940104067325592, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 118.20573043823242, "epoch": 3.3310580204778155, "grad_norm": 4.232426735172062, "kl": 0.34765625, "learning_rate": 7.232650739476678e-07, "loss": 0.0003, "reward": 1.7180989980697632, "reward_std": 0.03715049289166927, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7180989384651184, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 123.79948043823242, "epoch": 3.3344709897610922, "grad_norm": 2.761851939542113, "kl": 0.330078125, "learning_rate": 7.229806598407281e-07, "loss": 0.0003, "reward": 1.7858072519302368, "reward_std": 0.05074356868863106, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7884114682674408, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 121.80208587646484, "epoch": 3.3378839590443685, "grad_norm": 0.9092754703821664, "kl": 0.3310546875, "learning_rate": 7.226962457337884e-07, "loss": 0.0003, "reward": 1.732421875, "reward_std": 0.06018906459212303, "rewards/format_reward": 1.0, "rewards/score_reward": 0.732421875, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 120.7578125, "epoch": 3.3412969283276452, "grad_norm": 3.0562641055384323, "kl": 0.330078125, "learning_rate": 7.224118316268487e-07, "loss": 0.0003, "reward": 1.70703125, "reward_std": 0.06976933404803276, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 121.65885543823242, "epoch": 3.3447098976109215, "grad_norm": 1.0333781829714925, "kl": 0.30859375, "learning_rate": 7.221274175199089e-07, "loss": 0.0003, "reward": 1.6861979365348816, "reward_std": 0.029330057092010975, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6861979067325592, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 123.59375, "epoch": 3.348122866894198, "grad_norm": 0.7836256846172727, "kl": 0.3427734375, "learning_rate": 7.218430034129693e-07, "loss": 0.0003, "reward": 1.62890625, "reward_std": 0.052147090435028076, "rewards/format_reward": 1.0, "rewards/score_reward": 0.62890625, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 121.15104293823242, "epoch": 3.3515358361774745, "grad_norm": 0.8931579141240249, "kl": 0.3291015625, "learning_rate": 7.215585893060295e-07, "loss": 0.0003, "reward": 1.7786458134651184, "reward_std": 0.04769809078425169, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7786458432674408, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 121.39323043823242, "epoch": 3.354948805460751, "grad_norm": 0.7492571214074945, "kl": 0.3271484375, "learning_rate": 7.212741751990898e-07, "loss": 0.0003, "reward": 1.7415364384651184, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7415364682674408, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 124.66666793823242, "epoch": 3.3583617747440275, "grad_norm": 1.2159332185542417, "kl": 0.3212890625, "learning_rate": 7.209897610921502e-07, "loss": 0.0003, "reward": 1.7591145634651184, "reward_std": 0.08384193293750286, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145932674408, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 122.78125381469727, "epoch": 3.361774744027304, "grad_norm": 1.219470034874086, "kl": 0.3359375, "learning_rate": 7.207053469852104e-07, "loss": 0.0003, "reward": 1.7805989384651184, "reward_std": 0.07419838756322861, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7805989682674408, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 123.64062881469727, "epoch": 3.36518771331058, "grad_norm": 0.8669158781297907, "kl": 0.33984375, "learning_rate": 7.204209328782708e-07, "loss": 0.0003, "reward": 1.7259114384651184, "reward_std": 0.08410926908254623, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.728515625, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 123.11719131469727, "epoch": 3.368600682593857, "grad_norm": 1.7883837657217299, "kl": 0.3408203125, "learning_rate": 7.201365187713311e-07, "loss": 0.0003, "reward": 1.7083333730697632, "reward_std": 0.062208764255046844, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7109375, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 124.34635543823242, "epoch": 3.372013651877133, "grad_norm": 0.6499831830591903, "kl": 0.3310546875, "learning_rate": 7.198521046643912e-07, "loss": 0.0003, "reward": 1.7916666865348816, "reward_std": 0.03033646149560809, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 125.40104675292969, "epoch": 3.3754266211604094, "grad_norm": 1.7867355152900493, "kl": 0.33203125, "learning_rate": 7.195676905574516e-07, "loss": 0.0003, "reward": 1.716796875, "reward_std": 0.09357511252164841, "rewards/format_reward": 1.0, "rewards/score_reward": 0.716796875, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 122.42708587646484, "epoch": 3.378839590443686, "grad_norm": 0.680527847126298, "kl": 0.328125, "learning_rate": 7.192832764505119e-07, "loss": 0.0003, "reward": 1.8795573115348816, "reward_std": 0.04054917115718126, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8795572817325592, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 125.29687881469727, "epoch": 3.3822525597269624, "grad_norm": 0.9485057561342398, "kl": 0.3525390625, "learning_rate": 7.189988623435723e-07, "loss": 0.0004, "reward": 1.6614583730697632, "reward_std": 0.08580467849969864, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6614583134651184, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 121.42708587646484, "epoch": 3.385665529010239, "grad_norm": 1.0327006721250076, "kl": 0.361328125, "learning_rate": 7.187144482366325e-07, "loss": 0.0004, "reward": 1.7057291269302368, "reward_std": 0.08250227756798267, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7083333432674408, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 125.00521087646484, "epoch": 3.3890784982935154, "grad_norm": 1.6010782032332158, "kl": 0.33984375, "learning_rate": 7.184300341296928e-07, "loss": 0.0003, "reward": 1.70703125, "reward_std": 0.024242624640464783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 122.93229293823242, "epoch": 3.3924914675767917, "grad_norm": 0.7618034397106225, "kl": 0.345703125, "learning_rate": 7.181456200227532e-07, "loss": 0.0003, "reward": 1.78125, "reward_std": 0.05343913845717907, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 122.19010543823242, "epoch": 3.3959044368600684, "grad_norm": 0.673281642940669, "kl": 0.3369140625, "learning_rate": 7.178612059158133e-07, "loss": 0.0003, "reward": 1.673828125, "reward_std": 0.04846530593931675, "rewards/format_reward": 1.0, "rewards/score_reward": 0.673828125, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 125.12760925292969, "epoch": 3.3993174061433447, "grad_norm": 0.7100599047846949, "kl": 0.3271484375, "learning_rate": 7.175767918088737e-07, "loss": 0.0003, "reward": 1.6940104365348816, "reward_std": 0.06476425565779209, "rewards/format_reward": 0.9921875, "rewards/score_reward": 0.7018229067325592, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 122.23958587646484, "epoch": 3.4027303754266214, "grad_norm": 0.8958846412750069, "kl": 0.337890625, "learning_rate": 7.17292377701934e-07, "loss": 0.0003, "reward": 1.7467448115348816, "reward_std": 0.0581256989389658, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467447817325592, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 122.35937881469727, "epoch": 3.4061433447098977, "grad_norm": 0.6618691950978373, "kl": 0.34375, "learning_rate": 7.170079635949942e-07, "loss": 0.0003, "reward": 1.736328125, "reward_std": 0.06076698936522007, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.7415364384651184, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 121.69791793823242, "epoch": 3.409556313993174, "grad_norm": 0.7755360729328777, "kl": 0.3310546875, "learning_rate": 7.167235494880546e-07, "loss": 0.0003, "reward": 1.7311198115348816, "reward_std": 0.045856669545173645, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 122.59896087646484, "epoch": 3.4129692832764507, "grad_norm": 3.96899774188533, "kl": 0.3369140625, "learning_rate": 7.164391353811149e-07, "loss": 0.0003, "reward": 1.7454426884651184, "reward_std": 0.07625576481223106, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 121.67708587646484, "epoch": 3.416382252559727, "grad_norm": 1.0664119147769948, "kl": 0.34375, "learning_rate": 7.161547212741752e-07, "loss": 0.0003, "reward": 1.69140625, "reward_std": 0.0599506888538599, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69140625, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 123.78385543823242, "epoch": 3.419795221843003, "grad_norm": 1.0706528153773698, "kl": 0.349609375, "learning_rate": 7.158703071672355e-07, "loss": 0.0003, "reward": 1.6888020634651184, "reward_std": 0.06100422702729702, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 122.66667175292969, "epoch": 3.42320819112628, "grad_norm": 0.7576437324556144, "kl": 0.357421875, "learning_rate": 7.155858930602957e-07, "loss": 0.0004, "reward": 1.7213541865348816, "reward_std": 0.059968589805066586, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 121.99739837646484, "epoch": 3.426621160409556, "grad_norm": 1.0746359613001444, "kl": 0.322265625, "learning_rate": 7.15301478953356e-07, "loss": 0.0003, "reward": 1.6653646230697632, "reward_std": 0.07185337878763676, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6653645634651184, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 122.54687881469727, "epoch": 3.430034129692833, "grad_norm": 0.6609119704821947, "kl": 0.3349609375, "learning_rate": 7.150170648464163e-07, "loss": 0.0003, "reward": 1.7571614384651184, "reward_std": 0.03597327135503292, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.759765625, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 123.04167175292969, "epoch": 3.4334470989761092, "grad_norm": 1.0379833325620769, "kl": 0.3427734375, "learning_rate": 7.147326507394767e-07, "loss": 0.0003, "reward": 1.7180989980697632, "reward_std": 0.05414077825844288, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7180989384651184, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 125.07552337646484, "epoch": 3.4368600682593855, "grad_norm": 0.6942112418508124, "kl": 0.337890625, "learning_rate": 7.144482366325369e-07, "loss": 0.0003, "reward": 1.7037760615348816, "reward_std": 0.05854504369199276, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7037760317325592, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 122.74219131469727, "epoch": 3.4402730375426622, "grad_norm": 13.191194644705453, "kl": 0.33203125, "learning_rate": 7.141638225255972e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.08881682902574539, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6875, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 124.09114837646484, "epoch": 3.4436860068259385, "grad_norm": 1.0935050222091915, "kl": 0.337890625, "learning_rate": 7.138794084186576e-07, "loss": 0.0003, "reward": 1.7200520634651184, "reward_std": 0.08138250932097435, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520932674408, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 125.71094131469727, "epoch": 3.4470989761092152, "grad_norm": 0.5436634007931264, "kl": 0.3408203125, "learning_rate": 7.135949943117178e-07, "loss": 0.0003, "reward": 1.7682291865348816, "reward_std": 0.03289889823645353, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 125.2734375, "epoch": 3.4505119453924915, "grad_norm": 1.3970278513344536, "kl": 0.35546875, "learning_rate": 7.133105802047781e-07, "loss": 0.0004, "reward": 1.7897135019302368, "reward_std": 0.06040542759001255, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135615348816, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 126.18229293823242, "epoch": 3.453924914675768, "grad_norm": 0.8701528095407937, "kl": 0.328125, "learning_rate": 7.130261660978384e-07, "loss": 0.0003, "reward": 1.732421875, "reward_std": 0.07967234938405454, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7350260317325592, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 127.58854675292969, "epoch": 3.4573378839590445, "grad_norm": 1.533314962908222, "kl": 0.3369140625, "learning_rate": 7.127417519908986e-07, "loss": 0.0003, "reward": 1.7141926884651184, "reward_std": 0.06313259154558182, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.716796875, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 129.20834350585938, "epoch": 3.460750853242321, "grad_norm": 0.7990723351427761, "kl": 0.3330078125, "learning_rate": 7.12457337883959e-07, "loss": 0.0003, "reward": 1.7194010615348816, "reward_std": 0.04563660081475973, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7220052182674408, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 129.45052337646484, "epoch": 3.464163822525597, "grad_norm": 0.9720909910230587, "kl": 0.33203125, "learning_rate": 7.121729237770193e-07, "loss": 0.0003, "reward": 1.701171875, "reward_std": 0.048331763595342636, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 131.46615600585938, "epoch": 3.467576791808874, "grad_norm": 1.029305766223479, "kl": 0.3310546875, "learning_rate": 7.118885096700797e-07, "loss": 0.0003, "reward": 1.7513020634651184, "reward_std": 0.05698926001787186, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7513020932674408, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 127.65364837646484, "epoch": 3.47098976109215, "grad_norm": 3.1721420593272946, "kl": 0.3525390625, "learning_rate": 7.116040955631399e-07, "loss": 0.0004, "reward": 1.716796875, "reward_std": 0.04778213985264301, "rewards/format_reward": 1.0, "rewards/score_reward": 0.716796875, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 128.10937881469727, "epoch": 3.474402730375427, "grad_norm": 0.8687705236544616, "kl": 0.4375, "learning_rate": 7.113196814562003e-07, "loss": 0.0004, "reward": 1.7180989384651184, "reward_std": 0.05775870569050312, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7180989682674408, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 128.45052337646484, "epoch": 3.477815699658703, "grad_norm": 0.7196462745814607, "kl": 0.3369140625, "learning_rate": 7.110352673492605e-07, "loss": 0.0003, "reward": 1.7376301884651184, "reward_std": 0.053267098031938076, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.740234375, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 128.46875381469727, "epoch": 3.4812286689419794, "grad_norm": 0.7619007515146601, "kl": 0.3271484375, "learning_rate": 7.107508532423207e-07, "loss": 0.0003, "reward": 1.7337239980697632, "reward_std": 0.032023504842072725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239384651184, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 129.95052337646484, "epoch": 3.484641638225256, "grad_norm": 1.0500741664357298, "kl": 0.3408203125, "learning_rate": 7.104664391353811e-07, "loss": 0.0003, "reward": 1.6848958134651184, "reward_std": 0.04339536791667342, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6848958432674408, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 129.70833587646484, "epoch": 3.4880546075085324, "grad_norm": 1.8369113457122257, "kl": 0.3349609375, "learning_rate": 7.101820250284414e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.053962914273142815, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 127.234375, "epoch": 3.491467576791809, "grad_norm": 0.9164385036915926, "kl": 0.3427734375, "learning_rate": 7.098976109215017e-07, "loss": 0.0003, "reward": 1.7330729365348816, "reward_std": 0.06593422219157219, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729067325592, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 129.51302337646484, "epoch": 3.4948805460750854, "grad_norm": 0.9058569611860127, "kl": 0.34765625, "learning_rate": 7.09613196814562e-07, "loss": 0.0003, "reward": 1.685546875, "reward_std": 0.06904380023479462, "rewards/format_reward": 1.0, "rewards/score_reward": 0.685546875, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 129.09896087646484, "epoch": 3.4982935153583616, "grad_norm": 0.7661747473370246, "kl": 0.33984375, "learning_rate": 7.093287827076223e-07, "loss": 0.0003, "reward": 1.59375, "reward_std": 0.05345703661441803, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5937500149011612, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 129.3854217529297, "epoch": 3.5017064846416384, "grad_norm": 1.2125065890588795, "kl": 0.3466796875, "learning_rate": 7.090443686006825e-07, "loss": 0.0003, "reward": 1.716796875, "reward_std": 0.056507036089897156, "rewards/format_reward": 1.0, "rewards/score_reward": 0.716796875, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 127.7265625, "epoch": 3.5051194539249146, "grad_norm": 0.4888727127131887, "kl": 0.3271484375, "learning_rate": 7.087599544937428e-07, "loss": 0.0003, "reward": 1.6868489980697632, "reward_std": 0.027374626137316227, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6868489384651184, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 127.09114837646484, "epoch": 3.508532423208191, "grad_norm": 0.5511340526590798, "kl": 0.33203125, "learning_rate": 7.084755403868032e-07, "loss": 0.0003, "reward": 1.7395833134651184, "reward_std": 0.06330316234380007, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7421875, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 127.69010543823242, "epoch": 3.5119453924914676, "grad_norm": 0.5579340726866808, "kl": 0.326171875, "learning_rate": 7.081911262798634e-07, "loss": 0.0003, "reward": 1.720703125, "reward_std": 0.04587620496749878, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720703125, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 125.36198043823242, "epoch": 3.515358361774744, "grad_norm": 1.0344107213905742, "kl": 0.34765625, "learning_rate": 7.079067121729237e-07, "loss": 0.0003, "reward": 1.6842447519302368, "reward_std": 0.035795814357697964, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6842448115348816, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 129.9817771911621, "epoch": 3.51877133105802, "grad_norm": 1.0319702120129308, "kl": 0.3271484375, "learning_rate": 7.076222980659841e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.046206166967749596, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 127.33854675292969, "epoch": 3.522184300341297, "grad_norm": 1.455634004224518, "kl": 0.3701171875, "learning_rate": 7.073378839590443e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.03336398396641016, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998697817325592, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 126.48177337646484, "epoch": 3.5255972696245736, "grad_norm": 0.9069818564867285, "kl": 0.341796875, "learning_rate": 7.070534698521047e-07, "loss": 0.0003, "reward": 1.7682291865348816, "reward_std": 0.04359835013747215, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 125.58073425292969, "epoch": 3.52901023890785, "grad_norm": 0.93372374647031, "kl": 0.3408203125, "learning_rate": 7.067690557451649e-07, "loss": 0.0003, "reward": 1.7454426884651184, "reward_std": 0.06733632739633322, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 127.01823043823242, "epoch": 3.532423208191126, "grad_norm": 0.5924589904424769, "kl": 0.322265625, "learning_rate": 7.064846416382251e-07, "loss": 0.0003, "reward": 1.6575521230697632, "reward_std": 0.031694845063611865, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.66015625, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 126.91146087646484, "epoch": 3.535836177474403, "grad_norm": 1.4348488450024035, "kl": 0.330078125, "learning_rate": 7.062002275312855e-07, "loss": 0.0003, "reward": 1.6751302480697632, "reward_std": 0.04821857437491417, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6751301884651184, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 125.48698425292969, "epoch": 3.539249146757679, "grad_norm": 0.8921735569209432, "kl": 0.3349609375, "learning_rate": 7.059158134243458e-07, "loss": 0.0003, "reward": 1.6790364980697632, "reward_std": 0.07681037671864033, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364384651184, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 127.53385543823242, "epoch": 3.5426621160409555, "grad_norm": 0.9098134955368508, "kl": 0.326171875, "learning_rate": 7.056313993174062e-07, "loss": 0.0003, "reward": 1.6516926884651184, "reward_std": 0.07175931334495544, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6516927182674408, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 131.55469512939453, "epoch": 3.546075085324232, "grad_norm": 1.090916105216606, "kl": 0.33203125, "learning_rate": 7.053469852104664e-07, "loss": 0.0003, "reward": 1.6354166269302368, "reward_std": 0.044194171205163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6354166865348816, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 127.84115219116211, "epoch": 3.5494880546075085, "grad_norm": 0.5670630435972603, "kl": 0.34375, "learning_rate": 7.050625711035267e-07, "loss": 0.0003, "reward": 1.7298176884651184, "reward_std": 0.027355906553566456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298177182674408, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 130.1875, "epoch": 3.5529010238907848, "grad_norm": 1.2053639713687245, "kl": 0.3310546875, "learning_rate": 7.047781569965871e-07, "loss": 0.0003, "reward": 1.7115885019302368, "reward_std": 0.07743299007415771, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885615348816, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 129.1041717529297, "epoch": 3.5563139931740615, "grad_norm": 0.9098637935957485, "kl": 0.349609375, "learning_rate": 7.044937428896472e-07, "loss": 0.0003, "reward": 1.7819010615348816, "reward_std": 0.08677176386117935, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7819010317325592, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 126.51302719116211, "epoch": 3.5597269624573378, "grad_norm": 2.3824291321079873, "kl": 0.326171875, "learning_rate": 7.042093287827076e-07, "loss": 0.0003, "reward": 1.7220051884651184, "reward_std": 0.051617249846458435, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 124.63281631469727, "epoch": 3.5631399317406145, "grad_norm": 0.40753825181142866, "kl": 0.322265625, "learning_rate": 7.039249146757679e-07, "loss": 0.0003, "reward": 1.7955728769302368, "reward_std": 0.021964360028505325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7955729365348816, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 125.26823043823242, "epoch": 3.5665529010238908, "grad_norm": 0.9985796754941076, "kl": 0.3251953125, "learning_rate": 7.036405005688281e-07, "loss": 0.0003, "reward": 1.6901041865348816, "reward_std": 0.06763962097465992, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6901041567325592, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 126.31250381469727, "epoch": 3.5699658703071675, "grad_norm": 0.7898843399081867, "kl": 0.3310546875, "learning_rate": 7.033560864618885e-07, "loss": 0.0003, "reward": 1.7454426884651184, "reward_std": 0.05528055690228939, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 125.10677337646484, "epoch": 3.573378839590444, "grad_norm": 1.5040689106516172, "kl": 0.33203125, "learning_rate": 7.030716723549488e-07, "loss": 0.0003, "reward": 1.75390625, "reward_std": 0.06615124084055424, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75390625, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 124.65104293823242, "epoch": 3.57679180887372, "grad_norm": 1.504488235993053, "kl": 0.3330078125, "learning_rate": 7.027872582480091e-07, "loss": 0.0003, "reward": 1.65625, "reward_std": 0.05729438364505768, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65625, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 128.6588592529297, "epoch": 3.580204778156997, "grad_norm": 2.7093675254258516, "kl": 0.337890625, "learning_rate": 7.025028441410694e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.08861401304602623, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 124.26041793823242, "epoch": 3.583617747440273, "grad_norm": 0.6001294252080503, "kl": 0.55078125, "learning_rate": 7.022184300341296e-07, "loss": 0.0005, "reward": 1.6790364384651184, "reward_std": 0.034720784053206444, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364682674408, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 128.84375762939453, "epoch": 3.5870307167235493, "grad_norm": 0.2550334862454155, "kl": 0.3251953125, "learning_rate": 7.019340159271899e-07, "loss": 0.0003, "reward": 1.72265625, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 127.97916793823242, "epoch": 3.590443686006826, "grad_norm": 2.0836898624839613, "kl": 0.33203125, "learning_rate": 7.016496018202502e-07, "loss": 0.0003, "reward": 1.751953125, "reward_std": 0.040443526580929756, "rewards/format_reward": 1.0, "rewards/score_reward": 0.751953125, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 128.52083587646484, "epoch": 3.5938566552901023, "grad_norm": 1.1247132987012645, "kl": 0.34765625, "learning_rate": 7.013651877133106e-07, "loss": 0.0003, "reward": 1.7063801884651184, "reward_std": 0.0480671264231205, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7063802182674408, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 130.82553100585938, "epoch": 3.5972696245733786, "grad_norm": 1.2706042033589429, "kl": 0.333984375, "learning_rate": 7.010807736063708e-07, "loss": 0.0003, "reward": 1.7154948115348816, "reward_std": 0.06735423021018505, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7180989384651184, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 129.6510467529297, "epoch": 3.6006825938566553, "grad_norm": 1.0945647241539298, "kl": 0.3408203125, "learning_rate": 7.007963594994311e-07, "loss": 0.0003, "reward": 1.7962239384651184, "reward_std": 0.041821847669780254, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7962239682674408, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 131.7526092529297, "epoch": 3.6040955631399316, "grad_norm": 0.5647141802054626, "kl": 0.330078125, "learning_rate": 7.005119453924915e-07, "loss": 0.0003, "reward": 1.7721354365348816, "reward_std": 0.052584992721676826, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7747395634651184, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 131.2526092529297, "epoch": 3.6075085324232083, "grad_norm": 1.3269377285693789, "kl": 0.3291015625, "learning_rate": 7.002275312855518e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.04751464631408453, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 133.15625762939453, "epoch": 3.6109215017064846, "grad_norm": 0.9973061479357722, "kl": 0.33984375, "learning_rate": 6.99943117178612e-07, "loss": 0.0003, "reward": 1.7766926884651184, "reward_std": 0.07193753495812416, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7766927182674408, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 131.32032012939453, "epoch": 3.6143344709897613, "grad_norm": 1.0320408750178893, "kl": 0.3515625, "learning_rate": 6.996587030716723e-07, "loss": 0.0004, "reward": 1.7233073115348816, "reward_std": 0.06465825252234936, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7233072817325592, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 128.4791717529297, "epoch": 3.6177474402730376, "grad_norm": 1.1614233313009963, "kl": 0.337890625, "learning_rate": 6.993742889647326e-07, "loss": 0.0003, "reward": 1.6888020634651184, "reward_std": 0.043376813642680645, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 127.08073425292969, "epoch": 3.621160409556314, "grad_norm": 0.32081105494785844, "kl": 0.3408203125, "learning_rate": 6.990898748577929e-07, "loss": 0.0003, "reward": 1.7682291269302368, "reward_std": 0.01799587346613407, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291865348816, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 130.5572967529297, "epoch": 3.6245733788395906, "grad_norm": 0.7748501608151885, "kl": 0.3359375, "learning_rate": 6.988054607508532e-07, "loss": 0.0003, "reward": 1.6282552480697632, "reward_std": 0.09162428975105286, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.630859375, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 131.265625, "epoch": 3.627986348122867, "grad_norm": 0.8011665108534396, "kl": 0.341796875, "learning_rate": 6.985210466439136e-07, "loss": 0.0003, "reward": 1.7018228769302368, "reward_std": 0.06875840201973915, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7018229365348816, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 128.42187881469727, "epoch": 3.631399317406143, "grad_norm": 1.3622787186424694, "kl": 0.341796875, "learning_rate": 6.982366325369738e-07, "loss": 0.0003, "reward": 1.7350260615348816, "reward_std": 0.06782102771103382, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7376302182674408, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 128.04688262939453, "epoch": 3.63481228668942, "grad_norm": 0.6704444100086602, "kl": 0.3466796875, "learning_rate": 6.97952218430034e-07, "loss": 0.0003, "reward": 1.6953125, "reward_std": 0.03997960314154625, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6979166567325592, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 128.98958587646484, "epoch": 3.638225255972696, "grad_norm": 1.1110397052656344, "kl": 0.3466796875, "learning_rate": 6.976678043230944e-07, "loss": 0.0003, "reward": 1.6627603769302368, "reward_std": 0.10775565728545189, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6627604365348816, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 127.83594131469727, "epoch": 3.6416382252559725, "grad_norm": 0.843817934680897, "kl": 0.3505859375, "learning_rate": 6.973833902161546e-07, "loss": 0.0004, "reward": 1.6927083134651184, "reward_std": 0.0654293317347765, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6953125, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 130.5729217529297, "epoch": 3.645051194539249, "grad_norm": 0.8156838665948895, "kl": 0.328125, "learning_rate": 6.97098976109215e-07, "loss": 0.0003, "reward": 1.7259114384651184, "reward_std": 0.04118695016950369, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7259114682674408, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 126.53125, "epoch": 3.6484641638225255, "grad_norm": 0.7221784915579125, "kl": 0.3349609375, "learning_rate": 6.968145620022753e-07, "loss": 0.0003, "reward": 1.7298176884651184, "reward_std": 0.0421731686219573, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298177182674408, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 127.83073043823242, "epoch": 3.651877133105802, "grad_norm": 0.8035433164821522, "kl": 0.357421875, "learning_rate": 6.965301478953356e-07, "loss": 0.0004, "reward": 1.5657551884651184, "reward_std": 0.041232336312532425, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.568359375, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 125.46354293823242, "epoch": 3.6552901023890785, "grad_norm": 0.9281849830489064, "kl": 0.359375, "learning_rate": 6.962457337883959e-07, "loss": 0.0004, "reward": 1.6673176884651184, "reward_std": 0.045636601746082306, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6673177182674408, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 124.57031631469727, "epoch": 3.658703071672355, "grad_norm": 0.5660839769738737, "kl": 0.3359375, "learning_rate": 6.959613196814562e-07, "loss": 0.0003, "reward": 1.8776041865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8776041567325592, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 127.11458587646484, "epoch": 3.6621160409556315, "grad_norm": 0.9320290415822183, "kl": 0.3564453125, "learning_rate": 6.956769055745164e-07, "loss": 0.0004, "reward": 1.7610676884651184, "reward_std": 0.05286875367164612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610677182674408, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 124.89323043823242, "epoch": 3.6655290102389078, "grad_norm": 0.9912678507204548, "kl": 0.3271484375, "learning_rate": 6.953924914675767e-07, "loss": 0.0003, "reward": 1.7591145634651184, "reward_std": 0.053830185905098915, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145932674408, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 127.26562881469727, "epoch": 3.6689419795221845, "grad_norm": 0.6978289497287556, "kl": 0.35546875, "learning_rate": 6.951080773606371e-07, "loss": 0.0004, "reward": 1.5729166865348816, "reward_std": 0.049608008936047554, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5729166567325592, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 122.92187881469727, "epoch": 3.6723549488054608, "grad_norm": 0.6584552018529503, "kl": 0.3369140625, "learning_rate": 6.948236632536973e-07, "loss": 0.0003, "reward": 1.728515625, "reward_std": 0.04068124387413263, "rewards/format_reward": 1.0, "rewards/score_reward": 0.728515625, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 126.02083587646484, "epoch": 3.675767918088737, "grad_norm": 1.2889811485629006, "kl": 0.337890625, "learning_rate": 6.945392491467576e-07, "loss": 0.0003, "reward": 1.7356771230697632, "reward_std": 0.08068418502807617, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.7408854365348816, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 123.6796875, "epoch": 3.6791808873720138, "grad_norm": 4.227105171810579, "kl": 0.3427734375, "learning_rate": 6.94254835039818e-07, "loss": 0.0003, "reward": 1.6438802480697632, "reward_std": 0.04710915870964527, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.6490885615348816, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 121.69531631469727, "epoch": 3.68259385665529, "grad_norm": 0.8851548735914758, "kl": 0.333984375, "learning_rate": 6.939704209328783e-07, "loss": 0.0003, "reward": 1.8177083730697632, "reward_std": 0.05484224855899811, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083134651184, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 123.48958587646484, "epoch": 3.6860068259385663, "grad_norm": 0.7614075049190189, "kl": 0.3349609375, "learning_rate": 6.936860068259386e-07, "loss": 0.0003, "reward": 1.703125, "reward_std": 0.052167280577123165, "rewards/format_reward": 1.0, "rewards/score_reward": 0.703125, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 123.84375, "epoch": 3.689419795221843, "grad_norm": 0.835931875877666, "kl": 0.3486328125, "learning_rate": 6.934015927189988e-07, "loss": 0.0003, "reward": 1.7272135019302368, "reward_std": 0.05856441520154476, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7272135615348816, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 121.83594131469727, "epoch": 3.6928327645051193, "grad_norm": 1.3321159143237944, "kl": 0.3486328125, "learning_rate": 6.931171786120591e-07, "loss": 0.0003, "reward": 1.7571614384651184, "reward_std": 0.05313290189951658, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7571614682674408, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 120.9609375, "epoch": 3.696245733788396, "grad_norm": 1.0495634048912148, "kl": 0.36328125, "learning_rate": 6.928327645051194e-07, "loss": 0.0004, "reward": 1.7819010615348816, "reward_std": 0.052583932876586914, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7819010317325592, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 121.76041793823242, "epoch": 3.6996587030716723, "grad_norm": 0.8593704018767362, "kl": 0.35546875, "learning_rate": 6.925483503981797e-07, "loss": 0.0004, "reward": 1.6295573115348816, "reward_std": 0.045768678188323975, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6295572817325592, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 120.79687881469727, "epoch": 3.703071672354949, "grad_norm": 0.651125921828569, "kl": 0.3603515625, "learning_rate": 6.922639362912401e-07, "loss": 0.0004, "reward": 1.68359375, "reward_std": 0.04083309881389141, "rewards/format_reward": 1.0, "rewards/score_reward": 0.68359375, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 124.16927337646484, "epoch": 3.7064846416382253, "grad_norm": 0.5725007916792213, "kl": 0.357421875, "learning_rate": 6.919795221843003e-07, "loss": 0.0004, "reward": 1.7721353769302368, "reward_std": 0.03919408097863197, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7747395932674408, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 121.734375, "epoch": 3.7098976109215016, "grad_norm": 1.3030567569500582, "kl": 0.3486328125, "learning_rate": 6.916951080773606e-07, "loss": 0.0003, "reward": 1.7408854365348816, "reward_std": 0.09414304420351982, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7408854067325592, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 124.79427337646484, "epoch": 3.7133105802047783, "grad_norm": 0.9983715711659163, "kl": 0.392578125, "learning_rate": 6.91410693970421e-07, "loss": 0.0004, "reward": 1.6256510615348816, "reward_std": 0.07264382764697075, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6256510317325592, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 125.54948043823242, "epoch": 3.7167235494880546, "grad_norm": 1.2274293523041548, "kl": 0.3466796875, "learning_rate": 6.911262798634811e-07, "loss": 0.0003, "reward": 1.671875, "reward_std": 0.07158874440938234, "rewards/format_reward": 1.0, "rewards/score_reward": 0.671875, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 126.17969131469727, "epoch": 3.720136518771331, "grad_norm": 0.8488359789544587, "kl": 0.35546875, "learning_rate": 6.908418657565415e-07, "loss": 0.0004, "reward": 1.6829427480697632, "reward_std": 0.08995555341243744, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.685546875, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 122.62239837646484, "epoch": 3.7235494880546076, "grad_norm": 1.3260415490866364, "kl": 0.3642578125, "learning_rate": 6.905574516496018e-07, "loss": 0.0004, "reward": 1.771484375, "reward_std": 0.06768764927983284, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 124.72917175292969, "epoch": 3.726962457337884, "grad_norm": 0.6110995641401267, "kl": 0.3525390625, "learning_rate": 6.90273037542662e-07, "loss": 0.0004, "reward": 1.7630208134651184, "reward_std": 0.03314562886953354, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208432674408, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 122.22396087646484, "epoch": 3.73037542662116, "grad_norm": 1.1259252370443713, "kl": 0.3544921875, "learning_rate": 6.899886234357224e-07, "loss": 0.0004, "reward": 1.7421875, "reward_std": 0.05275967810302973, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7421875, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 118.34375381469727, "epoch": 3.733788395904437, "grad_norm": 0.7015000748316322, "kl": 0.34765625, "learning_rate": 6.897042093287827e-07, "loss": 0.0003, "reward": 1.8079427480697632, "reward_std": 0.05471058702096343, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8079426884651184, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 120.24739837646484, "epoch": 3.737201365187713, "grad_norm": 1.1124731832559123, "kl": 0.3798828125, "learning_rate": 6.89419795221843e-07, "loss": 0.0004, "reward": 1.7727864384651184, "reward_std": 0.056551603600382805, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.775390625, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 120.73958587646484, "epoch": 3.74061433447099, "grad_norm": 0.9955753840994379, "kl": 0.3671875, "learning_rate": 6.891353811149032e-07, "loss": 0.0004, "reward": 1.7311198115348816, "reward_std": 0.04081543767824769, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 119.88802337646484, "epoch": 3.744027303754266, "grad_norm": 2.3130347669355094, "kl": 0.37109375, "learning_rate": 6.888509670079635e-07, "loss": 0.0004, "reward": 1.6588541269302368, "reward_std": 0.09752170369029045, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541865348816, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 122.59114837646484, "epoch": 3.747440273037543, "grad_norm": 0.6624552991878297, "kl": 0.36328125, "learning_rate": 6.885665529010239e-07, "loss": 0.0004, "reward": 1.5768229365348816, "reward_std": 0.05852510221302509, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5768229067325592, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 121.40885543823242, "epoch": 3.750853242320819, "grad_norm": 0.9872114133575554, "kl": 0.3564453125, "learning_rate": 6.882821387940841e-07, "loss": 0.0004, "reward": 1.8138020634651184, "reward_std": 0.062246205285191536, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8138020932674408, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 119.88541793823242, "epoch": 3.7542662116040955, "grad_norm": 0.602383107664125, "kl": 0.37109375, "learning_rate": 6.879977246871445e-07, "loss": 0.0004, "reward": 1.6796875, "reward_std": 0.0647539496421814, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.6848958134651184, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 121.20573425292969, "epoch": 3.757679180887372, "grad_norm": 10.106141350358037, "kl": 0.3388671875, "learning_rate": 6.877133105802048e-07, "loss": 0.0003, "reward": 1.79296875, "reward_std": 0.049843632616102695, "rewards/format_reward": 1.0, "rewards/score_reward": 0.79296875, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 121.93750381469727, "epoch": 3.7610921501706485, "grad_norm": 0.9764677247583751, "kl": 0.3662109375, "learning_rate": 6.87428896473265e-07, "loss": 0.0004, "reward": 1.716796875, "reward_std": 0.07031419314444065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.716796875, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 119.0625, "epoch": 3.7645051194539247, "grad_norm": 1.237805400735445, "kl": 0.3671875, "learning_rate": 6.871444823663254e-07, "loss": 0.0004, "reward": 1.744140625, "reward_std": 0.06632845662534237, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 120.51041793823242, "epoch": 3.7679180887372015, "grad_norm": 0.6711243671750197, "kl": 0.3583984375, "learning_rate": 6.868600682593856e-07, "loss": 0.0004, "reward": 1.76171875, "reward_std": 0.058430466800928116, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 121.20573425292969, "epoch": 3.7713310580204777, "grad_norm": 0.6641926130286615, "kl": 0.3876953125, "learning_rate": 6.865756541524459e-07, "loss": 0.0004, "reward": 1.6575520634651184, "reward_std": 0.04846424423158169, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.66015625, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 118.55469131469727, "epoch": 3.774744027303754, "grad_norm": 0.7359208621491775, "kl": 0.369140625, "learning_rate": 6.862912400455062e-07, "loss": 0.0004, "reward": 1.6647135019302368, "reward_std": 0.06251206062734127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6647135615348816, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 120.04687881469727, "epoch": 3.7781569965870307, "grad_norm": 1.007701793717653, "kl": 0.37890625, "learning_rate": 6.860068259385665e-07, "loss": 0.0004, "reward": 1.7252604365348816, "reward_std": 0.06672638282179832, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7278645634651184, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 118.07812881469727, "epoch": 3.781569965870307, "grad_norm": 2.00593582552534, "kl": 0.3525390625, "learning_rate": 6.857224118316268e-07, "loss": 0.0004, "reward": 1.6803385615348816, "reward_std": 0.05793033912777901, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6803385317325592, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 118.38281631469727, "epoch": 3.7849829351535837, "grad_norm": 1.2536877022937079, "kl": 0.3798828125, "learning_rate": 6.854379977246871e-07, "loss": 0.0004, "reward": 1.625, "reward_std": 0.07557899877429008, "rewards/format_reward": 1.0, "rewards/score_reward": 0.625, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 118.61198425292969, "epoch": 3.78839590443686, "grad_norm": 2.0078303542725635, "kl": 0.3603515625, "learning_rate": 6.851535836177475e-07, "loss": 0.0004, "reward": 1.6901041865348816, "reward_std": 0.09679494984447956, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6927083134651184, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 118.5703125, "epoch": 3.7918088737201368, "grad_norm": 0.9287239918414221, "kl": 0.3935546875, "learning_rate": 6.848691695108077e-07, "loss": 0.0004, "reward": 1.6588541865348816, "reward_std": 0.09523891285061836, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.6640625, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 114.11458587646484, "epoch": 3.795221843003413, "grad_norm": 1.0557692711432438, "kl": 0.369140625, "learning_rate": 6.845847554038679e-07, "loss": 0.0004, "reward": 1.7044270634651184, "reward_std": 0.07540203258395195, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7044270932674408, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 118.72917175292969, "epoch": 3.7986348122866893, "grad_norm": 0.5580385850697952, "kl": 0.3642578125, "learning_rate": 6.843003412969283e-07, "loss": 0.0004, "reward": 1.6555989384651184, "reward_std": 0.04478245601058006, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989682674408, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 118.71354293823242, "epoch": 3.802047781569966, "grad_norm": 7.763655012957424, "kl": 0.5869140625, "learning_rate": 6.840159271899885e-07, "loss": 0.0006, "reward": 1.7766927480697632, "reward_std": 0.028779208660125732, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7766926884651184, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 118.91927337646484, "epoch": 3.8054607508532423, "grad_norm": 0.4618633609931731, "kl": 0.3681640625, "learning_rate": 6.837315130830489e-07, "loss": 0.0004, "reward": 1.7701822519302368, "reward_std": 0.03862451668828726, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7727864682674408, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 116.90104675292969, "epoch": 3.8088737201365186, "grad_norm": 0.8954637025009311, "kl": 0.3564453125, "learning_rate": 6.834470989761092e-07, "loss": 0.0004, "reward": 1.6901041865348816, "reward_std": 0.0633291732519865, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6901041567325592, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 118.67448043823242, "epoch": 3.8122866894197953, "grad_norm": 0.9488534349477457, "kl": 0.3857421875, "learning_rate": 6.831626848691696e-07, "loss": 0.0004, "reward": 1.69140625, "reward_std": 0.07257397472858429, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69140625, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 115.4140625, "epoch": 3.8156996587030716, "grad_norm": 0.949088799233436, "kl": 0.3828125, "learning_rate": 6.828782707622298e-07, "loss": 0.0004, "reward": 1.712890625, "reward_std": 0.08217230718582869, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 115.29166793823242, "epoch": 3.819112627986348, "grad_norm": 0.7455284188750294, "kl": 0.3642578125, "learning_rate": 6.825938566552901e-07, "loss": 0.0004, "reward": 1.7727864980697632, "reward_std": 0.06525794230401516, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7727864384651184, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 113.90364837646484, "epoch": 3.8225255972696246, "grad_norm": 0.8264026897079306, "kl": 0.37890625, "learning_rate": 6.823094425483504e-07, "loss": 0.0004, "reward": 1.7845052480697632, "reward_std": 0.040966885164380074, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845051884651184, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 117.2734375, "epoch": 3.825938566552901, "grad_norm": 1.3235169987607691, "kl": 0.3984375, "learning_rate": 6.820250284414106e-07, "loss": 0.0004, "reward": 1.6881510019302368, "reward_std": 0.06266579031944275, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6881510615348816, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 116.66927337646484, "epoch": 3.8293515358361776, "grad_norm": 1.297881452400758, "kl": 0.3779296875, "learning_rate": 6.81740614334471e-07, "loss": 0.0004, "reward": 1.7936198115348816, "reward_std": 0.07458573207259178, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7936197817325592, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 114.3359375, "epoch": 3.832764505119454, "grad_norm": 0.49538069182609196, "kl": 0.388671875, "learning_rate": 6.814562002275313e-07, "loss": 0.0004, "reward": 1.8424479365348816, "reward_std": 0.036428830586373806, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8424479067325592, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 113.07291793823242, "epoch": 3.8361774744027306, "grad_norm": 1.0470620442502627, "kl": 0.3701171875, "learning_rate": 6.811717861205915e-07, "loss": 0.0004, "reward": 1.7897135615348816, "reward_std": 0.08110326901078224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 115.23698425292969, "epoch": 3.839590443686007, "grad_norm": 1.7320648828217513, "kl": 0.3759765625, "learning_rate": 6.808873720136519e-07, "loss": 0.0004, "reward": 1.7174479365348816, "reward_std": 0.0560015719383955, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7174479067325592, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 113.80729675292969, "epoch": 3.843003412969283, "grad_norm": 0.8338124195014553, "kl": 0.3779296875, "learning_rate": 6.806029579067122e-07, "loss": 0.0004, "reward": 1.7819010019302368, "reward_std": 0.06676553562283516, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7845052182674408, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 112.62500381469727, "epoch": 3.84641638225256, "grad_norm": 0.5194835173939176, "kl": 0.3837890625, "learning_rate": 6.803185437997725e-07, "loss": 0.0004, "reward": 1.8463541865348816, "reward_std": 0.025647208094596863, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8463541567325592, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 112.22135543823242, "epoch": 3.849829351535836, "grad_norm": 2.274413316598466, "kl": 0.3828125, "learning_rate": 6.800341296928327e-07, "loss": 0.0004, "reward": 1.6595052480697632, "reward_std": 0.061043212190270424, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595051884651184, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 110.14583587646484, "epoch": 3.8532423208191124, "grad_norm": 1.0164850848523372, "kl": 0.376953125, "learning_rate": 6.79749715585893e-07, "loss": 0.0004, "reward": 1.751953125, "reward_std": 0.06413818057626486, "rewards/format_reward": 1.0, "rewards/score_reward": 0.751953125, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 110.47396087646484, "epoch": 3.856655290102389, "grad_norm": 1.7548278990450603, "kl": 0.3818359375, "learning_rate": 6.794653014789533e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.032613092102110386, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 111.81510543823242, "epoch": 3.8600682593856654, "grad_norm": 1.0397461760229574, "kl": 0.3779296875, "learning_rate": 6.791808873720136e-07, "loss": 0.0004, "reward": 1.6315103769302368, "reward_std": 0.04083374794572592, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6315104365348816, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 110.11198043823242, "epoch": 3.8634812286689417, "grad_norm": 0.6362252758664803, "kl": 0.3857421875, "learning_rate": 6.78896473265074e-07, "loss": 0.0004, "reward": 1.7552083134651184, "reward_std": 0.03728215675801039, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7552083432674408, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 108.50781631469727, "epoch": 3.8668941979522184, "grad_norm": 1.9367841527940912, "kl": 0.3828125, "learning_rate": 6.786120591581342e-07, "loss": 0.0004, "reward": 1.6744791865348816, "reward_std": 0.033145627938210964, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791567325592, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 109.09635925292969, "epoch": 3.8703071672354947, "grad_norm": 8.333297398375407, "kl": 0.4375, "learning_rate": 6.783276450511945e-07, "loss": 0.0004, "reward": 1.650390625, "reward_std": 0.060367338359355927, "rewards/format_reward": 1.0, "rewards/score_reward": 0.650390625, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 106.77604675292969, "epoch": 3.8737201365187715, "grad_norm": 1.0591901756661513, "kl": 0.3974609375, "learning_rate": 6.780432309442548e-07, "loss": 0.0004, "reward": 1.7604166269302368, "reward_std": 0.06107109785079956, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7630208432674408, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 107.56510543823242, "epoch": 3.8771331058020477, "grad_norm": 0.7108218885165913, "kl": 0.388671875, "learning_rate": 6.77758816837315e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.05569721199572086, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 106.36719131469727, "epoch": 3.8805460750853245, "grad_norm": 1.6484313067805212, "kl": 0.3857421875, "learning_rate": 6.774744027303754e-07, "loss": 0.0004, "reward": 1.8151041865348816, "reward_std": 0.025647209025919437, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8151041567325592, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 108.25781631469727, "epoch": 3.8839590443686007, "grad_norm": 0.9123310263685032, "kl": 0.38671875, "learning_rate": 6.771899886234357e-07, "loss": 0.0004, "reward": 1.646484375, "reward_std": 0.05488009564578533, "rewards/format_reward": 1.0, "rewards/score_reward": 0.646484375, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 108.1796875, "epoch": 3.887372013651877, "grad_norm": 0.4030124406187424, "kl": 0.4013671875, "learning_rate": 6.769055745164959e-07, "loss": 0.0004, "reward": 1.671875, "reward_std": 0.021564548835158348, "rewards/format_reward": 1.0, "rewards/score_reward": 0.671875, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 106.09114837646484, "epoch": 3.8907849829351537, "grad_norm": 0.8650000522012478, "kl": 0.3662109375, "learning_rate": 6.766211604095563e-07, "loss": 0.0004, "reward": 1.80859375, "reward_std": 0.07344390079379082, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80859375, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 106.10937881469727, "epoch": 3.89419795221843, "grad_norm": 0.4581225027360767, "kl": 0.388671875, "learning_rate": 6.763367463026166e-07, "loss": 0.0004, "reward": 1.8001301884651184, "reward_std": 0.036232246086001396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8001302182674408, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 106.61979293823242, "epoch": 3.8976109215017063, "grad_norm": 2.2788178988142183, "kl": 0.3798828125, "learning_rate": 6.76052332195677e-07, "loss": 0.0004, "reward": 1.8131510615348816, "reward_std": 0.06251434795558453, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8131510317325592, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 106.46094131469727, "epoch": 3.901023890784983, "grad_norm": 1.0975252643752669, "kl": 0.3984375, "learning_rate": 6.757679180887371e-07, "loss": 0.0004, "reward": 1.6985676884651184, "reward_std": 0.04756496101617813, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.701171875, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 108.70573425292969, "epoch": 3.9044368600682593, "grad_norm": 0.9850292940300658, "kl": 0.38671875, "learning_rate": 6.754835039817974e-07, "loss": 0.0004, "reward": 1.6790364384651184, "reward_std": 0.05797654204070568, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364682674408, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 106.02864837646484, "epoch": 3.9078498293515356, "grad_norm": 0.6864184118356806, "kl": 0.388671875, "learning_rate": 6.751990898748578e-07, "loss": 0.0004, "reward": 1.7337239384651184, "reward_std": 0.03629634901881218, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239682674408, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 106.42187881469727, "epoch": 3.9112627986348123, "grad_norm": 0.989405728787817, "kl": 0.380859375, "learning_rate": 6.74914675767918e-07, "loss": 0.0004, "reward": 1.78125, "reward_std": 0.04265686310827732, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 105.62760925292969, "epoch": 3.9146757679180886, "grad_norm": 0.5777466922273324, "kl": 0.376953125, "learning_rate": 6.746302616609784e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.0326139098033309, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 105.79687881469727, "epoch": 3.9180887372013653, "grad_norm": 0.6710397787817208, "kl": 0.3857421875, "learning_rate": 6.743458475540387e-07, "loss": 0.0004, "reward": 1.712890625, "reward_std": 0.05424748919904232, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7154947817325592, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 106.32552337646484, "epoch": 3.9215017064846416, "grad_norm": 1.0894429307883926, "kl": 0.37890625, "learning_rate": 6.74061433447099e-07, "loss": 0.0004, "reward": 1.7220051884651184, "reward_std": 0.0396107342094183, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 106.93489837646484, "epoch": 3.9249146757679183, "grad_norm": 0.41455824001826813, "kl": 0.38671875, "learning_rate": 6.737770193401593e-07, "loss": 0.0004, "reward": 1.6907551884651184, "reward_std": 0.020122936461120844, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 106.265625, "epoch": 3.9283276450511946, "grad_norm": 0.49206954721592666, "kl": 0.38671875, "learning_rate": 6.734926052332195e-07, "loss": 0.0004, "reward": 1.759765625, "reward_std": 0.02595050586387515, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 106.86979675292969, "epoch": 3.931740614334471, "grad_norm": 1.0354525626929194, "kl": 0.400390625, "learning_rate": 6.732081911262798e-07, "loss": 0.0004, "reward": 1.6341145634651184, "reward_std": 0.05238816700875759, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6341145932674408, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 106.11198043823242, "epoch": 3.9351535836177476, "grad_norm": 0.8218302626809065, "kl": 0.3828125, "learning_rate": 6.729237770193401e-07, "loss": 0.0004, "reward": 1.6751301884651184, "reward_std": 0.054121240973472595, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6751302182674408, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 104.12760543823242, "epoch": 3.938566552901024, "grad_norm": 1.556933744053442, "kl": 0.3896484375, "learning_rate": 6.726393629124005e-07, "loss": 0.0004, "reward": 1.66796875, "reward_std": 0.04975482448935509, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66796875, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 103.04687881469727, "epoch": 3.9419795221843, "grad_norm": 1.34628358818031, "kl": 0.3759765625, "learning_rate": 6.723549488054607e-07, "loss": 0.0004, "reward": 1.6302083134651184, "reward_std": 0.07880005799233913, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6302083432674408, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 105.53385543823242, "epoch": 3.945392491467577, "grad_norm": 1.3993876356397537, "kl": 0.375, "learning_rate": 6.72070534698521e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.11741629987955093, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 102.99219131469727, "epoch": 3.948805460750853, "grad_norm": 5.065569365038972, "kl": 0.3984375, "learning_rate": 6.717861205915814e-07, "loss": 0.0004, "reward": 1.6809895634651184, "reward_std": 0.07099776156246662, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.68359375, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 102.92448425292969, "epoch": 3.9522184300341294, "grad_norm": 1.077448729032295, "kl": 0.3876953125, "learning_rate": 6.715017064846416e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.06181042082607746, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 104.14062881469727, "epoch": 3.955631399317406, "grad_norm": 0.8749876482391493, "kl": 0.3916015625, "learning_rate": 6.712172923777019e-07, "loss": 0.0004, "reward": 1.6067708134651184, "reward_std": 0.04386314935982227, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6067708432674408, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 104.27864837646484, "epoch": 3.9590443686006824, "grad_norm": 1.6797896909582586, "kl": 0.3994140625, "learning_rate": 6.709328782707622e-07, "loss": 0.0004, "reward": 1.6204427480697632, "reward_std": 0.11001028120517731, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.623046875, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 100.74219131469727, "epoch": 3.962457337883959, "grad_norm": 1.027678819247799, "kl": 0.3857421875, "learning_rate": 6.706484641638224e-07, "loss": 0.0004, "reward": 1.6725260615348816, "reward_std": 0.03870734106749296, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6725260317325592, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 101.9140625, "epoch": 3.9658703071672354, "grad_norm": 0.7194693124132525, "kl": 0.3974609375, "learning_rate": 6.703640500568828e-07, "loss": 0.0004, "reward": 1.6822916269302368, "reward_std": 0.03629675507545471, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6822916865348816, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 100.64323425292969, "epoch": 3.969283276450512, "grad_norm": 0.6434209566563447, "kl": 0.396484375, "learning_rate": 6.700796359499431e-07, "loss": 0.0004, "reward": 1.673828125, "reward_std": 0.039389850571751595, "rewards/format_reward": 1.0, "rewards/score_reward": 0.673828125, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 98.46094131469727, "epoch": 3.9726962457337884, "grad_norm": 1.6778735541215342, "kl": 0.431640625, "learning_rate": 6.697952218430035e-07, "loss": 0.0004, "reward": 1.6412760615348816, "reward_std": 0.061984699219465256, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6412760317325592, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 98.171875, "epoch": 3.9761092150170647, "grad_norm": 0.6413507112751967, "kl": 0.3955078125, "learning_rate": 6.695108077360637e-07, "loss": 0.0004, "reward": 1.6529948115348816, "reward_std": 0.046643007546663284, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6529947817325592, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 97.68489837646484, "epoch": 3.9795221843003414, "grad_norm": 1.0553872819065808, "kl": 0.3974609375, "learning_rate": 6.692263936291239e-07, "loss": 0.0004, "reward": 1.677734375, "reward_std": 0.07146661169826984, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6803385317325592, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 98.65625381469727, "epoch": 3.9829351535836177, "grad_norm": 0.29494160867781294, "kl": 0.4013671875, "learning_rate": 6.689419795221843e-07, "loss": 0.0004, "reward": 1.7630208730697632, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208134651184, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 97.98958587646484, "epoch": 3.986348122866894, "grad_norm": 1.0198710558148978, "kl": 0.4208984375, "learning_rate": 6.686575654152445e-07, "loss": 0.0004, "reward": 1.7747395634651184, "reward_std": 0.05159730650484562, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7747395932674408, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 98.37760543823242, "epoch": 3.9897610921501707, "grad_norm": 0.9170672542210703, "kl": 0.4140625, "learning_rate": 6.683731513083049e-07, "loss": 0.0004, "reward": 1.7473958134651184, "reward_std": 0.05243354942649603, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 96.71614837646484, "epoch": 3.993174061433447, "grad_norm": 0.8538461198903122, "kl": 0.396484375, "learning_rate": 6.680887372013652e-07, "loss": 0.0004, "reward": 1.8098958134651184, "reward_std": 0.03996088542044163, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8125, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 94.70000457763672, "epoch": 3.9965870307167233, "grad_norm": 1.8378886775336896, "kl": 0.4453125, "learning_rate": 6.678043230944254e-07, "loss": 0.0004, "reward": 1.7208333611488342, "reward_std": 0.06408699601888657, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720833346247673, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 97.59375381469727, "epoch": 4.003412969283277, "grad_norm": 3.3412628345742545, "kl": 0.416015625, "learning_rate": 6.675199089874858e-07, "loss": 0.0004, "reward": 1.7721353769302368, "reward_std": 0.013193263672292233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354365348816, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 96.27864837646484, "epoch": 4.006825938566553, "grad_norm": 1.4028754538627335, "kl": 0.708984375, "learning_rate": 6.672354948805461e-07, "loss": 0.0007, "reward": 1.6966145634651184, "reward_std": 0.061539310961961746, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145932674408, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 98.42708587646484, "epoch": 4.010238907849829, "grad_norm": 0.6972073887626499, "kl": 0.4052734375, "learning_rate": 6.669510807736063e-07, "loss": 0.0004, "reward": 1.720703125, "reward_std": 0.06791908480226994, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7233072817325592, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 96.28125, "epoch": 4.013651877133106, "grad_norm": 2.0440677927011555, "kl": 0.408203125, "learning_rate": 6.666666666666666e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.041535634547472, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 98.04687881469727, "epoch": 4.017064846416382, "grad_norm": 0.7666781411456298, "kl": 0.4287109375, "learning_rate": 6.663822525597269e-07, "loss": 0.0004, "reward": 1.7317708134651184, "reward_std": 0.035158444195985794, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7317708432674408, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 94.84635925292969, "epoch": 4.020477815699659, "grad_norm": 3.137478667657675, "kl": 0.43359375, "learning_rate": 6.660978384527872e-07, "loss": 0.0004, "reward": 1.7740885615348816, "reward_std": 0.04399511404335499, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7766927182674408, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 96.81510543823242, "epoch": 4.023890784982935, "grad_norm": 0.8619728419904281, "kl": 0.427734375, "learning_rate": 6.658134243458475e-07, "loss": 0.0004, "reward": 1.80859375, "reward_std": 0.06088988855481148, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8111979365348816, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 96.14062881469727, "epoch": 4.027303754266212, "grad_norm": 1.7672321564330653, "kl": 0.4150390625, "learning_rate": 6.655290102389079e-07, "loss": 0.0004, "reward": 1.7779947519302368, "reward_std": 0.09256206452846527, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7779948115348816, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 97.00781631469727, "epoch": 4.030716723549488, "grad_norm": 0.7221164760761016, "kl": 0.421875, "learning_rate": 6.652445961319681e-07, "loss": 0.0004, "reward": 1.6848958134651184, "reward_std": 0.05231424793601036, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6848958432674408, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 92.94791793823242, "epoch": 4.034129692832765, "grad_norm": 8.196059957077942, "kl": 0.4267578125, "learning_rate": 6.649601820250284e-07, "loss": 0.0004, "reward": 1.6608072519302368, "reward_std": 0.037265317514538765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6608073115348816, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 92.89844131469727, "epoch": 4.037542662116041, "grad_norm": 0.7972890185107325, "kl": 0.4404296875, "learning_rate": 6.646757679180887e-07, "loss": 0.0004, "reward": 1.7747395634651184, "reward_std": 0.048768600448966026, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.77734375, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 94.66406631469727, "epoch": 4.040955631399317, "grad_norm": 0.9412952196555546, "kl": 0.443359375, "learning_rate": 6.643913538111489e-07, "loss": 0.0004, "reward": 1.7135416865348816, "reward_std": 0.06884944811463356, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 93.13541793823242, "epoch": 4.044368600682594, "grad_norm": 1.2129731458244448, "kl": 0.435546875, "learning_rate": 6.641069397042093e-07, "loss": 0.0004, "reward": 1.7610677480697632, "reward_std": 0.05772817134857178, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610676884651184, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 95.0546875, "epoch": 4.047781569965871, "grad_norm": 0.7754905597048687, "kl": 0.4345703125, "learning_rate": 6.638225255972696e-07, "loss": 0.0004, "reward": 1.6536458134651184, "reward_std": 0.041669175028800964, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6536458432674408, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 93.61979293823242, "epoch": 4.051194539249146, "grad_norm": 0.7641974143179343, "kl": 0.4296875, "learning_rate": 6.635381114903299e-07, "loss": 0.0004, "reward": 1.8326822519302368, "reward_std": 0.058962758630514145, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8326823115348816, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 95.74479675292969, "epoch": 4.054607508532423, "grad_norm": 0.9892431242819292, "kl": 0.4150390625, "learning_rate": 6.632536973833902e-07, "loss": 0.0004, "reward": 1.7408854365348816, "reward_std": 0.06194167956709862, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7408854067325592, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 97.21614837646484, "epoch": 4.0580204778157, "grad_norm": 0.8387973002849841, "kl": 0.4150390625, "learning_rate": 6.629692832764505e-07, "loss": 0.0004, "reward": 1.5989583730697632, "reward_std": 0.07093482092022896, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5989583134651184, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 97.45833587646484, "epoch": 4.061433447098976, "grad_norm": 0.8428946329625321, "kl": 0.4326171875, "learning_rate": 6.626848691695109e-07, "loss": 0.0004, "reward": 1.6692708134651184, "reward_std": 0.06912991777062416, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.671875, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 96.16927337646484, "epoch": 4.064846416382252, "grad_norm": 0.49309110472651024, "kl": 0.4423828125, "learning_rate": 6.62400455062571e-07, "loss": 0.0004, "reward": 1.7161458134651184, "reward_std": 0.025647207628935575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458432674408, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 95.79948043823242, "epoch": 4.068259385665529, "grad_norm": 0.803245504034232, "kl": 0.4404296875, "learning_rate": 6.621160409556313e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.06663322076201439, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7369791567325592, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 95.96354293823242, "epoch": 4.071672354948806, "grad_norm": 1.0269122113269902, "kl": 0.4267578125, "learning_rate": 6.618316268486917e-07, "loss": 0.0004, "reward": 1.671875, "reward_std": 0.05578455328941345, "rewards/format_reward": 1.0, "rewards/score_reward": 0.671875, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 96.32291793823242, "epoch": 4.075085324232082, "grad_norm": 0.6685564133956828, "kl": 0.4208984375, "learning_rate": 6.615472127417519e-07, "loss": 0.0004, "reward": 1.8287760615348816, "reward_std": 0.044319361448287964, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8287760317325592, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 98.99219131469727, "epoch": 4.078498293515358, "grad_norm": 0.8595369230376535, "kl": 0.4267578125, "learning_rate": 6.612627986348123e-07, "loss": 0.0004, "reward": 1.7421875, "reward_std": 0.10203251615166664, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.7473958432674408, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 98.9765625, "epoch": 4.081911262798635, "grad_norm": 0.8087002870068064, "kl": 0.42578125, "learning_rate": 6.609783845278726e-07, "loss": 0.0004, "reward": 1.7994791865348816, "reward_std": 0.043775808066129684, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7994791567325592, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 100.00781631469727, "epoch": 4.085324232081911, "grad_norm": 0.7140992279169269, "kl": 0.435546875, "learning_rate": 6.606939704209329e-07, "loss": 0.0004, "reward": 1.7369791865348816, "reward_std": 0.03399912267923355, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7369791567325592, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 99.515625, "epoch": 4.088737201365188, "grad_norm": 0.9750265086969125, "kl": 0.4072265625, "learning_rate": 6.604095563139932e-07, "loss": 0.0004, "reward": 1.8209635615348816, "reward_std": 0.07562184892594814, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8209635317325592, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 101.12239837646484, "epoch": 4.092150170648464, "grad_norm": 0.9047602595888815, "kl": 0.423828125, "learning_rate": 6.601251422070534e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.08936555683612823, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 101.49219131469727, "epoch": 4.09556313993174, "grad_norm": 1.3857877889814254, "kl": 0.4345703125, "learning_rate": 6.598407281001137e-07, "loss": 0.0004, "reward": 1.69921875, "reward_std": 0.04988836217671633, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7018229067325592, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 101.59896087646484, "epoch": 4.098976109215017, "grad_norm": 1.3675758980492976, "kl": 0.4072265625, "learning_rate": 6.59556313993174e-07, "loss": 0.0004, "reward": 1.7311198115348816, "reward_std": 0.03763682767748833, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 99.95052337646484, "epoch": 4.102389078498294, "grad_norm": 1.1221963392593093, "kl": 0.4130859375, "learning_rate": 6.592718998862344e-07, "loss": 0.0004, "reward": 1.7389323115348816, "reward_std": 0.044192456640303135, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389322817325592, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 101.6328125, "epoch": 4.1058020477815695, "grad_norm": 0.8596875079477807, "kl": 0.408203125, "learning_rate": 6.589874857792946e-07, "loss": 0.0004, "reward": 1.7076822519302368, "reward_std": 0.052014607936143875, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076823115348816, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 102.77083587646484, "epoch": 4.109215017064846, "grad_norm": 1.2261030464459246, "kl": 0.4033203125, "learning_rate": 6.587030716723549e-07, "loss": 0.0004, "reward": 1.6959635615348816, "reward_std": 0.08923429809510708, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 100.40625381469727, "epoch": 4.112627986348123, "grad_norm": 0.6321954998685975, "kl": 0.421875, "learning_rate": 6.584186575654153e-07, "loss": 0.0004, "reward": 1.8235676884651184, "reward_std": 0.033733916468918324, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8235677182674408, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 101.44010925292969, "epoch": 4.1160409556314, "grad_norm": 1.4030346426961988, "kl": 0.4189453125, "learning_rate": 6.581342434584754e-07, "loss": 0.0004, "reward": 1.66015625, "reward_std": 0.0452192947268486, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6627604067325592, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 101.82812881469727, "epoch": 4.1194539249146755, "grad_norm": 1.5345593046213317, "kl": 0.41015625, "learning_rate": 6.578498293515358e-07, "loss": 0.0004, "reward": 1.8391926884651184, "reward_std": 0.049559054896235466, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8391927182674408, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 100.70833587646484, "epoch": 4.122866894197952, "grad_norm": 0.8197498103658135, "kl": 0.416015625, "learning_rate": 6.575654152445961e-07, "loss": 0.0004, "reward": 1.7421875, "reward_std": 0.04964146576821804, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7421875, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 101.53385925292969, "epoch": 4.126279863481229, "grad_norm": 2.320340702932209, "kl": 0.4150390625, "learning_rate": 6.572810011376564e-07, "loss": 0.0004, "reward": 1.779296875, "reward_std": 0.0324799595400691, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 100.33854675292969, "epoch": 4.129692832764505, "grad_norm": 0.816343291796798, "kl": 0.42578125, "learning_rate": 6.569965870307167e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.053703938610851765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 101.38541793823242, "epoch": 4.1331058020477816, "grad_norm": 1.3993626013355278, "kl": 0.412109375, "learning_rate": 6.56712172923777e-07, "loss": 0.0004, "reward": 1.7838541269302368, "reward_std": 0.04876795317977667, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541865348816, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 100.23177337646484, "epoch": 4.136518771331058, "grad_norm": 0.9233230801651963, "kl": 0.404296875, "learning_rate": 6.564277588168374e-07, "loss": 0.0004, "reward": 1.6529947519302368, "reward_std": 0.0674026608467102, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6529948115348816, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 100.01302337646484, "epoch": 4.139931740614334, "grad_norm": 0.9762440690747196, "kl": 0.41796875, "learning_rate": 6.561433447098976e-07, "loss": 0.0004, "reward": 1.6803385615348816, "reward_std": 0.05755882151424885, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6803385317325592, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 101.45052337646484, "epoch": 4.143344709897611, "grad_norm": 1.5102081056876135, "kl": 0.419921875, "learning_rate": 6.558589306029578e-07, "loss": 0.0004, "reward": 1.6829426884651184, "reward_std": 0.05516573786735535, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6829427182674408, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 98.45833587646484, "epoch": 4.146757679180888, "grad_norm": 9.053118735235923, "kl": 0.408203125, "learning_rate": 6.555745164960182e-07, "loss": 0.0004, "reward": 1.7994791865348816, "reward_std": 0.05366002395749092, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7994791567325592, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 99.46094131469727, "epoch": 4.150170648464163, "grad_norm": 0.5293302451015647, "kl": 0.4052734375, "learning_rate": 6.552901023890784e-07, "loss": 0.0004, "reward": 1.732421875, "reward_std": 0.02240120153874159, "rewards/format_reward": 1.0, "rewards/score_reward": 0.732421875, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 101.05989837646484, "epoch": 4.15358361774744, "grad_norm": 0.9779638412712145, "kl": 0.4228515625, "learning_rate": 6.550056882821388e-07, "loss": 0.0004, "reward": 1.7545573115348816, "reward_std": 0.04307269863784313, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545572817325592, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 100.25781631469727, "epoch": 4.156996587030717, "grad_norm": 0.874132535259472, "kl": 0.4150390625, "learning_rate": 6.547212741751991e-07, "loss": 0.0004, "reward": 1.7682291865348816, "reward_std": 0.06323454063385725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 99.69271087646484, "epoch": 4.160409556313994, "grad_norm": 1.1482834329575309, "kl": 0.4248046875, "learning_rate": 6.544368600682593e-07, "loss": 0.0004, "reward": 1.765625, "reward_std": 0.06417602486908436, "rewards/format_reward": 1.0, "rewards/score_reward": 0.765625, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 99.0390625, "epoch": 4.163822525597269, "grad_norm": 1.5836802664280012, "kl": 0.412109375, "learning_rate": 6.541524459613197e-07, "loss": 0.0004, "reward": 1.76953125, "reward_std": 0.046072788536548615, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76953125, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 100.23958587646484, "epoch": 4.167235494880546, "grad_norm": 0.5951837240500405, "kl": 0.4140625, "learning_rate": 6.5386803185438e-07, "loss": 0.0004, "reward": 1.7473958134651184, "reward_std": 0.02665361436083913, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 98.98698043823242, "epoch": 4.170648464163823, "grad_norm": 1.175079422044949, "kl": 0.4306640625, "learning_rate": 6.535836177474402e-07, "loss": 0.0004, "reward": 1.8391926884651184, "reward_std": 0.06731613911688328, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8391927182674408, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 100.42708587646484, "epoch": 4.174061433447099, "grad_norm": 1.1304632307260274, "kl": 0.4296875, "learning_rate": 6.532992036405005e-07, "loss": 0.0004, "reward": 1.8626301884651184, "reward_std": 0.07755086198449135, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8626302182674408, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 101.53646087646484, "epoch": 4.177474402730375, "grad_norm": 4.521625428330359, "kl": 0.4091796875, "learning_rate": 6.530147895335608e-07, "loss": 0.0004, "reward": 1.7994791269302368, "reward_std": 0.03638409823179245, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7994791865348816, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 99.53385543823242, "epoch": 4.180887372013652, "grad_norm": 0.6601233350931669, "kl": 0.4091796875, "learning_rate": 6.527303754266211e-07, "loss": 0.0004, "reward": 1.70703125, "reward_std": 0.04289483278989792, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 101.89583587646484, "epoch": 4.184300341296928, "grad_norm": 1.5607816435750714, "kl": 0.462890625, "learning_rate": 6.524459613196814e-07, "loss": 0.0005, "reward": 1.7057291865348816, "reward_std": 0.03515762463212013, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7057291567325592, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 103.67708587646484, "epoch": 4.187713310580205, "grad_norm": 1.3335745401176982, "kl": 0.4423828125, "learning_rate": 6.521615472127418e-07, "loss": 0.0004, "reward": 1.7102864980697632, "reward_std": 0.1111479364335537, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7102864384651184, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 102.65364837646484, "epoch": 4.191126279863481, "grad_norm": 2.676295724094691, "kl": 0.416015625, "learning_rate": 6.51877133105802e-07, "loss": 0.0004, "reward": 1.767578125, "reward_std": 0.06192483939230442, "rewards/format_reward": 1.0, "rewards/score_reward": 0.767578125, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 103.31250381469727, "epoch": 4.194539249146757, "grad_norm": 0.6000699368196561, "kl": 0.4130859375, "learning_rate": 6.515927189988623e-07, "loss": 0.0004, "reward": 1.8092447519302368, "reward_std": 0.02893065381795168, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8092448115348816, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 102.46875381469727, "epoch": 4.197952218430034, "grad_norm": 0.8433249348967469, "kl": 0.404296875, "learning_rate": 6.513083048919226e-07, "loss": 0.0004, "reward": 1.7721354365348816, "reward_std": 0.034086463041603565, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354067325592, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 104.10677337646484, "epoch": 4.201365187713311, "grad_norm": 0.7629264404210659, "kl": 0.4033203125, "learning_rate": 6.510238907849829e-07, "loss": 0.0004, "reward": 1.771484375, "reward_std": 0.04309223499149084, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 103.90625381469727, "epoch": 4.204778156996587, "grad_norm": 2.1037950996965877, "kl": 0.431640625, "learning_rate": 6.507394766780432e-07, "loss": 0.0004, "reward": 1.7272135615348816, "reward_std": 0.07465418800711632, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7272135317325592, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 103.08594131469727, "epoch": 4.208191126279863, "grad_norm": 1.250874225033751, "kl": 0.412109375, "learning_rate": 6.504550625711035e-07, "loss": 0.0004, "reward": 1.7115885615348816, "reward_std": 0.06785846687853336, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885317325592, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 103.83854675292969, "epoch": 4.21160409556314, "grad_norm": 0.48731157957067484, "kl": 0.408203125, "learning_rate": 6.501706484641638e-07, "loss": 0.0004, "reward": 1.7513020634651184, "reward_std": 0.025228843558579683, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7513020932674408, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 105.52083587646484, "epoch": 4.215017064846417, "grad_norm": 1.1241323428748349, "kl": 0.4345703125, "learning_rate": 6.498862343572241e-07, "loss": 0.0004, "reward": 1.7571614384651184, "reward_std": 0.05995044857263565, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7571614682674408, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 103.13281631469727, "epoch": 4.2184300341296925, "grad_norm": 0.8018825943373256, "kl": 0.4013671875, "learning_rate": 6.496018202502844e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.034854328259825706, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 106.79427337646484, "epoch": 4.221843003412969, "grad_norm": 0.5088629949857377, "kl": 0.41015625, "learning_rate": 6.493174061433447e-07, "loss": 0.0004, "reward": 1.65234375, "reward_std": 0.03544260933995247, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65234375, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 105.07292175292969, "epoch": 4.225255972696246, "grad_norm": 0.6394599240911869, "kl": 0.4013671875, "learning_rate": 6.490329920364049e-07, "loss": 0.0004, "reward": 1.7122395634651184, "reward_std": 0.03884047269821167, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395932674408, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 105.13021087646484, "epoch": 4.228668941979522, "grad_norm": 0.9119629297251322, "kl": 0.421875, "learning_rate": 6.487485779294652e-07, "loss": 0.0004, "reward": 1.681640625, "reward_std": 0.04153538774698973, "rewards/format_reward": 1.0, "rewards/score_reward": 0.681640625, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 103.79687881469727, "epoch": 4.2320819112627985, "grad_norm": 0.8240059622500623, "kl": 0.4033203125, "learning_rate": 6.484641638225256e-07, "loss": 0.0004, "reward": 1.7584635019302368, "reward_std": 0.03237776551395655, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635615348816, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 104.54948425292969, "epoch": 4.235494880546075, "grad_norm": 0.4899852720731317, "kl": 0.587890625, "learning_rate": 6.481797497155858e-07, "loss": 0.0006, "reward": 1.7903646230697632, "reward_std": 0.028058198746293783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7903645634651184, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 102.08854293823242, "epoch": 4.238907849829351, "grad_norm": 5.816627035409615, "kl": 0.404296875, "learning_rate": 6.478953356086462e-07, "loss": 0.0004, "reward": 1.7096354365348816, "reward_std": 0.05514595843851566, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354067325592, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 104.94010925292969, "epoch": 4.242320819112628, "grad_norm": 2.60734633845308, "kl": 0.4072265625, "learning_rate": 6.476109215017065e-07, "loss": 0.0004, "reward": 1.759765625, "reward_std": 0.05856441706418991, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 103.29166793823242, "epoch": 4.2457337883959045, "grad_norm": 0.8404717498264913, "kl": 0.4130859375, "learning_rate": 6.473265073947668e-07, "loss": 0.0004, "reward": 1.7864583134651184, "reward_std": 0.07355290092527866, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 102.72135543823242, "epoch": 4.249146757679181, "grad_norm": 0.633543058363226, "kl": 0.4033203125, "learning_rate": 6.47042093287827e-07, "loss": 0.0004, "reward": 1.6595051884651184, "reward_std": 0.031708089634776115, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 104.52083587646484, "epoch": 4.252559726962457, "grad_norm": 3.7788373572654654, "kl": 0.4052734375, "learning_rate": 6.467576791808873e-07, "loss": 0.0004, "reward": 1.8033854365348816, "reward_std": 0.03432138171046972, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8033854067325592, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 101.78646087646484, "epoch": 4.255972696245734, "grad_norm": 0.5489903361706094, "kl": 0.4033203125, "learning_rate": 6.464732650739476e-07, "loss": 0.0004, "reward": 1.8776041865348816, "reward_std": 0.03439082112163305, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8776041567325592, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 101.11719131469727, "epoch": 4.2593856655290105, "grad_norm": 0.8572270985884444, "kl": 0.40234375, "learning_rate": 6.461888509670079e-07, "loss": 0.0004, "reward": 1.7825520634651184, "reward_std": 0.03883982077240944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520932674408, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 102.08594131469727, "epoch": 4.262798634812286, "grad_norm": 1.1923518916700655, "kl": 0.4384765625, "learning_rate": 6.459044368600683e-07, "loss": 0.0004, "reward": 1.7923177480697632, "reward_std": 0.04351059999316931, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923176884651184, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 104.59375, "epoch": 4.266211604095563, "grad_norm": 1.5582471702788148, "kl": 0.4462890625, "learning_rate": 6.456200227531286e-07, "loss": 0.0004, "reward": 1.6946614384651184, "reward_std": 0.05474802106618881, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 102.22135925292969, "epoch": 4.26962457337884, "grad_norm": 2.6473251100577038, "kl": 0.4072265625, "learning_rate": 6.453356086461888e-07, "loss": 0.0004, "reward": 1.7083333134651184, "reward_std": 0.06292871618643403, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7083333432674408, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 102.96094131469727, "epoch": 4.273037542662116, "grad_norm": 0.6593053477581088, "kl": 0.41796875, "learning_rate": 6.450511945392492e-07, "loss": 0.0004, "reward": 1.6731770634651184, "reward_std": 0.0219643609598279, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6731770932674408, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 101.45312881469727, "epoch": 4.276450511945392, "grad_norm": 0.999629223766659, "kl": 0.408203125, "learning_rate": 6.447667804323094e-07, "loss": 0.0004, "reward": 1.689453125, "reward_std": 0.035973270889371634, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 102.35416793823242, "epoch": 4.279863481228669, "grad_norm": 3.877914644387417, "kl": 0.4130859375, "learning_rate": 6.444823663253697e-07, "loss": 0.0004, "reward": 1.8020833730697632, "reward_std": 0.054881155490875244, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833134651184, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 101.02344131469727, "epoch": 4.283276450511945, "grad_norm": 6.559956789161545, "kl": 0.4072265625, "learning_rate": 6.4419795221843e-07, "loss": 0.0004, "reward": 1.75390625, "reward_std": 0.0869758129119873, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7565104067325592, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 99.08073425292969, "epoch": 4.286689419795222, "grad_norm": 0.9031912860281789, "kl": 0.416015625, "learning_rate": 6.439135381114903e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.03910674341022968, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7213541567325592, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 100.07552337646484, "epoch": 4.290102389078498, "grad_norm": 0.6179270211198036, "kl": 0.404296875, "learning_rate": 6.436291240045506e-07, "loss": 0.0004, "reward": 1.7805989384651184, "reward_std": 0.031037935987114906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7805989682674408, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 101.43229293823242, "epoch": 4.293515358361775, "grad_norm": 0.9546035454129165, "kl": 0.4130859375, "learning_rate": 6.433447098976109e-07, "loss": 0.0004, "reward": 1.7220052480697632, "reward_std": 0.08432433009147644, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220051884651184, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 100.43229293823242, "epoch": 4.296928327645051, "grad_norm": 0.7043690879272826, "kl": 0.4287109375, "learning_rate": 6.430602957906713e-07, "loss": 0.0004, "reward": 1.7858073115348816, "reward_std": 0.04846383444964886, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7858072817325592, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 98.890625, "epoch": 4.300341296928328, "grad_norm": 1.0076865325847169, "kl": 0.421875, "learning_rate": 6.427758816837315e-07, "loss": 0.0004, "reward": 1.7825521230697632, "reward_std": 0.03202521614730358, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520634651184, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 99.45833587646484, "epoch": 4.303754266211604, "grad_norm": 1.516534280593754, "kl": 0.4697265625, "learning_rate": 6.424914675767917e-07, "loss": 0.0005, "reward": 1.681640625, "reward_std": 0.09234863892197609, "rewards/format_reward": 1.0, "rewards/score_reward": 0.681640625, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 99.59375381469727, "epoch": 4.30716723549488, "grad_norm": 5.977844482241286, "kl": 0.412109375, "learning_rate": 6.422070534698521e-07, "loss": 0.0004, "reward": 1.796875, "reward_std": 0.06055581197142601, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 101.265625, "epoch": 4.310580204778157, "grad_norm": 1.6077940152129377, "kl": 0.4228515625, "learning_rate": 6.419226393629123e-07, "loss": 0.0004, "reward": 1.7213541865348816, "reward_std": 0.07082030735909939, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 98.97917175292969, "epoch": 4.313993174061434, "grad_norm": 1.9846127033708767, "kl": 0.4130859375, "learning_rate": 6.416382252559727e-07, "loss": 0.0004, "reward": 1.6888020634651184, "reward_std": 0.04324392415583134, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6888020932674408, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 101.55469131469727, "epoch": 4.3174061433447095, "grad_norm": 1.266967896824898, "kl": 0.416015625, "learning_rate": 6.41353811149033e-07, "loss": 0.0004, "reward": 1.6953125, "reward_std": 0.09737684391438961, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6953125, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 101.02864837646484, "epoch": 4.320819112627986, "grad_norm": 1.6736396621867702, "kl": 0.4287109375, "learning_rate": 6.410693970420932e-07, "loss": 0.0004, "reward": 1.7337239384651184, "reward_std": 0.099751777946949, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239682674408, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 100.48958587646484, "epoch": 4.324232081911263, "grad_norm": 1.1282872614367436, "kl": 0.416015625, "learning_rate": 6.407849829351536e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.08138210326433182, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 101.92187881469727, "epoch": 4.327645051194539, "grad_norm": 0.9006655479792104, "kl": 0.41796875, "learning_rate": 6.405005688282139e-07, "loss": 0.0004, "reward": 1.7194010615348816, "reward_std": 0.039693558588624, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7194010317325592, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 99.50260925292969, "epoch": 4.3310580204778155, "grad_norm": 0.6941228487057949, "kl": 0.4208984375, "learning_rate": 6.402161547212741e-07, "loss": 0.0004, "reward": 1.796875, "reward_std": 0.04267557989805937, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 100.57031631469727, "epoch": 4.334470989761092, "grad_norm": 1.2455469798826388, "kl": 0.41796875, "learning_rate": 6.399317406143344e-07, "loss": 0.0004, "reward": 1.7532551884651184, "reward_std": 0.05931702069938183, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.755859375, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 100.07031631469727, "epoch": 4.337883959044369, "grad_norm": 0.7416635695675278, "kl": 0.4150390625, "learning_rate": 6.396473265073947e-07, "loss": 0.0004, "reward": 1.7565103769302368, "reward_std": 0.05102814733982086, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104365348816, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 100.89062881469727, "epoch": 4.341296928327645, "grad_norm": 1.09386625970229, "kl": 0.4091796875, "learning_rate": 6.39362912400455e-07, "loss": 0.0004, "reward": 1.7916666865348816, "reward_std": 0.05641928315162659, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 98.70052337646484, "epoch": 4.3447098976109215, "grad_norm": 0.5975456359456057, "kl": 0.40625, "learning_rate": 6.390784982935153e-07, "loss": 0.0004, "reward": 1.7734375, "reward_std": 0.028342368081212044, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7734375, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 98.90364837646484, "epoch": 4.348122866894198, "grad_norm": 1.9242087485746613, "kl": 0.4091796875, "learning_rate": 6.387940841865757e-07, "loss": 0.0004, "reward": 1.73828125, "reward_std": 0.056050002574920654, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73828125, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 99.02344131469727, "epoch": 4.351535836177474, "grad_norm": 1.1525569695865199, "kl": 0.4091796875, "learning_rate": 6.38509670079636e-07, "loss": 0.0004, "reward": 1.7369791865348816, "reward_std": 0.05981649272143841, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7369791567325592, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 100.46875381469727, "epoch": 4.354948805460751, "grad_norm": 0.7838950273349232, "kl": 0.4091796875, "learning_rate": 6.382252559726961e-07, "loss": 0.0004, "reward": 1.7708333730697632, "reward_std": 0.04855493642389774, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7734375, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 97.55729293823242, "epoch": 4.3583617747440275, "grad_norm": 0.8430766575352868, "kl": 0.404296875, "learning_rate": 6.379408418657565e-07, "loss": 0.0004, "reward": 1.7845052480697632, "reward_std": 0.05089501664042473, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845051884651184, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 100.09375381469727, "epoch": 4.361774744027303, "grad_norm": 0.9391426018727813, "kl": 0.416015625, "learning_rate": 6.376564277588168e-07, "loss": 0.0004, "reward": 1.8235676884651184, "reward_std": 0.04019926302134991, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8235677182674408, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 98.58333587646484, "epoch": 4.36518771331058, "grad_norm": 2.0871657557102425, "kl": 0.421875, "learning_rate": 6.373720136518771e-07, "loss": 0.0004, "reward": 1.73046875, "reward_std": 0.05114150233566761, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73046875, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 97.19271087646484, "epoch": 4.368600682593857, "grad_norm": 0.5289478097176655, "kl": 0.4130859375, "learning_rate": 6.370875995449374e-07, "loss": 0.0004, "reward": 1.7819010615348816, "reward_std": 0.02860757615417242, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7819010317325592, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 98.625, "epoch": 4.372013651877133, "grad_norm": 5.707313096377437, "kl": 0.4365234375, "learning_rate": 6.368031854379977e-07, "loss": 0.0004, "reward": 1.7506510615348816, "reward_std": 0.057558175176382065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510317325592, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 98.92969131469727, "epoch": 4.375426621160409, "grad_norm": 0.47613759254607213, "kl": 0.41015625, "learning_rate": 6.36518771331058e-07, "loss": 0.0004, "reward": 1.701171875, "reward_std": 0.027355906553566456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 98.44010543823242, "epoch": 4.378839590443686, "grad_norm": 0.8190253912157577, "kl": 0.408203125, "learning_rate": 6.362343572241183e-07, "loss": 0.0004, "reward": 1.7096353769302368, "reward_std": 0.0512482114136219, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354365348816, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 98.33854293823242, "epoch": 4.382252559726963, "grad_norm": 1.0036253270663678, "kl": 0.4072265625, "learning_rate": 6.359499431171786e-07, "loss": 0.0004, "reward": 1.697265625, "reward_std": 0.04068189486861229, "rewards/format_reward": 1.0, "rewards/score_reward": 0.697265625, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 98.546875, "epoch": 4.385665529010239, "grad_norm": 0.5947176360125677, "kl": 0.4189453125, "learning_rate": 6.356655290102388e-07, "loss": 0.0004, "reward": 1.6764323115348816, "reward_std": 0.02807650715112686, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6764322817325592, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 98.30208587646484, "epoch": 4.389078498293515, "grad_norm": 0.4204423527663779, "kl": 0.3974609375, "learning_rate": 6.353811149032991e-07, "loss": 0.0004, "reward": 1.6979166865348816, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6979166567325592, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 97.03125, "epoch": 4.392491467576792, "grad_norm": 1.6236736643285614, "kl": 0.4033203125, "learning_rate": 6.350967007963595e-07, "loss": 0.0004, "reward": 1.681640625, "reward_std": 0.0642636138945818, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6842447817325592, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 98.34114837646484, "epoch": 4.395904436860068, "grad_norm": 0.5209354310306546, "kl": 0.40625, "learning_rate": 6.348122866894197e-07, "loss": 0.0004, "reward": 1.748046875, "reward_std": 0.03386664018034935, "rewards/format_reward": 1.0, "rewards/score_reward": 0.748046875, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 98.92187881469727, "epoch": 4.399317406143345, "grad_norm": 1.3893672239442791, "kl": 0.4033203125, "learning_rate": 6.345278725824801e-07, "loss": 0.0004, "reward": 1.6764323115348816, "reward_std": 0.06112684868276119, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6764322817325592, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 98.33073425292969, "epoch": 4.402730375426621, "grad_norm": 1.3647134569771928, "kl": 0.408203125, "learning_rate": 6.342434584755404e-07, "loss": 0.0004, "reward": 1.6588541865348816, "reward_std": 0.037986328825354576, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541567325592, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 97.83073043823242, "epoch": 4.406143344709897, "grad_norm": 1.3588689129824036, "kl": 0.41796875, "learning_rate": 6.339590443686008e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.03160685067996383, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 98.58594131469727, "epoch": 4.409556313993174, "grad_norm": 1.7218740422976981, "kl": 0.4208984375, "learning_rate": 6.336746302616609e-07, "loss": 0.0004, "reward": 1.666015625, "reward_std": 0.06266415677964687, "rewards/format_reward": 1.0, "rewards/score_reward": 0.666015625, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 98.58333587646484, "epoch": 4.412969283276451, "grad_norm": 0.7167011247575605, "kl": 0.416015625, "learning_rate": 6.333902161547212e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.03969502728432417, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998697817325592, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 98.84114837646484, "epoch": 4.4163822525597265, "grad_norm": 0.5724747906389456, "kl": 0.4150390625, "learning_rate": 6.331058020477816e-07, "loss": 0.0004, "reward": 1.5911458134651184, "reward_std": 0.03511806204915047, "rewards/format_reward": 1.0, "rewards/score_reward": 0.5911458432674408, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 99.3828125, "epoch": 4.419795221843003, "grad_norm": 1.1266642082230387, "kl": 0.44140625, "learning_rate": 6.328213879408418e-07, "loss": 0.0004, "reward": 1.7467448115348816, "reward_std": 0.020122936461120844, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467447817325592, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 97.86719131469727, "epoch": 4.42320819112628, "grad_norm": 0.9844172041740683, "kl": 0.427734375, "learning_rate": 6.325369738339022e-07, "loss": 0.0004, "reward": 1.732421875, "reward_std": 0.03430242044851184, "rewards/format_reward": 1.0, "rewards/score_reward": 0.732421875, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 97.33073043823242, "epoch": 4.426621160409557, "grad_norm": 0.8865550557091507, "kl": 0.4111328125, "learning_rate": 6.322525597269625e-07, "loss": 0.0004, "reward": 1.7545573115348816, "reward_std": 0.04523842316120863, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545572817325592, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 97.9375, "epoch": 4.4300341296928325, "grad_norm": 1.171903343019065, "kl": 0.4140625, "learning_rate": 6.319681456200227e-07, "loss": 0.0004, "reward": 1.7845051884651184, "reward_std": 0.04054917022585869, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 99.34114837646484, "epoch": 4.433447098976109, "grad_norm": 0.7925820183550402, "kl": 0.41015625, "learning_rate": 6.316837315130831e-07, "loss": 0.0004, "reward": 1.7434895634651184, "reward_std": 0.04155435226857662, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7434895932674408, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 98.24479293823242, "epoch": 4.436860068259386, "grad_norm": 1.5166605278453207, "kl": 0.3984375, "learning_rate": 6.313993174061433e-07, "loss": 0.0004, "reward": 1.84765625, "reward_std": 0.044363681226968765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84765625, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 97.33333587646484, "epoch": 4.440273037542662, "grad_norm": 1.1526923090084769, "kl": 0.4296875, "learning_rate": 6.311149032992036e-07, "loss": 0.0004, "reward": 1.8404948115348816, "reward_std": 0.0431810449808836, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8404947817325592, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 97.98437881469727, "epoch": 4.4436860068259385, "grad_norm": 5.419446512471501, "kl": 0.4169921875, "learning_rate": 6.308304891922639e-07, "loss": 0.0004, "reward": 1.8255208134651184, "reward_std": 0.04999507777392864, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8255208432674408, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 97.52083587646484, "epoch": 4.447098976109215, "grad_norm": 0.6480267249520083, "kl": 0.41015625, "learning_rate": 6.305460750853242e-07, "loss": 0.0004, "reward": 1.7727864980697632, "reward_std": 0.0407006167806685, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7727864384651184, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 95.94791793823242, "epoch": 4.450511945392491, "grad_norm": 0.7557176139285633, "kl": 0.423828125, "learning_rate": 6.302616609783845e-07, "loss": 0.0004, "reward": 1.8040364980697632, "reward_std": 0.02110915444791317, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8040364384651184, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 97.96094131469727, "epoch": 4.453924914675768, "grad_norm": 0.35019595927896063, "kl": 0.416015625, "learning_rate": 6.299772468714448e-07, "loss": 0.0004, "reward": 1.7076823115348816, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076822817325592, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 98.91146087646484, "epoch": 4.4573378839590445, "grad_norm": 1.6487936217596357, "kl": 0.431640625, "learning_rate": 6.296928327645052e-07, "loss": 0.0004, "reward": 1.6263021230697632, "reward_std": 0.057138415053486824, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6263020634651184, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 96.01562881469727, "epoch": 4.460750853242321, "grad_norm": 1.0751853698242584, "kl": 0.4365234375, "learning_rate": 6.294084186575653e-07, "loss": 0.0004, "reward": 1.7454426884651184, "reward_std": 0.054646486416459084, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 95.546875, "epoch": 4.464163822525597, "grad_norm": 0.8042065646240789, "kl": 0.435546875, "learning_rate": 6.291240045506256e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.029917524196207523, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998697817325592, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 95.14062881469727, "epoch": 4.467576791808874, "grad_norm": 1.3523722624540748, "kl": 0.4365234375, "learning_rate": 6.28839590443686e-07, "loss": 0.0004, "reward": 1.6614583134651184, "reward_std": 0.07526864856481552, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6614583432674408, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 95.34635543823242, "epoch": 4.4709897610921505, "grad_norm": 0.7710252820471054, "kl": 0.4306640625, "learning_rate": 6.285551763367462e-07, "loss": 0.0004, "reward": 1.7473958730697632, "reward_std": 0.04671121947467327, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958134651184, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 94.89323043823242, "epoch": 4.474402730375426, "grad_norm": 0.7837035206965351, "kl": 0.435546875, "learning_rate": 6.282707622298066e-07, "loss": 0.0004, "reward": 1.837890625, "reward_std": 0.016572814900428057, "rewards/format_reward": 1.0, "rewards/score_reward": 0.837890625, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 95.57812881469727, "epoch": 4.477815699658703, "grad_norm": 0.6198434513590767, "kl": 0.4208984375, "learning_rate": 6.279863481228669e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.02253392618149519, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 95.82031631469727, "epoch": 4.48122866894198, "grad_norm": 1.0196495022420677, "kl": 0.4443359375, "learning_rate": 6.277019340159271e-07, "loss": 0.0004, "reward": 1.6901041865348816, "reward_std": 0.026499883271753788, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6901041567325592, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 93.95833587646484, "epoch": 4.484641638225256, "grad_norm": 0.6560140128252675, "kl": 0.4267578125, "learning_rate": 6.274175199089875e-07, "loss": 0.0004, "reward": 1.6796875, "reward_std": 0.028342368081212044, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6796875, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 96.73698425292969, "epoch": 4.488054607508532, "grad_norm": 1.357912957481918, "kl": 0.431640625, "learning_rate": 6.271331058020477e-07, "loss": 0.0004, "reward": 1.76171875, "reward_std": 0.03813777305185795, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 95.5625, "epoch": 4.491467576791809, "grad_norm": 1.5608748180764953, "kl": 0.4345703125, "learning_rate": 6.268486916951081e-07, "loss": 0.0004, "reward": 1.7415364384651184, "reward_std": 0.038402979262173176, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7415364682674408, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 93.73958587646484, "epoch": 4.494880546075085, "grad_norm": 1.3143866880769155, "kl": 0.423828125, "learning_rate": 6.265642775881683e-07, "loss": 0.0004, "reward": 1.7174478769302368, "reward_std": 0.04136481694877148, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7174479365348816, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 90.90625381469727, "epoch": 4.498293515358362, "grad_norm": 1.4379815830432598, "kl": 0.4462890625, "learning_rate": 6.262798634812286e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.01828151335939765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 93.84635543823242, "epoch": 4.501706484641638, "grad_norm": 0.8295566682119345, "kl": 0.423828125, "learning_rate": 6.25995449374289e-07, "loss": 0.0004, "reward": 1.8255208730697632, "reward_std": 0.02834236714988947, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8255208134651184, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 92.21094131469727, "epoch": 4.505119453924914, "grad_norm": 1.8360422330398352, "kl": 0.427734375, "learning_rate": 6.257110352673492e-07, "loss": 0.0004, "reward": 1.7298176884651184, "reward_std": 0.047477614134550095, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298177182674408, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 91.484375, "epoch": 4.508532423208191, "grad_norm": 2.2936711315291842, "kl": 0.4306640625, "learning_rate": 6.254266211604096e-07, "loss": 0.0004, "reward": 1.84375, "reward_std": 0.04741327092051506, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84375, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 91.98958587646484, "epoch": 4.511945392491468, "grad_norm": 1.3977928246926792, "kl": 0.4521484375, "learning_rate": 6.251422070534699e-07, "loss": 0.0005, "reward": 1.7747396230697632, "reward_std": 0.044320421293377876, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7747395634651184, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 89.22656631469727, "epoch": 4.515358361774744, "grad_norm": 1.4916800205700882, "kl": 0.4541015625, "learning_rate": 6.2485779294653e-07, "loss": 0.0005, "reward": 1.62890625, "reward_std": 0.058411093428730965, "rewards/format_reward": 1.0, "rewards/score_reward": 0.62890625, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 90.44271087646484, "epoch": 4.51877133105802, "grad_norm": 1.6977264350500958, "kl": 0.45703125, "learning_rate": 6.245733788395904e-07, "loss": 0.0005, "reward": 1.7473958134651184, "reward_std": 0.027221957221627235, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 89.96875381469727, "epoch": 4.522184300341297, "grad_norm": 1.2366728159180103, "kl": 0.4541015625, "learning_rate": 6.242889647326507e-07, "loss": 0.0005, "reward": 1.7220051884651184, "reward_std": 0.051179349422454834, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 90.55208587646484, "epoch": 4.525597269624574, "grad_norm": 0.9217477178758525, "kl": 0.47265625, "learning_rate": 6.24004550625711e-07, "loss": 0.0005, "reward": 1.8196614384651184, "reward_std": 0.05300229787826538, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8196614682674408, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 90.84635543823242, "epoch": 4.5290102389078495, "grad_norm": 3.1528365868356736, "kl": 0.4443359375, "learning_rate": 6.237201365187713e-07, "loss": 0.0004, "reward": 1.7096353769302368, "reward_std": 0.04409888759255409, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354365348816, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 91.4609375, "epoch": 4.532423208191126, "grad_norm": 0.981362306659265, "kl": 0.4951171875, "learning_rate": 6.234357224118316e-07, "loss": 0.0005, "reward": 1.75390625, "reward_std": 0.04361706832423806, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75390625, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 90.66406631469727, "epoch": 4.535836177474403, "grad_norm": 0.5893608184981463, "kl": 0.4423828125, "learning_rate": 6.231513083048919e-07, "loss": 0.0004, "reward": 1.7161458730697632, "reward_std": 0.02834236714988947, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458134651184, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 90.75521087646484, "epoch": 4.53924914675768, "grad_norm": 0.8302578342038425, "kl": 0.435546875, "learning_rate": 6.228668941979522e-07, "loss": 0.0004, "reward": 1.6927083134651184, "reward_std": 0.03897401690483093, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6927083432674408, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 91.53646087646484, "epoch": 4.5426621160409555, "grad_norm": 2.6023682933724612, "kl": 0.447265625, "learning_rate": 6.225824800910125e-07, "loss": 0.0004, "reward": 1.7708333134651184, "reward_std": 0.045237201265990734, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 90.08073425292969, "epoch": 4.546075085324232, "grad_norm": 1.7494809589166203, "kl": 0.4365234375, "learning_rate": 6.222980659840727e-07, "loss": 0.0004, "reward": 1.7330729365348816, "reward_std": 0.032747451681643724, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729067325592, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 90.84635925292969, "epoch": 4.549488054607508, "grad_norm": 1.6265211132045068, "kl": 0.44140625, "learning_rate": 6.22013651877133e-07, "loss": 0.0004, "reward": 1.7161458730697632, "reward_std": 0.04267476312816143, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458134651184, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 90.68229675292969, "epoch": 4.552901023890785, "grad_norm": 1.9299107696929607, "kl": 0.45703125, "learning_rate": 6.217292377701934e-07, "loss": 0.0005, "reward": 1.7415364384651184, "reward_std": 0.06472606211900711, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7415364682674408, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 91.31510925292969, "epoch": 4.5563139931740615, "grad_norm": 1.426437659748793, "kl": 0.4326171875, "learning_rate": 6.214448236632536e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.05799403227865696, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 91.96094131469727, "epoch": 4.559726962457338, "grad_norm": 0.48028439125164657, "kl": 0.435546875, "learning_rate": 6.21160409556314e-07, "loss": 0.0004, "reward": 1.7044270634651184, "reward_std": 0.02581737283617258, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7044270932674408, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 91.44010543823242, "epoch": 4.563139931740614, "grad_norm": 0.7449690741858731, "kl": 0.4384765625, "learning_rate": 6.208759954493743e-07, "loss": 0.0004, "reward": 1.7578125, "reward_std": 0.01799587346613407, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 93.14323425292969, "epoch": 4.566552901023891, "grad_norm": 0.8864740890096235, "kl": 0.4521484375, "learning_rate": 6.205915813424347e-07, "loss": 0.0005, "reward": 1.751953125, "reward_std": 0.06067170388996601, "rewards/format_reward": 1.0, "rewards/score_reward": 0.751953125, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 90.74739837646484, "epoch": 4.5699658703071675, "grad_norm": 0.9944528239842713, "kl": 0.4404296875, "learning_rate": 6.203071672354948e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.03189142979681492, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 92.33073425292969, "epoch": 4.573378839590443, "grad_norm": 0.9401821748035823, "kl": 0.44140625, "learning_rate": 6.200227531285551e-07, "loss": 0.0004, "reward": 1.677734375, "reward_std": 0.06858799606561661, "rewards/format_reward": 1.0, "rewards/score_reward": 0.677734375, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 91.80469131469727, "epoch": 4.57679180887372, "grad_norm": 0.6257255861693142, "kl": 0.4521484375, "learning_rate": 6.197383390216155e-07, "loss": 0.0005, "reward": 1.70703125, "reward_std": 0.029328585602343082, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7096354365348816, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 92.57552337646484, "epoch": 4.580204778156997, "grad_norm": 1.1103162132141413, "kl": 0.44140625, "learning_rate": 6.194539249146757e-07, "loss": 0.0004, "reward": 1.6276041865348816, "reward_std": 0.04846424330025911, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6276041567325592, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 93.53125381469727, "epoch": 4.5836177474402735, "grad_norm": 0.8124942296467503, "kl": 0.451171875, "learning_rate": 6.191695108077361e-07, "loss": 0.0005, "reward": 1.7565104365348816, "reward_std": 0.02573454985395074, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104067325592, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 93.0546875, "epoch": 4.587030716723549, "grad_norm": 5.869133107794814, "kl": 0.4404296875, "learning_rate": 6.188850967007964e-07, "loss": 0.0004, "reward": 1.720703125, "reward_std": 0.022685371339321136, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720703125, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 93.53385543823242, "epoch": 4.590443686006826, "grad_norm": 0.4947015501478301, "kl": 0.4423828125, "learning_rate": 6.186006825938566e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 95.54948425292969, "epoch": 4.593856655290102, "grad_norm": 16.442013787188962, "kl": 0.43359375, "learning_rate": 6.183162684869169e-07, "loss": 0.0004, "reward": 1.7845051884651184, "reward_std": 0.05021185055375099, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 93.64583587646484, "epoch": 4.597269624573379, "grad_norm": 0.8610833711316687, "kl": 0.4208984375, "learning_rate": 6.180318543799772e-07, "loss": 0.0004, "reward": 1.7473958134651184, "reward_std": 0.021831634920090437, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7473958432674408, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 95.9453125, "epoch": 4.600682593856655, "grad_norm": 0.45677156873664315, "kl": 0.4482421875, "learning_rate": 6.177474402730375e-07, "loss": 0.0004, "reward": 1.798828125, "reward_std": 0.021109154913574457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.798828125, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 96.78125381469727, "epoch": 4.604095563139932, "grad_norm": 0.7698558025429064, "kl": 0.4140625, "learning_rate": 6.174630261660978e-07, "loss": 0.0004, "reward": 1.599609375, "reward_std": 0.03813818283379078, "rewards/format_reward": 1.0, "rewards/score_reward": 0.599609375, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 96.00260543823242, "epoch": 4.607508532423208, "grad_norm": 0.581258797197305, "kl": 0.4404296875, "learning_rate": 6.171786120591581e-07, "loss": 0.0004, "reward": 1.705078125, "reward_std": 0.026500944048166275, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 97.02864837646484, "epoch": 4.610921501706485, "grad_norm": 4.230852617231304, "kl": 0.4248046875, "learning_rate": 6.168941979522184e-07, "loss": 0.0004, "reward": 1.6770833134651184, "reward_std": 0.05131313391029835, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6770833432674408, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 95.55989837646484, "epoch": 4.614334470989761, "grad_norm": 1.1181663467397165, "kl": 0.42578125, "learning_rate": 6.166097838452787e-07, "loss": 0.0004, "reward": 1.8268229365348816, "reward_std": 0.035118065774440765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229067325592, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 96.88021087646484, "epoch": 4.617747440273037, "grad_norm": 0.6997460400026313, "kl": 0.427734375, "learning_rate": 6.163253697383391e-07, "loss": 0.0004, "reward": 1.7395833134651184, "reward_std": 0.02437535021454096, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833432674408, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 99.3125, "epoch": 4.621160409556314, "grad_norm": 0.9541733956087292, "kl": 0.40625, "learning_rate": 6.160409556313992e-07, "loss": 0.0004, "reward": 1.7688801884651184, "reward_std": 0.038537174463272095, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7688802182674408, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 96.65104675292969, "epoch": 4.624573378839591, "grad_norm": 0.5040597127610037, "kl": 0.431640625, "learning_rate": 6.157565415244595e-07, "loss": 0.0004, "reward": 1.693359375, "reward_std": 0.016306546051055193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.693359375, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 97.81250381469727, "epoch": 4.627986348122867, "grad_norm": 0.8782274097007683, "kl": 0.400390625, "learning_rate": 6.154721274175199e-07, "loss": 0.0004, "reward": 1.669921875, "reward_std": 0.04736812971532345, "rewards/format_reward": 1.0, "rewards/score_reward": 0.669921875, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 97.73437881469727, "epoch": 4.631399317406143, "grad_norm": 0.7871355582809122, "kl": 0.419921875, "learning_rate": 6.151877133105801e-07, "loss": 0.0004, "reward": 1.7259114384651184, "reward_std": 0.030004863627254963, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7259114682674408, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 97.59896087646484, "epoch": 4.63481228668942, "grad_norm": 0.9666830423135261, "kl": 0.41015625, "learning_rate": 6.149032992036405e-07, "loss": 0.0004, "reward": 1.7408853769302368, "reward_std": 0.029330057092010975, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7408854365348816, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 98.25000381469727, "epoch": 4.638225255972696, "grad_norm": 1.1038705459114657, "kl": 0.416015625, "learning_rate": 6.146188850967008e-07, "loss": 0.0004, "reward": 1.78125, "reward_std": 0.021831635385751724, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 98.76302337646484, "epoch": 4.6416382252559725, "grad_norm": 1.2056657003546505, "kl": 0.419921875, "learning_rate": 6.143344709897611e-07, "loss": 0.0004, "reward": 1.7057291865348816, "reward_std": 0.04635761119425297, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7083333134651184, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 100.08854293823242, "epoch": 4.645051194539249, "grad_norm": 0.6559713422511638, "kl": 0.4345703125, "learning_rate": 6.140500568828214e-07, "loss": 0.0004, "reward": 1.6692708134651184, "reward_std": 0.04622488655149937, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6692708432674408, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 98.484375, "epoch": 4.648464163822526, "grad_norm": 0.7139529689074263, "kl": 0.41796875, "learning_rate": 6.137656427758816e-07, "loss": 0.0004, "reward": 1.6145833134651184, "reward_std": 0.030336463823914528, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6145833432674408, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 98.33854293823242, "epoch": 4.651877133105802, "grad_norm": 1.0306368316769152, "kl": 0.4248046875, "learning_rate": 6.13481228668942e-07, "loss": 0.0004, "reward": 1.740234375, "reward_std": 0.03125882148742676, "rewards/format_reward": 1.0, "rewards/score_reward": 0.740234375, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 99.65104675292969, "epoch": 4.6552901023890785, "grad_norm": 0.7566820229007295, "kl": 0.443359375, "learning_rate": 6.131968145620022e-07, "loss": 0.0004, "reward": 1.689453125, "reward_std": 0.030051066540181637, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 99.38802337646484, "epoch": 4.658703071672355, "grad_norm": 10.90224362300686, "kl": 0.4130859375, "learning_rate": 6.129124004550625e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.05593829043209553, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 99.58333587646484, "epoch": 4.662116040955631, "grad_norm": 6.643631849424821, "kl": 0.4326171875, "learning_rate": 6.126279863481229e-07, "loss": 0.0004, "reward": 1.70703125, "reward_std": 0.0555636677891016, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 99.24739837646484, "epoch": 4.665529010238908, "grad_norm": 1.2125233297936702, "kl": 0.4248046875, "learning_rate": 6.123435722411831e-07, "loss": 0.0004, "reward": 1.6595051884651184, "reward_std": 0.05043839290738106, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 99.8984375, "epoch": 4.6689419795221845, "grad_norm": 0.977642674530558, "kl": 0.4287109375, "learning_rate": 6.120591581342435e-07, "loss": 0.0004, "reward": 1.748046875, "reward_std": 0.04324515024200082, "rewards/format_reward": 1.0, "rewards/score_reward": 0.748046875, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 99.08333587646484, "epoch": 4.672354948805461, "grad_norm": 0.6074126053999939, "kl": 0.4267578125, "learning_rate": 6.117747440273038e-07, "loss": 0.0004, "reward": 1.7395833134651184, "reward_std": 0.042694306932389736, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833432674408, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 100.14323043823242, "epoch": 4.675767918088737, "grad_norm": 0.7264166587656632, "kl": 0.42578125, "learning_rate": 6.114903299203639e-07, "loss": 0.0004, "reward": 1.7942708134651184, "reward_std": 0.04894232004880905, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.796875, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 99.51302337646484, "epoch": 4.679180887372014, "grad_norm": 2.8724452343269657, "kl": 0.416015625, "learning_rate": 6.112059158134243e-07, "loss": 0.0004, "reward": 1.720703125, "reward_std": 0.03835906460881233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720703125, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 98.43489837646484, "epoch": 4.6825938566552905, "grad_norm": 1.1096753308722156, "kl": 0.416015625, "learning_rate": 6.109215017064846e-07, "loss": 0.0004, "reward": 1.76171875, "reward_std": 0.04876942280679941, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 97.80989837646484, "epoch": 4.686006825938566, "grad_norm": 0.9506581156969636, "kl": 0.431640625, "learning_rate": 6.106370875995449e-07, "loss": 0.0004, "reward": 1.8248697519302368, "reward_std": 0.023672241251915693, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8248698115348816, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 98.00260543823242, "epoch": 4.689419795221843, "grad_norm": 0.8276984179828265, "kl": 0.431640625, "learning_rate": 6.103526734926052e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.029330055695027113, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 97.83073425292969, "epoch": 4.69283276450512, "grad_norm": 0.41422739517189755, "kl": 0.41015625, "learning_rate": 6.100682593856655e-07, "loss": 0.0004, "reward": 1.810546875, "reward_std": 0.022401200607419014, "rewards/format_reward": 1.0, "rewards/score_reward": 0.810546875, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 98.82031631469727, "epoch": 4.696245733788396, "grad_norm": 0.652634635218751, "kl": 0.4287109375, "learning_rate": 6.097838452787258e-07, "loss": 0.0004, "reward": 1.732421875, "reward_std": 0.035794343799352646, "rewards/format_reward": 1.0, "rewards/score_reward": 0.732421875, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 96.64323043823242, "epoch": 4.699658703071672, "grad_norm": 1.3657566597361932, "kl": 0.42578125, "learning_rate": 6.09499431171786e-07, "loss": 0.0004, "reward": 1.7447916865348816, "reward_std": 0.04225787054747343, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916567325592, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 96.78385925292969, "epoch": 4.703071672354949, "grad_norm": 0.576781178766095, "kl": 0.4013671875, "learning_rate": 6.092150170648464e-07, "loss": 0.0004, "reward": 1.7981771230697632, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7981770634651184, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 97.0703125, "epoch": 4.706484641638225, "grad_norm": 0.505185407085331, "kl": 0.4208984375, "learning_rate": 6.089306029579066e-07, "loss": 0.0004, "reward": 1.7669270634651184, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270932674408, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 95.39583587646484, "epoch": 4.709897610921502, "grad_norm": 0.7178700644507451, "kl": 0.4189453125, "learning_rate": 6.086461888509669e-07, "loss": 0.0004, "reward": 1.6555989980697632, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989384651184, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 97.25521087646484, "epoch": 4.713310580204778, "grad_norm": 1.1622268068941235, "kl": 0.4453125, "learning_rate": 6.083617747440273e-07, "loss": 0.0004, "reward": 1.7174479365348816, "reward_std": 0.05714152380824089, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7174479067325592, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 97.63542175292969, "epoch": 4.716723549488055, "grad_norm": 1.2902880909827412, "kl": 0.435546875, "learning_rate": 6.080773606370876e-07, "loss": 0.0004, "reward": 1.7044270634651184, "reward_std": 0.0399602334946394, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7044270932674408, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 96.66667175292969, "epoch": 4.720136518771331, "grad_norm": 3.239649127050632, "kl": 0.435546875, "learning_rate": 6.077929465301479e-07, "loss": 0.0004, "reward": 1.76953125, "reward_std": 0.058348460122942924, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76953125, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 97.83073043823242, "epoch": 4.723549488054608, "grad_norm": 1.432265053150509, "kl": 0.431640625, "learning_rate": 6.075085324232082e-07, "loss": 0.0004, "reward": 1.693359375, "reward_std": 0.06873102858662605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6959635615348816, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 95.95573043823242, "epoch": 4.726962457337884, "grad_norm": 4.678705010020665, "kl": 0.4228515625, "learning_rate": 6.072241183162685e-07, "loss": 0.0004, "reward": 1.6946614384651184, "reward_std": 0.03458805941045284, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 96.91146087646484, "epoch": 4.73037542662116, "grad_norm": 2.343692143856308, "kl": 0.43359375, "learning_rate": 6.069397042093287e-07, "loss": 0.0004, "reward": 1.6875, "reward_std": 0.040511325001716614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6875, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 97.39062881469727, "epoch": 4.733788395904437, "grad_norm": 0.7266721970712645, "kl": 0.40625, "learning_rate": 6.06655290102389e-07, "loss": 0.0004, "reward": 1.71484375, "reward_std": 0.03204845357686281, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71484375, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 96.68489837646484, "epoch": 4.737201365187714, "grad_norm": 1.7640089374168306, "kl": 0.4189453125, "learning_rate": 6.063708759954494e-07, "loss": 0.0004, "reward": 1.7571614980697632, "reward_std": 0.07583952136337757, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7571614384651184, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 96.23177337646484, "epoch": 4.7406143344709895, "grad_norm": 0.33725169585085985, "kl": 0.4140625, "learning_rate": 6.060864618885096e-07, "loss": 0.0004, "reward": 1.7779948115348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7779947817325592, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 95.15364837646484, "epoch": 4.744027303754266, "grad_norm": 0.9117473835284118, "kl": 0.443359375, "learning_rate": 6.0580204778157e-07, "loss": 0.0004, "reward": 1.7200521230697632, "reward_std": 0.02805819734930992, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520634651184, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 97.70833587646484, "epoch": 4.747440273037543, "grad_norm": 1.679298129686855, "kl": 0.40625, "learning_rate": 6.055176336746303e-07, "loss": 0.0004, "reward": 1.705078125, "reward_std": 0.05538580007851124, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 97.53125381469727, "epoch": 4.750853242320819, "grad_norm": 1.096432764703066, "kl": 0.4140625, "learning_rate": 6.052332195676905e-07, "loss": 0.0004, "reward": 1.6966145634651184, "reward_std": 0.03458699956536293, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145932674408, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 97.94271087646484, "epoch": 4.7542662116040955, "grad_norm": 0.9486868269080506, "kl": 0.4248046875, "learning_rate": 6.049488054607508e-07, "loss": 0.0004, "reward": 1.7272135019302368, "reward_std": 0.04543830454349518, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7272135615348816, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 95.60417175292969, "epoch": 4.757679180887372, "grad_norm": 1.1625875437126572, "kl": 0.3994140625, "learning_rate": 6.046643913538111e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.03202521521598101, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 98.65104675292969, "epoch": 4.761092150170649, "grad_norm": 1.2058596957037817, "kl": 0.4013671875, "learning_rate": 6.043799772468714e-07, "loss": 0.0004, "reward": 1.6940104365348816, "reward_std": 0.051028148271143436, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6940104067325592, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 98.30208587646484, "epoch": 4.764505119453925, "grad_norm": 0.8248190394008932, "kl": 0.416015625, "learning_rate": 6.040955631399317e-07, "loss": 0.0004, "reward": 1.763671875, "reward_std": 0.034607595298439264, "rewards/format_reward": 1.0, "rewards/score_reward": 0.763671875, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 97.9296875, "epoch": 4.7679180887372015, "grad_norm": 4.701856991776377, "kl": 0.4150390625, "learning_rate": 6.03811149032992e-07, "loss": 0.0004, "reward": 1.8118489384651184, "reward_std": 0.04230736568570137, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8118489682674408, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 99.01041793823242, "epoch": 4.771331058020478, "grad_norm": 1.9096600085459667, "kl": 0.4140625, "learning_rate": 6.035267349260523e-07, "loss": 0.0004, "reward": 1.8072916865348816, "reward_std": 0.04009296000003815, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8072916567325592, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 99.83073425292969, "epoch": 4.774744027303754, "grad_norm": 1.2871909309963556, "kl": 0.4052734375, "learning_rate": 6.032423208191126e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.046206166967749596, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 100.03125381469727, "epoch": 4.778156996587031, "grad_norm": 2.1089319735065537, "kl": 0.396484375, "learning_rate": 6.02957906712173e-07, "loss": 0.0004, "reward": 1.83984375, "reward_std": 0.07140900008380413, "rewards/format_reward": 1.0, "rewards/score_reward": 0.83984375, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 100.5390625, "epoch": 4.7815699658703075, "grad_norm": 1.3103486097310797, "kl": 0.4091796875, "learning_rate": 6.026734926052331e-07, "loss": 0.0004, "reward": 1.7962239384651184, "reward_std": 0.0335824703797698, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7962239682674408, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 104.09896087646484, "epoch": 4.784982935153583, "grad_norm": 1.2410553427030198, "kl": 0.390625, "learning_rate": 6.023890784982934e-07, "loss": 0.0004, "reward": 1.7174478769302368, "reward_std": 0.060955459251999855, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7200520932674408, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 101.89062881469727, "epoch": 4.78839590443686, "grad_norm": 0.707913366688351, "kl": 0.3994140625, "learning_rate": 6.021046643913538e-07, "loss": 0.0004, "reward": 1.7552083730697632, "reward_std": 0.04463225603103638, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7578125, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 101.35416793823242, "epoch": 4.791808873720137, "grad_norm": 0.7290632785114851, "kl": 0.400390625, "learning_rate": 6.018202502844141e-07, "loss": 0.0004, "reward": 1.7565103769302368, "reward_std": 0.036562207620590925, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104365348816, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 102.19271087646484, "epoch": 4.795221843003413, "grad_norm": 0.4642745389520104, "kl": 0.412109375, "learning_rate": 6.015358361774744e-07, "loss": 0.0004, "reward": 1.7610677480697632, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610676884651184, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 100.30208587646484, "epoch": 4.798634812286689, "grad_norm": 0.6118299570818825, "kl": 0.3935546875, "learning_rate": 6.012514220705347e-07, "loss": 0.0004, "reward": 1.7037760615348816, "reward_std": 0.021109154913574457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7037760317325592, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 103.24479675292969, "epoch": 4.802047781569966, "grad_norm": 0.8038771658630028, "kl": 0.392578125, "learning_rate": 6.00967007963595e-07, "loss": 0.0004, "reward": 1.69921875, "reward_std": 0.03517552465200424, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 99.921875, "epoch": 4.805460750853243, "grad_norm": 0.8122963859407988, "kl": 0.39453125, "learning_rate": 6.006825938566553e-07, "loss": 0.0004, "reward": 1.7415364980697632, "reward_std": 0.03386663785204291, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7415364384651184, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 102.21094131469727, "epoch": 4.808873720136519, "grad_norm": 1.128518244010968, "kl": 0.38671875, "learning_rate": 6.003981797497155e-07, "loss": 0.0004, "reward": 1.7369791269302368, "reward_std": 0.0803380124270916, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7369791865348816, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 100.171875, "epoch": 4.812286689419795, "grad_norm": 0.3111631682180195, "kl": 0.400390625, "learning_rate": 6.001137656427759e-07, "loss": 0.0004, "reward": 1.71484375, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71484375, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 99.61979293823242, "epoch": 4.815699658703072, "grad_norm": 1.6365658878958564, "kl": 0.3955078125, "learning_rate": 5.998293515358361e-07, "loss": 0.0004, "reward": 1.7434895634651184, "reward_std": 0.028209641575813293, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7434895932674408, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 101.21354293823242, "epoch": 4.819112627986348, "grad_norm": 1.048834662030451, "kl": 0.3876953125, "learning_rate": 5.995449374288964e-07, "loss": 0.0004, "reward": 1.7454426884651184, "reward_std": 0.05146580655127764, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 100.04167175292969, "epoch": 4.822525597269625, "grad_norm": 1.5428988283893228, "kl": 0.4052734375, "learning_rate": 5.992605233219568e-07, "loss": 0.0004, "reward": 1.697265625, "reward_std": 0.06422100774943829, "rewards/format_reward": 1.0, "rewards/score_reward": 0.697265625, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 98.60937881469727, "epoch": 4.825938566552901, "grad_norm": 0.8229260932416913, "kl": 0.4189453125, "learning_rate": 5.98976109215017e-07, "loss": 0.0004, "reward": 1.6940104365348816, "reward_std": 0.0454572681337595, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6940104067325592, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 98.76302337646484, "epoch": 4.829351535836177, "grad_norm": 2.493702426156892, "kl": 0.4248046875, "learning_rate": 5.986916951080774e-07, "loss": 0.0004, "reward": 1.7122396230697632, "reward_std": 0.0709039457142353, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395634651184, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 97.86719131469727, "epoch": 4.832764505119454, "grad_norm": 2.4110677704881653, "kl": 0.400390625, "learning_rate": 5.984072810011376e-07, "loss": 0.0004, "reward": 1.7688801884651184, "reward_std": 0.06686931103467941, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7688802182674408, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 97.41666793823242, "epoch": 4.836177474402731, "grad_norm": 1.244389545105803, "kl": 0.400390625, "learning_rate": 5.981228668941978e-07, "loss": 0.0004, "reward": 1.8203125, "reward_std": 0.0522546237334609, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8203125, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 98.00260543823242, "epoch": 4.839590443686006, "grad_norm": 1.429833440683009, "kl": 0.41015625, "learning_rate": 5.978384527872582e-07, "loss": 0.0004, "reward": 1.6640625, "reward_std": 0.060954807326197624, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6640625, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 98.47396087646484, "epoch": 4.843003412969283, "grad_norm": 1.0791492534611171, "kl": 0.4091796875, "learning_rate": 5.975540386803185e-07, "loss": 0.0004, "reward": 1.7884114384651184, "reward_std": 0.043529318645596504, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7884114682674408, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 100.19271087646484, "epoch": 4.84641638225256, "grad_norm": 1.3627941341949168, "kl": 0.4169921875, "learning_rate": 5.972696245733788e-07, "loss": 0.0004, "reward": 1.65234375, "reward_std": 0.04278958961367607, "rewards/format_reward": 1.0, "rewards/score_reward": 0.65234375, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 98.91927337646484, "epoch": 4.849829351535837, "grad_norm": 1.007768125904751, "kl": 0.3935546875, "learning_rate": 5.969852104664391e-07, "loss": 0.0004, "reward": 1.7578125, "reward_std": 0.028209643438458443, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 99.0390625, "epoch": 4.853242320819112, "grad_norm": 0.8214136640499211, "kl": 0.470703125, "learning_rate": 5.967007963594995e-07, "loss": 0.0005, "reward": 1.6796875, "reward_std": 0.039978787302970886, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6796875, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 97.82031631469727, "epoch": 4.856655290102389, "grad_norm": 0.72330213791312, "kl": 0.404296875, "learning_rate": 5.964163822525598e-07, "loss": 0.0004, "reward": 1.716796875, "reward_std": 0.029899622313678265, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7194010615348816, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 96.3203125, "epoch": 4.860068259385666, "grad_norm": 1.1174903274180608, "kl": 0.4150390625, "learning_rate": 5.961319681456199e-07, "loss": 0.0004, "reward": 1.7265625, "reward_std": 0.061980584636330605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7291666567325592, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 97.60416793823242, "epoch": 4.863481228668942, "grad_norm": 1.0018498614709663, "kl": 0.423828125, "learning_rate": 5.958475540386803e-07, "loss": 0.0004, "reward": 1.7682291865348816, "reward_std": 0.03656155616044998, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 96.41146087646484, "epoch": 4.8668941979522184, "grad_norm": 1.069428498432776, "kl": 0.4091796875, "learning_rate": 5.955631399317406e-07, "loss": 0.0004, "reward": 1.6744791269302368, "reward_std": 0.03884129133075476, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6744791865348816, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 95.82812881469727, "epoch": 4.870307167235495, "grad_norm": 2.998172950445194, "kl": 0.4384765625, "learning_rate": 5.952787258248009e-07, "loss": 0.0004, "reward": 1.740234375, "reward_std": 0.02735590562224388, "rewards/format_reward": 1.0, "rewards/score_reward": 0.740234375, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 97.09635925292969, "epoch": 4.873720136518771, "grad_norm": 0.5603073028910545, "kl": 0.4111328125, "learning_rate": 5.949943117178612e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.020692503545433283, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 96.01302337646484, "epoch": 4.877133105802048, "grad_norm": 1.2620972898705076, "kl": 0.400390625, "learning_rate": 5.947098976109215e-07, "loss": 0.0004, "reward": 1.7096354365348816, "reward_std": 0.060822732746601105, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354067325592, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 97.5703125, "epoch": 4.8805460750853245, "grad_norm": 2.9842053273325257, "kl": 0.40625, "learning_rate": 5.944254835039818e-07, "loss": 0.0004, "reward": 1.7467447519302368, "reward_std": 0.030905211344361305, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467448115348816, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 96.10937881469727, "epoch": 4.8839590443686, "grad_norm": 6.38155350630668, "kl": 0.4150390625, "learning_rate": 5.941410693970421e-07, "loss": 0.0004, "reward": 1.7864583134651184, "reward_std": 0.029462780803442, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 96.35937881469727, "epoch": 4.887372013651877, "grad_norm": 1.4846253104097276, "kl": 0.3896484375, "learning_rate": 5.938566552901024e-07, "loss": 0.0004, "reward": 1.8704427480697632, "reward_std": 0.032626137137413025, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8704426884651184, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 95.22916793823242, "epoch": 4.890784982935154, "grad_norm": 1.5229055607390816, "kl": 0.4140625, "learning_rate": 5.935722411831626e-07, "loss": 0.0004, "reward": 1.7350260615348816, "reward_std": 0.04438264574855566, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 94.80989837646484, "epoch": 4.8941979522184305, "grad_norm": 1.9888455069139648, "kl": 0.4365234375, "learning_rate": 5.932878270762229e-07, "loss": 0.0004, "reward": 1.7643229365348816, "reward_std": 0.04719238728284836, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7643229067325592, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 95.66146087646484, "epoch": 4.897610921501706, "grad_norm": 0.9466029105315145, "kl": 0.4072265625, "learning_rate": 5.930034129692833e-07, "loss": 0.0004, "reward": 1.7545573115348816, "reward_std": 0.03232933208346367, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545572817325592, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 96.54167175292969, "epoch": 4.901023890784983, "grad_norm": 0.8908382189475881, "kl": 0.41796875, "learning_rate": 5.927189988623435e-07, "loss": 0.0004, "reward": 1.7591145634651184, "reward_std": 0.03884047269821167, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145932674408, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 97.41927337646484, "epoch": 4.90443686006826, "grad_norm": 1.7020380837330389, "kl": 0.4326171875, "learning_rate": 5.924345847554039e-07, "loss": 0.0004, "reward": 1.77734375, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.77734375, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 96.99739837646484, "epoch": 4.907849829351536, "grad_norm": 1.2976740633576413, "kl": 0.3955078125, "learning_rate": 5.921501706484642e-07, "loss": 0.0004, "reward": 1.8255208134651184, "reward_std": 0.06466089375317097, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8255208432674408, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 96.02344131469727, "epoch": 4.911262798634812, "grad_norm": 0.3370451206969518, "kl": 0.3935546875, "learning_rate": 5.918657565415244e-07, "loss": 0.0004, "reward": 1.7213541865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 97.56771087646484, "epoch": 4.914675767918089, "grad_norm": 2.194190085351579, "kl": 0.41796875, "learning_rate": 5.915813424345847e-07, "loss": 0.0004, "reward": 1.6979166269302368, "reward_std": 0.03314562886953354, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6979166865348816, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 96.16927337646484, "epoch": 4.918088737201365, "grad_norm": 0.8320090580747795, "kl": 0.41796875, "learning_rate": 5.91296928327645e-07, "loss": 0.0004, "reward": 1.7330729365348816, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729067325592, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 98.01302337646484, "epoch": 4.921501706484642, "grad_norm": 2.0997252783415643, "kl": 0.408203125, "learning_rate": 5.910125142207054e-07, "loss": 0.0004, "reward": 1.7298177480697632, "reward_std": 0.05854504182934761, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298176884651184, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 98.10677337646484, "epoch": 4.924914675767918, "grad_norm": 0.9318459886320095, "kl": 0.3994140625, "learning_rate": 5.907281001137656e-07, "loss": 0.0004, "reward": 1.67578125, "reward_std": 0.03897401504218578, "rewards/format_reward": 1.0, "rewards/score_reward": 0.67578125, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 95.50260925292969, "epoch": 4.928327645051194, "grad_norm": 1.2486323793458989, "kl": 0.3876953125, "learning_rate": 5.904436860068259e-07, "loss": 0.0004, "reward": 1.7350260615348816, "reward_std": 0.04778214357793331, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 95.66406631469727, "epoch": 4.931740614334471, "grad_norm": 0.2909851878694239, "kl": 0.4013671875, "learning_rate": 5.901592718998863e-07, "loss": 0.0004, "reward": 1.7526041865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7526041567325592, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 97.58333587646484, "epoch": 4.935153583617748, "grad_norm": 1.749546410190421, "kl": 0.4072265625, "learning_rate": 5.898748577929465e-07, "loss": 0.0004, "reward": 1.6653646230697632, "reward_std": 0.031694844365119934, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6653645634651184, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 97.65364837646484, "epoch": 4.938566552901024, "grad_norm": 1.572364613432202, "kl": 0.4111328125, "learning_rate": 5.895904436860068e-07, "loss": 0.0004, "reward": 1.6920572519302368, "reward_std": 0.03842316707596183, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920573115348816, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 98.8671875, "epoch": 4.9419795221843, "grad_norm": 0.4683186988304473, "kl": 0.3935546875, "learning_rate": 5.893060295790671e-07, "loss": 0.0004, "reward": 1.7975260615348816, "reward_std": 0.023520145565271378, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7975260317325592, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 95.14062881469727, "epoch": 4.945392491467577, "grad_norm": 0.54694883660232, "kl": 0.39453125, "learning_rate": 5.890216154721273e-07, "loss": 0.0004, "reward": 1.7376302480697632, "reward_std": 0.02353951521217823, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7376301884651184, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 97.65885543823242, "epoch": 4.948805460750854, "grad_norm": 1.6864431111096458, "kl": 0.400390625, "learning_rate": 5.887372013651877e-07, "loss": 0.0004, "reward": 1.6692708730697632, "reward_std": 0.04543936438858509, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6692708134651184, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 98.72656631469727, "epoch": 4.952218430034129, "grad_norm": 1.2755379897547603, "kl": 0.3955078125, "learning_rate": 5.88452787258248e-07, "loss": 0.0004, "reward": 1.7239583730697632, "reward_std": 0.032613092102110386, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583134651184, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 98.17448425292969, "epoch": 4.955631399317406, "grad_norm": 0.7050820046645382, "kl": 0.3876953125, "learning_rate": 5.881683731513083e-07, "loss": 0.0004, "reward": 1.6614583134651184, "reward_std": 0.042742734774947166, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6614583432674408, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 99.82031631469727, "epoch": 4.959044368600683, "grad_norm": 0.7083038779851591, "kl": 0.4091796875, "learning_rate": 5.878839590443686e-07, "loss": 0.0004, "reward": 1.791015625, "reward_std": 0.0302711334079504, "rewards/format_reward": 1.0, "rewards/score_reward": 0.791015625, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 96.70052337646484, "epoch": 4.962457337883959, "grad_norm": 1.5693395881151475, "kl": 0.404296875, "learning_rate": 5.875995449374289e-07, "loss": 0.0004, "reward": 1.66015625, "reward_std": 0.0560015719383955, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 96.71094131469727, "epoch": 4.965870307167235, "grad_norm": 0.7564739459194788, "kl": 0.392578125, "learning_rate": 5.873151308304891e-07, "loss": 0.0004, "reward": 1.740234375, "reward_std": 0.027442432940006256, "rewards/format_reward": 1.0, "rewards/score_reward": 0.740234375, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 97.20833587646484, "epoch": 4.969283276450512, "grad_norm": 0.4302711878506189, "kl": 0.3994140625, "learning_rate": 5.870307167235494e-07, "loss": 0.0004, "reward": 1.7024739980697632, "reward_std": 0.02253392618149519, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.705078125, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 96.96614837646484, "epoch": 4.972696245733788, "grad_norm": 18.90795687378281, "kl": 0.4091796875, "learning_rate": 5.867463026166098e-07, "loss": 0.0004, "reward": 1.771484375, "reward_std": 0.03708638763055205, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 98.69271087646484, "epoch": 4.976109215017065, "grad_norm": 1.5934608474176086, "kl": 0.41015625, "learning_rate": 5.8646188850967e-07, "loss": 0.0004, "reward": 1.724609375, "reward_std": 0.021241880720481277, "rewards/format_reward": 1.0, "rewards/score_reward": 0.724609375, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 98.39844131469727, "epoch": 4.979522184300341, "grad_norm": 0.8644935470387539, "kl": 0.4130859375, "learning_rate": 5.861774744027303e-07, "loss": 0.0004, "reward": 1.7493489384651184, "reward_std": 0.04054917115718126, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7493489682674408, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 98.08333587646484, "epoch": 4.982935153583618, "grad_norm": 0.9911585402656481, "kl": 0.4208984375, "learning_rate": 5.858930602957907e-07, "loss": 0.0004, "reward": 1.716796875, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.716796875, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 97.15104675292969, "epoch": 4.986348122866894, "grad_norm": 1.713625854767094, "kl": 0.4033203125, "learning_rate": 5.856086461888509e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.059399839490652084, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 96.40885543823242, "epoch": 4.989761092150171, "grad_norm": 0.9964964311782645, "kl": 0.42578125, "learning_rate": 5.853242320819113e-07, "loss": 0.0004, "reward": 1.7916666865348816, "reward_std": 0.024375351145863533, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 99.32812881469727, "epoch": 4.993174061433447, "grad_norm": 1.5402348818173075, "kl": 0.3916015625, "learning_rate": 5.850398179749715e-07, "loss": 0.0004, "reward": 1.7115885019302368, "reward_std": 0.04425155371427536, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885615348816, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 97.58333969116211, "epoch": 4.996587030716723, "grad_norm": 1.6095569938393237, "kl": 0.421875, "learning_rate": 5.847554038680317e-07, "loss": 0.0004, "reward": 1.7375000715255737, "reward_std": 0.08638019859790802, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7375000417232513, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 98.8046875, "epoch": 5.003412969283277, "grad_norm": 2.338315117666755, "kl": 0.3916015625, "learning_rate": 5.844709897610921e-07, "loss": 0.0004, "reward": 1.759765625, "reward_std": 0.020691442070528865, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 96.66667175292969, "epoch": 5.006825938566553, "grad_norm": 1.1045714250154806, "kl": 0.404296875, "learning_rate": 5.841865756541524e-07, "loss": 0.0004, "reward": 1.783203125, "reward_std": 0.06479032710194588, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 97.16406631469727, "epoch": 5.010238907849829, "grad_norm": 1.1519708479404924, "kl": 0.4052734375, "learning_rate": 5.839021615472128e-07, "loss": 0.0004, "reward": 1.6946614384651184, "reward_std": 0.039561483077704906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6946614682674408, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 99.0078125, "epoch": 5.013651877133106, "grad_norm": 1.1466957585405748, "kl": 0.396484375, "learning_rate": 5.83617747440273e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.02524821273982525, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 98.77864837646484, "epoch": 5.017064846416382, "grad_norm": 0.9770536317101751, "kl": 0.4150390625, "learning_rate": 5.833333333333334e-07, "loss": 0.0004, "reward": 1.7317708134651184, "reward_std": 0.026653614826500416, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7317708432674408, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 98.94010543823242, "epoch": 5.020477815699659, "grad_norm": 0.4739165220666893, "kl": 0.4248046875, "learning_rate": 5.830489192263937e-07, "loss": 0.0004, "reward": 1.8138021230697632, "reward_std": 0.021962891798466444, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8138020634651184, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 97.6796875, "epoch": 5.023890784982935, "grad_norm": 0.8934875105000375, "kl": 0.423828125, "learning_rate": 5.827645051194538e-07, "loss": 0.0004, "reward": 1.7805989980697632, "reward_std": 0.027355907950550318, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7805989384651184, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 100.01302337646484, "epoch": 5.027303754266212, "grad_norm": 17.47916939613515, "kl": 0.416015625, "learning_rate": 5.824800910125142e-07, "loss": 0.0004, "reward": 1.7688801884651184, "reward_std": 0.043662043288350105, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7688802182674408, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 99.51823425292969, "epoch": 5.030716723549488, "grad_norm": 0.7764038909728278, "kl": 0.423828125, "learning_rate": 5.821956769055745e-07, "loss": 0.0004, "reward": 1.798828125, "reward_std": 0.018717535538598895, "rewards/format_reward": 1.0, "rewards/score_reward": 0.798828125, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 98.37760925292969, "epoch": 5.034129692832765, "grad_norm": 0.7333248644589926, "kl": 0.3974609375, "learning_rate": 5.819112627986348e-07, "loss": 0.0004, "reward": 1.767578125, "reward_std": 0.024658459704369307, "rewards/format_reward": 1.0, "rewards/score_reward": 0.767578125, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 100.3984375, "epoch": 5.037542662116041, "grad_norm": 0.7041414187813313, "kl": 0.4140625, "learning_rate": 5.816268486916951e-07, "loss": 0.0004, "reward": 1.6673176884651184, "reward_std": 0.013876184821128845, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6673177182674408, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 100.07552337646484, "epoch": 5.040955631399317, "grad_norm": 1.7602453846326973, "kl": 0.3955078125, "learning_rate": 5.813424345847554e-07, "loss": 0.0004, "reward": 1.7623698115348816, "reward_std": 0.030885839834809303, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623697817325592, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 101.11198043823242, "epoch": 5.044368600682594, "grad_norm": 0.550413079693051, "kl": 0.40625, "learning_rate": 5.810580204778157e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.01828151335939765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 99.11458587646484, "epoch": 5.047781569965871, "grad_norm": 4.757214173567049, "kl": 0.4091796875, "learning_rate": 5.80773606370876e-07, "loss": 0.0004, "reward": 1.6783853769302368, "reward_std": 0.025647209491580725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854365348816, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 100.51302337646484, "epoch": 5.051194539249146, "grad_norm": 0.968761161334896, "kl": 0.40625, "learning_rate": 5.804891922639363e-07, "loss": 0.0004, "reward": 1.7291666865348816, "reward_std": 0.03529116744175553, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666567325592, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 100.65364837646484, "epoch": 5.054607508532423, "grad_norm": 0.8508232043450387, "kl": 0.4052734375, "learning_rate": 5.802047781569965e-07, "loss": 0.0004, "reward": 1.84765625, "reward_std": 0.033100245986133814, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84765625, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 99.01823425292969, "epoch": 5.0580204778157, "grad_norm": 5.596536676289623, "kl": 0.4169921875, "learning_rate": 5.799203640500568e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.05366002582013607, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 100.5859375, "epoch": 5.061433447098976, "grad_norm": 1.498903251493927, "kl": 0.4013671875, "learning_rate": 5.796359499431172e-07, "loss": 0.0004, "reward": 1.7747395634651184, "reward_std": 0.04552670568227768, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7747395932674408, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 100.65625381469727, "epoch": 5.064846416382252, "grad_norm": 0.6179727253616806, "kl": 0.4208984375, "learning_rate": 5.793515358361774e-07, "loss": 0.0004, "reward": 1.8385416865348816, "reward_std": 0.028209643438458443, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8385416567325592, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 100.51042175292969, "epoch": 5.068259385665529, "grad_norm": 1.40217230908906, "kl": 0.41015625, "learning_rate": 5.790671217292378e-07, "loss": 0.0004, "reward": 1.73828125, "reward_std": 0.06777006387710571, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73828125, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 100.49219131469727, "epoch": 5.071672354948806, "grad_norm": 2.040519886227816, "kl": 0.404296875, "learning_rate": 5.787827076222981e-07, "loss": 0.0004, "reward": 1.6848958730697632, "reward_std": 0.03174104634672403, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6848958134651184, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 100.05469131469727, "epoch": 5.075085324232082, "grad_norm": 0.3000502641090059, "kl": 0.4033203125, "learning_rate": 5.784982935153582e-07, "loss": 0.0004, "reward": 1.6536458134651184, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6536458432674408, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 99.54687881469727, "epoch": 5.078498293515358, "grad_norm": 0.4234507742480492, "kl": 0.4013671875, "learning_rate": 5.782138794084186e-07, "loss": 0.0004, "reward": 1.6555989384651184, "reward_std": 0.013876184821128845, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6555989682674408, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 101.44271087646484, "epoch": 5.081911262798635, "grad_norm": 1.4688750817723668, "kl": 0.4140625, "learning_rate": 5.779294653014789e-07, "loss": 0.0004, "reward": 1.6783854365348816, "reward_std": 0.04729909636080265, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 102.71875381469727, "epoch": 5.085324232081911, "grad_norm": 0.496687648581903, "kl": 0.388671875, "learning_rate": 5.776450511945393e-07, "loss": 0.0004, "reward": 1.7083333134651184, "reward_std": 0.018147969618439674, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7083333432674408, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 104.05729675292969, "epoch": 5.088737201365188, "grad_norm": 3.640971605735476, "kl": 0.42578125, "learning_rate": 5.773606370875995e-07, "loss": 0.0004, "reward": 1.71484375, "reward_std": 0.04118670802563429, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71484375, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 100.92969131469727, "epoch": 5.092150170648464, "grad_norm": 0.7949442546978286, "kl": 0.41015625, "learning_rate": 5.770762229806598e-07, "loss": 0.0004, "reward": 1.6959635615348816, "reward_std": 0.04445143649354577, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 103.70052337646484, "epoch": 5.09556313993174, "grad_norm": 1.1572187644699827, "kl": 0.384765625, "learning_rate": 5.767918088737202e-07, "loss": 0.0004, "reward": 1.66015625, "reward_std": 0.04510528780519962, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 104.03125, "epoch": 5.098976109215017, "grad_norm": 0.971109503700208, "kl": 0.412109375, "learning_rate": 5.765073947667804e-07, "loss": 0.0004, "reward": 1.73828125, "reward_std": 0.05615301709622145, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73828125, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 104.70052337646484, "epoch": 5.102389078498294, "grad_norm": 0.4947642236989346, "kl": 0.4052734375, "learning_rate": 5.762229806598407e-07, "loss": 0.0004, "reward": 1.828125, "reward_std": 0.02919732965528965, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.8333333432674408, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 104.96875381469727, "epoch": 5.1058020477815695, "grad_norm": 3.7726098727728017, "kl": 0.3974609375, "learning_rate": 5.75938566552901e-07, "loss": 0.0004, "reward": 1.7428385019302368, "reward_std": 0.039389848709106445, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385615348816, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 104.79948043823242, "epoch": 5.109215017064846, "grad_norm": 0.8556678227380005, "kl": 0.40234375, "learning_rate": 5.756541524459612e-07, "loss": 0.0004, "reward": 1.8131510019302368, "reward_std": 0.04390164650976658, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8131510615348816, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 104.42448043823242, "epoch": 5.112627986348123, "grad_norm": 0.8916887012670424, "kl": 0.4013671875, "learning_rate": 5.753697383390216e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.03529116837307811, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 107.09375381469727, "epoch": 5.1160409556314, "grad_norm": 0.48364871027029244, "kl": 0.41796875, "learning_rate": 5.750853242320819e-07, "loss": 0.0004, "reward": 1.7467447519302368, "reward_std": 0.030183793045580387, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7493489682674408, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 106.14583587646484, "epoch": 5.1194539249146755, "grad_norm": 0.9374104851890508, "kl": 0.408203125, "learning_rate": 5.748009101251422e-07, "loss": 0.0004, "reward": 1.806640625, "reward_std": 0.03750410117208958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.806640625, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 106.3671875, "epoch": 5.122866894197952, "grad_norm": 0.9795763624409562, "kl": 0.4130859375, "learning_rate": 5.745164960182025e-07, "loss": 0.0004, "reward": 1.74609375, "reward_std": 0.03197983140125871, "rewards/format_reward": 1.0, "rewards/score_reward": 0.74609375, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 106.98698043823242, "epoch": 5.126279863481229, "grad_norm": 1.4560146149702458, "kl": 0.4111328125, "learning_rate": 5.742320819112628e-07, "loss": 0.0004, "reward": 1.8997395634651184, "reward_std": 0.06012537330389023, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8997395932674408, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 106.29948425292969, "epoch": 5.129692832764505, "grad_norm": 0.8180254757505218, "kl": 0.3857421875, "learning_rate": 5.73947667804323e-07, "loss": 0.0004, "reward": 1.8014323115348816, "reward_std": 0.03247995814308524, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8014322817325592, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 104.93489837646484, "epoch": 5.1331058020477816, "grad_norm": 0.5437746427000891, "kl": 0.4052734375, "learning_rate": 5.736632536973833e-07, "loss": 0.0004, "reward": 1.763671875, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.763671875, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 107.20052337646484, "epoch": 5.136518771331058, "grad_norm": 0.8071978017491281, "kl": 0.396484375, "learning_rate": 5.733788395904437e-07, "loss": 0.0004, "reward": 1.7005208134651184, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 105.09896087646484, "epoch": 5.139931740614334, "grad_norm": 0.9211623797231083, "kl": 0.3896484375, "learning_rate": 5.730944254835039e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.03991402965039015, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 107.51041793823242, "epoch": 5.143344709897611, "grad_norm": 3.3863113092787995, "kl": 0.390625, "learning_rate": 5.728100113765642e-07, "loss": 0.0004, "reward": 1.8072916865348816, "reward_std": 0.020692503079771996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8072916567325592, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 108.11979293823242, "epoch": 5.146757679180888, "grad_norm": 1.6886429735338198, "kl": 0.392578125, "learning_rate": 5.725255972696246e-07, "loss": 0.0004, "reward": 1.8274739384651184, "reward_std": 0.041686832904815674, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8274739682674408, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 109.6875, "epoch": 5.150170648464163, "grad_norm": 0.9185904954707014, "kl": 0.412109375, "learning_rate": 5.722411831626849e-07, "loss": 0.0004, "reward": 1.7727864384651184, "reward_std": 0.03956230357289314, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7727864682674408, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 107.77344131469727, "epoch": 5.15358361774744, "grad_norm": 0.6969598610784445, "kl": 0.392578125, "learning_rate": 5.719567690557452e-07, "loss": 0.0004, "reward": 1.802734375, "reward_std": 0.023938510101288557, "rewards/format_reward": 1.0, "rewards/score_reward": 0.802734375, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 109.21094131469727, "epoch": 5.156996587030717, "grad_norm": 0.9680995680467126, "kl": 0.4111328125, "learning_rate": 5.716723549488054e-07, "loss": 0.0004, "reward": 1.798828125, "reward_std": 0.03130420483648777, "rewards/format_reward": 1.0, "rewards/score_reward": 0.798828125, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 108.84896087646484, "epoch": 5.160409556313994, "grad_norm": 0.638521181289926, "kl": 0.4033203125, "learning_rate": 5.713879408418657e-07, "loss": 0.0004, "reward": 1.7845052480697632, "reward_std": 0.024963634088635445, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845051884651184, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 108.01302337646484, "epoch": 5.163822525597269, "grad_norm": 0.9470893566507093, "kl": 0.423828125, "learning_rate": 5.71103526734926e-07, "loss": 0.0004, "reward": 1.7864583134651184, "reward_std": 0.022817853838205338, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 109.72916793823242, "epoch": 5.167235494880546, "grad_norm": 0.5351351718406061, "kl": 0.4052734375, "learning_rate": 5.708191126279863e-07, "loss": 0.0004, "reward": 1.7890625, "reward_std": 0.030772079713642597, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7916666567325592, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 110.60416793823242, "epoch": 5.170648464163823, "grad_norm": 0.7664186913918147, "kl": 0.3994140625, "learning_rate": 5.705346985210467e-07, "loss": 0.0004, "reward": 1.7434896230697632, "reward_std": 0.041402905248105526, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7434895634651184, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 111.10677337646484, "epoch": 5.174061433447099, "grad_norm": 3.2849533128337964, "kl": 0.4052734375, "learning_rate": 5.702502844141069e-07, "loss": 0.0004, "reward": 1.8287760615348816, "reward_std": 0.04916735179722309, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8287760317325592, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 112.89323425292969, "epoch": 5.177474402730375, "grad_norm": 0.5240614753129079, "kl": 0.3837890625, "learning_rate": 5.699658703071673e-07, "loss": 0.0004, "reward": 1.6725260019302368, "reward_std": 0.022401200607419014, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6725260615348816, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 110.5546875, "epoch": 5.180887372013652, "grad_norm": 1.2401309690845685, "kl": 0.37890625, "learning_rate": 5.696814562002275e-07, "loss": 0.0004, "reward": 1.775390625, "reward_std": 0.02748863259330392, "rewards/format_reward": 1.0, "rewards/score_reward": 0.775390625, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 114.21094131469727, "epoch": 5.184300341296928, "grad_norm": 4.533878654851203, "kl": 0.4013671875, "learning_rate": 5.693970420932877e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.06492452137172222, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7024739682674408, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 109.74219131469727, "epoch": 5.187713310580205, "grad_norm": 1.1762345600760624, "kl": 0.39453125, "learning_rate": 5.691126279863481e-07, "loss": 0.0004, "reward": 1.7252604365348816, "reward_std": 0.05328704044222832, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604067325592, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 111.7734375, "epoch": 5.191126279863481, "grad_norm": 1.4887780389578864, "kl": 0.40234375, "learning_rate": 5.688282138794084e-07, "loss": 0.0004, "reward": 1.6920572519302368, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920573115348816, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 112.98698425292969, "epoch": 5.194539249146757, "grad_norm": 0.5947190641710395, "kl": 0.4189453125, "learning_rate": 5.685437997724687e-07, "loss": 0.0004, "reward": 1.685546875, "reward_std": 0.03368853125721216, "rewards/format_reward": 1.0, "rewards/score_reward": 0.685546875, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 110.96354293823242, "epoch": 5.197952218430034, "grad_norm": 4.787836752437666, "kl": 0.373046875, "learning_rate": 5.68259385665529e-07, "loss": 0.0004, "reward": 1.7903645634651184, "reward_std": 0.03870774619281292, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7903645932674408, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 113.40885925292969, "epoch": 5.201365187713311, "grad_norm": 1.1370155313250145, "kl": 0.373046875, "learning_rate": 5.679749715585893e-07, "loss": 0.0004, "reward": 1.8216145634651184, "reward_std": 0.02933005429804325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8216145932674408, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 114.76302337646484, "epoch": 5.204778156996587, "grad_norm": 0.8647457010885176, "kl": 0.4033203125, "learning_rate": 5.676905574516496e-07, "loss": 0.0004, "reward": 1.6712239384651184, "reward_std": 0.04572475980967283, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6712239682674408, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 114.359375, "epoch": 5.208191126279863, "grad_norm": 1.6341489061382888, "kl": 0.3974609375, "learning_rate": 5.674061433447098e-07, "loss": 0.0004, "reward": 1.7994791865348816, "reward_std": 0.025248214602470398, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7994791567325592, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 113.63021087646484, "epoch": 5.21160409556314, "grad_norm": 2.8400313047981722, "kl": 0.3974609375, "learning_rate": 5.671217292377702e-07, "loss": 0.0004, "reward": 1.6783854365348816, "reward_std": 0.048330700024962425, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 111.359375, "epoch": 5.215017064846417, "grad_norm": 1.475717083449533, "kl": 0.3994140625, "learning_rate": 5.668373151308304e-07, "loss": 0.0004, "reward": 1.7005208134651184, "reward_std": 0.05753888189792633, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 114.09375, "epoch": 5.2184300341296925, "grad_norm": 1.5835361237141434, "kl": 0.3955078125, "learning_rate": 5.665529010238907e-07, "loss": 0.0004, "reward": 1.7454426884651184, "reward_std": 0.0562667828053236, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 115.19010543823242, "epoch": 5.221843003412969, "grad_norm": 4.833278870209248, "kl": 0.390625, "learning_rate": 5.662684869169511e-07, "loss": 0.0004, "reward": 1.7506510615348816, "reward_std": 0.07592767104506493, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510317325592, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 114.42969131469727, "epoch": 5.225255972696246, "grad_norm": 2.3398320837599274, "kl": 0.404296875, "learning_rate": 5.659840728100114e-07, "loss": 0.0004, "reward": 1.6953125, "reward_std": 0.08822083845734596, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6979166567325592, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 114.42969131469727, "epoch": 5.228668941979522, "grad_norm": 3.650396663245772, "kl": 0.470703125, "learning_rate": 5.656996587030717e-07, "loss": 0.0005, "reward": 1.73828125, "reward_std": 0.026633426547050476, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73828125, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 117.06771087646484, "epoch": 5.2320819112627985, "grad_norm": 0.803120801529988, "kl": 0.3798828125, "learning_rate": 5.65415244596132e-07, "loss": 0.0004, "reward": 1.662109375, "reward_std": 0.03737055882811546, "rewards/format_reward": 1.0, "rewards/score_reward": 0.662109375, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 115.48177337646484, "epoch": 5.235494880546075, "grad_norm": 2.7283062978379315, "kl": 0.396484375, "learning_rate": 5.651308304891922e-07, "loss": 0.0004, "reward": 1.7356771230697632, "reward_std": 0.03996023256331682, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.73828125, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 114.05729675292969, "epoch": 5.238907849829351, "grad_norm": 1.1812385747612775, "kl": 0.38671875, "learning_rate": 5.648464163822525e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.03090456034988165, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 116.66146087646484, "epoch": 5.242320819112628, "grad_norm": 1.1400079599230717, "kl": 0.3701171875, "learning_rate": 5.645620022753128e-07, "loss": 0.0004, "reward": 1.7350260615348816, "reward_std": 0.020122936461120844, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 115.4140625, "epoch": 5.2457337883959045, "grad_norm": 1.284748780937588, "kl": 0.4033203125, "learning_rate": 5.642775881683732e-07, "loss": 0.0004, "reward": 1.7545572519302368, "reward_std": 0.03204516228288412, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545573115348816, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 114.02344131469727, "epoch": 5.249146757679181, "grad_norm": 0.6061711817544418, "kl": 0.3916015625, "learning_rate": 5.639931740614334e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.02496363688260317, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 114.04427337646484, "epoch": 5.252559726962457, "grad_norm": 1.3075356552853303, "kl": 0.384765625, "learning_rate": 5.637087599544937e-07, "loss": 0.0004, "reward": 1.7200520634651184, "reward_std": 0.03982668835669756, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520932674408, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 115.67708587646484, "epoch": 5.255972696245734, "grad_norm": 0.6075921545714194, "kl": 0.38671875, "learning_rate": 5.634243458475541e-07, "loss": 0.0004, "reward": 1.7630208134651184, "reward_std": 0.03418865706771612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208432674408, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 114.6484375, "epoch": 5.2593856655290105, "grad_norm": 1.279524479384532, "kl": 0.3818359375, "learning_rate": 5.631399317406143e-07, "loss": 0.0004, "reward": 1.6432291865348816, "reward_std": 0.07192134857177734, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6432291567325592, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 115.61198043823242, "epoch": 5.262798634812286, "grad_norm": 1.212298635241593, "kl": 0.38671875, "learning_rate": 5.628555176336746e-07, "loss": 0.0004, "reward": 1.7265625, "reward_std": 0.055367494001984596, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7265625, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 114.67448425292969, "epoch": 5.266211604095563, "grad_norm": 1.4497553552761542, "kl": 0.37890625, "learning_rate": 5.625711035267349e-07, "loss": 0.0004, "reward": 1.8841146230697632, "reward_std": 0.04379518097266555, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8841145634651184, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 116.51562881469727, "epoch": 5.26962457337884, "grad_norm": 1.0550603804618226, "kl": 0.392578125, "learning_rate": 5.622866894197951e-07, "loss": 0.0004, "reward": 1.7018229365348816, "reward_std": 0.046709753572940826, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7018229067325592, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 114.94010543823242, "epoch": 5.273037542662116, "grad_norm": 1.6437563447953285, "kl": 0.40234375, "learning_rate": 5.620022753128555e-07, "loss": 0.0004, "reward": 1.6477864980697632, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6477864384651184, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 118.05208587646484, "epoch": 5.276450511945392, "grad_norm": 0.41652150572791924, "kl": 0.400390625, "learning_rate": 5.617178612059158e-07, "loss": 0.0004, "reward": 1.736328125, "reward_std": 0.026304116006940603, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 114.89844131469727, "epoch": 5.279863481228669, "grad_norm": 5.982900576250373, "kl": 0.3759765625, "learning_rate": 5.614334470989761e-07, "loss": 0.0004, "reward": 1.8444010615348816, "reward_std": 0.023539516143500805, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8444010317325592, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 116.01302337646484, "epoch": 5.283276450511945, "grad_norm": 0.9639185063206768, "kl": 0.404296875, "learning_rate": 5.611490329920364e-07, "loss": 0.0004, "reward": 1.677734375, "reward_std": 0.04622300900518894, "rewards/format_reward": 1.0, "rewards/score_reward": 0.677734375, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 117.65885925292969, "epoch": 5.286689419795222, "grad_norm": 0.6927830045174452, "kl": 0.380859375, "learning_rate": 5.608646188850967e-07, "loss": 0.0004, "reward": 1.8053385615348816, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8053385317325592, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 117.7890625, "epoch": 5.290102389078498, "grad_norm": 1.0907280343960206, "kl": 0.3857421875, "learning_rate": 5.605802047781569e-07, "loss": 0.0004, "reward": 1.8001301884651184, "reward_std": 0.05628614965826273, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8001302182674408, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 115.91666793823242, "epoch": 5.293515358361775, "grad_norm": 6.255955537681932, "kl": 0.3701171875, "learning_rate": 5.602957906712172e-07, "loss": 0.0004, "reward": 1.8444010019302368, "reward_std": 0.04940679110586643, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8444010615348816, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 117.77344131469727, "epoch": 5.296928327645051, "grad_norm": 0.8148116137047559, "kl": 0.4296875, "learning_rate": 5.600113765642776e-07, "loss": 0.0004, "reward": 1.7180989384651184, "reward_std": 0.02976689673960209, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7180989682674408, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 115.97916793823242, "epoch": 5.300341296928328, "grad_norm": 0.709844937576862, "kl": 0.37890625, "learning_rate": 5.597269624573379e-07, "loss": 0.0004, "reward": 1.7506510019302368, "reward_std": 0.04316003806889057, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510615348816, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 117.82031631469727, "epoch": 5.303754266211604, "grad_norm": 4.945103444930295, "kl": 0.37890625, "learning_rate": 5.594425483503981e-07, "loss": 0.0004, "reward": 1.689453125, "reward_std": 0.030051066540181637, "rewards/format_reward": 1.0, "rewards/score_reward": 0.689453125, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 118.1875, "epoch": 5.30716723549488, "grad_norm": 2.5187334037796196, "kl": 0.3818359375, "learning_rate": 5.591581342434585e-07, "loss": 0.0004, "reward": 1.7076823115348816, "reward_std": 0.08640624955296516, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076822817325592, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 119.66666793823242, "epoch": 5.310580204778157, "grad_norm": 18.890702508146283, "kl": 0.375, "learning_rate": 5.588737201365188e-07, "loss": 0.0004, "reward": 1.6829426884651184, "reward_std": 0.05024091154336929, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6829427182674408, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 116.96354293823242, "epoch": 5.313993174061434, "grad_norm": 0.8841731300266397, "kl": 0.3720703125, "learning_rate": 5.58589306029579e-07, "loss": 0.0004, "reward": 1.828125, "reward_std": 0.020692503545433283, "rewards/format_reward": 1.0, "rewards/score_reward": 0.828125, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 117.02083587646484, "epoch": 5.3174061433447095, "grad_norm": 2.979324114406871, "kl": 0.3681640625, "learning_rate": 5.583048919226393e-07, "loss": 0.0004, "reward": 1.7122396230697632, "reward_std": 0.036429482977837324, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395634651184, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 119.71094131469727, "epoch": 5.320819112627986, "grad_norm": 1.8117001481910946, "kl": 0.3837890625, "learning_rate": 5.580204778156996e-07, "loss": 0.0004, "reward": 1.7825521230697632, "reward_std": 0.027221955358982086, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520634651184, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 121.03906631469727, "epoch": 5.324232081911263, "grad_norm": 1.1282148264016612, "kl": 0.3740234375, "learning_rate": 5.577360637087599e-07, "loss": 0.0004, "reward": 1.7578125, "reward_std": 0.03147477749735117, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 119.03385925292969, "epoch": 5.327645051194539, "grad_norm": 0.6803347787669503, "kl": 0.380859375, "learning_rate": 5.574516496018202e-07, "loss": 0.0004, "reward": 1.736328125, "reward_std": 0.03134164400398731, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 116.3828125, "epoch": 5.3310580204778155, "grad_norm": 0.6215865152846557, "kl": 0.3857421875, "learning_rate": 5.571672354948806e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.03130420483648777, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998697817325592, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 123.31510543823242, "epoch": 5.334470989761092, "grad_norm": 2.369285419654684, "kl": 0.3740234375, "learning_rate": 5.568828213879408e-07, "loss": 0.0004, "reward": 1.7115885615348816, "reward_std": 0.04931944981217384, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885317325592, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 122.47917175292969, "epoch": 5.337883959044369, "grad_norm": 1.175318384408281, "kl": 0.376953125, "learning_rate": 5.565984072810012e-07, "loss": 0.0004, "reward": 1.7799479365348816, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7799479067325592, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 119.61979293823242, "epoch": 5.341296928327645, "grad_norm": 1.2065455018917228, "kl": 0.3759765625, "learning_rate": 5.563139931740614e-07, "loss": 0.0004, "reward": 1.701171875, "reward_std": 0.04905317910015583, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 120.875, "epoch": 5.3447098976109215, "grad_norm": 0.296963493962814, "kl": 0.3798828125, "learning_rate": 5.560295790671216e-07, "loss": 0.0004, "reward": 1.7727864980697632, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7727864384651184, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 121.43229293823242, "epoch": 5.348122866894198, "grad_norm": 1.272704494822232, "kl": 0.37890625, "learning_rate": 5.55745164960182e-07, "loss": 0.0004, "reward": 1.7734375, "reward_std": 0.02820964204147458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7734375, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 123.60416793823242, "epoch": 5.351535836177474, "grad_norm": 2.3813348126596834, "kl": 0.376953125, "learning_rate": 5.554607508532423e-07, "loss": 0.0004, "reward": 1.7259114980697632, "reward_std": 0.07456684485077858, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.728515625, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 123.43750381469727, "epoch": 5.354948805460751, "grad_norm": 1.1384566182225415, "kl": 0.3701171875, "learning_rate": 5.551763367463026e-07, "loss": 0.0004, "reward": 1.6927083134651184, "reward_std": 0.028209642507135868, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6927083432674408, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 123.45573425292969, "epoch": 5.3583617747440275, "grad_norm": 2.584823191959769, "kl": 0.3681640625, "learning_rate": 5.548919226393629e-07, "loss": 0.0004, "reward": 1.8509114980697632, "reward_std": 0.027574503794312477, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8509114384651184, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 122.58073043823242, "epoch": 5.361774744027303, "grad_norm": 1.5375159674473153, "kl": 0.3720703125, "learning_rate": 5.546075085324232e-07, "loss": 0.0004, "reward": 1.693359375, "reward_std": 0.05300164595246315, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6959635317325592, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 124.296875, "epoch": 5.36518771331058, "grad_norm": 1.135288713884382, "kl": 0.3681640625, "learning_rate": 5.543230944254836e-07, "loss": 0.0004, "reward": 1.8307291865348816, "reward_std": 0.04796436242759228, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.8359375, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 122.22135925292969, "epoch": 5.368600682593857, "grad_norm": 1.1832968888406317, "kl": 0.3828125, "learning_rate": 5.540386803185437e-07, "loss": 0.0004, "reward": 1.6783854365348816, "reward_std": 0.02465952094644308, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 123.86458587646484, "epoch": 5.372013651877133, "grad_norm": 0.4604777111649463, "kl": 0.3740234375, "learning_rate": 5.537542662116041e-07, "loss": 0.0004, "reward": 1.7513021230697632, "reward_std": 0.02693778555840254, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.75390625, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 124.52864837646484, "epoch": 5.375426621160409, "grad_norm": 0.3868036643728789, "kl": 0.36328125, "learning_rate": 5.534698521046644e-07, "loss": 0.0004, "reward": 1.798828125, "reward_std": 0.029899620916694403, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8014323115348816, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 121.93489837646484, "epoch": 5.378839590443686, "grad_norm": 0.7904822726437076, "kl": 0.3671875, "learning_rate": 5.531854379977246e-07, "loss": 0.0004, "reward": 1.7890625, "reward_std": 0.05484306812286377, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7890625, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 122.38281631469727, "epoch": 5.382252559726963, "grad_norm": 0.3228961749123881, "kl": 0.373046875, "learning_rate": 5.52901023890785e-07, "loss": 0.0004, "reward": 1.78515625, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78515625, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 123.28125, "epoch": 5.385665529010239, "grad_norm": 0.37435867411150714, "kl": 0.380859375, "learning_rate": 5.526166097838453e-07, "loss": 0.0004, "reward": 1.7721354365348816, "reward_std": 0.02834236901253462, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7747395932674408, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 118.1953125, "epoch": 5.389078498293515, "grad_norm": 0.30890761684019047, "kl": 0.35546875, "learning_rate": 5.523321956769056e-07, "loss": 0.0004, "reward": 1.759765625, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 121.91146087646484, "epoch": 5.392491467576792, "grad_norm": 0.7671927154118033, "kl": 0.3681640625, "learning_rate": 5.520477815699659e-07, "loss": 0.0004, "reward": 1.6875, "reward_std": 0.03528970014303923, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6875, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 118.50000381469727, "epoch": 5.395904436860068, "grad_norm": 1.18721977035361, "kl": 0.3681640625, "learning_rate": 5.517633674630261e-07, "loss": 0.0004, "reward": 1.8639323115348816, "reward_std": 0.019135249312967062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8639322817325592, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 122.02083587646484, "epoch": 5.399317406143345, "grad_norm": 1.0993290700494691, "kl": 0.3759765625, "learning_rate": 5.514789533560864e-07, "loss": 0.0004, "reward": 1.818359375, "reward_std": 0.051425427198410034, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8209635317325592, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 118.78125381469727, "epoch": 5.402730375426621, "grad_norm": 1.0317530259652625, "kl": 0.3662109375, "learning_rate": 5.511945392491467e-07, "loss": 0.0004, "reward": 1.7272135615348816, "reward_std": 0.03274622559547424, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7272135317325592, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 121.1796875, "epoch": 5.406143344709897, "grad_norm": 1.1036216839796367, "kl": 0.375, "learning_rate": 5.509101251422071e-07, "loss": 0.0004, "reward": 1.8854166865348816, "reward_std": 0.04401524644345045, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8880208134651184, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 121.03385543823242, "epoch": 5.409556313993174, "grad_norm": 1.7754559527580553, "kl": 0.3828125, "learning_rate": 5.506257110352673e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.080445546656847, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7135416567325592, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 123.09375381469727, "epoch": 5.412969283276451, "grad_norm": 0.30356405878048415, "kl": 0.3740234375, "learning_rate": 5.503412969283276e-07, "loss": 0.0004, "reward": 1.78125, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 121.30208587646484, "epoch": 5.4163822525597265, "grad_norm": 0.013165749600210372, "kl": 0.3837890625, "learning_rate": 5.50056882821388e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 119.89844131469727, "epoch": 5.419795221843003, "grad_norm": 0.6747846193336575, "kl": 0.3701171875, "learning_rate": 5.497724687144481e-07, "loss": 0.0004, "reward": 1.861328125, "reward_std": 0.03673278074711561, "rewards/format_reward": 1.0, "rewards/score_reward": 0.861328125, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 121.54166793823242, "epoch": 5.42320819112628, "grad_norm": 1.0326372792794454, "kl": 0.37109375, "learning_rate": 5.494880546075085e-07, "loss": 0.0004, "reward": 1.80078125, "reward_std": 0.03202521521598101, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80078125, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 122.56771087646484, "epoch": 5.426621160409557, "grad_norm": 0.4900143134592337, "kl": 0.353515625, "learning_rate": 5.492036405005688e-07, "loss": 0.0004, "reward": 1.806640625, "reward_std": 0.020122936461120844, "rewards/format_reward": 1.0, "rewards/score_reward": 0.806640625, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 120.18489837646484, "epoch": 5.4300341296928325, "grad_norm": 0.29128321339008884, "kl": 0.3798828125, "learning_rate": 5.48919226393629e-07, "loss": 0.0004, "reward": 1.7682291865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 120.72396087646484, "epoch": 5.433447098976109, "grad_norm": 0.7761560279812697, "kl": 0.3623046875, "learning_rate": 5.486348122866894e-07, "loss": 0.0004, "reward": 1.6940103769302368, "reward_std": 0.03184628766030073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6940104365348816, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 118.87760925292969, "epoch": 5.436860068259386, "grad_norm": 0.9740627639571457, "kl": 0.388671875, "learning_rate": 5.483503981797497e-07, "loss": 0.0004, "reward": 1.7779948115348816, "reward_std": 0.033648326992988586, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7779947817325592, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 118.09896087646484, "epoch": 5.440273037542662, "grad_norm": 1.1456169878267277, "kl": 0.3623046875, "learning_rate": 5.4806598407281e-07, "loss": 0.0004, "reward": 1.7395833730697632, "reward_std": 0.029462780803442, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833134651184, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 116.21614837646484, "epoch": 5.4436860068259385, "grad_norm": 0.513610092577173, "kl": 0.37109375, "learning_rate": 5.477815699658703e-07, "loss": 0.0004, "reward": 1.8151041865348816, "reward_std": 0.018501579761505127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8151041567325592, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 120.2890625, "epoch": 5.447098976109215, "grad_norm": 0.9972005455506836, "kl": 0.375, "learning_rate": 5.474971558589305e-07, "loss": 0.0004, "reward": 1.7799479365348816, "reward_std": 0.03476380230858922, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7799479067325592, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 117.13802337646484, "epoch": 5.450511945392491, "grad_norm": 0.9953561871318796, "kl": 0.3876953125, "learning_rate": 5.472127417519909e-07, "loss": 0.0004, "reward": 1.783203125, "reward_std": 0.03232933208346367, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 117.29948425292969, "epoch": 5.453924914675768, "grad_norm": 1.4034005069525726, "kl": 0.365234375, "learning_rate": 5.469283276450511e-07, "loss": 0.0004, "reward": 1.6875, "reward_std": 0.025361567735671997, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6875, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 116.91667175292969, "epoch": 5.4573378839590445, "grad_norm": 2.0679205334154784, "kl": 0.3583984375, "learning_rate": 5.466439135381115e-07, "loss": 0.0004, "reward": 1.8125, "reward_std": 0.020692503079771996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8125, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 118.64062881469727, "epoch": 5.460750853242321, "grad_norm": 1.4278465754116, "kl": 0.3876953125, "learning_rate": 5.463594994311718e-07, "loss": 0.0004, "reward": 1.7057291865348816, "reward_std": 0.01799587346613407, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7057291567325592, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 118.11198425292969, "epoch": 5.464163822525597, "grad_norm": 0.46699989423875404, "kl": 0.37890625, "learning_rate": 5.46075085324232e-07, "loss": 0.0004, "reward": 1.796875, "reward_std": 0.021962891332805157, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 114.25521087646484, "epoch": 5.467576791808874, "grad_norm": 1.3890377430483807, "kl": 0.369140625, "learning_rate": 5.457906712172924e-07, "loss": 0.0004, "reward": 1.7799478769302368, "reward_std": 0.06376625970005989, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7825520932674408, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 116.14323425292969, "epoch": 5.4709897610921505, "grad_norm": 0.7077179296992983, "kl": 0.375, "learning_rate": 5.455062571103527e-07, "loss": 0.0004, "reward": 1.7610676884651184, "reward_std": 0.022533927112817764, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610677182674408, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 116.11979675292969, "epoch": 5.474402730375426, "grad_norm": 1.3993675623071131, "kl": 0.369140625, "learning_rate": 5.452218430034129e-07, "loss": 0.0004, "reward": 1.8118489384651184, "reward_std": 0.04083334282040596, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8118489682674408, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 115.75260925292969, "epoch": 5.477815699658703, "grad_norm": 1.5234035984101233, "kl": 0.373046875, "learning_rate": 5.449374288964732e-07, "loss": 0.0004, "reward": 1.6959635615348816, "reward_std": 0.04563448205590248, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 117.65104675292969, "epoch": 5.48122866894198, "grad_norm": 0.3478512470073815, "kl": 0.3837890625, "learning_rate": 5.446530147895335e-07, "loss": 0.0004, "reward": 1.7786458730697632, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7786458134651184, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 117.84635543823242, "epoch": 5.484641638225256, "grad_norm": 2.65961107193967, "kl": 0.3779296875, "learning_rate": 5.443686006825938e-07, "loss": 0.0004, "reward": 1.748046875, "reward_std": 0.03075311565771699, "rewards/format_reward": 1.0, "rewards/score_reward": 0.748046875, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 113.80469131469727, "epoch": 5.488054607508532, "grad_norm": 1.428690948626782, "kl": 0.34765625, "learning_rate": 5.440841865756541e-07, "loss": 0.0003, "reward": 1.80859375, "reward_std": 0.04848443157970905, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80859375, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 116.65364837646484, "epoch": 5.491467576791809, "grad_norm": 0.9167299004435262, "kl": 0.37109375, "learning_rate": 5.437997724687145e-07, "loss": 0.0004, "reward": 1.6907551884651184, "reward_std": 0.03090521227568388, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 114.09635543823242, "epoch": 5.494880546075085, "grad_norm": 1.4558469877201292, "kl": 0.3857421875, "learning_rate": 5.435153583617747e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.03997813630849123, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 114.35156631469727, "epoch": 5.498293515358362, "grad_norm": 0.7469590410348974, "kl": 0.384765625, "learning_rate": 5.432309442548351e-07, "loss": 0.0004, "reward": 1.7330729365348816, "reward_std": 0.02295057848095894, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729067325592, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 116.09896087646484, "epoch": 5.501706484641638, "grad_norm": 0.6520824289875928, "kl": 0.41015625, "learning_rate": 5.429465301478953e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.03629593923687935, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 116.06250381469727, "epoch": 5.505119453924914, "grad_norm": 1.0393336935758155, "kl": 0.3623046875, "learning_rate": 5.426621160409555e-07, "loss": 0.0004, "reward": 1.7897135615348816, "reward_std": 0.02847484964877367, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 115.78385543823242, "epoch": 5.508532423208191, "grad_norm": 1.6903907243491842, "kl": 0.3681640625, "learning_rate": 5.423777019340159e-07, "loss": 0.0004, "reward": 1.6334635615348816, "reward_std": 0.04953951761126518, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6360676884651184, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 116.40885543823242, "epoch": 5.511945392491468, "grad_norm": 3.785889398182469, "kl": 0.3701171875, "learning_rate": 5.420932878270762e-07, "loss": 0.0004, "reward": 1.7825520634651184, "reward_std": 0.04502246715128422, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520932674408, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 113.68750381469727, "epoch": 5.515358361774744, "grad_norm": 1.31757224820313, "kl": 0.38671875, "learning_rate": 5.418088737201366e-07, "loss": 0.0004, "reward": 1.8111978769302368, "reward_std": 0.028058198746293783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8111979365348816, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 115.63021087646484, "epoch": 5.51877133105802, "grad_norm": 0.8765258808468752, "kl": 0.3740234375, "learning_rate": 5.415244596131968e-07, "loss": 0.0004, "reward": 1.67578125, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.67578125, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 115.2578125, "epoch": 5.522184300341297, "grad_norm": 1.0021956183372487, "kl": 0.35546875, "learning_rate": 5.412400455062571e-07, "loss": 0.0004, "reward": 1.6979166865348816, "reward_std": 0.020976672880351543, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6979166567325592, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 115.11198043823242, "epoch": 5.525597269624574, "grad_norm": 1.0400987886399908, "kl": 0.35546875, "learning_rate": 5.409556313993175e-07, "loss": 0.0004, "reward": 1.7845052480697632, "reward_std": 0.015168231446295977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845051884651184, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 117.22916793823242, "epoch": 5.5290102389078495, "grad_norm": 0.9763357725539702, "kl": 0.3818359375, "learning_rate": 5.406712172923776e-07, "loss": 0.0004, "reward": 1.833984375, "reward_std": 0.02393850963562727, "rewards/format_reward": 1.0, "rewards/score_reward": 0.833984375, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 114.48958587646484, "epoch": 5.532423208191126, "grad_norm": 1.041858896564134, "kl": 0.3798828125, "learning_rate": 5.40386803185438e-07, "loss": 0.0004, "reward": 1.7194010615348816, "reward_std": 0.026938195107504725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7194010317325592, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 115.79167175292969, "epoch": 5.535836177474403, "grad_norm": 1.9825235053437333, "kl": 0.3720703125, "learning_rate": 5.401023890784983e-07, "loss": 0.0004, "reward": 1.7552083134651184, "reward_std": 0.05060978466644883, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7552083432674408, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 113.62500381469727, "epoch": 5.53924914675768, "grad_norm": 0.3497240801135328, "kl": 0.36328125, "learning_rate": 5.398179749715585e-07, "loss": 0.0004, "reward": 1.7532551884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7532552182674408, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 114.37760925292969, "epoch": 5.5426621160409555, "grad_norm": 0.9881652080250994, "kl": 0.373046875, "learning_rate": 5.395335608646189e-07, "loss": 0.0004, "reward": 1.771484375, "reward_std": 0.03219660557806492, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 114.40885543823242, "epoch": 5.546075085324232, "grad_norm": 4.794863202390878, "kl": 0.3798828125, "learning_rate": 5.392491467576792e-07, "loss": 0.0004, "reward": 1.7662760615348816, "reward_std": 0.03737056162208319, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7662760317325592, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 114.703125, "epoch": 5.549488054607508, "grad_norm": 1.1816821023550792, "kl": 0.37890625, "learning_rate": 5.389647326507395e-07, "loss": 0.0004, "reward": 1.7154948115348816, "reward_std": 0.03969502728432417, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7154947817325592, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 113.546875, "epoch": 5.552901023890785, "grad_norm": 1.5937097103041742, "kl": 0.3681640625, "learning_rate": 5.386803185437997e-07, "loss": 0.0004, "reward": 1.7174479365348816, "reward_std": 0.020976672880351543, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7174479067325592, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 114.40104293823242, "epoch": 5.5563139931740615, "grad_norm": 1.792782532768101, "kl": 0.3720703125, "learning_rate": 5.3839590443686e-07, "loss": 0.0004, "reward": 1.7760416269302368, "reward_std": 0.02281785337254405, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7760416865348816, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 114.99219131469727, "epoch": 5.559726962457338, "grad_norm": 2.0577872892212503, "kl": 0.3828125, "learning_rate": 5.381114903299203e-07, "loss": 0.0004, "reward": 1.759765625, "reward_std": 0.05677248630672693, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 115.87239837646484, "epoch": 5.563139931740614, "grad_norm": 4.237968624190803, "kl": 0.3818359375, "learning_rate": 5.378270762229806e-07, "loss": 0.0004, "reward": 1.703125, "reward_std": 0.029283852316439152, "rewards/format_reward": 1.0, "rewards/score_reward": 0.703125, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 112.04427337646484, "epoch": 5.566552901023891, "grad_norm": 1.5690786922308664, "kl": 0.3759765625, "learning_rate": 5.37542662116041e-07, "loss": 0.0004, "reward": 1.7447916269302368, "reward_std": 0.016173413023352623, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916865348816, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 113.26302337646484, "epoch": 5.5699658703071675, "grad_norm": 0.24278964496104538, "kl": 0.3828125, "learning_rate": 5.372582480091012e-07, "loss": 0.0004, "reward": 1.7786458134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7786458432674408, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 112.5, "epoch": 5.573378839590443, "grad_norm": 0.010920256094550461, "kl": 0.375, "learning_rate": 5.369738339021615e-07, "loss": 0.0004, "reward": 1.7291666865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666567325592, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 115.2265625, "epoch": 5.57679180887372, "grad_norm": 0.6101525504544288, "kl": 0.35546875, "learning_rate": 5.366894197952219e-07, "loss": 0.0004, "reward": 1.6783854365348816, "reward_std": 0.035708063282072544, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6809895932674408, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 110.73958587646484, "epoch": 5.580204778156997, "grad_norm": 1.2058703231571601, "kl": 0.369140625, "learning_rate": 5.36405005688282e-07, "loss": 0.0004, "reward": 1.7252603769302368, "reward_std": 0.018015244975686073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604365348816, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 112.07292175292969, "epoch": 5.5836177474402735, "grad_norm": 2.6115005295021216, "kl": 0.37109375, "learning_rate": 5.361205915813424e-07, "loss": 0.0004, "reward": 1.7662760019302368, "reward_std": 0.03232933022081852, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7662760615348816, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 113.03906631469727, "epoch": 5.587030716723549, "grad_norm": 1.1551989874970572, "kl": 0.3779296875, "learning_rate": 5.358361774744027e-07, "loss": 0.0004, "reward": 1.81640625, "reward_std": 0.039978133514523506, "rewards/format_reward": 1.0, "rewards/score_reward": 0.81640625, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 111.953125, "epoch": 5.590443686006826, "grad_norm": 0.6919905607277624, "kl": 0.3798828125, "learning_rate": 5.35551763367463e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.031626221258193254, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 111.55989837646484, "epoch": 5.593856655290102, "grad_norm": 1.5052644761427898, "kl": 0.3701171875, "learning_rate": 5.352673492605233e-07, "loss": 0.0004, "reward": 1.8522135615348816, "reward_std": 0.041554758325219154, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8522135317325592, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 111.15104293823242, "epoch": 5.597269624573379, "grad_norm": 0.3296569569938978, "kl": 0.3828125, "learning_rate": 5.349829351535836e-07, "loss": 0.0004, "reward": 1.8391927480697632, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8391926884651184, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 112.95052337646484, "epoch": 5.600682593856655, "grad_norm": 0.6952445059766216, "kl": 0.3759765625, "learning_rate": 5.34698521046644e-07, "loss": 0.0004, "reward": 1.8001301884651184, "reward_std": 0.027070919051766396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8001302182674408, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 111.20312881469727, "epoch": 5.604095563139932, "grad_norm": 0.6111981562530996, "kl": 0.3671875, "learning_rate": 5.344141069397042e-07, "loss": 0.0004, "reward": 1.7311197519302368, "reward_std": 0.030905211344361305, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311198115348816, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 109.87239837646484, "epoch": 5.607508532423208, "grad_norm": 1.5364467039732166, "kl": 0.3720703125, "learning_rate": 5.341296928327644e-07, "loss": 0.0004, "reward": 1.8125, "reward_std": 0.01828151335939765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8125, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 111.35156631469727, "epoch": 5.610921501706485, "grad_norm": 1.2978251587323468, "kl": 0.35546875, "learning_rate": 5.338452787258248e-07, "loss": 0.0004, "reward": 1.7591145634651184, "reward_std": 0.02963376324623823, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145932674408, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 109.57291793823242, "epoch": 5.614334470989761, "grad_norm": 1.427173613693279, "kl": 0.384765625, "learning_rate": 5.33560864618885e-07, "loss": 0.0004, "reward": 1.6783854365348816, "reward_std": 0.04414878599345684, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6783854067325592, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 110.03385543823242, "epoch": 5.617747440273037, "grad_norm": 0.8656501280291367, "kl": 0.3740234375, "learning_rate": 5.332764505119454e-07, "loss": 0.0004, "reward": 1.6901041865348816, "reward_std": 0.04276210814714432, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6901041567325592, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 110.67708587646484, "epoch": 5.621160409556314, "grad_norm": 1.0936391936847218, "kl": 0.3701171875, "learning_rate": 5.329920364050057e-07, "loss": 0.0004, "reward": 1.7272135615348816, "reward_std": 0.02222809847444296, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7272135317325592, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 110.76302337646484, "epoch": 5.624573378839591, "grad_norm": 2.705174458306237, "kl": 0.3701171875, "learning_rate": 5.32707622298066e-07, "loss": 0.0004, "reward": 1.6998698115348816, "reward_std": 0.015168231446295977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6998697817325592, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 110.82291793823242, "epoch": 5.627986348122867, "grad_norm": 1.0739055212606354, "kl": 0.3828125, "learning_rate": 5.324232081911263e-07, "loss": 0.0004, "reward": 1.7076822519302368, "reward_std": 0.023520145565271378, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7076823115348816, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 110.75521087646484, "epoch": 5.631399317406143, "grad_norm": 0.010982863428035172, "kl": 0.3681640625, "learning_rate": 5.321387940841866e-07, "loss": 0.0004, "reward": 1.8072916865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8072916567325592, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 110.20312881469727, "epoch": 5.63481228668942, "grad_norm": 0.3275723847964243, "kl": 0.3740234375, "learning_rate": 5.318543799772468e-07, "loss": 0.0004, "reward": 1.736328125, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 111.63542175292969, "epoch": 5.638225255972696, "grad_norm": 1.204618932620537, "kl": 0.419921875, "learning_rate": 5.315699658703071e-07, "loss": 0.0004, "reward": 1.8489583134651184, "reward_std": 0.024393252097070217, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8489583432674408, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 108.81771087646484, "epoch": 5.6416382252559725, "grad_norm": 1.3782467248976953, "kl": 0.376953125, "learning_rate": 5.312855517633675e-07, "loss": 0.0004, "reward": 1.7903646230697632, "reward_std": 0.0371494316495955, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7903645634651184, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 109.19271087646484, "epoch": 5.645051194539249, "grad_norm": 2.011892158040604, "kl": 0.36328125, "learning_rate": 5.310011376564277e-07, "loss": 0.0004, "reward": 1.841796875, "reward_std": 0.035707819275557995, "rewards/format_reward": 1.0, "rewards/score_reward": 0.841796875, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 109.65625381469727, "epoch": 5.648464163822526, "grad_norm": 0.6913747524355857, "kl": 0.3759765625, "learning_rate": 5.30716723549488e-07, "loss": 0.0004, "reward": 1.6959635615348816, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6959635317325592, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 107.98437881469727, "epoch": 5.651877133105802, "grad_norm": 1.2757029995686577, "kl": 0.5068359375, "learning_rate": 5.304323094425484e-07, "loss": 0.0005, "reward": 1.8307291865348816, "reward_std": 0.0402450542896986, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8307291567325592, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 106.578125, "epoch": 5.6552901023890785, "grad_norm": 0.8680888743241186, "kl": 0.373046875, "learning_rate": 5.301478953356086e-07, "loss": 0.0004, "reward": 1.7220052480697632, "reward_std": 0.027488631196320057, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220051884651184, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 110.65625, "epoch": 5.658703071672355, "grad_norm": 0.8954016821390604, "kl": 0.3779296875, "learning_rate": 5.298634812286689e-07, "loss": 0.0004, "reward": 1.7513020634651184, "reward_std": 0.033100245986133814, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.75390625, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 108.20312881469727, "epoch": 5.662116040955631, "grad_norm": 0.8155496527876838, "kl": 0.3798828125, "learning_rate": 5.295790671217292e-07, "loss": 0.0004, "reward": 1.7779948115348816, "reward_std": 0.018957491498440504, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7779947817325592, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 109.06510543823242, "epoch": 5.665529010238908, "grad_norm": 0.8600569893193007, "kl": 0.3623046875, "learning_rate": 5.292946530147894e-07, "loss": 0.0004, "reward": 1.7701823115348816, "reward_std": 0.026234674267470837, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7701822817325592, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 109.08854293823242, "epoch": 5.6689419795221845, "grad_norm": 0.626383410355646, "kl": 0.3623046875, "learning_rate": 5.290102389078498e-07, "loss": 0.0004, "reward": 1.69140625, "reward_std": 0.016876930370926857, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69140625, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 111.46875381469727, "epoch": 5.672354948805461, "grad_norm": 0.3856663690939685, "kl": 0.3740234375, "learning_rate": 5.287258248009101e-07, "loss": 0.0004, "reward": 1.7890625, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7890625, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 108.61198425292969, "epoch": 5.675767918088737, "grad_norm": 1.0025747153386162, "kl": 0.373046875, "learning_rate": 5.284414106939705e-07, "loss": 0.0004, "reward": 1.7467448115348816, "reward_std": 0.009074393659830093, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467447817325592, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 107.6953125, "epoch": 5.679180887372014, "grad_norm": 1.3919335663421426, "kl": 0.38671875, "learning_rate": 5.281569965870307e-07, "loss": 0.0004, "reward": 1.7584635615348816, "reward_std": 0.02989962138235569, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635317325592, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 107.80989837646484, "epoch": 5.6825938566552905, "grad_norm": 3.2731279542558656, "kl": 0.3662109375, "learning_rate": 5.27872582480091e-07, "loss": 0.0004, "reward": 1.7799479365348816, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7799479067325592, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 109.37760543823242, "epoch": 5.686006825938566, "grad_norm": 2.1887317920203637, "kl": 0.3642578125, "learning_rate": 5.275881683731513e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.053838131949305534, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 109.15885925292969, "epoch": 5.689419795221843, "grad_norm": 0.9899134489508192, "kl": 0.376953125, "learning_rate": 5.273037542662115e-07, "loss": 0.0004, "reward": 1.8268229365348816, "reward_std": 0.021830817684531212, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229067325592, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 109.59375, "epoch": 5.69283276450512, "grad_norm": 0.05329247034210244, "kl": 0.373046875, "learning_rate": 5.270193401592719e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 107.83073043823242, "epoch": 5.696245733788396, "grad_norm": 0.2836475767947837, "kl": 0.3916015625, "learning_rate": 5.267349260523322e-07, "loss": 0.0004, "reward": 1.6432291269302368, "reward_std": 0.008351913653314114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6432291865348816, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 109.71094131469727, "epoch": 5.699658703071672, "grad_norm": 1.021595888724573, "kl": 0.3720703125, "learning_rate": 5.264505119453924e-07, "loss": 0.0004, "reward": 1.74609375, "reward_std": 0.0345235476270318, "rewards/format_reward": 1.0, "rewards/score_reward": 0.74609375, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 110.00260543823242, "epoch": 5.703071672354949, "grad_norm": 0.991527649953514, "kl": 0.375, "learning_rate": 5.261660978384528e-07, "loss": 0.0004, "reward": 1.7337239384651184, "reward_std": 0.04701451864093542, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239682674408, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 109.92708587646484, "epoch": 5.706484641638225, "grad_norm": 8.580227026184955, "kl": 0.3876953125, "learning_rate": 5.258816837315131e-07, "loss": 0.0004, "reward": 1.7760416865348816, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7760416567325592, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 110.16667175292969, "epoch": 5.709897610921502, "grad_norm": 0.44842798032799835, "kl": 0.3828125, "learning_rate": 5.255972696245734e-07, "loss": 0.0004, "reward": 1.84765625, "reward_std": 0.018015244975686073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84765625, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 110.47917175292969, "epoch": 5.713310580204778, "grad_norm": 0.47057173024220483, "kl": 0.3759765625, "learning_rate": 5.253128555176336e-07, "loss": 0.0004, "reward": 1.7115885019302368, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885615348816, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 109.39323043823242, "epoch": 5.716723549488055, "grad_norm": 0.7692868939627058, "kl": 0.37109375, "learning_rate": 5.250284414106939e-07, "loss": 0.0004, "reward": 1.8046875, "reward_std": 0.021831635385751724, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 109.32031631469727, "epoch": 5.720136518771331, "grad_norm": 0.7015599095834909, "kl": 0.3681640625, "learning_rate": 5.247440273037542e-07, "loss": 0.0004, "reward": 1.7897135615348816, "reward_std": 0.020255662500858307, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 108.75000381469727, "epoch": 5.723549488054608, "grad_norm": 0.7953146491087755, "kl": 0.3681640625, "learning_rate": 5.244596131968145e-07, "loss": 0.0004, "reward": 1.7298177480697632, "reward_std": 0.02353951707482338, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298176884651184, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 110.08594131469727, "epoch": 5.726962457337884, "grad_norm": 1.525892053582503, "kl": 0.37109375, "learning_rate": 5.241751990898749e-07, "loss": 0.0004, "reward": 1.708984375, "reward_std": 0.046927180141210556, "rewards/format_reward": 1.0, "rewards/score_reward": 0.708984375, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 109.53125, "epoch": 5.73037542662116, "grad_norm": 0.6439469727802187, "kl": 0.3740234375, "learning_rate": 5.238907849829351e-07, "loss": 0.0004, "reward": 1.8098958134651184, "reward_std": 0.017009655013680458, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8125, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 108.79427337646484, "epoch": 5.733788395904437, "grad_norm": 1.2093539772299442, "kl": 0.361328125, "learning_rate": 5.236063708759954e-07, "loss": 0.0004, "reward": 1.75, "reward_std": 0.02906378824263811, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 112.03906631469727, "epoch": 5.737201365187714, "grad_norm": 0.5307566664535591, "kl": 0.38671875, "learning_rate": 5.233219567690558e-07, "loss": 0.0004, "reward": 1.7962239980697632, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7962239384651184, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 110.3828125, "epoch": 5.7406143344709895, "grad_norm": 0.5062349740153232, "kl": 0.3720703125, "learning_rate": 5.23037542662116e-07, "loss": 0.0004, "reward": 1.740234375, "reward_std": 0.015319676604121923, "rewards/format_reward": 1.0, "rewards/score_reward": 0.740234375, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 111.91667175292969, "epoch": 5.744027303754266, "grad_norm": 1.2069627360988986, "kl": 0.3623046875, "learning_rate": 5.227531285551763e-07, "loss": 0.0004, "reward": 1.7734375, "reward_std": 0.044819487258791924, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7734375, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 111.33854293823242, "epoch": 5.747440273037543, "grad_norm": 1.2242139655533015, "kl": 0.3681640625, "learning_rate": 5.224687144482366e-07, "loss": 0.0004, "reward": 1.7942708134651184, "reward_std": 0.037853602319955826, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708432674408, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 109.41927337646484, "epoch": 5.750853242320819, "grad_norm": 1.307103412487941, "kl": 0.373046875, "learning_rate": 5.221843003412969e-07, "loss": 0.0004, "reward": 1.7623697519302368, "reward_std": 0.019135249312967062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623698115348816, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 110.01823425292969, "epoch": 5.7542662116040955, "grad_norm": 1.2153203028487336, "kl": 0.373046875, "learning_rate": 5.218998862343572e-07, "loss": 0.0004, "reward": 1.775390625, "reward_std": 0.04072967916727066, "rewards/format_reward": 1.0, "rewards/score_reward": 0.775390625, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 110.1328125, "epoch": 5.757679180887372, "grad_norm": 1.1677659962191826, "kl": 0.3642578125, "learning_rate": 5.216154721274175e-07, "loss": 0.0004, "reward": 1.796875, "reward_std": 0.028778147883713245, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 111.703125, "epoch": 5.761092150170649, "grad_norm": 0.828071123881288, "kl": 0.36328125, "learning_rate": 5.213310580204779e-07, "loss": 0.0004, "reward": 1.8098958134651184, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958432674408, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 110.83073043823242, "epoch": 5.764505119453925, "grad_norm": 0.4002800303117734, "kl": 0.37109375, "learning_rate": 5.210466439135381e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 112.09114837646484, "epoch": 5.7679180887372015, "grad_norm": 1.9853407755897783, "kl": 0.3828125, "learning_rate": 5.207622298065983e-07, "loss": 0.0004, "reward": 1.7825521230697632, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520634651184, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 113.29166793823242, "epoch": 5.771331058020478, "grad_norm": 1.0351814481299046, "kl": 0.375, "learning_rate": 5.204778156996587e-07, "loss": 0.0004, "reward": 1.8033853769302368, "reward_std": 0.05236797407269478, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8033854365348816, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 111.23698043823242, "epoch": 5.774744027303754, "grad_norm": 0.47188249826220713, "kl": 0.3681640625, "learning_rate": 5.201934015927189e-07, "loss": 0.0004, "reward": 1.6953125, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6953125, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 111.90885925292969, "epoch": 5.778156996587031, "grad_norm": 1.8156309204329812, "kl": 0.3701171875, "learning_rate": 5.199089874857793e-07, "loss": 0.0004, "reward": 1.7552083134651184, "reward_std": 0.02919586142525077, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7552083432674408, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 109.31250381469727, "epoch": 5.7815699658703075, "grad_norm": 2.989839984923651, "kl": 0.3779296875, "learning_rate": 5.196245733788396e-07, "loss": 0.0004, "reward": 1.6920573115348816, "reward_std": 0.029785616789013147, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920572817325592, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 108.79687881469727, "epoch": 5.784982935153583, "grad_norm": 23.302535241584202, "kl": 0.3837890625, "learning_rate": 5.193401592718999e-07, "loss": 0.0004, "reward": 1.828125, "reward_std": 0.025647209491580725, "rewards/format_reward": 1.0, "rewards/score_reward": 0.828125, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 110.13021087646484, "epoch": 5.78839590443686, "grad_norm": 2.6234827934134493, "kl": 0.3603515625, "learning_rate": 5.190557451649602e-07, "loss": 0.0004, "reward": 1.8600260019302368, "reward_std": 0.05190593935549259, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8600260615348816, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 110.24739837646484, "epoch": 5.791808873720137, "grad_norm": 1.195247342626533, "kl": 0.369140625, "learning_rate": 5.187713310580204e-07, "loss": 0.0004, "reward": 1.7141926884651184, "reward_std": 0.03090521227568388, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7141927182674408, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 107.14323425292969, "epoch": 5.795221843003413, "grad_norm": 1.0975003917098762, "kl": 0.37109375, "learning_rate": 5.184869169510807e-07, "loss": 0.0004, "reward": 1.8385416865348816, "reward_std": 0.032461813651025295, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8385416567325592, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 111.38542175292969, "epoch": 5.798634812286689, "grad_norm": 1.1104908346087363, "kl": 0.37109375, "learning_rate": 5.18202502844141e-07, "loss": 0.0004, "reward": 1.8020833134651184, "reward_std": 0.03649745415896177, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833432674408, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 109.62760543823242, "epoch": 5.802047781569966, "grad_norm": 1.142630699606146, "kl": 0.3662109375, "learning_rate": 5.179180887372014e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.02551448345184326, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71875, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 108.94792175292969, "epoch": 5.805460750853243, "grad_norm": 1.2467570004537605, "kl": 0.36328125, "learning_rate": 5.176336746302616e-07, "loss": 0.0004, "reward": 1.7376302480697632, "reward_std": 0.042477527633309364, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7376301884651184, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 112.00260543823242, "epoch": 5.808873720136519, "grad_norm": 2.6879171685369, "kl": 0.3671875, "learning_rate": 5.173492605233219e-07, "loss": 0.0004, "reward": 1.7805989384651184, "reward_std": 0.05810926295816898, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7805989682674408, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 110.15885543823242, "epoch": 5.812286689419795, "grad_norm": 0.8895341142311269, "kl": 0.380859375, "learning_rate": 5.170648464163823e-07, "loss": 0.0004, "reward": 1.8268228769302368, "reward_std": 0.03090480249375105, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229365348816, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 111.15104293823242, "epoch": 5.815699658703072, "grad_norm": 0.5566684652848956, "kl": 0.3505859375, "learning_rate": 5.167804323094426e-07, "loss": 0.0004, "reward": 1.818359375, "reward_std": 0.022533927112817764, "rewards/format_reward": 1.0, "rewards/score_reward": 0.818359375, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 111.75260925292969, "epoch": 5.819112627986348, "grad_norm": 0.4973464406044998, "kl": 0.36328125, "learning_rate": 5.164960182025028e-07, "loss": 0.0004, "reward": 1.82421875, "reward_std": 0.014598664827644825, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8268229067325592, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 111.86458587646484, "epoch": 5.822525597269625, "grad_norm": 1.0437703702190728, "kl": 0.3828125, "learning_rate": 5.162116040955631e-07, "loss": 0.0004, "reward": 1.724609375, "reward_std": 0.016306545585393906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.724609375, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 114.82291793823242, "epoch": 5.825938566552901, "grad_norm": 0.6053603669715805, "kl": 0.376953125, "learning_rate": 5.159271899886234e-07, "loss": 0.0004, "reward": 1.7740885019302368, "reward_std": 0.020255661569535732, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7740885615348816, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 114.25521087646484, "epoch": 5.829351535836177, "grad_norm": 0.6910901933511766, "kl": 0.3564453125, "learning_rate": 5.156427758816837e-07, "loss": 0.0004, "reward": 1.8489583134651184, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8489583432674408, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 111.75260543823242, "epoch": 5.832764505119454, "grad_norm": 0.6735364811037858, "kl": 0.3544921875, "learning_rate": 5.15358361774744e-07, "loss": 0.0004, "reward": 1.8125, "reward_std": 0.02437535021454096, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8125, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 112.40625381469727, "epoch": 5.836177474402731, "grad_norm": 0.38123168130959206, "kl": 0.3486328125, "learning_rate": 5.150739476678044e-07, "loss": 0.0003, "reward": 1.8391927480697632, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8391926884651184, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 114.97135543823242, "epoch": 5.839590443686006, "grad_norm": 0.5308196152942457, "kl": 0.3701171875, "learning_rate": 5.147895335608646e-07, "loss": 0.0004, "reward": 1.662109375, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.662109375, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 114.63541793823242, "epoch": 5.843003412969283, "grad_norm": 1.5063701662399942, "kl": 0.3828125, "learning_rate": 5.145051194539249e-07, "loss": 0.0004, "reward": 1.7135416865348816, "reward_std": 0.056988444179296494, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7161458134651184, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 114.53385543823242, "epoch": 5.84641638225256, "grad_norm": 1.2919022851070103, "kl": 0.376953125, "learning_rate": 5.142207053469852e-07, "loss": 0.0004, "reward": 1.744140625, "reward_std": 0.05938030034303665, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 113.88802337646484, "epoch": 5.849829351535837, "grad_norm": 0.5389286181480434, "kl": 0.3515625, "learning_rate": 5.139362912400454e-07, "loss": 0.0004, "reward": 1.7408854365348816, "reward_std": 0.022950578946620226, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7408854067325592, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 117.84375, "epoch": 5.853242320819112, "grad_norm": 0.6508986269949955, "kl": 0.3603515625, "learning_rate": 5.136518771331058e-07, "loss": 0.0004, "reward": 1.7845052480697632, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845051884651184, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 116.13021087646484, "epoch": 5.856655290102389, "grad_norm": 1.0011741762745832, "kl": 0.3642578125, "learning_rate": 5.133674630261661e-07, "loss": 0.0004, "reward": 1.7526041865348816, "reward_std": 0.022097086533904076, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7552083134651184, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 116.42187881469727, "epoch": 5.860068259385666, "grad_norm": 0.6695577104489921, "kl": 0.3642578125, "learning_rate": 5.130830489192263e-07, "loss": 0.0004, "reward": 1.7903645634651184, "reward_std": 0.025380939245224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7903645932674408, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 118.26041793823242, "epoch": 5.863481228668942, "grad_norm": 0.7064137458896338, "kl": 0.3671875, "learning_rate": 5.127986348122867e-07, "loss": 0.0004, "reward": 1.7858072519302368, "reward_std": 0.032462057657539845, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7858073115348816, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 116.53906631469727, "epoch": 5.8668941979522184, "grad_norm": 1.8606074260764849, "kl": 0.42578125, "learning_rate": 5.12514220705347e-07, "loss": 0.0004, "reward": 1.6236979365348816, "reward_std": 0.014731390867382288, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6263020783662796, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 118.60416793823242, "epoch": 5.870307167235495, "grad_norm": 0.7156936292879228, "kl": 0.3525390625, "learning_rate": 5.122298065984073e-07, "loss": 0.0004, "reward": 1.8274739980697632, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8274739384651184, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 119.40625381469727, "epoch": 5.873720136518771, "grad_norm": 1.1406654146914137, "kl": 0.3671875, "learning_rate": 5.119453924914675e-07, "loss": 0.0004, "reward": 1.7760416269302368, "reward_std": 0.07513722311705351, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7786458432674408, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 116.80208587646484, "epoch": 5.877133105802048, "grad_norm": 0.8353624196030518, "kl": 0.359375, "learning_rate": 5.116609783845278e-07, "loss": 0.0004, "reward": 1.7395833134651184, "reward_std": 0.027791929431259632, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833432674408, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 115.70052337646484, "epoch": 5.8805460750853245, "grad_norm": 1.7665123343964844, "kl": 0.359375, "learning_rate": 5.113765642775881e-07, "loss": 0.0004, "reward": 1.689453125, "reward_std": 0.04948961175978184, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6920572817325592, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 116.84375381469727, "epoch": 5.8839590443686, "grad_norm": 0.43764802289031585, "kl": 0.3583984375, "learning_rate": 5.110921501706484e-07, "loss": 0.0004, "reward": 1.7486978769302368, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7486979365348816, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 117.203125, "epoch": 5.887372013651877, "grad_norm": 0.016074068847456623, "kl": 0.3662109375, "learning_rate": 5.108077360637088e-07, "loss": 0.0004, "reward": 1.7447916865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916567325592, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 115.31510543823242, "epoch": 5.890784982935154, "grad_norm": 0.6087282094165196, "kl": 0.3623046875, "learning_rate": 5.105233219567691e-07, "loss": 0.0004, "reward": 1.7845051884651184, "reward_std": 0.018868980929255486, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 113.66666793823242, "epoch": 5.8941979522184305, "grad_norm": 0.5814466029320885, "kl": 0.3525390625, "learning_rate": 5.102389078498293e-07, "loss": 0.0004, "reward": 1.7786458730697632, "reward_std": 0.03323297016322613, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.78125, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 115.34375, "epoch": 5.897610921501706, "grad_norm": 1.7702638603512835, "kl": 0.36328125, "learning_rate": 5.099544937428896e-07, "loss": 0.0004, "reward": 1.7278646230697632, "reward_std": 0.05545113515108824, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7278645634651184, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 114.79948043823242, "epoch": 5.901023890784983, "grad_norm": 0.5935477145696186, "kl": 0.37109375, "learning_rate": 5.096700796359499e-07, "loss": 0.0004, "reward": 1.6829426884651184, "reward_std": 0.022401200607419014, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6829427182674408, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 115.11719131469727, "epoch": 5.90443686006826, "grad_norm": 1.208940114874043, "kl": 0.36328125, "learning_rate": 5.093856655290102e-07, "loss": 0.0004, "reward": 1.716796875, "reward_std": 0.05359033774584532, "rewards/format_reward": 1.0, "rewards/score_reward": 0.716796875, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 116.13021087646484, "epoch": 5.907849829351536, "grad_norm": 1.1888277638643698, "kl": 0.3681640625, "learning_rate": 5.091012514220705e-07, "loss": 0.0004, "reward": 1.7376301884651184, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7376302182674408, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 116.86458587646484, "epoch": 5.911262798634812, "grad_norm": 1.257176001511015, "kl": 0.37109375, "learning_rate": 5.088168373151308e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.04294103384017944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 117.39844131469727, "epoch": 5.914675767918089, "grad_norm": 0.3087277339224398, "kl": 0.37890625, "learning_rate": 5.085324232081911e-07, "loss": 0.0004, "reward": 1.6529947519302368, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6529948115348816, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 117.96354293823242, "epoch": 5.918088737201365, "grad_norm": 0.6204227058492493, "kl": 0.36328125, "learning_rate": 5.082480091012514e-07, "loss": 0.0004, "reward": 1.8046875, "reward_std": 0.008351913653314114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 114.98958587646484, "epoch": 5.921501706484642, "grad_norm": 0.4501176541803342, "kl": 0.359375, "learning_rate": 5.079635949943118e-07, "loss": 0.0004, "reward": 1.7923177480697632, "reward_std": 0.010061264038085938, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923176884651184, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 114.85677337646484, "epoch": 5.924914675767918, "grad_norm": 0.795492702030259, "kl": 0.361328125, "learning_rate": 5.076791808873719e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.02496363501995802, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 116.95052337646484, "epoch": 5.928327645051194, "grad_norm": 0.8987643322949439, "kl": 0.3662109375, "learning_rate": 5.073947667804322e-07, "loss": 0.0004, "reward": 1.845703125, "reward_std": 0.03090521227568388, "rewards/format_reward": 1.0, "rewards/score_reward": 0.845703125, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 115.26302337646484, "epoch": 5.931740614334471, "grad_norm": 0.5499104527370882, "kl": 0.3564453125, "learning_rate": 5.071103526734926e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.018281513825058937, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 113.74479293823242, "epoch": 5.935153583617748, "grad_norm": 0.009570243171245366, "kl": 0.3857421875, "learning_rate": 5.068259385665528e-07, "loss": 0.0004, "reward": 1.7604166865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7604166567325592, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 117.40364837646484, "epoch": 5.938566552901024, "grad_norm": 2.2046840118962496, "kl": 0.3671875, "learning_rate": 5.065415244596132e-07, "loss": 0.0004, "reward": 1.7109375, "reward_std": 0.03678309544920921, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 113.84375381469727, "epoch": 5.9419795221843, "grad_norm": 1.1000791269119252, "kl": 0.38671875, "learning_rate": 5.062571103526735e-07, "loss": 0.0004, "reward": 1.7233073115348816, "reward_std": 0.037265317514538765, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7259114384651184, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 117.14323043823242, "epoch": 5.945392491467577, "grad_norm": 0.9893488814619079, "kl": 0.3671875, "learning_rate": 5.059726962457338e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.029063787311315536, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 115.34635925292969, "epoch": 5.948805460750854, "grad_norm": 1.33250763339306, "kl": 0.375, "learning_rate": 5.056882821387941e-07, "loss": 0.0004, "reward": 1.7942708134651184, "reward_std": 0.030487907119095325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708432674408, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 113.31771087646484, "epoch": 5.952218430034129, "grad_norm": 0.85990177129801, "kl": 0.3662109375, "learning_rate": 5.054038680318543e-07, "loss": 0.0004, "reward": 1.6927083134651184, "reward_std": 0.04019967373460531, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6927083432674408, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 114.71354293823242, "epoch": 5.955631399317406, "grad_norm": 3.410183018300222, "kl": 0.359375, "learning_rate": 5.051194539249146e-07, "loss": 0.0004, "reward": 1.77734375, "reward_std": 0.023760643787682056, "rewards/format_reward": 1.0, "rewards/score_reward": 0.77734375, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 112.73437881469727, "epoch": 5.959044368600683, "grad_norm": 1.2563169173291375, "kl": 0.3779296875, "learning_rate": 5.048350398179749e-07, "loss": 0.0004, "reward": 1.849609375, "reward_std": 0.024025851394981146, "rewards/format_reward": 1.0, "rewards/score_reward": 0.849609375, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 116.13542175292969, "epoch": 5.962457337883959, "grad_norm": 14.097271268309072, "kl": 0.3642578125, "learning_rate": 5.045506257110353e-07, "loss": 0.0004, "reward": 1.6829426884651184, "reward_std": 0.058431691490113735, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6829427182674408, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 112.67708587646484, "epoch": 5.965870307167235, "grad_norm": 1.5092259268850012, "kl": 0.375, "learning_rate": 5.042662116040956e-07, "loss": 0.0004, "reward": 1.8463541865348816, "reward_std": 0.035175526048988104, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8463541567325592, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 114.04948043823242, "epoch": 5.969283276450512, "grad_norm": 4.920078203973983, "kl": 0.376953125, "learning_rate": 5.039817974971558e-07, "loss": 0.0004, "reward": 1.6764322519302368, "reward_std": 0.04826847463846207, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6764323115348816, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 113.70052337646484, "epoch": 5.972696245733788, "grad_norm": 0.6420964248346505, "kl": 0.359375, "learning_rate": 5.036973833902162e-07, "loss": 0.0004, "reward": 1.744140625, "reward_std": 0.023539515677839518, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 117.08333587646484, "epoch": 5.976109215017065, "grad_norm": 0.9186713285528428, "kl": 0.3623046875, "learning_rate": 5.034129692832765e-07, "loss": 0.0004, "reward": 1.7194010615348816, "reward_std": 0.052690641954541206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7194010317325592, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 113.30989837646484, "epoch": 5.979522184300341, "grad_norm": 1.9248624687038571, "kl": 0.3662109375, "learning_rate": 5.031285551763367e-07, "loss": 0.0004, "reward": 1.6712239384651184, "reward_std": 0.04925469495356083, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6712239682674408, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 113.02604293823242, "epoch": 5.982935153583618, "grad_norm": 0.5311032522046372, "kl": 0.375, "learning_rate": 5.02844141069397e-07, "loss": 0.0004, "reward": 1.8196614980697632, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8196614384651184, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 111.43489837646484, "epoch": 5.986348122866894, "grad_norm": 0.9706200506462935, "kl": 0.37109375, "learning_rate": 5.025597269624573e-07, "loss": 0.0004, "reward": 1.6595051884651184, "reward_std": 0.061980994418263435, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595052182674408, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 113.05729293823242, "epoch": 5.989761092150171, "grad_norm": 0.1434819257730738, "kl": 0.3662109375, "learning_rate": 5.022753128555176e-07, "loss": 0.0004, "reward": 1.75, "reward_std": 0.014731390401721, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7526041865348816, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 112.64062881469727, "epoch": 5.993174061433447, "grad_norm": 0.8987875097505652, "kl": 0.3662109375, "learning_rate": 5.019908987485779e-07, "loss": 0.0004, "reward": 1.7513020634651184, "reward_std": 0.023254937957972288, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7513020932674408, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 117.46667098999023, "epoch": 5.996587030716723, "grad_norm": 0.6619580761541393, "kl": 0.3681640625, "learning_rate": 5.017064846416383e-07, "loss": 0.0004, "reward": 1.5500001311302185, "reward_std": 0.051754917949438095, "rewards/format_reward": 1.0, "rewards/score_reward": 0.550000011920929, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 111.92708587646484, "epoch": 6.003412969283277, "grad_norm": 0.6712370540956257, "kl": 0.3701171875, "learning_rate": 5.014220705346985e-07, "loss": 0.0004, "reward": 1.787109375, "reward_std": 0.022533926647156477, "rewards/format_reward": 1.0, "rewards/score_reward": 0.787109375, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 114.04427337646484, "epoch": 6.006825938566553, "grad_norm": 0.3502069053639361, "kl": 0.353515625, "learning_rate": 5.011376564277588e-07, "loss": 0.0004, "reward": 1.875, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.875, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 115.07552337646484, "epoch": 6.010238907849829, "grad_norm": 0.8010100008430074, "kl": 0.3740234375, "learning_rate": 5.008532423208191e-07, "loss": 0.0004, "reward": 1.7135416865348816, "reward_std": 0.02663342608138919, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 113.93750381469727, "epoch": 6.013651877133106, "grad_norm": 1.161778343900581, "kl": 0.36328125, "learning_rate": 5.005688282138793e-07, "loss": 0.0004, "reward": 1.837890625, "reward_std": 0.016572814900428057, "rewards/format_reward": 1.0, "rewards/score_reward": 0.837890625, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 114.74739837646484, "epoch": 6.017064846416382, "grad_norm": 1.0251781971063354, "kl": 0.37109375, "learning_rate": 5.002844141069397e-07, "loss": 0.0004, "reward": 1.7981771230697632, "reward_std": 0.015584883745759726, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7981770634651184, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 115.40364837646484, "epoch": 6.020477815699659, "grad_norm": 0.7804819393941579, "kl": 0.361328125, "learning_rate": 5e-07, "loss": 0.0004, "reward": 1.736328125, "reward_std": 0.020122935995459557, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 115.34635543823242, "epoch": 6.023890784982935, "grad_norm": 0.5013309143273186, "kl": 0.35546875, "learning_rate": 4.997155858930602e-07, "loss": 0.0004, "reward": 1.7734375, "reward_std": 0.031741044484078884, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7760416567325592, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 115.79687881469727, "epoch": 6.027303754266212, "grad_norm": 0.8439643691796449, "kl": 0.365234375, "learning_rate": 4.994311717861205e-07, "loss": 0.0004, "reward": 1.8098958134651184, "reward_std": 0.024746861308813095, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958432674408, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 116.49739837646484, "epoch": 6.030716723549488, "grad_norm": 0.5837244506452273, "kl": 0.3603515625, "learning_rate": 4.991467576791809e-07, "loss": 0.0004, "reward": 1.8606770634651184, "reward_std": 0.023254938423633575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8606770932674408, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 115.8671875, "epoch": 6.034129692832765, "grad_norm": 2.4217354280829806, "kl": 0.3671875, "learning_rate": 4.988623435722411e-07, "loss": 0.0004, "reward": 1.7877604365348816, "reward_std": 0.023254937492311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7877604067325592, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 115.63542175292969, "epoch": 6.037542662116041, "grad_norm": 0.6505223845976138, "kl": 0.3740234375, "learning_rate": 4.985779294653015e-07, "loss": 0.0004, "reward": 1.7942708730697632, "reward_std": 0.03318306803703308, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708134651184, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 116.13021087646484, "epoch": 6.040955631399317, "grad_norm": 0.7768389767825354, "kl": 0.3642578125, "learning_rate": 4.982935153583617e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.029462780803442, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 116.36458587646484, "epoch": 6.044368600682594, "grad_norm": 3.0175956508284045, "kl": 0.3662109375, "learning_rate": 4.980091012514221e-07, "loss": 0.0004, "reward": 1.779296875, "reward_std": 0.06376519985496998, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 113.94010543823242, "epoch": 6.047781569965871, "grad_norm": 0.7793945251746788, "kl": 0.373046875, "learning_rate": 4.977246871444823e-07, "loss": 0.0004, "reward": 1.7916666865348816, "reward_std": 0.04806606750935316, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 116.08594131469727, "epoch": 6.051194539249146, "grad_norm": 0.7318605984754099, "kl": 0.3740234375, "learning_rate": 4.974402730375427e-07, "loss": 0.0004, "reward": 1.7916666865348816, "reward_std": 0.05228063277900219, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7942708432674408, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 114.37239837646484, "epoch": 6.054607508532423, "grad_norm": 0.36759451701914847, "kl": 0.3818359375, "learning_rate": 4.97155858930603e-07, "loss": 0.0004, "reward": 1.771484375, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 115.63541793823242, "epoch": 6.0580204778157, "grad_norm": 0.42497922746367933, "kl": 0.3662109375, "learning_rate": 4.968714448236632e-07, "loss": 0.0004, "reward": 1.8098958730697632, "reward_std": 0.014465940184891224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958134651184, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 115.89844131469727, "epoch": 6.061433447098976, "grad_norm": 0.5811422177618528, "kl": 0.36328125, "learning_rate": 4.965870307167235e-07, "loss": 0.0004, "reward": 1.8248698115348816, "reward_std": 0.02012293692678213, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8248697817325592, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 115.84635925292969, "epoch": 6.064846416382252, "grad_norm": 0.4253444165248577, "kl": 0.357421875, "learning_rate": 4.963026166097839e-07, "loss": 0.0004, "reward": 1.837890625, "reward_std": 0.02402585092931986, "rewards/format_reward": 1.0, "rewards/score_reward": 0.837890625, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 114.20312881469727, "epoch": 6.068259385665529, "grad_norm": 1.1168901382970866, "kl": 0.36328125, "learning_rate": 4.960182025028441e-07, "loss": 0.0004, "reward": 1.8131510019302368, "reward_std": 0.027621357701718807, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8157552182674408, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 116.54427337646484, "epoch": 6.071672354948806, "grad_norm": 1.243730467538832, "kl": 0.3544921875, "learning_rate": 4.957337883959044e-07, "loss": 0.0004, "reward": 1.833984375, "reward_std": 0.03699986543506384, "rewards/format_reward": 1.0, "rewards/score_reward": 0.833984375, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 116.54166793823242, "epoch": 6.075085324232082, "grad_norm": 1.3624139287221775, "kl": 0.365234375, "learning_rate": 4.954493742889647e-07, "loss": 0.0004, "reward": 1.73046875, "reward_std": 0.06661384738981724, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7330729067325592, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 116.59635543823242, "epoch": 6.078498293515358, "grad_norm": 3.05298960269557, "kl": 0.3671875, "learning_rate": 4.95164960182025e-07, "loss": 0.0004, "reward": 1.712890625, "reward_std": 0.02765879686921835, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 116.11198043823242, "epoch": 6.081911262798635, "grad_norm": 0.800526331462666, "kl": 0.3662109375, "learning_rate": 4.948805460750853e-07, "loss": 0.0004, "reward": 1.82421875, "reward_std": 0.02573454938828945, "rewards/format_reward": 1.0, "rewards/score_reward": 0.82421875, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 118.234375, "epoch": 6.085324232081911, "grad_norm": 0.34525650133485736, "kl": 0.353515625, "learning_rate": 4.945961319681456e-07, "loss": 0.0004, "reward": 1.6979166269302368, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6979166865348816, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 116.65885925292969, "epoch": 6.088737201365188, "grad_norm": 1.2520498651334575, "kl": 0.3583984375, "learning_rate": 4.943117178612059e-07, "loss": 0.0004, "reward": 1.8079427480697632, "reward_std": 0.028607575222849846, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8079426884651184, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 116.80469131469727, "epoch": 6.092150170648464, "grad_norm": 3.7449088968729476, "kl": 0.3505859375, "learning_rate": 4.940273037542662e-07, "loss": 0.0004, "reward": 1.8131510615348816, "reward_std": 0.025950506329536438, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8131510317325592, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 115.38281631469727, "epoch": 6.09556313993174, "grad_norm": 1.0156956347022297, "kl": 0.3662109375, "learning_rate": 4.937428896473265e-07, "loss": 0.0004, "reward": 1.7786458134651184, "reward_std": 0.05163474380970001, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7786458432674408, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 117.98958587646484, "epoch": 6.098976109215017, "grad_norm": 1.9761678852325053, "kl": 0.3583984375, "learning_rate": 4.934584755403867e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.03314562886953354, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7864583134651184, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 114.859375, "epoch": 6.102389078498294, "grad_norm": 0.9628748079255213, "kl": 0.36328125, "learning_rate": 4.931740614334471e-07, "loss": 0.0004, "reward": 1.7467448115348816, "reward_std": 0.03301037289202213, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467447817325592, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 115.49739837646484, "epoch": 6.1058020477815695, "grad_norm": 0.5110093456022221, "kl": 0.359375, "learning_rate": 4.928896473265074e-07, "loss": 0.0004, "reward": 1.6822916269302368, "reward_std": 0.030423804186284542, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6822916865348816, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 117.69791793823242, "epoch": 6.109215017064846, "grad_norm": 1.5491859433621502, "kl": 0.3603515625, "learning_rate": 4.926052332195676e-07, "loss": 0.0004, "reward": 1.7291666865348816, "reward_std": 0.023254937492311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666567325592, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 116.76823043823242, "epoch": 6.112627986348123, "grad_norm": 0.7531106525221075, "kl": 0.3583984375, "learning_rate": 4.923208191126279e-07, "loss": 0.0004, "reward": 1.7890625, "reward_std": 0.03785359859466553, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7890625, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 115.00521087646484, "epoch": 6.1160409556314, "grad_norm": 0.797449130831691, "kl": 0.3427734375, "learning_rate": 4.920364050056883e-07, "loss": 0.0003, "reward": 1.763671875, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.763671875, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 115.59635925292969, "epoch": 6.1194539249146755, "grad_norm": 1.1346031764088704, "kl": 0.357421875, "learning_rate": 4.917519908987486e-07, "loss": 0.0004, "reward": 1.853515625, "reward_std": 0.04011191939935088, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8561197817325592, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 115.94010543823242, "epoch": 6.122866894197952, "grad_norm": 0.8237881376212303, "kl": 0.349609375, "learning_rate": 4.914675767918088e-07, "loss": 0.0004, "reward": 1.7350260615348816, "reward_std": 0.030051068402826786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 117.86198425292969, "epoch": 6.126279863481229, "grad_norm": 2.1199697547752585, "kl": 0.3466796875, "learning_rate": 4.911831626848692e-07, "loss": 0.0003, "reward": 1.75390625, "reward_std": 0.052499230951070786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75390625, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 115.66927337646484, "epoch": 6.129692832764505, "grad_norm": 0.19954036722274748, "kl": 0.3720703125, "learning_rate": 4.908987485779295e-07, "loss": 0.0004, "reward": 1.7317708134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7317708432674408, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 115.26041793823242, "epoch": 6.1331058020477816, "grad_norm": 0.3624357467059968, "kl": 0.359375, "learning_rate": 4.906143344709897e-07, "loss": 0.0004, "reward": 1.8815104365348816, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8815104067325592, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 114.68229293823242, "epoch": 6.136518771331058, "grad_norm": 0.3969743272891493, "kl": 0.3955078125, "learning_rate": 4.9032992036405e-07, "loss": 0.0004, "reward": 1.685546875, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.685546875, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 114.63021087646484, "epoch": 6.139931740614334, "grad_norm": 0.28934680849098793, "kl": 0.3603515625, "learning_rate": 4.900455062571104e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 117.30208587646484, "epoch": 6.143344709897611, "grad_norm": 1.0053035201228944, "kl": 0.34765625, "learning_rate": 4.897610921501706e-07, "loss": 0.0003, "reward": 1.8118489384651184, "reward_std": 0.04563660081475973, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8118489682674408, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 117.70573043823242, "epoch": 6.146757679180888, "grad_norm": 3.7861730420588673, "kl": 0.3740234375, "learning_rate": 4.894766780432309e-07, "loss": 0.0004, "reward": 1.7897135615348816, "reward_std": 0.031171479262411594, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 117.14323043823242, "epoch": 6.150170648464163, "grad_norm": 0.9195285402959936, "kl": 0.3603515625, "learning_rate": 4.891922639362912e-07, "loss": 0.0004, "reward": 1.6484375, "reward_std": 0.03897254727780819, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6484375, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 116.42708587646484, "epoch": 6.15358361774744, "grad_norm": 3.1007113594465463, "kl": 0.353515625, "learning_rate": 4.889078498293515e-07, "loss": 0.0004, "reward": 1.7389323115348816, "reward_std": 0.049206264317035675, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389322817325592, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 115.17187881469727, "epoch": 6.156996587030717, "grad_norm": 0.8539092332853655, "kl": 0.34765625, "learning_rate": 4.886234357224118e-07, "loss": 0.0003, "reward": 1.7428385019302368, "reward_std": 0.03574591036885977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385615348816, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 119.07291793823242, "epoch": 6.160409556313994, "grad_norm": 2.379221808334864, "kl": 0.3505859375, "learning_rate": 4.883390216154721e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.022097086533904076, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 117.7890625, "epoch": 6.163822525597269, "grad_norm": 1.7183887856785622, "kl": 0.3583984375, "learning_rate": 4.880546075085323e-07, "loss": 0.0004, "reward": 1.6803385615348816, "reward_std": 0.040416447445750237, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6803385317325592, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 117.83594131469727, "epoch": 6.167235494880546, "grad_norm": 1.3247233026310843, "kl": 0.3525390625, "learning_rate": 4.877701934015927e-07, "loss": 0.0004, "reward": 1.7623698115348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623697817325592, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 115.72135543823242, "epoch": 6.170648464163823, "grad_norm": 4.485124896851899, "kl": 0.3564453125, "learning_rate": 4.87485779294653e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.04351059906184673, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 115.34114837646484, "epoch": 6.174061433447099, "grad_norm": 0.8711471792286316, "kl": 0.359375, "learning_rate": 4.872013651877132e-07, "loss": 0.0004, "reward": 1.8541666865348816, "reward_std": 0.020692503079771996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8541666567325592, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 117.45052337646484, "epoch": 6.177474402730375, "grad_norm": 1.7557507980697247, "kl": 0.353515625, "learning_rate": 4.869169510807736e-07, "loss": 0.0004, "reward": 1.71484375, "reward_std": 0.05381146818399429, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71484375, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 117.75000381469727, "epoch": 6.180887372013652, "grad_norm": 0.9797075944152879, "kl": 0.3642578125, "learning_rate": 4.866325369738339e-07, "loss": 0.0004, "reward": 1.8209635615348816, "reward_std": 0.02630411647260189, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8209635317325592, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 115.97916793823242, "epoch": 6.184300341296928, "grad_norm": 0.516758513203958, "kl": 0.3544921875, "learning_rate": 4.863481228668942e-07, "loss": 0.0004, "reward": 1.779296875, "reward_std": 0.033316200599074364, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 116.859375, "epoch": 6.187713310580205, "grad_norm": 2.703705706964435, "kl": 0.3642578125, "learning_rate": 4.860637087599544e-07, "loss": 0.0004, "reward": 1.7545572519302368, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545573115348816, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 117.71094131469727, "epoch": 6.191126279863481, "grad_norm": 0.8746648269831863, "kl": 0.3505859375, "learning_rate": 4.857792946530148e-07, "loss": 0.0003, "reward": 1.8001302480697632, "reward_std": 0.031170010566711426, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8001301884651184, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 117.45573043823242, "epoch": 6.194539249146757, "grad_norm": 0.2646079784181546, "kl": 0.361328125, "learning_rate": 4.854948805460751e-07, "loss": 0.0004, "reward": 1.8033854365348816, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8033854067325592, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 115.45833587646484, "epoch": 6.197952218430034, "grad_norm": 0.6155773173481884, "kl": 0.3525390625, "learning_rate": 4.852104664391354e-07, "loss": 0.0004, "reward": 1.7630208134651184, "reward_std": 0.03007019404321909, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208432674408, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 114.46875381469727, "epoch": 6.201365187713311, "grad_norm": 0.5324764632321405, "kl": 0.3515625, "learning_rate": 4.849260523321956e-07, "loss": 0.0004, "reward": 1.8287760615348816, "reward_std": 0.022533927112817764, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8287760317325592, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 115.03385543823242, "epoch": 6.204778156996587, "grad_norm": 1.4775801317184125, "kl": 0.349609375, "learning_rate": 4.84641638225256e-07, "loss": 0.0004, "reward": 1.85546875, "reward_std": 0.03160831984132528, "rewards/format_reward": 1.0, "rewards/score_reward": 0.85546875, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 116.40885543823242, "epoch": 6.208191126279863, "grad_norm": 0.8869855037300347, "kl": 0.3662109375, "learning_rate": 4.843572241183162e-07, "loss": 0.0004, "reward": 1.685546875, "reward_std": 0.04346521571278572, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6881510317325592, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 116.20573043823242, "epoch": 6.21160409556314, "grad_norm": 0.9187671781766541, "kl": 0.36328125, "learning_rate": 4.840728100113766e-07, "loss": 0.0004, "reward": 1.7708333134651184, "reward_std": 0.022184427361935377, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 114.82031631469727, "epoch": 6.215017064846417, "grad_norm": 0.31445660867547287, "kl": 0.3486328125, "learning_rate": 4.837883959044369e-07, "loss": 0.0003, "reward": 1.8411458730697632, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8411458134651184, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 115.35417175292969, "epoch": 6.2184300341296925, "grad_norm": 3.057598379510026, "kl": 0.3623046875, "learning_rate": 4.835039817974971e-07, "loss": 0.0004, "reward": 1.7252604365348816, "reward_std": 0.03686591610312462, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604067325592, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 114.54687881469727, "epoch": 6.221843003412969, "grad_norm": 1.1447976813170546, "kl": 0.37890625, "learning_rate": 4.832195676905574e-07, "loss": 0.0004, "reward": 1.8515625, "reward_std": 0.03770215716212988, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8515625, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 115.26041793823242, "epoch": 6.225255972696246, "grad_norm": 1.4918690437248, "kl": 0.3525390625, "learning_rate": 4.829351535836178e-07, "loss": 0.0004, "reward": 1.7721353769302368, "reward_std": 0.03912399336695671, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354365348816, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 114.89323425292969, "epoch": 6.228668941979522, "grad_norm": 0.4615206556430972, "kl": 0.3447265625, "learning_rate": 4.82650739476678e-07, "loss": 0.0003, "reward": 1.7552083134651184, "reward_std": 0.021831635385751724, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7578125, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 113.39062881469727, "epoch": 6.2320819112627985, "grad_norm": 1.376958056414641, "kl": 0.3681640625, "learning_rate": 4.823663253697383e-07, "loss": 0.0004, "reward": 1.74609375, "reward_std": 0.03296670224517584, "rewards/format_reward": 1.0, "rewards/score_reward": 0.74609375, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 114.19791793823242, "epoch": 6.235494880546075, "grad_norm": 0.007705858481443819, "kl": 0.3544921875, "learning_rate": 4.820819112627986e-07, "loss": 0.0004, "reward": 1.7395833730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833134651184, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 112.95312881469727, "epoch": 6.238907849829351, "grad_norm": 2.403783612164475, "kl": 0.3515625, "learning_rate": 4.817974971558589e-07, "loss": 0.0004, "reward": 1.8177083134651184, "reward_std": 0.05722804833203554, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083432674408, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 113.1484375, "epoch": 6.242320819112628, "grad_norm": 1.7291878418645548, "kl": 0.369140625, "learning_rate": 4.815130830489192e-07, "loss": 0.0004, "reward": 1.8170573115348816, "reward_std": 0.021697684191167355, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8170572817325592, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 113.31510543823242, "epoch": 6.2457337883959045, "grad_norm": 0.909239626608027, "kl": 0.361328125, "learning_rate": 4.812286689419795e-07, "loss": 0.0004, "reward": 1.6953125, "reward_std": 0.03524578223004937, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6953125, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 113.81510925292969, "epoch": 6.249146757679181, "grad_norm": 2.2145114056669266, "kl": 0.349609375, "learning_rate": 4.809442548350399e-07, "loss": 0.0003, "reward": 1.7734375, "reward_std": 0.05203537130728364, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7734375, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 112.02083587646484, "epoch": 6.252559726962457, "grad_norm": 0.5404802675861848, "kl": 0.359375, "learning_rate": 4.806598407281001e-07, "loss": 0.0004, "reward": 1.7630208134651184, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208432674408, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 114.78125381469727, "epoch": 6.255972696245734, "grad_norm": 0.602112111774833, "kl": 0.373046875, "learning_rate": 4.803754266211604e-07, "loss": 0.0004, "reward": 1.7291666865348816, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666567325592, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 115.54427337646484, "epoch": 6.2593856655290105, "grad_norm": 0.9505500608674755, "kl": 0.3427734375, "learning_rate": 4.800910125142207e-07, "loss": 0.0003, "reward": 1.8522135019302368, "reward_std": 0.03331619966775179, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8522135615348816, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 115.72396087646484, "epoch": 6.262798634812286, "grad_norm": 1.206074091228104, "kl": 0.3486328125, "learning_rate": 4.79806598407281e-07, "loss": 0.0003, "reward": 1.7565103769302368, "reward_std": 0.04109854530543089, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104365348816, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 114.97396087646484, "epoch": 6.266211604095563, "grad_norm": 0.9306193818139994, "kl": 0.3359375, "learning_rate": 4.795221843003413e-07, "loss": 0.0003, "reward": 1.7220051884651184, "reward_std": 0.036429072730243206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 111.77083587646484, "epoch": 6.26962457337884, "grad_norm": 0.8952495828167948, "kl": 0.361328125, "learning_rate": 4.792377701934016e-07, "loss": 0.0004, "reward": 1.7454426884651184, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7454427182674408, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 112.80729293823242, "epoch": 6.273037542662116, "grad_norm": 0.8496967689539493, "kl": 0.359375, "learning_rate": 4.789533560864618e-07, "loss": 0.0004, "reward": 1.6907551884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 114.03906631469727, "epoch": 6.276450511945392, "grad_norm": 1.1316057150020251, "kl": 0.3505859375, "learning_rate": 4.786689419795222e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.046470724046230316, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 115.36198043823242, "epoch": 6.279863481228669, "grad_norm": 0.5305299900108075, "kl": 0.3525390625, "learning_rate": 4.783845278725825e-07, "loss": 0.0004, "reward": 1.7532552480697632, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7532551884651184, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 110.45573425292969, "epoch": 6.283276450511945, "grad_norm": 3.5004520174754346, "kl": 0.35546875, "learning_rate": 4.781001137656427e-07, "loss": 0.0004, "reward": 1.7408854365348816, "reward_std": 0.018015244510024786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7408854067325592, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 115.10677337646484, "epoch": 6.286689419795222, "grad_norm": 1.2443282865056347, "kl": 0.3505859375, "learning_rate": 4.778156996587031e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.03997960453853011, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 113.50260543823242, "epoch": 6.290102389078498, "grad_norm": 0.5616906091254823, "kl": 0.359375, "learning_rate": 4.775312855517634e-07, "loss": 0.0004, "reward": 1.697265625, "reward_std": 0.001841423800215125, "rewards/format_reward": 1.0, "rewards/score_reward": 0.697265625, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 112.1953125, "epoch": 6.293515358361775, "grad_norm": 0.01942733415528187, "kl": 0.337890625, "learning_rate": 4.772468714448236e-07, "loss": 0.0003, "reward": 1.8177083134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083432674408, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 114.44271087646484, "epoch": 6.296928327645051, "grad_norm": 1.5107746874874979, "kl": 0.3544921875, "learning_rate": 4.769624573378839e-07, "loss": 0.0004, "reward": 1.8235677480697632, "reward_std": 0.016306545585393906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8235676884651184, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 113.55469131469727, "epoch": 6.300341296928328, "grad_norm": 2.2896510663828558, "kl": 0.375, "learning_rate": 4.766780432309442e-07, "loss": 0.0004, "reward": 1.6901041865348816, "reward_std": 0.06463912688195705, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6901041567325592, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 113.63802337646484, "epoch": 6.303754266211604, "grad_norm": 6.572949297092353, "kl": 0.353515625, "learning_rate": 4.7639362912400454e-07, "loss": 0.0004, "reward": 1.7955729365348816, "reward_std": 0.0341886542737484, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7955729067325592, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 115.13541793823242, "epoch": 6.30716723549488, "grad_norm": 5.832132825818439, "kl": 0.345703125, "learning_rate": 4.761092150170648e-07, "loss": 0.0003, "reward": 1.7057291865348816, "reward_std": 0.05646107904613018, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7057291567325592, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 116.109375, "epoch": 6.310580204778157, "grad_norm": 1.0438336534514217, "kl": 0.353515625, "learning_rate": 4.758248009101251e-07, "loss": 0.0004, "reward": 1.8404948115348816, "reward_std": 0.032245039474219084, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8404947817325592, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 115.77864837646484, "epoch": 6.313993174061434, "grad_norm": 4.051481352720465, "kl": 0.349609375, "learning_rate": 4.7554038680318545e-07, "loss": 0.0004, "reward": 1.7740885615348816, "reward_std": 0.07364876382052898, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7766927182674408, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 113.01823425292969, "epoch": 6.3174061433447095, "grad_norm": 0.7169433670811868, "kl": 0.36328125, "learning_rate": 4.752559726962457e-07, "loss": 0.0004, "reward": 1.7819010019302368, "reward_std": 0.0272208945825696, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7819010615348816, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 116.20052337646484, "epoch": 6.320819112627986, "grad_norm": 25.242648003888497, "kl": 0.353515625, "learning_rate": 4.74971558589306e-07, "loss": 0.0004, "reward": 1.8033853769302368, "reward_std": 0.014598664827644825, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8059895932674408, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 114.75521087646484, "epoch": 6.324232081911263, "grad_norm": 0.5395166202562202, "kl": 0.3408203125, "learning_rate": 4.746871444823663e-07, "loss": 0.0003, "reward": 1.8170573115348816, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8170572817325592, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 115.95833587646484, "epoch": 6.327645051194539, "grad_norm": 1.3342947025164509, "kl": 0.341796875, "learning_rate": 4.744027303754266e-07, "loss": 0.0003, "reward": 1.7838541865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 114.35416793823242, "epoch": 6.3310580204778155, "grad_norm": 1.316343274807414, "kl": 0.34375, "learning_rate": 4.7411831626848694e-07, "loss": 0.0003, "reward": 1.8430989384651184, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8430989682674408, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 114.23958587646484, "epoch": 6.334470989761092, "grad_norm": 0.6656127573220456, "kl": 0.3369140625, "learning_rate": 4.7383390216154715e-07, "loss": 0.0003, "reward": 1.751953125, "reward_std": 0.023672240786254406, "rewards/format_reward": 1.0, "rewards/score_reward": 0.751953125, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 117.02604293823242, "epoch": 6.337883959044369, "grad_norm": 0.5302589222846041, "kl": 0.3603515625, "learning_rate": 4.7354948805460747e-07, "loss": 0.0004, "reward": 1.84765625, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84765625, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 113.75521087646484, "epoch": 6.341296928327645, "grad_norm": 0.892146420772625, "kl": 0.357421875, "learning_rate": 4.732650739476678e-07, "loss": 0.0004, "reward": 1.7877604365348816, "reward_std": 0.01361097814515233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7877604067325592, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 116.57552337646484, "epoch": 6.3447098976109215, "grad_norm": 0.6409380087523145, "kl": 0.3466796875, "learning_rate": 4.729806598407281e-07, "loss": 0.0003, "reward": 1.7766926884651184, "reward_std": 0.016306545585393906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7766927182674408, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 114.54427337646484, "epoch": 6.348122866894198, "grad_norm": 1.9194227277697753, "kl": 0.35546875, "learning_rate": 4.726962457337884e-07, "loss": 0.0004, "reward": 1.7552083134651184, "reward_std": 0.02209708606824279, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7552083432674408, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 114.01302337646484, "epoch": 6.351535836177474, "grad_norm": 0.9974833258406355, "kl": 0.349609375, "learning_rate": 4.7241183162684864e-07, "loss": 0.0003, "reward": 1.7154947519302368, "reward_std": 0.022401202004402876, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7154948115348816, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 116.89844131469727, "epoch": 6.354948805460751, "grad_norm": 0.3574832583879532, "kl": 0.34375, "learning_rate": 4.7212741751990896e-07, "loss": 0.0003, "reward": 1.8098958730697632, "reward_std": 0.018501579761505127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958134651184, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 115.62500381469727, "epoch": 6.3583617747440275, "grad_norm": 0.788759751045722, "kl": 0.35546875, "learning_rate": 4.718430034129693e-07, "loss": 0.0004, "reward": 1.8287760615348816, "reward_std": 0.013876185286790133, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8287760317325592, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 114.81771087646484, "epoch": 6.361774744027303, "grad_norm": 0.29925140522338334, "kl": 0.3505859375, "learning_rate": 4.7155858930602955e-07, "loss": 0.0003, "reward": 1.7936198115348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7936197817325592, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 115.23437881469727, "epoch": 6.36518771331058, "grad_norm": 0.36825161397043377, "kl": 0.35546875, "learning_rate": 4.7127417519908987e-07, "loss": 0.0004, "reward": 1.8723958134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8723958432674408, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 115.125, "epoch": 6.368600682593857, "grad_norm": 0.7465482128537518, "kl": 0.349609375, "learning_rate": 4.709897610921502e-07, "loss": 0.0003, "reward": 1.70703125, "reward_std": 0.02424262510612607, "rewards/format_reward": 1.0, "rewards/score_reward": 0.70703125, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 117.72135543823242, "epoch": 6.372013651877133, "grad_norm": 1.0660292551984807, "kl": 0.3544921875, "learning_rate": 4.7070534698521045e-07, "loss": 0.0004, "reward": 1.7552083730697632, "reward_std": 0.029197330586612225, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7578125, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 116.92708587646484, "epoch": 6.375426621160409, "grad_norm": 0.9402219700653799, "kl": 0.345703125, "learning_rate": 4.704209328782707e-07, "loss": 0.0003, "reward": 1.8841146230697632, "reward_std": 0.053962910547852516, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8841145634651184, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 115.77604293823242, "epoch": 6.378839590443686, "grad_norm": 0.3950285325744808, "kl": 0.359375, "learning_rate": 4.7013651877133104e-07, "loss": 0.0004, "reward": 1.7760416865348816, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7760416567325592, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 118.09375381469727, "epoch": 6.382252559726963, "grad_norm": 1.2176398960372279, "kl": 0.3505859375, "learning_rate": 4.6985210466439136e-07, "loss": 0.0004, "reward": 1.8352864384651184, "reward_std": 0.029481257311999798, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8352864682674408, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 117.95833587646484, "epoch": 6.385665529010239, "grad_norm": 0.6874170608836442, "kl": 0.34375, "learning_rate": 4.695676905574516e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.041252280585467815, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7018229067325592, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 121.16927337646484, "epoch": 6.389078498293515, "grad_norm": 0.6047663606642283, "kl": 0.3447265625, "learning_rate": 4.692832764505119e-07, "loss": 0.0003, "reward": 1.7311197519302368, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311198115348816, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 116.79427337646484, "epoch": 6.392491467576792, "grad_norm": 0.46648557999038465, "kl": 0.3544921875, "learning_rate": 4.689988623435722e-07, "loss": 0.0004, "reward": 1.7584635615348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635317325592, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 117.44791793823242, "epoch": 6.395904436860068, "grad_norm": 1.0977685233254024, "kl": 0.3388671875, "learning_rate": 4.6871444823663253e-07, "loss": 0.0003, "reward": 1.7942708730697632, "reward_std": 0.01242793072015047, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708134651184, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 120.21354293823242, "epoch": 6.399317406143345, "grad_norm": 0.958213585790376, "kl": 0.33984375, "learning_rate": 4.684300341296928e-07, "loss": 0.0003, "reward": 1.6751301884651184, "reward_std": 0.02012293692678213, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6751302182674408, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 118.77604293823242, "epoch": 6.402730375426621, "grad_norm": 0.6569360205561777, "kl": 0.3486328125, "learning_rate": 4.681456200227531e-07, "loss": 0.0003, "reward": 1.6790364384651184, "reward_std": 0.027708697598427534, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6790364682674408, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 118.20573425292969, "epoch": 6.406143344709897, "grad_norm": 1.2939781130077979, "kl": 0.3505859375, "learning_rate": 4.678612059158134e-07, "loss": 0.0004, "reward": 1.6796875, "reward_std": 0.0337538574822247, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6796875, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 118.05208587646484, "epoch": 6.409556313993174, "grad_norm": 0.951116051280932, "kl": 0.3525390625, "learning_rate": 4.675767918088737e-07, "loss": 0.0004, "reward": 1.7610676884651184, "reward_std": 0.026234676130115986, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610677182674408, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 116.6796875, "epoch": 6.412969283276451, "grad_norm": 2.1469656150485794, "kl": 0.330078125, "learning_rate": 4.6729237770193397e-07, "loss": 0.0003, "reward": 1.7545572519302368, "reward_std": 0.023520145565271378, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545573115348816, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 118.33594131469727, "epoch": 6.4163822525597265, "grad_norm": 0.8847268367061897, "kl": 0.3603515625, "learning_rate": 4.670079635949943e-07, "loss": 0.0004, "reward": 1.8059895634651184, "reward_std": 0.02879751892760396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8059895932674408, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 119.21614837646484, "epoch": 6.419795221843003, "grad_norm": 0.6655568571420948, "kl": 0.3447265625, "learning_rate": 4.667235494880546e-07, "loss": 0.0003, "reward": 1.7994791865348816, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7994791567325592, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 117.88802337646484, "epoch": 6.42320819112628, "grad_norm": 0.008974966397352447, "kl": 0.3408203125, "learning_rate": 4.664391353811149e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 113.85677337646484, "epoch": 6.426621160409557, "grad_norm": 0.5435177901580394, "kl": 0.3427734375, "learning_rate": 4.6615472127417514e-07, "loss": 0.0003, "reward": 1.80078125, "reward_std": 0.021964360028505325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80078125, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 119.10417175292969, "epoch": 6.4300341296928325, "grad_norm": 2.4414790173151246, "kl": 0.3388671875, "learning_rate": 4.6587030716723546e-07, "loss": 0.0003, "reward": 1.873046875, "reward_std": 0.028512938879430294, "rewards/format_reward": 1.0, "rewards/score_reward": 0.873046875, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 118.37500381469727, "epoch": 6.433447098976109, "grad_norm": 0.7346088132255306, "kl": 0.345703125, "learning_rate": 4.655858930602958e-07, "loss": 0.0003, "reward": 1.775390625, "reward_std": 0.030051065608859062, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7779948115348816, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 119.74739837646484, "epoch": 6.436860068259386, "grad_norm": 0.541530609668558, "kl": 0.3544921875, "learning_rate": 4.653014789533561e-07, "loss": 0.0004, "reward": 1.8046875, "reward_std": 0.03007019404321909, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 120.13541793823242, "epoch": 6.440273037542662, "grad_norm": 1.9918584070261012, "kl": 0.3447265625, "learning_rate": 4.650170648464163e-07, "loss": 0.0003, "reward": 1.755859375, "reward_std": 0.04409847408533096, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 122.42969131469727, "epoch": 6.4436860068259385, "grad_norm": 0.41070368919303263, "kl": 0.345703125, "learning_rate": 4.6473265073947663e-07, "loss": 0.0003, "reward": 1.7213541865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7213541567325592, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 121.38802337646484, "epoch": 6.447098976109215, "grad_norm": 0.7000017570383247, "kl": 0.33984375, "learning_rate": 4.6444823663253695e-07, "loss": 0.0003, "reward": 1.783203125, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 121.03646087646484, "epoch": 6.450511945392491, "grad_norm": 1.2874704014557805, "kl": 0.341796875, "learning_rate": 4.641638225255973e-07, "loss": 0.0003, "reward": 1.7298177480697632, "reward_std": 0.0467303479090333, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7298176884651184, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 118.62500381469727, "epoch": 6.453924914675768, "grad_norm": 0.7144690942243858, "kl": 0.3564453125, "learning_rate": 4.6387940841865754e-07, "loss": 0.0004, "reward": 1.8118489980697632, "reward_std": 0.03825153596699238, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8118489384651184, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 120.50000381469727, "epoch": 6.4573378839590445, "grad_norm": 1.2329512679549148, "kl": 0.482421875, "learning_rate": 4.6359499431171786e-07, "loss": 0.0005, "reward": 1.7916666865348816, "reward_std": 0.04316109977662563, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 120.2421875, "epoch": 6.460750853242321, "grad_norm": 1.1667777034694564, "kl": 0.3515625, "learning_rate": 4.6331058020477813e-07, "loss": 0.0004, "reward": 1.8020833134651184, "reward_std": 0.03773795813322067, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833432674408, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 118.37239837646484, "epoch": 6.464163822525597, "grad_norm": 3.111581858658486, "kl": 0.3369140625, "learning_rate": 4.6302616609783845e-07, "loss": 0.0003, "reward": 1.80078125, "reward_std": 0.014598664827644825, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8033854067325592, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 119.25000381469727, "epoch": 6.467576791808874, "grad_norm": 1.0088514959726136, "kl": 0.3369140625, "learning_rate": 4.627417519908987e-07, "loss": 0.0003, "reward": 1.8346354365348816, "reward_std": 0.024242624640464783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8346354067325592, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 120.15364837646484, "epoch": 6.4709897610921505, "grad_norm": 0.6900269067664792, "kl": 0.3544921875, "learning_rate": 4.6245733788395903e-07, "loss": 0.0004, "reward": 1.744140625, "reward_std": 0.027621356770396233, "rewards/format_reward": 0.9947916865348816, "rewards/score_reward": 0.7493489384651184, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 119.97916793823242, "epoch": 6.474402730375426, "grad_norm": 1.0431926649141454, "kl": 0.345703125, "learning_rate": 4.6217292377701935e-07, "loss": 0.0003, "reward": 1.830078125, "reward_std": 0.019135249312967062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.830078125, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 122.15104675292969, "epoch": 6.477815699658703, "grad_norm": 0.9587490625510734, "kl": 0.3642578125, "learning_rate": 4.618885096700796e-07, "loss": 0.0004, "reward": 1.7200520634651184, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520932674408, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 120.12239837646484, "epoch": 6.48122866894198, "grad_norm": 3.9075475832990567, "kl": 0.337890625, "learning_rate": 4.616040955631399e-07, "loss": 0.0003, "reward": 1.75390625, "reward_std": 0.040521932765841484, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75390625, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 118.97916793823242, "epoch": 6.484641638225256, "grad_norm": 0.44505373707645246, "kl": 0.36328125, "learning_rate": 4.613196814562002e-07, "loss": 0.0004, "reward": 1.8151041865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8151041567325592, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 123.0234375, "epoch": 6.488054607508532, "grad_norm": 0.7689123184825857, "kl": 0.337890625, "learning_rate": 4.610352673492605e-07, "loss": 0.0003, "reward": 1.7486979365348816, "reward_std": 0.021964361891150475, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7513020932674408, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 119.89062881469727, "epoch": 6.491467576791809, "grad_norm": 0.6904253740996138, "kl": 0.33984375, "learning_rate": 4.6075085324232084e-07, "loss": 0.0003, "reward": 1.7884114980697632, "reward_std": 0.026082579977810383, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7884114384651184, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 119.17708587646484, "epoch": 6.494880546075085, "grad_norm": 0.6146002238146885, "kl": 0.3408203125, "learning_rate": 4.6046643913538106e-07, "loss": 0.0003, "reward": 1.8658854365348816, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8658854067325592, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 118.62239837646484, "epoch": 6.498293515358362, "grad_norm": 0.13962170240320224, "kl": 0.337890625, "learning_rate": 4.601820250284414e-07, "loss": 0.0003, "reward": 1.7760416269302368, "reward_std": 0.014731390401721, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7786458432674408, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 117.50000381469727, "epoch": 6.501706484641638, "grad_norm": 0.6937615012840832, "kl": 0.3564453125, "learning_rate": 4.598976109215017e-07, "loss": 0.0004, "reward": 1.7858073115348816, "reward_std": 0.033669810742139816, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7858072817325592, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 119.34114837646484, "epoch": 6.505119453924914, "grad_norm": 1.6237422059860913, "kl": 0.361328125, "learning_rate": 4.5961319681456196e-07, "loss": 0.0004, "reward": 1.740234375, "reward_std": 0.022533926647156477, "rewards/format_reward": 1.0, "rewards/score_reward": 0.740234375, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 117.37239837646484, "epoch": 6.508532423208191, "grad_norm": 1.9806651348377533, "kl": 0.3603515625, "learning_rate": 4.593287827076223e-07, "loss": 0.0004, "reward": 1.6809896230697632, "reward_std": 0.02450807485729456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6809895634651184, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 120.83594131469727, "epoch": 6.511945392491468, "grad_norm": 1.020566339588951, "kl": 0.3544921875, "learning_rate": 4.5904436860068255e-07, "loss": 0.0004, "reward": 1.8209635615348816, "reward_std": 0.04221855476498604, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8235677182674408, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 118.44531631469727, "epoch": 6.515358361774744, "grad_norm": 1.7346159625862074, "kl": 0.3466796875, "learning_rate": 4.5875995449374287e-07, "loss": 0.0003, "reward": 1.7421875, "reward_std": 0.019400456454604864, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7421875, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 118.48958587646484, "epoch": 6.51877133105802, "grad_norm": 0.6445141116077198, "kl": 0.3544921875, "learning_rate": 4.5847554038680314e-07, "loss": 0.0004, "reward": 1.734375, "reward_std": 0.028931879438459873, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 115.92708587646484, "epoch": 6.522184300341297, "grad_norm": 0.3258500524249412, "kl": 0.359375, "learning_rate": 4.5819112627986345e-07, "loss": 0.0004, "reward": 1.7760416865348816, "reward_std": 0.014731390401721, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7786458134651184, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 118.8671875, "epoch": 6.525597269624574, "grad_norm": 0.6031329408911712, "kl": 0.349609375, "learning_rate": 4.579067121729238e-07, "loss": 0.0003, "reward": 1.7194010615348816, "reward_std": 0.022685371339321136, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7194010317325592, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 117.64844131469727, "epoch": 6.5290102389078495, "grad_norm": 1.076786000881935, "kl": 0.353515625, "learning_rate": 4.576222980659841e-07, "loss": 0.0004, "reward": 1.7135416865348816, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 120.31510925292969, "epoch": 6.532423208191126, "grad_norm": 0.328444329314108, "kl": 0.34765625, "learning_rate": 4.573378839590443e-07, "loss": 0.0003, "reward": 1.6907552480697632, "reward_std": 0.04195375367999077, "rewards/format_reward": 0.9947916567325592, "rewards/score_reward": 0.6959635317325592, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 119.21094131469727, "epoch": 6.535836177474403, "grad_norm": 0.5908813808252205, "kl": 0.359375, "learning_rate": 4.5705346985210463e-07, "loss": 0.0004, "reward": 1.681640625, "reward_std": 0.031037935987114906, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6842448115348816, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 118.86979293823242, "epoch": 6.53924914675768, "grad_norm": 2.5734615876950393, "kl": 0.3447265625, "learning_rate": 4.5676905574516495e-07, "loss": 0.0003, "reward": 1.8079427480697632, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8079426884651184, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 120.1328125, "epoch": 6.5426621160409555, "grad_norm": 1.1188441016851498, "kl": 0.34765625, "learning_rate": 4.5648464163822527e-07, "loss": 0.0003, "reward": 1.7018228769302368, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7018229365348816, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 118.28906631469727, "epoch": 6.546075085324232, "grad_norm": 0.21566767167146864, "kl": 0.3544921875, "learning_rate": 4.5620022753128553e-07, "loss": 0.0004, "reward": 1.8600260615348816, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8600260317325592, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 119.11198425292969, "epoch": 6.549488054607508, "grad_norm": 1.3183450662108076, "kl": 0.3359375, "learning_rate": 4.559158134243458e-07, "loss": 0.0003, "reward": 1.8541666865348816, "reward_std": 0.05330559425055981, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8541666567325592, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 120.96875381469727, "epoch": 6.552901023890785, "grad_norm": 1.0138011536538796, "kl": 0.345703125, "learning_rate": 4.556313993174061e-07, "loss": 0.0003, "reward": 1.7096354365348816, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7096354067325592, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 119.61979293823242, "epoch": 6.5563139931740615, "grad_norm": 0.5207418317451217, "kl": 0.34765625, "learning_rate": 4.5534698521046644e-07, "loss": 0.0003, "reward": 1.7337239980697632, "reward_std": 0.016306545585393906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239384651184, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 118.921875, "epoch": 6.559726962457338, "grad_norm": 2.5583607223249585, "kl": 0.34765625, "learning_rate": 4.550625711035267e-07, "loss": 0.0003, "reward": 1.83203125, "reward_std": 0.027659203857183456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.83203125, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 117.46354293823242, "epoch": 6.563139931740614, "grad_norm": 2.0220786634240815, "kl": 0.34765625, "learning_rate": 4.54778156996587e-07, "loss": 0.0003, "reward": 1.7018228769302368, "reward_std": 0.03261309117078781, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7018229365348816, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 117.7578125, "epoch": 6.566552901023891, "grad_norm": 0.5306275168652541, "kl": 0.3544921875, "learning_rate": 4.544937428896473e-07, "loss": 0.0004, "reward": 1.8001301884651184, "reward_std": 0.015168231446295977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8001302182674408, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 119.89844131469727, "epoch": 6.5699658703071675, "grad_norm": 1.4010727391575852, "kl": 0.3515625, "learning_rate": 4.542093287827076e-07, "loss": 0.0004, "reward": 1.8177083730697632, "reward_std": 0.040110861882567406, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083134651184, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 119.95312881469727, "epoch": 6.573378839590443, "grad_norm": 0.6790574574499743, "kl": 0.3515625, "learning_rate": 4.539249146757679e-07, "loss": 0.0004, "reward": 1.8313801884651184, "reward_std": 0.032377765979617834, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8313802182674408, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 119.30989837646484, "epoch": 6.57679180887372, "grad_norm": 0.8962684128607207, "kl": 0.3564453125, "learning_rate": 4.536405005688282e-07, "loss": 0.0004, "reward": 1.755859375, "reward_std": 0.026234676130115986, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 118.94010925292969, "epoch": 6.580204778156997, "grad_norm": 0.9732478131056106, "kl": 0.345703125, "learning_rate": 4.533560864618885e-07, "loss": 0.0003, "reward": 1.7220051884651184, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7220052182674408, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 118.59635543823242, "epoch": 6.5836177474402735, "grad_norm": 2.4347706572533103, "kl": 0.34765625, "learning_rate": 4.530716723549488e-07, "loss": 0.0003, "reward": 1.763671875, "reward_std": 0.02465845923870802, "rewards/format_reward": 1.0, "rewards/score_reward": 0.763671875, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 118.48437881469727, "epoch": 6.587030716723549, "grad_norm": 1.664154874817501, "kl": 0.349609375, "learning_rate": 4.5278725824800905e-07, "loss": 0.0004, "reward": 1.7845051884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 118.859375, "epoch": 6.590443686006826, "grad_norm": 1.5902528266968396, "kl": 0.34765625, "learning_rate": 4.5250284414106937e-07, "loss": 0.0003, "reward": 1.7721353769302368, "reward_std": 0.03439082205295563, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354365348816, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 120.0234375, "epoch": 6.593856655290102, "grad_norm": 0.21369656303512843, "kl": 0.349609375, "learning_rate": 4.522184300341297e-07, "loss": 0.0003, "reward": 1.8001302480697632, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8001301884651184, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 119.95052337646484, "epoch": 6.597269624573379, "grad_norm": 0.6778111453382084, "kl": 0.3544921875, "learning_rate": 4.5193401592719e-07, "loss": 0.0004, "reward": 1.783203125, "reward_std": 0.020255662500858307, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 119.01302337646484, "epoch": 6.600682593856655, "grad_norm": 0.6833853254031214, "kl": 0.3427734375, "learning_rate": 4.516496018202502e-07, "loss": 0.0003, "reward": 1.7623697519302368, "reward_std": 0.03189142979681492, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623698115348816, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 120.28125, "epoch": 6.604095563139932, "grad_norm": 0.80157135659124, "kl": 0.359375, "learning_rate": 4.5136518771331054e-07, "loss": 0.0004, "reward": 1.7506510615348816, "reward_std": 0.022685371339321136, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510317325592, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 119.15625381469727, "epoch": 6.607508532423208, "grad_norm": 2.8156240416636247, "kl": 0.3603515625, "learning_rate": 4.5108077360637086e-07, "loss": 0.0004, "reward": 1.783203125, "reward_std": 0.05188188515603542, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 122.10416793823242, "epoch": 6.610921501706485, "grad_norm": 1.718852499161668, "kl": 0.3486328125, "learning_rate": 4.507963594994312e-07, "loss": 0.0003, "reward": 1.8138020634651184, "reward_std": 0.024242624640464783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8138020932674408, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 121.07552337646484, "epoch": 6.614334470989761, "grad_norm": 0.6013079610344585, "kl": 0.353515625, "learning_rate": 4.5051194539249145e-07, "loss": 0.0004, "reward": 1.7838541865348816, "reward_std": 0.04778543394058943, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541567325592, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 120.26042175292969, "epoch": 6.617747440273037, "grad_norm": 1.3852863391606616, "kl": 0.3486328125, "learning_rate": 4.5022753128555177e-07, "loss": 0.0003, "reward": 1.8216145634651184, "reward_std": 0.023937448859214783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8216145932674408, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 119.3125, "epoch": 6.621160409556314, "grad_norm": 0.8715328225786315, "kl": 0.3515625, "learning_rate": 4.4994311717861203e-07, "loss": 0.0004, "reward": 1.8040364384651184, "reward_std": 0.028807627968490124, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8040364682674408, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 119.1328125, "epoch": 6.624573378839591, "grad_norm": 0.15885285280242617, "kl": 0.3544921875, "learning_rate": 4.496587030716723e-07, "loss": 0.0004, "reward": 1.8463541865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8489583134651184, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 121.08333587646484, "epoch": 6.627986348122867, "grad_norm": 3.053375798994562, "kl": 0.3681640625, "learning_rate": 4.493742889647326e-07, "loss": 0.0004, "reward": 1.6588541269302368, "reward_std": 0.05920195300132036, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6588541865348816, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 117.71614837646484, "epoch": 6.631399317406143, "grad_norm": 0.7337693034947961, "kl": 0.3583984375, "learning_rate": 4.4908987485779294e-07, "loss": 0.0004, "reward": 1.8626301884651184, "reward_std": 0.039827752858400345, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.865234375, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 121.76041793823242, "epoch": 6.63481228668942, "grad_norm": 0.5971189620382235, "kl": 0.3623046875, "learning_rate": 4.4880546075085326e-07, "loss": 0.0004, "reward": 1.8561198115348816, "reward_std": 0.0191352479159832, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8561197817325592, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 117.46614837646484, "epoch": 6.638225255972696, "grad_norm": 0.6863089229359416, "kl": 0.35546875, "learning_rate": 4.4852104664391347e-07, "loss": 0.0004, "reward": 1.7434895634651184, "reward_std": 0.033886585384607315, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7434895932674408, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 119.09114837646484, "epoch": 6.6416382252559725, "grad_norm": 0.7476738656869681, "kl": 0.3466796875, "learning_rate": 4.482366325369738e-07, "loss": 0.0003, "reward": 1.7337239384651184, "reward_std": 0.03825153689831495, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239682674408, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 117.79687881469727, "epoch": 6.645051194539249, "grad_norm": 7.266416870093747, "kl": 0.35546875, "learning_rate": 4.479522184300341e-07, "loss": 0.0004, "reward": 1.810546875, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.810546875, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 117.59114837646484, "epoch": 6.648464163822526, "grad_norm": 0.6156723862460382, "kl": 0.3720703125, "learning_rate": 4.4766780432309443e-07, "loss": 0.0004, "reward": 1.84765625, "reward_std": 0.01841423800215125, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8502604067325592, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 118.47656631469727, "epoch": 6.651877133105802, "grad_norm": 1.7797677861709786, "kl": 0.349609375, "learning_rate": 4.473833902161547e-07, "loss": 0.0003, "reward": 1.7662760615348816, "reward_std": 0.03728403802961111, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7688801884651184, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 115.9609375, "epoch": 6.6552901023890785, "grad_norm": 14.002457663726336, "kl": 0.3486328125, "learning_rate": 4.4709897610921496e-07, "loss": 0.0003, "reward": 1.7805989980697632, "reward_std": 0.02991834143176675, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7805989384651184, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 119.34896087646484, "epoch": 6.658703071672355, "grad_norm": 0.5084376422873341, "kl": 0.3759765625, "learning_rate": 4.468145620022753e-07, "loss": 0.0004, "reward": 1.7923176884651184, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923177182674408, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 118.80989837646484, "epoch": 6.662116040955631, "grad_norm": 0.6145127289822598, "kl": 0.3447265625, "learning_rate": 4.465301478953356e-07, "loss": 0.0003, "reward": 1.8606770634651184, "reward_std": 0.02084394684061408, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8606770932674408, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 118.65885543823242, "epoch": 6.665529010238908, "grad_norm": 0.3645073281810941, "kl": 0.341796875, "learning_rate": 4.4624573378839587e-07, "loss": 0.0003, "reward": 1.7923176884651184, "reward_std": 0.022818096913397312, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.794921875, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 117.6953125, "epoch": 6.6689419795221845, "grad_norm": 0.008791758491466104, "kl": 0.3466796875, "learning_rate": 4.459613196814562e-07, "loss": 0.0003, "reward": 1.8489583134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8489583432674408, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 120.44271087646484, "epoch": 6.672354948805461, "grad_norm": 0.602931559831581, "kl": 0.3359375, "learning_rate": 4.456769055745165e-07, "loss": 0.0003, "reward": 1.744140625, "reward_std": 0.023520144633948803, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 116.61458587646484, "epoch": 6.675767918088737, "grad_norm": 1.091552903822049, "kl": 0.3486328125, "learning_rate": 4.453924914675768e-07, "loss": 0.0003, "reward": 1.7467448115348816, "reward_std": 0.04990732669830322, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467447817325592, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 117.02083587646484, "epoch": 6.679180887372014, "grad_norm": 1.210609574792501, "kl": 0.3515625, "learning_rate": 4.4510807736063704e-07, "loss": 0.0004, "reward": 1.72265625, "reward_std": 0.032613092102110386, "rewards/format_reward": 1.0, "rewards/score_reward": 0.72265625, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 118.84375381469727, "epoch": 6.6825938566552905, "grad_norm": 2.661439060659253, "kl": 0.3369140625, "learning_rate": 4.4482366325369736e-07, "loss": 0.0003, "reward": 1.7389323115348816, "reward_std": 0.04096606746315956, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389322817325592, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 114.90885925292969, "epoch": 6.686006825938566, "grad_norm": 1.1899815595564645, "kl": 0.3603515625, "learning_rate": 4.445392491467577e-07, "loss": 0.0004, "reward": 1.6692708134651184, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6692708432674408, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 121.56771087646484, "epoch": 6.689419795221843, "grad_norm": 0.006762913374384969, "kl": 0.33984375, "learning_rate": 4.44254835039818e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 117.55989837646484, "epoch": 6.69283276450512, "grad_norm": 1.088839937427218, "kl": 0.345703125, "learning_rate": 4.439704209328782e-07, "loss": 0.0003, "reward": 1.6666666865348816, "reward_std": 0.022271769121289253, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6666666567325592, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 118.24739837646484, "epoch": 6.696245733788396, "grad_norm": 0.25748565964450165, "kl": 0.3515625, "learning_rate": 4.4368600682593853e-07, "loss": 0.0004, "reward": 1.7415364384651184, "reward_std": 0.009207119233906269, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.744140625, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 116.72916793823242, "epoch": 6.699658703071672, "grad_norm": 1.4272655894573651, "kl": 0.3525390625, "learning_rate": 4.4340159271899885e-07, "loss": 0.0004, "reward": 1.8098958730697632, "reward_std": 0.008351913653314114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958134651184, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 119.03125381469727, "epoch": 6.703071672354949, "grad_norm": 0.2703660660078375, "kl": 0.3525390625, "learning_rate": 4.4311717861205917e-07, "loss": 0.0004, "reward": 1.7135416865348816, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 114.47135925292969, "epoch": 6.706484641638225, "grad_norm": 0.44666809553355985, "kl": 0.3447265625, "learning_rate": 4.4283276450511944e-07, "loss": 0.0003, "reward": 1.755859375, "reward_std": 0.02253392618149519, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 117.35677337646484, "epoch": 6.709897610921502, "grad_norm": 2.3497307956095406, "kl": 0.3544921875, "learning_rate": 4.425483503981797e-07, "loss": 0.0004, "reward": 1.7369791865348816, "reward_std": 0.031979831866919994, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7369791567325592, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 117.18750381469727, "epoch": 6.713310580204778, "grad_norm": 1.100978711763743, "kl": 0.33984375, "learning_rate": 4.4226393629124e-07, "loss": 0.0003, "reward": 1.87109375, "reward_std": 0.018368853256106377, "rewards/format_reward": 1.0, "rewards/score_reward": 0.87109375, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 116.11458587646484, "epoch": 6.716723549488055, "grad_norm": 0.1469645907240824, "kl": 0.3466796875, "learning_rate": 4.4197952218430034e-07, "loss": 0.0003, "reward": 1.8072916865348816, "reward_std": 0.014731390401721, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8098958432674408, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 115.70573043823242, "epoch": 6.720136518771331, "grad_norm": 0.2860146091083916, "kl": 0.3505859375, "learning_rate": 4.416951080773606e-07, "loss": 0.0004, "reward": 1.7350260615348816, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7350260317325592, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 115.35156631469727, "epoch": 6.723549488054608, "grad_norm": 0.5839226985585784, "kl": 0.3583984375, "learning_rate": 4.4141069397042093e-07, "loss": 0.0004, "reward": 1.7584635615348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635317325592, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 117.2109375, "epoch": 6.726962457337884, "grad_norm": 1.8870352527313992, "kl": 0.3544921875, "learning_rate": 4.411262798634812e-07, "loss": 0.0004, "reward": 1.701171875, "reward_std": 0.033600371330976486, "rewards/format_reward": 1.0, "rewards/score_reward": 0.701171875, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 115.45573043823242, "epoch": 6.73037542662116, "grad_norm": 1.0429343918954674, "kl": 0.3642578125, "learning_rate": 4.408418657565415e-07, "loss": 0.0004, "reward": 1.7942708134651184, "reward_std": 0.024069522507488728, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708432674408, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 115.34896087646484, "epoch": 6.733788395904437, "grad_norm": 0.012906131952410962, "kl": 0.3466796875, "learning_rate": 4.405574516496018e-07, "loss": 0.0003, "reward": 1.7864583134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 116.921875, "epoch": 6.737201365187714, "grad_norm": 0.4446806073741564, "kl": 0.3369140625, "learning_rate": 4.402730375426621e-07, "loss": 0.0003, "reward": 1.7135416269302368, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416865348816, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 115.34896087646484, "epoch": 6.7406143344709895, "grad_norm": 0.9153671095611711, "kl": 0.3525390625, "learning_rate": 4.399886234357224e-07, "loss": 0.0004, "reward": 1.7571614384651184, "reward_std": 0.02630411647260189, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7571614682674408, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 115.14844131469727, "epoch": 6.744027303754266, "grad_norm": 0.008377715907099377, "kl": 0.34765625, "learning_rate": 4.3970420932878264e-07, "loss": 0.0003, "reward": 1.8177083134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083432674408, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 117.57812881469727, "epoch": 6.747440273037543, "grad_norm": 0.6704582206939566, "kl": 0.3564453125, "learning_rate": 4.3941979522184296e-07, "loss": 0.0004, "reward": 1.728515625, "reward_std": 0.02595050446689129, "rewards/format_reward": 1.0, "rewards/score_reward": 0.728515625, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 118.76823425292969, "epoch": 6.750853242320819, "grad_norm": 1.4428480512101594, "kl": 0.345703125, "learning_rate": 4.391353811149033e-07, "loss": 0.0003, "reward": 1.7786458730697632, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7786458134651184, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 114.22916793823242, "epoch": 6.7542662116040955, "grad_norm": 0.8460912407561718, "kl": 0.3564453125, "learning_rate": 4.388509670079636e-07, "loss": 0.0004, "reward": 1.775390625, "reward_std": 0.04868537187576294, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7779948115348816, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 118.25781631469727, "epoch": 6.757679180887372, "grad_norm": 0.6626056496190187, "kl": 0.3515625, "learning_rate": 4.3856655290102386e-07, "loss": 0.0004, "reward": 1.8053385615348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8053385317325592, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 119.50260925292969, "epoch": 6.761092150170649, "grad_norm": 0.8064206686191274, "kl": 0.345703125, "learning_rate": 4.382821387940842e-07, "loss": 0.0003, "reward": 1.7122395634651184, "reward_std": 0.04177523637190461, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395932674408, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 117.01302337646484, "epoch": 6.764505119453925, "grad_norm": 0.5191155633080312, "kl": 0.3359375, "learning_rate": 4.3799772468714445e-07, "loss": 0.0003, "reward": 1.7923176884651184, "reward_std": 0.022818096913397312, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923177182674408, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 117.02344131469727, "epoch": 6.7679180887372015, "grad_norm": 1.5638525869081075, "kl": 0.3466796875, "learning_rate": 4.3771331058020477e-07, "loss": 0.0003, "reward": 1.6536458134651184, "reward_std": 0.024746862705796957, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6536458432674408, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 115.34114837646484, "epoch": 6.771331058020478, "grad_norm": 0.5729861239337951, "kl": 0.34765625, "learning_rate": 4.3742889647326503e-07, "loss": 0.0003, "reward": 1.83203125, "reward_std": 0.018015244975686073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.83203125, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 119.60677337646484, "epoch": 6.774744027303754, "grad_norm": 1.135128962519864, "kl": 0.345703125, "learning_rate": 4.3714448236632535e-07, "loss": 0.0003, "reward": 1.7981770634651184, "reward_std": 0.028012814465910196, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7981770932674408, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 114.67448043823242, "epoch": 6.778156996587031, "grad_norm": 1.548938801418079, "kl": 0.3349609375, "learning_rate": 4.3686006825938567e-07, "loss": 0.0003, "reward": 1.7291666269302368, "reward_std": 0.030423804186284542, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666865348816, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 118.72916793823242, "epoch": 6.7815699658703075, "grad_norm": 0.602010377794339, "kl": 0.3359375, "learning_rate": 4.3657565415244594e-07, "loss": 0.0003, "reward": 1.787109375, "reward_std": 0.038757242262363434, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7897135317325592, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 116.67708587646484, "epoch": 6.784982935153583, "grad_norm": 1.2980130233577027, "kl": 0.345703125, "learning_rate": 4.362912400455062e-07, "loss": 0.0003, "reward": 1.802734375, "reward_std": 0.02735590608790517, "rewards/format_reward": 1.0, "rewards/score_reward": 0.802734375, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 117.8671875, "epoch": 6.78839590443686, "grad_norm": 0.6326711843895614, "kl": 0.341796875, "learning_rate": 4.360068259385665e-07, "loss": 0.0003, "reward": 1.7623698115348816, "reward_std": 0.02630411647260189, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623697817325592, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 117.03646087646484, "epoch": 6.791808873720137, "grad_norm": 0.22790781920255057, "kl": 0.341796875, "learning_rate": 4.3572241183162684e-07, "loss": 0.0003, "reward": 1.89453125, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.89453125, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 118.48177337646484, "epoch": 6.795221843003413, "grad_norm": 0.6873835924156739, "kl": 0.353515625, "learning_rate": 4.3543799772468716e-07, "loss": 0.0004, "reward": 1.7591146230697632, "reward_std": 0.03189248964190483, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145634651184, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 113.87760543823242, "epoch": 6.798634812286689, "grad_norm": 0.3497890045152915, "kl": 0.35546875, "learning_rate": 4.351535836177474e-07, "loss": 0.0004, "reward": 1.7447916865348816, "reward_std": 0.019287919625639915, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916567325592, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 119.32552337646484, "epoch": 6.802047781569966, "grad_norm": 3.666888798387516, "kl": 0.3603515625, "learning_rate": 4.348691695108077e-07, "loss": 0.0004, "reward": 1.8444010019302368, "reward_std": 0.035878635942935944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8444010615348816, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 119.34375381469727, "epoch": 6.805460750853243, "grad_norm": 0.8088655910531088, "kl": 0.353515625, "learning_rate": 4.34584755403868e-07, "loss": 0.0004, "reward": 1.7311198115348816, "reward_std": 0.033316200599074364, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 117.39062881469727, "epoch": 6.808873720136519, "grad_norm": 2.47061332395312, "kl": 0.3623046875, "learning_rate": 4.3430034129692834e-07, "loss": 0.0004, "reward": 1.7239583134651184, "reward_std": 0.030182731337845325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 118.43489837646484, "epoch": 6.812286689419795, "grad_norm": 0.5148352480023112, "kl": 0.345703125, "learning_rate": 4.340159271899886e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 119.2265625, "epoch": 6.815699658703072, "grad_norm": 1.2156804523071805, "kl": 0.34375, "learning_rate": 4.3373151308304887e-07, "loss": 0.0003, "reward": 1.7916666865348816, "reward_std": 0.018147969618439674, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 121.88802337646484, "epoch": 6.819112627986348, "grad_norm": 1.8882454274286418, "kl": 0.3603515625, "learning_rate": 4.334470989761092e-07, "loss": 0.0004, "reward": 1.802734375, "reward_std": 0.0479148649610579, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8053385317325592, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 124.32291793823242, "epoch": 6.822525597269625, "grad_norm": 1.684989652011261, "kl": 0.3466796875, "learning_rate": 4.331626848691695e-07, "loss": 0.0003, "reward": 1.7845052480697632, "reward_std": 0.029634173028171062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845051884651184, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 119.171875, "epoch": 6.825938566552901, "grad_norm": 0.2663020865369003, "kl": 0.3447265625, "learning_rate": 4.328782707622298e-07, "loss": 0.0003, "reward": 1.7864583134651184, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 121.84114837646484, "epoch": 6.829351535836177, "grad_norm": 0.27152556507639786, "kl": 0.341796875, "learning_rate": 4.325938566552901e-07, "loss": 0.0003, "reward": 1.6907551884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6907552182674408, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 120.68489837646484, "epoch": 6.832764505119454, "grad_norm": 0.015226006877713758, "kl": 0.353515625, "learning_rate": 4.323094425483504e-07, "loss": 0.0004, "reward": 1.6354166865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6354166567325592, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 122.19271087646484, "epoch": 6.836177474402731, "grad_norm": 1.7783510239970386, "kl": 0.3349609375, "learning_rate": 4.320250284414107e-07, "loss": 0.0003, "reward": 1.7897135615348816, "reward_std": 0.047782139386981726, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 123.31510543823242, "epoch": 6.839590443686006, "grad_norm": 0.3656736749832086, "kl": 0.3427734375, "learning_rate": 4.3174061433447095e-07, "loss": 0.0003, "reward": 1.7688801884651184, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7688802182674408, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 119.27083587646484, "epoch": 6.843003412969283, "grad_norm": 0.42693503501306007, "kl": 0.3505859375, "learning_rate": 4.3145620022753127e-07, "loss": 0.0004, "reward": 1.7942708730697632, "reward_std": 0.020426234230399132, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708134651184, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 118.84635543823242, "epoch": 6.84641638225256, "grad_norm": 0.5190774287411429, "kl": 0.33984375, "learning_rate": 4.311717861205916e-07, "loss": 0.0003, "reward": 1.759765625, "reward_std": 0.013876185286790133, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 119.25000381469727, "epoch": 6.849829351535837, "grad_norm": 1.4244778052994498, "kl": 0.3515625, "learning_rate": 4.308873720136519e-07, "loss": 0.0004, "reward": 1.6979166865348816, "reward_std": 0.030772077850997448, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7005208432674408, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 119.47656631469727, "epoch": 6.853242320819112, "grad_norm": 2.40727222223965, "kl": 0.3466796875, "learning_rate": 4.306029579067121e-07, "loss": 0.0003, "reward": 1.7604166865348816, "reward_std": 0.03182838764041662, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7604166567325592, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 121.68489837646484, "epoch": 6.856655290102389, "grad_norm": 1.6094800294023448, "kl": 0.3427734375, "learning_rate": 4.3031854379977244e-07, "loss": 0.0003, "reward": 1.7721354365348816, "reward_std": 0.04883804265409708, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354067325592, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 120.58333587646484, "epoch": 6.860068259385666, "grad_norm": 0.8322226884308841, "kl": 0.3544921875, "learning_rate": 4.3003412969283276e-07, "loss": 0.0004, "reward": 1.8118489384651184, "reward_std": 0.022533927112817764, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.814453125, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 120.62760925292969, "epoch": 6.863481228668942, "grad_norm": 0.633354510740904, "kl": 0.3408203125, "learning_rate": 4.29749715585893e-07, "loss": 0.0003, "reward": 1.779296875, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 120.12239837646484, "epoch": 6.8668941979522184, "grad_norm": 0.3019708808556411, "kl": 0.3388671875, "learning_rate": 4.2946530147895335e-07, "loss": 0.0003, "reward": 1.6927083134651184, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6927083432674408, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 117.94010925292969, "epoch": 6.870307167235495, "grad_norm": 1.3780378801728714, "kl": 0.345703125, "learning_rate": 4.291808873720136e-07, "loss": 0.0003, "reward": 1.7884114384651184, "reward_std": 0.050610026344656944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7884114682674408, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 120.11198043823242, "epoch": 6.873720136518771, "grad_norm": 0.5007898015506423, "kl": 0.3310546875, "learning_rate": 4.2889647326507393e-07, "loss": 0.0003, "reward": 1.7265625, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7265625, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 122.21614837646484, "epoch": 6.877133105802048, "grad_norm": 1.169103683603451, "kl": 0.3330078125, "learning_rate": 4.286120591581342e-07, "loss": 0.0003, "reward": 1.779296875, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 123.24219131469727, "epoch": 6.8805460750853245, "grad_norm": 1.7415571880236107, "kl": 0.3349609375, "learning_rate": 4.283276450511945e-07, "loss": 0.0003, "reward": 1.7942708134651184, "reward_std": 0.022097086533904076, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.796875, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 119.69531631469727, "epoch": 6.8839590443686, "grad_norm": 0.533730660340003, "kl": 0.3349609375, "learning_rate": 4.2804323094425484e-07, "loss": 0.0003, "reward": 1.8268229365348816, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229067325592, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 118.36719131469727, "epoch": 6.887372013651877, "grad_norm": 0.9792728689255611, "kl": 0.333984375, "learning_rate": 4.277588168373151e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.0364974532276392, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 121.39844131469727, "epoch": 6.890784982935154, "grad_norm": 1.2713535253055357, "kl": 0.3271484375, "learning_rate": 4.2747440273037537e-07, "loss": 0.0003, "reward": 1.7669270634651184, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270932674408, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 120.4140625, "epoch": 6.8941979522184305, "grad_norm": 0.9893617036233551, "kl": 0.33984375, "learning_rate": 4.271899886234357e-07, "loss": 0.0003, "reward": 1.8365885615348816, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8365885317325592, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 119.13021087646484, "epoch": 6.897610921501706, "grad_norm": 0.472224668035417, "kl": 0.3408203125, "learning_rate": 4.26905574516496e-07, "loss": 0.0003, "reward": 1.7526041269302368, "reward_std": 0.01799587346613407, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7526041865348816, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 122.05208587646484, "epoch": 6.901023890784983, "grad_norm": 2.9309489388803542, "kl": 0.3408203125, "learning_rate": 4.2662116040955633e-07, "loss": 0.0003, "reward": 1.7063802480697632, "reward_std": 0.023539516143500805, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7063801884651184, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 120.7578125, "epoch": 6.90443686006826, "grad_norm": 0.47542046125409554, "kl": 0.3291015625, "learning_rate": 4.2633674630261654e-07, "loss": 0.0003, "reward": 1.84765625, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84765625, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 119.69531631469727, "epoch": 6.907849829351536, "grad_norm": 0.007635750238224291, "kl": 0.341796875, "learning_rate": 4.2605233219567686e-07, "loss": 0.0003, "reward": 1.8333333134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8333333432674408, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 121.08594131469727, "epoch": 6.911262798634812, "grad_norm": 0.7011370128269584, "kl": 0.32421875, "learning_rate": 4.257679180887372e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.018147969618439674, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 123.56250381469727, "epoch": 6.914675767918089, "grad_norm": 1.9587329841760923, "kl": 0.3388671875, "learning_rate": 4.254835039817975e-07, "loss": 0.0003, "reward": 1.6595052480697632, "reward_std": 0.034321791026741266, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6595051884651184, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 126.12239837646484, "epoch": 6.918088737201365, "grad_norm": 0.14660252101582413, "kl": 0.34375, "learning_rate": 4.2519908987485777e-07, "loss": 0.0003, "reward": 1.736328125, "reward_std": 0.009207119233906269, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7389323115348816, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 121.92969131469727, "epoch": 6.921501706484642, "grad_norm": 1.2685508816783475, "kl": 0.341796875, "learning_rate": 4.249146757679181e-07, "loss": 0.0003, "reward": 1.7317708134651184, "reward_std": 0.04024505615234375, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.734375, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 122.75260543823242, "epoch": 6.924914675767918, "grad_norm": 0.2156122360099976, "kl": 0.3369140625, "learning_rate": 4.2463026166097835e-07, "loss": 0.0003, "reward": 1.806640625, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.806640625, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 123.73177337646484, "epoch": 6.928327645051194, "grad_norm": 0.007504430474158568, "kl": 0.328125, "learning_rate": 4.2434584755403867e-07, "loss": 0.0003, "reward": 1.7864583730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583134651184, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 122.35677337646484, "epoch": 6.931740614334471, "grad_norm": 1.109320391948835, "kl": 0.322265625, "learning_rate": 4.2406143344709894e-07, "loss": 0.0003, "reward": 1.7239583134651184, "reward_std": 0.0557174002751708, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 122.85156631469727, "epoch": 6.935153583617748, "grad_norm": 0.3604688034831872, "kl": 0.328125, "learning_rate": 4.2377701934015926e-07, "loss": 0.0003, "reward": 1.8125, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8125, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 123.87500381469727, "epoch": 6.938566552901024, "grad_norm": 0.4041913947940741, "kl": 0.326171875, "learning_rate": 4.234926052332196e-07, "loss": 0.0003, "reward": 1.7923176884651184, "reward_std": 0.007935261586681008, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923177182674408, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 120.98958587646484, "epoch": 6.9419795221843, "grad_norm": 0.25789686989116395, "kl": 0.3193359375, "learning_rate": 4.2320819112627985e-07, "loss": 0.0003, "reward": 1.7630208134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7630208432674408, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 122.93229293823242, "epoch": 6.945392491467577, "grad_norm": 2.717007958944514, "kl": 0.3466796875, "learning_rate": 4.229237770193401e-07, "loss": 0.0003, "reward": 1.6881510019302368, "reward_std": 0.055431186221539974, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6907552182674408, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 124.22135543823242, "epoch": 6.948805460750854, "grad_norm": 1.4910847149111672, "kl": 0.328125, "learning_rate": 4.2263936291240043e-07, "loss": 0.0003, "reward": 1.7532551884651184, "reward_std": 0.02708882000297308, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7532552182674408, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 121.87239837646484, "epoch": 6.952218430034129, "grad_norm": 3.984730990675492, "kl": 0.3271484375, "learning_rate": 4.2235494880546075e-07, "loss": 0.0003, "reward": 1.755859375, "reward_std": 0.0453688632696867, "rewards/format_reward": 1.0, "rewards/score_reward": 0.755859375, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 124.50521087646484, "epoch": 6.955631399317406, "grad_norm": 0.8568575154315875, "kl": 0.333984375, "learning_rate": 4.2207053469852107e-07, "loss": 0.0003, "reward": 1.8587239384651184, "reward_std": 0.022818097844719887, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.861328125, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 120.609375, "epoch": 6.959044368600683, "grad_norm": 0.7087989314605038, "kl": 0.3369140625, "learning_rate": 4.217861205915813e-07, "loss": 0.0003, "reward": 1.7135416269302368, "reward_std": 0.025361569598317146, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416865348816, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 123.09896087646484, "epoch": 6.962457337883959, "grad_norm": 1.9219612984488688, "kl": 0.3369140625, "learning_rate": 4.215017064846416e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.03728215675801039, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84375, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 124.18229675292969, "epoch": 6.965870307167235, "grad_norm": 0.4353872104839916, "kl": 0.333984375, "learning_rate": 4.212172923777019e-07, "loss": 0.0003, "reward": 1.734375, "reward_std": 0.020692503545433283, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 121.58073043823242, "epoch": 6.969283276450512, "grad_norm": 4.41812588282041, "kl": 0.3330078125, "learning_rate": 4.2093287827076224e-07, "loss": 0.0003, "reward": 1.712890625, "reward_std": 0.06367138586938381, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 121.22656631469727, "epoch": 6.972696245733788, "grad_norm": 1.3152203674030856, "kl": 0.3232421875, "learning_rate": 4.206484641638225e-07, "loss": 0.0003, "reward": 1.7532552480697632, "reward_std": 0.06231522932648659, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7532551884651184, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 120.34114837646484, "epoch": 6.976109215017065, "grad_norm": 0.32167933574289803, "kl": 0.3349609375, "learning_rate": 4.203640500568828e-07, "loss": 0.0003, "reward": 1.7604166865348816, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7604166567325592, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 120.36458587646484, "epoch": 6.979522184300341, "grad_norm": 0.666406402653827, "kl": 0.3251953125, "learning_rate": 4.200796359499431e-07, "loss": 0.0003, "reward": 1.771484375, "reward_std": 0.029918341897428036, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 122.1796875, "epoch": 6.982935153583618, "grad_norm": 0.8195072188251667, "kl": 0.3349609375, "learning_rate": 4.1979522184300336e-07, "loss": 0.0003, "reward": 1.7845051884651184, "reward_std": 0.0448697991669178, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 123.42708587646484, "epoch": 6.986348122866894, "grad_norm": 0.5508040876105956, "kl": 0.33203125, "learning_rate": 4.195108077360637e-07, "loss": 0.0003, "reward": 1.802734375, "reward_std": 0.016306546051055193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.802734375, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 122.6328125, "epoch": 6.989761092150171, "grad_norm": 1.0814461756497382, "kl": 0.3291015625, "learning_rate": 4.19226393629124e-07, "loss": 0.0003, "reward": 1.7571614384651184, "reward_std": 0.04905317910015583, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.759765625, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 122.6328125, "epoch": 6.993174061433447, "grad_norm": 0.5558575461374591, "kl": 0.337890625, "learning_rate": 4.189419795221843e-07, "loss": 0.0003, "reward": 1.7584635615348816, "reward_std": 0.0300510679371655, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7610677182674408, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 113.6833381652832, "epoch": 6.996587030716723, "grad_norm": 0.7783800668506591, "kl": 0.3310546875, "learning_rate": 4.1865756541524453e-07, "loss": 0.0003, "reward": 1.6000001430511475, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6000000238418579, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 120.19791793823242, "epoch": 7.003412969283277, "grad_norm": 0.8410550494995476, "kl": 0.3359375, "learning_rate": 4.1837315130830485e-07, "loss": 0.0003, "reward": 1.8522135615348816, "reward_std": 0.015319676604121923, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8522135317325592, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 118.84375, "epoch": 7.006825938566553, "grad_norm": 0.5624784189415044, "kl": 0.3359375, "learning_rate": 4.180887372013652e-07, "loss": 0.0003, "reward": 1.845703125, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.845703125, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 121.88281631469727, "epoch": 7.010238907849829, "grad_norm": 1.5913355809040515, "kl": 0.35546875, "learning_rate": 4.178043230944255e-07, "loss": 0.0004, "reward": 1.7428385019302368, "reward_std": 0.026101951487362385, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385615348816, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 121.28646087646484, "epoch": 7.013651877133106, "grad_norm": 0.007155031845089931, "kl": 0.333984375, "learning_rate": 4.1751990898748576e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 120.94531631469727, "epoch": 7.017064846416382, "grad_norm": 0.3407326017347175, "kl": 0.3203125, "learning_rate": 4.17235494880546e-07, "loss": 0.0003, "reward": 1.779296875, "reward_std": 0.0026955686043947935, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 120.82031631469727, "epoch": 7.020477815699659, "grad_norm": 1.1239245672448266, "kl": 0.3310546875, "learning_rate": 4.1695108077360635e-07, "loss": 0.0003, "reward": 1.81640625, "reward_std": 0.05176910292357206, "rewards/format_reward": 1.0, "rewards/score_reward": 0.81640625, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 118.70573043823242, "epoch": 7.023890784982935, "grad_norm": 0.4879587437306996, "kl": 0.34375, "learning_rate": 4.1666666666666667e-07, "loss": 0.0003, "reward": 1.7278646230697632, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7278645634651184, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 120.78385543823242, "epoch": 7.027303754266212, "grad_norm": 0.5610873552925136, "kl": 0.3271484375, "learning_rate": 4.1638225255972693e-07, "loss": 0.0003, "reward": 1.7161458134651184, "reward_std": 0.028058198746293783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458432674408, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 119.77083587646484, "epoch": 7.030716723549488, "grad_norm": 1.1925888606469508, "kl": 0.3349609375, "learning_rate": 4.1609783845278725e-07, "loss": 0.0003, "reward": 1.7545573115348816, "reward_std": 0.028342124540358782, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545572817325592, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 120.30989837646484, "epoch": 7.034129692832765, "grad_norm": 2.7704547616722994, "kl": 0.3173828125, "learning_rate": 4.158134243458475e-07, "loss": 0.0003, "reward": 1.74609375, "reward_std": 0.016876930370926857, "rewards/format_reward": 1.0, "rewards/score_reward": 0.74609375, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 122.95573043823242, "epoch": 7.037542662116041, "grad_norm": 0.5801456104250883, "kl": 0.3271484375, "learning_rate": 4.1552901023890784e-07, "loss": 0.0003, "reward": 1.8190104365348816, "reward_std": 0.031626221258193254, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8190104067325592, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 122.53385543823242, "epoch": 7.040955631399317, "grad_norm": 1.1219200337476085, "kl": 0.3193359375, "learning_rate": 4.152445961319681e-07, "loss": 0.0003, "reward": 1.7005208134651184, "reward_std": 0.030772077850997448, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 121.48958587646484, "epoch": 7.044368600682594, "grad_norm": 0.8730100349356563, "kl": 0.33203125, "learning_rate": 4.149601820250284e-07, "loss": 0.0003, "reward": 1.7161458134651184, "reward_std": 0.032745164819061756, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7161458432674408, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 116.39323043823242, "epoch": 7.047781569965871, "grad_norm": 0.00709566127440147, "kl": 0.3310546875, "learning_rate": 4.1467576791808874e-07, "loss": 0.0003, "reward": 1.828125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.828125, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 117.63802337646484, "epoch": 7.051194539249146, "grad_norm": 0.9657581446907505, "kl": 0.333984375, "learning_rate": 4.14391353811149e-07, "loss": 0.0003, "reward": 1.8352864980697632, "reward_std": 0.021697684191167355, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8352864384651184, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 120.66666793823242, "epoch": 7.054607508532423, "grad_norm": 1.7385533299993718, "kl": 0.3330078125, "learning_rate": 4.141069397042093e-07, "loss": 0.0003, "reward": 1.7141927480697632, "reward_std": 0.03344827424734831, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7141926884651184, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 119.70052337646484, "epoch": 7.0580204778157, "grad_norm": 0.538410857427652, "kl": 0.3203125, "learning_rate": 4.138225255972696e-07, "loss": 0.0003, "reward": 1.7584635019302368, "reward_std": 0.023672240786254406, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635615348816, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 120.38802337646484, "epoch": 7.061433447098976, "grad_norm": 0.9845172700086725, "kl": 0.3203125, "learning_rate": 4.135381114903299e-07, "loss": 0.0003, "reward": 1.7200521230697632, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520634651184, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 120.1640625, "epoch": 7.064846416382252, "grad_norm": 1.1345315284274111, "kl": 0.3212890625, "learning_rate": 4.1325369738339024e-07, "loss": 0.0003, "reward": 1.724609375, "reward_std": 0.026234676130115986, "rewards/format_reward": 1.0, "rewards/score_reward": 0.724609375, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 121.40885543823242, "epoch": 7.068259385665529, "grad_norm": 0.33365255965890245, "kl": 0.326171875, "learning_rate": 4.1296928327645045e-07, "loss": 0.0003, "reward": 1.7526041865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7526041567325592, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 119.31250381469727, "epoch": 7.071672354948806, "grad_norm": 3.21920159287233, "kl": 0.3427734375, "learning_rate": 4.1268486916951077e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.019134188070893288, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 121.21614837646484, "epoch": 7.075085324232082, "grad_norm": 0.44674221523765983, "kl": 0.3359375, "learning_rate": 4.124004550625711e-07, "loss": 0.0003, "reward": 1.8756510615348816, "reward_std": 0.016306546051055193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8756510317325592, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 121.97396087646484, "epoch": 7.078498293515358, "grad_norm": 0.7807135149651387, "kl": 0.3203125, "learning_rate": 4.121160409556314e-07, "loss": 0.0003, "reward": 1.7545572519302368, "reward_std": 0.022533927112817764, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7545573115348816, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 121.19010925292969, "epoch": 7.081911262798635, "grad_norm": 6.994011279701762, "kl": 0.326171875, "learning_rate": 4.118316268486917e-07, "loss": 0.0003, "reward": 1.76171875, "reward_std": 0.03537851106375456, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 122.65104293823242, "epoch": 7.085324232081911, "grad_norm": 1.1926391255612878, "kl": 0.3330078125, "learning_rate": 4.11547212741752e-07, "loss": 0.0003, "reward": 1.7233072519302368, "reward_std": 0.03232933022081852, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7259114682674408, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 119.98437881469727, "epoch": 7.088737201365188, "grad_norm": 2.1885993332841602, "kl": 0.3408203125, "learning_rate": 4.1126279863481226e-07, "loss": 0.0003, "reward": 1.6725260615348816, "reward_std": 0.03395398333668709, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6725260317325592, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 120.76823043823242, "epoch": 7.092150170648464, "grad_norm": 0.863047687199368, "kl": 0.345703125, "learning_rate": 4.109783845278726e-07, "loss": 0.0003, "reward": 1.6614583134651184, "reward_std": 0.048484429717063904, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6614583432674408, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 117.80989837646484, "epoch": 7.09556313993174, "grad_norm": 1.5924815952860185, "kl": 0.326171875, "learning_rate": 4.1069397042093285e-07, "loss": 0.0003, "reward": 1.8483072519302368, "reward_std": 0.021109154913574457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8483073115348816, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 120.65104293823242, "epoch": 7.098976109215017, "grad_norm": 1.226803727074065, "kl": 0.3291015625, "learning_rate": 4.1040955631399317e-07, "loss": 0.0003, "reward": 1.837890625, "reward_std": 0.02636822033673525, "rewards/format_reward": 1.0, "rewards/score_reward": 0.837890625, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 120.11198043823242, "epoch": 7.102389078498294, "grad_norm": 1.2039274206264723, "kl": 0.3427734375, "learning_rate": 4.101251422070535e-07, "loss": 0.0003, "reward": 1.7330728769302368, "reward_std": 0.028797519393265247, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729365348816, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 120.89323043823242, "epoch": 7.1058020477815695, "grad_norm": 0.8375688283552757, "kl": 0.3896484375, "learning_rate": 4.098407281001137e-07, "loss": 0.0004, "reward": 1.8463541865348816, "reward_std": 0.027486102655529976, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8463541567325592, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 121.78125381469727, "epoch": 7.109215017064846, "grad_norm": 0.007119796657915448, "kl": 0.3291015625, "learning_rate": 4.09556313993174e-07, "loss": 0.0003, "reward": 1.8020833730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833134651184, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 124.73437881469727, "epoch": 7.112627986348123, "grad_norm": 1.575276066830238, "kl": 0.3251953125, "learning_rate": 4.0927189988623434e-07, "loss": 0.0003, "reward": 1.810546875, "reward_std": 0.02708882000297308, "rewards/format_reward": 1.0, "rewards/score_reward": 0.810546875, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 120.97656631469727, "epoch": 7.1160409556314, "grad_norm": 0.455844373071591, "kl": 0.322265625, "learning_rate": 4.0898748577929466e-07, "loss": 0.0003, "reward": 1.7623698115348816, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623697817325592, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 119.92708587646484, "epoch": 7.1194539249146755, "grad_norm": 1.0654475921396658, "kl": 0.328125, "learning_rate": 4.087030716723549e-07, "loss": 0.0003, "reward": 1.7552083134651184, "reward_std": 0.016173413023352623, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7552083432674408, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 122.17969131469727, "epoch": 7.122866894197952, "grad_norm": 0.006740040484262562, "kl": 0.330078125, "learning_rate": 4.084186575654152e-07, "loss": 0.0003, "reward": 1.8177083134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083432674408, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 118.78385543823242, "epoch": 7.126279863481229, "grad_norm": 0.32748857782340196, "kl": 0.328125, "learning_rate": 4.081342434584755e-07, "loss": 0.0003, "reward": 1.8359375, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8359375, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 121.15364837646484, "epoch": 7.129692832764505, "grad_norm": 0.3681274467485712, "kl": 0.3173828125, "learning_rate": 4.0784982935153583e-07, "loss": 0.0003, "reward": 1.8704426884651184, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8704427182674408, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 123.60156631469727, "epoch": 7.1331058020477816, "grad_norm": 1.325011016886877, "kl": 0.3515625, "learning_rate": 4.075654152445961e-07, "loss": 0.0004, "reward": 1.859375, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.859375, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 121.00000381469727, "epoch": 7.136518771331058, "grad_norm": 1.743551738825511, "kl": 0.3154296875, "learning_rate": 4.072810011376564e-07, "loss": 0.0003, "reward": 1.8411458730697632, "reward_std": 0.02665361389517784, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8411458134651184, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 122.30729293823242, "epoch": 7.139931740614334, "grad_norm": 1.0737067913652363, "kl": 0.330078125, "learning_rate": 4.0699658703071674e-07, "loss": 0.0003, "reward": 1.7135416865348816, "reward_std": 0.035423893481492996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 122.43750381469727, "epoch": 7.143344709897611, "grad_norm": 0.1588035733363699, "kl": 0.328125, "learning_rate": 4.06712172923777e-07, "loss": 0.0003, "reward": 1.6334635019302368, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6334635615348816, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 124.06771087646484, "epoch": 7.146757679180888, "grad_norm": 0.013207555187554725, "kl": 0.3154296875, "learning_rate": 4.0642775881683727e-07, "loss": 0.0003, "reward": 1.78125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 120.35937881469727, "epoch": 7.150170648464163, "grad_norm": 0.0064020570868710104, "kl": 0.322265625, "learning_rate": 4.061433447098976e-07, "loss": 0.0003, "reward": 1.8229166865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8229166567325592, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 124.87500381469727, "epoch": 7.15358361774744, "grad_norm": 1.2407138450900055, "kl": 0.337890625, "learning_rate": 4.058589306029579e-07, "loss": 0.0003, "reward": 1.7434895634651184, "reward_std": 0.02834236901253462, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.74609375, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 124.32812881469727, "epoch": 7.156996587030717, "grad_norm": 0.4220986699302694, "kl": 0.3212890625, "learning_rate": 4.0557451649601823e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84375, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 122.68229293823242, "epoch": 7.160409556313994, "grad_norm": 0.7977937957018374, "kl": 0.3349609375, "learning_rate": 4.0529010238907844e-07, "loss": 0.0003, "reward": 1.73828125, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.73828125, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 119.75000381469727, "epoch": 7.163822525597269, "grad_norm": 0.7228738789757542, "kl": 0.3427734375, "learning_rate": 4.0500568828213876e-07, "loss": 0.0003, "reward": 1.6380208134651184, "reward_std": 0.028058198746293783, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.640625, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 121.48177337646484, "epoch": 7.167235494880546, "grad_norm": 0.24560352212935768, "kl": 0.3193359375, "learning_rate": 4.047212741751991e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84375, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 120.71875, "epoch": 7.170648464163823, "grad_norm": 2.2746446964559794, "kl": 0.330078125, "learning_rate": 4.044368600682594e-07, "loss": 0.0003, "reward": 1.748046875, "reward_std": 0.03103728499263525, "rewards/format_reward": 1.0, "rewards/score_reward": 0.748046875, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 124.29948043823242, "epoch": 7.174061433447099, "grad_norm": 1.1470602753468504, "kl": 0.32421875, "learning_rate": 4.0415244596131967e-07, "loss": 0.0003, "reward": 1.78515625, "reward_std": 0.03305034339427948, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78515625, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 121.1015625, "epoch": 7.177474402730375, "grad_norm": 0.006770978277451123, "kl": 0.3212890625, "learning_rate": 4.0386803185437993e-07, "loss": 0.0003, "reward": 1.765625, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.765625, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 119.29167175292969, "epoch": 7.180887372013652, "grad_norm": 1.2326218134686622, "kl": 0.328125, "learning_rate": 4.0358361774744025e-07, "loss": 0.0003, "reward": 1.7975260615348816, "reward_std": 0.04878543969243765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7975260317325592, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 121.1875, "epoch": 7.184300341296928, "grad_norm": 0.5665607107335766, "kl": 0.3193359375, "learning_rate": 4.0329920364050057e-07, "loss": 0.0003, "reward": 1.859375, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.859375, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 119.91666793823242, "epoch": 7.187713310580205, "grad_norm": 0.024624107399396165, "kl": 0.3369140625, "learning_rate": 4.0301478953356084e-07, "loss": 0.0003, "reward": 1.6927083134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6927083432674408, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 122.24739837646484, "epoch": 7.191126279863481, "grad_norm": 0.9276690523125267, "kl": 0.3251953125, "learning_rate": 4.0273037542662116e-07, "loss": 0.0003, "reward": 1.8352864384651184, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8352864682674408, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 123.05469131469727, "epoch": 7.194539249146757, "grad_norm": 0.9533496595816954, "kl": 0.33203125, "learning_rate": 4.024459613196814e-07, "loss": 0.0003, "reward": 1.7389323115348816, "reward_std": 0.01913524977862835, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389322817325592, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 120.05729675292969, "epoch": 7.197952218430034, "grad_norm": 0.8910196409056688, "kl": 0.3310546875, "learning_rate": 4.0216154721274174e-07, "loss": 0.0003, "reward": 1.8255208730697632, "reward_std": 0.037853604182600975, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8255208134651184, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 120.59375, "epoch": 7.201365187713311, "grad_norm": 1.1770482010982146, "kl": 0.3291015625, "learning_rate": 4.01877133105802e-07, "loss": 0.0003, "reward": 1.7884114980697632, "reward_std": 0.04707862436771393, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7884114384651184, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 123.07291793823242, "epoch": 7.204778156996587, "grad_norm": 0.6800071038310042, "kl": 0.3232421875, "learning_rate": 4.0159271899886233e-07, "loss": 0.0003, "reward": 1.8795573115348816, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8795572817325592, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 119.81250381469727, "epoch": 7.208191126279863, "grad_norm": 1.1487429395984592, "kl": 0.3271484375, "learning_rate": 4.0130830489192265e-07, "loss": 0.0003, "reward": 1.765625, "reward_std": 0.02551448391750455, "rewards/format_reward": 1.0, "rewards/score_reward": 0.765625, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 120.47917175292969, "epoch": 7.21160409556314, "grad_norm": 1.2942779191255953, "kl": 0.34375, "learning_rate": 4.0102389078498297e-07, "loss": 0.0003, "reward": 1.7643229365348816, "reward_std": 0.03202521428465843, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7643229067325592, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 118.69010925292969, "epoch": 7.215017064846417, "grad_norm": 0.5927647540364404, "kl": 0.3310546875, "learning_rate": 4.007394766780432e-07, "loss": 0.0003, "reward": 1.8059896230697632, "reward_std": 0.01841423800215125, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.80859375, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 118.23437881469727, "epoch": 7.2184300341296925, "grad_norm": 0.625868515569712, "kl": 0.3203125, "learning_rate": 4.004550625711035e-07, "loss": 0.0003, "reward": 1.8658854365348816, "reward_std": 0.016876930370926857, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8658854067325592, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 118.55729675292969, "epoch": 7.221843003412969, "grad_norm": 0.9354175767572648, "kl": 0.3388671875, "learning_rate": 4.001706484641638e-07, "loss": 0.0003, "reward": 1.7337239384651184, "reward_std": 0.03344827517867088, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7337239682674408, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 115.83854675292969, "epoch": 7.225255972696246, "grad_norm": 0.326936895857574, "kl": 0.33203125, "learning_rate": 3.998862343572241e-07, "loss": 0.0003, "reward": 1.8203125, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8203125, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 118.35937881469727, "epoch": 7.228668941979522, "grad_norm": 1.2730302638659896, "kl": 0.3447265625, "learning_rate": 3.996018202502844e-07, "loss": 0.0003, "reward": 1.734375, "reward_std": 0.025380939710885286, "rewards/format_reward": 1.0, "rewards/score_reward": 0.734375, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 118.54687881469727, "epoch": 7.2320819112627985, "grad_norm": 0.7431587231566628, "kl": 0.3271484375, "learning_rate": 3.993174061433447e-07, "loss": 0.0003, "reward": 1.8268229365348816, "reward_std": 0.0378536032512784, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229067325592, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 115.91667175292969, "epoch": 7.235494880546075, "grad_norm": 0.37198256205974645, "kl": 0.341796875, "learning_rate": 3.99032992036405e-07, "loss": 0.0003, "reward": 1.7708333730697632, "reward_std": 0.011135884560644627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333134651184, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 115.97656631469727, "epoch": 7.238907849829351, "grad_norm": 0.9606197565869113, "kl": 0.349609375, "learning_rate": 3.9874857792946526e-07, "loss": 0.0004, "reward": 1.8255208134651184, "reward_std": 0.02551366575062275, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.828125, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 115.41667175292969, "epoch": 7.242320819112628, "grad_norm": 0.53116915768969, "kl": 0.322265625, "learning_rate": 3.984641638225256e-07, "loss": 0.0003, "reward": 1.724609375, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.724609375, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 116.12760925292969, "epoch": 7.2457337883959045, "grad_norm": 1.081297777275376, "kl": 0.3359375, "learning_rate": 3.981797497155859e-07, "loss": 0.0003, "reward": 1.7662760019302368, "reward_std": 0.02748863259330392, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7662760615348816, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 115.04427337646484, "epoch": 7.249146757679181, "grad_norm": 0.31997192215111325, "kl": 0.326171875, "learning_rate": 3.9789533560864617e-07, "loss": 0.0003, "reward": 1.7005208134651184, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208432674408, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 115.08333587646484, "epoch": 7.252559726962457, "grad_norm": 0.904486201078897, "kl": 0.3330078125, "learning_rate": 3.9761092150170643e-07, "loss": 0.0003, "reward": 1.8046875, "reward_std": 0.008351913653314114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 113.19010925292969, "epoch": 7.255972696245734, "grad_norm": 1.1567688355943329, "kl": 0.3359375, "learning_rate": 3.9732650739476675e-07, "loss": 0.0003, "reward": 1.75390625, "reward_std": 0.047345301136374474, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75390625, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 113.56771087646484, "epoch": 7.2593856655290105, "grad_norm": 0.4445970955347968, "kl": 0.3408203125, "learning_rate": 3.9704209328782707e-07, "loss": 0.0003, "reward": 1.7819010615348816, "reward_std": 0.015319675207138062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7819010317325592, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 114.66146087646484, "epoch": 7.262798634812286, "grad_norm": 3.2009253797323063, "kl": 0.3359375, "learning_rate": 3.967576791808874e-07, "loss": 0.0003, "reward": 1.7428385615348816, "reward_std": 0.028779210057109594, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7428385317325592, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 114.19271087646484, "epoch": 7.266211604095563, "grad_norm": 5.125053140233845, "kl": 0.333984375, "learning_rate": 3.964732650739476e-07, "loss": 0.0003, "reward": 1.7447916865348816, "reward_std": 0.03892781212925911, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916567325592, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 113.359375, "epoch": 7.26962457337884, "grad_norm": 1.282460257543352, "kl": 0.3359375, "learning_rate": 3.961888509670079e-07, "loss": 0.0003, "reward": 1.8313801884651184, "reward_std": 0.03360037039965391, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8313802182674408, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 112.41146087646484, "epoch": 7.273037542662116, "grad_norm": 0.7295598549315228, "kl": 0.32421875, "learning_rate": 3.9590443686006824e-07, "loss": 0.0003, "reward": 1.8671875, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8671875, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 114.89583587646484, "epoch": 7.276450511945392, "grad_norm": 2.2859549384428655, "kl": 0.3369140625, "learning_rate": 3.9562002275312856e-07, "loss": 0.0003, "reward": 1.8509114384651184, "reward_std": 0.03103728499263525, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8509114682674408, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 112.24219131469727, "epoch": 7.279863481228669, "grad_norm": 0.8667688761595848, "kl": 0.3466796875, "learning_rate": 3.9533560864618883e-07, "loss": 0.0003, "reward": 1.8541666865348816, "reward_std": 0.016703827306628227, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8541666567325592, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 112.44531631469727, "epoch": 7.283276450511945, "grad_norm": 0.3745958460719327, "kl": 0.333984375, "learning_rate": 3.950511945392491e-07, "loss": 0.0003, "reward": 1.83984375, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.83984375, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 111.66927337646484, "epoch": 7.286689419795222, "grad_norm": 0.3198112061809034, "kl": 0.3251953125, "learning_rate": 3.947667804323094e-07, "loss": 0.0003, "reward": 1.8072916865348816, "reward_std": 0.011135884560644627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8072916567325592, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 112.32812881469727, "epoch": 7.290102389078498, "grad_norm": 2.191346392326988, "kl": 0.34375, "learning_rate": 3.9448236632536974e-07, "loss": 0.0003, "reward": 1.7447916865348816, "reward_std": 0.03458764776587486, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7447916567325592, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 113.66666793823242, "epoch": 7.293515358361775, "grad_norm": 1.4103851734364894, "kl": 0.3349609375, "learning_rate": 3.9419795221843e-07, "loss": 0.0003, "reward": 1.7825520634651184, "reward_std": 0.021962891332805157, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520932674408, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 113.32552337646484, "epoch": 7.296928327645051, "grad_norm": 0.9195861740676294, "kl": 0.337890625, "learning_rate": 3.939135381114903e-07, "loss": 0.0003, "reward": 1.7330729365348816, "reward_std": 0.023254938423633575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729067325592, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 113.31250381469727, "epoch": 7.300341296928328, "grad_norm": 1.0825494452500555, "kl": 0.3232421875, "learning_rate": 3.9362912400455064e-07, "loss": 0.0003, "reward": 1.7845051884651184, "reward_std": 0.03090521041303873, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 112.43489837646484, "epoch": 7.303754266211604, "grad_norm": 0.2751356180325541, "kl": 0.326171875, "learning_rate": 3.933447098976109e-07, "loss": 0.0003, "reward": 1.7005208730697632, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7005208134651184, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 112.90104293823242, "epoch": 7.30716723549488, "grad_norm": 8.46300919329813, "kl": 0.341796875, "learning_rate": 3.930602957906712e-07, "loss": 0.0003, "reward": 1.7936198115348816, "reward_std": 0.03794570825994015, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7936197817325592, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 111.66666793823242, "epoch": 7.310580204778157, "grad_norm": 1.0738521046108667, "kl": 0.3291015625, "learning_rate": 3.927758816837315e-07, "loss": 0.0003, "reward": 1.685546875, "reward_std": 0.030905209947377443, "rewards/format_reward": 1.0, "rewards/score_reward": 0.685546875, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 111.63802337646484, "epoch": 7.313993174061434, "grad_norm": 1.334525060676517, "kl": 0.3251953125, "learning_rate": 3.924914675767918e-07, "loss": 0.0003, "reward": 1.7526041865348816, "reward_std": 0.0293300561606884, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7552083432674408, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 109.99479293823242, "epoch": 7.3174061433447095, "grad_norm": 1.0412109623744763, "kl": 0.3359375, "learning_rate": 3.9220705346985213e-07, "loss": 0.0003, "reward": 1.7942708134651184, "reward_std": 0.021830817684531212, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708432674408, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 109.43229675292969, "epoch": 7.320819112627986, "grad_norm": 0.2687962597890707, "kl": 0.3359375, "learning_rate": 3.9192263936291235e-07, "loss": 0.0003, "reward": 1.8020833730697632, "reward_std": 0.011135884560644627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833134651184, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 110.13802337646484, "epoch": 7.324232081911263, "grad_norm": 0.16246446772401235, "kl": 0.3251953125, "learning_rate": 3.9163822525597267e-07, "loss": 0.0003, "reward": 1.7682291865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 110.40364837646484, "epoch": 7.327645051194539, "grad_norm": 0.5430747640152088, "kl": 0.3388671875, "learning_rate": 3.91353811149033e-07, "loss": 0.0003, "reward": 1.75390625, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75390625, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 111.38542175292969, "epoch": 7.3310580204778155, "grad_norm": 1.2226295301027486, "kl": 0.3369140625, "learning_rate": 3.910693970420933e-07, "loss": 0.0003, "reward": 1.7571614384651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7571614682674408, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 109.91146087646484, "epoch": 7.334470989761092, "grad_norm": 0.39602419631999475, "kl": 0.3232421875, "learning_rate": 3.9078498293515357e-07, "loss": 0.0003, "reward": 1.765625, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.765625, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 108.796875, "epoch": 7.337883959044369, "grad_norm": 1.0279544105849314, "kl": 0.3212890625, "learning_rate": 3.9050056882821384e-07, "loss": 0.0003, "reward": 1.7532552480697632, "reward_std": 0.02124188095331192, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7532551884651184, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 108.57552337646484, "epoch": 7.341296928327645, "grad_norm": 0.8197232722546933, "kl": 0.341796875, "learning_rate": 3.9021615472127416e-07, "loss": 0.0003, "reward": 1.7389323115348816, "reward_std": 0.039695026353001595, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7389322817325592, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 109.91146087646484, "epoch": 7.3447098976109215, "grad_norm": 0.8671519472050065, "kl": 0.3291015625, "learning_rate": 3.899317406143344e-07, "loss": 0.0003, "reward": 1.8450521230697632, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8450520634651184, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 111.6796875, "epoch": 7.348122866894198, "grad_norm": 0.46186763552778676, "kl": 0.3369140625, "learning_rate": 3.8964732650739474e-07, "loss": 0.0003, "reward": 1.7571614384651184, "reward_std": 0.02124188095331192, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7571614682674408, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 111.29687881469727, "epoch": 7.351535836177474, "grad_norm": 1.3731378587982297, "kl": 0.337890625, "learning_rate": 3.8936291240045506e-07, "loss": 0.0003, "reward": 1.7109375, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7109375, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 108.99739837646484, "epoch": 7.354948805460751, "grad_norm": 0.025633975547215034, "kl": 0.333984375, "learning_rate": 3.8907849829351533e-07, "loss": 0.0003, "reward": 1.7760416269302368, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7760416865348816, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 110.95052337646484, "epoch": 7.3583617747440275, "grad_norm": 0.3018975052011507, "kl": 0.3359375, "learning_rate": 3.887940841865756e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 111.44271087646484, "epoch": 7.361774744027303, "grad_norm": 1.2616463001141347, "kl": 0.3486328125, "learning_rate": 3.885096700796359e-07, "loss": 0.0003, "reward": 1.66015625, "reward_std": 0.05799337662756443, "rewards/format_reward": 1.0, "rewards/score_reward": 0.66015625, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 110.140625, "epoch": 7.36518771331058, "grad_norm": 0.008323207857516906, "kl": 0.33984375, "learning_rate": 3.8822525597269624e-07, "loss": 0.0003, "reward": 1.8177083730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8177083134651184, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 110.125, "epoch": 7.368600682593857, "grad_norm": 1.1206046655395145, "kl": 0.33984375, "learning_rate": 3.8794084186575656e-07, "loss": 0.0003, "reward": 1.771484375, "reward_std": 0.022685371339321136, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 109.61719131469727, "epoch": 7.372013651877133, "grad_norm": 1.4726994530722568, "kl": 0.33203125, "learning_rate": 3.8765642775881677e-07, "loss": 0.0003, "reward": 1.7265625, "reward_std": 0.03018354857340455, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7265625, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 110.13802337646484, "epoch": 7.375426621160409, "grad_norm": 1.3264965991986848, "kl": 0.3408203125, "learning_rate": 3.873720136518771e-07, "loss": 0.0003, "reward": 1.7923177480697632, "reward_std": 0.03346764296293259, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.794921875, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 109.921875, "epoch": 7.378839590443686, "grad_norm": 2.6487590547784685, "kl": 0.34375, "learning_rate": 3.870875995449374e-07, "loss": 0.0003, "reward": 1.7799479365348816, "reward_std": 0.020843947771936655, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7799479067325592, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 113.26041793823242, "epoch": 7.382252559726963, "grad_norm": 1.0242921751545493, "kl": 0.333984375, "learning_rate": 3.8680318543799773e-07, "loss": 0.0003, "reward": 1.857421875, "reward_std": 0.03754948824644089, "rewards/format_reward": 1.0, "rewards/score_reward": 0.857421875, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 111.89323043823242, "epoch": 7.385665529010239, "grad_norm": 0.6454487267290074, "kl": 0.3330078125, "learning_rate": 3.86518771331058e-07, "loss": 0.0003, "reward": 1.7311197519302368, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311198115348816, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 114.01041793823242, "epoch": 7.389078498293515, "grad_norm": 1.4949452954275655, "kl": 0.33203125, "learning_rate": 3.862343572241183e-07, "loss": 0.0003, "reward": 1.8658854365348816, "reward_std": 0.024525326676666737, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8658854067325592, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 111.28385925292969, "epoch": 7.392491467576792, "grad_norm": 40.90026891927938, "kl": 0.3369140625, "learning_rate": 3.859499431171786e-07, "loss": 0.0003, "reward": 1.7467448115348816, "reward_std": 0.028512938879430294, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467447817325592, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 108.18489837646484, "epoch": 7.395904436860068, "grad_norm": 0.35612191683367633, "kl": 0.341796875, "learning_rate": 3.856655290102389e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84375, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 114.10156631469727, "epoch": 7.399317406143345, "grad_norm": 0.5596044563154648, "kl": 0.3310546875, "learning_rate": 3.8538111490329917e-07, "loss": 0.0003, "reward": 1.7688801884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7688802182674408, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 115.0, "epoch": 7.402730375426621, "grad_norm": 0.7322190369402086, "kl": 0.330078125, "learning_rate": 3.850967007963595e-07, "loss": 0.0003, "reward": 1.8411458134651184, "reward_std": 0.008351913653314114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8411458432674408, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 111.33073043823242, "epoch": 7.406143344709897, "grad_norm": 1.3480347972510807, "kl": 0.33203125, "learning_rate": 3.848122866894198e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.04973691888153553, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 116.51302337646484, "epoch": 7.409556313993174, "grad_norm": 0.01685521789354278, "kl": 0.318359375, "learning_rate": 3.8452787258248007e-07, "loss": 0.0003, "reward": 1.8854166269302368, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8854166865348816, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 114.50260925292969, "epoch": 7.412969283276451, "grad_norm": 0.7501603869829129, "kl": 0.3310546875, "learning_rate": 3.8424345847554034e-07, "loss": 0.0003, "reward": 1.75, "reward_std": 0.020976672880351543, "rewards/format_reward": 1.0, "rewards/score_reward": 0.75, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 112.17448043823242, "epoch": 7.4163822525597265, "grad_norm": 0.013294401561550843, "kl": 0.326171875, "learning_rate": 3.8395904436860066e-07, "loss": 0.0003, "reward": 1.78125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.78125, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 113.53906631469727, "epoch": 7.419795221843003, "grad_norm": 1.4260318370976812, "kl": 0.3291015625, "learning_rate": 3.83674630261661e-07, "loss": 0.0003, "reward": 1.7799478769302368, "reward_std": 0.03416928742080927, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7799479365348816, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 112.52083587646484, "epoch": 7.42320819112628, "grad_norm": 2.2651596606390143, "kl": 0.3251953125, "learning_rate": 3.833902161547213e-07, "loss": 0.0003, "reward": 1.7669271230697632, "reward_std": 0.018015244510024786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270634651184, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 113.95312881469727, "epoch": 7.426621160409557, "grad_norm": 3.7697726325534076, "kl": 0.337890625, "learning_rate": 3.831058020477815e-07, "loss": 0.0003, "reward": 1.720703125, "reward_std": 0.03584054671227932, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7233073115348816, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 115.67708587646484, "epoch": 7.4300341296928325, "grad_norm": 0.6765191444027113, "kl": 0.3271484375, "learning_rate": 3.8282138794084183e-07, "loss": 0.0003, "reward": 1.8522135615348816, "reward_std": 0.015319676604121923, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8522135317325592, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 113.85677337646484, "epoch": 7.433447098976109, "grad_norm": 0.5150108757594062, "kl": 0.349609375, "learning_rate": 3.8253697383390215e-07, "loss": 0.0004, "reward": 1.7845051884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 117.75000381469727, "epoch": 7.436860068259386, "grad_norm": 1.19433545829117, "kl": 0.3359375, "learning_rate": 3.8225255972696247e-07, "loss": 0.0003, "reward": 1.7858072519302368, "reward_std": 0.030905209947377443, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7858073115348816, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 116.31510925292969, "epoch": 7.440273037542662, "grad_norm": 0.2599993432285594, "kl": 0.328125, "learning_rate": 3.8196814562002274e-07, "loss": 0.0003, "reward": 1.853515625, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.853515625, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 116.95833587646484, "epoch": 7.4436860068259385, "grad_norm": 3.7000130499589594, "kl": 0.32421875, "learning_rate": 3.81683731513083e-07, "loss": 0.0003, "reward": 1.8268229365348816, "reward_std": 0.02465952094644308, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229067325592, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 114.71875381469727, "epoch": 7.447098976109215, "grad_norm": 1.1183770825243533, "kl": 0.3447265625, "learning_rate": 3.813993174061433e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.04033321375027299, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 114.4609375, "epoch": 7.450511945392491, "grad_norm": 0.2472043259238858, "kl": 0.326171875, "learning_rate": 3.8111490329920364e-07, "loss": 0.0003, "reward": 1.7779947519302368, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7779948115348816, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 115.36719131469727, "epoch": 7.453924914675768, "grad_norm": 0.7723539672531474, "kl": 0.330078125, "learning_rate": 3.808304891922639e-07, "loss": 0.0003, "reward": 1.873046875, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.873046875, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 115.48958587646484, "epoch": 7.4573378839590445, "grad_norm": 1.121533586962194, "kl": 0.32421875, "learning_rate": 3.8054607508532423e-07, "loss": 0.0003, "reward": 1.8014323115348816, "reward_std": 0.04423283785581589, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8014322817325592, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 117.27604675292969, "epoch": 7.460750853242321, "grad_norm": 0.535200345902053, "kl": 0.3271484375, "learning_rate": 3.8026166097838455e-07, "loss": 0.0003, "reward": 1.7884114384651184, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7884114682674408, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 117.06510543823242, "epoch": 7.464163822525597, "grad_norm": 1.278101457434852, "kl": 0.3984375, "learning_rate": 3.7997724687144476e-07, "loss": 0.0004, "reward": 1.7721353769302368, "reward_std": 0.01828151335939765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354365348816, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 114.57291793823242, "epoch": 7.467576791808874, "grad_norm": 2.6331889360654674, "kl": 0.3359375, "learning_rate": 3.796928327645051e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.03261309023946524, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 115.1640625, "epoch": 7.4709897610921505, "grad_norm": 0.9464942939011072, "kl": 0.33203125, "learning_rate": 3.794084186575654e-07, "loss": 0.0003, "reward": 1.8125, "reward_std": 0.025867275893688202, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8125, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 117.35156631469727, "epoch": 7.474402730375426, "grad_norm": 0.3249064416897853, "kl": 0.3232421875, "learning_rate": 3.791240045506257e-07, "loss": 0.0003, "reward": 1.828125, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.828125, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 115.90625381469727, "epoch": 7.477815699658703, "grad_norm": 1.7082234454503236, "kl": 0.3408203125, "learning_rate": 3.78839590443686e-07, "loss": 0.0003, "reward": 1.7291666269302368, "reward_std": 0.037238895893096924, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666865348816, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 117.27604293823242, "epoch": 7.48122866894198, "grad_norm": 1.2513463601723593, "kl": 0.3271484375, "learning_rate": 3.7855517633674625e-07, "loss": 0.0003, "reward": 1.791015625, "reward_std": 0.030905209481716156, "rewards/format_reward": 1.0, "rewards/score_reward": 0.791015625, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 116.85677337646484, "epoch": 7.484641638225256, "grad_norm": 0.5101061366094729, "kl": 0.361328125, "learning_rate": 3.7827076222980657e-07, "loss": 0.0004, "reward": 1.7584635019302368, "reward_std": 0.015319676604121923, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7584635615348816, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 116.75260925292969, "epoch": 7.488054607508532, "grad_norm": 0.4178610967313168, "kl": 0.326171875, "learning_rate": 3.779863481228669e-07, "loss": 0.0003, "reward": 1.8014323115348816, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8014322817325592, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 119.27604293823242, "epoch": 7.491467576791809, "grad_norm": 0.6243561226045041, "kl": 0.3251953125, "learning_rate": 3.7770193401592716e-07, "loss": 0.0003, "reward": 1.8072916865348816, "reward_std": 0.03546133264899254, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8072916567325592, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 117.03906631469727, "epoch": 7.494880546075085, "grad_norm": 8.190837934677122, "kl": 0.3232421875, "learning_rate": 3.774175199089875e-07, "loss": 0.0003, "reward": 1.8098958134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958432674408, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 116.27604675292969, "epoch": 7.498293515358362, "grad_norm": 0.38333638898806544, "kl": 0.3369140625, "learning_rate": 3.7713310580204775e-07, "loss": 0.0003, "reward": 1.7897135615348816, "reward_std": 0.015168231911957264, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7897135317325592, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 118.08854293823242, "epoch": 7.501706484641638, "grad_norm": 0.618544914678829, "kl": 0.337890625, "learning_rate": 3.7684869169510806e-07, "loss": 0.0003, "reward": 1.7591146230697632, "reward_std": 0.023254937492311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145634651184, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 117.33854293823242, "epoch": 7.505119453924914, "grad_norm": 0.48413440890542836, "kl": 0.32421875, "learning_rate": 3.7656427758816833e-07, "loss": 0.0003, "reward": 1.8860676884651184, "reward_std": 0.019222591072320938, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8860677182674408, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 113.60677337646484, "epoch": 7.508532423208191, "grad_norm": 7.33778727622696, "kl": 0.3330078125, "learning_rate": 3.7627986348122865e-07, "loss": 0.0003, "reward": 1.6940104365348816, "reward_std": 0.01687692990526557, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6940104067325592, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 119.35416793823242, "epoch": 7.511945392491468, "grad_norm": 1.2385221755150944, "kl": 0.3291015625, "learning_rate": 3.7599544937428897e-07, "loss": 0.0003, "reward": 1.7486978769302368, "reward_std": 0.056987786665558815, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7486979365348816, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 116.90885925292969, "epoch": 7.515358361774744, "grad_norm": 0.5521046324290764, "kl": 0.326171875, "learning_rate": 3.7571103526734924e-07, "loss": 0.0003, "reward": 1.8346354365348816, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8346354067325592, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 117.64323043823242, "epoch": 7.51877133105802, "grad_norm": 1.4414188522560853, "kl": 0.3310546875, "learning_rate": 3.754266211604095e-07, "loss": 0.0003, "reward": 1.7513021230697632, "reward_std": 0.03160832170397043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7513020634651184, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 120.87760543823242, "epoch": 7.522184300341297, "grad_norm": 1.052751662807273, "kl": 0.326171875, "learning_rate": 3.751422070534698e-07, "loss": 0.0003, "reward": 1.7884114980697632, "reward_std": 0.029899622313678265, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7884114384651184, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 117.09375381469727, "epoch": 7.525597269624574, "grad_norm": 0.6990985804881525, "kl": 0.33984375, "learning_rate": 3.7485779294653014e-07, "loss": 0.0003, "reward": 1.767578125, "reward_std": 0.022401202004402876, "rewards/format_reward": 1.0, "rewards/score_reward": 0.767578125, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 116.24219131469727, "epoch": 7.5290102389078495, "grad_norm": 0.7315917995842088, "kl": 0.337890625, "learning_rate": 3.7457337883959046e-07, "loss": 0.0003, "reward": 1.7526041865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7526041567325592, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 114.359375, "epoch": 7.532423208191126, "grad_norm": 1.0469270632661554, "kl": 0.3388671875, "learning_rate": 3.7428896473265073e-07, "loss": 0.0003, "reward": 1.7942708134651184, "reward_std": 0.021831634920090437, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7942708432674408, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 115.77604675292969, "epoch": 7.535836177474403, "grad_norm": 0.7255666005213893, "kl": 0.3310546875, "learning_rate": 3.74004550625711e-07, "loss": 0.0003, "reward": 1.7610676884651184, "reward_std": 0.03358246898278594, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7610677182674408, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 115.94010543823242, "epoch": 7.53924914675768, "grad_norm": 0.9511677873991945, "kl": 0.33203125, "learning_rate": 3.737201365187713e-07, "loss": 0.0003, "reward": 1.7102864384651184, "reward_std": 0.015168231446295977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7102864682674408, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 114.39844131469727, "epoch": 7.5426621160409555, "grad_norm": 1.219812203442762, "kl": 0.3271484375, "learning_rate": 3.7343572241183163e-07, "loss": 0.0003, "reward": 1.740234375, "reward_std": 0.016306545585393906, "rewards/format_reward": 1.0, "rewards/score_reward": 0.740234375, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 115.05989837646484, "epoch": 7.546075085324232, "grad_norm": 2.064464290603534, "kl": 0.34765625, "learning_rate": 3.731513083048919e-07, "loss": 0.0003, "reward": 1.783203125, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 115.05208587646484, "epoch": 7.549488054607508, "grad_norm": 0.15091655252432873, "kl": 0.3349609375, "learning_rate": 3.728668941979522e-07, "loss": 0.0003, "reward": 1.7916666865348816, "reward_std": 0.014731390401721, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7942708134651184, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 114.70833587646484, "epoch": 7.552901023890785, "grad_norm": 0.3863945019719947, "kl": 0.3427734375, "learning_rate": 3.725824800910125e-07, "loss": 0.0003, "reward": 1.7682291865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 116.4296875, "epoch": 7.5563139931740615, "grad_norm": 1.5322582752761935, "kl": 0.333984375, "learning_rate": 3.722980659840728e-07, "loss": 0.0003, "reward": 1.6608072519302368, "reward_std": 0.04677655175328255, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.6634114682674408, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 114.80469131469727, "epoch": 7.559726962457338, "grad_norm": 5.42607730094615, "kl": 0.3154296875, "learning_rate": 3.7201365187713307e-07, "loss": 0.0003, "reward": 1.794921875, "reward_std": 0.03574591176584363, "rewards/format_reward": 1.0, "rewards/score_reward": 0.794921875, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 116.2578125, "epoch": 7.563139931740614, "grad_norm": 0.8553988710394166, "kl": 0.3369140625, "learning_rate": 3.717292377701934e-07, "loss": 0.0003, "reward": 1.8138020634651184, "reward_std": 0.020976672880351543, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8138020932674408, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 113.59635925292969, "epoch": 7.566552901023891, "grad_norm": 1.1549278744057363, "kl": 0.3408203125, "learning_rate": 3.714448236632537e-07, "loss": 0.0003, "reward": 1.681640625, "reward_std": 0.019135249312967062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.681640625, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 115.99219131469727, "epoch": 7.5699658703071675, "grad_norm": 1.2816474251186647, "kl": 0.32421875, "learning_rate": 3.71160409556314e-07, "loss": 0.0003, "reward": 1.8294270634651184, "reward_std": 0.014598665293306112, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8294270932674408, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 115.16666793823242, "epoch": 7.573378839590443, "grad_norm": 0.9350700098858303, "kl": 0.333984375, "learning_rate": 3.7087599544937425e-07, "loss": 0.0003, "reward": 1.76171875, "reward_std": 0.024525326676666737, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 113.29166793823242, "epoch": 7.57679180887372, "grad_norm": 2.285375803102493, "kl": 0.35546875, "learning_rate": 3.7059158134243457e-07, "loss": 0.0004, "reward": 1.7115885019302368, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7115885615348816, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 114.19531631469727, "epoch": 7.580204778156997, "grad_norm": 0.4480416216684392, "kl": 0.3349609375, "learning_rate": 3.703071672354949e-07, "loss": 0.0003, "reward": 1.853515625, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.853515625, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 116.45833587646484, "epoch": 7.5836177474402735, "grad_norm": 0.6966709356024811, "kl": 0.3212890625, "learning_rate": 3.700227531285552e-07, "loss": 0.0003, "reward": 1.8307291865348816, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8307291567325592, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 114.87760543823242, "epoch": 7.587030716723549, "grad_norm": 1.1200875759765287, "kl": 0.322265625, "learning_rate": 3.697383390216154e-07, "loss": 0.0003, "reward": 1.8515625, "reward_std": 0.023083304055035114, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8515625, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 115.55729293823242, "epoch": 7.590443686006826, "grad_norm": 0.7707022078106606, "kl": 0.330078125, "learning_rate": 3.6945392491467574e-07, "loss": 0.0003, "reward": 1.82421875, "reward_std": 0.025647207628935575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.82421875, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 115.01302337646484, "epoch": 7.593856655290102, "grad_norm": 3.056951327748772, "kl": 0.3154296875, "learning_rate": 3.6916951080773606e-07, "loss": 0.0003, "reward": 1.8600260615348816, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8600260317325592, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 110.96875381469727, "epoch": 7.597269624573379, "grad_norm": 0.4677804315806867, "kl": 0.3466796875, "learning_rate": 3.688850967007963e-07, "loss": 0.0003, "reward": 1.7239583134651184, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 112.88802337646484, "epoch": 7.600682593856655, "grad_norm": 3.237730247657306, "kl": 0.3544921875, "learning_rate": 3.6860068259385664e-07, "loss": 0.0004, "reward": 1.8020833134651184, "reward_std": 0.05847971513867378, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833432674408, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 114.75260543823242, "epoch": 7.604095563139932, "grad_norm": 0.337495495744237, "kl": 0.3359375, "learning_rate": 3.6831626848691696e-07, "loss": 0.0003, "reward": 1.744140625, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 115.77604675292969, "epoch": 7.607508532423208, "grad_norm": 0.8760819693646575, "kl": 0.333984375, "learning_rate": 3.6803185437997723e-07, "loss": 0.0003, "reward": 1.7330728769302368, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7330729365348816, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 115.06250381469727, "epoch": 7.610921501706485, "grad_norm": 0.008554068298396878, "kl": 0.326171875, "learning_rate": 3.677474402730375e-07, "loss": 0.0003, "reward": 1.8697916865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8697916567325592, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 111.97135543823242, "epoch": 7.614334470989761, "grad_norm": 0.4230921990254634, "kl": 0.345703125, "learning_rate": 3.674630261660978e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.016173413023352623, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 114.30469131469727, "epoch": 7.617747440273037, "grad_norm": 0.8713984772459483, "kl": 0.3251953125, "learning_rate": 3.6717861205915813e-07, "loss": 0.0003, "reward": 1.7265625, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7265625, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 115.44531631469727, "epoch": 7.621160409556314, "grad_norm": 0.43038658995333773, "kl": 0.3388671875, "learning_rate": 3.6689419795221845e-07, "loss": 0.0003, "reward": 1.8235677480697632, "reward_std": 0.008086706511676311, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8235676884651184, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 110.33854293823242, "epoch": 7.624573378839591, "grad_norm": 0.006471809069122733, "kl": 0.3349609375, "learning_rate": 3.6660978384527867e-07, "loss": 0.0003, "reward": 1.8020833134651184, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8020833432674408, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 114.14583587646484, "epoch": 7.627986348122867, "grad_norm": 0.9783246757200404, "kl": 0.333984375, "learning_rate": 3.66325369738339e-07, "loss": 0.0003, "reward": 1.7311198115348816, "reward_std": 0.0376368286088109, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7311197817325592, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 115.05208587646484, "epoch": 7.631399317406143, "grad_norm": 1.3934632694281408, "kl": 0.3427734375, "learning_rate": 3.660409556313993e-07, "loss": 0.0003, "reward": 1.685546875, "reward_std": 0.02012293692678213, "rewards/format_reward": 1.0, "rewards/score_reward": 0.685546875, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 114.77604293823242, "epoch": 7.63481228668942, "grad_norm": 0.5654830415040226, "kl": 0.333984375, "learning_rate": 3.6575654152445963e-07, "loss": 0.0003, "reward": 1.744140625, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.744140625, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 110.90885543823242, "epoch": 7.638225255972696, "grad_norm": 1.1303522105327737, "kl": 0.3515625, "learning_rate": 3.654721274175199e-07, "loss": 0.0004, "reward": 1.7701822519302368, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7701823115348816, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 113.05469131469727, "epoch": 7.6416382252559725, "grad_norm": 0.8029341647846325, "kl": 0.33203125, "learning_rate": 3.6518771331058016e-07, "loss": 0.0003, "reward": 1.8138020634651184, "reward_std": 0.025647208094596863, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8138020932674408, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 111.82031631469727, "epoch": 7.645051194539249, "grad_norm": 1.0356652600314042, "kl": 0.3359375, "learning_rate": 3.649032992036405e-07, "loss": 0.0003, "reward": 1.8619791865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8619791567325592, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 114.8671875, "epoch": 7.648464163822526, "grad_norm": 1.1212667783756245, "kl": 0.3291015625, "learning_rate": 3.646188850967008e-07, "loss": 0.0003, "reward": 1.7669270634651184, "reward_std": 0.024242624640464783, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270932674408, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 115.68750381469727, "epoch": 7.651877133105802, "grad_norm": 1.326266435317065, "kl": 0.326171875, "learning_rate": 3.6433447098976107e-07, "loss": 0.0003, "reward": 1.7838541269302368, "reward_std": 0.04235103353857994, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7838541865348816, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 116.02864837646484, "epoch": 7.6552901023890785, "grad_norm": 0.5258952977421194, "kl": 0.341796875, "learning_rate": 3.640500568828214e-07, "loss": 0.0003, "reward": 1.7864583134651184, "reward_std": 0.014465940184891224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 113.59114837646484, "epoch": 7.658703071672355, "grad_norm": 1.7845530510188536, "kl": 0.34375, "learning_rate": 3.6376564277588165e-07, "loss": 0.0003, "reward": 1.8274739384651184, "reward_std": 0.058918023481965065, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8274739682674408, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 114.08854293823242, "epoch": 7.662116040955631, "grad_norm": 0.9156012885229607, "kl": 0.322265625, "learning_rate": 3.6348122866894197e-07, "loss": 0.0003, "reward": 1.7708333134651184, "reward_std": 0.03669427987188101, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7708333432674408, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 113.515625, "epoch": 7.665529010238908, "grad_norm": 2.8773466582903495, "kl": 0.3251953125, "learning_rate": 3.6319681456200224e-07, "loss": 0.0003, "reward": 1.794921875, "reward_std": 0.04054917022585869, "rewards/format_reward": 1.0, "rewards/score_reward": 0.794921875, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 111.80469131469727, "epoch": 7.6689419795221845, "grad_norm": 1.5134664571253307, "kl": 0.34375, "learning_rate": 3.6291240045506256e-07, "loss": 0.0003, "reward": 1.8346354365348816, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8346354067325592, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 112.95833587646484, "epoch": 7.672354948805461, "grad_norm": 0.6577573814442512, "kl": 0.318359375, "learning_rate": 3.626279863481229e-07, "loss": 0.0003, "reward": 1.8046875, "reward_std": 0.021830817684531212, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 114.44531631469727, "epoch": 7.675767918088737, "grad_norm": 0.009435730765119747, "kl": 0.3251953125, "learning_rate": 3.623435722411832e-07, "loss": 0.0003, "reward": 1.8854166865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8854166567325592, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 117.31771087646484, "epoch": 7.679180887372014, "grad_norm": 1.2828340835918683, "kl": 0.3369140625, "learning_rate": 3.620591581342434e-07, "loss": 0.0003, "reward": 1.705078125, "reward_std": 0.03358100075274706, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 113.77344131469727, "epoch": 7.6825938566552905, "grad_norm": 1.0997554787075274, "kl": 0.3330078125, "learning_rate": 3.6177474402730373e-07, "loss": 0.0003, "reward": 1.71484375, "reward_std": 0.023254938423633575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.71484375, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 119.04166793823242, "epoch": 7.686006825938566, "grad_norm": 1.4771027910165466, "kl": 0.3447265625, "learning_rate": 3.6149032992036405e-07, "loss": 0.0003, "reward": 1.6966145634651184, "reward_std": 0.029064604081213474, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6966145932674408, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 114.77083587646484, "epoch": 7.689419795221843, "grad_norm": 0.3130978262892327, "kl": 0.3330078125, "learning_rate": 3.6120591581342437e-07, "loss": 0.0003, "reward": 1.7864583134651184, "reward_std": 0.009643959812819958, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7864583432674408, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 113.90364837646484, "epoch": 7.69283276450512, "grad_norm": 0.8577997989602719, "kl": 0.34765625, "learning_rate": 3.6092150170648464e-07, "loss": 0.0003, "reward": 1.7701822519302368, "reward_std": 0.027442430146038532, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7701823115348816, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 116.0859375, "epoch": 7.696245733788396, "grad_norm": 1.3276346222881006, "kl": 0.3115234375, "learning_rate": 3.606370875995449e-07, "loss": 0.0003, "reward": 1.69921875, "reward_std": 0.04239059519022703, "rewards/format_reward": 1.0, "rewards/score_reward": 0.69921875, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 114.40364837646484, "epoch": 7.699658703071672, "grad_norm": 4.361714312585499, "kl": 0.3291015625, "learning_rate": 3.603526734926052e-07, "loss": 0.0003, "reward": 1.759765625, "reward_std": 0.052803998813033104, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 117.42448043823242, "epoch": 7.703071672354949, "grad_norm": 0.6531019023052336, "kl": 0.3427734375, "learning_rate": 3.6006825938566554e-07, "loss": 0.0003, "reward": 1.8209635615348816, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8209635317325592, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 115.40104675292969, "epoch": 7.706484641638225, "grad_norm": 0.27930964752908477, "kl": 0.34765625, "learning_rate": 3.597838452787258e-07, "loss": 0.0003, "reward": 1.7727864980697632, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7727864384651184, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 111.5546875, "epoch": 7.709897610921502, "grad_norm": 0.5040050500357406, "kl": 0.333984375, "learning_rate": 3.5949943117178613e-07, "loss": 0.0003, "reward": 1.8046875, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 115.13542175292969, "epoch": 7.713310580204778, "grad_norm": 1.50586097472681, "kl": 0.3291015625, "learning_rate": 3.592150170648464e-07, "loss": 0.0003, "reward": 1.7688801884651184, "reward_std": 0.020122936461120844, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7688802182674408, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 112.48177337646484, "epoch": 7.716723549488055, "grad_norm": 1.6595263022106164, "kl": 0.330078125, "learning_rate": 3.5893060295790666e-07, "loss": 0.0003, "reward": 1.7395833134651184, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7395833432674408, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 113.64323043823242, "epoch": 7.720136518771331, "grad_norm": 0.16145963321136844, "kl": 0.3369140625, "learning_rate": 3.58646188850967e-07, "loss": 0.0003, "reward": 1.857421875, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.857421875, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 116.42448043823242, "epoch": 7.723549488054608, "grad_norm": 0.9826449953713379, "kl": 0.33203125, "learning_rate": 3.583617747440273e-07, "loss": 0.0003, "reward": 1.751953125, "reward_std": 0.04208729974925518, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7545572817325592, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 111.88021087646484, "epoch": 7.726962457337884, "grad_norm": 0.6281739308345112, "kl": 0.33203125, "learning_rate": 3.580773606370876e-07, "loss": 0.0003, "reward": 1.80078125, "reward_std": 0.01940045692026615, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80078125, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 114.82031631469727, "epoch": 7.73037542662116, "grad_norm": 0.6663450433707563, "kl": 0.33203125, "learning_rate": 3.5779294653014783e-07, "loss": 0.0003, "reward": 1.6861979365348816, "reward_std": 0.014598664827644825, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6861979067325592, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 115.47396087646484, "epoch": 7.733788395904437, "grad_norm": 0.5095416159602015, "kl": 0.3291015625, "learning_rate": 3.5750853242320815e-07, "loss": 0.0003, "reward": 1.7135416865348816, "reward_std": 0.020692503079771996, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 116.03385925292969, "epoch": 7.737201365187714, "grad_norm": 5.906362205475309, "kl": 0.31640625, "learning_rate": 3.5722411831626847e-07, "loss": 0.0003, "reward": 1.8196614384651184, "reward_std": 0.035973270423710346, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.822265625, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 112.60937881469727, "epoch": 7.7406143344709895, "grad_norm": 0.47935615631112977, "kl": 0.3349609375, "learning_rate": 3.569397042093288e-07, "loss": 0.0003, "reward": 1.8209635615348816, "reward_std": 0.026101949624717236, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8209635317325592, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 115.01302337646484, "epoch": 7.744027303754266, "grad_norm": 0.8179018524360178, "kl": 0.330078125, "learning_rate": 3.5665529010238906e-07, "loss": 0.0003, "reward": 1.806640625, "reward_std": 0.026234674267470837, "rewards/format_reward": 1.0, "rewards/score_reward": 0.806640625, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 111.41927337646484, "epoch": 7.747440273037543, "grad_norm": 0.6925202222615015, "kl": 0.3330078125, "learning_rate": 3.563708759954493e-07, "loss": 0.0003, "reward": 1.7662760615348816, "reward_std": 0.016306546051055193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7662760317325592, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 113.40104675292969, "epoch": 7.750853242320819, "grad_norm": 0.6979775125375691, "kl": 0.3310546875, "learning_rate": 3.5608646188850964e-07, "loss": 0.0003, "reward": 1.8326823115348816, "reward_std": 0.015319675207138062, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8326822817325592, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 112.7421875, "epoch": 7.7542662116040955, "grad_norm": 1.3275550554176319, "kl": 0.328125, "learning_rate": 3.5580204778156996e-07, "loss": 0.0003, "reward": 1.763671875, "reward_std": 0.02353951521217823, "rewards/format_reward": 1.0, "rewards/score_reward": 0.763671875, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 112.57552337646484, "epoch": 7.757679180887372, "grad_norm": 1.3568066778000116, "kl": 0.337890625, "learning_rate": 3.5551763367463023e-07, "loss": 0.0003, "reward": 1.771484375, "reward_std": 0.026500943582504988, "rewards/format_reward": 1.0, "rewards/score_reward": 0.771484375, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 112.80208587646484, "epoch": 7.761092150170649, "grad_norm": 0.5668177811577503, "kl": 0.3291015625, "learning_rate": 3.5523321956769055e-07, "loss": 0.0003, "reward": 1.8092448115348816, "reward_std": 0.020255662500858307, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8092447817325592, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 112.61979293823242, "epoch": 7.764505119453925, "grad_norm": 4.344050669108584, "kl": 0.3427734375, "learning_rate": 3.5494880546075087e-07, "loss": 0.0003, "reward": 1.8046875, "reward_std": 0.01361097814515233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8046875, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 110.80469131469727, "epoch": 7.7679180887372015, "grad_norm": 1.1016273886872776, "kl": 0.3408203125, "learning_rate": 3.5466439135381114e-07, "loss": 0.0003, "reward": 1.8033854365348816, "reward_std": 0.029064605478197336, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8033854067325592, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 112.796875, "epoch": 7.771331058020478, "grad_norm": 0.7229005373754048, "kl": 0.337890625, "learning_rate": 3.543799772468714e-07, "loss": 0.0003, "reward": 1.8782551884651184, "reward_std": 0.02367158979177475, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8782552182674408, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 113.12500381469727, "epoch": 7.774744027303754, "grad_norm": 0.7849908388406013, "kl": 0.330078125, "learning_rate": 3.540955631399317e-07, "loss": 0.0003, "reward": 1.7135416865348816, "reward_std": 0.030270070768892765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7135416567325592, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 112.42969131469727, "epoch": 7.778156996587031, "grad_norm": 0.6678665996190829, "kl": 0.34375, "learning_rate": 3.5381114903299204e-07, "loss": 0.0003, "reward": 1.712890625, "reward_std": 0.018545251106843352, "rewards/format_reward": 1.0, "rewards/score_reward": 0.712890625, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 110.34635925292969, "epoch": 7.7815699658703075, "grad_norm": 0.7635785310659657, "kl": 0.345703125, "learning_rate": 3.5352673492605236e-07, "loss": 0.0003, "reward": 1.7506510615348816, "reward_std": 0.026101949159055948, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510317325592, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 111.96094131469727, "epoch": 7.784982935153583, "grad_norm": 1.4175768503522796, "kl": 0.3388671875, "learning_rate": 3.532423208191126e-07, "loss": 0.0003, "reward": 1.7955728769302368, "reward_std": 0.03175894636660814, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7955729365348816, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 113.46354675292969, "epoch": 7.78839590443686, "grad_norm": 0.9951993552522961, "kl": 0.3447265625, "learning_rate": 3.529579067121729e-07, "loss": 0.0003, "reward": 1.67578125, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.67578125, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 111.8828125, "epoch": 7.791808873720137, "grad_norm": 0.6061793484146325, "kl": 0.3330078125, "learning_rate": 3.526734926052332e-07, "loss": 0.0003, "reward": 1.8463541865348816, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8463541567325592, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 112.70312881469727, "epoch": 7.795221843003413, "grad_norm": 0.7574680347247481, "kl": 0.330078125, "learning_rate": 3.5238907849829353e-07, "loss": 0.0003, "reward": 1.736328125, "reward_std": 0.028512941673398018, "rewards/format_reward": 1.0, "rewards/score_reward": 0.736328125, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 115.8359375, "epoch": 7.798634812286689, "grad_norm": 0.8865338237883009, "kl": 0.3310546875, "learning_rate": 3.521046643913538e-07, "loss": 0.0003, "reward": 1.798828125, "reward_std": 0.025096362456679344, "rewards/format_reward": 1.0, "rewards/score_reward": 0.798828125, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 114.17969131469727, "epoch": 7.802047781569966, "grad_norm": 1.1761729274142942, "kl": 0.333984375, "learning_rate": 3.5182025028441407e-07, "loss": 0.0003, "reward": 1.7506510615348816, "reward_std": 0.045238424092531204, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7506510317325592, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 114.4296875, "epoch": 7.805460750853243, "grad_norm": 0.913980906372831, "kl": 0.34375, "learning_rate": 3.515358361774744e-07, "loss": 0.0003, "reward": 1.7962239384651184, "reward_std": 0.012889966368675232, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7962239682674408, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 116.16667175292969, "epoch": 7.808873720136519, "grad_norm": 1.79922865817761, "kl": 0.341796875, "learning_rate": 3.512514220705347e-07, "loss": 0.0003, "reward": 1.80859375, "reward_std": 0.037681969814002514, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80859375, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 113.59896087646484, "epoch": 7.812286689419795, "grad_norm": 1.0836096154074597, "kl": 0.3369140625, "learning_rate": 3.5096700796359497e-07, "loss": 0.0003, "reward": 1.783203125, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.783203125, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 114.54167175292969, "epoch": 7.815699658703072, "grad_norm": 0.7192434398094092, "kl": 0.3388671875, "learning_rate": 3.506825938566553e-07, "loss": 0.0003, "reward": 1.7200520634651184, "reward_std": 0.03502489905804396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7200520932674408, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 115.37500381469727, "epoch": 7.819112627986348, "grad_norm": 0.269209526997317, "kl": 0.328125, "learning_rate": 3.5039817974971556e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.011135884560644627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84375, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 114.70312881469727, "epoch": 7.822525597269625, "grad_norm": 0.21928444819365542, "kl": 0.330078125, "learning_rate": 3.501137656427759e-07, "loss": 0.0003, "reward": 1.8098958134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958432674408, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 115.0078125, "epoch": 7.825938566552901, "grad_norm": 0.2767319302352157, "kl": 0.33984375, "learning_rate": 3.4982935153583614e-07, "loss": 0.0003, "reward": 1.7526041865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7526041567325592, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 114.33333587646484, "epoch": 7.829351535836177, "grad_norm": 0.7533662629434487, "kl": 0.3369140625, "learning_rate": 3.4954493742889646e-07, "loss": 0.0003, "reward": 1.7916666865348816, "reward_std": 0.015717608854174614, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7942708134651184, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 115.4296875, "epoch": 7.832764505119454, "grad_norm": 1.8727216827178808, "kl": 0.3291015625, "learning_rate": 3.492605233219568e-07, "loss": 0.0003, "reward": 1.705078125, "reward_std": 0.023539516609162092, "rewards/format_reward": 1.0, "rewards/score_reward": 0.705078125, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 114.55729675292969, "epoch": 7.836177474402731, "grad_norm": 0.3705699853311024, "kl": 0.34375, "learning_rate": 3.48976109215017e-07, "loss": 0.0003, "reward": 1.8352864384651184, "reward_std": 0.020255661569535732, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.837890625, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 112.73698043823242, "epoch": 7.839590443686006, "grad_norm": 2.8639625535594058, "kl": 0.32421875, "learning_rate": 3.486916951080773e-07, "loss": 0.0003, "reward": 1.8333333730697632, "reward_std": 0.014731390401721, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8333333134651184, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 114.1796875, "epoch": 7.843003412969283, "grad_norm": 1.126244300209996, "kl": 0.3359375, "learning_rate": 3.4840728100113764e-07, "loss": 0.0003, "reward": 1.8248698115348816, "reward_std": 0.021109154913574457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8248697817325592, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 115.58073043823242, "epoch": 7.84641638225256, "grad_norm": 1.0977438385198057, "kl": 0.318359375, "learning_rate": 3.4812286689419796e-07, "loss": 0.0003, "reward": 1.7877603769302368, "reward_std": 0.021964360494166613, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7903645932674408, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 115.64323425292969, "epoch": 7.849829351535837, "grad_norm": 0.8072271898518689, "kl": 0.3310546875, "learning_rate": 3.478384527872582e-07, "loss": 0.0003, "reward": 1.7669271230697632, "reward_std": 0.020692503545433283, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270634651184, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 114.25000381469727, "epoch": 7.853242320819112, "grad_norm": 0.729300872317399, "kl": 0.3369140625, "learning_rate": 3.4755403868031854e-07, "loss": 0.0003, "reward": 1.8098958730697632, "reward_std": 0.021830817684531212, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8098958134651184, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 116.41666793823242, "epoch": 7.856655290102389, "grad_norm": 0.40547003850457286, "kl": 0.3369140625, "learning_rate": 3.472696245733788e-07, "loss": 0.0003, "reward": 1.830078125, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.830078125, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 114.10937881469727, "epoch": 7.860068259385666, "grad_norm": 1.2235336454090402, "kl": 0.3271484375, "learning_rate": 3.4698521046643913e-07, "loss": 0.0003, "reward": 1.6920572519302368, "reward_std": 0.01288996683433652, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6920573115348816, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 111.86979293823242, "epoch": 7.863481228668942, "grad_norm": 3.777838732818416, "kl": 0.3974609375, "learning_rate": 3.467007963594994e-07, "loss": 0.0004, "reward": 1.7903645634651184, "reward_std": 0.04289483092725277, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7903645932674408, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 118.12760925292969, "epoch": 7.8668941979522184, "grad_norm": 1.5580277339403832, "kl": 0.3349609375, "learning_rate": 3.464163822525597e-07, "loss": 0.0003, "reward": 1.759765625, "reward_std": 0.03189142793416977, "rewards/format_reward": 1.0, "rewards/score_reward": 0.759765625, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 115.48177337646484, "epoch": 7.870307167235495, "grad_norm": 0.4053557825965141, "kl": 0.3212890625, "learning_rate": 3.4613196814562003e-07, "loss": 0.0003, "reward": 1.7591145634651184, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7591145932674408, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 116.64062881469727, "epoch": 7.873720136518771, "grad_norm": 0.6636902497607949, "kl": 0.330078125, "learning_rate": 3.458475540386803e-07, "loss": 0.0003, "reward": 1.76171875, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 115.859375, "epoch": 7.877133105802048, "grad_norm": 1.1066585724552853, "kl": 0.32421875, "learning_rate": 3.4556313993174057e-07, "loss": 0.0003, "reward": 1.8600260019302368, "reward_std": 0.0300510679371655, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8600260615348816, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 115.6875, "epoch": 7.8805460750853245, "grad_norm": 0.9238165428417635, "kl": 0.31640625, "learning_rate": 3.452787258248009e-07, "loss": 0.0003, "reward": 1.8268229365348816, "reward_std": 0.01687692990526557, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8268229067325592, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 115.43750381469727, "epoch": 7.8839590443686, "grad_norm": 2.5784316093118176, "kl": 0.328125, "learning_rate": 3.449943117178612e-07, "loss": 0.0003, "reward": 1.7604166865348816, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7604166567325592, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 115.09896087646484, "epoch": 7.887372013651877, "grad_norm": 0.6385949529954131, "kl": 0.328125, "learning_rate": 3.447098976109215e-07, "loss": 0.0003, "reward": 1.890625, "reward_std": 0.022184425964951515, "rewards/format_reward": 1.0, "rewards/score_reward": 0.890625, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 116.05989837646484, "epoch": 7.890784982935154, "grad_norm": 0.4948767206624058, "kl": 0.330078125, "learning_rate": 3.4442548350398174e-07, "loss": 0.0003, "reward": 1.798828125, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.798828125, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 115.703125, "epoch": 7.8941979522184305, "grad_norm": 1.7740611576702259, "kl": 0.341796875, "learning_rate": 3.4414106939704206e-07, "loss": 0.0003, "reward": 1.8255208730697632, "reward_std": 0.052288953214883804, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8255208134651184, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 116.40104675292969, "epoch": 7.897610921501706, "grad_norm": 0.6116461343388541, "kl": 0.33984375, "learning_rate": 3.438566552901024e-07, "loss": 0.0003, "reward": 1.7845051884651184, "reward_std": 0.030004866421222687, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 115.60677337646484, "epoch": 7.901023890784983, "grad_norm": 0.9406895240232404, "kl": 0.3271484375, "learning_rate": 3.435722411831627e-07, "loss": 0.0003, "reward": 1.751953125, "reward_std": 0.02748716250061989, "rewards/format_reward": 1.0, "rewards/score_reward": 0.751953125, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 116.81510925292969, "epoch": 7.90443686006826, "grad_norm": 0.993892215673615, "kl": 0.3232421875, "learning_rate": 3.4328782707622296e-07, "loss": 0.0003, "reward": 1.796875, "reward_std": 0.036783091723918915, "rewards/format_reward": 1.0, "rewards/score_reward": 0.796875, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 114.70573425292969, "epoch": 7.907849829351536, "grad_norm": 0.5884895572302727, "kl": 0.3203125, "learning_rate": 3.4300341296928323e-07, "loss": 0.0003, "reward": 1.8229166269302368, "reward_std": 0.02536156866699457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8229166865348816, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 116.35677337646484, "epoch": 7.911262798634812, "grad_norm": 0.33344857806037004, "kl": 0.3232421875, "learning_rate": 3.4271899886234355e-07, "loss": 0.0003, "reward": 1.720703125, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.720703125, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 115.95052337646484, "epoch": 7.914675767918089, "grad_norm": 1.762900438587733, "kl": 0.3251953125, "learning_rate": 3.4243458475540387e-07, "loss": 0.0003, "reward": 1.82421875, "reward_std": 0.05174875073134899, "rewards/format_reward": 1.0, "rewards/score_reward": 0.82421875, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 113.78125, "epoch": 7.918088737201365, "grad_norm": 0.596574628410694, "kl": 0.3193359375, "learning_rate": 3.4215017064846414e-07, "loss": 0.0003, "reward": 1.7825520634651184, "reward_std": 0.01361097814515233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7825520932674408, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 115.52604293823242, "epoch": 7.921501706484642, "grad_norm": 0.9477351807178191, "kl": 0.330078125, "learning_rate": 3.4186575654152446e-07, "loss": 0.0003, "reward": 1.7376301884651184, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7376302182674408, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 116.91146087646484, "epoch": 7.924914675767918, "grad_norm": 1.4091975980749443, "kl": 0.3349609375, "learning_rate": 3.415813424345848e-07, "loss": 0.0003, "reward": 1.791015625, "reward_std": 0.027355907019227743, "rewards/format_reward": 1.0, "rewards/score_reward": 0.791015625, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 116.63802337646484, "epoch": 7.928327645051194, "grad_norm": 1.0686730280158954, "kl": 0.3291015625, "learning_rate": 3.4129692832764504e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.014465940184891224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 116.55729293823242, "epoch": 7.931740614334471, "grad_norm": 0.7192941462042323, "kl": 0.3623046875, "learning_rate": 3.410125142207053e-07, "loss": 0.0004, "reward": 1.7845051884651184, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7845052182674408, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 118.74739837646484, "epoch": 7.935153583617748, "grad_norm": 0.7709198592291769, "kl": 0.314453125, "learning_rate": 3.4072810011376563e-07, "loss": 0.0003, "reward": 1.779296875, "reward_std": 0.015452401712536812, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 115.36719131469727, "epoch": 7.938566552901024, "grad_norm": 0.63100486294561, "kl": 0.330078125, "learning_rate": 3.4044368600682595e-07, "loss": 0.0003, "reward": 1.7734375, "reward_std": 0.02465951954945922, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7734375, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 118.10677337646484, "epoch": 7.9419795221843, "grad_norm": 3.3770959707912667, "kl": 0.3203125, "learning_rate": 3.4015927189988627e-07, "loss": 0.0003, "reward": 1.7467447519302368, "reward_std": 0.029899622313678265, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7467448115348816, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 118.71875, "epoch": 7.945392491467577, "grad_norm": 1.1918896441759683, "kl": 0.3134765625, "learning_rate": 3.398748577929465e-07, "loss": 0.0003, "reward": 1.76953125, "reward_std": 0.025228843092918396, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76953125, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 119.83333587646484, "epoch": 7.948805460750854, "grad_norm": 0.6166092065637456, "kl": 0.330078125, "learning_rate": 3.395904436860068e-07, "loss": 0.0003, "reward": 1.8294271230697632, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8294270634651184, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 117.01302337646484, "epoch": 7.952218430034129, "grad_norm": 1.0104802476702146, "kl": 0.3203125, "learning_rate": 3.393060295790671e-07, "loss": 0.0003, "reward": 1.7291666269302368, "reward_std": 0.028778147883713245, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666865348816, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 121.125, "epoch": 7.955631399317406, "grad_norm": 0.8838786508802677, "kl": 0.3251953125, "learning_rate": 3.390216154721274e-07, "loss": 0.0003, "reward": 1.7239583134651184, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7239583432674408, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 117.66146087646484, "epoch": 7.959044368600683, "grad_norm": 1.7567050337871344, "kl": 0.337890625, "learning_rate": 3.387372013651877e-07, "loss": 0.0003, "reward": 1.8307291865348816, "reward_std": 0.014465940184891224, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8307291567325592, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 119.55989837646484, "epoch": 7.962457337883959, "grad_norm": 0.9719817452743201, "kl": 0.30859375, "learning_rate": 3.3845278725824797e-07, "loss": 0.0003, "reward": 1.779296875, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 118.30729293823242, "epoch": 7.965870307167235, "grad_norm": 0.6997906851069027, "kl": 0.32421875, "learning_rate": 3.381683731513083e-07, "loss": 0.0003, "reward": 1.7903646230697632, "reward_std": 0.028076100163161755, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.79296875, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 119.25000381469727, "epoch": 7.969283276450512, "grad_norm": 1.8291850566307113, "kl": 0.34375, "learning_rate": 3.3788395904436856e-07, "loss": 0.0003, "reward": 1.7434896230697632, "reward_std": 0.030487905722111464, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7434895634651184, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 118.39844131469727, "epoch": 7.972696245733788, "grad_norm": 0.48801935212649883, "kl": 0.318359375, "learning_rate": 3.375995449374289e-07, "loss": 0.0003, "reward": 1.8352864980697632, "reward_std": 0.010870677651837468, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8352864384651184, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 120.01041793823242, "epoch": 7.976109215017065, "grad_norm": 0.008602021893822939, "kl": 0.3251953125, "learning_rate": 3.373151308304892e-07, "loss": 0.0003, "reward": 1.8489583730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8489583134651184, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 119.85937881469727, "epoch": 7.979522184300341, "grad_norm": 1.5945700835699042, "kl": 0.3271484375, "learning_rate": 3.370307167235495e-07, "loss": 0.0003, "reward": 1.7916666865348816, "reward_std": 0.019134188070893288, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 121.27344131469727, "epoch": 7.982935153583618, "grad_norm": 0.28733082362992024, "kl": 0.3046875, "learning_rate": 3.3674630261660973e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 122.67969131469727, "epoch": 7.986348122866894, "grad_norm": 0.5469768678750697, "kl": 0.322265625, "learning_rate": 3.3646188850967005e-07, "loss": 0.0003, "reward": 1.8619791865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8619791567325592, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 120.70052337646484, "epoch": 7.989761092150171, "grad_norm": 1.1802532873545912, "kl": 0.3203125, "learning_rate": 3.3617747440273037e-07, "loss": 0.0003, "reward": 1.837890625, "reward_std": 0.01773066632449627, "rewards/format_reward": 1.0, "rewards/score_reward": 0.837890625, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 121.64323425292969, "epoch": 7.993174061433447, "grad_norm": 0.4657209096217637, "kl": 0.3125, "learning_rate": 3.358930602957907e-07, "loss": 0.0003, "reward": 1.724609375, "reward_std": 0.03246205672621727, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7272135615348816, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 120.58333969116211, "epoch": 7.996587030716723, "grad_norm": 1.3989698336082452, "kl": 0.3134765625, "learning_rate": 3.3560864618885096e-07, "loss": 0.0003, "reward": 1.6666668057441711, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6666666865348816, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 120.81510925292969, "epoch": 8.003412969283277, "grad_norm": 0.8232480418985689, "kl": 0.3232421875, "learning_rate": 3.353242320819112e-07, "loss": 0.0003, "reward": 1.7532551884651184, "reward_std": 0.02748863212764263, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7532552182674408, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 120.3046875, "epoch": 8.006825938566553, "grad_norm": 0.4740348435453406, "kl": 0.3203125, "learning_rate": 3.3503981797497154e-07, "loss": 0.0003, "reward": 1.7701823115348816, "reward_std": 0.007935261353850365, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7701822817325592, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 119.17448043823242, "epoch": 8.01023890784983, "grad_norm": 0.5269877668815064, "kl": 0.3017578125, "learning_rate": 3.3475540386803186e-07, "loss": 0.0003, "reward": 1.8834635019302368, "reward_std": 0.02509636152535677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8834635615348816, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 121.453125, "epoch": 8.013651877133105, "grad_norm": 0.17945474611052317, "kl": 0.3115234375, "learning_rate": 3.3447098976109213e-07, "loss": 0.0003, "reward": 1.7942708730697632, "reward_std": 0.0073656952008605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.796875, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 121.04687881469727, "epoch": 8.017064846416382, "grad_norm": 0.4581995944879456, "kl": 0.3154296875, "learning_rate": 3.3418657565415245e-07, "loss": 0.0003, "reward": 1.8528646230697632, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8528645634651184, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 121.54167175292969, "epoch": 8.020477815699659, "grad_norm": 0.7677506855739138, "kl": 0.3173828125, "learning_rate": 3.339021615472127e-07, "loss": 0.0003, "reward": 1.7369791865348816, "reward_std": 0.018501579761505127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7369791567325592, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 119.33073425292969, "epoch": 8.023890784982935, "grad_norm": 3.464214237993704, "kl": 0.318359375, "learning_rate": 3.3361774744027303e-07, "loss": 0.0003, "reward": 1.8671875, "reward_std": 0.04140290804207325, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8671875, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 120.36198043823242, "epoch": 8.027303754266212, "grad_norm": 1.9332265166473817, "kl": 0.3251953125, "learning_rate": 3.333333333333333e-07, "loss": 0.0003, "reward": 1.7578125, "reward_std": 0.018501579761505127, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7578125, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 122.09114837646484, "epoch": 8.030716723549489, "grad_norm": 1.064909132623244, "kl": 0.3203125, "learning_rate": 3.330489192263936e-07, "loss": 0.0003, "reward": 1.7180989384651184, "reward_std": 0.02834212500602007, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7180989682674408, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 123.05469131469727, "epoch": 8.034129692832764, "grad_norm": 0.008742789833525918, "kl": 0.3212890625, "learning_rate": 3.3276450511945394e-07, "loss": 0.0003, "reward": 1.6979166865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6979166567325592, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 120.80469131469727, "epoch": 8.03754266211604, "grad_norm": 0.7992088236176867, "kl": 0.3349609375, "learning_rate": 3.324800910125142e-07, "loss": 0.0003, "reward": 1.7421875, "reward_std": 0.01828151335939765, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7421875, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 124.12500381469727, "epoch": 8.040955631399317, "grad_norm": 1.2165975588548343, "kl": 0.318359375, "learning_rate": 3.3219567690557447e-07, "loss": 0.0003, "reward": 1.8046875, "reward_std": 0.043073758482933044, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8072916567325592, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 121.81771087646484, "epoch": 8.044368600682594, "grad_norm": 0.7165554455535094, "kl": 0.3154296875, "learning_rate": 3.319112627986348e-07, "loss": 0.0003, "reward": 1.8463541269302368, "reward_std": 0.01361097814515233, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8463541865348816, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 120.13021087646484, "epoch": 8.04778156996587, "grad_norm": 0.7891727975608402, "kl": 0.3388671875, "learning_rate": 3.316268486916951e-07, "loss": 0.0003, "reward": 1.7805989384651184, "reward_std": 0.03825153410434723, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.783203125, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 121.13281631469727, "epoch": 8.051194539249147, "grad_norm": 1.586991371201763, "kl": 0.3232421875, "learning_rate": 3.3134243458475543e-07, "loss": 0.0003, "reward": 1.7272135019302368, "reward_std": 0.018868979066610336, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7272135615348816, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 122.65364837646484, "epoch": 8.054607508532424, "grad_norm": 1.5348551093309588, "kl": 0.310546875, "learning_rate": 3.3105802047781565e-07, "loss": 0.0003, "reward": 1.8580728769302368, "reward_std": 0.05352647975087166, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8580729365348816, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 122.38021087646484, "epoch": 8.058020477815699, "grad_norm": 0.5616681633868732, "kl": 0.3134765625, "learning_rate": 3.3077360637087596e-07, "loss": 0.0003, "reward": 1.7766926884651184, "reward_std": 0.016306546051055193, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7766927182674408, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 124.68229293823242, "epoch": 8.061433447098976, "grad_norm": 0.23211723645417587, "kl": 0.322265625, "learning_rate": 3.304891922639363e-07, "loss": 0.0003, "reward": 1.8307291865348816, "reward_std": 0.0073656952008605, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8307291567325592, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 117.69791793823242, "epoch": 8.064846416382252, "grad_norm": 0.8315828081864043, "kl": 0.3173828125, "learning_rate": 3.302047781569966e-07, "loss": 0.0003, "reward": 1.82421875, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.82421875, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 119.21875, "epoch": 8.06825938566553, "grad_norm": 1.5454406996462156, "kl": 0.3271484375, "learning_rate": 3.2992036405005687e-07, "loss": 0.0003, "reward": 1.7994791865348816, "reward_std": 0.02084394684061408, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7994791567325592, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 121.23958587646484, "epoch": 8.071672354948806, "grad_norm": 1.4658272388995186, "kl": 0.328125, "learning_rate": 3.296359499431172e-07, "loss": 0.0003, "reward": 1.7786458134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.78125, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 122.58333587646484, "epoch": 8.075085324232083, "grad_norm": 0.16481748753834313, "kl": 0.306640625, "learning_rate": 3.2935153583617746e-07, "loss": 0.0003, "reward": 1.8411458134651184, "reward_std": 0.0073656952008605, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.84375, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 123.48437881469727, "epoch": 8.078498293515358, "grad_norm": 1.6123685016482805, "kl": 0.306640625, "learning_rate": 3.290671217292377e-07, "loss": 0.0003, "reward": 1.8645833730697632, "reward_std": 0.024375349283218384, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8671875, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 122.40885543823242, "epoch": 8.081911262798634, "grad_norm": 0.5712331399188865, "kl": 0.3125, "learning_rate": 3.2878270762229804e-07, "loss": 0.0003, "reward": 1.7643229365348816, "reward_std": 0.013610977679491043, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7643229067325592, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 120.90364837646484, "epoch": 8.085324232081911, "grad_norm": 0.8204495795372315, "kl": 0.3134765625, "learning_rate": 3.2849829351535836e-07, "loss": 0.0003, "reward": 1.80859375, "reward_std": 0.023254938423633575, "rewards/format_reward": 1.0, "rewards/score_reward": 0.80859375, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 120.90885543823242, "epoch": 8.088737201365188, "grad_norm": 0.7345343345074865, "kl": 0.31640625, "learning_rate": 3.282138794084187e-07, "loss": 0.0003, "reward": 1.82421875, "reward_std": 0.04125227965414524, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8268229067325592, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 118.13021087646484, "epoch": 8.092150170648464, "grad_norm": 0.008764263398386398, "kl": 0.3212890625, "learning_rate": 3.279294653014789e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6875, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 122.22916793823242, "epoch": 8.095563139931741, "grad_norm": 0.6475524988335198, "kl": 0.3193359375, "learning_rate": 3.276450511945392e-07, "loss": 0.0003, "reward": 1.794921875, "reward_std": 0.01657281443476677, "rewards/format_reward": 1.0, "rewards/score_reward": 0.794921875, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 118.76823043823242, "epoch": 8.098976109215018, "grad_norm": 1.6311450271662755, "kl": 0.3125, "learning_rate": 3.2736063708759953e-07, "loss": 0.0003, "reward": 1.8404948115348816, "reward_std": 0.03480747388675809, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8404947817325592, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 119.69010543823242, "epoch": 8.102389078498293, "grad_norm": 0.3192135923377346, "kl": 0.314453125, "learning_rate": 3.2707622298065985e-07, "loss": 0.0003, "reward": 1.7565103769302368, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7565104365348816, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 117.51823043823242, "epoch": 8.10580204778157, "grad_norm": 0.5685600996234285, "kl": 0.328125, "learning_rate": 3.267918088737201e-07, "loss": 0.0003, "reward": 1.7669270634651184, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7669270932674408, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 118.26041793823242, "epoch": 8.109215017064846, "grad_norm": 0.5029906356665039, "kl": 0.3203125, "learning_rate": 3.265073947667804e-07, "loss": 0.0003, "reward": 1.8489583730697632, "reward_std": 0.025779934134334326, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8515625, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 121.72916793823242, "epoch": 8.112627986348123, "grad_norm": 0.8535736384201708, "kl": 0.3076171875, "learning_rate": 3.262229806598407e-07, "loss": 0.0003, "reward": 1.7721353769302368, "reward_std": 0.0377208786085248, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7721354365348816, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 120.29948043823242, "epoch": 8.1160409556314, "grad_norm": 0.6824781145631633, "kl": 0.3115234375, "learning_rate": 3.25938566552901e-07, "loss": 0.0003, "reward": 1.7682291865348816, "reward_std": 0.017009655013680458, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7682291567325592, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 119.45573425292969, "epoch": 8.119453924914676, "grad_norm": 0.5008703494558315, "kl": 0.33203125, "learning_rate": 3.256541524459613e-07, "loss": 0.0003, "reward": 1.7291666865348816, "reward_std": 0.021564548835158348, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7291666567325592, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 119.35417175292969, "epoch": 8.122866894197951, "grad_norm": 1.1463159946697519, "kl": 0.3349609375, "learning_rate": 3.253697383390216e-07, "loss": 0.0003, "reward": 1.7923176884651184, "reward_std": 0.02253392618149519, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7923177182674408, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 118.56510543823242, "epoch": 8.126279863481228, "grad_norm": 0.5418196818738218, "kl": 0.3212890625, "learning_rate": 3.250853242320819e-07, "loss": 0.0003, "reward": 1.828125, "reward_std": 0.015717608854174614, "rewards/format_reward": 1.0, "rewards/score_reward": 0.828125, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 118.73177337646484, "epoch": 8.129692832764505, "grad_norm": 0.5938812845133525, "kl": 0.314453125, "learning_rate": 3.248009101251422e-07, "loss": 0.0003, "reward": 1.7838541269302368, "reward_std": 0.023083304055035114, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.7864583432674408, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 119.76042175292969, "epoch": 8.133105802047782, "grad_norm": 0.02851748841146074, "kl": 0.3154296875, "learning_rate": 3.2451649601820246e-07, "loss": 0.0003, "reward": 1.7916666865348816, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7916666567325592, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 120.51041793823242, "epoch": 8.136518771331058, "grad_norm": 19.16208978824412, "kl": 0.31640625, "learning_rate": 3.242320819112628e-07, "loss": 0.0003, "reward": 1.779296875, "reward_std": 0.033334920182824135, "rewards/format_reward": 1.0, "rewards/score_reward": 0.779296875, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 119.20833587646484, "epoch": 8.139931740614335, "grad_norm": 0.8949234853271726, "kl": 0.32421875, "learning_rate": 3.239476678043231e-07, "loss": 0.0003, "reward": 1.833984375, "reward_std": 0.024025851860642433, "rewards/format_reward": 1.0, "rewards/score_reward": 0.833984375, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 117.265625, "epoch": 8.143344709897612, "grad_norm": 0.9405162778779174, "kl": 0.3330078125, "learning_rate": 3.236632536973834e-07, "loss": 0.0003, "reward": 1.83984375, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.83984375, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 116.82552337646484, "epoch": 8.146757679180887, "grad_norm": 1.3106987348459145, "kl": 0.3291015625, "learning_rate": 3.2337883959044364e-07, "loss": 0.0003, "reward": 1.7513021230697632, "reward_std": 0.018015244510024786, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7513020634651184, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 117.91406631469727, "epoch": 8.150170648464163, "grad_norm": 0.3001725949702059, "kl": 0.32421875, "learning_rate": 3.2309442548350396e-07, "loss": 0.0003, "reward": 1.8307291865348816, "reward_std": 0.010782274417579174, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8307291567325592, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 118.32812881469727, "epoch": 8.15358361774744, "grad_norm": 1.4044897723858454, "kl": 0.3193359375, "learning_rate": 3.228100113765643e-07, "loss": 0.0003, "reward": 1.8619791865348816, "reward_std": 0.020843948237597942, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8619791567325592, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 118.59635543823242, "epoch": 8.156996587030717, "grad_norm": 1.6843436860030956, "kl": 0.3369140625, "learning_rate": 3.225255972696246e-07, "loss": 0.0003, "reward": 1.7122395634651184, "reward_std": 0.028209642507135868, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7122395932674408, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 118.79687881469727, "epoch": 8.160409556313994, "grad_norm": 0.9135773055459847, "kl": 0.314453125, "learning_rate": 3.2224118316268486e-07, "loss": 0.0003, "reward": 1.8450520634651184, "reward_std": 0.03031627368181944, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8450520932674408, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 122.4296875, "epoch": 8.16382252559727, "grad_norm": 0.9274864165508852, "kl": 0.33203125, "learning_rate": 3.2195676905574513e-07, "loss": 0.0003, "reward": 1.7623698115348816, "reward_std": 0.02481219172477722, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7623697817325592, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 116.44010925292969, "epoch": 8.167235494880545, "grad_norm": 0.536792674917108, "kl": 0.3193359375, "learning_rate": 3.2167235494880545e-07, "loss": 0.0003, "reward": 1.8626302480697632, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8626301884651184, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 120.21354293823242, "epoch": 8.170648464163822, "grad_norm": 0.558367696214709, "kl": 0.302734375, "learning_rate": 3.2138794084186577e-07, "loss": 0.0003, "reward": 1.7903645634651184, "reward_std": 0.021830817218869925, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7903645932674408, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 122.46094131469727, "epoch": 8.174061433447099, "grad_norm": 0.6599549443829297, "kl": 0.31640625, "learning_rate": 3.2110352673492603e-07, "loss": 0.0003, "reward": 1.84765625, "reward_std": 0.03544261306524277, "rewards/format_reward": 1.0, "rewards/score_reward": 0.84765625, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 119.41927337646484, "epoch": 8.177474402730375, "grad_norm": 0.5052662636791856, "kl": 0.322265625, "learning_rate": 3.2081911262798635e-07, "loss": 0.0003, "reward": 1.83203125, "reward_std": 0.035378510132431984, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8346354067325592, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 123.12500381469727, "epoch": 8.180887372013652, "grad_norm": 1.5295058198231068, "kl": 0.3154296875, "learning_rate": 3.205346985210466e-07, "loss": 0.0003, "reward": 1.8424479365348816, "reward_std": 0.01687692990526557, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8424479067325592, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 122.26041793823242, "epoch": 8.184300341296929, "grad_norm": 0.6904504028500299, "kl": 0.314453125, "learning_rate": 3.2025028441410694e-07, "loss": 0.0003, "reward": 1.7799479365348816, "reward_std": 0.0439287219196558, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7799479067325592, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 118.74739837646484, "epoch": 8.187713310580206, "grad_norm": 1.245610980113232, "kl": 0.3232421875, "learning_rate": 3.199658703071672e-07, "loss": 0.0003, "reward": 1.8138020634651184, "reward_std": 0.01841423800215125, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8138020932674408, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 122.57552337646484, "epoch": 8.19112627986348, "grad_norm": 0.9500404111558305, "kl": 0.3095703125, "learning_rate": 3.196814562002275e-07, "loss": 0.0003, "reward": 1.8430989384651184, "reward_std": 0.021109154913574457, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8430989682674408, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 118.2890625, "epoch": 8.194539249146757, "grad_norm": 1.0223146257666167, "kl": 0.3271484375, "learning_rate": 3.1939704209328785e-07, "loss": 0.0003, "reward": 1.76171875, "reward_std": 0.007232970092445612, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76171875, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 118.68229293823242, "epoch": 8.197952218430034, "grad_norm": 1.1136402897725077, "kl": 0.3203125, "learning_rate": 3.1911262798634806e-07, "loss": 0.0003, "reward": 1.8014323115348816, "reward_std": 0.013876185286790133, "rewards/format_reward": 1.0, "rewards/score_reward": 0.8014322817325592, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 119.30208587646484, "epoch": 8.20136518771331, "grad_norm": 0.6312022426949697, "kl": 0.3212890625, "learning_rate": 3.188282138794084e-07, "loss": 0.0003, "reward": 1.7727864980697632, "reward_std": 0.005524271633476019, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7727864384651184, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 122.39844131469727, "epoch": 8.204778156996587, "grad_norm": 1.4256597864485845, "kl": 0.3212890625, "learning_rate": 3.185437997724687e-07, "loss": 0.0003, "reward": 1.7252604365348816, "reward_std": 0.03147477610036731, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7252604067325592, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 116.14583587646484, "epoch": 8.208191126279864, "grad_norm": 0.9778767583628202, "kl": 0.3359375, "learning_rate": 3.18259385665529e-07, "loss": 0.0003, "reward": 1.7877603769302368, "reward_std": 0.018015244975686073, "rewards/format_reward": 1.0, "rewards/score_reward": 0.7877604365348816, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 121.09114837646484, "epoch": 8.211604095563139, "grad_norm": 1.9517033443142366, "kl": 0.3056640625, "learning_rate": 3.179749715585893e-07, "loss": 0.0003, "reward": 1.76953125, "reward_std": 0.011048543266952038, "rewards/format_reward": 1.0, "rewards/score_reward": 0.76953125, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 121.15104675292969, "epoch": 8.215017064846416, "grad_norm": 1.5940780459524697, "kl": 0.3427734375, "learning_rate": 3.1769055745164955e-07, "loss": 0.0003, "reward": 1.84765625, "reward_std": 0.026367809623479843, "rewards/format_reward": 0.9973958432674408, "rewards/score_reward": 0.8502604365348816, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 121.76041793823242, "epoch": 8.218430034129693, "grad_norm": 0.5361045853608327, "kl": 0.3212890625, "learning_rate": 3.1740614334470987e-07, "loss": 0.0003, "reward": 1.6061198115348816, "reward_std": 0.01275724172592163, "rewards/format_reward": 1.0, "rewards/score_reward": 0.6061197817325592, "step": 2400 } ], "logging_steps": 1.0, "max_steps": 3516, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }