{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7799805004874878, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 158.8828125, "epoch": 0.0006499837504062399, "grad_norm": 1.5806399583816528, "kl": 0.0, "learning_rate": 9.993498049414824e-07, "loss": 0.0, "reward": 0.7850136160850525, "reward_std": 0.42312540113925934, "rewards/format_reward_gen": 0.2734375, "rewards/llm_reward": 0.5115761458873749, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 164.12109375, "epoch": 0.0012999675008124798, "grad_norm": 1.417451024055481, "kl": 0.00043773651123046875, "learning_rate": 9.98699609882965e-07, "loss": 0.0, "reward": 0.8027656972408295, "reward_std": 0.45912837982177734, "rewards/format_reward_gen": 0.30859375, "rewards/llm_reward": 0.49417197704315186, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 168.50390625, "epoch": 0.0019499512512187196, "grad_norm": 2.877234935760498, "kl": 0.0030670166015625, "learning_rate": 9.980494148244474e-07, "loss": 0.0001, "reward": 0.8528703451156616, "reward_std": 0.4842498302459717, "rewards/format_reward_gen": 0.3359375, "rewards/llm_reward": 0.5169328451156616, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 168.33984375, "epoch": 0.0025999350016249595, "grad_norm": 1.3875830173492432, "kl": 0.018035888671875, "learning_rate": 9.973992197659298e-07, "loss": 0.0007, "reward": 0.7684720754623413, "reward_std": 0.4962170720100403, "rewards/format_reward_gen": 0.33984375, "rewards/llm_reward": 0.4286283254623413, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 179.25, "epoch": 0.003249918752031199, "grad_norm": 2.66680645942688, "kl": 0.0186767578125, "learning_rate": 9.967490247074121e-07, "loss": 0.0007, "reward": 0.805337518453598, "reward_std": 0.508876770734787, "rewards/format_reward_gen": 0.31640625, "rewards/llm_reward": 0.48893125355243683, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 164.9921875, "epoch": 0.003899902502437439, "grad_norm": 5.002059459686279, "kl": 0.126220703125, "learning_rate": 9.960988296488945e-07, "loss": 0.005, "reward": 0.9243552386760712, "reward_std": 0.45675380527973175, "rewards/format_reward_gen": 0.34375, "rewards/llm_reward": 0.5806052088737488, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 142.69140625, "epoch": 0.004549886252843679, "grad_norm": 5.563825607299805, "kl": 0.23291015625, "learning_rate": 9.954486345903772e-07, "loss": 0.0093, "reward": 0.8668138086795807, "reward_std": 0.35726678371429443, "rewards/format_reward_gen": 0.27734375, "rewards/llm_reward": 0.5894700884819031, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 142.25, "epoch": 0.005199870003249919, "grad_norm": 2.8617208003997803, "kl": 0.29296875, "learning_rate": 9.947984395318595e-07, "loss": 0.0117, "reward": 0.9167207181453705, "reward_std": 0.40177732706069946, "rewards/format_reward_gen": 0.31640625, "rewards/llm_reward": 0.6003144979476929, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 156.015625, "epoch": 0.005849853753656159, "grad_norm": 1.5311064720153809, "kl": 0.197998046875, "learning_rate": 9.94148244473342e-07, "loss": 0.0079, "reward": 0.8728009462356567, "reward_std": 0.38377636671066284, "rewards/format_reward_gen": 0.265625, "rewards/llm_reward": 0.6071759164333344, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 81.34375, "epoch": 0.006499837504062398, "grad_norm": 4.934182167053223, "kl": 0.970703125, "learning_rate": 9.934980494148243e-07, "loss": 0.0388, "reward": 0.9246318936347961, "reward_std": 0.29694148898124695, "rewards/format_reward_gen": 0.1875, "rewards/llm_reward": 0.7371319234371185, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 112.57421875, "epoch": 0.007149821254468639, "grad_norm": 3.345250368118286, "kl": 0.6796875, "learning_rate": 9.928478543563067e-07, "loss": 0.0272, "reward": 0.9390791058540344, "reward_std": 0.3392036259174347, "rewards/format_reward_gen": 0.25390625, "rewards/llm_reward": 0.6851729154586792, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 62.27734375, "epoch": 0.007799805004874878, "grad_norm": 5.069005012512207, "kl": 1.61328125, "learning_rate": 9.921976592977893e-07, "loss": 0.0646, "reward": 0.9808790385723114, "reward_std": 0.23841515183448792, "rewards/format_reward_gen": 0.1875, "rewards/llm_reward": 0.7933789789676666, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 94.28515625, "epoch": 0.008449788755281119, "grad_norm": 4.095582485198975, "kl": 0.8359375, "learning_rate": 9.915474642392717e-07, "loss": 0.0334, "reward": 1.0193516314029694, "reward_std": 0.3181927353143692, "rewards/format_reward_gen": 0.26953125, "rewards/llm_reward": 0.7498204112052917, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 94.98828125, "epoch": 0.009099772505687357, "grad_norm": 4.370489120483398, "kl": 0.958984375, "learning_rate": 9.908972691807541e-07, "loss": 0.0383, "reward": 0.9611796140670776, "reward_std": 0.21234361827373505, "rewards/format_reward_gen": 0.21484375, "rewards/llm_reward": 0.7463358044624329, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 67.265625, "epoch": 0.009749756256093598, "grad_norm": 4.546682834625244, "kl": 1.15234375, "learning_rate": 9.902470741222365e-07, "loss": 0.0461, "reward": 0.9888086020946503, "reward_std": 0.20395110547542572, "rewards/format_reward_gen": 0.1953125, "rewards/llm_reward": 0.7934961318969727, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 104.375, "epoch": 0.010399740006499838, "grad_norm": 3.779834270477295, "kl": 0.923828125, "learning_rate": 9.89596879063719e-07, "loss": 0.037, "reward": 1.0390560030937195, "reward_std": 0.2830479294061661, "rewards/format_reward_gen": 0.3125, "rewards/llm_reward": 0.7265560030937195, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 89.18359375, "epoch": 0.011049723756906077, "grad_norm": 3.4755823612213135, "kl": 1.041015625, "learning_rate": 9.889466840052015e-07, "loss": 0.0416, "reward": 1.0938486456871033, "reward_std": 0.2198621854186058, "rewards/format_reward_gen": 0.2890625, "rewards/llm_reward": 0.8047861456871033, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 96.55078125, "epoch": 0.011699707507312317, "grad_norm": 2.7385542392730713, "kl": 1.123046875, "learning_rate": 9.88296488946684e-07, "loss": 0.045, "reward": 1.066490113735199, "reward_std": 0.21102812886238098, "rewards/format_reward_gen": 0.27734375, "rewards/llm_reward": 0.7891463041305542, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 100.98046875, "epoch": 0.012349691257718558, "grad_norm": 3.7986505031585693, "kl": 1.13671875, "learning_rate": 9.876462938881665e-07, "loss": 0.0454, "reward": 1.1051547527313232, "reward_std": 0.282667875289917, "rewards/format_reward_gen": 0.35546875, "rewards/llm_reward": 0.7496860921382904, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 81.3515625, "epoch": 0.012999675008124796, "grad_norm": 3.298828601837158, "kl": 1.09765625, "learning_rate": 9.86996098829649e-07, "loss": 0.0439, "reward": 1.0624216794967651, "reward_std": 0.2140645831823349, "rewards/format_reward_gen": 0.26953125, "rewards/llm_reward": 0.7928904294967651, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 117.3203125, "epoch": 0.013649658758531037, "grad_norm": 3.1331675052642822, "kl": 0.7890625, "learning_rate": 9.863459037711313e-07, "loss": 0.0316, "reward": 1.0443809628486633, "reward_std": 0.2460612803697586, "rewards/format_reward_gen": 0.34765625, "rewards/llm_reward": 0.6967247426509857, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 104.8828125, "epoch": 0.014299642508937277, "grad_norm": 2.9134809970855713, "kl": 1.18359375, "learning_rate": 9.856957087126137e-07, "loss": 0.0474, "reward": 1.1253960728645325, "reward_std": 0.18913496285676956, "rewards/format_reward_gen": 0.36328125, "rewards/llm_reward": 0.7621147930622101, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 125.61328125, "epoch": 0.014949626259343516, "grad_norm": 2.8173060417175293, "kl": 0.94140625, "learning_rate": 9.850455136540961e-07, "loss": 0.0377, "reward": 1.1247712969779968, "reward_std": 0.22840051352977753, "rewards/format_reward_gen": 0.39453125, "rewards/llm_reward": 0.7302401065826416, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 93.37890625, "epoch": 0.015599610009749756, "grad_norm": 2.9835970401763916, "kl": 1.037109375, "learning_rate": 9.843953185955787e-07, "loss": 0.0415, "reward": 1.1317469477653503, "reward_std": 0.22205300629138947, "rewards/format_reward_gen": 0.33984375, "rewards/llm_reward": 0.7919031977653503, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 96.29296875, "epoch": 0.016249593760155997, "grad_norm": 2.0694520473480225, "kl": 0.861328125, "learning_rate": 9.837451235370611e-07, "loss": 0.0344, "reward": 1.1294084787368774, "reward_std": 0.193441703915596, "rewards/format_reward_gen": 0.390625, "rewards/llm_reward": 0.738783448934555, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 78.5859375, "epoch": 0.016899577510562237, "grad_norm": 3.2725260257720947, "kl": 1.67578125, "learning_rate": 9.830949284785435e-07, "loss": 0.067, "reward": 1.0869756937026978, "reward_std": 0.19627541303634644, "rewards/format_reward_gen": 0.3125, "rewards/llm_reward": 0.7744756639003754, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 109.99609375, "epoch": 0.017549561260968474, "grad_norm": 3.843336820602417, "kl": 0.9375, "learning_rate": 9.82444733420026e-07, "loss": 0.0375, "reward": 1.2960811257362366, "reward_std": 0.18611512333154678, "rewards/format_reward_gen": 0.5, "rewards/llm_reward": 0.7960811257362366, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 113.453125, "epoch": 0.018199545011374715, "grad_norm": 4.205933094024658, "kl": 0.82421875, "learning_rate": 9.817945383615083e-07, "loss": 0.033, "reward": 1.2512186169624329, "reward_std": 0.21914777904748917, "rewards/format_reward_gen": 0.4765625, "rewards/llm_reward": 0.7746560871601105, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 113.9296875, "epoch": 0.018849528761780955, "grad_norm": 2.607285499572754, "kl": 0.99609375, "learning_rate": 9.81144343302991e-07, "loss": 0.0398, "reward": 1.0928421020507812, "reward_std": 0.18951992690563202, "rewards/format_reward_gen": 0.37109375, "rewards/llm_reward": 0.7217483222484589, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 140.19140625, "epoch": 0.019499512512187196, "grad_norm": 2.9757184982299805, "kl": 0.923828125, "learning_rate": 9.804941482444733e-07, "loss": 0.0369, "reward": 1.190615177154541, "reward_std": 0.2875594049692154, "rewards/format_reward_gen": 0.4453125, "rewards/llm_reward": 0.745302677154541, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 107.58984375, "epoch": 0.020149496262593436, "grad_norm": 3.8264060020446777, "kl": 1.25390625, "learning_rate": 9.798439531859557e-07, "loss": 0.0501, "reward": 1.14525306224823, "reward_std": 0.2152007669210434, "rewards/format_reward_gen": 0.4296875, "rewards/llm_reward": 0.7155655026435852, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 120.63671875, "epoch": 0.020799480012999676, "grad_norm": 3.3717663288116455, "kl": 0.849609375, "learning_rate": 9.791937581274381e-07, "loss": 0.034, "reward": 1.200701355934143, "reward_std": 0.21967582404613495, "rewards/format_reward_gen": 0.44140625, "rewards/llm_reward": 0.7592951059341431, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 127.125, "epoch": 0.021449463763405913, "grad_norm": 4.208988189697266, "kl": 0.625, "learning_rate": 9.785435630689205e-07, "loss": 0.025, "reward": 1.3637093305587769, "reward_std": 0.23683421313762665, "rewards/format_reward_gen": 0.5859375, "rewards/llm_reward": 0.7777718603610992, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 144.0703125, "epoch": 0.022099447513812154, "grad_norm": 5.85151481628418, "kl": 0.544921875, "learning_rate": 9.778933680104031e-07, "loss": 0.0218, "reward": 1.4088477492332458, "reward_std": 0.2810004949569702, "rewards/format_reward_gen": 0.6484375, "rewards/llm_reward": 0.7604102492332458, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 157.95703125, "epoch": 0.022749431264218394, "grad_norm": 2.8424575328826904, "kl": 0.443359375, "learning_rate": 9.772431729518855e-07, "loss": 0.0177, "reward": 1.3464818596839905, "reward_std": 0.2753034234046936, "rewards/format_reward_gen": 0.62890625, "rewards/llm_reward": 0.7175756096839905, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 146.87890625, "epoch": 0.023399415014624635, "grad_norm": 2.6199090480804443, "kl": 0.51953125, "learning_rate": 9.76592977893368e-07, "loss": 0.0208, "reward": 1.3032031059265137, "reward_std": 0.2720598429441452, "rewards/format_reward_gen": 0.6484375, "rewards/llm_reward": 0.6547656357288361, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 175.03515625, "epoch": 0.024049398765030875, "grad_norm": 2.2387523651123047, "kl": 0.137939453125, "learning_rate": 9.759427828348505e-07, "loss": 0.0055, "reward": 1.5512778759002686, "reward_std": 0.23974967002868652, "rewards/format_reward_gen": 0.86328125, "rewards/llm_reward": 0.6879966557025909, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 151.6015625, "epoch": 0.024699382515437115, "grad_norm": 6.489838123321533, "kl": 0.26953125, "learning_rate": 9.75292587776333e-07, "loss": 0.0108, "reward": 1.5718271136283875, "reward_std": 0.23181036114692688, "rewards/format_reward_gen": 0.81640625, "rewards/llm_reward": 0.7554208040237427, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 166.015625, "epoch": 0.025349366265843352, "grad_norm": 1.4463943243026733, "kl": 0.14794921875, "learning_rate": 9.746423927178153e-07, "loss": 0.0059, "reward": 1.566750168800354, "reward_std": 0.20850246399641037, "rewards/format_reward_gen": 0.859375, "rewards/llm_reward": 0.707375168800354, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 185.61328125, "epoch": 0.025999350016249593, "grad_norm": 0.973233163356781, "kl": 0.0330810546875, "learning_rate": 9.739921976592977e-07, "loss": 0.0013, "reward": 1.56285959482193, "reward_std": 0.2545943260192871, "rewards/format_reward_gen": 0.8828125, "rewards/llm_reward": 0.6800470352172852, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 193.6953125, "epoch": 0.026649333766655833, "grad_norm": 1.3716545104980469, "kl": 0.02423095703125, "learning_rate": 9.7334200260078e-07, "loss": 0.001, "reward": 1.6033917665481567, "reward_std": 0.2058720514178276, "rewards/format_reward_gen": 0.9375, "rewards/llm_reward": 0.665891706943512, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 193.671875, "epoch": 0.027299317517062074, "grad_norm": 0.530457079410553, "kl": 0.02423095703125, "learning_rate": 9.726918075422627e-07, "loss": 0.001, "reward": 1.5517857074737549, "reward_std": 0.22988620400428772, "rewards/format_reward_gen": 0.90625, "rewards/llm_reward": 0.6455357372760773, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 190.25, "epoch": 0.027949301267468314, "grad_norm": 0.6165294051170349, "kl": 0.0289306640625, "learning_rate": 9.720416124837451e-07, "loss": 0.0012, "reward": 1.6414905786514282, "reward_std": 0.1999732404947281, "rewards/format_reward_gen": 0.9375, "rewards/llm_reward": 0.7039906978607178, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 191.30078125, "epoch": 0.028599285017874555, "grad_norm": 0.5362805724143982, "kl": 0.0250244140625, "learning_rate": 9.713914174252275e-07, "loss": 0.001, "reward": 1.6436612010002136, "reward_std": 0.15407836437225342, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.678817480802536, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 166.51171875, "epoch": 0.02924926876828079, "grad_norm": 0.49199458956718445, "kl": 0.030517578125, "learning_rate": 9.7074122236671e-07, "loss": 0.0012, "reward": 1.6876552104949951, "reward_std": 0.15825065225362778, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7189052104949951, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 176.796875, "epoch": 0.029899252518687032, "grad_norm": 0.5904018878936768, "kl": 0.0301513671875, "learning_rate": 9.700910273081923e-07, "loss": 0.0012, "reward": 1.6222714185714722, "reward_std": 0.19049951434135437, "rewards/format_reward_gen": 0.94921875, "rewards/llm_reward": 0.6730526387691498, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 180.890625, "epoch": 0.030549236269093272, "grad_norm": 0.5989759564399719, "kl": 0.02459716796875, "learning_rate": 9.69440832249675e-07, "loss": 0.001, "reward": 1.6457005739212036, "reward_std": 0.20102287828922272, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.676950603723526, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 203.828125, "epoch": 0.031199220019499513, "grad_norm": 0.5985329747200012, "kl": 0.025634765625, "learning_rate": 9.687906371911573e-07, "loss": 0.001, "reward": 1.616675615310669, "reward_std": 0.17408033460378647, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.655738115310669, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 171.5625, "epoch": 0.03184920376990575, "grad_norm": 0.6683792471885681, "kl": 0.0318603515625, "learning_rate": 9.681404421326397e-07, "loss": 0.0013, "reward": 1.6249927878379822, "reward_std": 0.23042723536491394, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.6796802282333374, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 180.0390625, "epoch": 0.032499187520311994, "grad_norm": 0.5195692181587219, "kl": 0.02294921875, "learning_rate": 9.67490247074122e-07, "loss": 0.0009, "reward": 1.6880380511283875, "reward_std": 0.125424325466156, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6958505213260651, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 164.77734375, "epoch": 0.03314917127071823, "grad_norm": 0.4757358729839325, "kl": 0.02288818359375, "learning_rate": 9.668400520156045e-07, "loss": 0.0009, "reward": 1.7798490524291992, "reward_std": 0.13794026896357536, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7954740524291992, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 179.53515625, "epoch": 0.033799155021124475, "grad_norm": 0.6227936744689941, "kl": 0.02508544921875, "learning_rate": 9.66189856957087e-07, "loss": 0.001, "reward": 1.7046623229980469, "reward_std": 0.13770811259746552, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7241935729980469, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 187.6171875, "epoch": 0.03444913877153071, "grad_norm": 0.5537880659103394, "kl": 0.0257568359375, "learning_rate": 9.655396618985695e-07, "loss": 0.001, "reward": 1.6409095525741577, "reward_std": 0.16839562356472015, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.6877845823764801, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 177.85546875, "epoch": 0.03509912252193695, "grad_norm": 0.5302692651748657, "kl": 0.02587890625, "learning_rate": 9.64889466840052e-07, "loss": 0.001, "reward": 1.6687390208244324, "reward_std": 0.11193385720252991, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.68827024102211, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 172.6640625, "epoch": 0.03574910627234319, "grad_norm": 0.5260276198387146, "kl": 0.0264892578125, "learning_rate": 9.642392717815345e-07, "loss": 0.0011, "reward": 1.6885814666748047, "reward_std": 0.10631540417671204, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6963939666748047, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 166.16015625, "epoch": 0.03639909002274943, "grad_norm": 0.5464720726013184, "kl": 0.0347900390625, "learning_rate": 9.635890767230169e-07, "loss": 0.0014, "reward": 1.7031294703483582, "reward_std": 0.12228592485189438, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7109420001506805, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 164.46875, "epoch": 0.03704907377315567, "grad_norm": 1.2631051540374756, "kl": 0.02581787109375, "learning_rate": 9.629388816644993e-07, "loss": 0.001, "reward": 1.7205508947372437, "reward_std": 0.15017912536859512, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7283633947372437, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 181.390625, "epoch": 0.03769905752356191, "grad_norm": 0.533428430557251, "kl": 0.02630615234375, "learning_rate": 9.622886866059817e-07, "loss": 0.0011, "reward": 1.6914546489715576, "reward_std": 0.14094888418912888, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.6992672085762024, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 189.828125, "epoch": 0.038349041273968154, "grad_norm": 0.5545369386672974, "kl": 0.02789306640625, "learning_rate": 9.616384915474643e-07, "loss": 0.0011, "reward": 1.7007687091827393, "reward_std": 0.1302722580730915, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7281124889850616, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 146.984375, "epoch": 0.03899902502437439, "grad_norm": 1.273440957069397, "kl": 0.02947998046875, "learning_rate": 9.609882964889467e-07, "loss": 0.0012, "reward": 1.763837456703186, "reward_std": 0.11711093783378601, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7638373970985413, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 174.390625, "epoch": 0.03964900877478063, "grad_norm": 0.6384807229042053, "kl": 0.0277099609375, "learning_rate": 9.60338101430429e-07, "loss": 0.0011, "reward": 1.693419098854065, "reward_std": 0.12708913907408714, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7090440988540649, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 177.8125, "epoch": 0.04029899252518687, "grad_norm": 1.1804734468460083, "kl": 0.1026611328125, "learning_rate": 9.596879063719115e-07, "loss": 0.0041, "reward": 1.6252986788749695, "reward_std": 0.1454663947224617, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.6526423692703247, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 206.20703125, "epoch": 0.04094897627559311, "grad_norm": 0.45619362592697144, "kl": 0.0267333984375, "learning_rate": 9.590377113133939e-07, "loss": 0.0011, "reward": 1.679602026939392, "reward_std": 0.11172954738140106, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7069457769393921, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 174.33984375, "epoch": 0.04159896002599935, "grad_norm": 0.6149910092353821, "kl": 0.03070068359375, "learning_rate": 9.583875162548765e-07, "loss": 0.0012, "reward": 1.7432504296302795, "reward_std": 0.1276635304093361, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7588753998279572, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 153.53125, "epoch": 0.04224894377640559, "grad_norm": 0.5304292440414429, "kl": 0.03106689453125, "learning_rate": 9.577373211963589e-07, "loss": 0.0012, "reward": 1.7036563754081726, "reward_std": 0.08586867898702621, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7036563754081726, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 156.77734375, "epoch": 0.04289892752681183, "grad_norm": 0.4576033353805542, "kl": 0.02655029296875, "learning_rate": 9.570871261378413e-07, "loss": 0.0011, "reward": 1.8027727007865906, "reward_std": 0.09639408811926842, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8144914507865906, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 156.453125, "epoch": 0.04354891127721807, "grad_norm": 0.5228363871574402, "kl": 0.0321044921875, "learning_rate": 9.564369310793237e-07, "loss": 0.0013, "reward": 1.7706980109214783, "reward_std": 0.1129983700811863, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7824167609214783, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 152.9375, "epoch": 0.04419889502762431, "grad_norm": 0.5476620197296143, "kl": 0.033935546875, "learning_rate": 9.55786736020806e-07, "loss": 0.0014, "reward": 1.7983694672584534, "reward_std": 0.11383669078350067, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8061820268630981, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 186.53515625, "epoch": 0.04484887877803055, "grad_norm": 0.5744796395301819, "kl": 0.02801513671875, "learning_rate": 9.551365409622887e-07, "loss": 0.0011, "reward": 1.7051947116851807, "reward_std": 0.15312981605529785, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.7481634020805359, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 173.0625, "epoch": 0.04549886252843679, "grad_norm": 0.6745384335517883, "kl": 0.05126953125, "learning_rate": 9.54486345903771e-07, "loss": 0.0021, "reward": 1.7222567796707153, "reward_std": 0.129588071256876, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7496005296707153, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 158.75, "epoch": 0.04614884627884303, "grad_norm": 0.5609740614891052, "kl": 0.02593994140625, "learning_rate": 9.538361508452535e-07, "loss": 0.001, "reward": 1.6483426690101624, "reward_std": 0.15553710609674454, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.6834988594055176, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 170.37109375, "epoch": 0.04679883002924927, "grad_norm": 0.5004397034645081, "kl": 0.02301025390625, "learning_rate": 9.53185955786736e-07, "loss": 0.0009, "reward": 1.7227931022644043, "reward_std": 0.12839480862021446, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7696681618690491, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 159.8046875, "epoch": 0.047448813779655506, "grad_norm": 0.46414420008659363, "kl": 0.02362060546875, "learning_rate": 9.525357607282184e-07, "loss": 0.0009, "reward": 1.675027310848236, "reward_std": 0.1792113445699215, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.7297148108482361, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 176.74609375, "epoch": 0.04809879753006175, "grad_norm": 0.42888718843460083, "kl": 0.025634765625, "learning_rate": 9.518855656697009e-07, "loss": 0.001, "reward": 1.6999921798706055, "reward_std": 0.16240982711315155, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.7546797394752502, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 166.546875, "epoch": 0.04874878128046799, "grad_norm": 0.5758837461471558, "kl": 0.03167724609375, "learning_rate": 9.512353706111833e-07, "loss": 0.0013, "reward": 1.6748338341712952, "reward_std": 0.19102077186107635, "rewards/format_reward_gen": 0.9296875, "rewards/llm_reward": 0.7451463639736176, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 168.30078125, "epoch": 0.04939876503087423, "grad_norm": 0.6992278695106506, "kl": 0.04638671875, "learning_rate": 9.505851755526657e-07, "loss": 0.0019, "reward": 1.606232762336731, "reward_std": 0.217591755092144, "rewards/format_reward_gen": 0.89453125, "rewards/llm_reward": 0.7117015421390533, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 195.609375, "epoch": 0.05004874878128047, "grad_norm": 0.44847291707992554, "kl": 0.02008056640625, "learning_rate": 9.499349804941483e-07, "loss": 0.0008, "reward": 1.6705156564712524, "reward_std": 0.21188189089298248, "rewards/format_reward_gen": 0.91796875, "rewards/llm_reward": 0.7525469064712524, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 184.7421875, "epoch": 0.050698732531686705, "grad_norm": 0.49340465664863586, "kl": 0.03009033203125, "learning_rate": 9.492847854356307e-07, "loss": 0.0012, "reward": 1.6215594410896301, "reward_std": 0.22277702391147614, "rewards/format_reward_gen": 0.9296875, "rewards/llm_reward": 0.6918719410896301, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 175.33984375, "epoch": 0.05134871628209295, "grad_norm": 1.3213627338409424, "kl": 0.02178955078125, "learning_rate": 9.486345903771132e-07, "loss": 0.0009, "reward": 1.6615626215934753, "reward_std": 0.22495736181735992, "rewards/format_reward_gen": 0.9140625, "rewards/llm_reward": 0.7475001215934753, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 169.75, "epoch": 0.051998700032499186, "grad_norm": 0.5859085321426392, "kl": 0.02642822265625, "learning_rate": 9.479843953185956e-07, "loss": 0.0011, "reward": 1.5816291570663452, "reward_std": 0.2553277164697647, "rewards/format_reward_gen": 0.88671875, "rewards/llm_reward": 0.69491046667099, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 185.82421875, "epoch": 0.05264868378290543, "grad_norm": 0.6078607439994812, "kl": 0.02496337890625, "learning_rate": 9.47334200260078e-07, "loss": 0.001, "reward": 1.614077627658844, "reward_std": 0.2847144529223442, "rewards/format_reward_gen": 0.88671875, "rewards/llm_reward": 0.727358877658844, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 169.09765625, "epoch": 0.05329866753331167, "grad_norm": 0.49671614170074463, "kl": 0.01947021484375, "learning_rate": 9.466840052015605e-07, "loss": 0.0008, "reward": 1.7304826974868774, "reward_std": 0.13797680288553238, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.777357667684555, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 174.34375, "epoch": 0.05394865128371791, "grad_norm": 0.8918381333351135, "kl": 0.02496337890625, "learning_rate": 9.460338101430429e-07, "loss": 0.001, "reward": 1.694312870502472, "reward_std": 0.1866544708609581, "rewards/format_reward_gen": 0.92578125, "rewards/llm_reward": 0.7685315608978271, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 193.07421875, "epoch": 0.05459863503412415, "grad_norm": 0.5470022559165955, "kl": 0.025390625, "learning_rate": 9.453836150845254e-07, "loss": 0.001, "reward": 1.6506220698356628, "reward_std": 0.20664742588996887, "rewards/format_reward_gen": 0.94140625, "rewards/llm_reward": 0.7092158198356628, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 168.61328125, "epoch": 0.055248618784530384, "grad_norm": 0.5570728778839111, "kl": 0.0205078125, "learning_rate": 9.447334200260078e-07, "loss": 0.0008, "reward": 1.6235308647155762, "reward_std": 0.18287189304828644, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.6782183945178986, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 193.6015625, "epoch": 0.05589860253493663, "grad_norm": 0.4524511992931366, "kl": 0.0198974609375, "learning_rate": 9.440832249674902e-07, "loss": 0.0008, "reward": 1.7269737720489502, "reward_std": 0.12958713248372078, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.758223682641983, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 177.26953125, "epoch": 0.056548586285342865, "grad_norm": 0.9984080195426941, "kl": 0.0220947265625, "learning_rate": 9.434330299089727e-07, "loss": 0.0009, "reward": 1.7248924374580383, "reward_std": 0.1636945754289627, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7600486278533936, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 158.76171875, "epoch": 0.05719857003574911, "grad_norm": 0.44317126274108887, "kl": 0.02459716796875, "learning_rate": 9.42782834850455e-07, "loss": 0.001, "reward": 1.7475877404212952, "reward_std": 0.1515064276754856, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7710252106189728, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 186.3359375, "epoch": 0.057848553786155346, "grad_norm": 0.5955072045326233, "kl": 0.0350341796875, "learning_rate": 9.421326397919376e-07, "loss": 0.0014, "reward": 1.6174898743629456, "reward_std": 0.22561455518007278, "rewards/format_reward_gen": 0.9296875, "rewards/llm_reward": 0.6878024041652679, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 184.0546875, "epoch": 0.05849853753656158, "grad_norm": 0.48800209164619446, "kl": 0.02630615234375, "learning_rate": 9.4148244473342e-07, "loss": 0.0011, "reward": 1.7198349237442017, "reward_std": 0.1559121124446392, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7549911439418793, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 184.03515625, "epoch": 0.05914852128696783, "grad_norm": 0.42676079273223877, "kl": 0.023681640625, "learning_rate": 9.408322496749023e-07, "loss": 0.0009, "reward": 1.7113338112831116, "reward_std": 0.1814878061413765, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7503962814807892, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 182.265625, "epoch": 0.059798505037374064, "grad_norm": 0.7708536982536316, "kl": 0.02215576171875, "learning_rate": 9.401820546163848e-07, "loss": 0.0009, "reward": 1.7062721252441406, "reward_std": 0.11279774457216263, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7297095358371735, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 180.68359375, "epoch": 0.06044848878778031, "grad_norm": 0.6172276735305786, "kl": 0.023681640625, "learning_rate": 9.395318595578672e-07, "loss": 0.0009, "reward": 1.6979453563690186, "reward_std": 0.1639072522521019, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7331016361713409, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 212.8203125, "epoch": 0.061098472538186545, "grad_norm": 0.5389184355735779, "kl": 0.02667236328125, "learning_rate": 9.388816644993499e-07, "loss": 0.0011, "reward": 1.6718235611915588, "reward_std": 0.16040245443582535, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7069798111915588, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 188.1640625, "epoch": 0.06174845628859278, "grad_norm": 0.5686272382736206, "kl": 0.0247802734375, "learning_rate": 9.382314694408322e-07, "loss": 0.001, "reward": 1.6912047863006592, "reward_std": 0.16078674048185349, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7068297863006592, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 174.41015625, "epoch": 0.062398440038999026, "grad_norm": 0.5170308351516724, "kl": 0.02471923828125, "learning_rate": 9.375812743823146e-07, "loss": 0.001, "reward": 1.7187109589576721, "reward_std": 0.11570781841874123, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7382422089576721, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 167.3828125, "epoch": 0.06304842378940527, "grad_norm": 0.5027660727500916, "kl": 0.0225830078125, "learning_rate": 9.369310793237971e-07, "loss": 0.0009, "reward": 1.713843047618866, "reward_std": 0.16344037652015686, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7450931072235107, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 167.17578125, "epoch": 0.0636984075398115, "grad_norm": 0.4714580476284027, "kl": 0.02191162109375, "learning_rate": 9.362808842652795e-07, "loss": 0.0009, "reward": 1.7789289355278015, "reward_std": 0.09838701039552689, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8023664653301239, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 172.8671875, "epoch": 0.06434839129021774, "grad_norm": 0.5182251930236816, "kl": 0.02490234375, "learning_rate": 9.35630689206762e-07, "loss": 0.001, "reward": 1.714259684085846, "reward_std": 0.12482651323080063, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7220722436904907, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 184.7890625, "epoch": 0.06499837504062399, "grad_norm": 0.5387192964553833, "kl": 0.02215576171875, "learning_rate": 9.349804941482444e-07, "loss": 0.0009, "reward": 1.7511805891990662, "reward_std": 0.126932542771101, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7628993690013885, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 194.6875, "epoch": 0.06564835879103022, "grad_norm": 0.46476829051971436, "kl": 0.02349853515625, "learning_rate": 9.343302990897268e-07, "loss": 0.0009, "reward": 1.7267686128616333, "reward_std": 0.14464465528726578, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7502059936523438, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 185.80859375, "epoch": 0.06629834254143646, "grad_norm": 0.5238707661628723, "kl": 0.0255126953125, "learning_rate": 9.336801040312093e-07, "loss": 0.001, "reward": 1.7085803151130676, "reward_std": 0.13314789533615112, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7281116247177124, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 175.80859375, "epoch": 0.0669483262918427, "grad_norm": 0.4923504889011383, "kl": 0.0255126953125, "learning_rate": 9.330299089726917e-07, "loss": 0.001, "reward": 1.7319114804267883, "reward_std": 0.14059919118881226, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7475365400314331, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 169.5234375, "epoch": 0.06759831004224895, "grad_norm": 0.4674239456653595, "kl": 0.029052734375, "learning_rate": 9.323797139141742e-07, "loss": 0.0012, "reward": 1.8083715438842773, "reward_std": 0.11492390185594559, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8200902938842773, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 183.6796875, "epoch": 0.06824829379265518, "grad_norm": 0.5557112097740173, "kl": 0.03179931640625, "learning_rate": 9.317295188556566e-07, "loss": 0.0013, "reward": 1.7648906707763672, "reward_std": 0.10444141551852226, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7687969207763672, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 169.67578125, "epoch": 0.06889827754306142, "grad_norm": 0.5271931886672974, "kl": 0.023193359375, "learning_rate": 9.31079323797139e-07, "loss": 0.0009, "reward": 1.7410590052604675, "reward_std": 0.1286703199148178, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7566839456558228, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 193.9765625, "epoch": 0.06954826129346767, "grad_norm": 0.5317109823226929, "kl": 0.0205078125, "learning_rate": 9.304291287386215e-07, "loss": 0.0008, "reward": 1.7861355543136597, "reward_std": 0.0998367965221405, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8056668043136597, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 177.59765625, "epoch": 0.0701982450438739, "grad_norm": 0.47176510095596313, "kl": 0.02435302734375, "learning_rate": 9.297789336801039e-07, "loss": 0.001, "reward": 1.7731464505195618, "reward_std": 0.11942831426858902, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7809590399265289, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 166.34765625, "epoch": 0.07084822879428014, "grad_norm": 0.609501838684082, "kl": 0.0262451171875, "learning_rate": 9.291287386215864e-07, "loss": 0.001, "reward": 1.744205117225647, "reward_std": 0.11635170876979828, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.755923867225647, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 189.94140625, "epoch": 0.07149821254468638, "grad_norm": 0.4934156537055969, "kl": 0.025146484375, "learning_rate": 9.284785435630688e-07, "loss": 0.001, "reward": 1.744773268699646, "reward_std": 0.13392389565706253, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.764304518699646, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 171.609375, "epoch": 0.07214819629509263, "grad_norm": 0.6225120425224304, "kl": 0.02099609375, "learning_rate": 9.278283485045512e-07, "loss": 0.0008, "reward": 1.729310691356659, "reward_std": 0.151279479265213, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7449356317520142, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 169.7890625, "epoch": 0.07279818004549886, "grad_norm": 0.5204961895942688, "kl": 0.0281982421875, "learning_rate": 9.271781534460338e-07, "loss": 0.0011, "reward": 1.7370412349700928, "reward_std": 0.10321186482906342, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7526662349700928, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 197.07421875, "epoch": 0.0734481637959051, "grad_norm": 0.4829391837120056, "kl": 0.0235595703125, "learning_rate": 9.265279583875162e-07, "loss": 0.0009, "reward": 1.763712227344513, "reward_std": 0.12647927552461624, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7832435071468353, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 197.53125, "epoch": 0.07409814754631135, "grad_norm": 0.6654679179191589, "kl": 0.0247802734375, "learning_rate": 9.258777633289987e-07, "loss": 0.001, "reward": 1.691585659980774, "reward_std": 0.13841454684734344, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7228357493877411, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 178.46875, "epoch": 0.07474813129671758, "grad_norm": 0.4664066433906555, "kl": 0.02197265625, "learning_rate": 9.252275682704811e-07, "loss": 0.0009, "reward": 1.7516658902168274, "reward_std": 0.10847150534391403, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7633846700191498, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 191.58984375, "epoch": 0.07539811504712382, "grad_norm": 0.44155025482177734, "kl": 0.02545166015625, "learning_rate": 9.245773732119635e-07, "loss": 0.001, "reward": 1.7651060819625854, "reward_std": 0.12331819906830788, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7963560819625854, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 172.23046875, "epoch": 0.07604809879753006, "grad_norm": 0.9483346939086914, "kl": 0.0911865234375, "learning_rate": 9.23927178153446e-07, "loss": 0.0037, "reward": 1.7547824382781982, "reward_std": 0.09926614165306091, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7625949680805206, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 163.99609375, "epoch": 0.07669808254793631, "grad_norm": 0.4820493459701538, "kl": 0.02813720703125, "learning_rate": 9.232769830949284e-07, "loss": 0.0011, "reward": 1.7770671844482422, "reward_std": 0.10211049020290375, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.780973494052887, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 178.484375, "epoch": 0.07734806629834254, "grad_norm": 0.48121774196624756, "kl": 0.0228271484375, "learning_rate": 9.226267880364109e-07, "loss": 0.0009, "reward": 1.7627117037773132, "reward_std": 0.11653940007090569, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7705242037773132, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 179.78125, "epoch": 0.07799805004874878, "grad_norm": 0.5009667277336121, "kl": 0.0341796875, "learning_rate": 9.219765929778933e-07, "loss": 0.0014, "reward": 1.7264337539672852, "reward_std": 0.14604074507951736, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7498712837696075, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 178.078125, "epoch": 0.07864803379915503, "grad_norm": 0.5490631461143494, "kl": 0.02520751953125, "learning_rate": 9.213263979193757e-07, "loss": 0.001, "reward": 1.724075198173523, "reward_std": 0.1521899700164795, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7592314183712006, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 157.8046875, "epoch": 0.07929801754956126, "grad_norm": 0.534515380859375, "kl": 0.027587890625, "learning_rate": 9.206762028608582e-07, "loss": 0.0011, "reward": 1.7815688848495483, "reward_std": 0.11090880259871483, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7932876348495483, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 177.34765625, "epoch": 0.0799480012999675, "grad_norm": 0.5051369071006775, "kl": 0.0284423828125, "learning_rate": 9.200260078023406e-07, "loss": 0.0011, "reward": 1.7385034561157227, "reward_std": 0.13223116844892502, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7502221465110779, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 190.80078125, "epoch": 0.08059798505037374, "grad_norm": 0.5696348547935486, "kl": 0.021728515625, "learning_rate": 9.193758127438231e-07, "loss": 0.0009, "reward": 1.749208927154541, "reward_std": 0.10300594940781593, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.764833927154541, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 164.703125, "epoch": 0.08124796880077997, "grad_norm": 0.4454639256000519, "kl": 0.02716064453125, "learning_rate": 9.187256176853055e-07, "loss": 0.0011, "reward": 1.7982723712921143, "reward_std": 0.1075095646083355, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8178036212921143, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 187.578125, "epoch": 0.08189795255118622, "grad_norm": 0.5446704030036926, "kl": 0.02777099609375, "learning_rate": 9.180754226267879e-07, "loss": 0.0011, "reward": 1.7138053178787231, "reward_std": 0.16456080228090286, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7528678476810455, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 164.2734375, "epoch": 0.08254793630159246, "grad_norm": 0.46300792694091797, "kl": 0.0281982421875, "learning_rate": 9.174252275682704e-07, "loss": 0.0011, "reward": 1.7795513272285461, "reward_std": 0.10739680007100105, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7912701368331909, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 169.66796875, "epoch": 0.0831979200519987, "grad_norm": 0.47600823640823364, "kl": 0.02410888671875, "learning_rate": 9.167750325097528e-07, "loss": 0.001, "reward": 1.800128698348999, "reward_std": 0.09263810515403748, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.807941198348999, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 176.12109375, "epoch": 0.08384790380240494, "grad_norm": 0.5276387333869934, "kl": 0.02618408203125, "learning_rate": 9.161248374512354e-07, "loss": 0.001, "reward": 1.7357704043388367, "reward_std": 0.1182083785533905, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7474891543388367, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 183.1640625, "epoch": 0.08449788755281118, "grad_norm": 0.5229530334472656, "kl": 0.01934814453125, "learning_rate": 9.154746423927178e-07, "loss": 0.0008, "reward": 1.763990879058838, "reward_std": 0.09412258863449097, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.767897218465805, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 180.71484375, "epoch": 0.08514787130321742, "grad_norm": 0.4498423933982849, "kl": 0.0323486328125, "learning_rate": 9.148244473342002e-07, "loss": 0.0013, "reward": 1.7402167320251465, "reward_std": 0.1368965208530426, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7597479820251465, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 164.4140625, "epoch": 0.08579785505362365, "grad_norm": 0.4438989758491516, "kl": 0.024169921875, "learning_rate": 9.141742522756827e-07, "loss": 0.001, "reward": 1.7497809529304504, "reward_std": 0.1270533725619316, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7575934529304504, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 186.94921875, "epoch": 0.0864478388040299, "grad_norm": 0.5512260794639587, "kl": 0.02862548828125, "learning_rate": 9.135240572171651e-07, "loss": 0.0011, "reward": 1.7809050679206848, "reward_std": 0.1385423094034195, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7965300679206848, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 185.9296875, "epoch": 0.08709782255443614, "grad_norm": 0.4762997031211853, "kl": 0.02557373046875, "learning_rate": 9.128738621586476e-07, "loss": 0.001, "reward": 1.7519458532333374, "reward_std": 0.12463156878948212, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7792895436286926, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 192.8046875, "epoch": 0.08774780630484239, "grad_norm": 0.4384933412075043, "kl": 0.02325439453125, "learning_rate": 9.1222366710013e-07, "loss": 0.0009, "reward": 1.7571836113929749, "reward_std": 0.12069221958518028, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7806210815906525, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 170.703125, "epoch": 0.08839779005524862, "grad_norm": 0.46076834201812744, "kl": 0.032470703125, "learning_rate": 9.115734720416124e-07, "loss": 0.0013, "reward": 1.7303792238235474, "reward_std": 0.1348210796713829, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7499105334281921, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 189.359375, "epoch": 0.08904777380565486, "grad_norm": 0.7023166418075562, "kl": 0.02569580078125, "learning_rate": 9.109232769830949e-07, "loss": 0.001, "reward": 1.7817272543907166, "reward_std": 0.14625020697712898, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7973522543907166, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 191.74609375, "epoch": 0.0896977575560611, "grad_norm": 0.5977641940116882, "kl": 0.0277099609375, "learning_rate": 9.102730819245773e-07, "loss": 0.0011, "reward": 1.7642285227775574, "reward_std": 0.13869920372962952, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.783759742975235, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 189.21484375, "epoch": 0.09034774130646733, "grad_norm": 0.4808005392551422, "kl": 0.02587890625, "learning_rate": 9.096228868660598e-07, "loss": 0.001, "reward": 1.691650152206421, "reward_std": 0.1264980584383011, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7072752118110657, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 162.0625, "epoch": 0.09099772505687358, "grad_norm": 0.5446332097053528, "kl": 0.03253173828125, "learning_rate": 9.089726918075422e-07, "loss": 0.0013, "reward": 1.791551113128662, "reward_std": 0.13369844108819962, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8032698333263397, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 185.93359375, "epoch": 0.09164770880727982, "grad_norm": 12.509982109069824, "kl": 0.02777099609375, "learning_rate": 9.083224967490246e-07, "loss": 0.0011, "reward": 1.7419942021369934, "reward_std": 0.11839332804083824, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7654316425323486, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 184.28125, "epoch": 0.09229769255768606, "grad_norm": 0.5067651271820068, "kl": 0.0245361328125, "learning_rate": 9.076723016905071e-07, "loss": 0.001, "reward": 1.766642153263092, "reward_std": 0.12905701249837875, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7783609628677368, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 192.5546875, "epoch": 0.0929476763080923, "grad_norm": 0.48180028796195984, "kl": 0.03033447265625, "learning_rate": 9.070221066319895e-07, "loss": 0.0012, "reward": 1.7111955285072327, "reward_std": 0.12996093183755875, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7346329987049103, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 178.14453125, "epoch": 0.09359766005849854, "grad_norm": 0.7377758622169495, "kl": 0.02618408203125, "learning_rate": 9.06371911573472e-07, "loss": 0.001, "reward": 1.7157790064811707, "reward_std": 0.10579509288072586, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7274976968765259, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 170.40234375, "epoch": 0.09424764380890478, "grad_norm": 0.5131024718284607, "kl": 0.0260009765625, "learning_rate": 9.057217165149544e-07, "loss": 0.001, "reward": 1.7267245054244995, "reward_std": 0.16365773230791092, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7579744160175323, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 173.60546875, "epoch": 0.09489762755931101, "grad_norm": 0.46591606736183167, "kl": 0.0303955078125, "learning_rate": 9.050715214564369e-07, "loss": 0.0012, "reward": 1.7838011980056763, "reward_std": 0.12414713948965073, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7877074480056763, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 172.56640625, "epoch": 0.09554761130971726, "grad_norm": 0.48974162340164185, "kl": 0.02679443359375, "learning_rate": 9.044213263979194e-07, "loss": 0.0011, "reward": 1.7747175693511963, "reward_std": 0.12659987807273865, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7903425097465515, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 180.16015625, "epoch": 0.0961975950601235, "grad_norm": 0.555130124092102, "kl": 0.0269775390625, "learning_rate": 9.037711313394018e-07, "loss": 0.0011, "reward": 1.77283775806427, "reward_std": 0.11178223416209221, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7845564186573029, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 194.16796875, "epoch": 0.09684757881052973, "grad_norm": 0.5912514925003052, "kl": 0.0260009765625, "learning_rate": 9.031209362808843e-07, "loss": 0.001, "reward": 1.697731375694275, "reward_std": 0.1531492918729782, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7446063756942749, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 174.94140625, "epoch": 0.09749756256093597, "grad_norm": 0.5407187938690186, "kl": 0.022705078125, "learning_rate": 9.024707412223667e-07, "loss": 0.0009, "reward": 1.7845452427864075, "reward_std": 0.10335347801446915, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7884514927864075, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 169.953125, "epoch": 0.09814754631134222, "grad_norm": 0.5036870837211609, "kl": 0.030517578125, "learning_rate": 9.018205461638491e-07, "loss": 0.0012, "reward": 1.7746670842170715, "reward_std": 0.12046713754534721, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7863858044147491, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 174.6484375, "epoch": 0.09879753006174846, "grad_norm": 1.7556536197662354, "kl": 0.03070068359375, "learning_rate": 9.011703511053316e-07, "loss": 0.0012, "reward": 1.7369931936264038, "reward_std": 0.19217509031295776, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7682431936264038, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 160.875, "epoch": 0.09944751381215469, "grad_norm": 0.4428662061691284, "kl": 0.0250244140625, "learning_rate": 9.00520156046814e-07, "loss": 0.001, "reward": 1.7867044806480408, "reward_std": 0.10376572981476784, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.794516921043396, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 168.36328125, "epoch": 0.10009749756256094, "grad_norm": 0.4700548052787781, "kl": 0.029541015625, "learning_rate": 8.998699609882965e-07, "loss": 0.0012, "reward": 1.757910668849945, "reward_std": 0.09782728552818298, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7735356688499451, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 188.31640625, "epoch": 0.10074748131296718, "grad_norm": 1.7316864728927612, "kl": 0.1185302734375, "learning_rate": 8.992197659297789e-07, "loss": 0.0048, "reward": 1.7207124829292297, "reward_std": 0.14290717989206314, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7480562329292297, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 183.8046875, "epoch": 0.10139746506337341, "grad_norm": 0.5985956192016602, "kl": 0.030517578125, "learning_rate": 8.985695708712613e-07, "loss": 0.0012, "reward": 1.7431225776672363, "reward_std": 0.14135073125362396, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7509350180625916, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 188.8359375, "epoch": 0.10204744881377965, "grad_norm": 0.5127871036529541, "kl": 0.02978515625, "learning_rate": 8.979193758127438e-07, "loss": 0.0012, "reward": 1.7681177854537964, "reward_std": 0.11347085610032082, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.783742755651474, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 196.5390625, "epoch": 0.1026974325641859, "grad_norm": 0.46684518456459045, "kl": 0.02581787109375, "learning_rate": 8.972691807542262e-07, "loss": 0.001, "reward": 1.7181996703147888, "reward_std": 0.09428722783923149, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7299184799194336, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 192.484375, "epoch": 0.10334741631459214, "grad_norm": 0.5024276375770569, "kl": 0.02825927734375, "learning_rate": 8.966189856957087e-07, "loss": 0.0011, "reward": 1.7364615201950073, "reward_std": 0.12015743181109428, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7442739605903625, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 164.41796875, "epoch": 0.10399740006499837, "grad_norm": 0.46656712889671326, "kl": 0.02850341796875, "learning_rate": 8.959687906371911e-07, "loss": 0.0011, "reward": 1.7966421842575073, "reward_std": 0.1081707514822483, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8083609044551849, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 178.4453125, "epoch": 0.10464738381540462, "grad_norm": 0.5502929091453552, "kl": 0.02447509765625, "learning_rate": 8.953185955786735e-07, "loss": 0.001, "reward": 1.7126962542533875, "reward_std": 0.1632557176053524, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7322275340557098, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 181.3671875, "epoch": 0.10529736756581086, "grad_norm": 0.5733208656311035, "kl": 0.02044677734375, "learning_rate": 8.94668400520156e-07, "loss": 0.0008, "reward": 1.732736349105835, "reward_std": 0.1266542710363865, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.736642599105835, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 169.9296875, "epoch": 0.10594735131621709, "grad_norm": 0.4684702157974243, "kl": 0.01947021484375, "learning_rate": 8.940182054616385e-07, "loss": 0.0008, "reward": 1.8047522902488708, "reward_std": 0.09277599304914474, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8086585700511932, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 176.51171875, "epoch": 0.10659733506662333, "grad_norm": 0.5179416537284851, "kl": 0.02899169921875, "learning_rate": 8.93368010403121e-07, "loss": 0.0012, "reward": 1.6973817348480225, "reward_std": 0.148548886179924, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7091005146503448, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 184.234375, "epoch": 0.10724731881702958, "grad_norm": 0.5195695161819458, "kl": 0.02484130859375, "learning_rate": 8.927178153446034e-07, "loss": 0.001, "reward": 1.6889218091964722, "reward_std": 0.12577034533023834, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7006405591964722, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 164.51953125, "epoch": 0.10789730256743582, "grad_norm": 0.4923626482486725, "kl": 0.03070068359375, "learning_rate": 8.920676202860858e-07, "loss": 0.0012, "reward": 1.7504708170890808, "reward_std": 0.12201941385865211, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7621895670890808, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 197.21875, "epoch": 0.10854728631784205, "grad_norm": 0.3694620430469513, "kl": 0.0244140625, "learning_rate": 8.914174252275683e-07, "loss": 0.001, "reward": 1.7369641661643982, "reward_std": 0.0855107307434082, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7564953863620758, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 165.81640625, "epoch": 0.1091972700682483, "grad_norm": 0.7018545269966125, "kl": 0.0406494140625, "learning_rate": 8.907672301690507e-07, "loss": 0.0016, "reward": 1.7663414478302002, "reward_std": 0.10648777708411217, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7741539478302002, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 167.16015625, "epoch": 0.10984725381865454, "grad_norm": 0.5183096528053284, "kl": 0.02532958984375, "learning_rate": 8.901170351105332e-07, "loss": 0.001, "reward": 1.7746068835258484, "reward_std": 0.09393669292330742, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7785132229328156, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 178.34765625, "epoch": 0.11049723756906077, "grad_norm": 0.5421291589736938, "kl": 0.0250244140625, "learning_rate": 8.894668400520156e-07, "loss": 0.001, "reward": 1.7201703190803528, "reward_std": 0.12972256541252136, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7357953488826752, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 181.8125, "epoch": 0.11114722131946701, "grad_norm": 0.7720229625701904, "kl": 0.0244140625, "learning_rate": 8.88816644993498e-07, "loss": 0.001, "reward": 1.7616466283798218, "reward_std": 0.13694939017295837, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7850841283798218, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 185.76171875, "epoch": 0.11179720506987326, "grad_norm": 0.5874478220939636, "kl": 0.030517578125, "learning_rate": 8.881664499349805e-07, "loss": 0.0012, "reward": 1.717527985572815, "reward_std": 0.18760735541582108, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7644030451774597, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 159.73828125, "epoch": 0.11244718882027949, "grad_norm": 0.5373659133911133, "kl": 0.02490234375, "learning_rate": 8.875162548764629e-07, "loss": 0.001, "reward": 1.7680508494377136, "reward_std": 0.12398312985897064, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7680509090423584, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 171.7578125, "epoch": 0.11309717257068573, "grad_norm": 0.508639931678772, "kl": 0.02593994140625, "learning_rate": 8.868660598179454e-07, "loss": 0.001, "reward": 1.7714696526527405, "reward_std": 0.14224601536989212, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7870946824550629, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 159.09375, "epoch": 0.11374715632109197, "grad_norm": 0.5485427379608154, "kl": 0.02642822265625, "learning_rate": 8.862158647594278e-07, "loss": 0.0011, "reward": 1.7524933815002441, "reward_std": 0.13597264140844345, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7642121016979218, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 181.4140625, "epoch": 0.11439714007149822, "grad_norm": 0.5082714557647705, "kl": 0.02862548828125, "learning_rate": 8.855656697009101e-07, "loss": 0.0011, "reward": 1.7377606630325317, "reward_std": 0.1199987456202507, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.753385603427887, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 185.34375, "epoch": 0.11504712382190445, "grad_norm": 2.4298057556152344, "kl": 0.0318603515625, "learning_rate": 8.849154746423927e-07, "loss": 0.0013, "reward": 1.6778329014778137, "reward_std": 0.15841242671012878, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.6934578716754913, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 178.12109375, "epoch": 0.11569710757231069, "grad_norm": 0.5026186108589172, "kl": 0.037841796875, "learning_rate": 8.84265279583875e-07, "loss": 0.0015, "reward": 1.7737547159194946, "reward_std": 0.138802170753479, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.789379745721817, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 175.12890625, "epoch": 0.11634709132271694, "grad_norm": 1.3915280103683472, "kl": 0.02362060546875, "learning_rate": 8.836150845253575e-07, "loss": 0.0009, "reward": 1.746864140033722, "reward_std": 0.13067952543497086, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7585828900337219, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 181.453125, "epoch": 0.11699707507312317, "grad_norm": 0.406914621591568, "kl": 0.0234375, "learning_rate": 8.8296488946684e-07, "loss": 0.0009, "reward": 1.7685033679008484, "reward_std": 0.08619717694818974, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.788034588098526, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 191.234375, "epoch": 0.11764705882352941, "grad_norm": 0.4714392423629761, "kl": 0.02642822265625, "learning_rate": 8.823146944083224e-07, "loss": 0.0011, "reward": 1.7120540738105774, "reward_std": 0.11407105252146721, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7276790738105774, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 172.69140625, "epoch": 0.11829704257393565, "grad_norm": 0.5603452324867249, "kl": 0.03033447265625, "learning_rate": 8.81664499349805e-07, "loss": 0.0012, "reward": 1.7603004574775696, "reward_std": 0.10785305500030518, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7681129574775696, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 170.31640625, "epoch": 0.1189470263243419, "grad_norm": 0.5559119582176208, "kl": 0.03271484375, "learning_rate": 8.810143042912873e-07, "loss": 0.0013, "reward": 1.7584509253501892, "reward_std": 0.13312407582998276, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7701696455478668, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 164.6171875, "epoch": 0.11959701007474813, "grad_norm": 0.5173207521438599, "kl": 0.02801513671875, "learning_rate": 8.803641092327698e-07, "loss": 0.0011, "reward": 1.788980484008789, "reward_std": 0.11948523670434952, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8006992340087891, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 179.5625, "epoch": 0.12024699382515437, "grad_norm": 0.5141646862030029, "kl": 0.02874755859375, "learning_rate": 8.797139141742522e-07, "loss": 0.0012, "reward": 1.737974226474762, "reward_std": 0.12260865792632103, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.741880476474762, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 167.76953125, "epoch": 0.12089697757556062, "grad_norm": 0.858518123626709, "kl": 0.02716064453125, "learning_rate": 8.790637191157346e-07, "loss": 0.0011, "reward": 1.8440847396850586, "reward_std": 0.04831286519765854, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8440847098827362, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 195.43359375, "epoch": 0.12154696132596685, "grad_norm": 0.4813303053379059, "kl": 0.02783203125, "learning_rate": 8.784135240572171e-07, "loss": 0.0011, "reward": 1.7448266744613647, "reward_std": 0.10320591181516647, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7643579244613647, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 187.375, "epoch": 0.12219694507637309, "grad_norm": 0.5151005983352661, "kl": 0.02838134765625, "learning_rate": 8.777633289986995e-07, "loss": 0.0011, "reward": 1.741048514842987, "reward_std": 0.10360036790370941, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7605797648429871, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 172.703125, "epoch": 0.12284692882677933, "grad_norm": 0.48469942808151245, "kl": 0.030029296875, "learning_rate": 8.77113133940182e-07, "loss": 0.0012, "reward": 1.7596665620803833, "reward_std": 0.14325398951768875, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7831041216850281, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 178.29296875, "epoch": 0.12349691257718556, "grad_norm": 0.4647466242313385, "kl": 0.03082275390625, "learning_rate": 8.764629388816644e-07, "loss": 0.0012, "reward": 1.7723158597946167, "reward_std": 0.10182227939367294, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7918470799922943, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 165.7734375, "epoch": 0.12414689632759181, "grad_norm": 0.46226003766059875, "kl": 0.03778076171875, "learning_rate": 8.758127438231468e-07, "loss": 0.0015, "reward": 1.809382140636444, "reward_std": 0.0894334688782692, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8250070810317993, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 197.3046875, "epoch": 0.12479688007799805, "grad_norm": 0.4975655972957611, "kl": 0.029052734375, "learning_rate": 8.751625487646293e-07, "loss": 0.0012, "reward": 1.7519711256027222, "reward_std": 0.1221310906112194, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7832210659980774, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 165.75, "epoch": 0.12544686382840428, "grad_norm": 0.499728798866272, "kl": 0.02947998046875, "learning_rate": 8.745123537061117e-07, "loss": 0.0012, "reward": 1.7784842252731323, "reward_std": 0.13429703935980797, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8058279752731323, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 179.265625, "epoch": 0.12609684757881054, "grad_norm": 0.42320647835731506, "kl": 0.0238037109375, "learning_rate": 8.738621586475942e-07, "loss": 0.001, "reward": 1.8351368308067322, "reward_std": 0.07267814129590988, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8390430808067322, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 166.46484375, "epoch": 0.12674683132921677, "grad_norm": 0.3878614604473114, "kl": 0.031005859375, "learning_rate": 8.732119635890766e-07, "loss": 0.0012, "reward": 1.794700026512146, "reward_std": 0.10711149498820305, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8142313063144684, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 180.7734375, "epoch": 0.127396815079623, "grad_norm": 0.4356563687324524, "kl": 0.02569580078125, "learning_rate": 8.725617685305591e-07, "loss": 0.001, "reward": 1.8244778513908386, "reward_std": 0.09375341236591339, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8401028215885162, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 188.84765625, "epoch": 0.12804679883002926, "grad_norm": 1.445223093032837, "kl": 0.1258544921875, "learning_rate": 8.719115734720416e-07, "loss": 0.005, "reward": 1.7750861048698425, "reward_std": 0.131418377161026, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7985236048698425, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 192.63671875, "epoch": 0.1286967825804355, "grad_norm": 0.5168812274932861, "kl": 0.032958984375, "learning_rate": 8.71261378413524e-07, "loss": 0.0013, "reward": 1.705972969532013, "reward_std": 0.11416084691882133, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7372229993343353, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 179.7578125, "epoch": 0.12934676633084172, "grad_norm": 0.472368061542511, "kl": 0.0322265625, "learning_rate": 8.706111833550065e-07, "loss": 0.0013, "reward": 1.7516363263130188, "reward_std": 0.1524297147989273, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7828863263130188, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 182.38671875, "epoch": 0.12999675008124797, "grad_norm": 0.5593376755714417, "kl": 0.04638671875, "learning_rate": 8.699609882964889e-07, "loss": 0.0019, "reward": 1.6655258536338806, "reward_std": 0.27342432737350464, "rewards/format_reward_gen": 0.93359375, "rewards/llm_reward": 0.7319321632385254, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 172.96484375, "epoch": 0.1306467338316542, "grad_norm": 0.4657059907913208, "kl": 0.02740478515625, "learning_rate": 8.693107932379714e-07, "loss": 0.0011, "reward": 1.7493746280670166, "reward_std": 0.13702595978975296, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7767183780670166, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 182.71484375, "epoch": 0.13129671758206043, "grad_norm": 0.49233290553092957, "kl": 0.02838134765625, "learning_rate": 8.686605981794538e-07, "loss": 0.0011, "reward": 1.7488125562667847, "reward_std": 0.16500753164291382, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7800625562667847, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 180.41015625, "epoch": 0.1319467013324667, "grad_norm": 0.6786900758743286, "kl": 0.05126953125, "learning_rate": 8.680104031209362e-07, "loss": 0.0021, "reward": 1.7405818700790405, "reward_std": 0.16670531034469604, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.7835506200790405, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 179.58984375, "epoch": 0.13259668508287292, "grad_norm": 0.4481852948665619, "kl": 0.027099609375, "learning_rate": 8.673602080624187e-07, "loss": 0.0011, "reward": 1.7703141570091248, "reward_std": 0.15401559323072433, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8054704070091248, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 184.4765625, "epoch": 0.13324666883327918, "grad_norm": 0.4701312482357025, "kl": 0.02655029296875, "learning_rate": 8.667100130039011e-07, "loss": 0.0011, "reward": 1.7175315022468567, "reward_std": 0.18063858151435852, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7526877522468567, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 186.7890625, "epoch": 0.1338966525836854, "grad_norm": 0.47942838072776794, "kl": 0.02703857421875, "learning_rate": 8.660598179453836e-07, "loss": 0.0011, "reward": 1.6956843137741089, "reward_std": 0.20408668369054794, "rewards/format_reward_gen": 0.94921875, "rewards/llm_reward": 0.7464655637741089, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 175.28125, "epoch": 0.13454663633409164, "grad_norm": 0.5810760259628296, "kl": 0.0267333984375, "learning_rate": 8.65409622886866e-07, "loss": 0.0011, "reward": 1.767817735671997, "reward_std": 0.16319949179887772, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7951614260673523, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 181.6953125, "epoch": 0.1351966200844979, "grad_norm": 0.4795970320701599, "kl": 0.029541015625, "learning_rate": 8.647594278283484e-07, "loss": 0.0012, "reward": 1.7509849071502686, "reward_std": 0.1600380316376686, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.786141037940979, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 206.06640625, "epoch": 0.13584660383490413, "grad_norm": 0.5478484630584717, "kl": 0.0260009765625, "learning_rate": 8.641092327698309e-07, "loss": 0.001, "reward": 1.6822729110717773, "reward_std": 0.16626878082752228, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.7252416610717773, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 187.80859375, "epoch": 0.13649658758531036, "grad_norm": 0.4997495412826538, "kl": 0.0242919921875, "learning_rate": 8.634590377113133e-07, "loss": 0.001, "reward": 1.7234801650047302, "reward_std": 0.1680876836180687, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7625426948070526, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 195.81640625, "epoch": 0.13714657133571662, "grad_norm": 0.5374329686164856, "kl": 0.0377197265625, "learning_rate": 8.628088426527958e-07, "loss": 0.0015, "reward": 1.7390813827514648, "reward_std": 0.1415918841958046, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7625188827514648, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 175.5546875, "epoch": 0.13779655508612285, "grad_norm": 0.4949089586734772, "kl": 0.02764892578125, "learning_rate": 8.621586475942782e-07, "loss": 0.0011, "reward": 1.7943991422653198, "reward_std": 0.128727488219738, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8100241720676422, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 205.20703125, "epoch": 0.13844653883652908, "grad_norm": 0.4941396415233612, "kl": 0.02520751953125, "learning_rate": 8.615084525357606e-07, "loss": 0.001, "reward": 1.6702356934547424, "reward_std": 0.12427826970815659, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7014856934547424, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 221.44140625, "epoch": 0.13909652258693533, "grad_norm": 0.5148347020149231, "kl": 0.0263671875, "learning_rate": 8.608582574772432e-07, "loss": 0.0011, "reward": 1.6350823640823364, "reward_std": 0.15164951235055923, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.6663324236869812, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 193.71484375, "epoch": 0.13974650633734156, "grad_norm": 0.527244508266449, "kl": 0.03009033203125, "learning_rate": 8.602080624187256e-07, "loss": 0.0012, "reward": 1.7401618957519531, "reward_std": 0.1506534442305565, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7557869553565979, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 201.76953125, "epoch": 0.1403964900877478, "grad_norm": 0.5036085247993469, "kl": 0.0316162109375, "learning_rate": 8.595578673602081e-07, "loss": 0.0013, "reward": 1.6781622767448425, "reward_std": 0.16518264263868332, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7172247469425201, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 200.19921875, "epoch": 0.14104647383815405, "grad_norm": 0.58259117603302, "kl": 0.0281982421875, "learning_rate": 8.589076723016905e-07, "loss": 0.0011, "reward": 1.7118995785713196, "reward_std": 0.15669042617082596, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7392432987689972, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 179.13671875, "epoch": 0.14169645758856028, "grad_norm": 0.45688900351524353, "kl": 0.0303955078125, "learning_rate": 8.582574772431729e-07, "loss": 0.0012, "reward": 1.7634297609329224, "reward_std": 0.1466008499264717, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7751484513282776, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 202.00390625, "epoch": 0.14234644133896654, "grad_norm": 0.4585893154144287, "kl": 0.02783203125, "learning_rate": 8.576072821846554e-07, "loss": 0.0011, "reward": 1.7439244389533997, "reward_std": 0.12798229604959488, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7712681889533997, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 179.81640625, "epoch": 0.14299642508937277, "grad_norm": 0.8146737217903137, "kl": 0.09228515625, "learning_rate": 8.569570871261378e-07, "loss": 0.0037, "reward": 1.7656559944152832, "reward_std": 0.13258464261889458, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7812809646129608, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 197.53125, "epoch": 0.143646408839779, "grad_norm": 0.4389616549015045, "kl": 0.03741455078125, "learning_rate": 8.563068920676203e-07, "loss": 0.0015, "reward": 1.7795733213424683, "reward_std": 0.07308576628565788, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7873858511447906, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 201.01953125, "epoch": 0.14429639259018526, "grad_norm": 0.6231451630592346, "kl": 0.03338623046875, "learning_rate": 8.556566970091027e-07, "loss": 0.0013, "reward": 1.7466623783111572, "reward_std": 0.13983318209648132, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7622873187065125, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 186.13671875, "epoch": 0.1449463763405915, "grad_norm": 0.48941436409950256, "kl": 0.03338623046875, "learning_rate": 8.550065019505851e-07, "loss": 0.0013, "reward": 1.7035220861434937, "reward_std": 0.15596336126327515, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7269595563411713, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 195.05859375, "epoch": 0.14559636009099772, "grad_norm": 0.5027194023132324, "kl": 0.03387451171875, "learning_rate": 8.543563068920676e-07, "loss": 0.0014, "reward": 1.7123292684555054, "reward_std": 0.13406392186880112, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7396731078624725, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 188.71875, "epoch": 0.14624634384140398, "grad_norm": 0.7768802642822266, "kl": 0.03662109375, "learning_rate": 8.5370611183355e-07, "loss": 0.0015, "reward": 1.7583776712417603, "reward_std": 0.16145571321249008, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.8013463914394379, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 188.515625, "epoch": 0.1468963275918102, "grad_norm": 0.4681095480918884, "kl": 0.0406494140625, "learning_rate": 8.530559167750325e-07, "loss": 0.0016, "reward": 1.6915298104286194, "reward_std": 0.19005366414785385, "rewards/format_reward_gen": 0.94921875, "rewards/llm_reward": 0.7423110604286194, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 186.5625, "epoch": 0.14754631134221644, "grad_norm": 0.46647199988365173, "kl": 0.028564453125, "learning_rate": 8.524057217165149e-07, "loss": 0.0011, "reward": 1.746890902519226, "reward_std": 0.19006936252117157, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7859534025192261, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 187.10546875, "epoch": 0.1481962950926227, "grad_norm": 0.5158973932266235, "kl": 0.02978515625, "learning_rate": 8.517555266579973e-07, "loss": 0.0012, "reward": 1.7335621118545532, "reward_std": 0.14523334801197052, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.768718421459198, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 181.796875, "epoch": 0.14884627884302892, "grad_norm": 0.44082656502723694, "kl": 0.026611328125, "learning_rate": 8.511053315994798e-07, "loss": 0.0011, "reward": 1.7217842936515808, "reward_std": 0.0883215069770813, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7295967936515808, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 195.328125, "epoch": 0.14949626259343515, "grad_norm": 0.569291889667511, "kl": 0.02886962890625, "learning_rate": 8.504551365409622e-07, "loss": 0.0012, "reward": 1.7087717652320862, "reward_std": 0.18147821724414825, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7556467354297638, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 172.83984375, "epoch": 0.1501462463438414, "grad_norm": 0.4765099883079529, "kl": 0.029541015625, "learning_rate": 8.498049414824448e-07, "loss": 0.0012, "reward": 1.7958463430404663, "reward_std": 0.12386680021882057, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8153774738311768, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 205.390625, "epoch": 0.15079623009424764, "grad_norm": 0.47415482997894287, "kl": 0.02838134765625, "learning_rate": 8.491547464239272e-07, "loss": 0.0011, "reward": 1.6976915001869202, "reward_std": 0.14118564873933792, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7445664703845978, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 179.28125, "epoch": 0.15144621384465387, "grad_norm": 0.43203601241111755, "kl": 0.0283203125, "learning_rate": 8.485045513654096e-07, "loss": 0.0011, "reward": 1.782004714012146, "reward_std": 0.1189635880291462, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7976297736167908, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 191.2109375, "epoch": 0.15209619759506013, "grad_norm": 0.5841758251190186, "kl": 0.03289794921875, "learning_rate": 8.478543563068921e-07, "loss": 0.0013, "reward": 1.7366433143615723, "reward_std": 0.17169132828712463, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7757058143615723, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 157.6640625, "epoch": 0.15274618134546636, "grad_norm": 0.4069887697696686, "kl": 0.03631591796875, "learning_rate": 8.472041612483745e-07, "loss": 0.0015, "reward": 1.8261361122131348, "reward_std": 0.09076279401779175, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.83785480260849, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 171.2421875, "epoch": 0.15339616509587262, "grad_norm": 0.446793794631958, "kl": 0.0286865234375, "learning_rate": 8.46553966189857e-07, "loss": 0.0011, "reward": 1.7995299100875854, "reward_std": 0.12296212464570999, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8073423802852631, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 167.84765625, "epoch": 0.15404614884627885, "grad_norm": 0.5116661190986633, "kl": 0.032958984375, "learning_rate": 8.459037711313394e-07, "loss": 0.0013, "reward": 1.791234016418457, "reward_std": 0.1106923408806324, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8068590760231018, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 174.48046875, "epoch": 0.15469613259668508, "grad_norm": 0.4691352844238281, "kl": 0.038818359375, "learning_rate": 8.452535760728218e-07, "loss": 0.0016, "reward": 1.7427113056182861, "reward_std": 0.1429579257965088, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7817737460136414, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 179.10546875, "epoch": 0.15534611634709133, "grad_norm": 0.8664090633392334, "kl": 0.0296630859375, "learning_rate": 8.446033810143043e-07, "loss": 0.0012, "reward": 1.7434597611427307, "reward_std": 0.15633811056613922, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7747097909450531, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 179.12109375, "epoch": 0.15599610009749756, "grad_norm": 0.510591983795166, "kl": 0.02880859375, "learning_rate": 8.439531859557867e-07, "loss": 0.0012, "reward": 1.7355501651763916, "reward_std": 0.1298944689333439, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7472688555717468, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 186.39453125, "epoch": 0.1566460838479038, "grad_norm": 0.5054064393043518, "kl": 0.0411376953125, "learning_rate": 8.433029908972692e-07, "loss": 0.0016, "reward": 1.717657446861267, "reward_std": 0.17712730914354324, "rewards/format_reward_gen": 0.9453125, "rewards/llm_reward": 0.7723449468612671, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 189.0234375, "epoch": 0.15729606759831005, "grad_norm": 0.4892916679382324, "kl": 0.02734375, "learning_rate": 8.426527958387516e-07, "loss": 0.0011, "reward": 1.7351526618003845, "reward_std": 0.16510161012411118, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7624964118003845, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 193.84765625, "epoch": 0.15794605134871628, "grad_norm": 0.5995280742645264, "kl": 0.035888671875, "learning_rate": 8.42002600780234e-07, "loss": 0.0014, "reward": 1.705505609512329, "reward_std": 0.14088347554206848, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7289431691169739, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 194.49609375, "epoch": 0.1585960350991225, "grad_norm": 0.6661954522132874, "kl": 0.0277099609375, "learning_rate": 8.413524057217165e-07, "loss": 0.0011, "reward": 1.6728763580322266, "reward_std": 0.16425494849681854, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7197513580322266, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 181.9765625, "epoch": 0.15924601884952877, "grad_norm": 0.44324225187301636, "kl": 0.03302001953125, "learning_rate": 8.407022106631989e-07, "loss": 0.0013, "reward": 1.7142429947853088, "reward_std": 0.1836128979921341, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7611179649829865, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 199.0, "epoch": 0.159896002599935, "grad_norm": 0.45210233330726624, "kl": 0.0302734375, "learning_rate": 8.400520156046814e-07, "loss": 0.0012, "reward": 1.6957476735115051, "reward_std": 0.16992917656898499, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7426226139068604, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 190.2421875, "epoch": 0.16054598635034123, "grad_norm": 0.5040702223777771, "kl": 0.02752685546875, "learning_rate": 8.394018205461638e-07, "loss": 0.0011, "reward": 1.7451273202896118, "reward_std": 0.11788241565227509, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7646585702896118, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 183.578125, "epoch": 0.1611959701007475, "grad_norm": 0.4220493733882904, "kl": 0.02728271484375, "learning_rate": 8.387516254876462e-07, "loss": 0.0011, "reward": 1.7599961757659912, "reward_std": 0.09904596954584122, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7678087055683136, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 191.8359375, "epoch": 0.16184595385115372, "grad_norm": 0.4108138084411621, "kl": 0.034912109375, "learning_rate": 8.381014304291288e-07, "loss": 0.0014, "reward": 1.7546542286872864, "reward_std": 0.15046705305576324, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7859043180942535, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 172.90625, "epoch": 0.16249593760155995, "grad_norm": 0.5560400485992432, "kl": 0.0360107421875, "learning_rate": 8.374512353706112e-07, "loss": 0.0014, "reward": 1.7402279376983643, "reward_std": 0.14881623536348343, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7753842175006866, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 177.67578125, "epoch": 0.1631459213519662, "grad_norm": 0.5883175730705261, "kl": 0.0369873046875, "learning_rate": 8.368010403120937e-07, "loss": 0.0015, "reward": 1.7161803245544434, "reward_std": 0.17633052170276642, "rewards/format_reward_gen": 0.94921875, "rewards/llm_reward": 0.7669615745544434, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 189.734375, "epoch": 0.16379590510237244, "grad_norm": 0.5339719653129578, "kl": 0.0316162109375, "learning_rate": 8.361508452535761e-07, "loss": 0.0013, "reward": 1.7880685925483704, "reward_std": 0.12561334297060966, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.811506062746048, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 188.671875, "epoch": 0.1644458888527787, "grad_norm": 0.4482637047767639, "kl": 0.02764892578125, "learning_rate": 8.355006501950585e-07, "loss": 0.0011, "reward": 1.7575886249542236, "reward_std": 0.15473860502243042, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7810261249542236, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 183.8046875, "epoch": 0.16509587260318492, "grad_norm": 0.4350739121437073, "kl": 0.0352783203125, "learning_rate": 8.34850455136541e-07, "loss": 0.0014, "reward": 1.719433307647705, "reward_std": 0.1579793058335781, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7545895874500275, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 182.2265625, "epoch": 0.16574585635359115, "grad_norm": 0.43324658274650574, "kl": 0.0286865234375, "learning_rate": 8.342002600780234e-07, "loss": 0.0011, "reward": 1.7663543820381165, "reward_std": 0.1018737144768238, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7780730128288269, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 180.18359375, "epoch": 0.1663958401039974, "grad_norm": 0.6821237802505493, "kl": 0.0263671875, "learning_rate": 8.335500650195059e-07, "loss": 0.0011, "reward": 1.7984776496887207, "reward_std": 0.09401439875364304, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8062902390956879, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 195.95703125, "epoch": 0.16704582385440364, "grad_norm": 0.4747907519340515, "kl": 0.034423828125, "learning_rate": 8.328998699609883e-07, "loss": 0.0014, "reward": 1.6857905387878418, "reward_std": 0.14122164994478226, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7170405387878418, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 197.28125, "epoch": 0.16769580760480987, "grad_norm": 0.48556309938430786, "kl": 0.0335693359375, "learning_rate": 8.322496749024707e-07, "loss": 0.0013, "reward": 1.7649497985839844, "reward_std": 0.12375162914395332, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7961997985839844, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 178.15234375, "epoch": 0.16834579135521613, "grad_norm": 0.4668838679790497, "kl": 0.0296630859375, "learning_rate": 8.315994798439532e-07, "loss": 0.0012, "reward": 1.7729778289794922, "reward_std": 0.09409546479582787, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7846965491771698, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 197.7421875, "epoch": 0.16899577510562236, "grad_norm": 0.4533539414405823, "kl": 0.02838134765625, "learning_rate": 8.309492847854356e-07, "loss": 0.0011, "reward": 1.7822368741035461, "reward_std": 0.12096904218196869, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8017681539058685, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 205.10546875, "epoch": 0.1696457588560286, "grad_norm": 0.4461776614189148, "kl": 0.02447509765625, "learning_rate": 8.302990897269181e-07, "loss": 0.001, "reward": 1.7435848712921143, "reward_std": 0.09371576830744743, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7553035616874695, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 191.20703125, "epoch": 0.17029574260643485, "grad_norm": 0.4121449887752533, "kl": 0.02545166015625, "learning_rate": 8.296488946684005e-07, "loss": 0.001, "reward": 1.7641119956970215, "reward_std": 0.10852835699915886, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7797369658946991, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 207.546875, "epoch": 0.17094572635684108, "grad_norm": 0.4971691071987152, "kl": 0.0350341796875, "learning_rate": 8.289986996098829e-07, "loss": 0.0014, "reward": 1.7656006813049316, "reward_std": 0.1640140861272812, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7968506813049316, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 187.80078125, "epoch": 0.1715957101072473, "grad_norm": 0.40449008345603943, "kl": 0.03759765625, "learning_rate": 8.283485045513654e-07, "loss": 0.0015, "reward": 1.7837939858436584, "reward_std": 0.12867777422070503, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8072314560413361, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 213.21484375, "epoch": 0.17224569385765356, "grad_norm": 0.5089178085327148, "kl": 0.02703857421875, "learning_rate": 8.276983094928479e-07, "loss": 0.0011, "reward": 1.777925968170166, "reward_std": 0.10364478081464767, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7935509979724884, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 174.03125, "epoch": 0.1728956776080598, "grad_norm": 0.4633951783180237, "kl": 0.03106689453125, "learning_rate": 8.270481144343304e-07, "loss": 0.0012, "reward": 1.8294642567634583, "reward_std": 0.11000846698880196, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8450892567634583, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 186.40234375, "epoch": 0.17354566135846602, "grad_norm": 0.5356776118278503, "kl": 0.0294189453125, "learning_rate": 8.263979193758128e-07, "loss": 0.0012, "reward": 1.7340087890625, "reward_std": 0.13520370796322823, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7496337890625, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 179.80078125, "epoch": 0.17419564510887228, "grad_norm": 0.4683663547039032, "kl": 0.02789306640625, "learning_rate": 8.257477243172951e-07, "loss": 0.0011, "reward": 1.8107211589813232, "reward_std": 0.12811756134033203, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8341586887836456, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 187.8515625, "epoch": 0.1748456288592785, "grad_norm": 0.4603193998336792, "kl": 0.03070068359375, "learning_rate": 8.250975292587777e-07, "loss": 0.0012, "reward": 1.7408462166786194, "reward_std": 0.1274801641702652, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7681899666786194, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 192.5, "epoch": 0.17549561260968477, "grad_norm": 0.42879346013069153, "kl": 0.02410888671875, "learning_rate": 8.2444733420026e-07, "loss": 0.001, "reward": 1.7740105390548706, "reward_std": 0.08346934616565704, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.777916818857193, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 190.8515625, "epoch": 0.176145596360091, "grad_norm": 0.4181022346019745, "kl": 0.02557373046875, "learning_rate": 8.237971391417425e-07, "loss": 0.001, "reward": 1.805622935295105, "reward_std": 0.08106480911374092, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8134354948997498, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 195.1796875, "epoch": 0.17679558011049723, "grad_norm": 0.5098494291305542, "kl": 0.0301513671875, "learning_rate": 8.231469440832249e-07, "loss": 0.0012, "reward": 1.7252883911132812, "reward_std": 0.11298202723264694, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7370070815086365, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 192.140625, "epoch": 0.1774455638609035, "grad_norm": 0.7500156164169312, "kl": 0.03155517578125, "learning_rate": 8.224967490247073e-07, "loss": 0.0013, "reward": 1.772659182548523, "reward_std": 0.13263439387083054, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7882842123508453, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 192.92578125, "epoch": 0.17809554761130972, "grad_norm": 0.4094005823135376, "kl": 0.02789306640625, "learning_rate": 8.218465539661898e-07, "loss": 0.0011, "reward": 1.7712860107421875, "reward_std": 0.0984211191534996, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7830047905445099, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 200.41015625, "epoch": 0.17874553136171595, "grad_norm": 0.4378266930580139, "kl": 0.02801513671875, "learning_rate": 8.211963589076722e-07, "loss": 0.0011, "reward": 1.7776845693588257, "reward_std": 0.08802752196788788, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7894032895565033, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 199.55859375, "epoch": 0.1793955151121222, "grad_norm": 0.4316923916339874, "kl": 0.02996826171875, "learning_rate": 8.205461638491547e-07, "loss": 0.0012, "reward": 1.7625699043273926, "reward_std": 0.09640281274914742, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7703824639320374, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 191.54296875, "epoch": 0.18004549886252844, "grad_norm": 0.40870729088783264, "kl": 0.0257568359375, "learning_rate": 8.198959687906371e-07, "loss": 0.001, "reward": 1.7984453439712524, "reward_std": 0.07041627168655396, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8023515343666077, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 194.19921875, "epoch": 0.18069548261293467, "grad_norm": 0.4288944602012634, "kl": 0.03314208984375, "learning_rate": 8.192457737321195e-07, "loss": 0.0013, "reward": 1.7624847888946533, "reward_std": 0.08820923790335655, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7663910686969757, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 224.81640625, "epoch": 0.18134546636334092, "grad_norm": 0.41608166694641113, "kl": 0.03216552734375, "learning_rate": 8.18595578673602e-07, "loss": 0.0013, "reward": 1.763047218322754, "reward_std": 0.11330926418304443, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7864847779273987, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 187.78125, "epoch": 0.18199545011374715, "grad_norm": 0.48637452721595764, "kl": 0.02734375, "learning_rate": 8.179453836150844e-07, "loss": 0.0011, "reward": 1.762601613998413, "reward_std": 0.09715057536959648, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7665078341960907, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 174.9921875, "epoch": 0.18264543386415338, "grad_norm": 0.43651098012924194, "kl": 0.02734375, "learning_rate": 8.172951885565669e-07, "loss": 0.0011, "reward": 1.7929569482803345, "reward_std": 0.0913686454296112, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8007694482803345, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 204.69921875, "epoch": 0.18329541761455964, "grad_norm": 0.3951195478439331, "kl": 0.02679443359375, "learning_rate": 8.166449934980494e-07, "loss": 0.0011, "reward": 1.7596476674079895, "reward_std": 0.12016720324754715, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7830850780010223, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 188.04296875, "epoch": 0.18394540136496587, "grad_norm": 0.42390990257263184, "kl": 0.036865234375, "learning_rate": 8.159947984395318e-07, "loss": 0.0015, "reward": 1.7484487295150757, "reward_std": 0.15711642801761627, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7796986401081085, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 184.81640625, "epoch": 0.18459538511537213, "grad_norm": 0.5817144513130188, "kl": 0.0284423828125, "learning_rate": 8.153446033810143e-07, "loss": 0.0011, "reward": 1.7561439275741577, "reward_std": 0.06060056388378143, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7561438977718353, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 180.57421875, "epoch": 0.18524536886577836, "grad_norm": 0.4170922636985779, "kl": 0.02581787109375, "learning_rate": 8.146944083224967e-07, "loss": 0.001, "reward": 1.820489764213562, "reward_std": 0.12137925997376442, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.840021014213562, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 206.5703125, "epoch": 0.1858953526161846, "grad_norm": 0.4897206127643585, "kl": 0.0291748046875, "learning_rate": 8.140442132639792e-07, "loss": 0.0012, "reward": 1.7129886746406555, "reward_std": 0.11566810309886932, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7286137044429779, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 172.88671875, "epoch": 0.18654533636659085, "grad_norm": 0.5149694681167603, "kl": 0.0462646484375, "learning_rate": 8.133940182054616e-07, "loss": 0.0019, "reward": 1.8077491521835327, "reward_std": 0.1106136254966259, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8233741521835327, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 196.84765625, "epoch": 0.18719532011699708, "grad_norm": 0.43654730916023254, "kl": 0.0316162109375, "learning_rate": 8.12743823146944e-07, "loss": 0.0013, "reward": 1.7701095938682556, "reward_std": 0.14524441212415695, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.789640873670578, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 182.3515625, "epoch": 0.1878453038674033, "grad_norm": 0.400662899017334, "kl": 0.0341796875, "learning_rate": 8.120936280884265e-07, "loss": 0.0014, "reward": 1.7838002443313599, "reward_std": 0.0942181758582592, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7877064943313599, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 193.625, "epoch": 0.18849528761780956, "grad_norm": 0.4677969515323639, "kl": 0.0362548828125, "learning_rate": 8.114434330299089e-07, "loss": 0.0015, "reward": 1.728374183177948, "reward_std": 0.13449250534176826, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7518116533756256, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 177.50390625, "epoch": 0.1891452713682158, "grad_norm": 0.570397138595581, "kl": 0.03057861328125, "learning_rate": 8.107932379713914e-07, "loss": 0.0012, "reward": 1.7482311725616455, "reward_std": 0.12253822386264801, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7560436725616455, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 191.4765625, "epoch": 0.18979525511862202, "grad_norm": 0.4136800467967987, "kl": 0.03155517578125, "learning_rate": 8.101430429128738e-07, "loss": 0.0013, "reward": 1.7777548432350159, "reward_std": 0.14153000712394714, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8129111528396606, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 187.41015625, "epoch": 0.19044523886902828, "grad_norm": 0.46874523162841797, "kl": 0.03515625, "learning_rate": 8.094928478543562e-07, "loss": 0.0014, "reward": 1.7244184017181396, "reward_std": 0.1667429581284523, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7517622709274292, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 173.9453125, "epoch": 0.1910952226194345, "grad_norm": 0.6158692240715027, "kl": 0.02752685546875, "learning_rate": 8.088426527958387e-07, "loss": 0.0011, "reward": 1.8153298497200012, "reward_std": 0.15121372789144516, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8465798497200012, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 171.90625, "epoch": 0.19174520636984074, "grad_norm": 0.5098119378089905, "kl": 0.0362548828125, "learning_rate": 8.081924577373211e-07, "loss": 0.0014, "reward": 1.7577843070030212, "reward_std": 0.1423262059688568, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7929405570030212, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 171.59375, "epoch": 0.192395190120247, "grad_norm": 0.896820604801178, "kl": 0.02960205078125, "learning_rate": 8.075422626788036e-07, "loss": 0.0012, "reward": 1.7646830677986145, "reward_std": 0.1367306411266327, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7842143177986145, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 161.125, "epoch": 0.19304517387065323, "grad_norm": 0.4369134306907654, "kl": 0.0399169921875, "learning_rate": 8.06892067620286e-07, "loss": 0.0016, "reward": 1.840453028678894, "reward_std": 0.09269643947482109, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.852171778678894, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 185.9296875, "epoch": 0.19369515762105946, "grad_norm": 0.4626027047634125, "kl": 0.0347900390625, "learning_rate": 8.062418725617684e-07, "loss": 0.0014, "reward": 1.77413010597229, "reward_std": 0.1368669643998146, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7897550165653229, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 182.41796875, "epoch": 0.19434514137146572, "grad_norm": 0.4475487172603607, "kl": 0.03192138671875, "learning_rate": 8.05591677503251e-07, "loss": 0.0013, "reward": 1.7803812623023987, "reward_std": 0.10350389406085014, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7920999526977539, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 177.5234375, "epoch": 0.19499512512187195, "grad_norm": 0.5676428079605103, "kl": 0.0286865234375, "learning_rate": 8.049414824447334e-07, "loss": 0.0011, "reward": 1.782118320465088, "reward_std": 0.1331407055258751, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.8211807608604431, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 173.08984375, "epoch": 0.1956451088722782, "grad_norm": 0.515059232711792, "kl": 0.0302734375, "learning_rate": 8.042912873862159e-07, "loss": 0.0012, "reward": 1.69658625125885, "reward_std": 0.14224743843078613, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7161175310611725, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 198.10546875, "epoch": 0.19629509262268444, "grad_norm": 0.4971987009048462, "kl": 0.03131103515625, "learning_rate": 8.036410923276983e-07, "loss": 0.0013, "reward": 1.7380147576332092, "reward_std": 0.14358926564455032, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7536397576332092, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 185.74609375, "epoch": 0.19694507637309067, "grad_norm": 0.5246761441230774, "kl": 0.03228759765625, "learning_rate": 8.029908972691807e-07, "loss": 0.0013, "reward": 1.6964857578277588, "reward_std": 0.1790958270430565, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7355482280254364, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 172.5234375, "epoch": 0.19759506012349692, "grad_norm": 0.5431080460548401, "kl": 0.02996826171875, "learning_rate": 8.023407022106632e-07, "loss": 0.0012, "reward": 1.727781355381012, "reward_std": 0.14567125216126442, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7512188851833344, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 178.52734375, "epoch": 0.19824504387390315, "grad_norm": 0.4479975700378418, "kl": 0.032470703125, "learning_rate": 8.016905071521456e-07, "loss": 0.0013, "reward": 1.7743908166885376, "reward_std": 0.16335195302963257, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8056408166885376, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 172.58203125, "epoch": 0.19889502762430938, "grad_norm": 0.46796560287475586, "kl": 0.03253173828125, "learning_rate": 8.010403120936281e-07, "loss": 0.0013, "reward": 1.8023510575294495, "reward_std": 0.13821449875831604, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8218822479248047, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 180.546875, "epoch": 0.19954501137471564, "grad_norm": 0.4881601929664612, "kl": 0.030029296875, "learning_rate": 8.003901170351105e-07, "loss": 0.0012, "reward": 1.7547274231910706, "reward_std": 0.12231723219156265, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7703524231910706, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 179.10546875, "epoch": 0.20019499512512187, "grad_norm": 1.0548551082611084, "kl": 0.031494140625, "learning_rate": 7.997399219765929e-07, "loss": 0.0013, "reward": 1.828441321849823, "reward_std": 0.08057626150548458, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.836253821849823, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 192.3046875, "epoch": 0.2008449788755281, "grad_norm": 0.4433007538318634, "kl": 0.034423828125, "learning_rate": 7.990897269180754e-07, "loss": 0.0014, "reward": 1.6966384053230286, "reward_std": 0.13986649364233017, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7278884053230286, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 165.6328125, "epoch": 0.20149496262593436, "grad_norm": 0.4142873287200928, "kl": 0.0328369140625, "learning_rate": 7.984395318595578e-07, "loss": 0.0013, "reward": 1.7595221996307373, "reward_std": 0.11296559125185013, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7751471698284149, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 161.62109375, "epoch": 0.2021449463763406, "grad_norm": 0.5011398196220398, "kl": 0.0345458984375, "learning_rate": 7.977893368010403e-07, "loss": 0.0014, "reward": 1.7882235050201416, "reward_std": 0.16755671054124832, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.8311922550201416, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 176.19140625, "epoch": 0.20279493012674682, "grad_norm": 0.46056002378463745, "kl": 0.0301513671875, "learning_rate": 7.971391417425227e-07, "loss": 0.0012, "reward": 1.800415575504303, "reward_std": 0.11588463187217712, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.816040575504303, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 164.546875, "epoch": 0.20344491387715308, "grad_norm": 0.5247759819030762, "kl": 0.04119873046875, "learning_rate": 7.964889466840051e-07, "loss": 0.0017, "reward": 1.7757028341293335, "reward_std": 0.09364756569266319, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7913278043270111, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 170.5, "epoch": 0.2040948976275593, "grad_norm": 0.48712193965911865, "kl": 0.04931640625, "learning_rate": 7.958387516254876e-07, "loss": 0.002, "reward": 1.7498035430908203, "reward_std": 0.15440017729997635, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7810536026954651, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 173.05078125, "epoch": 0.20474488137796554, "grad_norm": 0.47315162420272827, "kl": 0.032958984375, "learning_rate": 7.9518855656697e-07, "loss": 0.0013, "reward": 1.7841360569000244, "reward_std": 0.10974674299359322, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7997610569000244, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 160.43359375, "epoch": 0.2053948651283718, "grad_norm": 0.4631814956665039, "kl": 0.0352783203125, "learning_rate": 7.945383615084526e-07, "loss": 0.0014, "reward": 1.7508141994476318, "reward_std": 0.10910889878869057, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7781579196453094, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 180.66796875, "epoch": 0.20604484887877803, "grad_norm": 0.4483095407485962, "kl": 0.040771484375, "learning_rate": 7.93888166449935e-07, "loss": 0.0016, "reward": 1.7327216863632202, "reward_std": 0.135917529463768, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7600654363632202, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 180.19140625, "epoch": 0.20669483262918428, "grad_norm": 0.4856536388397217, "kl": 0.0341796875, "learning_rate": 7.932379713914174e-07, "loss": 0.0014, "reward": 1.7356404066085815, "reward_std": 0.10243161395192146, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7473591864109039, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 175.43359375, "epoch": 0.2073448163795905, "grad_norm": 0.40853843092918396, "kl": 0.02880859375, "learning_rate": 7.925877763328999e-07, "loss": 0.0012, "reward": 1.8285585641860962, "reward_std": 0.06914914026856422, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8363710939884186, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 185.234375, "epoch": 0.20799480012999674, "grad_norm": 0.43387311697006226, "kl": 0.02862548828125, "learning_rate": 7.919375812743823e-07, "loss": 0.0011, "reward": 1.7781727313995361, "reward_std": 0.11173965595662594, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7937977313995361, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 164.26953125, "epoch": 0.208644783880403, "grad_norm": 1.4432930946350098, "kl": 0.0377197265625, "learning_rate": 7.912873862158648e-07, "loss": 0.0015, "reward": 1.7641202211380005, "reward_std": 0.12447617575526237, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7797451913356781, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 166.71875, "epoch": 0.20929476763080923, "grad_norm": 0.46623605489730835, "kl": 0.0335693359375, "learning_rate": 7.906371911573472e-07, "loss": 0.0013, "reward": 1.7853580117225647, "reward_std": 0.12446978315711021, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7970767617225647, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 186.83984375, "epoch": 0.20994475138121546, "grad_norm": 0.5220369696617126, "kl": 0.041748046875, "learning_rate": 7.899869960988296e-07, "loss": 0.0017, "reward": 1.7213578820228577, "reward_std": 0.18932301551103592, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7604203522205353, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 176.421875, "epoch": 0.21059473513162172, "grad_norm": 0.38918402791023254, "kl": 0.0333251953125, "learning_rate": 7.893368010403121e-07, "loss": 0.0013, "reward": 1.822035551071167, "reward_std": 0.11059433966875076, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.837660551071167, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 208.3046875, "epoch": 0.21124471888202795, "grad_norm": 0.4458601474761963, "kl": 0.033935546875, "learning_rate": 7.886866059817945e-07, "loss": 0.0014, "reward": 1.7191236019134521, "reward_std": 0.1259307563304901, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7425611615180969, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 181.75, "epoch": 0.21189470263243418, "grad_norm": 0.4456200897693634, "kl": 0.0340576171875, "learning_rate": 7.88036410923277e-07, "loss": 0.0014, "reward": 1.8072214722633362, "reward_std": 0.1399131566286087, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8384714722633362, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 181.41015625, "epoch": 0.21254468638284044, "grad_norm": 0.46492260694503784, "kl": 0.03387451171875, "learning_rate": 7.873862158647594e-07, "loss": 0.0014, "reward": 1.7898766994476318, "reward_std": 0.10886027663946152, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8055016696453094, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 184.09375, "epoch": 0.21319467013324667, "grad_norm": 0.45180845260620117, "kl": 0.0274658203125, "learning_rate": 7.867360208062418e-07, "loss": 0.0011, "reward": 1.8081932663917542, "reward_std": 0.07757536321878433, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8160057961940765, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 190.39453125, "epoch": 0.2138446538836529, "grad_norm": 0.4440869688987732, "kl": 0.0361328125, "learning_rate": 7.860858257477243e-07, "loss": 0.0014, "reward": 1.7508025169372559, "reward_std": 0.12756097689270973, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7742400467395782, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 169.62890625, "epoch": 0.21449463763405915, "grad_norm": 0.47388067841529846, "kl": 0.03466796875, "learning_rate": 7.854356306892067e-07, "loss": 0.0014, "reward": 1.7960801720619202, "reward_std": 0.09899366647005081, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8038927018642426, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 189.65234375, "epoch": 0.21514462138446538, "grad_norm": 0.41413426399230957, "kl": 0.0386962890625, "learning_rate": 7.847854356306892e-07, "loss": 0.0016, "reward": 1.7560593485832214, "reward_std": 0.12399476766586304, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7912156283855438, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 188.95703125, "epoch": 0.21579460513487164, "grad_norm": 0.4718843698501587, "kl": 0.02838134765625, "learning_rate": 7.841352405721716e-07, "loss": 0.0011, "reward": 1.7568809986114502, "reward_std": 0.1172739565372467, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.780318558216095, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 187.40234375, "epoch": 0.21644458888527787, "grad_norm": 0.41248390078544617, "kl": 0.0322265625, "learning_rate": 7.83485045513654e-07, "loss": 0.0013, "reward": 1.7435346245765686, "reward_std": 0.1010383814573288, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7708784341812134, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 162.34375, "epoch": 0.2170945726356841, "grad_norm": 0.4707634150981903, "kl": 0.0347900390625, "learning_rate": 7.828348504551366e-07, "loss": 0.0014, "reward": 1.7558313608169556, "reward_std": 0.1261696070432663, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7675500512123108, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 188.4140625, "epoch": 0.21774455638609036, "grad_norm": 0.3840872049331665, "kl": 0.02294921875, "learning_rate": 7.82184655396619e-07, "loss": 0.0009, "reward": 1.8180765509605408, "reward_std": 0.10218833386898041, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.833701491355896, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 169.703125, "epoch": 0.2183945401364966, "grad_norm": 0.41291555762290955, "kl": 0.0322265625, "learning_rate": 7.815344603381015e-07, "loss": 0.0013, "reward": 1.7838048338890076, "reward_std": 0.13450215756893158, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8033360242843628, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 184.0, "epoch": 0.21904452388690282, "grad_norm": 0.42080187797546387, "kl": 0.03094482421875, "learning_rate": 7.808842652795839e-07, "loss": 0.0012, "reward": 1.7887045741081238, "reward_std": 0.12142607942223549, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8043296039104462, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 181.796875, "epoch": 0.21969450763730908, "grad_norm": 0.503686785697937, "kl": 0.0341796875, "learning_rate": 7.802340702210663e-07, "loss": 0.0014, "reward": 1.7486024498939514, "reward_std": 0.18359223753213882, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7837586402893066, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 186.21484375, "epoch": 0.2203444913877153, "grad_norm": 0.39749786257743835, "kl": 0.02667236328125, "learning_rate": 7.795838751625488e-07, "loss": 0.0011, "reward": 1.8017954230308533, "reward_std": 0.10183745622634888, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8174204230308533, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 180.45703125, "epoch": 0.22099447513812154, "grad_norm": 0.41666752099990845, "kl": 0.02630615234375, "learning_rate": 7.789336801040312e-07, "loss": 0.0011, "reward": 1.7953996658325195, "reward_std": 0.07082703895866871, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7993059754371643, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 206.23828125, "epoch": 0.2216444588885278, "grad_norm": 0.5280798077583313, "kl": 0.02593994140625, "learning_rate": 7.782834850455137e-07, "loss": 0.001, "reward": 1.7369862794876099, "reward_std": 0.15816929936408997, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7721425294876099, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 171.07421875, "epoch": 0.22229444263893403, "grad_norm": 0.4191420078277588, "kl": 0.035400390625, "learning_rate": 7.776332899869961e-07, "loss": 0.0014, "reward": 1.7994737029075623, "reward_std": 0.10678522288799286, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8111924529075623, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 194.2109375, "epoch": 0.22294442638934026, "grad_norm": 0.499418705701828, "kl": 0.039306640625, "learning_rate": 7.769830949284785e-07, "loss": 0.0016, "reward": 1.7583361864089966, "reward_std": 0.11116430163383484, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7700549066066742, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 173.328125, "epoch": 0.2235944101397465, "grad_norm": 0.4613669514656067, "kl": 0.0343017578125, "learning_rate": 7.76332899869961e-07, "loss": 0.0014, "reward": 1.808002233505249, "reward_std": 0.10106742009520531, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8236272931098938, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 191.51953125, "epoch": 0.22424439389015274, "grad_norm": 0.9883086085319519, "kl": 0.0382080078125, "learning_rate": 7.756827048114434e-07, "loss": 0.0015, "reward": 1.768822193145752, "reward_std": 0.15787237137556076, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.796165943145752, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 185.78125, "epoch": 0.22489437764055897, "grad_norm": 0.9995537400245667, "kl": 0.0352783203125, "learning_rate": 7.750325097529259e-07, "loss": 0.0014, "reward": 1.776960015296936, "reward_std": 0.11729512363672256, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.792585015296936, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 190.1640625, "epoch": 0.22554436139096523, "grad_norm": 0.4206548035144806, "kl": 0.02874755859375, "learning_rate": 7.743823146944083e-07, "loss": 0.0012, "reward": 1.8033357858657837, "reward_std": 0.1283932402729988, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8228669762611389, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 181.8359375, "epoch": 0.22619434514137146, "grad_norm": 0.49880480766296387, "kl": 0.030517578125, "learning_rate": 7.737321196358907e-07, "loss": 0.0012, "reward": 1.725852608680725, "reward_std": 0.15950699523091316, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7610088288784027, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 180.6328125, "epoch": 0.22684432889177772, "grad_norm": 0.43126803636550903, "kl": 0.03118896484375, "learning_rate": 7.730819245773732e-07, "loss": 0.0012, "reward": 1.769345998764038, "reward_std": 0.08276466280221939, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7771585583686829, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 192.84375, "epoch": 0.22749431264218395, "grad_norm": 0.4327355623245239, "kl": 0.02801513671875, "learning_rate": 7.724317295188556e-07, "loss": 0.0011, "reward": 1.7838518619537354, "reward_std": 0.10311364382505417, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7994768619537354, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 185.75, "epoch": 0.22814429639259018, "grad_norm": 0.42330265045166016, "kl": 0.03167724609375, "learning_rate": 7.717815344603382e-07, "loss": 0.0013, "reward": 1.7369843125343323, "reward_std": 0.15117451548576355, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7682342827320099, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 190.88671875, "epoch": 0.22879428014299644, "grad_norm": 0.4655004143714905, "kl": 0.03173828125, "learning_rate": 7.711313394018206e-07, "loss": 0.0013, "reward": 1.7632811069488525, "reward_std": 0.1140737272799015, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7828123569488525, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 193.01953125, "epoch": 0.22944426389340267, "grad_norm": 0.47968244552612305, "kl": 0.02545166015625, "learning_rate": 7.70481144343303e-07, "loss": 0.001, "reward": 1.7890470027923584, "reward_std": 0.07343822531402111, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7968595027923584, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 190.23828125, "epoch": 0.2300942476438089, "grad_norm": 1.0541473627090454, "kl": 0.025390625, "learning_rate": 7.698309492847855e-07, "loss": 0.001, "reward": 1.7852169275283813, "reward_std": 0.11519115045666695, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8008419573307037, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 210.44921875, "epoch": 0.23074423139421515, "grad_norm": 0.4518624246120453, "kl": 0.0355224609375, "learning_rate": 7.691807542262678e-07, "loss": 0.0014, "reward": 1.774196207523346, "reward_std": 0.12635523825883865, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7898212373256683, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 199.03515625, "epoch": 0.23139421514462138, "grad_norm": 0.4113123416900635, "kl": 0.02392578125, "learning_rate": 7.685305591677504e-07, "loss": 0.001, "reward": 1.7781568765640259, "reward_std": 0.10711773112416267, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7898755967617035, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 197.07421875, "epoch": 0.23204419889502761, "grad_norm": 0.4381250739097595, "kl": 0.02716064453125, "learning_rate": 7.678803641092327e-07, "loss": 0.0011, "reward": 1.7063517570495605, "reward_std": 0.14815658703446388, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7376017868518829, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 194.87890625, "epoch": 0.23269418264543387, "grad_norm": 0.3943432867527008, "kl": 0.02593994140625, "learning_rate": 7.672301690507151e-07, "loss": 0.001, "reward": 1.835715115070343, "reward_std": 0.10575883090496063, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8552464246749878, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 191.7421875, "epoch": 0.2333441663958401, "grad_norm": 0.42345210909843445, "kl": 0.02960205078125, "learning_rate": 7.665799739921976e-07, "loss": 0.0012, "reward": 1.7707967162132263, "reward_std": 0.15350254625082016, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7942341864109039, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 181.09765625, "epoch": 0.23399415014624633, "grad_norm": 0.45998910069465637, "kl": 0.033447265625, "learning_rate": 7.6592977893368e-07, "loss": 0.0013, "reward": 1.7425235509872437, "reward_std": 0.13317134231328964, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7659610509872437, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 191.6875, "epoch": 0.2346441338966526, "grad_norm": 0.415353387594223, "kl": 0.02392578125, "learning_rate": 7.652795838751625e-07, "loss": 0.001, "reward": 1.8394846320152283, "reward_std": 0.08725294843316078, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8472971618175507, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 190.31640625, "epoch": 0.23529411764705882, "grad_norm": 0.48643869161605835, "kl": 0.02581787109375, "learning_rate": 7.646293888166449e-07, "loss": 0.001, "reward": 1.8091813921928406, "reward_std": 0.12326768785715103, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8248063921928406, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 197.2109375, "epoch": 0.23594410139746505, "grad_norm": 0.47447580099105835, "kl": 0.02587890625, "learning_rate": 7.639791937581273e-07, "loss": 0.001, "reward": 1.7414993047714233, "reward_std": 0.09406448528170586, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7454055547714233, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 203.703125, "epoch": 0.2365940851478713, "grad_norm": 0.4416190981864929, "kl": 0.03472900390625, "learning_rate": 7.633289986996098e-07, "loss": 0.0014, "reward": 1.714638590812683, "reward_std": 0.16412311047315598, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7537010908126831, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 182.625, "epoch": 0.23724406889827754, "grad_norm": 0.4077817499637604, "kl": 0.0294189453125, "learning_rate": 7.626788036410922e-07, "loss": 0.0012, "reward": 1.7996890544891357, "reward_std": 0.12557120621204376, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8153141140937805, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 190.64453125, "epoch": 0.2378940526486838, "grad_norm": 0.42463400959968567, "kl": 0.0333251953125, "learning_rate": 7.620286085825747e-07, "loss": 0.0013, "reward": 1.7978031039237976, "reward_std": 0.1391490362584591, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8212405741214752, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 196.08984375, "epoch": 0.23854403639909003, "grad_norm": 0.4273701310157776, "kl": 0.02880859375, "learning_rate": 7.613784135240571e-07, "loss": 0.0012, "reward": 1.7556397318840027, "reward_std": 0.1408524066209793, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7829834222793579, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 193.14453125, "epoch": 0.23919402014949626, "grad_norm": 0.46962493658065796, "kl": 0.025146484375, "learning_rate": 7.607282184655396e-07, "loss": 0.001, "reward": 1.7877516746520996, "reward_std": 0.10761094838380814, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8033767342567444, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 199.53125, "epoch": 0.2398440038999025, "grad_norm": 1.8874495029449463, "kl": 0.03369140625, "learning_rate": 7.600780234070221e-07, "loss": 0.0013, "reward": 1.7325694561004639, "reward_std": 0.14276714622974396, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7599132657051086, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 174.67578125, "epoch": 0.24049398765030874, "grad_norm": 0.5393614768981934, "kl": 0.038818359375, "learning_rate": 7.594278283485045e-07, "loss": 0.0016, "reward": 1.7723038792610168, "reward_std": 0.1253133825957775, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7840226888656616, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 178.75390625, "epoch": 0.24114397140071497, "grad_norm": 0.44769948720932007, "kl": 0.0272216796875, "learning_rate": 7.58777633289987e-07, "loss": 0.0011, "reward": 1.7596750259399414, "reward_std": 0.12606630474328995, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7753000557422638, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 186.4921875, "epoch": 0.24179395515112123, "grad_norm": 0.4238332211971283, "kl": 0.034912109375, "learning_rate": 7.581274382314694e-07, "loss": 0.0014, "reward": 1.797433316707611, "reward_std": 0.12025081366300583, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8169645667076111, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 193.26953125, "epoch": 0.24244393890152746, "grad_norm": 0.38761091232299805, "kl": 0.0318603515625, "learning_rate": 7.574772431729518e-07, "loss": 0.0013, "reward": 1.743790626525879, "reward_std": 0.1504654735326767, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7750405371189117, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 201.33984375, "epoch": 0.2430939226519337, "grad_norm": 0.4119164049625397, "kl": 0.0311279296875, "learning_rate": 7.568270481144343e-07, "loss": 0.0012, "reward": 1.8080140352249146, "reward_std": 0.07587588578462601, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8158265650272369, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 189.546875, "epoch": 0.24374390640233995, "grad_norm": 0.40127241611480713, "kl": 0.03271484375, "learning_rate": 7.561768530559167e-07, "loss": 0.0013, "reward": 1.8326174020767212, "reward_std": 0.11404037661850452, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8443361520767212, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 175.33984375, "epoch": 0.24439389015274618, "grad_norm": 0.45222049951553345, "kl": 0.0263671875, "learning_rate": 7.555266579973992e-07, "loss": 0.0011, "reward": 1.776690661907196, "reward_std": 0.12548404932022095, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7923156917095184, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 193.40625, "epoch": 0.2450438739031524, "grad_norm": 0.42485570907592773, "kl": 0.035400390625, "learning_rate": 7.548764629388816e-07, "loss": 0.0014, "reward": 1.799051582813263, "reward_std": 0.11422578245401382, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8185828626155853, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 188.796875, "epoch": 0.24569385765355867, "grad_norm": 0.4774068295955658, "kl": 0.0335693359375, "learning_rate": 7.54226267880364e-07, "loss": 0.0013, "reward": 1.77499920129776, "reward_std": 0.11130572110414505, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7906241714954376, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 202.6953125, "epoch": 0.2463438414039649, "grad_norm": 0.4346219003200531, "kl": 0.0294189453125, "learning_rate": 7.535760728218465e-07, "loss": 0.0012, "reward": 1.7662047743797302, "reward_std": 0.12422113865613937, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7896422445774078, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 191.7734375, "epoch": 0.24699382515437113, "grad_norm": 0.6779094934463501, "kl": 0.0396728515625, "learning_rate": 7.529258777633289e-07, "loss": 0.0016, "reward": 1.782205045223236, "reward_std": 0.13697300478816032, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8095488548278809, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 181.21484375, "epoch": 0.24764380890477738, "grad_norm": 0.46399644017219543, "kl": 0.02886962890625, "learning_rate": 7.522756827048114e-07, "loss": 0.0012, "reward": 1.7914424538612366, "reward_std": 0.10709996521472931, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8070674240589142, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 181.5, "epoch": 0.24829379265518361, "grad_norm": 0.40981975197792053, "kl": 0.02886962890625, "learning_rate": 7.516254876462938e-07, "loss": 0.0012, "reward": 1.743277668952942, "reward_std": 0.10544325411319733, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7549964189529419, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 203.66015625, "epoch": 0.24894377640558987, "grad_norm": 0.43432676792144775, "kl": 0.02972412109375, "learning_rate": 7.509752925877762e-07, "loss": 0.0012, "reward": 1.8046414256095886, "reward_std": 0.10934992134571075, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8202664256095886, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 188.0234375, "epoch": 0.2495937601559961, "grad_norm": 0.4472021758556366, "kl": 0.03564453125, "learning_rate": 7.503250975292587e-07, "loss": 0.0014, "reward": 1.7895944714546204, "reward_std": 0.10334055870771408, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8052195310592651, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 181.6484375, "epoch": 0.25024374390640236, "grad_norm": 0.39117518067359924, "kl": 0.033935546875, "learning_rate": 7.496749024707412e-07, "loss": 0.0014, "reward": 1.7638800144195557, "reward_std": 0.1309705302119255, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7912237048149109, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 175.87890625, "epoch": 0.25089372765680856, "grad_norm": 0.4548526704311371, "kl": 0.038818359375, "learning_rate": 7.490247074122237e-07, "loss": 0.0016, "reward": 1.7863773107528687, "reward_std": 0.14282897114753723, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8098147213459015, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 187.234375, "epoch": 0.2515437114072148, "grad_norm": 0.47016966342926025, "kl": 0.02923583984375, "learning_rate": 7.483745123537061e-07, "loss": 0.0012, "reward": 1.7690533995628357, "reward_std": 0.12117179483175278, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7924909591674805, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 177.921875, "epoch": 0.2521936951576211, "grad_norm": 0.4501223862171173, "kl": 0.0322265625, "learning_rate": 7.477243172951885e-07, "loss": 0.0013, "reward": 1.7491596341133118, "reward_std": 0.10184172168374062, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7647846639156342, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 187.31640625, "epoch": 0.2528436789080273, "grad_norm": 0.49668341875076294, "kl": 0.0335693359375, "learning_rate": 7.47074122236671e-07, "loss": 0.0013, "reward": 1.758347988128662, "reward_std": 0.11678595840930939, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7778792381286621, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 184.484375, "epoch": 0.25349366265843354, "grad_norm": 0.4732310175895691, "kl": 0.02972412109375, "learning_rate": 7.464239271781534e-07, "loss": 0.0012, "reward": 1.7992894649505615, "reward_std": 0.09446707367897034, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8110081851482391, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 195.71484375, "epoch": 0.2541436464088398, "grad_norm": 0.5084022879600525, "kl": 0.02874755859375, "learning_rate": 7.457737321196359e-07, "loss": 0.0011, "reward": 1.7563305497169495, "reward_std": 0.1488976925611496, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7797681093215942, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 181.58203125, "epoch": 0.254793630159246, "grad_norm": 0.3991442918777466, "kl": 0.02935791015625, "learning_rate": 7.451235370611183e-07, "loss": 0.0012, "reward": 1.7505470514297485, "reward_std": 0.08758186548948288, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7700783014297485, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 176.0234375, "epoch": 0.25544361390965226, "grad_norm": 0.5102388858795166, "kl": 0.03009033203125, "learning_rate": 7.444733420026007e-07, "loss": 0.0012, "reward": 1.7617217898368835, "reward_std": 0.1306707188487053, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7851592600345612, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 194.48046875, "epoch": 0.2560935976600585, "grad_norm": 0.41230806708335876, "kl": 0.02899169921875, "learning_rate": 7.438231469440832e-07, "loss": 0.0012, "reward": 1.807016134262085, "reward_std": 0.1262124478816986, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8265474140644073, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 218.41015625, "epoch": 0.2567435814104647, "grad_norm": 0.44964876770973206, "kl": 0.0301513671875, "learning_rate": 7.431729518855656e-07, "loss": 0.0012, "reward": 1.7397547960281372, "reward_std": 0.11447279527783394, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7631922662258148, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 173.6015625, "epoch": 0.257393565160871, "grad_norm": 0.406356543302536, "kl": 0.0333251953125, "learning_rate": 7.425227568270481e-07, "loss": 0.0013, "reward": 1.788908839225769, "reward_std": 0.1290540024638176, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.820158839225769, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 208.421875, "epoch": 0.25804354891127723, "grad_norm": 0.41417792439460754, "kl": 0.03656005859375, "learning_rate": 7.418725617685305e-07, "loss": 0.0015, "reward": 1.7550159096717834, "reward_std": 0.1473376527428627, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7940783500671387, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 181.3359375, "epoch": 0.25869353266168343, "grad_norm": 0.46675431728363037, "kl": 0.0472412109375, "learning_rate": 7.412223667100129e-07, "loss": 0.0019, "reward": 1.7766024470329285, "reward_std": 0.10571258515119553, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7883211672306061, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 189.92578125, "epoch": 0.2593435164120897, "grad_norm": 0.4285334050655365, "kl": 0.02862548828125, "learning_rate": 7.405721716514954e-07, "loss": 0.0011, "reward": 1.7771927118301392, "reward_std": 0.09566831588745117, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7850052714347839, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 183.984375, "epoch": 0.25999350016249595, "grad_norm": 0.5521664619445801, "kl": 0.0406494140625, "learning_rate": 7.399219765929778e-07, "loss": 0.0016, "reward": 1.7467612028121948, "reward_std": 0.1260315552353859, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7701986730098724, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 187.2421875, "epoch": 0.26064348391290215, "grad_norm": 0.45571357011795044, "kl": 0.0389404296875, "learning_rate": 7.392717815344603e-07, "loss": 0.0016, "reward": 1.7747647166252136, "reward_std": 0.12907645106315613, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7942959368228912, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 188.4765625, "epoch": 0.2612934676633084, "grad_norm": 0.6379846930503845, "kl": 0.0264892578125, "learning_rate": 7.386215864759428e-07, "loss": 0.0011, "reward": 1.8021169900894165, "reward_std": 0.10282481834292412, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8138357698917389, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 182.12890625, "epoch": 0.26194345141371467, "grad_norm": 0.5321911573410034, "kl": 0.0369873046875, "learning_rate": 7.379713914174252e-07, "loss": 0.0015, "reward": 1.7491355538368225, "reward_std": 0.1240260973572731, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7647605538368225, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 186.28125, "epoch": 0.26259343516412087, "grad_norm": 0.5421339273452759, "kl": 0.03326416015625, "learning_rate": 7.373211963589077e-07, "loss": 0.0013, "reward": 1.7340133786201477, "reward_std": 0.13682317733764648, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7730758488178253, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 181.89453125, "epoch": 0.2632434189145271, "grad_norm": 0.4921005666255951, "kl": 0.03662109375, "learning_rate": 7.366710013003901e-07, "loss": 0.0015, "reward": 1.792698621749878, "reward_std": 0.15483372658491135, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8161362409591675, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 189.72265625, "epoch": 0.2638934026649334, "grad_norm": 0.47232699394226074, "kl": 0.03363037109375, "learning_rate": 7.360208062418726e-07, "loss": 0.0013, "reward": 1.7382976412773132, "reward_std": 0.16221433877944946, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7734539806842804, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 178.390625, "epoch": 0.26454338641533964, "grad_norm": 0.44989368319511414, "kl": 0.02801513671875, "learning_rate": 7.35370611183355e-07, "loss": 0.0011, "reward": 1.8238263130187988, "reward_std": 0.07950492948293686, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.827732503414154, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 180.6484375, "epoch": 0.26519337016574585, "grad_norm": 0.4771304130554199, "kl": 0.0418701171875, "learning_rate": 7.347204161248374e-07, "loss": 0.0017, "reward": 1.722359299659729, "reward_std": 0.14654317498207092, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7418905198574066, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 194.24609375, "epoch": 0.2658433539161521, "grad_norm": 0.4018694758415222, "kl": 0.03045654296875, "learning_rate": 7.340702210663199e-07, "loss": 0.0012, "reward": 1.8138460516929626, "reward_std": 0.08975718170404434, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8216585516929626, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 184.0625, "epoch": 0.26649333766655836, "grad_norm": 0.4213220775127411, "kl": 0.036865234375, "learning_rate": 7.334200260078023e-07, "loss": 0.0015, "reward": 1.777735710144043, "reward_std": 0.1216619610786438, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7972669899463654, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 176.1015625, "epoch": 0.26714332141696456, "grad_norm": 0.4685880243778229, "kl": 0.03045654296875, "learning_rate": 7.327698309492848e-07, "loss": 0.0012, "reward": 1.8078132271766663, "reward_std": 0.1306670643389225, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.827344536781311, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 187.67578125, "epoch": 0.2677933051673708, "grad_norm": 0.4742574989795685, "kl": 0.0299072265625, "learning_rate": 7.321196358907672e-07, "loss": 0.0012, "reward": 1.7970709800720215, "reward_std": 0.10969995334744453, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8048835396766663, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 205.67578125, "epoch": 0.2684432889177771, "grad_norm": 0.49834513664245605, "kl": 0.02642822265625, "learning_rate": 7.314694408322496e-07, "loss": 0.0011, "reward": 1.7572205662727356, "reward_std": 0.08142505586147308, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7572205364704132, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 197.8359375, "epoch": 0.2690932726681833, "grad_norm": 0.6015698313713074, "kl": 0.0294189453125, "learning_rate": 7.308192457737321e-07, "loss": 0.0012, "reward": 1.7633637189865112, "reward_std": 0.13184166699647903, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7907074689865112, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 184.09765625, "epoch": 0.26974325641858954, "grad_norm": 0.5241138935089111, "kl": 0.0291748046875, "learning_rate": 7.301690507152145e-07, "loss": 0.0012, "reward": 1.808104157447815, "reward_std": 0.0986228808760643, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8198229074478149, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 210.42578125, "epoch": 0.2703932401689958, "grad_norm": 1.0910027027130127, "kl": 0.03179931640625, "learning_rate": 7.29518855656697e-07, "loss": 0.0013, "reward": 1.7045276761054993, "reward_std": 0.15764447301626205, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7318713963031769, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 177.625, "epoch": 0.271043223919402, "grad_norm": 0.4701594114303589, "kl": 0.02960205078125, "learning_rate": 7.288686605981794e-07, "loss": 0.0012, "reward": 1.786037802696228, "reward_std": 0.08820248767733574, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7977564632892609, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 185.58984375, "epoch": 0.27169320766980826, "grad_norm": 0.38519471883773804, "kl": 0.03179931640625, "learning_rate": 7.282184655396618e-07, "loss": 0.0013, "reward": 1.805063247680664, "reward_std": 0.07876692339777946, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8245944380760193, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 189.015625, "epoch": 0.2723431914202145, "grad_norm": 0.4672132134437561, "kl": 0.03515625, "learning_rate": 7.275682704811444e-07, "loss": 0.0014, "reward": 1.7133662104606628, "reward_std": 0.09301316738128662, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7172724902629852, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 182.08984375, "epoch": 0.2729931751706207, "grad_norm": 0.4171793758869171, "kl": 0.0330810546875, "learning_rate": 7.269180754226268e-07, "loss": 0.0013, "reward": 1.8296454548835754, "reward_std": 0.0999644473195076, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8413642644882202, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 187.328125, "epoch": 0.273643158921027, "grad_norm": 0.4740014672279358, "kl": 0.0311279296875, "learning_rate": 7.262678803641093e-07, "loss": 0.0012, "reward": 1.8207770586013794, "reward_std": 0.08532674983143806, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8324957489967346, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 200.7421875, "epoch": 0.27429314267143323, "grad_norm": 0.4572932720184326, "kl": 0.02777099609375, "learning_rate": 7.256176853055917e-07, "loss": 0.0011, "reward": 1.76516592502594, "reward_std": 0.13146037980914116, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7964159250259399, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 178.0078125, "epoch": 0.27494312642183943, "grad_norm": 0.5324826240539551, "kl": 0.03448486328125, "learning_rate": 7.249674902470741e-07, "loss": 0.0014, "reward": 1.806324303150177, "reward_std": 0.10230845957994461, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.821949303150177, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 200.296875, "epoch": 0.2755931101722457, "grad_norm": 0.6467932462692261, "kl": 0.03533935546875, "learning_rate": 7.243172951885566e-07, "loss": 0.0014, "reward": 1.750683605670929, "reward_std": 0.14327062666416168, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.770214855670929, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 184.578125, "epoch": 0.27624309392265195, "grad_norm": 0.5023516416549683, "kl": 0.03369140625, "learning_rate": 7.23667100130039e-07, "loss": 0.0013, "reward": 1.74515038728714, "reward_std": 0.10631689801812172, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7568690776824951, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 192.453125, "epoch": 0.27689307767305815, "grad_norm": 1.3008544445037842, "kl": 0.03564453125, "learning_rate": 7.230169050715215e-07, "loss": 0.0014, "reward": 1.7657435536384583, "reward_std": 0.14773748815059662, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7930873036384583, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 190.453125, "epoch": 0.2775430614234644, "grad_norm": 0.3682793080806732, "kl": 0.03131103515625, "learning_rate": 7.223667100130039e-07, "loss": 0.0013, "reward": 1.8312984704971313, "reward_std": 0.06009410694241524, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8352047204971313, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 170.1640625, "epoch": 0.27819304517387067, "grad_norm": 0.46865320205688477, "kl": 0.0411376953125, "learning_rate": 7.217165149544863e-07, "loss": 0.0016, "reward": 1.7738690376281738, "reward_std": 0.16674728319048882, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8051190078258514, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 168.8125, "epoch": 0.27884302892427687, "grad_norm": 0.4554998278617859, "kl": 0.03790283203125, "learning_rate": 7.210663198959688e-07, "loss": 0.0015, "reward": 1.8122961521148682, "reward_std": 0.13506393879652023, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8318274021148682, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 176.86328125, "epoch": 0.27949301267468313, "grad_norm": 0.4432987868785858, "kl": 0.029296875, "learning_rate": 7.204161248374512e-07, "loss": 0.0012, "reward": 1.770066499710083, "reward_std": 0.08064586669206619, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7895977795124054, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 180.609375, "epoch": 0.2801429964250894, "grad_norm": 0.4235861897468567, "kl": 0.0350341796875, "learning_rate": 7.197659297789337e-07, "loss": 0.0014, "reward": 1.8251872658729553, "reward_std": 0.1139419749379158, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8486247658729553, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 180.4375, "epoch": 0.2807929801754956, "grad_norm": 0.4701739251613617, "kl": 0.04180908203125, "learning_rate": 7.191157347204161e-07, "loss": 0.0017, "reward": 1.77414870262146, "reward_std": 0.08908676728606224, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.78196120262146, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 181.42578125, "epoch": 0.28144296392590185, "grad_norm": 0.4306628704071045, "kl": 0.0330810546875, "learning_rate": 7.184655396618985e-07, "loss": 0.0013, "reward": 1.7990790605545044, "reward_std": 0.06598228216171265, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8029853403568268, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 178.40625, "epoch": 0.2820929476763081, "grad_norm": 0.40625131130218506, "kl": 0.02911376953125, "learning_rate": 7.17815344603381e-07, "loss": 0.0012, "reward": 1.8359938263893127, "reward_std": 0.09348307177424431, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8516188263893127, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 176.51953125, "epoch": 0.2827429314267143, "grad_norm": 3.1480090618133545, "kl": 0.0323486328125, "learning_rate": 7.171651495448634e-07, "loss": 0.0013, "reward": 1.7969990968704224, "reward_std": 0.11574871093034744, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8087179064750671, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 187.36328125, "epoch": 0.28339291517712056, "grad_norm": 0.4972032606601715, "kl": 0.035400390625, "learning_rate": 7.16514954486346e-07, "loss": 0.0014, "reward": 1.7261755466461182, "reward_std": 0.1295272260904312, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7535193264484406, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 182.79296875, "epoch": 0.2840428989275268, "grad_norm": 0.47000494599342346, "kl": 0.0303955078125, "learning_rate": 7.158647594278284e-07, "loss": 0.0012, "reward": 1.8050374388694763, "reward_std": 0.06670941784977913, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8128499686717987, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 173.4609375, "epoch": 0.2846928826779331, "grad_norm": 0.40936219692230225, "kl": 0.02899169921875, "learning_rate": 7.152145643693108e-07, "loss": 0.0012, "reward": 1.807050108909607, "reward_std": 0.07543875649571419, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8109564185142517, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 172.203125, "epoch": 0.2853428664283393, "grad_norm": 0.5007762312889099, "kl": 0.0347900390625, "learning_rate": 7.145643693107933e-07, "loss": 0.0014, "reward": 1.7758764028549194, "reward_std": 0.12026319280266762, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7954076826572418, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 186.62109375, "epoch": 0.28599285017874554, "grad_norm": 0.4724201560020447, "kl": 0.03192138671875, "learning_rate": 7.139141742522757e-07, "loss": 0.0013, "reward": 1.796559751033783, "reward_std": 0.10698577016592026, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8160910606384277, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 172.48828125, "epoch": 0.2866428339291518, "grad_norm": 0.43877699971199036, "kl": 0.0345458984375, "learning_rate": 7.132639791937582e-07, "loss": 0.0014, "reward": 1.7344261407852173, "reward_std": 0.08844507113099098, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7461448907852173, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 166.6796875, "epoch": 0.287292817679558, "grad_norm": 0.4692077338695526, "kl": 0.038818359375, "learning_rate": 7.126137841352406e-07, "loss": 0.0016, "reward": 1.8309828639030457, "reward_std": 0.09183839336037636, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.834889143705368, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 194.50390625, "epoch": 0.28794280142996426, "grad_norm": 0.4495907127857208, "kl": 0.05029296875, "learning_rate": 7.11963589076723e-07, "loss": 0.002, "reward": 1.784292459487915, "reward_std": 0.10931423678994179, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8116362392902374, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 172.13671875, "epoch": 0.2885927851803705, "grad_norm": 0.5317756533622742, "kl": 0.03955078125, "learning_rate": 7.113133940182054e-07, "loss": 0.0016, "reward": 1.7448713779449463, "reward_std": 0.14717362076044083, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7683088183403015, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 174.8359375, "epoch": 0.2892427689307767, "grad_norm": 0.47538718581199646, "kl": 0.0350341796875, "learning_rate": 7.106631989596878e-07, "loss": 0.0014, "reward": 1.770770251750946, "reward_std": 0.09332285821437836, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7824890315532684, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 182.19140625, "epoch": 0.289892752681183, "grad_norm": 0.4155156910419464, "kl": 0.0390625, "learning_rate": 7.100130039011703e-07, "loss": 0.0016, "reward": 1.8541619181632996, "reward_std": 0.08792497217655182, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8658806681632996, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 173.640625, "epoch": 0.29054273643158923, "grad_norm": 0.4021815061569214, "kl": 0.03173828125, "learning_rate": 7.093628088426527e-07, "loss": 0.0013, "reward": 1.829224944114685, "reward_std": 0.08039754256606102, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8409436941146851, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 193.94921875, "epoch": 0.29119272018199543, "grad_norm": 1.2912062406539917, "kl": 0.02789306640625, "learning_rate": 7.087126137841351e-07, "loss": 0.0011, "reward": 1.8195543885231018, "reward_std": 0.0885070189833641, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8312731683254242, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 191.07421875, "epoch": 0.2918427039324017, "grad_norm": 0.39285850524902344, "kl": 0.0416259765625, "learning_rate": 7.080624187256176e-07, "loss": 0.0017, "reward": 1.8120809197425842, "reward_std": 0.07769922912120819, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8198934197425842, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 189.375, "epoch": 0.29249268768280795, "grad_norm": 0.4503980576992035, "kl": 0.02703857421875, "learning_rate": 7.074122236671e-07, "loss": 0.0011, "reward": 1.7380544543266296, "reward_std": 0.08249738812446594, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7458669245243073, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 180.23046875, "epoch": 0.29314267143321415, "grad_norm": 0.4908846318721771, "kl": 0.03594970703125, "learning_rate": 7.067620286085825e-07, "loss": 0.0014, "reward": 1.7939634323120117, "reward_std": 0.07465782389044762, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7978696525096893, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 166.2734375, "epoch": 0.2937926551836204, "grad_norm": 0.4631759822368622, "kl": 0.0380859375, "learning_rate": 7.061118335500649e-07, "loss": 0.0015, "reward": 1.7940703630447388, "reward_std": 0.12099963426589966, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8057891130447388, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 166.75, "epoch": 0.29444263893402667, "grad_norm": 0.46768325567245483, "kl": 0.0380859375, "learning_rate": 7.054616384915473e-07, "loss": 0.0015, "reward": 1.7649129629135132, "reward_std": 0.12202504649758339, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7805379629135132, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 166.1015625, "epoch": 0.29509262268443287, "grad_norm": 0.45303627848625183, "kl": 0.034912109375, "learning_rate": 7.048114434330299e-07, "loss": 0.0014, "reward": 1.8056789636611938, "reward_std": 0.1788170412182808, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8330227434635162, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 172.859375, "epoch": 0.29574260643483913, "grad_norm": 0.5380299091339111, "kl": 0.0360107421875, "learning_rate": 7.041612483745123e-07, "loss": 0.0014, "reward": 1.8009212017059326, "reward_std": 0.10284445062279701, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8126399517059326, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 169.80078125, "epoch": 0.2963925901852454, "grad_norm": 0.4983402490615845, "kl": 0.037841796875, "learning_rate": 7.035110533159948e-07, "loss": 0.0015, "reward": 1.8041171431541443, "reward_std": 0.14759978279471397, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8314608931541443, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 181.48046875, "epoch": 0.2970425739356516, "grad_norm": 0.5715484619140625, "kl": 0.03369140625, "learning_rate": 7.028608582574772e-07, "loss": 0.0013, "reward": 1.7525050640106201, "reward_std": 0.14951244741678238, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7798488736152649, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 177.35546875, "epoch": 0.29769255768605785, "grad_norm": 0.4624209403991699, "kl": 0.02899169921875, "learning_rate": 7.022106631989596e-07, "loss": 0.0012, "reward": 1.7959466576576233, "reward_std": 0.10569869354367256, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8154779672622681, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 186.6796875, "epoch": 0.2983425414364641, "grad_norm": 0.6695345640182495, "kl": 0.02825927734375, "learning_rate": 7.015604681404421e-07, "loss": 0.0011, "reward": 1.7504655122756958, "reward_std": 0.13207682222127914, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.769996702671051, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 187.28515625, "epoch": 0.2989925251868703, "grad_norm": 0.4533548951148987, "kl": 0.0374755859375, "learning_rate": 7.009102730819245e-07, "loss": 0.0015, "reward": 1.7503867745399475, "reward_std": 0.1800392121076584, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.7933554947376251, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 184.4375, "epoch": 0.29964250893727656, "grad_norm": 0.40564101934432983, "kl": 0.03460693359375, "learning_rate": 7.00260078023407e-07, "loss": 0.0014, "reward": 1.7886518836021423, "reward_std": 0.13960616290569305, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8120893836021423, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 172.6484375, "epoch": 0.3002924926876828, "grad_norm": 0.4385133981704712, "kl": 0.033203125, "learning_rate": 6.996098829648894e-07, "loss": 0.0013, "reward": 1.7514255046844482, "reward_std": 0.11278577148914337, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7670505046844482, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 186.14453125, "epoch": 0.300942476438089, "grad_norm": 0.454773873090744, "kl": 0.03118896484375, "learning_rate": 6.989596879063718e-07, "loss": 0.0012, "reward": 1.8437588810920715, "reward_std": 0.09646228700876236, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8554776012897491, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 183.75390625, "epoch": 0.3015924601884953, "grad_norm": 0.3681083917617798, "kl": 0.0352783203125, "learning_rate": 6.983094928478543e-07, "loss": 0.0014, "reward": 1.788814663887024, "reward_std": 0.13243108987808228, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8239708840847015, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 178.19921875, "epoch": 0.30224244393890154, "grad_norm": 1.2158068418502808, "kl": 0.02593994140625, "learning_rate": 6.976592977893367e-07, "loss": 0.001, "reward": 1.7527163624763489, "reward_std": 0.15250849723815918, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7800600230693817, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 180.21875, "epoch": 0.30289242768930774, "grad_norm": 0.41421735286712646, "kl": 0.02716064453125, "learning_rate": 6.970091027308192e-07, "loss": 0.0011, "reward": 1.8253625631332397, "reward_std": 0.08475968986749649, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8292687833309174, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 185.359375, "epoch": 0.303542411439714, "grad_norm": 0.4152349531650543, "kl": 0.0284423828125, "learning_rate": 6.963589076723016e-07, "loss": 0.0011, "reward": 1.739513337612152, "reward_std": 0.09828931093215942, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7512321174144745, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 167.5625, "epoch": 0.30419239519012026, "grad_norm": 0.5794462561607361, "kl": 0.0350341796875, "learning_rate": 6.95708712613784e-07, "loss": 0.0014, "reward": 1.784704566001892, "reward_std": 0.10064167156815529, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7964232861995697, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 181.125, "epoch": 0.30484237894052646, "grad_norm": 0.5310074090957642, "kl": 0.0352783203125, "learning_rate": 6.950585175552665e-07, "loss": 0.0014, "reward": 1.78120756149292, "reward_std": 0.1394018530845642, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8007388114929199, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 181.35546875, "epoch": 0.3054923626909327, "grad_norm": 0.39072275161743164, "kl": 0.032470703125, "learning_rate": 6.944083224967489e-07, "loss": 0.0013, "reward": 1.818184733390808, "reward_std": 0.08883300423622131, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8299034833908081, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 188.30859375, "epoch": 0.306142346441339, "grad_norm": 0.5668962597846985, "kl": 0.02886962890625, "learning_rate": 6.937581274382315e-07, "loss": 0.0012, "reward": 1.811598300933838, "reward_std": 0.06686370819807053, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8155045211315155, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 178.6015625, "epoch": 0.30679233019174523, "grad_norm": 0.48567861318588257, "kl": 0.044189453125, "learning_rate": 6.931079323797139e-07, "loss": 0.0018, "reward": 1.7555301785469055, "reward_std": 0.15513313561677933, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7828739285469055, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 184.05859375, "epoch": 0.30744231394215144, "grad_norm": 0.45873668789863586, "kl": 0.03564453125, "learning_rate": 6.924577373211963e-07, "loss": 0.0014, "reward": 1.751229703426361, "reward_std": 0.12413206323981285, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7707608938217163, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 182.3046875, "epoch": 0.3080922976925577, "grad_norm": 0.4667455852031708, "kl": 0.03729248046875, "learning_rate": 6.918075422626788e-07, "loss": 0.0015, "reward": 1.7854362726211548, "reward_std": 0.1002122238278389, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7932487726211548, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 183.234375, "epoch": 0.30874228144296395, "grad_norm": 0.4158461093902588, "kl": 0.02996826171875, "learning_rate": 6.911573472041612e-07, "loss": 0.0012, "reward": 1.837350845336914, "reward_std": 0.0746743381023407, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8412570953369141, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 170.5078125, "epoch": 0.30939226519337015, "grad_norm": 0.5583842396736145, "kl": 0.03533935546875, "learning_rate": 6.905071521456437e-07, "loss": 0.0014, "reward": 1.8011140823364258, "reward_std": 0.12821539491415024, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8245516419410706, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 173.51171875, "epoch": 0.3100422489437764, "grad_norm": 0.4885324239730835, "kl": 0.037841796875, "learning_rate": 6.898569570871261e-07, "loss": 0.0015, "reward": 1.8018548488616943, "reward_std": 0.10694931820034981, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8174799382686615, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 193.52734375, "epoch": 0.31069223269418267, "grad_norm": 0.5002551674842834, "kl": 0.02899169921875, "learning_rate": 6.892067620286085e-07, "loss": 0.0012, "reward": 1.8342078924179077, "reward_std": 0.09043335169553757, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8459265828132629, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 184.1796875, "epoch": 0.31134221644458887, "grad_norm": 0.4091686010360718, "kl": 0.02899169921875, "learning_rate": 6.88556566970091e-07, "loss": 0.0012, "reward": 1.827073335647583, "reward_std": 0.07914631813764572, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.834885835647583, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 177.77734375, "epoch": 0.31199220019499513, "grad_norm": 0.3895643651485443, "kl": 0.0296630859375, "learning_rate": 6.879063719115734e-07, "loss": 0.0012, "reward": 1.7762731909751892, "reward_std": 0.106668621301651, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7918981909751892, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 183.7578125, "epoch": 0.3126421839454014, "grad_norm": 0.49387356638908386, "kl": 0.0296630859375, "learning_rate": 6.872561768530559e-07, "loss": 0.0012, "reward": 1.7776379585266113, "reward_std": 0.10895796492695808, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8010754883289337, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 184.23046875, "epoch": 0.3132921676958076, "grad_norm": 0.5493704080581665, "kl": 0.0340576171875, "learning_rate": 6.866059817945383e-07, "loss": 0.0014, "reward": 1.787933349609375, "reward_std": 0.10100497677922249, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.795745849609375, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 185.4765625, "epoch": 0.31394215144621385, "grad_norm": 0.432606041431427, "kl": 0.03204345703125, "learning_rate": 6.859557867360207e-07, "loss": 0.0013, "reward": 1.7533288598060608, "reward_std": 0.13430344313383102, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7767663598060608, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 186.0078125, "epoch": 0.3145921351966201, "grad_norm": 0.49924522638320923, "kl": 0.029052734375, "learning_rate": 6.853055916775032e-07, "loss": 0.0012, "reward": 1.8197981119155884, "reward_std": 0.06989205256104469, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8276106119155884, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 181.94921875, "epoch": 0.3152421189470263, "grad_norm": 0.5351433157920837, "kl": 0.0390625, "learning_rate": 6.846553966189856e-07, "loss": 0.0016, "reward": 1.777860164642334, "reward_std": 0.13124702870845795, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7934850454330444, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 181.46484375, "epoch": 0.31589210269743256, "grad_norm": 0.4371291995048523, "kl": 0.02960205078125, "learning_rate": 6.840052015604681e-07, "loss": 0.0012, "reward": 1.7916890382766724, "reward_std": 0.12124122679233551, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.81122025847435, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 186.50390625, "epoch": 0.3165420864478388, "grad_norm": 0.44771459698677063, "kl": 0.03216552734375, "learning_rate": 6.833550065019505e-07, "loss": 0.0013, "reward": 1.7707844376564026, "reward_std": 0.07402872294187546, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7825031876564026, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 184.9296875, "epoch": 0.317192070198245, "grad_norm": 0.4707917869091034, "kl": 0.03668212890625, "learning_rate": 6.82704811443433e-07, "loss": 0.0015, "reward": 1.7990386486053467, "reward_std": 0.10563554614782333, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8068512678146362, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 177.203125, "epoch": 0.3178420539486513, "grad_norm": 0.44027087092399597, "kl": 0.03125, "learning_rate": 6.820546163849155e-07, "loss": 0.0013, "reward": 1.7827808856964111, "reward_std": 0.07947412133216858, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7866871058940887, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 195.15234375, "epoch": 0.31849203769905754, "grad_norm": 0.40995216369628906, "kl": 0.0323486328125, "learning_rate": 6.814044213263979e-07, "loss": 0.0013, "reward": 1.7645278573036194, "reward_std": 0.11409011483192444, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7879653871059418, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 194.77734375, "epoch": 0.31914202144946374, "grad_norm": 0.45380687713623047, "kl": 0.035888671875, "learning_rate": 6.807542262678804e-07, "loss": 0.0014, "reward": 1.7562048435211182, "reward_std": 0.14372693002223969, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.775736004114151, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 182.03125, "epoch": 0.31979200519987, "grad_norm": 0.37691378593444824, "kl": 0.03173828125, "learning_rate": 6.801040312093628e-07, "loss": 0.0013, "reward": 1.8158243298530579, "reward_std": 0.1053396686911583, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8314493596553802, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 179.3046875, "epoch": 0.32044198895027626, "grad_norm": 0.46925660967826843, "kl": 0.02935791015625, "learning_rate": 6.794538361508452e-07, "loss": 0.0012, "reward": 1.79949152469635, "reward_std": 0.08967238664627075, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8112102150917053, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 177.5546875, "epoch": 0.32109197270068246, "grad_norm": 0.46735283732414246, "kl": 0.03961181640625, "learning_rate": 6.788036410923277e-07, "loss": 0.0016, "reward": 1.7885650396347046, "reward_std": 0.1323993131518364, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8120025098323822, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 186.13671875, "epoch": 0.3217419564510887, "grad_norm": 0.8425098061561584, "kl": 0.03497314453125, "learning_rate": 6.781534460338101e-07, "loss": 0.0014, "reward": 1.8180752396583557, "reward_std": 0.08980480581521988, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8258877098560333, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 182.9453125, "epoch": 0.322391940201495, "grad_norm": 0.44619154930114746, "kl": 0.035888671875, "learning_rate": 6.775032509752926e-07, "loss": 0.0014, "reward": 1.7727527618408203, "reward_std": 0.10624490678310394, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7922841012477875, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 188.29296875, "epoch": 0.3230419239519012, "grad_norm": 0.43829941749572754, "kl": 0.03131103515625, "learning_rate": 6.76853055916775e-07, "loss": 0.0013, "reward": 1.7656469345092773, "reward_std": 0.1273122914135456, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7812718451023102, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 197.4765625, "epoch": 0.32369190770230744, "grad_norm": 0.45391297340393066, "kl": 0.02679443359375, "learning_rate": 6.762028608582574e-07, "loss": 0.0011, "reward": 1.7791529893875122, "reward_std": 0.11656562611460686, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8064967691898346, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 210.3125, "epoch": 0.3243418914527137, "grad_norm": 0.4168054759502411, "kl": 0.03192138671875, "learning_rate": 6.755526657997399e-07, "loss": 0.0013, "reward": 1.75980144739151, "reward_std": 0.10968594253063202, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7793327569961548, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 185.125, "epoch": 0.3249918752031199, "grad_norm": 0.5028923749923706, "kl": 0.02874755859375, "learning_rate": 6.749024707412223e-07, "loss": 0.0011, "reward": 1.8096270561218262, "reward_std": 0.132043719291687, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8369708359241486, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 180.12109375, "epoch": 0.32564185895352615, "grad_norm": 0.4253119230270386, "kl": 0.029296875, "learning_rate": 6.742522756827048e-07, "loss": 0.0012, "reward": 1.7423874735832214, "reward_std": 0.14565058797597885, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7619187235832214, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 167.66796875, "epoch": 0.3262918427039324, "grad_norm": 0.38757821917533875, "kl": 0.0313720703125, "learning_rate": 6.736020806241872e-07, "loss": 0.0013, "reward": 1.7918633818626404, "reward_std": 0.12387978285551071, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8153008818626404, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 193.203125, "epoch": 0.32694182645433867, "grad_norm": 0.4776434898376465, "kl": 0.03460693359375, "learning_rate": 6.729518855656696e-07, "loss": 0.0014, "reward": 1.7458550333976746, "reward_std": 0.14741221815347672, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7731987833976746, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 196.44921875, "epoch": 0.32759181020474487, "grad_norm": 0.42068222165107727, "kl": 0.03240966796875, "learning_rate": 6.723016905071521e-07, "loss": 0.0013, "reward": 1.7272292375564575, "reward_std": 0.11583319306373596, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7467604875564575, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 190.17578125, "epoch": 0.32824179395515113, "grad_norm": 0.4561879336833954, "kl": 0.028564453125, "learning_rate": 6.716514954486346e-07, "loss": 0.0011, "reward": 1.7978796362876892, "reward_std": 0.11140285432338715, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8174108564853668, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 183.43359375, "epoch": 0.3288917777055574, "grad_norm": 0.494276225566864, "kl": 0.0333251953125, "learning_rate": 6.710013003901171e-07, "loss": 0.0013, "reward": 1.7586630582809448, "reward_std": 0.16760993003845215, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7899131178855896, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 203.69140625, "epoch": 0.3295417614559636, "grad_norm": 0.38342583179473877, "kl": 0.031494140625, "learning_rate": 6.703511053315995e-07, "loss": 0.0013, "reward": 1.7974373698234558, "reward_std": 0.09214676544070244, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8169685900211334, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 195.94921875, "epoch": 0.33019174520636985, "grad_norm": 0.4740481376647949, "kl": 0.02880859375, "learning_rate": 6.697009102730819e-07, "loss": 0.0012, "reward": 1.827374815940857, "reward_std": 0.10750704631209373, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8547185659408569, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 187.1796875, "epoch": 0.3308417289567761, "grad_norm": 0.3817651569843292, "kl": 0.03033447265625, "learning_rate": 6.690507152145644e-07, "loss": 0.0012, "reward": 1.82425057888031, "reward_std": 0.06781300902366638, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8281568288803101, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 186.5, "epoch": 0.3314917127071823, "grad_norm": 0.38264256715774536, "kl": 0.03094482421875, "learning_rate": 6.684005201560468e-07, "loss": 0.0012, "reward": 1.7876830697059631, "reward_std": 0.09636543691158295, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8033081293106079, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 188.5390625, "epoch": 0.33214169645758856, "grad_norm": 0.4043103754520416, "kl": 0.03155517578125, "learning_rate": 6.677503250975293e-07, "loss": 0.0013, "reward": 1.797426998615265, "reward_std": 0.08651497960090637, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8091457784175873, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 187.1796875, "epoch": 0.3327916802079948, "grad_norm": 0.495900958776474, "kl": 0.03277587890625, "learning_rate": 6.671001300390117e-07, "loss": 0.0013, "reward": 1.7458907961845398, "reward_std": 0.11321578174829483, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7576095163822174, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 191.8203125, "epoch": 0.333441663958401, "grad_norm": 0.7337943315505981, "kl": 0.0418701171875, "learning_rate": 6.664499349804941e-07, "loss": 0.0017, "reward": 1.7330076098442078, "reward_std": 0.13393214344978333, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7603513598442078, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 189.953125, "epoch": 0.3340916477088073, "grad_norm": 0.38979029655456543, "kl": 0.0377197265625, "learning_rate": 6.657997399219766e-07, "loss": 0.0015, "reward": 1.781134009361267, "reward_std": 0.08862390741705894, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7928527891635895, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 204.76953125, "epoch": 0.33474163145921354, "grad_norm": 0.39078107476234436, "kl": 0.0322265625, "learning_rate": 6.65149544863459e-07, "loss": 0.0013, "reward": 1.7749285697937012, "reward_std": 0.1059408150613308, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8022722601890564, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 180.15625, "epoch": 0.33539161520961974, "grad_norm": 0.36570194363594055, "kl": 0.033935546875, "learning_rate": 6.644993498049415e-07, "loss": 0.0014, "reward": 1.8293550610542297, "reward_std": 0.0760810449719429, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8410737812519073, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 180.3125, "epoch": 0.336041598960026, "grad_norm": 0.4362087845802307, "kl": 0.02960205078125, "learning_rate": 6.638491547464239e-07, "loss": 0.0012, "reward": 1.772316336631775, "reward_std": 0.09468614868819714, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7801288068294525, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 197.90625, "epoch": 0.33669158271043226, "grad_norm": 0.4322270452976227, "kl": 0.03277587890625, "learning_rate": 6.631989596879063e-07, "loss": 0.0013, "reward": 1.8111275434494019, "reward_std": 0.0736442506313324, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8228462934494019, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 194.96875, "epoch": 0.33734156646083846, "grad_norm": 0.4243268072605133, "kl": 0.0296630859375, "learning_rate": 6.625487646293888e-07, "loss": 0.0012, "reward": 1.7954052686691284, "reward_std": 0.07054703682661057, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8071240186691284, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 185.171875, "epoch": 0.3379915502112447, "grad_norm": 0.3711923062801361, "kl": 0.03021240234375, "learning_rate": 6.618985695708712e-07, "loss": 0.0012, "reward": 1.8548699021339417, "reward_std": 0.10563256964087486, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8704949021339417, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 218.32421875, "epoch": 0.338641533961651, "grad_norm": 0.4149784445762634, "kl": 0.02716064453125, "learning_rate": 6.612483745123538e-07, "loss": 0.0011, "reward": 1.7754391431808472, "reward_std": 0.08070631325244904, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7793453931808472, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 201.44140625, "epoch": 0.3392915177120572, "grad_norm": 0.41792982816696167, "kl": 0.0367431640625, "learning_rate": 6.605981794538362e-07, "loss": 0.0015, "reward": 1.8083611130714417, "reward_std": 0.10774357989430428, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.823986142873764, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 200.171875, "epoch": 0.33994150146246344, "grad_norm": 0.5050821304321289, "kl": 0.0343017578125, "learning_rate": 6.599479843953186e-07, "loss": 0.0014, "reward": 1.7738243341445923, "reward_std": 0.09743199869990349, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7933555543422699, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 193.44140625, "epoch": 0.3405914852128697, "grad_norm": 1.7666746377944946, "kl": 0.03759765625, "learning_rate": 6.592977893368011e-07, "loss": 0.0015, "reward": 1.7800042033195496, "reward_std": 0.06525975838303566, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7839103639125824, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 188.91015625, "epoch": 0.3412414689632759, "grad_norm": 0.42412421107292175, "kl": 0.0335693359375, "learning_rate": 6.586475942782835e-07, "loss": 0.0013, "reward": 1.7758591175079346, "reward_std": 0.13045203685760498, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.799296647310257, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 179.81640625, "epoch": 0.34189145271368215, "grad_norm": 0.3778225779533386, "kl": 0.03125, "learning_rate": 6.57997399219766e-07, "loss": 0.0013, "reward": 1.818921983242035, "reward_std": 0.10730082169175148, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8306406438350677, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 183.88671875, "epoch": 0.3425414364640884, "grad_norm": 0.5290052890777588, "kl": 0.03515625, "learning_rate": 6.573472041612484e-07, "loss": 0.0014, "reward": 1.754630446434021, "reward_std": 0.1244109608232975, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.758536696434021, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 189.83984375, "epoch": 0.3431914202144946, "grad_norm": 0.42552605271339417, "kl": 0.02655029296875, "learning_rate": 6.566970091027308e-07, "loss": 0.0011, "reward": 1.8098065853118896, "reward_std": 0.08400767296552658, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8137127161026001, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 191.9375, "epoch": 0.34384140396490087, "grad_norm": 0.4414931833744049, "kl": 0.0384521484375, "learning_rate": 6.560468140442133e-07, "loss": 0.0015, "reward": 1.7859277725219727, "reward_std": 0.10243409872055054, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7976465225219727, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 183.859375, "epoch": 0.34449138771530713, "grad_norm": 0.4084601402282715, "kl": 0.0311279296875, "learning_rate": 6.553966189856956e-07, "loss": 0.0012, "reward": 1.7723811864852905, "reward_std": 0.08136724680662155, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7801937460899353, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 197.62109375, "epoch": 0.34514137146571333, "grad_norm": 0.5065193772315979, "kl": 0.0369873046875, "learning_rate": 6.547464239271782e-07, "loss": 0.0015, "reward": 1.747236430644989, "reward_std": 0.13305412605404854, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7667677104473114, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 174.42578125, "epoch": 0.3457913552161196, "grad_norm": 0.469655305147171, "kl": 0.03857421875, "learning_rate": 6.540962288686605e-07, "loss": 0.0015, "reward": 1.7327078580856323, "reward_std": 0.12388584017753601, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7405203580856323, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 188.48828125, "epoch": 0.34644133896652585, "grad_norm": 0.4058604836463928, "kl": 0.02703857421875, "learning_rate": 6.534460338101429e-07, "loss": 0.0011, "reward": 1.7828049659729004, "reward_std": 0.12039559707045555, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8062424659729004, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 192.6171875, "epoch": 0.34709132271693205, "grad_norm": 0.5045217275619507, "kl": 0.0413818359375, "learning_rate": 6.527958387516254e-07, "loss": 0.0017, "reward": 1.7803025245666504, "reward_std": 0.07945918291807175, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.784208744764328, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 197.37109375, "epoch": 0.3477413064673383, "grad_norm": 0.46536368131637573, "kl": 0.03253173828125, "learning_rate": 6.521456436931078e-07, "loss": 0.0013, "reward": 1.7741031646728516, "reward_std": 0.10703307762742043, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7936343550682068, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 184.62109375, "epoch": 0.34839129021774456, "grad_norm": 0.4205235242843628, "kl": 0.0302734375, "learning_rate": 6.514954486345903e-07, "loss": 0.0012, "reward": 1.7810351848602295, "reward_std": 0.09732525423169136, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7927539646625519, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 195.109375, "epoch": 0.3490412739681508, "grad_norm": 0.3940182626247406, "kl": 0.03082275390625, "learning_rate": 6.508452535760727e-07, "loss": 0.0012, "reward": 1.7405920624732971, "reward_std": 0.14174319803714752, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.76793572306633, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 172.10546875, "epoch": 0.349691257718557, "grad_norm": 0.4310148060321808, "kl": 0.03106689453125, "learning_rate": 6.501950585175551e-07, "loss": 0.0012, "reward": 1.7936915159225464, "reward_std": 0.09460056573152542, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8054102957248688, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 180.65234375, "epoch": 0.3503412414689633, "grad_norm": 0.5159792304039001, "kl": 0.03680419921875, "learning_rate": 6.495448634590377e-07, "loss": 0.0015, "reward": 1.7723994255065918, "reward_std": 0.12258055061101913, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7919305860996246, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 183.4609375, "epoch": 0.35099122521936954, "grad_norm": 0.45871058106422424, "kl": 0.035400390625, "learning_rate": 6.488946684005201e-07, "loss": 0.0014, "reward": 1.792600929737091, "reward_std": 0.09507682546973228, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8004133701324463, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 177.3359375, "epoch": 0.35164120896977574, "grad_norm": 0.3475419580936432, "kl": 0.03045654296875, "learning_rate": 6.482444733420026e-07, "loss": 0.0012, "reward": 1.7963948249816895, "reward_std": 0.060442570596933365, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8042072951793671, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 215.34375, "epoch": 0.352291192720182, "grad_norm": 0.4596771001815796, "kl": 0.02996826171875, "learning_rate": 6.47594278283485e-07, "loss": 0.0012, "reward": 1.6851466298103333, "reward_std": 0.1097908653318882, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.6968653798103333, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 191.91796875, "epoch": 0.35294117647058826, "grad_norm": 0.3967244327068329, "kl": 0.0291748046875, "learning_rate": 6.469440832249674e-07, "loss": 0.0012, "reward": 1.7661408185958862, "reward_std": 0.06506912782788277, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7739533185958862, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 161.1640625, "epoch": 0.35359116022099446, "grad_norm": 0.48367011547088623, "kl": 0.041259765625, "learning_rate": 6.462938881664499e-07, "loss": 0.0016, "reward": 1.7832210063934326, "reward_std": 0.17143891751766205, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.810564786195755, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 186.33203125, "epoch": 0.3542411439714007, "grad_norm": 0.44959312677383423, "kl": 0.0338134765625, "learning_rate": 6.456436931079323e-07, "loss": 0.0014, "reward": 1.7773289680480957, "reward_std": 0.14014050364494324, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7968603074550629, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 186.86328125, "epoch": 0.354891127721807, "grad_norm": 0.4790340065956116, "kl": 0.0294189453125, "learning_rate": 6.449934980494148e-07, "loss": 0.0012, "reward": 1.7947072386741638, "reward_std": 0.07926813885569572, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8025197088718414, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 196.41015625, "epoch": 0.3555411114722132, "grad_norm": 0.5247699618339539, "kl": 0.03857421875, "learning_rate": 6.443433029908972e-07, "loss": 0.0015, "reward": 1.710862398147583, "reward_std": 0.17492102831602097, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7460186779499054, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 176.96875, "epoch": 0.35619109522261944, "grad_norm": 0.41275733709335327, "kl": 0.0330810546875, "learning_rate": 6.436931079323796e-07, "loss": 0.0013, "reward": 1.7721323370933533, "reward_std": 0.10037597641348839, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7838510572910309, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 171.84375, "epoch": 0.3568410789730257, "grad_norm": 0.4784419536590576, "kl": 0.03009033203125, "learning_rate": 6.430429128738621e-07, "loss": 0.0012, "reward": 1.7928367853164673, "reward_std": 0.07194963283836842, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7967430651187897, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 203.5703125, "epoch": 0.3574910627234319, "grad_norm": 0.4371991753578186, "kl": 0.03289794921875, "learning_rate": 6.423927178153445e-07, "loss": 0.0013, "reward": 1.7308475375175476, "reward_std": 0.11913301423192024, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.74256631731987, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 181.1171875, "epoch": 0.35814104647383815, "grad_norm": 0.4171755313873291, "kl": 0.0362548828125, "learning_rate": 6.41742522756827e-07, "loss": 0.0015, "reward": 1.7844351530075073, "reward_std": 0.13566062971949577, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8039664328098297, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 178.484375, "epoch": 0.3587910302242444, "grad_norm": 0.42888373136520386, "kl": 0.03082275390625, "learning_rate": 6.410923276983094e-07, "loss": 0.0012, "reward": 1.8171371221542358, "reward_std": 0.11527584493160248, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8288558721542358, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 179.51953125, "epoch": 0.3594410139746506, "grad_norm": 0.44373029470443726, "kl": 0.03515625, "learning_rate": 6.404421326397918e-07, "loss": 0.0014, "reward": 1.7982013821601868, "reward_std": 0.1130921058356762, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.813826322555542, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 200.51171875, "epoch": 0.36009099772505687, "grad_norm": 0.465863436460495, "kl": 0.03192138671875, "learning_rate": 6.397919375812743e-07, "loss": 0.0013, "reward": 1.784894347190857, "reward_std": 0.09542516991496086, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7966131269931793, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 188.9453125, "epoch": 0.36074098147546313, "grad_norm": 0.8005475997924805, "kl": 0.10821533203125, "learning_rate": 6.391417425227567e-07, "loss": 0.0043, "reward": 1.7708330154418945, "reward_std": 0.07104471325874329, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7747392058372498, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 220.109375, "epoch": 0.36139096522586933, "grad_norm": 0.4081125259399414, "kl": 0.0303955078125, "learning_rate": 6.384915474642393e-07, "loss": 0.0012, "reward": 1.7454529404640198, "reward_std": 0.12294583767652512, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7688904106616974, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 190.01171875, "epoch": 0.3620409489762756, "grad_norm": 0.4298749268054962, "kl": 0.03350830078125, "learning_rate": 6.378413524057217e-07, "loss": 0.0013, "reward": 1.8139179944992065, "reward_std": 0.1015559770166874, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8217305243015289, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 177.35546875, "epoch": 0.36269093272668185, "grad_norm": 0.49499404430389404, "kl": 0.03466796875, "learning_rate": 6.371911573472041e-07, "loss": 0.0014, "reward": 1.7798447012901306, "reward_std": 0.13552547991275787, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8032823204994202, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 174.125, "epoch": 0.36334091647708805, "grad_norm": 0.41058599948883057, "kl": 0.030517578125, "learning_rate": 6.365409622886866e-07, "loss": 0.0012, "reward": 1.8146637678146362, "reward_std": 0.08464222401380539, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.822476327419281, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 192.8203125, "epoch": 0.3639909002274943, "grad_norm": 0.5331651568412781, "kl": 0.02984619140625, "learning_rate": 6.35890767230169e-07, "loss": 0.0012, "reward": 1.7383880019187927, "reward_std": 0.14846888184547424, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.773544192314148, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 182.25390625, "epoch": 0.36464088397790057, "grad_norm": 0.47230467200279236, "kl": 0.032958984375, "learning_rate": 6.352405721716515e-07, "loss": 0.0013, "reward": 1.7598772048950195, "reward_std": 0.10837159305810928, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7755021154880524, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 181.54296875, "epoch": 0.36529086772830677, "grad_norm": 0.5254387259483337, "kl": 0.03466796875, "learning_rate": 6.345903771131339e-07, "loss": 0.0014, "reward": 1.787140667438507, "reward_std": 0.1481962502002716, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8222968578338623, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 174.1875, "epoch": 0.365940851478713, "grad_norm": 0.45630010962486267, "kl": 0.039306640625, "learning_rate": 6.339401820546163e-07, "loss": 0.0016, "reward": 1.739733338356018, "reward_std": 0.1564476266503334, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7748896777629852, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 185.1640625, "epoch": 0.3665908352291193, "grad_norm": 0.4138317406177521, "kl": 0.0391845703125, "learning_rate": 6.332899869960988e-07, "loss": 0.0016, "reward": 1.7395538091659546, "reward_std": 0.16725194454193115, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7747100591659546, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 180.6171875, "epoch": 0.3672408189795255, "grad_norm": 0.4756114184856415, "kl": 0.0308837890625, "learning_rate": 6.326397919375812e-07, "loss": 0.0012, "reward": 1.7557262182235718, "reward_std": 0.1829560026526451, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7908825278282166, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 188.84765625, "epoch": 0.36789080272993174, "grad_norm": 0.4243033528327942, "kl": 0.03057861328125, "learning_rate": 6.319895968790637e-07, "loss": 0.0012, "reward": 1.7355726957321167, "reward_std": 0.11382076516747475, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7551039755344391, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 194.48828125, "epoch": 0.368540786480338, "grad_norm": 0.39993223547935486, "kl": 0.0269775390625, "learning_rate": 6.313394018205461e-07, "loss": 0.0011, "reward": 1.8191792964935303, "reward_std": 0.0759977400302887, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8230855166912079, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 184.64453125, "epoch": 0.36919077023074426, "grad_norm": 0.5064452290534973, "kl": 0.046630859375, "learning_rate": 6.306892067620285e-07, "loss": 0.0019, "reward": 1.745811939239502, "reward_std": 0.10411352291703224, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7653431594371796, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 198.93359375, "epoch": 0.36984075398115046, "grad_norm": 0.46098792552948, "kl": 0.0340576171875, "learning_rate": 6.30039011703511e-07, "loss": 0.0014, "reward": 1.7168042063713074, "reward_std": 0.18162816017866135, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7519604861736298, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 173.40625, "epoch": 0.3704907377315567, "grad_norm": 0.4862184524536133, "kl": 0.0352783203125, "learning_rate": 6.293888166449934e-07, "loss": 0.0014, "reward": 1.780934453010559, "reward_std": 0.1405239775776863, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8004657030105591, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 179.671875, "epoch": 0.371140721481963, "grad_norm": 0.46212050318717957, "kl": 0.0360107421875, "learning_rate": 6.287386215864759e-07, "loss": 0.0014, "reward": 1.7967819571495056, "reward_std": 0.11618692800402641, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8163132667541504, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 183.625, "epoch": 0.3717907052323692, "grad_norm": 0.43616652488708496, "kl": 0.032958984375, "learning_rate": 6.280884265279583e-07, "loss": 0.0013, "reward": 1.7706486582756042, "reward_std": 0.14126816391944885, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7940861880779266, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 196.03515625, "epoch": 0.37244068898277544, "grad_norm": 0.4807702302932739, "kl": 0.02935791015625, "learning_rate": 6.274382314694408e-07, "loss": 0.0012, "reward": 1.7888177037239075, "reward_std": 0.1207280121743679, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8044427037239075, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 202.2734375, "epoch": 0.3730906727331817, "grad_norm": 0.3913426697254181, "kl": 0.0322265625, "learning_rate": 6.267880364109233e-07, "loss": 0.0013, "reward": 1.8106656074523926, "reward_std": 0.13959287106990814, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8380094170570374, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 182.72265625, "epoch": 0.3737406564835879, "grad_norm": 0.4062698781490326, "kl": 0.03009033203125, "learning_rate": 6.261378413524057e-07, "loss": 0.0012, "reward": 1.8144197463989258, "reward_std": 0.06829864904284477, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.818325936794281, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 177.51953125, "epoch": 0.37439064023399415, "grad_norm": 0.37561559677124023, "kl": 0.044189453125, "learning_rate": 6.254876462938882e-07, "loss": 0.0018, "reward": 1.8123384714126587, "reward_std": 0.15673330426216125, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8396821916103363, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 186.390625, "epoch": 0.3750406239844004, "grad_norm": 0.42080453038215637, "kl": 0.03955078125, "learning_rate": 6.248374512353706e-07, "loss": 0.0016, "reward": 1.7680512070655823, "reward_std": 0.11832399293780327, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.795395016670227, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 190.7734375, "epoch": 0.3756906077348066, "grad_norm": 0.4334554374217987, "kl": 0.030029296875, "learning_rate": 6.241872561768531e-07, "loss": 0.0012, "reward": 1.7676487565040588, "reward_std": 0.14943264424800873, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7988987565040588, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 175.421875, "epoch": 0.37634059148521287, "grad_norm": 0.42092251777648926, "kl": 0.0277099609375, "learning_rate": 6.235370611183355e-07, "loss": 0.0011, "reward": 1.7844560742378235, "reward_std": 0.14795775711536407, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8157060444355011, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 188.24609375, "epoch": 0.37699057523561913, "grad_norm": 0.47600749135017395, "kl": 0.02874755859375, "learning_rate": 6.228868660598179e-07, "loss": 0.0012, "reward": 1.8064352869987488, "reward_std": 0.092244328930974, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8220602869987488, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 186.4375, "epoch": 0.37764055898602533, "grad_norm": 0.36179402470588684, "kl": 0.03277587890625, "learning_rate": 6.222366710013004e-07, "loss": 0.0013, "reward": 1.7663333415985107, "reward_std": 0.10717130452394485, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7936771512031555, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 202.44921875, "epoch": 0.3782905427364316, "grad_norm": 0.4292626678943634, "kl": 0.02978515625, "learning_rate": 6.215864759427828e-07, "loss": 0.0012, "reward": 1.777450978755951, "reward_std": 0.09355739131569862, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7891696989536285, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 195.94921875, "epoch": 0.37894052648683785, "grad_norm": 0.4599159359931946, "kl": 0.032470703125, "learning_rate": 6.209362808842653e-07, "loss": 0.0013, "reward": 1.7262449264526367, "reward_std": 0.12261593341827393, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7418699562549591, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 206.78125, "epoch": 0.37959051023724405, "grad_norm": 0.4283079206943512, "kl": 0.032958984375, "learning_rate": 6.202860858257477e-07, "loss": 0.0013, "reward": 1.7514910101890564, "reward_std": 0.10815878957509995, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7710223197937012, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 188.58984375, "epoch": 0.3802404939876503, "grad_norm": 0.5837042927742004, "kl": 0.06768798828125, "learning_rate": 6.196358907672301e-07, "loss": 0.0027, "reward": 1.7590094804763794, "reward_std": 0.08475501462817192, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7746344804763794, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 195.03125, "epoch": 0.38089047773805657, "grad_norm": 0.453581303358078, "kl": 0.03106689453125, "learning_rate": 6.189856957087126e-07, "loss": 0.0012, "reward": 1.7908175587654114, "reward_std": 0.07856597378849983, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7947238683700562, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 187.5859375, "epoch": 0.38154046148846277, "grad_norm": 0.47761592268943787, "kl": 0.043701171875, "learning_rate": 6.18335500650195e-07, "loss": 0.0017, "reward": 1.7871860265731812, "reward_std": 0.1295296624302864, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8106234967708588, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 179.046875, "epoch": 0.382190445238869, "grad_norm": 0.7247717976570129, "kl": 0.0338134765625, "learning_rate": 6.176853055916775e-07, "loss": 0.0013, "reward": 1.8119346499443054, "reward_std": 0.0849592387676239, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.819747120141983, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 190.953125, "epoch": 0.3828404289892753, "grad_norm": 0.4401405453681946, "kl": 0.030517578125, "learning_rate": 6.170351105331599e-07, "loss": 0.0012, "reward": 1.8114028573036194, "reward_std": 0.06945372372865677, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.815309077501297, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 199.75, "epoch": 0.3834904127396815, "grad_norm": 0.460540235042572, "kl": 0.0509033203125, "learning_rate": 6.163849154746424e-07, "loss": 0.002, "reward": 1.740091860294342, "reward_std": 0.1509309709072113, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.775248110294342, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 194.28515625, "epoch": 0.38414039649008774, "grad_norm": 0.37580057978630066, "kl": 0.0325927734375, "learning_rate": 6.157347204161249e-07, "loss": 0.0013, "reward": 1.7751976251602173, "reward_std": 0.10897025093436241, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7947288751602173, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 181.64453125, "epoch": 0.384790380240494, "grad_norm": 0.5909035205841064, "kl": 0.0361328125, "learning_rate": 6.150845253576073e-07, "loss": 0.0014, "reward": 1.7414361834526062, "reward_std": 0.12851916626095772, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7648736536502838, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 192.0859375, "epoch": 0.3854403639909002, "grad_norm": 0.4483637511730194, "kl": 0.033935546875, "learning_rate": 6.144343302990898e-07, "loss": 0.0014, "reward": 1.7654818892478943, "reward_std": 0.11226189881563187, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7928256392478943, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 181.98828125, "epoch": 0.38609034774130646, "grad_norm": 0.4227682054042816, "kl": 0.0333251953125, "learning_rate": 6.137841352405722e-07, "loss": 0.0013, "reward": 1.8165239095687866, "reward_std": 0.09207615256309509, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8243364095687866, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 188.0234375, "epoch": 0.3867403314917127, "grad_norm": 0.42090100049972534, "kl": 0.031494140625, "learning_rate": 6.131339401820546e-07, "loss": 0.0013, "reward": 1.776020109653473, "reward_std": 0.11440681666135788, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8033638894557953, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 170.34765625, "epoch": 0.3873903152421189, "grad_norm": 0.37360748648643494, "kl": 0.03271484375, "learning_rate": 6.124837451235371e-07, "loss": 0.0013, "reward": 1.840088665485382, "reward_std": 0.09504618123173714, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8557136654853821, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 190.29296875, "epoch": 0.3880402989925252, "grad_norm": 0.5706738829612732, "kl": 0.0345458984375, "learning_rate": 6.118335500650195e-07, "loss": 0.0014, "reward": 1.7443532943725586, "reward_std": 0.14590216428041458, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.771697074174881, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 201.59765625, "epoch": 0.38869028274293144, "grad_norm": 0.38256970047950745, "kl": 0.030517578125, "learning_rate": 6.11183355006502e-07, "loss": 0.0012, "reward": 1.7826003432273865, "reward_std": 0.16457660496234894, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8177565634250641, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 184.9921875, "epoch": 0.38934026649333764, "grad_norm": 0.3757571280002594, "kl": 0.02813720703125, "learning_rate": 6.105331599479844e-07, "loss": 0.0011, "reward": 1.7997584342956543, "reward_std": 0.07551082968711853, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8075709342956543, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 203.3125, "epoch": 0.3899902502437439, "grad_norm": 0.46998050808906555, "kl": 0.03424072265625, "learning_rate": 6.098829648894668e-07, "loss": 0.0014, "reward": 1.7480822205543518, "reward_std": 0.08988617360591888, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7637072205543518, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 190.5703125, "epoch": 0.39064023399415015, "grad_norm": 0.44143760204315186, "kl": 0.03289794921875, "learning_rate": 6.092327698309493e-07, "loss": 0.0013, "reward": 1.756867229938507, "reward_std": 0.10523197799921036, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7763985395431519, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 184.08984375, "epoch": 0.3912902177445564, "grad_norm": 0.46374329924583435, "kl": 0.0458984375, "learning_rate": 6.085825747724317e-07, "loss": 0.0018, "reward": 1.7313203811645508, "reward_std": 0.12622041627764702, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7508516311645508, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 186.609375, "epoch": 0.3919402014949626, "grad_norm": 0.45976293087005615, "kl": 0.036376953125, "learning_rate": 6.079323797139142e-07, "loss": 0.0015, "reward": 1.7896799445152283, "reward_std": 0.14558055251836777, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8209299743175507, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 199.046875, "epoch": 0.3925901852453689, "grad_norm": 0.455250084400177, "kl": 0.0296630859375, "learning_rate": 6.072821846553966e-07, "loss": 0.0012, "reward": 1.7980371713638306, "reward_std": 0.09183384850621223, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8097558617591858, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 187.875, "epoch": 0.39324016899577513, "grad_norm": 0.46062731742858887, "kl": 0.03271484375, "learning_rate": 6.06631989596879e-07, "loss": 0.0013, "reward": 1.73819899559021, "reward_std": 0.11084530502557755, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7577302157878876, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 186.8984375, "epoch": 0.39389015274618133, "grad_norm": 0.43664759397506714, "kl": 0.0364990234375, "learning_rate": 6.059817945383615e-07, "loss": 0.0015, "reward": 1.784709870815277, "reward_std": 0.11524225026369095, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8081473708152771, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 186.90625, "epoch": 0.3945401364965876, "grad_norm": 0.45377832651138306, "kl": 0.03955078125, "learning_rate": 6.05331599479844e-07, "loss": 0.0016, "reward": 1.7588475346565247, "reward_std": 0.11938215047121048, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7744724750518799, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 207.7734375, "epoch": 0.39519012024699385, "grad_norm": 0.47506585717201233, "kl": 0.03851318359375, "learning_rate": 6.046814044213265e-07, "loss": 0.0015, "reward": 1.7813263535499573, "reward_std": 0.08289660513401031, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7891388535499573, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 197.1015625, "epoch": 0.39584010399740005, "grad_norm": 0.7067251205444336, "kl": 0.03515625, "learning_rate": 6.040312093628089e-07, "loss": 0.0014, "reward": 1.7709038257598877, "reward_std": 0.13311755657196045, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7943412661552429, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 184.44140625, "epoch": 0.3964900877478063, "grad_norm": 0.46351704001426697, "kl": 0.0379638671875, "learning_rate": 6.033810143042913e-07, "loss": 0.0015, "reward": 1.7505944967269897, "reward_std": 0.1336926817893982, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7740320563316345, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 178.40234375, "epoch": 0.39714007149821257, "grad_norm": 0.46722835302352905, "kl": 0.038330078125, "learning_rate": 6.027308192457738e-07, "loss": 0.0015, "reward": 1.7906476855278015, "reward_std": 0.1399606540799141, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8179914057254791, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 188.0234375, "epoch": 0.39779005524861877, "grad_norm": 0.43253397941589355, "kl": 0.033203125, "learning_rate": 6.020806241872562e-07, "loss": 0.0013, "reward": 1.7688956260681152, "reward_std": 0.10615430772304535, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.78452068567276, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 202.56640625, "epoch": 0.398440038999025, "grad_norm": 0.5909953117370605, "kl": 0.0318603515625, "learning_rate": 6.014304291287387e-07, "loss": 0.0013, "reward": 1.7488840222358704, "reward_std": 0.1314442753791809, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7801339626312256, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 195.7265625, "epoch": 0.3990900227494313, "grad_norm": 0.4420548975467682, "kl": 0.0257568359375, "learning_rate": 6.007802340702211e-07, "loss": 0.001, "reward": 1.8059398531913757, "reward_std": 0.13246694952249527, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8293773531913757, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 201.35546875, "epoch": 0.3997400064998375, "grad_norm": 0.4135378897190094, "kl": 0.03472900390625, "learning_rate": 6.001300390117035e-07, "loss": 0.0014, "reward": 1.7841624021530151, "reward_std": 0.11773504316806793, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7997874021530151, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 191.140625, "epoch": 0.40038999025024374, "grad_norm": 0.4537773132324219, "kl": 0.03009033203125, "learning_rate": 5.99479843953186e-07, "loss": 0.0012, "reward": 1.7920769453048706, "reward_std": 0.125887930393219, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8116082549095154, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 185.265625, "epoch": 0.40103997400065, "grad_norm": 0.3923322558403015, "kl": 0.02923583984375, "learning_rate": 5.988296488946683e-07, "loss": 0.0012, "reward": 1.7921087145805359, "reward_std": 0.08250471018254757, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8077337145805359, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 186.171875, "epoch": 0.4016899577510562, "grad_norm": 0.46012040972709656, "kl": 0.03564453125, "learning_rate": 5.981794538361509e-07, "loss": 0.0014, "reward": 1.7855560779571533, "reward_std": 0.12502873316407204, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8011811375617981, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 203.26171875, "epoch": 0.40233994150146246, "grad_norm": 0.44035086035728455, "kl": 0.030029296875, "learning_rate": 5.975292587776332e-07, "loss": 0.0012, "reward": 1.7427037358283997, "reward_std": 0.10273829847574234, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7505161762237549, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 193.22265625, "epoch": 0.4029899252518687, "grad_norm": 0.37705159187316895, "kl": 0.0340576171875, "learning_rate": 5.968790637191156e-07, "loss": 0.0014, "reward": 1.8175121545791626, "reward_std": 0.09698431938886642, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8292308747768402, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 193.66015625, "epoch": 0.4036399090022749, "grad_norm": 0.44324952363967896, "kl": 0.0350341796875, "learning_rate": 5.962288686605981e-07, "loss": 0.0014, "reward": 1.7779308557510376, "reward_std": 0.11746732145547867, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7974620163440704, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 202.62109375, "epoch": 0.4042898927526812, "grad_norm": 0.37682756781578064, "kl": 0.0311279296875, "learning_rate": 5.955786736020805e-07, "loss": 0.0012, "reward": 1.7780320644378662, "reward_std": 0.10956063866615295, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7936570942401886, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 194.18359375, "epoch": 0.40493987650308744, "grad_norm": 1.304762840270996, "kl": 0.03125, "learning_rate": 5.94928478543563e-07, "loss": 0.0013, "reward": 1.8042656779289246, "reward_std": 0.09184245765209198, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8277031779289246, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 173.65625, "epoch": 0.40558986025349364, "grad_norm": 0.36913853883743286, "kl": 0.0372314453125, "learning_rate": 5.942782834850455e-07, "loss": 0.0015, "reward": 1.8216418623924255, "reward_std": 0.06965627707540989, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8255481123924255, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 195.84765625, "epoch": 0.4062398440038999, "grad_norm": 0.5723446011543274, "kl": 0.0465087890625, "learning_rate": 5.936280884265279e-07, "loss": 0.0019, "reward": 1.7284331917762756, "reward_std": 0.1581534519791603, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7596831619739532, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 194.24609375, "epoch": 0.40688982775430615, "grad_norm": 0.4173901677131653, "kl": 0.03564453125, "learning_rate": 5.929778933680104e-07, "loss": 0.0014, "reward": 1.7796046137809753, "reward_std": 0.0822504572570324, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.791323333978653, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 200.9375, "epoch": 0.40753981150471236, "grad_norm": 0.4810071289539337, "kl": 0.03656005859375, "learning_rate": 5.923276983094928e-07, "loss": 0.0015, "reward": 1.7440940737724304, "reward_std": 0.14011867344379425, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7675315737724304, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 196.04296875, "epoch": 0.4081897952551186, "grad_norm": 0.42032167315483093, "kl": 0.031982421875, "learning_rate": 5.916775032509753e-07, "loss": 0.0013, "reward": 1.8290187120437622, "reward_std": 0.06369676440954208, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8329249322414398, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 185.84765625, "epoch": 0.4088397790055249, "grad_norm": 0.5043155550956726, "kl": 0.029052734375, "learning_rate": 5.910273081924577e-07, "loss": 0.0012, "reward": 1.8298227190971375, "reward_std": 0.11722817271947861, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8532602190971375, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 203.1953125, "epoch": 0.4094897627559311, "grad_norm": 0.48749545216560364, "kl": 0.02740478515625, "learning_rate": 5.903771131339401e-07, "loss": 0.0011, "reward": 1.773337483406067, "reward_std": 0.07447732239961624, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7811499834060669, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 191.55078125, "epoch": 0.41013974650633733, "grad_norm": 0.4147813320159912, "kl": 0.0340576171875, "learning_rate": 5.897269180754226e-07, "loss": 0.0014, "reward": 1.7735944986343384, "reward_std": 0.10682874172925949, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7853132486343384, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 191.67578125, "epoch": 0.4107897302567436, "grad_norm": 0.4569845199584961, "kl": 0.0367431640625, "learning_rate": 5.89076723016905e-07, "loss": 0.0015, "reward": 1.7395920753479004, "reward_std": 0.11317070201039314, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7591232359409332, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 192.43359375, "epoch": 0.41143971400714985, "grad_norm": 0.41926005482673645, "kl": 0.0382080078125, "learning_rate": 5.884265279583875e-07, "loss": 0.0015, "reward": 1.7631359696388245, "reward_std": 0.12205355614423752, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7787609994411469, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 197.91796875, "epoch": 0.41208969775755605, "grad_norm": 0.42035162448883057, "kl": 0.03564453125, "learning_rate": 5.877763328998699e-07, "loss": 0.0014, "reward": 1.7853078842163086, "reward_std": 0.09860872104763985, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.797026664018631, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 186.9921875, "epoch": 0.4127396815079623, "grad_norm": 0.9608245491981506, "kl": 0.02911376953125, "learning_rate": 5.871261378413523e-07, "loss": 0.0012, "reward": 1.853518009185791, "reward_std": 0.0771024040877819, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8613305985927582, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 189.63671875, "epoch": 0.41338966525836857, "grad_norm": 0.43705689907073975, "kl": 0.03314208984375, "learning_rate": 5.864759427828348e-07, "loss": 0.0013, "reward": 1.781096875667572, "reward_std": 0.10922854766249657, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.804534375667572, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 185.9140625, "epoch": 0.41403964900877477, "grad_norm": 0.5973747372627258, "kl": 0.03314208984375, "learning_rate": 5.858257477243172e-07, "loss": 0.0013, "reward": 1.7962565422058105, "reward_std": 0.09106647968292236, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8040690422058105, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 202.25390625, "epoch": 0.414689632759181, "grad_norm": 0.45891422033309937, "kl": 0.03515625, "learning_rate": 5.851755526657997e-07, "loss": 0.0014, "reward": 1.7267173528671265, "reward_std": 0.10862134397029877, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7423423230648041, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 208.296875, "epoch": 0.4153396165095873, "grad_norm": 0.5900288820266724, "kl": 0.03729248046875, "learning_rate": 5.845253576072821e-07, "loss": 0.0015, "reward": 1.757373332977295, "reward_std": 0.12333794683218002, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7847170531749725, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 192.81640625, "epoch": 0.4159896002599935, "grad_norm": 0.4485771656036377, "kl": 0.034423828125, "learning_rate": 5.838751625487645e-07, "loss": 0.0014, "reward": 1.8029879927635193, "reward_std": 0.11175959929823875, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8186130225658417, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 191.15234375, "epoch": 0.41663958401039974, "grad_norm": 0.42148223519325256, "kl": 0.03143310546875, "learning_rate": 5.832249674902471e-07, "loss": 0.0013, "reward": 1.7966973781585693, "reward_std": 0.08911943063139915, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.804509848356247, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 188.81640625, "epoch": 0.417289567760806, "grad_norm": 0.42980390787124634, "kl": 0.0419921875, "learning_rate": 5.825747724317295e-07, "loss": 0.0017, "reward": 1.8206413984298706, "reward_std": 0.13755077868700027, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8479851484298706, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 192.93359375, "epoch": 0.4179395515112122, "grad_norm": 0.40380987524986267, "kl": 0.037109375, "learning_rate": 5.81924577373212e-07, "loss": 0.0015, "reward": 1.7826376557350159, "reward_std": 0.10124779492616653, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8021689057350159, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 200.48046875, "epoch": 0.41858953526161846, "grad_norm": 0.4288991689682007, "kl": 0.02947998046875, "learning_rate": 5.812743823146944e-07, "loss": 0.0012, "reward": 1.8110978603363037, "reward_std": 0.08126482553780079, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8150041103363037, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 192.82421875, "epoch": 0.4192395190120247, "grad_norm": 0.42511630058288574, "kl": 0.04052734375, "learning_rate": 5.806241872561768e-07, "loss": 0.0016, "reward": 1.7748700976371765, "reward_std": 0.10736145451664925, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7904950380325317, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 181.7578125, "epoch": 0.4198895027624309, "grad_norm": 0.3979143798351288, "kl": 0.0284423828125, "learning_rate": 5.799739921976593e-07, "loss": 0.0011, "reward": 1.8191203474998474, "reward_std": 0.11708731949329376, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8347453773021698, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 199.6015625, "epoch": 0.4205394865128372, "grad_norm": 0.43127700686454773, "kl": 0.0386962890625, "learning_rate": 5.793237971391417e-07, "loss": 0.0016, "reward": 1.75886869430542, "reward_std": 0.1357279121875763, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7862124741077423, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 176.2421875, "epoch": 0.42118947026324344, "grad_norm": 0.39804238080978394, "kl": 0.03076171875, "learning_rate": 5.786736020806242e-07, "loss": 0.0012, "reward": 1.82747483253479, "reward_std": 0.09723668918013573, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8430998921394348, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 184.09765625, "epoch": 0.42183945401364964, "grad_norm": 0.4100029468536377, "kl": 0.0335693359375, "learning_rate": 5.780234070221066e-07, "loss": 0.0013, "reward": 1.813113808631897, "reward_std": 0.11340664699673653, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8365513682365417, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 188.2734375, "epoch": 0.4224894377640559, "grad_norm": 0.43602877855300903, "kl": 0.03424072265625, "learning_rate": 5.77373211963589e-07, "loss": 0.0014, "reward": 1.7378233075141907, "reward_std": 0.12867463007569313, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7612608373165131, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 190.1796875, "epoch": 0.42313942151446216, "grad_norm": 0.4371906518936157, "kl": 0.0289306640625, "learning_rate": 5.767230169050715e-07, "loss": 0.0012, "reward": 1.7418895363807678, "reward_std": 0.11755868047475815, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.765326976776123, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 190.0078125, "epoch": 0.42378940526486836, "grad_norm": 0.41409072279930115, "kl": 0.03472900390625, "learning_rate": 5.760728218465539e-07, "loss": 0.0014, "reward": 1.8003789186477661, "reward_std": 0.12069586291909218, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8238165080547333, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 184.6875, "epoch": 0.4244393890152746, "grad_norm": 0.4156893193721771, "kl": 0.0352783203125, "learning_rate": 5.754226267880364e-07, "loss": 0.0014, "reward": 1.722606897354126, "reward_std": 0.1075347363948822, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7421380877494812, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 191.03515625, "epoch": 0.4250893727656809, "grad_norm": 0.39687320590019226, "kl": 0.02825927734375, "learning_rate": 5.747724317295188e-07, "loss": 0.0011, "reward": 1.8605526089668274, "reward_std": 0.08756935596466064, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8683651685714722, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 183.50390625, "epoch": 0.4257393565160871, "grad_norm": 0.656285285949707, "kl": 0.0308837890625, "learning_rate": 5.741222366710012e-07, "loss": 0.0012, "reward": 1.78287672996521, "reward_std": 0.0888153724372387, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7985017597675323, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 184.734375, "epoch": 0.42638934026649333, "grad_norm": 0.5374554991722107, "kl": 0.0396728515625, "learning_rate": 5.734720416124837e-07, "loss": 0.0016, "reward": 1.79817533493042, "reward_std": 0.09665170684456825, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8059878349304199, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 193.0234375, "epoch": 0.4270393240168996, "grad_norm": 0.37231624126434326, "kl": 0.0264892578125, "learning_rate": 5.728218465539661e-07, "loss": 0.0011, "reward": 1.7746374011039734, "reward_std": 0.08232136443257332, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7824499011039734, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 187.94140625, "epoch": 0.4276893077673058, "grad_norm": 0.43339264392852783, "kl": 0.044921875, "learning_rate": 5.721716514954487e-07, "loss": 0.0018, "reward": 1.7495959997177124, "reward_std": 0.16792897135019302, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7847522497177124, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 191.98046875, "epoch": 0.42833929151771205, "grad_norm": 0.46260371804237366, "kl": 0.0323486328125, "learning_rate": 5.715214564369311e-07, "loss": 0.0013, "reward": 1.798645555973053, "reward_std": 0.10400885343551636, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8181767463684082, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 174.14453125, "epoch": 0.4289892752681183, "grad_norm": 0.3730888068675995, "kl": 0.034423828125, "learning_rate": 5.708712613784135e-07, "loss": 0.0014, "reward": 1.824648916721344, "reward_std": 0.06794697977602482, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.832461416721344, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 180.26171875, "epoch": 0.4296392590185245, "grad_norm": 0.43788111209869385, "kl": 0.038818359375, "learning_rate": 5.70221066319896e-07, "loss": 0.0016, "reward": 1.7679771184921265, "reward_std": 0.08990071341395378, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7836021780967712, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 185.19140625, "epoch": 0.43028924276893077, "grad_norm": 0.45981162786483765, "kl": 0.04443359375, "learning_rate": 5.695708712613784e-07, "loss": 0.0018, "reward": 1.753218650817871, "reward_std": 0.11090259253978729, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7610311508178711, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 174.625, "epoch": 0.430939226519337, "grad_norm": 1.0561856031417847, "kl": 0.0452880859375, "learning_rate": 5.689206762028609e-07, "loss": 0.0018, "reward": 1.7415293455123901, "reward_std": 0.17437484115362167, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7766855955123901, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 191.59375, "epoch": 0.4315892102697433, "grad_norm": 0.3897829055786133, "kl": 0.02789306640625, "learning_rate": 5.682704811443433e-07, "loss": 0.0011, "reward": 1.7827918529510498, "reward_std": 0.07037868350744247, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7906043827533722, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 172.921875, "epoch": 0.4322391940201495, "grad_norm": 0.5456742644309998, "kl": 0.032470703125, "learning_rate": 5.676202860858257e-07, "loss": 0.0013, "reward": 1.7813442945480347, "reward_std": 0.09390059858560562, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7891567945480347, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 185.63671875, "epoch": 0.43288917777055574, "grad_norm": 0.3890838623046875, "kl": 0.0380859375, "learning_rate": 5.669700910273082e-07, "loss": 0.0015, "reward": 1.8207110166549683, "reward_std": 0.07851742953062057, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8285236358642578, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 184.375, "epoch": 0.433539161520962, "grad_norm": 0.38463377952575684, "kl": 0.0321044921875, "learning_rate": 5.663198959687906e-07, "loss": 0.0013, "reward": 1.8141263127326965, "reward_std": 0.11648676916956902, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8297513127326965, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 181.79296875, "epoch": 0.4341891452713682, "grad_norm": 0.37491053342819214, "kl": 0.03515625, "learning_rate": 5.656697009102731e-07, "loss": 0.0014, "reward": 1.789654016494751, "reward_std": 0.06900523602962494, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.797466516494751, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 165.703125, "epoch": 0.43483912902177446, "grad_norm": 0.43290698528289795, "kl": 0.0364990234375, "learning_rate": 5.650195058517555e-07, "loss": 0.0015, "reward": 1.8181244730949402, "reward_std": 0.05400357209146023, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.818124532699585, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 183.640625, "epoch": 0.4354891127721807, "grad_norm": 0.3804187476634979, "kl": 0.0360107421875, "learning_rate": 5.643693107932379e-07, "loss": 0.0014, "reward": 1.7674435377120972, "reward_std": 0.06189293786883354, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7713497877120972, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 165.9296875, "epoch": 0.4361390965225869, "grad_norm": 0.4212367832660675, "kl": 0.0355224609375, "learning_rate": 5.637191157347204e-07, "loss": 0.0014, "reward": 1.8254013061523438, "reward_std": 0.10329882800579071, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8371200561523438, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 185.06640625, "epoch": 0.4367890802729932, "grad_norm": 0.3907769024372101, "kl": 0.0341796875, "learning_rate": 5.630689206762028e-07, "loss": 0.0014, "reward": 1.810665249824524, "reward_std": 0.10492907464504242, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8262903094291687, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 189.80078125, "epoch": 0.43743906402339944, "grad_norm": 0.4678020775318146, "kl": 0.0469970703125, "learning_rate": 5.624187256176853e-07, "loss": 0.0019, "reward": 1.798274278640747, "reward_std": 0.1696677878499031, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8334305286407471, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 186.296875, "epoch": 0.43808904777380564, "grad_norm": 0.3897817134857178, "kl": 0.0343017578125, "learning_rate": 5.617685305591677e-07, "loss": 0.0014, "reward": 1.81245756149292, "reward_std": 0.08514690771698952, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8241762816905975, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 168.84375, "epoch": 0.4387390315242119, "grad_norm": 0.40643244981765747, "kl": 0.03509521484375, "learning_rate": 5.611183355006501e-07, "loss": 0.0014, "reward": 1.7885099649429321, "reward_std": 0.0642305426299572, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8002287149429321, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 177.69140625, "epoch": 0.43938901527461816, "grad_norm": 0.4496050477027893, "kl": 0.036865234375, "learning_rate": 5.604681404421327e-07, "loss": 0.0015, "reward": 1.7711995840072632, "reward_std": 0.11118980869650841, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7907307744026184, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 179.8984375, "epoch": 0.44003899902502436, "grad_norm": 0.44096797704696655, "kl": 0.0323486328125, "learning_rate": 5.598179453836151e-07, "loss": 0.0013, "reward": 1.7945879697799683, "reward_std": 0.12220678478479385, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8258379995822906, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 170.5234375, "epoch": 0.4406889827754306, "grad_norm": 0.5105856657028198, "kl": 0.041259765625, "learning_rate": 5.591677503250976e-07, "loss": 0.0017, "reward": 1.8186424374580383, "reward_std": 0.11485068127512932, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8342675268650055, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 187.41015625, "epoch": 0.4413389665258369, "grad_norm": 0.43609368801116943, "kl": 0.0391845703125, "learning_rate": 5.5851755526658e-07, "loss": 0.0016, "reward": 1.7789846658706665, "reward_std": 0.10046631097793579, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.802422046661377, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 186.09765625, "epoch": 0.4419889502762431, "grad_norm": 0.4097209870815277, "kl": 0.0333251953125, "learning_rate": 5.578673602080624e-07, "loss": 0.0013, "reward": 1.8040542006492615, "reward_std": 0.10377581417560577, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8157728910446167, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 173.23046875, "epoch": 0.44263893402664933, "grad_norm": 0.4958920180797577, "kl": 0.033447265625, "learning_rate": 5.572171651495449e-07, "loss": 0.0013, "reward": 1.7736340165138245, "reward_std": 0.09004256874322891, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7853527665138245, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 166.48046875, "epoch": 0.4432889177770556, "grad_norm": 0.3667526841163635, "kl": 0.040283203125, "learning_rate": 5.565669700910273e-07, "loss": 0.0016, "reward": 1.8219077587127686, "reward_std": 0.07470732927322388, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8297202587127686, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 188.19921875, "epoch": 0.4439389015274618, "grad_norm": 0.3437153100967407, "kl": 0.0286865234375, "learning_rate": 5.559167750325098e-07, "loss": 0.0011, "reward": 1.772543728351593, "reward_std": 0.08301141113042831, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7842625379562378, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 184.828125, "epoch": 0.44458888527786805, "grad_norm": 0.9424434900283813, "kl": 0.04052734375, "learning_rate": 5.552665799739922e-07, "loss": 0.0016, "reward": 1.789855420589447, "reward_std": 0.1255950666964054, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8093866407871246, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 193.125, "epoch": 0.4452388690282743, "grad_norm": 0.4355522692203522, "kl": 0.03369140625, "learning_rate": 5.546163849154746e-07, "loss": 0.0014, "reward": 1.7781206369400024, "reward_std": 0.10357507690787315, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7937457263469696, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 192.8515625, "epoch": 0.4458888527786805, "grad_norm": 0.4501170814037323, "kl": 0.0338134765625, "learning_rate": 5.539661898569571e-07, "loss": 0.0014, "reward": 1.7404855489730835, "reward_std": 0.12916814535856247, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7639230191707611, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 181.90234375, "epoch": 0.44653883652908677, "grad_norm": 0.38672909140586853, "kl": 0.028076171875, "learning_rate": 5.533159947984395e-07, "loss": 0.0011, "reward": 1.8021849393844604, "reward_std": 0.09314711019396782, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8178098797798157, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 187.3359375, "epoch": 0.447188820279493, "grad_norm": 0.4371621310710907, "kl": 0.030029296875, "learning_rate": 5.52665799739922e-07, "loss": 0.0012, "reward": 1.801535725593567, "reward_std": 0.10740634053945541, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8249732255935669, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 187.828125, "epoch": 0.44783880402989923, "grad_norm": 0.41239625215530396, "kl": 0.03131103515625, "learning_rate": 5.520156046814044e-07, "loss": 0.0013, "reward": 1.781527578830719, "reward_std": 0.09534148126840591, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.797152578830719, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 186.09765625, "epoch": 0.4484887877803055, "grad_norm": 0.3743458092212677, "kl": 0.02777099609375, "learning_rate": 5.513654096228868e-07, "loss": 0.0011, "reward": 1.809461534023285, "reward_std": 0.08497010543942451, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8211802840232849, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 185.92578125, "epoch": 0.44913877153071174, "grad_norm": 1.3368453979492188, "kl": 0.034423828125, "learning_rate": 5.507152145643693e-07, "loss": 0.0014, "reward": 1.7670636773109436, "reward_std": 0.10577571019530296, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7905011773109436, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 201.35546875, "epoch": 0.44978875528111795, "grad_norm": 0.4875413775444031, "kl": 0.034423828125, "learning_rate": 5.500650195058517e-07, "loss": 0.0014, "reward": 1.8351286053657532, "reward_std": 0.07247317582368851, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8429411351680756, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 198.50390625, "epoch": 0.4504387390315242, "grad_norm": 0.42948368191719055, "kl": 0.0301513671875, "learning_rate": 5.494148244473343e-07, "loss": 0.0012, "reward": 1.77531099319458, "reward_std": 0.09428054094314575, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7909359931945801, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 188.24609375, "epoch": 0.45108872278193046, "grad_norm": 0.414645791053772, "kl": 0.030029296875, "learning_rate": 5.487646293888167e-07, "loss": 0.0012, "reward": 1.788496732711792, "reward_std": 0.08141731843352318, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7963091731071472, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 183.1484375, "epoch": 0.45173870653233666, "grad_norm": 0.437346488237381, "kl": 0.0389404296875, "learning_rate": 5.481144343302991e-07, "loss": 0.0016, "reward": 1.8181734085083008, "reward_std": 0.13981790840625763, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8455171585083008, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 207.015625, "epoch": 0.4523886902827429, "grad_norm": 0.39240992069244385, "kl": 0.0355224609375, "learning_rate": 5.474642392717816e-07, "loss": 0.0014, "reward": 1.7969433069229126, "reward_std": 0.0853295624256134, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8125683069229126, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 189.68359375, "epoch": 0.4530386740331492, "grad_norm": 0.3683679699897766, "kl": 0.0301513671875, "learning_rate": 5.46814044213264e-07, "loss": 0.0012, "reward": 1.8198460340499878, "reward_std": 0.07982643321156502, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8315647840499878, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 194.96484375, "epoch": 0.45368865778355544, "grad_norm": 0.5823833346366882, "kl": 0.0372314453125, "learning_rate": 5.461638491547465e-07, "loss": 0.0015, "reward": 1.7453318238258362, "reward_std": 0.13662114366889, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7609568238258362, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 199.0703125, "epoch": 0.45433864153396164, "grad_norm": 0.37419119477272034, "kl": 0.033935546875, "learning_rate": 5.455136540962289e-07, "loss": 0.0014, "reward": 1.7875471115112305, "reward_std": 0.13337340205907822, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8148908615112305, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 199.15234375, "epoch": 0.4549886252843679, "grad_norm": 0.33663782477378845, "kl": 0.03448486328125, "learning_rate": 5.448634590377113e-07, "loss": 0.0014, "reward": 1.819895625114441, "reward_std": 0.12702568992972374, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8511456549167633, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 205.72265625, "epoch": 0.45563860903477416, "grad_norm": 0.43700656294822693, "kl": 0.034912109375, "learning_rate": 5.442132639791938e-07, "loss": 0.0014, "reward": 1.7396800518035889, "reward_std": 0.11489680036902428, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7592113316059113, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 185.5625, "epoch": 0.45628859278518036, "grad_norm": 0.46789661049842834, "kl": 0.0382080078125, "learning_rate": 5.435630689206762e-07, "loss": 0.0015, "reward": 1.7536394596099854, "reward_std": 0.13020119071006775, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7731706500053406, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 182.08203125, "epoch": 0.4569385765355866, "grad_norm": 0.42665326595306396, "kl": 0.03143310546875, "learning_rate": 5.429128738621587e-07, "loss": 0.0013, "reward": 1.8557876348495483, "reward_std": 0.05472888424992561, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8557876348495483, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 198.46484375, "epoch": 0.4575885602859929, "grad_norm": 0.45513585209846497, "kl": 0.03106689453125, "learning_rate": 5.42262678803641e-07, "loss": 0.0012, "reward": 1.776524543762207, "reward_std": 0.05753389559686184, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7765245139598846, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 202.57421875, "epoch": 0.4582385440363991, "grad_norm": 0.49195870757102966, "kl": 0.040283203125, "learning_rate": 5.416124837451234e-07, "loss": 0.0016, "reward": 1.7418715953826904, "reward_std": 0.19771930575370789, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.78484046459198, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 207.63671875, "epoch": 0.45888852778680533, "grad_norm": 0.4061947464942932, "kl": 0.0303955078125, "learning_rate": 5.40962288686606e-07, "loss": 0.0012, "reward": 1.7771041989326477, "reward_std": 0.1207166463136673, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8044479489326477, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 208.9140625, "epoch": 0.4595385115372116, "grad_norm": 0.4704895317554474, "kl": 0.036865234375, "learning_rate": 5.403120936280883e-07, "loss": 0.0015, "reward": 1.7334842681884766, "reward_std": 0.17029112577438354, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7686404287815094, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 213.01953125, "epoch": 0.4601884952876178, "grad_norm": 0.4332941174507141, "kl": 0.028076171875, "learning_rate": 5.396618985695708e-07, "loss": 0.0011, "reward": 1.749272346496582, "reward_std": 0.10064338333904743, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.776616096496582, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 201.828125, "epoch": 0.46083847903802405, "grad_norm": 0.4441506564617157, "kl": 0.03173828125, "learning_rate": 5.390117035110532e-07, "loss": 0.0013, "reward": 1.7835718393325806, "reward_std": 0.10332150384783745, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.799196720123291, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 204.0625, "epoch": 0.4614884627884303, "grad_norm": 0.6170893907546997, "kl": 0.029541015625, "learning_rate": 5.383615084525357e-07, "loss": 0.0012, "reward": 1.7738268375396729, "reward_std": 0.08562919497489929, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7855455577373505, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 191.19140625, "epoch": 0.4621384465388365, "grad_norm": 0.48940160870552063, "kl": 0.0335693359375, "learning_rate": 5.377113133940182e-07, "loss": 0.0013, "reward": 1.7937355637550354, "reward_std": 0.09009999781847, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8054543733596802, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 196.05859375, "epoch": 0.46278843028924277, "grad_norm": 0.4708462655544281, "kl": 0.0306396484375, "learning_rate": 5.370611183355006e-07, "loss": 0.0012, "reward": 1.7914795875549316, "reward_std": 0.15188492089509964, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8266358375549316, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 207.65625, "epoch": 0.463438414039649, "grad_norm": 0.45880046486854553, "kl": 0.0357666015625, "learning_rate": 5.364109232769831e-07, "loss": 0.0014, "reward": 1.7804071307182312, "reward_std": 0.10580289736390114, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7960321307182312, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 193.44140625, "epoch": 0.46408839779005523, "grad_norm": 0.37791362404823303, "kl": 0.0291748046875, "learning_rate": 5.357607282184655e-07, "loss": 0.0012, "reward": 1.8256163597106934, "reward_std": 0.076287180185318, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.829522579908371, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 199.35546875, "epoch": 0.4647383815404615, "grad_norm": 0.4969753324985504, "kl": 0.03564453125, "learning_rate": 5.351105331599479e-07, "loss": 0.0014, "reward": 1.7350137829780579, "reward_std": 0.13509593904018402, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7623574435710907, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 190.34375, "epoch": 0.46538836529086774, "grad_norm": 0.5246131420135498, "kl": 0.0369873046875, "learning_rate": 5.344603381014304e-07, "loss": 0.0015, "reward": 1.7920023798942566, "reward_std": 0.11344422772526741, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8076273798942566, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 191.62890625, "epoch": 0.46603834904127395, "grad_norm": 0.5108781456947327, "kl": 0.0361328125, "learning_rate": 5.338101430429128e-07, "loss": 0.0014, "reward": 1.7567957043647766, "reward_std": 0.13274191319942474, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7763269543647766, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 193.55078125, "epoch": 0.4666883327916802, "grad_norm": 0.8034334778785706, "kl": 0.0335693359375, "learning_rate": 5.331599479843953e-07, "loss": 0.0013, "reward": 1.7584998607635498, "reward_std": 0.08890243247151375, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7663123905658722, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 200.15625, "epoch": 0.46733831654208646, "grad_norm": 0.40912944078445435, "kl": 0.02978515625, "learning_rate": 5.325097529258777e-07, "loss": 0.0012, "reward": 1.7998226881027222, "reward_std": 0.09063509479165077, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8154476881027222, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 191.16015625, "epoch": 0.46798830029249266, "grad_norm": 0.6359648704528809, "kl": 0.037353515625, "learning_rate": 5.318595578673601e-07, "loss": 0.0015, "reward": 1.7856976389884949, "reward_std": 0.14153042435646057, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8130413889884949, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 198.7265625, "epoch": 0.4686382840428989, "grad_norm": 0.35709360241889954, "kl": 0.0367431640625, "learning_rate": 5.312093628088426e-07, "loss": 0.0015, "reward": 1.802691102027893, "reward_std": 0.09020049124956131, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8144098818302155, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 191.4375, "epoch": 0.4692882677933052, "grad_norm": 0.4605138897895813, "kl": 0.035400390625, "learning_rate": 5.30559167750325e-07, "loss": 0.0014, "reward": 1.8186166286468506, "reward_std": 0.08879189938306808, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8264291286468506, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 182.8828125, "epoch": 0.4699382515437114, "grad_norm": 0.4312344193458557, "kl": 0.0322265625, "learning_rate": 5.299089726918075e-07, "loss": 0.0013, "reward": 1.829098880290985, "reward_std": 0.06559118628501892, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8290988504886627, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 178.67578125, "epoch": 0.47058823529411764, "grad_norm": 0.5255938768386841, "kl": 0.03948974609375, "learning_rate": 5.292587776332899e-07, "loss": 0.0016, "reward": 1.78421288728714, "reward_std": 0.10820337384939194, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8037440478801727, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 191.34765625, "epoch": 0.4712382190445239, "grad_norm": 0.3842274844646454, "kl": 0.0308837890625, "learning_rate": 5.286085825747723e-07, "loss": 0.0012, "reward": 1.8379690051078796, "reward_std": 0.07157861813902855, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.841875284910202, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 183.65234375, "epoch": 0.4718882027949301, "grad_norm": 0.40556764602661133, "kl": 0.0419921875, "learning_rate": 5.279583875162548e-07, "loss": 0.0017, "reward": 1.8040605187416077, "reward_std": 0.09096940979361534, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8196854591369629, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 183.73828125, "epoch": 0.47253818654533636, "grad_norm": 0.3420720100402832, "kl": 0.02996826171875, "learning_rate": 5.273081924577373e-07, "loss": 0.0012, "reward": 1.7857879400253296, "reward_std": 0.06530994549393654, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7975066900253296, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 201.0546875, "epoch": 0.4731881702957426, "grad_norm": 0.45435911417007446, "kl": 0.0338134765625, "learning_rate": 5.266579973992198e-07, "loss": 0.0014, "reward": 1.7930557131767273, "reward_std": 0.09723220020532608, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8047744333744049, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 192.078125, "epoch": 0.4738381540461489, "grad_norm": 0.605760931968689, "kl": 0.02813720703125, "learning_rate": 5.260078023407022e-07, "loss": 0.0011, "reward": 1.8410661220550537, "reward_std": 0.09317188337445259, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8488786518573761, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 195.484375, "epoch": 0.4744881377965551, "grad_norm": 0.501793622970581, "kl": 0.04150390625, "learning_rate": 5.253576072821846e-07, "loss": 0.0017, "reward": 1.7546801567077637, "reward_std": 0.11522606760263443, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7781175971031189, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 199.21484375, "epoch": 0.47513812154696133, "grad_norm": 0.37758669257164, "kl": 0.03021240234375, "learning_rate": 5.247074122236671e-07, "loss": 0.0012, "reward": 1.808888852596283, "reward_std": 0.06080776825547218, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8127951323986053, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 197.83984375, "epoch": 0.4757881052973676, "grad_norm": 0.38624274730682373, "kl": 0.0350341796875, "learning_rate": 5.240572171651495e-07, "loss": 0.0014, "reward": 1.7581818103790283, "reward_std": 0.14598389714956284, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7855255305767059, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 190.34375, "epoch": 0.4764380890477738, "grad_norm": 0.42932671308517456, "kl": 0.03289794921875, "learning_rate": 5.23407022106632e-07, "loss": 0.0013, "reward": 1.7458367943763733, "reward_std": 0.12773098051548004, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7692742943763733, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 191.2265625, "epoch": 0.47708807279818005, "grad_norm": 0.4823538661003113, "kl": 0.0284423828125, "learning_rate": 5.227568270481144e-07, "loss": 0.0011, "reward": 1.7779291272163391, "reward_std": 0.12130644917488098, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7935541272163391, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 182.41796875, "epoch": 0.4777380565485863, "grad_norm": 0.44385284185409546, "kl": 0.035888671875, "learning_rate": 5.221066319895968e-07, "loss": 0.0014, "reward": 1.7627350091934204, "reward_std": 0.1637931764125824, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7900786697864532, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 191.4453125, "epoch": 0.4783880402989925, "grad_norm": 0.47595226764678955, "kl": 0.03369140625, "learning_rate": 5.214564369310793e-07, "loss": 0.0013, "reward": 1.7253245115280151, "reward_std": 0.1419242024421692, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7487620115280151, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 190.91796875, "epoch": 0.47903802404939877, "grad_norm": 0.5264828205108643, "kl": 0.0362548828125, "learning_rate": 5.208062418725617e-07, "loss": 0.0015, "reward": 1.7767434120178223, "reward_std": 0.17276835441589355, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8079934120178223, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 207.66796875, "epoch": 0.479688007799805, "grad_norm": 0.458351194858551, "kl": 0.0340576171875, "learning_rate": 5.201560468140442e-07, "loss": 0.0014, "reward": 1.7576287984848022, "reward_std": 0.12670517712831497, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7771600186824799, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 187.99609375, "epoch": 0.48033799155021123, "grad_norm": 0.4074235260486603, "kl": 0.030517578125, "learning_rate": 5.195058517555266e-07, "loss": 0.0012, "reward": 1.8139995336532593, "reward_std": 0.08450416103005409, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8218120336532593, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 192.52734375, "epoch": 0.4809879753006175, "grad_norm": 0.44142937660217285, "kl": 0.03173828125, "learning_rate": 5.18855656697009e-07, "loss": 0.0013, "reward": 1.7781165838241577, "reward_std": 0.09855614602565765, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7976478636264801, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 198.08203125, "epoch": 0.48163795905102375, "grad_norm": 0.6830851435661316, "kl": 0.0338134765625, "learning_rate": 5.182054616384915e-07, "loss": 0.0014, "reward": 1.7938323020935059, "reward_std": 0.10818858817219734, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8094573020935059, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 195.2890625, "epoch": 0.48228794280142995, "grad_norm": 0.44543731212615967, "kl": 0.0318603515625, "learning_rate": 5.175552665799739e-07, "loss": 0.0013, "reward": 1.7674986124038696, "reward_std": 0.12287720292806625, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.798748642206192, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 210.265625, "epoch": 0.4829379265518362, "grad_norm": 0.43998488783836365, "kl": 0.03369140625, "learning_rate": 5.169050715214564e-07, "loss": 0.0013, "reward": 1.7470692992210388, "reward_std": 0.13834530860185623, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7744131088256836, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 178.265625, "epoch": 0.48358791030224246, "grad_norm": 0.4168734848499298, "kl": 0.03192138671875, "learning_rate": 5.162548764629389e-07, "loss": 0.0013, "reward": 1.7599825859069824, "reward_std": 0.11168651282787323, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.78342005610466, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 190.9375, "epoch": 0.48423789405264867, "grad_norm": 0.43452364206314087, "kl": 0.0335693359375, "learning_rate": 5.156046814044213e-07, "loss": 0.0013, "reward": 1.753530204296112, "reward_std": 0.12966421246528625, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7769677042961121, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 176.09375, "epoch": 0.4848878778030549, "grad_norm": 0.6088820695877075, "kl": 0.03857421875, "learning_rate": 5.149544863459038e-07, "loss": 0.0015, "reward": 1.787908136844635, "reward_std": 0.0944662019610405, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.799626886844635, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 191.23828125, "epoch": 0.4855378615534612, "grad_norm": 0.39156925678253174, "kl": 0.02886962890625, "learning_rate": 5.143042912873862e-07, "loss": 0.0012, "reward": 1.7893466353416443, "reward_std": 0.12079891748726368, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8166904747486115, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 191.51953125, "epoch": 0.4861878453038674, "grad_norm": 0.4276871085166931, "kl": 0.0335693359375, "learning_rate": 5.136540962288687e-07, "loss": 0.0013, "reward": 1.7709150314331055, "reward_std": 0.12519196793437004, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7943525910377502, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 197.546875, "epoch": 0.48683782905427364, "grad_norm": 0.403310090303421, "kl": 0.0311279296875, "learning_rate": 5.130039011703511e-07, "loss": 0.0012, "reward": 1.7932199239730835, "reward_std": 0.0778467245399952, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8010323643684387, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 192.12890625, "epoch": 0.4874878128046799, "grad_norm": 0.4896431863307953, "kl": 0.037353515625, "learning_rate": 5.123537061118335e-07, "loss": 0.0015, "reward": 1.7542850971221924, "reward_std": 0.1426181197166443, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7738163471221924, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 202.81640625, "epoch": 0.4881377965550861, "grad_norm": 0.44104674458503723, "kl": 0.03570556640625, "learning_rate": 5.11703511053316e-07, "loss": 0.0014, "reward": 1.7676652669906616, "reward_std": 0.11740065738558769, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7911027371883392, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 192.546875, "epoch": 0.48878778030549236, "grad_norm": 0.3422824442386627, "kl": 0.03515625, "learning_rate": 5.110533159947984e-07, "loss": 0.0014, "reward": 1.8127375841140747, "reward_std": 0.0983527172356844, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8283625841140747, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 176.515625, "epoch": 0.4894377640558986, "grad_norm": 0.9507759213447571, "kl": 0.039794921875, "learning_rate": 5.104031209362809e-07, "loss": 0.0016, "reward": 1.7745216488838196, "reward_std": 0.11625125259160995, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7979592084884644, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 196.51171875, "epoch": 0.4900877478063048, "grad_norm": 0.4047873616218567, "kl": 0.02947998046875, "learning_rate": 5.097529258777633e-07, "loss": 0.0012, "reward": 1.8124966025352478, "reward_std": 0.08897675573825836, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.824215292930603, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 182.03515625, "epoch": 0.4907377315567111, "grad_norm": 0.4451761841773987, "kl": 0.03125, "learning_rate": 5.091027308192457e-07, "loss": 0.0013, "reward": 1.8204562067985535, "reward_std": 0.12541137635707855, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8399874866008759, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 191.29296875, "epoch": 0.49138771530711733, "grad_norm": 0.40858420729637146, "kl": 0.0301513671875, "learning_rate": 5.084525357607282e-07, "loss": 0.0012, "reward": 1.8070401549339294, "reward_std": 0.09011008590459824, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8187589347362518, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 189.3515625, "epoch": 0.49203769905752354, "grad_norm": 0.38527098298072815, "kl": 0.0380859375, "learning_rate": 5.078023407022106e-07, "loss": 0.0015, "reward": 1.8201762437820435, "reward_std": 0.057266585528850555, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8240824639797211, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 180.26953125, "epoch": 0.4926876828079298, "grad_norm": 0.40203002095222473, "kl": 0.0360107421875, "learning_rate": 5.071521456436931e-07, "loss": 0.0014, "reward": 1.7776890397071838, "reward_std": 0.09800457954406738, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7894078195095062, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 206.3359375, "epoch": 0.49333766655833605, "grad_norm": 0.43389371037483215, "kl": 0.0286865234375, "learning_rate": 5.065019505851755e-07, "loss": 0.0011, "reward": 1.7800634503364563, "reward_std": 0.07454444281756878, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7839696705341339, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 179.7265625, "epoch": 0.49398765030874225, "grad_norm": 0.4089887738227844, "kl": 0.031005859375, "learning_rate": 5.058517555266579e-07, "loss": 0.0012, "reward": 1.8366400003433228, "reward_std": 0.07626937702298164, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8444525599479675, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 177.12109375, "epoch": 0.4946376340591485, "grad_norm": 0.39336130023002625, "kl": 0.0318603515625, "learning_rate": 5.052015604681405e-07, "loss": 0.0013, "reward": 1.834023118019104, "reward_std": 0.08650090731680393, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8418355584144592, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 186.71484375, "epoch": 0.49528761780955477, "grad_norm": 0.394202321767807, "kl": 0.03289794921875, "learning_rate": 5.045513654096229e-07, "loss": 0.0013, "reward": 1.800493836402893, "reward_std": 0.12567004561424255, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8239313066005707, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 188.51953125, "epoch": 0.495937601559961, "grad_norm": 0.5658450126647949, "kl": 0.0606689453125, "learning_rate": 5.039011703511054e-07, "loss": 0.0024, "reward": 1.7642220854759216, "reward_std": 0.1243927963078022, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7876595556735992, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 181.92578125, "epoch": 0.49658758531036723, "grad_norm": 0.44364094734191895, "kl": 0.036865234375, "learning_rate": 5.032509752925878e-07, "loss": 0.0015, "reward": 1.8178138136863708, "reward_std": 0.10705358535051346, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8295325636863708, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 191.56640625, "epoch": 0.4972375690607735, "grad_norm": 0.4580462872982025, "kl": 0.03656005859375, "learning_rate": 5.026007802340702e-07, "loss": 0.0015, "reward": 1.7499244809150696, "reward_std": 0.07876642048358917, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7577369809150696, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 176.046875, "epoch": 0.49788755281117975, "grad_norm": 0.44478335976600647, "kl": 0.030517578125, "learning_rate": 5.019505851755527e-07, "loss": 0.0012, "reward": 1.8370006084442139, "reward_std": 0.09219951182603836, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8448129892349243, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 179.53125, "epoch": 0.49853753656158595, "grad_norm": 0.3839004635810852, "kl": 0.0386962890625, "learning_rate": 5.013003901170351e-07, "loss": 0.0015, "reward": 1.8143697381019592, "reward_std": 0.08122451789677143, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8260884881019592, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 169.5234375, "epoch": 0.4991875203119922, "grad_norm": 0.5298296809196472, "kl": 0.036865234375, "learning_rate": 5.006501950585176e-07, "loss": 0.0015, "reward": 1.7670984268188477, "reward_std": 0.19300183653831482, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7983484864234924, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 181.1953125, "epoch": 0.49983750406239846, "grad_norm": 0.3684612810611725, "kl": 0.0364990234375, "learning_rate": 5e-07, "loss": 0.0015, "reward": 1.7727476358413696, "reward_std": 0.10080211609601974, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7883727550506592, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 182.07421875, "epoch": 0.5004874878128047, "grad_norm": 0.5706169605255127, "kl": 0.036865234375, "learning_rate": 4.993498049414825e-07, "loss": 0.0015, "reward": 1.7459184527397156, "reward_std": 0.1347079500555992, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7732621431350708, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 193.73828125, "epoch": 0.5011374715632109, "grad_norm": 0.40800032019615173, "kl": 0.0318603515625, "learning_rate": 4.986996098829649e-07, "loss": 0.0013, "reward": 1.8008939027786255, "reward_std": 0.08538675680756569, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8087064921855927, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 177.453125, "epoch": 0.5017874553136171, "grad_norm": 0.3692110478878021, "kl": 0.03118896484375, "learning_rate": 4.980494148244473e-07, "loss": 0.0012, "reward": 1.811791181564331, "reward_std": 0.07379375211894512, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8235099613666534, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 187.83203125, "epoch": 0.5024374390640234, "grad_norm": 0.377994567155838, "kl": 0.03369140625, "learning_rate": 4.973992197659298e-07, "loss": 0.0013, "reward": 1.782192349433899, "reward_std": 0.0952906385064125, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7978173494338989, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 173.7421875, "epoch": 0.5030874228144296, "grad_norm": 0.3907105326652527, "kl": 0.03155517578125, "learning_rate": 4.967490247074122e-07, "loss": 0.0013, "reward": 1.8221325278282166, "reward_std": 0.08733223751187325, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8338513374328613, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 172.33984375, "epoch": 0.5037374065648359, "grad_norm": 0.4013062119483948, "kl": 0.03167724609375, "learning_rate": 4.960988296488947e-07, "loss": 0.0013, "reward": 1.7874236106872559, "reward_std": 0.09326309710741043, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7991423904895782, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 195.16015625, "epoch": 0.5043873903152422, "grad_norm": 0.39040979743003845, "kl": 0.02923583984375, "learning_rate": 4.954486345903771e-07, "loss": 0.0012, "reward": 1.7927119731903076, "reward_std": 0.07085063867270947, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.80052450299263, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 175.57421875, "epoch": 0.5050373740656483, "grad_norm": 0.4325307011604309, "kl": 0.0408935546875, "learning_rate": 4.947984395318595e-07, "loss": 0.0016, "reward": 1.7368260622024536, "reward_std": 0.08935992792248726, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7485447824001312, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 177.31640625, "epoch": 0.5056873578160546, "grad_norm": 0.48696234822273254, "kl": 0.02740478515625, "learning_rate": 4.94148244473342e-07, "loss": 0.0011, "reward": 1.8130318522453308, "reward_std": 0.09306906908750534, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.828656792640686, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 202.13671875, "epoch": 0.5063373415664608, "grad_norm": 0.6446750164031982, "kl": 0.0447998046875, "learning_rate": 4.934980494148245e-07, "loss": 0.0018, "reward": 1.70381361246109, "reward_std": 0.12258860841393471, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7233448624610901, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 179.37890625, "epoch": 0.5069873253168671, "grad_norm": 0.44464245438575745, "kl": 0.03375244140625, "learning_rate": 4.928478543563069e-07, "loss": 0.0014, "reward": 1.853852391242981, "reward_std": 0.07496988773345947, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8655711710453033, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 187.49609375, "epoch": 0.5076373090672733, "grad_norm": 0.42208102345466614, "kl": 0.03131103515625, "learning_rate": 4.921976592977894e-07, "loss": 0.0012, "reward": 1.7745885252952576, "reward_std": 0.0775836780667305, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.77849480509758, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 189.37109375, "epoch": 0.5082872928176796, "grad_norm": 0.40344417095184326, "kl": 0.0313720703125, "learning_rate": 4.915474642392718e-07, "loss": 0.0013, "reward": 1.7803438901901245, "reward_std": 0.11007314175367355, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7959688603878021, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 178.0625, "epoch": 0.5089372765680859, "grad_norm": 0.3698311448097229, "kl": 0.0263671875, "learning_rate": 4.908972691807542e-07, "loss": 0.0011, "reward": 1.8048125505447388, "reward_std": 0.08178141713142395, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8204375207424164, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 174.9765625, "epoch": 0.509587260318492, "grad_norm": 0.43960344791412354, "kl": 0.03662109375, "learning_rate": 4.902470741222367e-07, "loss": 0.0015, "reward": 1.7821239829063416, "reward_std": 0.08387724682688713, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7899364531040192, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 174.06640625, "epoch": 0.5102372440688983, "grad_norm": 0.43475964665412903, "kl": 0.0369873046875, "learning_rate": 4.895968790637191e-07, "loss": 0.0015, "reward": 1.756967544555664, "reward_std": 0.09554729983210564, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7764987945556641, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 178.50390625, "epoch": 0.5108872278193045, "grad_norm": 1.1539459228515625, "kl": 0.0364990234375, "learning_rate": 4.889466840052016e-07, "loss": 0.0015, "reward": 1.7933200597763062, "reward_std": 0.11656703427433968, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8050387799739838, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 166.3359375, "epoch": 0.5115372115697108, "grad_norm": 0.48196080327033997, "kl": 0.034912109375, "learning_rate": 4.88296488946684e-07, "loss": 0.0014, "reward": 1.7964471578598022, "reward_std": 0.10629110038280487, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8081658482551575, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 174.734375, "epoch": 0.512187195320117, "grad_norm": 0.4606374204158783, "kl": 0.0367431640625, "learning_rate": 4.876462938881665e-07, "loss": 0.0015, "reward": 1.8363490104675293, "reward_std": 0.1009160578250885, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8519739806652069, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 177.4609375, "epoch": 0.5128371790705233, "grad_norm": 0.4337221384048462, "kl": 0.0352783203125, "learning_rate": 4.869960988296489e-07, "loss": 0.0014, "reward": 1.7778484225273132, "reward_std": 0.12165633961558342, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8051921129226685, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 171.75, "epoch": 0.5134871628209294, "grad_norm": 0.43795281648635864, "kl": 0.02972412109375, "learning_rate": 4.863459037711314e-07, "loss": 0.0012, "reward": 1.796277940273285, "reward_std": 0.10008535534143448, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8119029700756073, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 177.6484375, "epoch": 0.5141371465713357, "grad_norm": 0.4759089946746826, "kl": 0.0350341796875, "learning_rate": 4.856957087126138e-07, "loss": 0.0014, "reward": 1.7542956471443176, "reward_std": 0.12095995619893074, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7777332067489624, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 191.07421875, "epoch": 0.514787130321742, "grad_norm": 0.4244515299797058, "kl": 0.0335693359375, "learning_rate": 4.850455136540961e-07, "loss": 0.0013, "reward": 1.784468412399292, "reward_std": 0.09454088285565376, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7961871922016144, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 172.3671875, "epoch": 0.5154371140721482, "grad_norm": 0.44874250888824463, "kl": 0.04052734375, "learning_rate": 4.843953185955786e-07, "loss": 0.0016, "reward": 1.7759078741073608, "reward_std": 0.14892974495887756, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8071578145027161, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 187.6328125, "epoch": 0.5160870978225545, "grad_norm": 0.40973180532455444, "kl": 0.0355224609375, "learning_rate": 4.83745123537061e-07, "loss": 0.0014, "reward": 1.8146200776100159, "reward_std": 0.07191416248679161, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8185262382030487, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 174.9921875, "epoch": 0.5167370815729607, "grad_norm": 0.4127054512500763, "kl": 0.0389404296875, "learning_rate": 4.830949284785435e-07, "loss": 0.0016, "reward": 1.8037102222442627, "reward_std": 0.10094309598207474, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8193352818489075, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 175.09375, "epoch": 0.5173870653233669, "grad_norm": 0.400357186794281, "kl": 0.038330078125, "learning_rate": 4.82444733420026e-07, "loss": 0.0015, "reward": 1.8034607768058777, "reward_std": 0.09716755896806717, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8151794672012329, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 167.875, "epoch": 0.5180370490737731, "grad_norm": 0.599647045135498, "kl": 0.0296630859375, "learning_rate": 4.817945383615084e-07, "loss": 0.0012, "reward": 1.8471906185150146, "reward_std": 0.061947450041770935, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8550030589103699, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 173.3125, "epoch": 0.5186870328241794, "grad_norm": 0.4086368680000305, "kl": 0.0352783203125, "learning_rate": 4.811443433029908e-07, "loss": 0.0014, "reward": 1.8714032769203186, "reward_std": 0.07820706441998482, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8792158365249634, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 188.13671875, "epoch": 0.5193370165745856, "grad_norm": 0.4458964169025421, "kl": 0.03350830078125, "learning_rate": 4.804941482444733e-07, "loss": 0.0013, "reward": 1.8074030876159668, "reward_std": 0.07826285436749458, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8113093376159668, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 166.265625, "epoch": 0.5199870003249919, "grad_norm": 0.388070285320282, "kl": 0.033935546875, "learning_rate": 4.798439531859557e-07, "loss": 0.0014, "reward": 1.880418062210083, "reward_std": 0.05667422153055668, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8843242824077606, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 167.1171875, "epoch": 0.5206369840753982, "grad_norm": 0.5298600792884827, "kl": 0.0460205078125, "learning_rate": 4.791937581274382e-07, "loss": 0.0018, "reward": 1.7940431833267212, "reward_std": 0.12161290645599365, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.809668093919754, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 177.9296875, "epoch": 0.5212869678258043, "grad_norm": 0.604422390460968, "kl": 0.0362548828125, "learning_rate": 4.785435630689206e-07, "loss": 0.0015, "reward": 1.7380317449569702, "reward_std": 0.09381310269236565, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.741938054561615, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 184.94140625, "epoch": 0.5219369515762106, "grad_norm": 0.526277482509613, "kl": 0.0352783203125, "learning_rate": 4.77893368010403e-07, "loss": 0.0014, "reward": 1.7546991109848022, "reward_std": 0.12199800461530685, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7781366109848022, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 167.4765625, "epoch": 0.5225869353266168, "grad_norm": 0.43110358715057373, "kl": 0.052734375, "learning_rate": 4.772431729518855e-07, "loss": 0.0021, "reward": 1.7781946063041687, "reward_std": 0.0784563459455967, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7860071063041687, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 180.01171875, "epoch": 0.5232369190770231, "grad_norm": 0.5188779234886169, "kl": 0.039794921875, "learning_rate": 4.76592977893368e-07, "loss": 0.0016, "reward": 1.7180629968643188, "reward_std": 0.11238932609558105, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7375942468643188, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 171.96484375, "epoch": 0.5238869028274293, "grad_norm": 0.43658357858657837, "kl": 0.0408935546875, "learning_rate": 4.7594278283485044e-07, "loss": 0.0016, "reward": 1.8117826581001282, "reward_std": 0.11998527497053146, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8313139975070953, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 155.40234375, "epoch": 0.5245368865778356, "grad_norm": 0.42371267080307007, "kl": 0.0433349609375, "learning_rate": 4.7529258777633283e-07, "loss": 0.0017, "reward": 1.7897324562072754, "reward_std": 0.09979479014873505, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.809263676404953, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 178.2421875, "epoch": 0.5251868703282417, "grad_norm": 0.43838346004486084, "kl": 0.0362548828125, "learning_rate": 4.7464239271781533e-07, "loss": 0.0015, "reward": 1.8050849437713623, "reward_std": 0.08639303967356682, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8128974437713623, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 168.9453125, "epoch": 0.525836854078648, "grad_norm": 0.4027659296989441, "kl": 0.040771484375, "learning_rate": 4.739921976592978e-07, "loss": 0.0016, "reward": 1.8445942997932434, "reward_std": 0.08663389459252357, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8602192103862762, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 182.39453125, "epoch": 0.5264868378290543, "grad_norm": 0.457188218832016, "kl": 0.038330078125, "learning_rate": 4.7334200260078023e-07, "loss": 0.0015, "reward": 1.7897332310676575, "reward_std": 0.08165174722671509, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7936395108699799, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 184.01171875, "epoch": 0.5271368215794605, "grad_norm": 0.43208128213882446, "kl": 0.0426025390625, "learning_rate": 4.726918075422627e-07, "loss": 0.0017, "reward": 1.760746955871582, "reward_std": 0.12136922776699066, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.780278205871582, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 175.48046875, "epoch": 0.5277868053298668, "grad_norm": 0.5634702444076538, "kl": 0.045166015625, "learning_rate": 4.720416124837451e-07, "loss": 0.0018, "reward": 1.7854153513908386, "reward_std": 0.09960415214300156, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7971340715885162, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 186.2421875, "epoch": 0.528436789080273, "grad_norm": 0.5092892646789551, "kl": 0.0423583984375, "learning_rate": 4.713914174252275e-07, "loss": 0.0017, "reward": 1.7885926961898804, "reward_std": 0.08025558665394783, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.792498916387558, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 185.05078125, "epoch": 0.5290867728306793, "grad_norm": 0.4379364252090454, "kl": 0.0404052734375, "learning_rate": 4.7074122236671e-07, "loss": 0.0016, "reward": 1.8324421048164368, "reward_std": 0.07992773875594139, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8402545750141144, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 190.0390625, "epoch": 0.5297367565810854, "grad_norm": 0.5056179761886597, "kl": 0.041259765625, "learning_rate": 4.700910273081924e-07, "loss": 0.0016, "reward": 1.8177080750465393, "reward_std": 0.07748821936547756, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8255205452442169, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 173.54296875, "epoch": 0.5303867403314917, "grad_norm": 0.44210124015808105, "kl": 0.0389404296875, "learning_rate": 4.694408322496749e-07, "loss": 0.0016, "reward": 1.7728638052940369, "reward_std": 0.10587649047374725, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7963013052940369, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 164.35546875, "epoch": 0.531036724081898, "grad_norm": 0.3737505376338959, "kl": 0.0390625, "learning_rate": 4.687906371911573e-07, "loss": 0.0016, "reward": 1.8311355113983154, "reward_std": 0.09753788635134697, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8467604517936707, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 190.16796875, "epoch": 0.5316867078323042, "grad_norm": 0.44634753465652466, "kl": 0.03466796875, "learning_rate": 4.6814044213263977e-07, "loss": 0.0014, "reward": 1.7679845094680786, "reward_std": 0.13975835219025612, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7953282296657562, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 182.77734375, "epoch": 0.5323366915827105, "grad_norm": 0.47163626551628113, "kl": 0.0364990234375, "learning_rate": 4.674902470741222e-07, "loss": 0.0015, "reward": 1.7696322202682495, "reward_std": 0.1603064239025116, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8008822500705719, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 177.84765625, "epoch": 0.5329866753331167, "grad_norm": 0.5419964790344238, "kl": 0.038818359375, "learning_rate": 4.6684005201560467e-07, "loss": 0.0016, "reward": 1.803783893585205, "reward_std": 0.12877155095338821, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8194088935852051, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 171.625, "epoch": 0.5336366590835229, "grad_norm": 0.4053913652896881, "kl": 0.0302734375, "learning_rate": 4.661898569570871e-07, "loss": 0.0012, "reward": 1.826511025428772, "reward_std": 0.07272836938500404, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.834323525428772, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 180.15625, "epoch": 0.5342866428339291, "grad_norm": 0.35735857486724854, "kl": 0.0362548828125, "learning_rate": 4.655396618985695e-07, "loss": 0.0015, "reward": 1.8335109949111938, "reward_std": 0.05691806972026825, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8413235247135162, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 188.05078125, "epoch": 0.5349366265843354, "grad_norm": 0.38633832335472107, "kl": 0.04473876953125, "learning_rate": 4.6488946684005196e-07, "loss": 0.0018, "reward": 1.7587759494781494, "reward_std": 0.09727821499109268, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7783071994781494, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 177.62890625, "epoch": 0.5355866103347416, "grad_norm": 0.6463331580162048, "kl": 0.0374755859375, "learning_rate": 4.642392717815344e-07, "loss": 0.0015, "reward": 1.7681392431259155, "reward_std": 0.11362559162080288, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7837641835212708, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 162.94140625, "epoch": 0.5362365940851479, "grad_norm": 0.32172492146492004, "kl": 0.0394287109375, "learning_rate": 4.635890767230169e-07, "loss": 0.0016, "reward": 1.8756299018859863, "reward_std": 0.06105530075728893, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8795361518859863, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 157.37890625, "epoch": 0.5368865778355542, "grad_norm": 0.4367740750312805, "kl": 0.03173828125, "learning_rate": 4.6293888166449936e-07, "loss": 0.0013, "reward": 1.869483470916748, "reward_std": 0.06420052796602249, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8733898401260376, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 181.40625, "epoch": 0.5375365615859603, "grad_norm": 0.4400818347930908, "kl": 0.033203125, "learning_rate": 4.6228868660598176e-07, "loss": 0.0013, "reward": 1.8234564661979675, "reward_std": 0.06960663758218288, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8390815258026123, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 190.859375, "epoch": 0.5381865453363666, "grad_norm": 0.4468178451061249, "kl": 0.03143310546875, "learning_rate": 4.616384915474642e-07, "loss": 0.0013, "reward": 1.830746352672577, "reward_std": 0.06960565969347954, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8307463526725769, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 180.703125, "epoch": 0.5388365290867728, "grad_norm": 0.5033491849899292, "kl": 0.0355224609375, "learning_rate": 4.6098829648894666e-07, "loss": 0.0014, "reward": 1.7839325666427612, "reward_std": 0.14511574804782867, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8034638166427612, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 192.68359375, "epoch": 0.5394865128371791, "grad_norm": 0.4869483411312103, "kl": 0.03350830078125, "learning_rate": 4.603381014304291e-07, "loss": 0.0013, "reward": 1.7124587297439575, "reward_std": 0.10027382522821426, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7202712595462799, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 182.7421875, "epoch": 0.5401364965875853, "grad_norm": 0.39253363013267517, "kl": 0.03094482421875, "learning_rate": 4.5968790637191156e-07, "loss": 0.0012, "reward": 1.8217841386795044, "reward_std": 0.07163297384977341, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8335029482841492, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 193.19140625, "epoch": 0.5407864803379916, "grad_norm": 0.429255872964859, "kl": 0.035888671875, "learning_rate": 4.5903771131339395e-07, "loss": 0.0014, "reward": 1.8071541786193848, "reward_std": 0.09711743891239166, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8188729882240295, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 179.53125, "epoch": 0.5414364640883977, "grad_norm": 0.3692113757133484, "kl": 0.02825927734375, "learning_rate": 4.583875162548764e-07, "loss": 0.0011, "reward": 1.8221608996391296, "reward_std": 0.06087626516819, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8260670900344849, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 193.45703125, "epoch": 0.542086447838804, "grad_norm": 0.4363584518432617, "kl": 0.03314208984375, "learning_rate": 4.577373211963589e-07, "loss": 0.0013, "reward": 1.7900715470314026, "reward_std": 0.10197808407247066, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8056966066360474, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 181.49609375, "epoch": 0.5427364315892103, "grad_norm": 0.4024101197719574, "kl": 0.041748046875, "learning_rate": 4.5708712613784135e-07, "loss": 0.0017, "reward": 1.84742271900177, "reward_std": 0.05532095581293106, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8513290286064148, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 190.515625, "epoch": 0.5433864153396165, "grad_norm": 0.4010083079338074, "kl": 0.0322265625, "learning_rate": 4.564369310793238e-07, "loss": 0.0013, "reward": 1.8371753692626953, "reward_std": 0.07391345128417015, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8449878990650177, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 173.3359375, "epoch": 0.5440363990900228, "grad_norm": 2.1841881275177, "kl": 0.03955078125, "learning_rate": 4.557867360208062e-07, "loss": 0.0016, "reward": 1.7949926257133484, "reward_std": 0.11295177415013313, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.810617595911026, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 192.30078125, "epoch": 0.544686382840429, "grad_norm": 0.44377896189689636, "kl": 0.0311279296875, "learning_rate": 4.5513654096228865e-07, "loss": 0.0012, "reward": 1.7648036479949951, "reward_std": 0.10896360874176025, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7765223383903503, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 195.41015625, "epoch": 0.5453363665908352, "grad_norm": 0.4271000325679779, "kl": 0.0433349609375, "learning_rate": 4.544863459037711e-07, "loss": 0.0017, "reward": 1.727506697177887, "reward_std": 0.1365378573536873, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7509441375732422, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 184.6484375, "epoch": 0.5459863503412414, "grad_norm": 0.4898456931114197, "kl": 0.0313720703125, "learning_rate": 4.5383615084525355e-07, "loss": 0.0013, "reward": 1.783243179321289, "reward_std": 0.0767790675163269, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7832431495189667, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 200.94140625, "epoch": 0.5466363340916477, "grad_norm": 0.4795975387096405, "kl": 0.03466796875, "learning_rate": 4.53185955786736e-07, "loss": 0.0014, "reward": 1.7772495746612549, "reward_std": 0.11185487359762192, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7889683544635773, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 176.640625, "epoch": 0.547286317842054, "grad_norm": 0.4855664074420929, "kl": 0.03564453125, "learning_rate": 4.5253576072821844e-07, "loss": 0.0014, "reward": 1.828288972377777, "reward_std": 0.13686008751392365, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8517265021800995, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 192.10546875, "epoch": 0.5479363015924602, "grad_norm": 0.5694993138313293, "kl": 0.0322265625, "learning_rate": 4.518855656697009e-07, "loss": 0.0013, "reward": 1.7910508513450623, "reward_std": 0.10407111048698425, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8027696311473846, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 202.84765625, "epoch": 0.5485862853428665, "grad_norm": 0.4724510908126831, "kl": 0.03802490234375, "learning_rate": 4.5123537061118334e-07, "loss": 0.0015, "reward": 1.7244287729263306, "reward_std": 0.15308506786823273, "rewards/format_reward_gen": 0.953125, "rewards/llm_reward": 0.7713037729263306, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 182.8046875, "epoch": 0.5492362690932727, "grad_norm": 0.5223296284675598, "kl": 0.034912109375, "learning_rate": 4.505851755526658e-07, "loss": 0.0014, "reward": 1.81949383020401, "reward_std": 0.08218470960855484, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8273063600063324, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 186.73828125, "epoch": 0.5498862528436789, "grad_norm": 0.4676055610179901, "kl": 0.03009033203125, "learning_rate": 4.4993498049414824e-07, "loss": 0.0012, "reward": 1.8149497509002686, "reward_std": 0.07576587796211243, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8266685009002686, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 178.63671875, "epoch": 0.5505362365940851, "grad_norm": 0.4382324516773224, "kl": 0.03515625, "learning_rate": 4.4928478543563064e-07, "loss": 0.0014, "reward": 1.779710054397583, "reward_std": 0.1211673691868782, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8031475245952606, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 179.31640625, "epoch": 0.5511862203444914, "grad_norm": 0.42638954520225525, "kl": 0.037353515625, "learning_rate": 4.486345903771131e-07, "loss": 0.0015, "reward": 1.7955639362335205, "reward_std": 0.09264573082327843, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8072826862335205, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 192.2265625, "epoch": 0.5518362040948976, "grad_norm": 4.2206597328186035, "kl": 0.0345458984375, "learning_rate": 4.4798439531859553e-07, "loss": 0.0014, "reward": 1.8077515363693237, "reward_std": 0.06871544942259789, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8194703161716461, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 189.171875, "epoch": 0.5524861878453039, "grad_norm": 0.6449670195579529, "kl": 0.03033447265625, "learning_rate": 4.47334200260078e-07, "loss": 0.0012, "reward": 1.811192512512207, "reward_std": 0.08954620361328125, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8268174231052399, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 185.91796875, "epoch": 0.5531361715957102, "grad_norm": 0.38834258913993835, "kl": 0.0322265625, "learning_rate": 4.466840052015605e-07, "loss": 0.0013, "reward": 1.8248572945594788, "reward_std": 0.1066344678401947, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8522010743618011, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 186.8984375, "epoch": 0.5537861553461163, "grad_norm": 0.38554883003234863, "kl": 0.02984619140625, "learning_rate": 4.460338101430429e-07, "loss": 0.0012, "reward": 1.8218517899513245, "reward_std": 0.07606486976146698, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8296642303466797, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 172.82421875, "epoch": 0.5544361390965226, "grad_norm": 0.43257176876068115, "kl": 0.03082275390625, "learning_rate": 4.4538361508452533e-07, "loss": 0.0012, "reward": 1.7813332080841064, "reward_std": 0.10213170200586319, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7930519878864288, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 190.9921875, "epoch": 0.5550861228469288, "grad_norm": 0.4600614905357361, "kl": 0.034912109375, "learning_rate": 4.447334200260078e-07, "loss": 0.0014, "reward": 1.791800320148468, "reward_std": 0.1022610180079937, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.799612820148468, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 186.390625, "epoch": 0.5557361065973351, "grad_norm": 0.4315139651298523, "kl": 0.03125, "learning_rate": 4.4408322496749023e-07, "loss": 0.0012, "reward": 1.8259111046791077, "reward_std": 0.0737186037003994, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.82981738448143, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 197.1796875, "epoch": 0.5563860903477413, "grad_norm": 0.5213205814361572, "kl": 0.0399169921875, "learning_rate": 4.434330299089727e-07, "loss": 0.0016, "reward": 1.7328187823295593, "reward_std": 0.15763645619153976, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7679750919342041, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 171.1328125, "epoch": 0.5570360740981476, "grad_norm": 0.46491917967796326, "kl": 0.045166015625, "learning_rate": 4.427828348504551e-07, "loss": 0.0018, "reward": 1.7559444904327393, "reward_std": 0.13714652135968208, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7871944606304169, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 190.9375, "epoch": 0.5576860578485537, "grad_norm": 0.42156800627708435, "kl": 0.02642822265625, "learning_rate": 4.421326397919375e-07, "loss": 0.0011, "reward": 1.8403043746948242, "reward_std": 0.09391935355961323, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8481168746948242, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 177.51171875, "epoch": 0.55833604159896, "grad_norm": 0.46104106307029724, "kl": 0.0445556640625, "learning_rate": 4.4148244473342e-07, "loss": 0.0018, "reward": 1.7218815684318542, "reward_std": 0.12193494290113449, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7375065088272095, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 182.17578125, "epoch": 0.5589860253493663, "grad_norm": 0.47813132405281067, "kl": 0.0445556640625, "learning_rate": 4.408322496749025e-07, "loss": 0.0018, "reward": 1.775139331817627, "reward_std": 0.11151367798447609, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.786858081817627, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 180.57421875, "epoch": 0.5596360090997725, "grad_norm": 0.9551346302032471, "kl": 0.0458984375, "learning_rate": 4.401820546163849e-07, "loss": 0.0018, "reward": 1.811128854751587, "reward_std": 0.09213229641318321, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8228476345539093, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 194.609375, "epoch": 0.5602859928501788, "grad_norm": 0.4333401024341583, "kl": 0.0361328125, "learning_rate": 4.395318595578673e-07, "loss": 0.0014, "reward": 1.754298210144043, "reward_std": 0.12631674855947495, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.773829460144043, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 183.40234375, "epoch": 0.560935976600585, "grad_norm": 0.38089901208877563, "kl": 0.03472900390625, "learning_rate": 4.3888166449934977e-07, "loss": 0.0014, "reward": 1.834741473197937, "reward_std": 0.06451849080622196, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8386476635932922, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 179.8828125, "epoch": 0.5615859603509912, "grad_norm": 0.47388413548469543, "kl": 0.02899169921875, "learning_rate": 4.382314694408322e-07, "loss": 0.0012, "reward": 1.82737135887146, "reward_std": 0.053999755531549454, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8312776386737823, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 185.8828125, "epoch": 0.5622359441013974, "grad_norm": 0.5121338367462158, "kl": 0.03204345703125, "learning_rate": 4.3758127438231467e-07, "loss": 0.0013, "reward": 1.763613760471344, "reward_std": 0.07373407110571861, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7675200402736664, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 190.47265625, "epoch": 0.5628859278518037, "grad_norm": 0.6064637899398804, "kl": 0.0380859375, "learning_rate": 4.369310793237971e-07, "loss": 0.0015, "reward": 1.7528907656669617, "reward_std": 0.14194413274526596, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7763282656669617, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 164.7734375, "epoch": 0.56353591160221, "grad_norm": 1.2013131380081177, "kl": 0.0372314453125, "learning_rate": 4.3628088426527957e-07, "loss": 0.0015, "reward": 1.8036458492279053, "reward_std": 0.06618023663759232, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8114583790302277, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 175.98046875, "epoch": 0.5641858953526162, "grad_norm": 0.40129315853118896, "kl": 0.04736328125, "learning_rate": 4.35630689206762e-07, "loss": 0.0019, "reward": 1.7977937459945679, "reward_std": 0.12397796660661697, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8173250555992126, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 185.84375, "epoch": 0.5648358791030225, "grad_norm": 0.4340304434299469, "kl": 0.03564453125, "learning_rate": 4.3498049414824446e-07, "loss": 0.0014, "reward": 1.838840365409851, "reward_std": 0.07925792038440704, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8505590856075287, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 189.94140625, "epoch": 0.5654858628534286, "grad_norm": 0.4502250552177429, "kl": 0.03564453125, "learning_rate": 4.343302990897269e-07, "loss": 0.0014, "reward": 1.7858741879463196, "reward_std": 0.06995772197842598, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7897803783416748, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 179.67578125, "epoch": 0.5661358466038349, "grad_norm": 0.3971560299396515, "kl": 0.033935546875, "learning_rate": 4.3368010403120936e-07, "loss": 0.0014, "reward": 1.8339310884475708, "reward_std": 0.08579548448324203, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8378373980522156, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 190.84765625, "epoch": 0.5667858303542411, "grad_norm": 0.4721647799015045, "kl": 0.05908203125, "learning_rate": 4.330299089726918e-07, "loss": 0.0024, "reward": 1.8466461300849915, "reward_std": 0.07667634263634682, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8622711002826691, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 175.0546875, "epoch": 0.5674358141046474, "grad_norm": 0.4090482294559479, "kl": 0.0396728515625, "learning_rate": 4.323797139141742e-07, "loss": 0.0016, "reward": 1.8359793424606323, "reward_std": 0.1109684556722641, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8516043424606323, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 176.5546875, "epoch": 0.5680857978550536, "grad_norm": 0.48806390166282654, "kl": 0.0345458984375, "learning_rate": 4.3172951885565666e-07, "loss": 0.0014, "reward": 1.7637763023376465, "reward_std": 0.11924371495842934, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7833076417446136, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 168.8125, "epoch": 0.5687357816054599, "grad_norm": 0.4549996852874756, "kl": 0.04248046875, "learning_rate": 4.310793237971391e-07, "loss": 0.0017, "reward": 1.8103148937225342, "reward_std": 0.12603148072957993, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8376586735248566, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 179.6953125, "epoch": 0.5693857653558662, "grad_norm": 0.45246538519859314, "kl": 0.0430908203125, "learning_rate": 4.304291287386216e-07, "loss": 0.0017, "reward": 1.7682083249092102, "reward_std": 0.1353822946548462, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.787739634513855, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 202.13671875, "epoch": 0.5700357491062723, "grad_norm": 0.47730037569999695, "kl": 0.03533935546875, "learning_rate": 4.2977893368010406e-07, "loss": 0.0014, "reward": 1.7477026581764221, "reward_std": 0.1389765404164791, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7711401581764221, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 191.59765625, "epoch": 0.5706857328566786, "grad_norm": 0.4089455008506775, "kl": 0.03369140625, "learning_rate": 4.2912873862158645e-07, "loss": 0.0013, "reward": 1.775547206401825, "reward_std": 0.11551935970783234, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.791172206401825, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 191.84765625, "epoch": 0.5713357166070848, "grad_norm": 0.4124261438846588, "kl": 0.03631591796875, "learning_rate": 4.284785435630689e-07, "loss": 0.0015, "reward": 1.7859327793121338, "reward_std": 0.1133103221654892, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7976514995098114, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 193.84765625, "epoch": 0.5719857003574911, "grad_norm": 0.4187591075897217, "kl": 0.02496337890625, "learning_rate": 4.2782834850455135e-07, "loss": 0.001, "reward": 1.7674381136894226, "reward_std": 0.1246849074959755, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7830631136894226, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 184.11328125, "epoch": 0.5726356841078973, "grad_norm": 0.4711749851703644, "kl": 0.031005859375, "learning_rate": 4.271781534460338e-07, "loss": 0.0012, "reward": 1.7480486631393433, "reward_std": 0.10510442405939102, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7597674131393433, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 188.58203125, "epoch": 0.5732856678583036, "grad_norm": 0.41709083318710327, "kl": 0.02813720703125, "learning_rate": 4.2652795838751625e-07, "loss": 0.0011, "reward": 1.851789653301239, "reward_std": 0.08761879615485668, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8674146234989166, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 187.0546875, "epoch": 0.5739356516087097, "grad_norm": 0.4568042457103729, "kl": 0.02899169921875, "learning_rate": 4.2587776332899865e-07, "loss": 0.0012, "reward": 1.7659105062484741, "reward_std": 0.09306317195296288, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7776293158531189, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 181.48828125, "epoch": 0.574585635359116, "grad_norm": 0.4249840974807739, "kl": 0.03240966796875, "learning_rate": 4.252275682704811e-07, "loss": 0.0013, "reward": 1.7589719891548157, "reward_std": 0.10408058762550354, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7745969891548157, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 194.359375, "epoch": 0.5752356191095223, "grad_norm": 0.4282366931438446, "kl": 0.031005859375, "learning_rate": 4.245773732119636e-07, "loss": 0.0012, "reward": 1.796618103981018, "reward_std": 0.11082316190004349, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8044306337833405, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 170.26953125, "epoch": 0.5758856028599285, "grad_norm": 0.7288938760757446, "kl": 0.06097412109375, "learning_rate": 4.2392717815344605e-07, "loss": 0.0024, "reward": 1.7820309400558472, "reward_std": 0.11807475239038467, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7937496900558472, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 187.765625, "epoch": 0.5765355866103348, "grad_norm": 0.4452626407146454, "kl": 0.027099609375, "learning_rate": 4.232769830949285e-07, "loss": 0.0011, "reward": 1.76884263753891, "reward_std": 0.1105339489877224, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7805613577365875, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 189.30859375, "epoch": 0.577185570360741, "grad_norm": 0.5248490571975708, "kl": 0.036376953125, "learning_rate": 4.226267880364109e-07, "loss": 0.0015, "reward": 1.7524800896644592, "reward_std": 0.14196935296058655, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7641988396644592, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 178.42578125, "epoch": 0.5778355541111472, "grad_norm": 0.5136523246765137, "kl": 0.0460205078125, "learning_rate": 4.2197659297789334e-07, "loss": 0.0018, "reward": 1.8006196022033691, "reward_std": 0.12253502383828163, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8123383522033691, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 173.09375, "epoch": 0.5784855378615534, "grad_norm": 0.4148343503475189, "kl": 0.032958984375, "learning_rate": 4.213263979193758e-07, "loss": 0.0013, "reward": 1.794230580329895, "reward_std": 0.16974620521068573, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8215743899345398, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 193.9609375, "epoch": 0.5791355216119597, "grad_norm": 0.4655911922454834, "kl": 0.029541015625, "learning_rate": 4.2067620286085824e-07, "loss": 0.0012, "reward": 1.7384775280952454, "reward_std": 0.12177430093288422, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7501962780952454, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 177.82421875, "epoch": 0.579785505362366, "grad_norm": 0.43923649191856384, "kl": 0.02984619140625, "learning_rate": 4.200260078023407e-07, "loss": 0.0012, "reward": 1.7758422493934631, "reward_std": 0.11727463081479073, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7875609397888184, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 180.12890625, "epoch": 0.5804354891127722, "grad_norm": 0.4242077171802521, "kl": 0.03021240234375, "learning_rate": 4.193758127438231e-07, "loss": 0.0012, "reward": 1.8016091585159302, "reward_std": 0.11080197244882584, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8133278787136078, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 182.72265625, "epoch": 0.5810854728631785, "grad_norm": 0.4464413821697235, "kl": 0.03076171875, "learning_rate": 4.187256176853056e-07, "loss": 0.0012, "reward": 1.7770317792892456, "reward_std": 0.11774063110351562, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7887504994869232, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 184.28515625, "epoch": 0.5817354566135846, "grad_norm": 0.6972528696060181, "kl": 0.034912109375, "learning_rate": 4.1807542262678803e-07, "loss": 0.0014, "reward": 1.7305086255073547, "reward_std": 0.1538247913122177, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7539461851119995, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 176.48828125, "epoch": 0.5823854403639909, "grad_norm": 0.4429013431072235, "kl": 0.031982421875, "learning_rate": 4.174252275682705e-07, "loss": 0.0013, "reward": 1.7585737705230713, "reward_std": 0.09298504143953323, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7624800503253937, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 191.12890625, "epoch": 0.5830354241143971, "grad_norm": 0.4457312226295471, "kl": 0.03057861328125, "learning_rate": 4.1677503250975293e-07, "loss": 0.0012, "reward": 1.7904365062713623, "reward_std": 0.12676216661930084, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8099677860736847, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 170.91015625, "epoch": 0.5836854078648034, "grad_norm": 0.4686286747455597, "kl": 0.0328369140625, "learning_rate": 4.1612483745123533e-07, "loss": 0.0013, "reward": 1.7588238716125488, "reward_std": 0.13549210876226425, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7861676216125488, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 191.01953125, "epoch": 0.5843353916152096, "grad_norm": 0.5482606887817383, "kl": 0.039306640625, "learning_rate": 4.154746423927178e-07, "loss": 0.0016, "reward": 1.752385139465332, "reward_std": 0.1194327287375927, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7719163596630096, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 185.35546875, "epoch": 0.5849853753656159, "grad_norm": 0.4420669376850128, "kl": 0.03228759765625, "learning_rate": 4.1482444733420023e-07, "loss": 0.0013, "reward": 1.7915504574775696, "reward_std": 0.11807362735271454, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8110816776752472, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 187.26953125, "epoch": 0.585635359116022, "grad_norm": 0.4136887788772583, "kl": 0.0311279296875, "learning_rate": 4.141742522756827e-07, "loss": 0.0012, "reward": 1.7899082899093628, "reward_std": 0.10139216855168343, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8055332899093628, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 171.57421875, "epoch": 0.5862853428664283, "grad_norm": 0.48187994956970215, "kl": 0.0382080078125, "learning_rate": 4.135240572171652e-07, "loss": 0.0015, "reward": 1.793617606163025, "reward_std": 0.10058778896927834, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8092425465583801, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 170.6640625, "epoch": 0.5869353266168346, "grad_norm": 0.540339469909668, "kl": 0.0435791015625, "learning_rate": 4.128738621586476e-07, "loss": 0.0017, "reward": 1.8450756072998047, "reward_std": 0.09401385486125946, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8567943572998047, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 188.66015625, "epoch": 0.5875853103672408, "grad_norm": 0.5105404853820801, "kl": 0.037353515625, "learning_rate": 4.1222366710013e-07, "loss": 0.0015, "reward": 1.7139981985092163, "reward_std": 0.15088944509625435, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7413419783115387, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 169.47265625, "epoch": 0.5882352941176471, "grad_norm": 0.45396456122398376, "kl": 0.03338623046875, "learning_rate": 4.1157347204161247e-07, "loss": 0.0013, "reward": 1.7807791829109192, "reward_std": 0.0823165811598301, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7885917127132416, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 167.3515625, "epoch": 0.5888852778680533, "grad_norm": 0.43500909209251404, "kl": 0.03466796875, "learning_rate": 4.109232769830949e-07, "loss": 0.0014, "reward": 1.8636568188667297, "reward_std": 0.08641054853796959, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.875375509262085, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 167.5625, "epoch": 0.5895352616184596, "grad_norm": 0.4378800392150879, "kl": 0.0328369140625, "learning_rate": 4.1027308192457737e-07, "loss": 0.0013, "reward": 1.801814317703247, "reward_std": 0.14229630678892136, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8252518475055695, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 169.734375, "epoch": 0.5901852453688657, "grad_norm": 0.6409247517585754, "kl": 0.03057861328125, "learning_rate": 4.0962288686605977e-07, "loss": 0.0012, "reward": 1.7805213928222656, "reward_std": 0.11342296004295349, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8000526130199432, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 180.703125, "epoch": 0.590835229119272, "grad_norm": 0.4364732801914215, "kl": 0.0389404296875, "learning_rate": 4.089726918075422e-07, "loss": 0.0016, "reward": 1.75666081905365, "reward_std": 0.14066259935498238, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7800983488559723, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 188.5, "epoch": 0.5914852128696783, "grad_norm": 0.5367032289505005, "kl": 0.0389404296875, "learning_rate": 4.083224967490247e-07, "loss": 0.0016, "reward": 1.7396982312202454, "reward_std": 0.16282782703638077, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.778760701417923, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 178.37890625, "epoch": 0.5921351966200845, "grad_norm": 0.4033646583557129, "kl": 0.0341796875, "learning_rate": 4.0767230169050717e-07, "loss": 0.0014, "reward": 1.79085773229599, "reward_std": 0.1367226280272007, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8142952024936676, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 189.01953125, "epoch": 0.5927851803704908, "grad_norm": 0.4548186957836151, "kl": 0.039794921875, "learning_rate": 4.070221066319896e-07, "loss": 0.0016, "reward": 1.7599846124649048, "reward_std": 0.14777033776044846, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7834221124649048, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 177.3203125, "epoch": 0.593435164120897, "grad_norm": 0.43277591466903687, "kl": 0.035400390625, "learning_rate": 4.06371911573472e-07, "loss": 0.0014, "reward": 1.7997661232948303, "reward_std": 0.13001765683293343, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8232036232948303, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 170.93359375, "epoch": 0.5940851478713032, "grad_norm": 0.4435647428035736, "kl": 0.036376953125, "learning_rate": 4.0572171651495446e-07, "loss": 0.0015, "reward": 1.8128976225852966, "reward_std": 0.09852533787488937, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.824616402387619, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 197.8203125, "epoch": 0.5947351316217094, "grad_norm": 0.41216886043548584, "kl": 0.03863525390625, "learning_rate": 4.050715214564369e-07, "loss": 0.0015, "reward": 1.7613242864608765, "reward_std": 0.11346687003970146, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7925743758678436, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 170.3828125, "epoch": 0.5953851153721157, "grad_norm": 0.47117942571640015, "kl": 0.033447265625, "learning_rate": 4.0442132639791936e-07, "loss": 0.0013, "reward": 1.8090383410453796, "reward_std": 0.12907607853412628, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8246634006500244, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 176.171875, "epoch": 0.596035099122522, "grad_norm": 0.4190269112586975, "kl": 0.02978515625, "learning_rate": 4.037711313394018e-07, "loss": 0.0012, "reward": 1.8124569654464722, "reward_std": 0.08783544413745403, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8241757154464722, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 184.8515625, "epoch": 0.5966850828729282, "grad_norm": 0.5700403451919556, "kl": 0.03302001953125, "learning_rate": 4.031209362808842e-07, "loss": 0.0013, "reward": 1.7785966396331787, "reward_std": 0.1167614534497261, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7942216098308563, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 176.82421875, "epoch": 0.5973350666233345, "grad_norm": 0.4825807809829712, "kl": 0.031982421875, "learning_rate": 4.024707412223667e-07, "loss": 0.0013, "reward": 1.8166956901550293, "reward_std": 0.09374012425541878, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8362269699573517, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 172.1328125, "epoch": 0.5979850503737406, "grad_norm": 0.46398618817329407, "kl": 0.031494140625, "learning_rate": 4.0182054616384916e-07, "loss": 0.0013, "reward": 1.8478155732154846, "reward_std": 0.06395713984966278, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8556280732154846, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 191.671875, "epoch": 0.5986350341241469, "grad_norm": 0.4515369236469269, "kl": 0.04248046875, "learning_rate": 4.011703511053316e-07, "loss": 0.0017, "reward": 1.7189452052116394, "reward_std": 0.13486916571855545, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7462888956069946, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 188.2421875, "epoch": 0.5992850178745531, "grad_norm": 0.35537323355674744, "kl": 0.035888671875, "learning_rate": 4.0052015604681405e-07, "loss": 0.0014, "reward": 1.8096063733100891, "reward_std": 0.07537727616727352, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8213251233100891, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 171.69921875, "epoch": 0.5999350016249594, "grad_norm": 0.35787150263786316, "kl": 0.0328369140625, "learning_rate": 3.9986996098829645e-07, "loss": 0.0013, "reward": 1.8507437109947205, "reward_std": 0.09333394467830658, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8663687109947205, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 154.7109375, "epoch": 0.6005849853753656, "grad_norm": 0.6401011347770691, "kl": 0.0498046875, "learning_rate": 3.992197659297789e-07, "loss": 0.002, "reward": 1.7737172842025757, "reward_std": 0.14429821074008942, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8088734745979309, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 181.59375, "epoch": 0.6012349691257719, "grad_norm": 0.6539933085441589, "kl": 0.0465087890625, "learning_rate": 3.9856957087126135e-07, "loss": 0.0019, "reward": 1.7682682275772095, "reward_std": 0.14048561453819275, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7917057275772095, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 175.79296875, "epoch": 0.601884952876178, "grad_norm": 0.41471171379089355, "kl": 0.0355224609375, "learning_rate": 3.979193758127438e-07, "loss": 0.0014, "reward": 1.784180223941803, "reward_std": 0.11008936539292336, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7958990037441254, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 196.71484375, "epoch": 0.6025349366265843, "grad_norm": 0.44379565119743347, "kl": 0.0372314453125, "learning_rate": 3.972691807542263e-07, "loss": 0.0015, "reward": 1.7195895314216614, "reward_std": 0.12268917262554169, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.750839501619339, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 179.2734375, "epoch": 0.6031849203769906, "grad_norm": 0.4344809651374817, "kl": 0.0391845703125, "learning_rate": 3.966189856957087e-07, "loss": 0.0016, "reward": 1.7364763617515564, "reward_std": 0.09356924518942833, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7521012723445892, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 200.4375, "epoch": 0.6038349041273968, "grad_norm": 0.35911014676094055, "kl": 0.034912109375, "learning_rate": 3.9596879063719115e-07, "loss": 0.0014, "reward": 1.774923324584961, "reward_std": 0.12990208342671394, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8022671341896057, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 181.42578125, "epoch": 0.6044848878778031, "grad_norm": 0.44087809324264526, "kl": 0.0428466796875, "learning_rate": 3.953185955786736e-07, "loss": 0.0017, "reward": 1.7639787793159485, "reward_std": 0.11874078586697578, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7835099995136261, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 173.453125, "epoch": 0.6051348716282093, "grad_norm": 0.43094298243522644, "kl": 0.036376953125, "learning_rate": 3.9466840052015604e-07, "loss": 0.0015, "reward": 1.7582061290740967, "reward_std": 0.1264205165207386, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7777373790740967, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 208.8125, "epoch": 0.6057848553786155, "grad_norm": 0.421852707862854, "kl": 0.035888671875, "learning_rate": 3.940182054616385e-07, "loss": 0.0014, "reward": 1.7796244621276855, "reward_std": 0.07239995896816254, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7874370217323303, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 203.2890625, "epoch": 0.6064348391290217, "grad_norm": 0.4334278106689453, "kl": 0.03662109375, "learning_rate": 3.933680104031209e-07, "loss": 0.0015, "reward": 1.751363754272461, "reward_std": 0.08945203572511673, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7708950638771057, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 179.328125, "epoch": 0.607084822879428, "grad_norm": 0.4070710837841034, "kl": 0.029541015625, "learning_rate": 3.9271781534460334e-07, "loss": 0.0012, "reward": 1.8125702142715454, "reward_std": 0.11042599380016327, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8321014046669006, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 190.390625, "epoch": 0.6077348066298343, "grad_norm": 0.4124375581741333, "kl": 0.028076171875, "learning_rate": 3.920676202860858e-07, "loss": 0.0011, "reward": 1.823108971118927, "reward_std": 0.06710207089781761, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8270152509212494, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 185.86328125, "epoch": 0.6083847903802405, "grad_norm": 0.44736114144325256, "kl": 0.03106689453125, "learning_rate": 3.914174252275683e-07, "loss": 0.0012, "reward": 1.7654988765716553, "reward_std": 0.1148376315832138, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7850300967693329, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 191.8828125, "epoch": 0.6090347741306468, "grad_norm": 0.38367870450019836, "kl": 0.038330078125, "learning_rate": 3.9076723016905074e-07, "loss": 0.0015, "reward": 1.7773592472076416, "reward_std": 0.09714360907673836, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7929843068122864, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 176.25390625, "epoch": 0.6096847578810529, "grad_norm": 0.5306936502456665, "kl": 0.03076171875, "learning_rate": 3.9011703511053313e-07, "loss": 0.0012, "reward": 1.8117938041687012, "reward_std": 0.0874580442905426, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8274188041687012, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 184.05859375, "epoch": 0.6103347416314592, "grad_norm": 0.6760292053222656, "kl": 0.025390625, "learning_rate": 3.894668400520156e-07, "loss": 0.001, "reward": 1.7873624563217163, "reward_std": 0.07379531301558018, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7990812063217163, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 182.171875, "epoch": 0.6109847253818654, "grad_norm": 0.39923617243766785, "kl": 0.0418701171875, "learning_rate": 3.8881664499349803e-07, "loss": 0.0017, "reward": 1.7933474779129028, "reward_std": 0.1456693448126316, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8285037279129028, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 178.9375, "epoch": 0.6116347091322717, "grad_norm": 0.460961252450943, "kl": 0.03179931640625, "learning_rate": 3.881664499349805e-07, "loss": 0.0013, "reward": 1.791818082332611, "reward_std": 0.1190006285905838, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8191618323326111, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 176.140625, "epoch": 0.612284692882678, "grad_norm": 0.4425315260887146, "kl": 0.0357666015625, "learning_rate": 3.8751625487646293e-07, "loss": 0.0014, "reward": 1.7788153886795044, "reward_std": 0.10737211257219315, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7905341684818268, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 214.33984375, "epoch": 0.6129346766330842, "grad_norm": 0.4445823132991791, "kl": 0.04150390625, "learning_rate": 3.8686605981794533e-07, "loss": 0.0017, "reward": 1.7573366165161133, "reward_std": 0.11357072368264198, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7807741165161133, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 212.45703125, "epoch": 0.6135846603834905, "grad_norm": 0.39206022024154663, "kl": 0.02587890625, "learning_rate": 3.862158647594278e-07, "loss": 0.001, "reward": 1.7809513807296753, "reward_std": 0.07599299401044846, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7848576307296753, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 182.171875, "epoch": 0.6142346441338966, "grad_norm": 0.404849112033844, "kl": 0.0357666015625, "learning_rate": 3.855656697009103e-07, "loss": 0.0014, "reward": 1.8216770887374878, "reward_std": 0.1324584186077118, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.845114529132843, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 181.61328125, "epoch": 0.6148846278843029, "grad_norm": 0.3858604431152344, "kl": 0.03424072265625, "learning_rate": 3.8491547464239273e-07, "loss": 0.0014, "reward": 1.825411319732666, "reward_std": 0.06265377346426249, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8371300399303436, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 204.21875, "epoch": 0.6155346116347091, "grad_norm": 0.4178066551685333, "kl": 0.0328369140625, "learning_rate": 3.842652795838752e-07, "loss": 0.0013, "reward": 1.7993798851966858, "reward_std": 0.11451025307178497, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8189111351966858, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 181.26171875, "epoch": 0.6161845953851154, "grad_norm": 0.3773108124732971, "kl": 0.029541015625, "learning_rate": 3.8361508452535757e-07, "loss": 0.0012, "reward": 1.8375107645988464, "reward_std": 0.07095373049378395, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8453232049942017, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 207.58984375, "epoch": 0.6168345791355216, "grad_norm": 0.4516773819923401, "kl": 0.0338134765625, "learning_rate": 3.8296488946684e-07, "loss": 0.0014, "reward": 1.7409477829933167, "reward_std": 0.11566438525915146, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.772197812795639, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 185.55078125, "epoch": 0.6174845628859279, "grad_norm": 0.45261332392692566, "kl": 0.038818359375, "learning_rate": 3.8231469440832247e-07, "loss": 0.0016, "reward": 1.7533063888549805, "reward_std": 0.1811508983373642, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.7962751388549805, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 187.40234375, "epoch": 0.618134546636334, "grad_norm": 0.4276083707809448, "kl": 0.0286865234375, "learning_rate": 3.816644993498049e-07, "loss": 0.0011, "reward": 1.8512737154960632, "reward_std": 0.06765822321176529, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8512737154960632, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 200.50390625, "epoch": 0.6187845303867403, "grad_norm": 0.41468504071235657, "kl": 0.03564453125, "learning_rate": 3.8101430429128737e-07, "loss": 0.0014, "reward": 1.770843505859375, "reward_std": 0.10192658007144928, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7825621962547302, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 191.5546875, "epoch": 0.6194345141371466, "grad_norm": 0.39450734853744507, "kl": 0.0380859375, "learning_rate": 3.803641092327698e-07, "loss": 0.0015, "reward": 1.8114401698112488, "reward_std": 0.07046904787421227, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8231589198112488, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 187.625, "epoch": 0.6200844978875528, "grad_norm": 0.4152645170688629, "kl": 0.0313720703125, "learning_rate": 3.7971391417425227e-07, "loss": 0.0013, "reward": 1.8183239102363586, "reward_std": 0.09353368356823921, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8339489102363586, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 180.40234375, "epoch": 0.6207344816379591, "grad_norm": 0.40878191590309143, "kl": 0.0330810546875, "learning_rate": 3.790637191157347e-07, "loss": 0.0013, "reward": 1.8109038472175598, "reward_std": 0.1368171125650406, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8382475674152374, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 181.89453125, "epoch": 0.6213844653883653, "grad_norm": 0.4257148802280426, "kl": 0.037841796875, "learning_rate": 3.7841352405721716e-07, "loss": 0.0015, "reward": 1.8108998537063599, "reward_std": 0.12346826866269112, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8343373239040375, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 163.09375, "epoch": 0.6220344491387715, "grad_norm": 0.3709239661693573, "kl": 0.030517578125, "learning_rate": 3.777633289986996e-07, "loss": 0.0012, "reward": 1.8581033945083618, "reward_std": 0.09396366775035858, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8776345551013947, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 173.59765625, "epoch": 0.6226844328891777, "grad_norm": 0.4490263760089874, "kl": 0.033935546875, "learning_rate": 3.77113133940182e-07, "loss": 0.0014, "reward": 1.8377001881599426, "reward_std": 0.10777600854635239, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8572314381599426, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 187.8828125, "epoch": 0.623334416639584, "grad_norm": 0.3840901851654053, "kl": 0.0364990234375, "learning_rate": 3.7646293888166446e-07, "loss": 0.0015, "reward": 1.820638358592987, "reward_std": 0.12438156455755234, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8440757989883423, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 178.703125, "epoch": 0.6239844003899903, "grad_norm": 0.5965369939804077, "kl": 0.041015625, "learning_rate": 3.758127438231469e-07, "loss": 0.0016, "reward": 1.7611795663833618, "reward_std": 0.16850006580352783, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.8002421259880066, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 181.93359375, "epoch": 0.6246343841403965, "grad_norm": 0.42373570799827576, "kl": 0.029541015625, "learning_rate": 3.7516254876462936e-07, "loss": 0.0012, "reward": 1.7846553921699524, "reward_std": 0.09458793699741364, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8002803921699524, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 197.46484375, "epoch": 0.6252843678908028, "grad_norm": 0.4007236063480377, "kl": 0.0341796875, "learning_rate": 3.7451235370611186e-07, "loss": 0.0014, "reward": 1.7912622094154358, "reward_std": 0.082552969455719, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8068872690200806, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 193.46875, "epoch": 0.6259343516412089, "grad_norm": 0.4529918432235718, "kl": 0.0355224609375, "learning_rate": 3.7386215864759426e-07, "loss": 0.0014, "reward": 1.7545545101165771, "reward_std": 0.11484400928020477, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7779919803142548, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 205.0703125, "epoch": 0.6265843353916152, "grad_norm": 0.36969834566116333, "kl": 0.02996826171875, "learning_rate": 3.732119635890767e-07, "loss": 0.0012, "reward": 1.797232449054718, "reward_std": 0.09008255787193775, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.808951199054718, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 189.5625, "epoch": 0.6272343191420214, "grad_norm": 2.458524703979492, "kl": 0.03271484375, "learning_rate": 3.7256176853055915e-07, "loss": 0.0013, "reward": 1.7897863388061523, "reward_std": 0.0833202600479126, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8015050888061523, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 204.328125, "epoch": 0.6278843028924277, "grad_norm": 0.45382505655288696, "kl": 0.0421142578125, "learning_rate": 3.719115734720416e-07, "loss": 0.0017, "reward": 1.7671091556549072, "reward_std": 0.13125651329755783, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.794452965259552, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 180.50390625, "epoch": 0.628534286642834, "grad_norm": 0.4203580617904663, "kl": 0.0313720703125, "learning_rate": 3.7126137841352405e-07, "loss": 0.0013, "reward": 1.8209667205810547, "reward_std": 0.09239033795893192, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8326854705810547, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 197.12890625, "epoch": 0.6291842703932402, "grad_norm": 0.45782536268234253, "kl": 0.0377197265625, "learning_rate": 3.7061118335500645e-07, "loss": 0.0015, "reward": 1.7156803607940674, "reward_std": 0.15713908523321152, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7469304203987122, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 190.921875, "epoch": 0.6298342541436464, "grad_norm": 0.4245551824569702, "kl": 0.0350341796875, "learning_rate": 3.699609882964889e-07, "loss": 0.0014, "reward": 1.7110081911087036, "reward_std": 0.10746461898088455, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7344456911087036, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 194.35546875, "epoch": 0.6304842378940526, "grad_norm": 0.4787604808807373, "kl": 0.038330078125, "learning_rate": 3.693107932379714e-07, "loss": 0.0015, "reward": 1.7987757325172424, "reward_std": 0.15525683760643005, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8222132325172424, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 188.4921875, "epoch": 0.6311342216444589, "grad_norm": 0.4703696072101593, "kl": 0.03167724609375, "learning_rate": 3.6866059817945385e-07, "loss": 0.0013, "reward": 1.7790573239326477, "reward_std": 0.10818932577967644, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7946823239326477, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 199.90234375, "epoch": 0.6317842053948651, "grad_norm": 0.44467198848724365, "kl": 0.03167724609375, "learning_rate": 3.680104031209363e-07, "loss": 0.0013, "reward": 1.8114886283874512, "reward_std": 0.1344350352883339, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8388323485851288, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 211.28515625, "epoch": 0.6324341891452714, "grad_norm": 0.6068856716156006, "kl": 0.0421142578125, "learning_rate": 3.673602080624187e-07, "loss": 0.0017, "reward": 1.7464879155158997, "reward_std": 0.11824396252632141, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7621129155158997, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 176.97265625, "epoch": 0.6330841728956776, "grad_norm": 0.4586310386657715, "kl": 0.0357666015625, "learning_rate": 3.6671001300390114e-07, "loss": 0.0014, "reward": 1.786487340927124, "reward_std": 0.13887165114283562, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8060186207294464, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 187.83984375, "epoch": 0.6337341566460839, "grad_norm": 0.42115384340286255, "kl": 0.0272216796875, "learning_rate": 3.660598179453836e-07, "loss": 0.0011, "reward": 1.8308107256889343, "reward_std": 0.08547118119895458, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8425295352935791, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 190.3359375, "epoch": 0.63438414039649, "grad_norm": 2.067542552947998, "kl": 0.02862548828125, "learning_rate": 3.6540962288686604e-07, "loss": 0.0011, "reward": 1.8346669673919678, "reward_std": 0.06908021494746208, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8424794673919678, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 181.48046875, "epoch": 0.6350341241468963, "grad_norm": 0.45364001393318176, "kl": 0.0335693359375, "learning_rate": 3.647594278283485e-07, "loss": 0.0013, "reward": 1.8306390643119812, "reward_std": 0.09354406595230103, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8384515643119812, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 181.58984375, "epoch": 0.6356841078973026, "grad_norm": 0.3997001647949219, "kl": 0.03167724609375, "learning_rate": 3.641092327698309e-07, "loss": 0.0013, "reward": 1.7995595932006836, "reward_std": 0.08808603137731552, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8034657835960388, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 196.21875, "epoch": 0.6363340916477088, "grad_norm": 0.5039976239204407, "kl": 0.025390625, "learning_rate": 3.634590377113134e-07, "loss": 0.001, "reward": 1.7934063076972961, "reward_std": 0.06836653873324394, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8051250576972961, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 202.78125, "epoch": 0.6369840753981151, "grad_norm": 0.36101457476615906, "kl": 0.0306396484375, "learning_rate": 3.6280884265279584e-07, "loss": 0.0012, "reward": 1.7964158058166504, "reward_std": 0.09669894352555275, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8159470856189728, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 185.5234375, "epoch": 0.6376340591485213, "grad_norm": 0.5528690218925476, "kl": 0.0357666015625, "learning_rate": 3.621586475942783e-07, "loss": 0.0014, "reward": 1.8273435831069946, "reward_std": 0.060826197266578674, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8390623033046722, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 173.7890625, "epoch": 0.6382840428989275, "grad_norm": 0.372527152299881, "kl": 0.0377197265625, "learning_rate": 3.6150845253576074e-07, "loss": 0.0015, "reward": 1.8413505554199219, "reward_std": 0.06341157667338848, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8413506150245667, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 183.87890625, "epoch": 0.6389340266493337, "grad_norm": 0.4449405074119568, "kl": 0.03515625, "learning_rate": 3.6085825747724313e-07, "loss": 0.0014, "reward": 1.7979797720909119, "reward_std": 0.10093452781438828, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8136047720909119, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 185.60546875, "epoch": 0.63958401039974, "grad_norm": 0.3810074031352997, "kl": 0.0323486328125, "learning_rate": 3.602080624187256e-07, "loss": 0.0013, "reward": 1.8465237021446228, "reward_std": 0.07409032434225082, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8504299819469452, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 199.4609375, "epoch": 0.6402339941501463, "grad_norm": 0.4513474702835083, "kl": 0.0345458984375, "learning_rate": 3.5955786736020803e-07, "loss": 0.0014, "reward": 1.7757773995399475, "reward_std": 0.10192598029971123, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7874961793422699, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 193.03125, "epoch": 0.6408839779005525, "grad_norm": 0.4179965853691101, "kl": 0.0340576171875, "learning_rate": 3.589076723016905e-07, "loss": 0.0014, "reward": 1.7601438164710999, "reward_std": 0.09072377532720566, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7796750366687775, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 196.9765625, "epoch": 0.6415339616509588, "grad_norm": 0.4120063781738281, "kl": 0.0364990234375, "learning_rate": 3.58257477243173e-07, "loss": 0.0015, "reward": 1.7684841752052307, "reward_std": 0.09888343885540962, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7841092348098755, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 194.08203125, "epoch": 0.6421839454013649, "grad_norm": 0.3370498716831207, "kl": 0.0279541015625, "learning_rate": 3.576072821846554e-07, "loss": 0.0011, "reward": 1.8374082446098328, "reward_std": 0.07257914543151855, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8491269946098328, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 213.66796875, "epoch": 0.6428339291517712, "grad_norm": 0.44161760807037354, "kl": 0.033203125, "learning_rate": 3.569570871261378e-07, "loss": 0.0013, "reward": 1.763028860092163, "reward_std": 0.08604997023940086, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7708413600921631, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 196.453125, "epoch": 0.6434839129021774, "grad_norm": 0.5230216383934021, "kl": 0.0396728515625, "learning_rate": 3.563068920676203e-07, "loss": 0.0016, "reward": 1.764816701412201, "reward_std": 0.11438180133700371, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7882542014122009, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 190.671875, "epoch": 0.6441338966525837, "grad_norm": 0.4449288845062256, "kl": 0.0377197265625, "learning_rate": 3.556566970091027e-07, "loss": 0.0015, "reward": 1.7587233781814575, "reward_std": 0.14181923121213913, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7860670983791351, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 197.98046875, "epoch": 0.64478388040299, "grad_norm": 0.49198055267333984, "kl": 0.0374755859375, "learning_rate": 3.550065019505852e-07, "loss": 0.0015, "reward": 1.8039186596870422, "reward_std": 0.14165644347667694, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8351686894893646, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 184.80859375, "epoch": 0.6454338641533962, "grad_norm": 0.34699365496635437, "kl": 0.0267333984375, "learning_rate": 3.5435630689206757e-07, "loss": 0.0011, "reward": 1.8429383039474487, "reward_std": 0.03507668245583773, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8429383337497711, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 186.0703125, "epoch": 0.6460838479038024, "grad_norm": 0.4776883125305176, "kl": 0.04241943359375, "learning_rate": 3.5370611183355e-07, "loss": 0.0017, "reward": 1.8128418326377869, "reward_std": 0.11139575019478798, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8245605230331421, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 193.21875, "epoch": 0.6467338316542086, "grad_norm": 0.4199292063713074, "kl": 0.028076171875, "learning_rate": 3.5305591677503247e-07, "loss": 0.0011, "reward": 1.8201545476913452, "reward_std": 0.10774442553520203, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8357795178890228, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 204.66796875, "epoch": 0.6473838154046149, "grad_norm": 0.4261852502822876, "kl": 0.0286865234375, "learning_rate": 3.5240572171651497e-07, "loss": 0.0011, "reward": 1.7712887525558472, "reward_std": 0.08671223744750023, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7908200621604919, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 196.0703125, "epoch": 0.6480337991550211, "grad_norm": 0.44369497895240784, "kl": 0.02789306640625, "learning_rate": 3.517555266579974e-07, "loss": 0.0011, "reward": 1.797546625137329, "reward_std": 0.08411657065153122, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8131715953350067, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 189.84375, "epoch": 0.6486837829054274, "grad_norm": 0.5071277022361755, "kl": 0.04449462890625, "learning_rate": 3.511053315994798e-07, "loss": 0.0018, "reward": 1.8354552388191223, "reward_std": 0.06284216046333313, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8432677686214447, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 202.38671875, "epoch": 0.6493337666558336, "grad_norm": 0.44896814227104187, "kl": 0.03228759765625, "learning_rate": 3.5045513654096226e-07, "loss": 0.0013, "reward": 1.811030924320221, "reward_std": 0.09149676561355591, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8266558945178986, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 175.6875, "epoch": 0.6499837504062398, "grad_norm": 0.3674420714378357, "kl": 0.0360107421875, "learning_rate": 3.498049414824447e-07, "loss": 0.0014, "reward": 1.8002638816833496, "reward_std": 0.0735167209059, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8080763816833496, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 187.703125, "epoch": 0.650633734156646, "grad_norm": 0.39421898126602173, "kl": 0.02484130859375, "learning_rate": 3.4915474642392716e-07, "loss": 0.001, "reward": 1.8534637689590454, "reward_std": 0.07495693117380142, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8651825487613678, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 202.265625, "epoch": 0.6512837179070523, "grad_norm": 0.442737340927124, "kl": 0.02557373046875, "learning_rate": 3.485045513654096e-07, "loss": 0.001, "reward": 1.8227238655090332, "reward_std": 0.055130401626229286, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8266300559043884, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 197.18359375, "epoch": 0.6519337016574586, "grad_norm": 0.5658230781555176, "kl": 0.02435302734375, "learning_rate": 3.47854356306892e-07, "loss": 0.001, "reward": 1.7874836921691895, "reward_std": 0.08195417374372482, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7992024123668671, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 189.5625, "epoch": 0.6525836854078648, "grad_norm": 0.4926718473434448, "kl": 0.0384521484375, "learning_rate": 3.4720416124837446e-07, "loss": 0.0015, "reward": 1.7660005688667297, "reward_std": 0.08173608407378197, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7816255986690521, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 194.390625, "epoch": 0.6532336691582711, "grad_norm": 0.40708252787590027, "kl": 0.036865234375, "learning_rate": 3.4655396618985696e-07, "loss": 0.0015, "reward": 1.7800211310386658, "reward_std": 0.11480817571282387, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7995523810386658, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 184.42578125, "epoch": 0.6538836529086773, "grad_norm": 0.413188636302948, "kl": 0.03131103515625, "learning_rate": 3.459037711313394e-07, "loss": 0.0013, "reward": 1.7847500443458557, "reward_std": 0.15534866601228714, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.8238125443458557, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 192.04296875, "epoch": 0.6545336366590835, "grad_norm": 0.4217368960380554, "kl": 0.0286865234375, "learning_rate": 3.4525357607282186e-07, "loss": 0.0011, "reward": 1.790935218334198, "reward_std": 0.12079916149377823, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8143726587295532, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 169.4375, "epoch": 0.6551836204094897, "grad_norm": 0.37016919255256653, "kl": 0.0264892578125, "learning_rate": 3.4460338101430425e-07, "loss": 0.0011, "reward": 1.8288608193397522, "reward_std": 0.11867260187864304, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8483920097351074, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 199.52734375, "epoch": 0.655833604159896, "grad_norm": 0.45895934104919434, "kl": 0.02777099609375, "learning_rate": 3.439531859557867e-07, "loss": 0.0011, "reward": 1.8102195858955383, "reward_std": 0.0840974785387516, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8219383358955383, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 182.39453125, "epoch": 0.6564835879103023, "grad_norm": 0.45920953154563904, "kl": 0.0274658203125, "learning_rate": 3.4330299089726915e-07, "loss": 0.0011, "reward": 1.8254390954971313, "reward_std": 0.09000331163406372, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8371579349040985, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 184.7578125, "epoch": 0.6571335716607085, "grad_norm": 0.447531521320343, "kl": 0.0347900390625, "learning_rate": 3.426527958387516e-07, "loss": 0.0014, "reward": 1.7348089814186096, "reward_std": 0.16272903606295586, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.773871511220932, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 174.421875, "epoch": 0.6577835554111148, "grad_norm": 0.4250657856464386, "kl": 0.03021240234375, "learning_rate": 3.4200260078023405e-07, "loss": 0.0012, "reward": 1.7940666675567627, "reward_std": 0.12222032994031906, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8135978579521179, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 203.15625, "epoch": 0.6584335391615209, "grad_norm": 0.45355382561683655, "kl": 0.0369873046875, "learning_rate": 3.413524057217165e-07, "loss": 0.0015, "reward": 1.7785805463790894, "reward_std": 0.12349207699298859, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8059242963790894, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 192.046875, "epoch": 0.6590835229119272, "grad_norm": 0.5050148367881775, "kl": 0.0341796875, "learning_rate": 3.4070221066319895e-07, "loss": 0.0014, "reward": 1.7792090773582458, "reward_std": 0.12732946500182152, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8104590773582458, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 204.8671875, "epoch": 0.6597335066623334, "grad_norm": 0.4135866165161133, "kl": 0.0333251953125, "learning_rate": 3.400520156046814e-07, "loss": 0.0013, "reward": 1.7539638876914978, "reward_std": 0.1427154541015625, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7813076078891754, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 194.41796875, "epoch": 0.6603834904127397, "grad_norm": 0.39179861545562744, "kl": 0.034423828125, "learning_rate": 3.3940182054616385e-07, "loss": 0.0014, "reward": 1.775888741016388, "reward_std": 0.11394671350717545, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7954200208187103, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 170.73828125, "epoch": 0.661033474163146, "grad_norm": 0.41553884744644165, "kl": 0.02728271484375, "learning_rate": 3.387516254876463e-07, "loss": 0.0011, "reward": 1.829412817955017, "reward_std": 0.08782512694597244, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8450378179550171, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 181.93359375, "epoch": 0.6616834579135522, "grad_norm": 0.3411272466182709, "kl": 0.029296875, "learning_rate": 3.381014304291287e-07, "loss": 0.0012, "reward": 1.8306576609611511, "reward_std": 0.10427945479750633, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8501889705657959, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 192.3984375, "epoch": 0.6623334416639584, "grad_norm": 0.38494956493377686, "kl": 0.0355224609375, "learning_rate": 3.3745123537061114e-07, "loss": 0.0014, "reward": 1.7825042009353638, "reward_std": 0.10405607894062996, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8020354509353638, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 189.5625, "epoch": 0.6629834254143646, "grad_norm": 0.4323137104511261, "kl": 0.035400390625, "learning_rate": 3.368010403120936e-07, "loss": 0.0014, "reward": 1.7775614261627197, "reward_std": 0.10501259192824364, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7970926761627197, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 186.6171875, "epoch": 0.6636334091647709, "grad_norm": 0.37492501735687256, "kl": 0.0325927734375, "learning_rate": 3.3615084525357604e-07, "loss": 0.0013, "reward": 1.806358516216278, "reward_std": 0.12261468172073364, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8297960162162781, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 205.87890625, "epoch": 0.6642833929151771, "grad_norm": 0.6521563529968262, "kl": 0.0318603515625, "learning_rate": 3.3550065019505854e-07, "loss": 0.0013, "reward": 1.8285425901412964, "reward_std": 0.09081440418958664, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.848073810338974, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 195.46484375, "epoch": 0.6649333766655834, "grad_norm": 0.40868985652923584, "kl": 0.03070068359375, "learning_rate": 3.3485045513654094e-07, "loss": 0.0012, "reward": 1.8064932823181152, "reward_std": 0.07211176306009293, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8182120323181152, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 175.953125, "epoch": 0.6655833604159896, "grad_norm": 0.3950628638267517, "kl": 0.0421142578125, "learning_rate": 3.342002600780234e-07, "loss": 0.0017, "reward": 1.807862102985382, "reward_std": 0.12643610686063766, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8352059125900269, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 191.390625, "epoch": 0.6662333441663958, "grad_norm": 0.49619850516319275, "kl": 0.0435791015625, "learning_rate": 3.3355006501950584e-07, "loss": 0.0017, "reward": 1.7777438163757324, "reward_std": 0.116212859749794, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7894625663757324, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 191.9609375, "epoch": 0.666883327916802, "grad_norm": 0.3836101293563843, "kl": 0.02996826171875, "learning_rate": 3.328998699609883e-07, "loss": 0.0012, "reward": 1.8122394680976868, "reward_std": 0.11477949097752571, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8278644979000092, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 194.6328125, "epoch": 0.6675333116672083, "grad_norm": 0.4621821343898773, "kl": 0.0301513671875, "learning_rate": 3.3224967490247073e-07, "loss": 0.0012, "reward": 1.7771345376968384, "reward_std": 0.06751604191958904, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7849469482898712, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 180.87890625, "epoch": 0.6681832954176146, "grad_norm": 0.33893558382987976, "kl": 0.03216552734375, "learning_rate": 3.3159947984395313e-07, "loss": 0.0013, "reward": 1.825176477432251, "reward_std": 0.07440808415412903, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.836895227432251, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 170.46484375, "epoch": 0.6688332791680208, "grad_norm": 0.43978822231292725, "kl": 0.03106689453125, "learning_rate": 3.309492847854356e-07, "loss": 0.0012, "reward": 1.8302163481712341, "reward_std": 0.07577112689614296, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8380288481712341, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 185.609375, "epoch": 0.6694832629184271, "grad_norm": 0.4351489841938019, "kl": 0.03387451171875, "learning_rate": 3.302990897269181e-07, "loss": 0.0014, "reward": 1.7889364361763, "reward_std": 0.07933376729488373, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7928426861763, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 184.87890625, "epoch": 0.6701332466688332, "grad_norm": 0.4289831221103668, "kl": 0.0384521484375, "learning_rate": 3.2964889466840053e-07, "loss": 0.0015, "reward": 1.8162646889686584, "reward_std": 0.12347571551799774, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8318896889686584, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 178.484375, "epoch": 0.6707832304192395, "grad_norm": 0.4170733690261841, "kl": 0.0345458984375, "learning_rate": 3.28998699609883e-07, "loss": 0.0014, "reward": 1.8050548434257507, "reward_std": 0.09572723507881165, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8167736232280731, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 182.4296875, "epoch": 0.6714332141696457, "grad_norm": 0.391887903213501, "kl": 0.034912109375, "learning_rate": 3.283485045513654e-07, "loss": 0.0014, "reward": 1.7868040800094604, "reward_std": 0.080360297113657, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7946165800094604, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 172.234375, "epoch": 0.672083197920052, "grad_norm": 0.41478824615478516, "kl": 0.0391845703125, "learning_rate": 3.276983094928478e-07, "loss": 0.0016, "reward": 1.7861740589141846, "reward_std": 0.1377660036087036, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.809611588716507, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 173.30859375, "epoch": 0.6727331816704583, "grad_norm": 0.41659730672836304, "kl": 0.03466796875, "learning_rate": 3.2704811443433027e-07, "loss": 0.0014, "reward": 1.8344513177871704, "reward_std": 0.13882596790790558, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8539826273918152, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 172.01171875, "epoch": 0.6733831654208645, "grad_norm": 0.3884132504463196, "kl": 0.0321044921875, "learning_rate": 3.263979193758127e-07, "loss": 0.0013, "reward": 1.8099502325057983, "reward_std": 0.09349188022315502, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.825575202703476, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 195.49609375, "epoch": 0.6740331491712708, "grad_norm": 0.41380369663238525, "kl": 0.0328369140625, "learning_rate": 3.2574772431729517e-07, "loss": 0.0013, "reward": 1.7844451665878296, "reward_std": 0.1332039311528206, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8156952559947968, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 171.54296875, "epoch": 0.6746831329216769, "grad_norm": 0.4637012183666229, "kl": 0.0352783203125, "learning_rate": 3.2509752925877757e-07, "loss": 0.0014, "reward": 1.7517489194869995, "reward_std": 0.11908707022666931, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7595614492893219, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 179.5234375, "epoch": 0.6753331166720832, "grad_norm": 0.3966270387172699, "kl": 0.025634765625, "learning_rate": 3.2444733420026007e-07, "loss": 0.001, "reward": 1.8209083080291748, "reward_std": 0.055153291672468185, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8209082782268524, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 201.48046875, "epoch": 0.6759831004224894, "grad_norm": 0.3854163885116577, "kl": 0.02752685546875, "learning_rate": 3.237971391417425e-07, "loss": 0.0011, "reward": 1.755513310432434, "reward_std": 0.08293469250202179, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7672320902347565, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 194.6015625, "epoch": 0.6766330841728957, "grad_norm": 0.4542045593261719, "kl": 0.03826904296875, "learning_rate": 3.2314694408322497e-07, "loss": 0.0015, "reward": 1.7762371301651, "reward_std": 0.12012538313865662, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7996746003627777, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 195.10546875, "epoch": 0.677283067923302, "grad_norm": 0.43766456842422485, "kl": 0.027099609375, "learning_rate": 3.224967490247074e-07, "loss": 0.0011, "reward": 1.8003324270248413, "reward_std": 0.0552508644759655, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8042386472225189, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 188.23828125, "epoch": 0.6779330516737082, "grad_norm": 0.38042861223220825, "kl": 0.03118896484375, "learning_rate": 3.218465539661898e-07, "loss": 0.0012, "reward": 1.8389278650283813, "reward_std": 0.09443030506372452, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8506466150283813, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 170.80859375, "epoch": 0.6785830354241144, "grad_norm": 0.44898226857185364, "kl": 0.0404052734375, "learning_rate": 3.2119635890767226e-07, "loss": 0.0016, "reward": 1.8282275199890137, "reward_std": 0.12897058576345444, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8516650497913361, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 184.484375, "epoch": 0.6792330191745206, "grad_norm": 0.40255284309387207, "kl": 0.03515625, "learning_rate": 3.205461638491547e-07, "loss": 0.0014, "reward": 1.800277054309845, "reward_std": 0.11475891247391701, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8159021735191345, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 188.0234375, "epoch": 0.6798830029249269, "grad_norm": 0.4201299548149109, "kl": 0.0341796875, "learning_rate": 3.1989596879063716e-07, "loss": 0.0014, "reward": 1.8075695037841797, "reward_std": 0.0699673481285572, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8153819739818573, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 186.43359375, "epoch": 0.6805329866753331, "grad_norm": 0.4417533278465271, "kl": 0.0352783203125, "learning_rate": 3.1924577373211966e-07, "loss": 0.0014, "reward": 1.6951594948768616, "reward_std": 0.10517853498458862, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7107844650745392, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 191.52734375, "epoch": 0.6811829704257394, "grad_norm": 0.33604007959365845, "kl": 0.029541015625, "learning_rate": 3.1859557867360206e-07, "loss": 0.0012, "reward": 1.8614375591278076, "reward_std": 0.05225335247814655, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8653438091278076, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 193.4609375, "epoch": 0.6818329541761456, "grad_norm": 0.42953217029571533, "kl": 0.03179931640625, "learning_rate": 3.179453836150845e-07, "loss": 0.0013, "reward": 1.7938883304595947, "reward_std": 0.08693274110555649, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8017008006572723, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 178.546875, "epoch": 0.6824829379265518, "grad_norm": 0.46273985505104065, "kl": 0.0396728515625, "learning_rate": 3.1729518855656696e-07, "loss": 0.0016, "reward": 1.8243137001991272, "reward_std": 0.0860644057393074, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8360324800014496, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 194.21484375, "epoch": 0.683132921676958, "grad_norm": 0.4937601089477539, "kl": 0.0458984375, "learning_rate": 3.166449934980494e-07, "loss": 0.0018, "reward": 1.7394456267356873, "reward_std": 0.18393495678901672, "rewards/format_reward_gen": 0.9609375, "rewards/llm_reward": 0.7785081267356873, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 206.71484375, "epoch": 0.6837829054273643, "grad_norm": 0.3579740524291992, "kl": 0.0321044921875, "learning_rate": 3.1599479843953186e-07, "loss": 0.0013, "reward": 1.8114365935325623, "reward_std": 0.09555742889642715, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8231553137302399, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 187.83984375, "epoch": 0.6844328891777706, "grad_norm": 0.4441312253475189, "kl": 0.0386962890625, "learning_rate": 3.1534460338101425e-07, "loss": 0.0015, "reward": 1.7955375909805298, "reward_std": 0.10113810747861862, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8150688111782074, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 181.9140625, "epoch": 0.6850828729281768, "grad_norm": 0.649268627166748, "kl": 0.04364013671875, "learning_rate": 3.146944083224967e-07, "loss": 0.0017, "reward": 1.8081307411193848, "reward_std": 0.10664074495434761, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8198494911193848, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 188.93359375, "epoch": 0.6857328566785831, "grad_norm": 0.5335497260093689, "kl": 0.03131103515625, "learning_rate": 3.1404421326397915e-07, "loss": 0.0013, "reward": 1.8213006258010864, "reward_std": 0.07993898168206215, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8330193758010864, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 178.12109375, "epoch": 0.6863828404289892, "grad_norm": 0.4199521243572235, "kl": 0.036376953125, "learning_rate": 3.1339401820546165e-07, "loss": 0.0015, "reward": 1.8294618129730225, "reward_std": 0.12663426250219345, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8528993129730225, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 175.40234375, "epoch": 0.6870328241793955, "grad_norm": 0.4176506996154785, "kl": 0.035888671875, "learning_rate": 3.127438231469441e-07, "loss": 0.0014, "reward": 1.8136243224143982, "reward_std": 0.10449229180812836, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.829249233007431, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 191.7578125, "epoch": 0.6876828079298017, "grad_norm": 0.44447797536849976, "kl": 0.0341796875, "learning_rate": 3.1209362808842655e-07, "loss": 0.0014, "reward": 1.7445033192634583, "reward_std": 0.14563194662332535, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7718470692634583, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 187.265625, "epoch": 0.688332791680208, "grad_norm": 0.4576212763786316, "kl": 0.029541015625, "learning_rate": 3.1144343302990895e-07, "loss": 0.0012, "reward": 1.7896063327789307, "reward_std": 0.08117595873773098, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8013250827789307, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 173.41015625, "epoch": 0.6889827754306143, "grad_norm": 0.3970135450363159, "kl": 0.0452880859375, "learning_rate": 3.107932379713914e-07, "loss": 0.0018, "reward": 1.774661362171173, "reward_std": 0.11996995285153389, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7980988621711731, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 179.796875, "epoch": 0.6896327591810205, "grad_norm": 0.427669495344162, "kl": 0.037353515625, "learning_rate": 3.1014304291287384e-07, "loss": 0.0015, "reward": 1.7670468091964722, "reward_std": 0.12560519576072693, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7982968389987946, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 214.33984375, "epoch": 0.6902827429314267, "grad_norm": 0.4855952858924866, "kl": 0.027587890625, "learning_rate": 3.094928478543563e-07, "loss": 0.0011, "reward": 1.7807477116584778, "reward_std": 0.06677663326263428, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7924664616584778, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 173.67578125, "epoch": 0.6909327266818329, "grad_norm": 0.5038449764251709, "kl": 0.035888671875, "learning_rate": 3.0884265279583874e-07, "loss": 0.0014, "reward": 1.7772186994552612, "reward_std": 0.138608168810606, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8045624494552612, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 178.73828125, "epoch": 0.6915827104322392, "grad_norm": 0.4271376132965088, "kl": 0.0416259765625, "learning_rate": 3.081924577373212e-07, "loss": 0.0017, "reward": 1.7848414182662964, "reward_std": 0.1467432677745819, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8082788288593292, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 188.16796875, "epoch": 0.6922326941826454, "grad_norm": 0.4622233510017395, "kl": 0.037841796875, "learning_rate": 3.0754226267880364e-07, "loss": 0.0015, "reward": 1.7646682858467102, "reward_std": 0.154909186065197, "rewards/format_reward_gen": 0.95703125, "rewards/llm_reward": 0.8076370060443878, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 173.13671875, "epoch": 0.6928826779330517, "grad_norm": 0.4206404387950897, "kl": 0.0321044921875, "learning_rate": 3.068920676202861e-07, "loss": 0.0013, "reward": 1.8093135356903076, "reward_std": 0.07869943976402283, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8249386250972748, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 197.51171875, "epoch": 0.693532661683458, "grad_norm": 0.4076714813709259, "kl": 0.033203125, "learning_rate": 3.0624187256176854e-07, "loss": 0.0013, "reward": 1.7758047580718994, "reward_std": 0.09935861080884933, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7875235080718994, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 176.6484375, "epoch": 0.6941826454338641, "grad_norm": 0.4708860516548157, "kl": 0.0321044921875, "learning_rate": 3.05591677503251e-07, "loss": 0.0013, "reward": 1.787661373615265, "reward_std": 0.09127227216959, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7993801534175873, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 196.38671875, "epoch": 0.6948326291842704, "grad_norm": 0.41460514068603516, "kl": 0.0357666015625, "learning_rate": 3.049414824447334e-07, "loss": 0.0014, "reward": 1.814038097858429, "reward_std": 0.07470107823610306, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8257568180561066, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 191.3359375, "epoch": 0.6954826129346766, "grad_norm": 0.4565941095352173, "kl": 0.03558349609375, "learning_rate": 3.0429128738621583e-07, "loss": 0.0014, "reward": 1.7812556624412537, "reward_std": 0.1536712981760502, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8125056624412537, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 186.67578125, "epoch": 0.6961325966850829, "grad_norm": 0.41383224725723267, "kl": 0.0391845703125, "learning_rate": 3.036410923276983e-07, "loss": 0.0016, "reward": 1.7807564735412598, "reward_std": 0.11470435187220573, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7963814437389374, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 183.31640625, "epoch": 0.6967825804354891, "grad_norm": 0.4627256691455841, "kl": 0.02764892578125, "learning_rate": 3.0299089726918073e-07, "loss": 0.0011, "reward": 1.8482991456985474, "reward_std": 0.06170045584440231, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8522054255008698, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 171.546875, "epoch": 0.6974325641858954, "grad_norm": 0.3962770998477936, "kl": 0.0367431640625, "learning_rate": 3.0234070221066323e-07, "loss": 0.0015, "reward": 1.824195146560669, "reward_std": 0.11468543112277985, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.843726396560669, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 207.05859375, "epoch": 0.6980825479363016, "grad_norm": 0.41136568784713745, "kl": 0.0352783203125, "learning_rate": 3.0169050715214563e-07, "loss": 0.0014, "reward": 1.786520779132843, "reward_std": 0.11752978339791298, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8060520589351654, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 208.67578125, "epoch": 0.6987325316867078, "grad_norm": 0.4848628640174866, "kl": 0.03955078125, "learning_rate": 3.010403120936281e-07, "loss": 0.0016, "reward": 1.7658708095550537, "reward_std": 0.08594296872615814, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7893082201480865, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 199.12890625, "epoch": 0.699382515437114, "grad_norm": 0.41849857568740845, "kl": 0.02679443359375, "learning_rate": 3.0039011703511053e-07, "loss": 0.0011, "reward": 1.7945247888565063, "reward_std": 0.06800022721290588, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7984309792518616, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 186.859375, "epoch": 0.7000324991875203, "grad_norm": 0.5506415963172913, "kl": 0.03857421875, "learning_rate": 2.99739921976593e-07, "loss": 0.0015, "reward": 1.8247051239013672, "reward_std": 0.11791781336069107, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8481426239013672, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 201.4453125, "epoch": 0.7006824829379266, "grad_norm": 0.3948516845703125, "kl": 0.0328369140625, "learning_rate": 2.990897269180754e-07, "loss": 0.0013, "reward": 1.792831540107727, "reward_std": 0.06844388693571091, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8006440103054047, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 181.16015625, "epoch": 0.7013324666883328, "grad_norm": 0.44941434264183044, "kl": 0.034423828125, "learning_rate": 2.984395318595578e-07, "loss": 0.0014, "reward": 1.8026949167251587, "reward_std": 0.08386041969060898, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8144136071205139, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 194.9765625, "epoch": 0.7019824504387391, "grad_norm": 0.4482017755508423, "kl": 0.034423828125, "learning_rate": 2.9778933680104027e-07, "loss": 0.0014, "reward": 1.7709839344024658, "reward_std": 0.09870462119579315, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7787964344024658, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 190.015625, "epoch": 0.7026324341891452, "grad_norm": 0.3762977719306946, "kl": 0.03271484375, "learning_rate": 2.9713914174252277e-07, "loss": 0.0013, "reward": 1.812050223350525, "reward_std": 0.07228341698646545, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8198627531528473, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 193.85546875, "epoch": 0.7032824179395515, "grad_norm": 0.44663089513778687, "kl": 0.031982421875, "learning_rate": 2.964889466840052e-07, "loss": 0.0013, "reward": 1.8178853392601013, "reward_std": 0.08426556549966335, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8296040594577789, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 175.7421875, "epoch": 0.7039324016899577, "grad_norm": 0.41612163186073303, "kl": 0.0391845703125, "learning_rate": 2.9583875162548767e-07, "loss": 0.0016, "reward": 1.8181031346321106, "reward_std": 0.09899551793932915, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8298219442367554, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 187.1640625, "epoch": 0.704582385440364, "grad_norm": 0.5497518181800842, "kl": 0.03094482421875, "learning_rate": 2.9518855656697007e-07, "loss": 0.0012, "reward": 1.8024157881736755, "reward_std": 0.120302714407444, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8219471275806427, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 190.20703125, "epoch": 0.7052323691907703, "grad_norm": 0.4867874085903168, "kl": 0.037353515625, "learning_rate": 2.945383615084525e-07, "loss": 0.0015, "reward": 1.754986584186554, "reward_std": 0.1174008846282959, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.778424084186554, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 184.72265625, "epoch": 0.7058823529411765, "grad_norm": 0.3876325190067291, "kl": 0.03216552734375, "learning_rate": 2.9388816644993497e-07, "loss": 0.0013, "reward": 1.7892791032791138, "reward_std": 0.07336762920022011, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7970915734767914, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 187.421875, "epoch": 0.7065323366915827, "grad_norm": 0.4321843385696411, "kl": 0.03125, "learning_rate": 2.932379713914174e-07, "loss": 0.0012, "reward": 1.7898118495941162, "reward_std": 0.08043871819972992, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8015305995941162, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 190.89453125, "epoch": 0.7071823204419889, "grad_norm": 0.5156404376029968, "kl": 0.0361328125, "learning_rate": 2.9258777633289986e-07, "loss": 0.0014, "reward": 1.7724705338478088, "reward_std": 0.14293940737843513, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8037205040454865, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 201.63671875, "epoch": 0.7078323041923952, "grad_norm": 0.38684046268463135, "kl": 0.03521728515625, "learning_rate": 2.9193758127438226e-07, "loss": 0.0014, "reward": 1.805163562297821, "reward_std": 0.10459776967763901, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.824694812297821, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 190.07421875, "epoch": 0.7084822879428014, "grad_norm": 0.3893895447254181, "kl": 0.04046630859375, "learning_rate": 2.9128738621586476e-07, "loss": 0.0016, "reward": 1.7745498418807983, "reward_std": 0.08598966524004936, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.794081062078476, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 186.55859375, "epoch": 0.7091322716932077, "grad_norm": 0.7263259291648865, "kl": 0.03302001953125, "learning_rate": 2.906371911573472e-07, "loss": 0.0013, "reward": 1.8409469723701477, "reward_std": 0.07477273046970367, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8526657819747925, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 176.4609375, "epoch": 0.709782255443614, "grad_norm": 0.7159109115600586, "kl": 0.033203125, "learning_rate": 2.8998699609882966e-07, "loss": 0.0013, "reward": 1.7719786167144775, "reward_std": 0.07908502593636513, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7797911763191223, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 176.6328125, "epoch": 0.7104322391940201, "grad_norm": 0.40652018785476685, "kl": 0.03082275390625, "learning_rate": 2.893368010403121e-07, "loss": 0.0012, "reward": 1.8357094526290894, "reward_std": 0.06367306038737297, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8396157026290894, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 184.984375, "epoch": 0.7110822229444264, "grad_norm": 0.42279255390167236, "kl": 0.03515625, "learning_rate": 2.886866059817945e-07, "loss": 0.0014, "reward": 1.7922340631484985, "reward_std": 0.12812912464141846, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8234840333461761, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 173.65234375, "epoch": 0.7117322066948326, "grad_norm": 0.3937469720840454, "kl": 0.03466796875, "learning_rate": 2.8803641092327695e-07, "loss": 0.0014, "reward": 1.7998132705688477, "reward_std": 0.09790626168251038, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8154382705688477, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 182.6171875, "epoch": 0.7123821904452389, "grad_norm": 0.5451598763465881, "kl": 0.03369140625, "learning_rate": 2.873862158647594e-07, "loss": 0.0013, "reward": 1.8142290115356445, "reward_std": 0.0752369835972786, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8298540413379669, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 196.6015625, "epoch": 0.7130321741956451, "grad_norm": 0.3860398828983307, "kl": 0.0322265625, "learning_rate": 2.8673602080624185e-07, "loss": 0.0013, "reward": 1.8146876096725464, "reward_std": 0.06711167097091675, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8225001394748688, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 207.828125, "epoch": 0.7136821579460514, "grad_norm": 0.3938411772251129, "kl": 0.04736328125, "learning_rate": 2.8608582574772435e-07, "loss": 0.0019, "reward": 1.761798620223999, "reward_std": 0.13341763615608215, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.7891423106193542, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 194.5859375, "epoch": 0.7143321416964575, "grad_norm": 0.43851128220558167, "kl": 0.033203125, "learning_rate": 2.8543563068920675e-07, "loss": 0.0013, "reward": 1.7711879014968872, "reward_std": 0.10318533703684807, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7829067707061768, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 182.24609375, "epoch": 0.7149821254468638, "grad_norm": 0.4422314167022705, "kl": 0.029296875, "learning_rate": 2.847854356306892e-07, "loss": 0.0012, "reward": 1.82668536901474, "reward_std": 0.09725919738411903, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.83840411901474, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 182.7890625, "epoch": 0.71563210919727, "grad_norm": 0.4008391797542572, "kl": 0.03271484375, "learning_rate": 2.8413524057217165e-07, "loss": 0.0013, "reward": 1.8475186824798584, "reward_std": 0.04975683428347111, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.8475186824798584, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 183.1796875, "epoch": 0.7162820929476763, "grad_norm": 0.4019288420677185, "kl": 0.02960205078125, "learning_rate": 2.834850455136541e-07, "loss": 0.0012, "reward": 1.7988024353981018, "reward_std": 0.06410825625061989, "rewards/format_reward_gen": 1.0, "rewards/llm_reward": 0.7988024353981018, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 191.3515625, "epoch": 0.7169320766980826, "grad_norm": 0.41164836287498474, "kl": 0.03277587890625, "learning_rate": 2.8283485045513655e-07, "loss": 0.0013, "reward": 1.8018370270729065, "reward_std": 0.096959188580513, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8135557770729065, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 178.88671875, "epoch": 0.7175820604484888, "grad_norm": 0.4101284146308899, "kl": 0.03759765625, "learning_rate": 2.8218465539661894e-07, "loss": 0.0015, "reward": 1.7927234172821045, "reward_std": 0.11944489926099777, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.8278797268867493, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 192.453125, "epoch": 0.7182320441988951, "grad_norm": 0.44664642214775085, "kl": 0.0390625, "learning_rate": 2.815344603381014e-07, "loss": 0.0016, "reward": 1.7729839086532593, "reward_std": 0.10729848593473434, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7964213788509369, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 172.39453125, "epoch": 0.7188820279493012, "grad_norm": 0.4367665946483612, "kl": 0.033447265625, "learning_rate": 2.8088426527958384e-07, "loss": 0.0013, "reward": 1.7880056500434875, "reward_std": 0.11432789266109467, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8036307096481323, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 167.1953125, "epoch": 0.7195320116997075, "grad_norm": 0.4176228940486908, "kl": 0.036865234375, "learning_rate": 2.8023407022106634e-07, "loss": 0.0015, "reward": 1.8294838666915894, "reward_std": 0.0976816900074482, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8490151166915894, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 195.61328125, "epoch": 0.7201819954501137, "grad_norm": 0.42343443632125854, "kl": 0.03387451171875, "learning_rate": 2.795838751625488e-07, "loss": 0.0014, "reward": 1.76171875, "reward_std": 0.08805209957063198, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7695312798023224, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 179.625, "epoch": 0.72083197920052, "grad_norm": 0.5951894521713257, "kl": 0.0379638671875, "learning_rate": 2.789336801040312e-07, "loss": 0.0015, "reward": 1.7346779108047485, "reward_std": 0.16506250202655792, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7659279108047485, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 177.9921875, "epoch": 0.7214819629509263, "grad_norm": 0.7695301175117493, "kl": 0.0289306640625, "learning_rate": 2.7828348504551364e-07, "loss": 0.0012, "reward": 1.826983094215393, "reward_std": 0.08126891031861305, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8465143144130707, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 176.14453125, "epoch": 0.7221319467013325, "grad_norm": 0.410283625125885, "kl": 0.0362548828125, "learning_rate": 2.776332899869961e-07, "loss": 0.0014, "reward": 1.7906194925308228, "reward_std": 0.05719350092113018, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7945257425308228, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 178.484375, "epoch": 0.7227819304517387, "grad_norm": 0.49493688344955444, "kl": 0.0299072265625, "learning_rate": 2.7698309492847854e-07, "loss": 0.0012, "reward": 1.8517454862594604, "reward_std": 0.06782207265496254, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8595579862594604, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 183.69140625, "epoch": 0.7234319142021449, "grad_norm": 0.42719921469688416, "kl": 0.0447998046875, "learning_rate": 2.76332899869961e-07, "loss": 0.0018, "reward": 1.8024413585662842, "reward_std": 0.10195096582174301, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8219725489616394, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 164.91796875, "epoch": 0.7240818979525512, "grad_norm": 0.3800489902496338, "kl": 0.03131103515625, "learning_rate": 2.756827048114434e-07, "loss": 0.0013, "reward": 1.8211680054664612, "reward_std": 0.08934185653924942, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8446054458618164, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 188.75, "epoch": 0.7247318817029574, "grad_norm": 0.38996559381484985, "kl": 0.044189453125, "learning_rate": 2.7503250975292583e-07, "loss": 0.0018, "reward": 1.8221206665039062, "reward_std": 0.09332678094506264, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8377456367015839, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 191.01953125, "epoch": 0.7253818654533637, "grad_norm": 0.36644938588142395, "kl": 0.034423828125, "learning_rate": 2.7438231469440833e-07, "loss": 0.0014, "reward": 1.809039056301117, "reward_std": 0.10281982645392418, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8285703063011169, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 175.2109375, "epoch": 0.72603184920377, "grad_norm": 0.3766622841358185, "kl": 0.03155517578125, "learning_rate": 2.737321196358908e-07, "loss": 0.0013, "reward": 1.7616499066352844, "reward_std": 0.10548773407936096, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7811811864376068, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 181.015625, "epoch": 0.7266818329541761, "grad_norm": 0.37040239572525024, "kl": 0.03338623046875, "learning_rate": 2.7308192457737323e-07, "loss": 0.0013, "reward": 1.85875004529953, "reward_std": 0.09902332350611687, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8743751347064972, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 174.33984375, "epoch": 0.7273318167045824, "grad_norm": 0.37233006954193115, "kl": 0.03662109375, "learning_rate": 2.7243172951885563e-07, "loss": 0.0015, "reward": 1.8570718169212341, "reward_std": 0.06475404277443886, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8609780669212341, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 180.7109375, "epoch": 0.7279818004549886, "grad_norm": 0.38442784547805786, "kl": 0.0384521484375, "learning_rate": 2.717815344603381e-07, "loss": 0.0015, "reward": 1.813902735710144, "reward_std": 0.11264144629240036, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.833433985710144, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 180.0078125, "epoch": 0.7286317842053949, "grad_norm": 0.4037625789642334, "kl": 0.0416259765625, "learning_rate": 2.711313394018205e-07, "loss": 0.0017, "reward": 1.8254218697547913, "reward_std": 0.12166977673768997, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8449531197547913, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 172.31640625, "epoch": 0.7292817679558011, "grad_norm": 0.4392552971839905, "kl": 0.033203125, "learning_rate": 2.70481144343303e-07, "loss": 0.0013, "reward": 1.7960082292556763, "reward_std": 0.11938555166125298, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.823352038860321, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 175.921875, "epoch": 0.7299317517062074, "grad_norm": 0.3385460078716278, "kl": 0.02813720703125, "learning_rate": 2.698309492847854e-07, "loss": 0.0011, "reward": 1.7838813662528992, "reward_std": 0.07989689521491528, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.803412675857544, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 168.078125, "epoch": 0.7305817354566135, "grad_norm": 0.40668633580207825, "kl": 0.0418701171875, "learning_rate": 2.6918075422626787e-07, "loss": 0.0017, "reward": 1.8154296875, "reward_std": 0.11839290708303452, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8388671576976776, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 181.52734375, "epoch": 0.7312317192070198, "grad_norm": 0.43238547444343567, "kl": 0.03466796875, "learning_rate": 2.685305591677503e-07, "loss": 0.0014, "reward": 1.8247381448745728, "reward_std": 0.1046791709959507, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8442694246768951, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 190.45703125, "epoch": 0.731881702957426, "grad_norm": 0.48212525248527527, "kl": 0.0355224609375, "learning_rate": 2.6788036410923277e-07, "loss": 0.0014, "reward": 1.775117039680481, "reward_std": 0.11991185694932938, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.794648289680481, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 180.55859375, "epoch": 0.7325316867078323, "grad_norm": 0.5229998230934143, "kl": 0.0350341796875, "learning_rate": 2.672301690507152e-07, "loss": 0.0014, "reward": 1.8206223845481873, "reward_std": 0.0645713061094284, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8284348249435425, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 180.92578125, "epoch": 0.7331816704582386, "grad_norm": 0.39731472730636597, "kl": 0.04052734375, "learning_rate": 2.6657997399219767e-07, "loss": 0.0016, "reward": 1.77387934923172, "reward_std": 0.07179629057645798, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7855981588363647, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 176.703125, "epoch": 0.7338316542086448, "grad_norm": 0.35820508003234863, "kl": 0.031005859375, "learning_rate": 2.6592977893368007e-07, "loss": 0.0012, "reward": 1.8398481607437134, "reward_std": 0.06137095019221306, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8515669107437134, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 208.9765625, "epoch": 0.734481637959051, "grad_norm": 0.4308309257030487, "kl": 0.037109375, "learning_rate": 2.652795838751625e-07, "loss": 0.0015, "reward": 1.7739565968513489, "reward_std": 0.0872671902179718, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7778628170490265, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 192.6328125, "epoch": 0.7351316217094572, "grad_norm": 0.5189074873924255, "kl": 0.0343017578125, "learning_rate": 2.6462938881664496e-07, "loss": 0.0014, "reward": 1.790770947933197, "reward_std": 0.09843292087316513, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8063958883285522, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 180.98828125, "epoch": 0.7357816054598635, "grad_norm": 0.4649021029472351, "kl": 0.0341796875, "learning_rate": 2.639791937581274e-07, "loss": 0.0014, "reward": 1.7843902707099915, "reward_std": 0.1239771842956543, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7961090207099915, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 174.65234375, "epoch": 0.7364315892102697, "grad_norm": 0.512108325958252, "kl": 0.0382080078125, "learning_rate": 2.633289986996099e-07, "loss": 0.0015, "reward": 1.788785457611084, "reward_std": 0.11561418324708939, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8044104278087616, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 177.06640625, "epoch": 0.737081572960676, "grad_norm": 0.384656697511673, "kl": 0.03857421875, "learning_rate": 2.626788036410923e-07, "loss": 0.0015, "reward": 1.8155763149261475, "reward_std": 0.10046703368425369, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8390138447284698, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 177.80078125, "epoch": 0.7377315567110823, "grad_norm": 0.4496457278728485, "kl": 0.0350341796875, "learning_rate": 2.6202860858257476e-07, "loss": 0.0014, "reward": 1.8005906343460083, "reward_std": 0.09302110970020294, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8162156641483307, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 199.390625, "epoch": 0.7383815404614885, "grad_norm": 0.4073523283004761, "kl": 0.0350341796875, "learning_rate": 2.613784135240572e-07, "loss": 0.0014, "reward": 1.7627050280570984, "reward_std": 0.10852214321494102, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7822362780570984, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 179.60546875, "epoch": 0.7390315242118947, "grad_norm": 0.4003308415412903, "kl": 0.03338623046875, "learning_rate": 2.6072821846553966e-07, "loss": 0.0013, "reward": 1.741679310798645, "reward_std": 0.10826538503170013, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.7729293406009674, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 205.94140625, "epoch": 0.7396815079623009, "grad_norm": 0.7780517935752869, "kl": 0.0306396484375, "learning_rate": 2.600780234070221e-07, "loss": 0.0012, "reward": 1.7631913423538208, "reward_std": 0.10851889476180077, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7827225625514984, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 171.984375, "epoch": 0.7403314917127072, "grad_norm": 0.3609669506549835, "kl": 0.02801513671875, "learning_rate": 2.594278283485045e-07, "loss": 0.0011, "reward": 1.8347712755203247, "reward_std": 0.06336707435548306, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8464900553226471, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 203.1328125, "epoch": 0.7409814754631134, "grad_norm": 0.4510640501976013, "kl": 0.033935546875, "learning_rate": 2.5877763328998695e-07, "loss": 0.0014, "reward": 1.7367473244667053, "reward_std": 0.124761201441288, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7601848542690277, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 195.11328125, "epoch": 0.7416314592135197, "grad_norm": 0.37362000346183777, "kl": 0.0302734375, "learning_rate": 2.5812743823146945e-07, "loss": 0.0012, "reward": 1.8076260089874268, "reward_std": 0.10912998765707016, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8271572589874268, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 174.8125, "epoch": 0.742281442963926, "grad_norm": 0.37153804302215576, "kl": 0.0306396484375, "learning_rate": 2.574772431729519e-07, "loss": 0.0012, "reward": 1.8393043875694275, "reward_std": 0.07574750855565071, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8471169471740723, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 178.84375, "epoch": 0.7429314267143321, "grad_norm": 0.5590206980705261, "kl": 0.0355224609375, "learning_rate": 2.5682704811443435e-07, "loss": 0.0014, "reward": 1.793043076992035, "reward_std": 0.08738543093204498, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8047618269920349, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 179.19140625, "epoch": 0.7435814104647384, "grad_norm": 0.45337212085723877, "kl": 0.04736328125, "learning_rate": 2.5617685305591675e-07, "loss": 0.0019, "reward": 1.7814656496047974, "reward_std": 0.0990733802318573, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7970906794071198, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 184.64453125, "epoch": 0.7442313942151446, "grad_norm": 0.40075549483299255, "kl": 0.02923583984375, "learning_rate": 2.555266579973992e-07, "loss": 0.0012, "reward": 1.8167061805725098, "reward_std": 0.12274833396077156, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8440499007701874, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 177.5, "epoch": 0.7448813779655509, "grad_norm": 0.4811926484107971, "kl": 0.0330810546875, "learning_rate": 2.5487646293888165e-07, "loss": 0.0013, "reward": 1.7992799282073975, "reward_std": 0.10197225585579872, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8188112080097198, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 199.328125, "epoch": 0.7455313617159571, "grad_norm": 0.4695543944835663, "kl": 0.0380859375, "learning_rate": 2.542262678803641e-07, "loss": 0.0015, "reward": 1.7512574791908264, "reward_std": 0.1166670098900795, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.774694949388504, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 188.73046875, "epoch": 0.7461813454663634, "grad_norm": 0.4481674134731293, "kl": 0.0345458984375, "learning_rate": 2.5357607282184655e-07, "loss": 0.0014, "reward": 1.8081326484680176, "reward_std": 0.08626127615571022, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8198513984680176, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 168.9609375, "epoch": 0.7468313292167695, "grad_norm": 0.4179040491580963, "kl": 0.033203125, "learning_rate": 2.5292587776332894e-07, "loss": 0.0013, "reward": 1.8202970027923584, "reward_std": 0.14614590257406235, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8515470027923584, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 187.1875, "epoch": 0.7474813129671758, "grad_norm": 0.45744600892066956, "kl": 0.02655029296875, "learning_rate": 2.5227568270481144e-07, "loss": 0.0011, "reward": 1.7908912897109985, "reward_std": 0.09954040125012398, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8104225993156433, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 189.8828125, "epoch": 0.748131296717582, "grad_norm": 0.37399131059646606, "kl": 0.03045654296875, "learning_rate": 2.516254876462939e-07, "loss": 0.0012, "reward": 1.8230392932891846, "reward_std": 0.0851987674832344, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8347581028938293, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 191.1328125, "epoch": 0.7487812804679883, "grad_norm": 0.44139474630355835, "kl": 0.0350341796875, "learning_rate": 2.5097529258777634e-07, "loss": 0.0014, "reward": 1.736149549484253, "reward_std": 0.12237653881311417, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7556807994842529, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 193.7109375, "epoch": 0.7494312642183946, "grad_norm": 0.3281811475753784, "kl": 0.02960205078125, "learning_rate": 2.503250975292588e-07, "loss": 0.0012, "reward": 1.820286750793457, "reward_std": 0.08100263401865959, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8320055305957794, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 193.1796875, "epoch": 0.7500812479688008, "grad_norm": 0.4538390040397644, "kl": 0.02789306640625, "learning_rate": 2.4967490247074124e-07, "loss": 0.0011, "reward": 1.7614450454711914, "reward_std": 0.08327356353402138, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7653512954711914, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 192.2734375, "epoch": 0.750731231719207, "grad_norm": 0.4396093785762787, "kl": 0.0382080078125, "learning_rate": 2.4902470741222364e-07, "loss": 0.0015, "reward": 1.8235569596290588, "reward_std": 0.08603415638208389, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8313694596290588, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 183.265625, "epoch": 0.7513812154696132, "grad_norm": 0.8222249746322632, "kl": 0.0372314453125, "learning_rate": 2.483745123537061e-07, "loss": 0.0015, "reward": 1.7329755425453186, "reward_std": 0.15203388780355453, "rewards/format_reward_gen": 0.96484375, "rewards/llm_reward": 0.7681317627429962, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 190.26953125, "epoch": 0.7520311992200195, "grad_norm": 0.458847314119339, "kl": 0.0465087890625, "learning_rate": 2.4772431729518853e-07, "loss": 0.0019, "reward": 1.7822919487953186, "reward_std": 0.090883519500494, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8018232882022858, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 193.71484375, "epoch": 0.7526811829704257, "grad_norm": 0.38860565423965454, "kl": 0.0350341796875, "learning_rate": 2.47074122236671e-07, "loss": 0.0014, "reward": 1.7870376110076904, "reward_std": 0.14670370519161224, "rewards/format_reward_gen": 0.96875, "rewards/llm_reward": 0.8182876706123352, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 182.546875, "epoch": 0.753331166720832, "grad_norm": 0.452086478471756, "kl": 0.0330810546875, "learning_rate": 2.4642392717815343e-07, "loss": 0.0013, "reward": 1.771085500717163, "reward_std": 0.06666271016001701, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.7749916911125183, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 195.4453125, "epoch": 0.7539811504712383, "grad_norm": 0.3943263590335846, "kl": 0.039306640625, "learning_rate": 2.457737321196359e-07, "loss": 0.0016, "reward": 1.790913164615631, "reward_std": 0.10327447950839996, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8026319146156311, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 200.8125, "epoch": 0.7546311342216444, "grad_norm": 0.3668895363807678, "kl": 0.03228759765625, "learning_rate": 2.4512353706111833e-07, "loss": 0.0013, "reward": 1.7651872634887695, "reward_std": 0.08267639577388763, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7886247038841248, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 193.44140625, "epoch": 0.7552811179720507, "grad_norm": 0.3715100884437561, "kl": 0.035400390625, "learning_rate": 2.444733420026008e-07, "loss": 0.0014, "reward": 1.79703688621521, "reward_std": 0.08192635327577591, "rewards/format_reward_gen": 0.97265625, "rewards/llm_reward": 0.8243806660175323, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 188.37890625, "epoch": 0.7559311017224569, "grad_norm": 0.4011111557483673, "kl": 0.036865234375, "learning_rate": 2.4382314694408323e-07, "loss": 0.0015, "reward": 1.7642688155174255, "reward_std": 0.1282973363995552, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7798937559127808, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 194.66015625, "epoch": 0.7565810854728632, "grad_norm": 0.4446175992488861, "kl": 0.04058837890625, "learning_rate": 2.431729518855657e-07, "loss": 0.0016, "reward": 1.766735553741455, "reward_std": 0.10498557612299919, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7823605239391327, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 191.3125, "epoch": 0.7572310692232694, "grad_norm": 0.4294019043445587, "kl": 0.036865234375, "learning_rate": 2.425227568270481e-07, "loss": 0.0015, "reward": 1.803627073764801, "reward_std": 0.11228428035974503, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.819252073764801, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 184.22265625, "epoch": 0.7578810529736757, "grad_norm": 0.42825350165367126, "kl": 0.037353515625, "learning_rate": 2.418725617685305e-07, "loss": 0.0015, "reward": 1.7551646828651428, "reward_std": 0.07658251002430916, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7668834030628204, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 182.81640625, "epoch": 0.758531036724082, "grad_norm": 0.4244191348552704, "kl": 0.0318603515625, "learning_rate": 2.41222366710013e-07, "loss": 0.0013, "reward": 1.8291561007499695, "reward_std": 0.07423260435461998, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8408748209476471, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 190.3671875, "epoch": 0.7591810204744881, "grad_norm": 0.38153964281082153, "kl": 0.02972412109375, "learning_rate": 2.405721716514954e-07, "loss": 0.0012, "reward": 1.8208263516426086, "reward_std": 0.09150883182883263, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8442638516426086, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 177.4765625, "epoch": 0.7598310042248944, "grad_norm": 0.4059700071811676, "kl": 0.0357666015625, "learning_rate": 2.3992197659297787e-07, "loss": 0.0014, "reward": 1.7796109914779663, "reward_std": 0.12691724300384521, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8030483722686768, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 193.12109375, "epoch": 0.7604809879753006, "grad_norm": 0.3904721438884735, "kl": 0.0311279296875, "learning_rate": 2.392717815344603e-07, "loss": 0.0012, "reward": 1.801910161972046, "reward_std": 0.07904011756181717, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8097226917743683, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 177.04296875, "epoch": 0.7611309717257069, "grad_norm": 0.437120258808136, "kl": 0.03277587890625, "learning_rate": 2.3862158647594277e-07, "loss": 0.0013, "reward": 1.8189609050750732, "reward_std": 0.081648338586092, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8345859348773956, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 199.4765625, "epoch": 0.7617809554761131, "grad_norm": 0.4040036201477051, "kl": 0.03106689453125, "learning_rate": 2.3797139141742522e-07, "loss": 0.0012, "reward": 1.7983622550964355, "reward_std": 0.08867611736059189, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8100808560848236, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 181.2890625, "epoch": 0.7624309392265194, "grad_norm": 0.528631865978241, "kl": 0.0377197265625, "learning_rate": 2.3732119635890767e-07, "loss": 0.0015, "reward": 1.7875006794929504, "reward_std": 0.1205926388502121, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8070319294929504, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 194.34375, "epoch": 0.7630809229769255, "grad_norm": 0.4293684661388397, "kl": 0.0367431640625, "learning_rate": 2.3667100130039012e-07, "loss": 0.0015, "reward": 1.7684394717216492, "reward_std": 0.08393288403749466, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.776252031326294, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 192.86328125, "epoch": 0.7637309067273318, "grad_norm": 0.718121349811554, "kl": 0.0386962890625, "learning_rate": 2.3602080624187254e-07, "loss": 0.0015, "reward": 1.7763579487800598, "reward_std": 0.11060105264186859, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.7880766689777374, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 173.6875, "epoch": 0.764380890477738, "grad_norm": 0.3614491820335388, "kl": 0.035888671875, "learning_rate": 2.35370611183355e-07, "loss": 0.0014, "reward": 1.843487560749054, "reward_std": 0.07256099209189415, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8552062511444092, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 184.6875, "epoch": 0.7650308742281443, "grad_norm": 0.3913605213165283, "kl": 0.0341796875, "learning_rate": 2.3472041612483746e-07, "loss": 0.0014, "reward": 1.816445529460907, "reward_std": 0.07658944837749004, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8242580592632294, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 200.29296875, "epoch": 0.7656808579785506, "grad_norm": 0.4486308991909027, "kl": 0.037109375, "learning_rate": 2.3407022106631989e-07, "loss": 0.0015, "reward": 1.7683417797088623, "reward_std": 0.10314005240797997, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7878729701042175, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 189.53515625, "epoch": 0.7663308417289568, "grad_norm": 0.4180290997028351, "kl": 0.031494140625, "learning_rate": 2.3342002600780233e-07, "loss": 0.0013, "reward": 1.8073444366455078, "reward_std": 0.10813508927822113, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8229694664478302, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 180.97265625, "epoch": 0.766980825479363, "grad_norm": 0.5057528018951416, "kl": 0.03216552734375, "learning_rate": 2.3276983094928476e-07, "loss": 0.0013, "reward": 1.834154486656189, "reward_std": 0.053780414164066315, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8458731770515442, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 185.359375, "epoch": 0.7676308092297692, "grad_norm": 0.46091604232788086, "kl": 0.0343017578125, "learning_rate": 2.321196358907672e-07, "loss": 0.0014, "reward": 1.749039351940155, "reward_std": 0.13135002180933952, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7724768817424774, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 169.484375, "epoch": 0.7682807929801755, "grad_norm": 0.40193411707878113, "kl": 0.0362548828125, "learning_rate": 2.3146944083224968e-07, "loss": 0.0015, "reward": 1.81100732088089, "reward_std": 0.08983615785837173, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8227260708808899, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 199.4296875, "epoch": 0.7689307767305817, "grad_norm": 0.3999761939048767, "kl": 0.03369140625, "learning_rate": 2.308192457737321e-07, "loss": 0.0013, "reward": 1.7541357278823853, "reward_std": 0.10367318615317345, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7736670076847076, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 188.40234375, "epoch": 0.769580760480988, "grad_norm": 0.4241648018360138, "kl": 0.03021240234375, "learning_rate": 2.3016905071521455e-07, "loss": 0.0012, "reward": 1.83060622215271, "reward_std": 0.08164843171834946, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8384187519550323, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 176.234375, "epoch": 0.7702307442313943, "grad_norm": 0.4524229168891907, "kl": 0.0306396484375, "learning_rate": 2.2951885565669698e-07, "loss": 0.0012, "reward": 1.7884928584098816, "reward_std": 0.09722042456269264, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8041179180145264, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 178.10546875, "epoch": 0.7708807279818004, "grad_norm": 0.6318072080612183, "kl": 0.0347900390625, "learning_rate": 2.2886866059817945e-07, "loss": 0.0014, "reward": 1.8282453417778015, "reward_std": 0.12093406543135643, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.8477766215801239, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 187.1328125, "epoch": 0.7715307117322067, "grad_norm": 0.4719140827655792, "kl": 0.03350830078125, "learning_rate": 2.282184655396619e-07, "loss": 0.0013, "reward": 1.8111481666564941, "reward_std": 0.10268308594822884, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8228669762611389, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 182.0078125, "epoch": 0.7721806954826129, "grad_norm": 0.3862394690513611, "kl": 0.0321044921875, "learning_rate": 2.2756827048114432e-07, "loss": 0.0013, "reward": 1.7829917669296265, "reward_std": 0.07796203345060349, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.7908042371273041, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 195.16015625, "epoch": 0.7728306792330192, "grad_norm": 0.5114907622337341, "kl": 0.0302734375, "learning_rate": 2.2691807542262677e-07, "loss": 0.0012, "reward": 1.7671287655830383, "reward_std": 0.0972078125923872, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.7866599857807159, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 183.265625, "epoch": 0.7734806629834254, "grad_norm": 0.37296658754348755, "kl": 0.0306396484375, "learning_rate": 2.2626788036410922e-07, "loss": 0.0012, "reward": 1.7947390675544739, "reward_std": 0.07800707593560219, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.8103640675544739, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 190.50390625, "epoch": 0.7741306467338317, "grad_norm": 0.4109960198402405, "kl": 0.0318603515625, "learning_rate": 2.2561768530559167e-07, "loss": 0.0013, "reward": 1.7814976572990417, "reward_std": 0.10199262574315071, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7971227169036865, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 188.0546875, "epoch": 0.7747806304842378, "grad_norm": 0.42774316668510437, "kl": 0.03570556640625, "learning_rate": 2.2496749024707412e-07, "loss": 0.0014, "reward": 1.7792039513587952, "reward_std": 0.10977259278297424, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.8026414215564728, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 208.77734375, "epoch": 0.7754306142346441, "grad_norm": 0.38274434208869934, "kl": 0.03363037109375, "learning_rate": 2.2431729518855654e-07, "loss": 0.0013, "reward": 1.8201509714126587, "reward_std": 0.07132179290056229, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8279634118080139, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 183.83984375, "epoch": 0.7760805979850504, "grad_norm": 0.43566611409187317, "kl": 0.0401611328125, "learning_rate": 2.23667100130039e-07, "loss": 0.0016, "reward": 1.7607280611991882, "reward_std": 0.14854883402585983, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7841655611991882, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 187.96484375, "epoch": 0.7767305817354566, "grad_norm": 0.4324680268764496, "kl": 0.0330810546875, "learning_rate": 2.2301690507152144e-07, "loss": 0.0013, "reward": 1.7298167943954468, "reward_std": 0.09712342545390129, "rewards/format_reward_gen": 0.984375, "rewards/llm_reward": 0.7454417943954468, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 183.4921875, "epoch": 0.7773805654858629, "grad_norm": 0.43270057439804077, "kl": 0.0322265625, "learning_rate": 2.223667100130039e-07, "loss": 0.0013, "reward": 1.8229385018348694, "reward_std": 0.07695582881569862, "rewards/format_reward_gen": 0.99609375, "rewards/llm_reward": 0.8268446624279022, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 192.06640625, "epoch": 0.7780305492362691, "grad_norm": 0.37520068883895874, "kl": 0.0394287109375, "learning_rate": 2.2171651495448634e-07, "loss": 0.0016, "reward": 1.793242633342743, "reward_std": 0.07833995670080185, "rewards/format_reward_gen": 0.98828125, "rewards/llm_reward": 0.8049613833427429, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 176.98046875, "epoch": 0.7786805329866753, "grad_norm": 0.40560200810432434, "kl": 0.02667236328125, "learning_rate": 2.2106631989596876e-07, "loss": 0.0011, "reward": 1.8038289546966553, "reward_std": 0.08451887965202332, "rewards/format_reward_gen": 0.9921875, "rewards/llm_reward": 0.8116414844989777, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 176.6796875, "epoch": 0.7793305167370815, "grad_norm": 0.4518726170063019, "kl": 0.03216552734375, "learning_rate": 2.2041612483745124e-07, "loss": 0.0013, "reward": 1.771854817867279, "reward_std": 0.11043524742126465, "rewards/format_reward_gen": 0.9765625, "rewards/llm_reward": 0.7952923476696014, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 181.703125, "epoch": 0.7799805004874878, "grad_norm": 0.6881115436553955, "kl": 0.0313720703125, "learning_rate": 2.1976592977893366e-07, "loss": 0.0013, "reward": 1.784915566444397, "reward_std": 0.11860707774758339, "rewards/format_reward_gen": 0.98046875, "rewards/llm_reward": 0.804446816444397, "step": 1200 } ], "logging_steps": 1.0, "max_steps": 1538, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }