{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 196.4453125, "epoch": 0.001070520540612873, "grad_norm": 5.03125, "kl": 0.00023896918719401583, "learning_rate": 9.98929479459387e-07, "loss": 0.0, "reward": 0.06715917773544788, "reward_std": 0.6129379905760288, "rewards/reward_func": 0.06715917773544788, "step": 8 }, { "completion_length": 177.9296875, "epoch": 0.002141041081225746, "grad_norm": 4.15625, "kl": 0.00027647913702821825, "learning_rate": 9.978589589187743e-07, "loss": 0.0, "reward": 0.02951172273606062, "reward_std": 0.5923765227198601, "rewards/reward_func": 0.02951172273606062, "step": 16 }, { "completion_length": 173.359375, "epoch": 0.003211561621838619, "grad_norm": 3.34375, "kl": 0.0002405872492090566, "learning_rate": 9.967884383781614e-07, "loss": 0.0, "reward": 0.12952731922268867, "reward_std": 0.6536133792251348, "rewards/reward_func": 0.12952731922268867, "step": 24 }, { "completion_length": 170.0859375, "epoch": 0.004282082162451492, "grad_norm": 3.21875, "kl": 0.0002442408094793791, "learning_rate": 9.957179178375484e-07, "loss": 0.0, "reward": 0.3293330520391464, "reward_std": 0.5732803735882044, "rewards/reward_func": 0.3293330520391464, "step": 32 }, { "completion_length": 190.3828125, "epoch": 0.005352602703064365, "grad_norm": 4.15625, "kl": 0.00024058844792307355, "learning_rate": 9.946473972969355e-07, "loss": 0.0, "reward": 0.36058899760246277, "reward_std": 0.5922529064118862, "rewards/reward_func": 0.36058899760246277, "step": 40 }, { "completion_length": 228.4453125, "epoch": 0.006423123243677238, "grad_norm": 2.765625, "kl": 0.00029027340133325197, "learning_rate": 9.935768767563228e-07, "loss": 0.0, "reward": 0.026657558977603912, "reward_std": 0.48614570777863264, "rewards/reward_func": 0.026657558977603912, "step": 48 }, { "completion_length": 182.453125, "epoch": 0.007493643784290111, "grad_norm": 4.40625, "kl": 0.00024143315386027098, "learning_rate": 9.925063562157099e-07, "loss": 0.0, "reward": 0.3887675404548645, "reward_std": 0.53492078371346, "rewards/reward_func": 0.3887675404548645, "step": 56 }, { "completion_length": 223.5703125, "epoch": 0.008564164324902984, "grad_norm": 2.578125, "kl": 0.000258006857620785, "learning_rate": 9.91435835675097e-07, "loss": 0.0, "reward": 0.31422215048223734, "reward_std": 0.527774453163147, "rewards/reward_func": 0.31422215048223734, "step": 64 }, { "completion_length": 207.1953125, "epoch": 0.009634684865515858, "grad_norm": 4.03125, "kl": 0.0002873431112675462, "learning_rate": 9.90365315134484e-07, "loss": 0.0, "reward": 0.23160290531814098, "reward_std": 0.5312692299485207, "rewards/reward_func": 0.23160290531814098, "step": 72 }, { "completion_length": 170.234375, "epoch": 0.01070520540612873, "grad_norm": 5.09375, "kl": 0.0002804081450449303, "learning_rate": 9.892947945938713e-07, "loss": 0.0, "reward": 0.2816220009699464, "reward_std": 0.600088307633996, "rewards/reward_func": 0.2816220009699464, "step": 80 }, { "completion_length": 175.84375, "epoch": 0.011775725946741603, "grad_norm": 4.125, "kl": 0.00024894280068110675, "learning_rate": 9.882242740532583e-07, "loss": 0.0, "reward": 0.42427181545645, "reward_std": 0.5529402792453766, "rewards/reward_func": 0.42427181545645, "step": 88 }, { "completion_length": 258.6484375, "epoch": 0.012846246487354477, "grad_norm": 4.03125, "kl": 0.00030983560463937465, "learning_rate": 9.871537535126454e-07, "loss": 0.0, "reward": -0.342121371999383, "reward_std": 0.5698626078665257, "rewards/reward_func": -0.342121371999383, "step": 96 }, { "completion_length": 225.90625, "epoch": 0.013916767027967349, "grad_norm": 3.484375, "kl": 0.0002933137984655332, "learning_rate": 9.860832329720325e-07, "loss": 0.0, "reward": 0.12576067401096225, "reward_std": 0.6368702314794064, "rewards/reward_func": 0.12576067401096225, "step": 104 }, { "completion_length": 168.5703125, "epoch": 0.014987287568580221, "grad_norm": 3.765625, "kl": 0.0003037826609215699, "learning_rate": 9.850127124314198e-07, "loss": 0.0, "reward": 0.17080755531787872, "reward_std": 0.5442508868873119, "rewards/reward_func": 0.17080755531787872, "step": 112 }, { "completion_length": 179.1640625, "epoch": 0.016057808109193095, "grad_norm": 3.390625, "kl": 0.0002824857710947981, "learning_rate": 9.839421918908068e-07, "loss": 0.0, "reward": 0.18145466595888138, "reward_std": 0.5059491451829672, "rewards/reward_func": 0.18145466595888138, "step": 120 }, { "completion_length": 191.78125, "epoch": 0.017128328649805968, "grad_norm": 3.75, "kl": 0.00031089498952496797, "learning_rate": 9.82871671350194e-07, "loss": 0.0, "reward": 0.2814232921227813, "reward_std": 0.5498775225132704, "rewards/reward_func": 0.2814232921227813, "step": 128 }, { "completion_length": 154.2578125, "epoch": 0.01819884919041884, "grad_norm": 3.140625, "kl": 0.0003297478033346124, "learning_rate": 9.818011508095812e-07, "loss": 0.0, "reward": 0.46491820737719536, "reward_std": 0.48998717963695526, "rewards/reward_func": 0.46491820737719536, "step": 136 }, { "completion_length": 180.1171875, "epoch": 0.019269369731031716, "grad_norm": 4.53125, "kl": 0.00035076765561825596, "learning_rate": 9.807306302689682e-07, "loss": 0.0, "reward": 0.27495551854372025, "reward_std": 0.6060709934681654, "rewards/reward_func": 0.27495551854372025, "step": 144 }, { "completion_length": 219.390625, "epoch": 0.020339890271644588, "grad_norm": 3.203125, "kl": 0.00032744045893196017, "learning_rate": 9.796601097283553e-07, "loss": 0.0, "reward": -0.04555501043796539, "reward_std": 0.557650757022202, "rewards/reward_func": -0.04555501043796539, "step": 152 }, { "completion_length": 197.6875, "epoch": 0.02141041081225746, "grad_norm": 4.1875, "kl": 0.0003747515474969987, "learning_rate": 9.785895891877424e-07, "loss": 0.0, "reward": 0.04799163248389959, "reward_std": 0.7354221493005753, "rewards/reward_func": 0.04799163248389959, "step": 160 }, { "completion_length": 158.921875, "epoch": 0.022480931352870333, "grad_norm": 4.71875, "kl": 0.00034361303187324665, "learning_rate": 9.775190686471297e-07, "loss": 0.0, "reward": 0.46762343868613243, "reward_std": 0.5215234383940697, "rewards/reward_func": 0.46762343868613243, "step": 168 }, { "completion_length": 179.4296875, "epoch": 0.023551451893483205, "grad_norm": 2.390625, "kl": 0.0003124878894595895, "learning_rate": 9.764485481065167e-07, "loss": 0.0, "reward": 0.15805792342871428, "reward_std": 0.6787187531590462, "rewards/reward_func": 0.15805792342871428, "step": 176 }, { "completion_length": 185.5546875, "epoch": 0.02462197243409608, "grad_norm": 3.078125, "kl": 0.0003605757010518573, "learning_rate": 9.75378027565904e-07, "loss": 0.0, "reward": 0.30135733261704445, "reward_std": 0.4861781559884548, "rewards/reward_func": 0.30135733261704445, "step": 184 }, { "completion_length": 176.8984375, "epoch": 0.025692492974708953, "grad_norm": 2.671875, "kl": 0.0003968792916566599, "learning_rate": 9.743075070252909e-07, "loss": 0.0, "reward": 0.18422270519658923, "reward_std": 0.5276681184768677, "rewards/reward_func": 0.18422270519658923, "step": 192 }, { "completion_length": 196.234375, "epoch": 0.026763013515321826, "grad_norm": 3.8125, "kl": 0.0003164229347021319, "learning_rate": 9.732369864846782e-07, "loss": 0.0, "reward": 0.09587159566581249, "reward_std": 0.5885980241000652, "rewards/reward_func": 0.09587159566581249, "step": 200 }, { "completion_length": 187.15625, "epoch": 0.027833534055934698, "grad_norm": 5.65625, "kl": 0.00039002683661237825, "learning_rate": 9.721664659440652e-07, "loss": 0.0, "reward": 0.19378361385315657, "reward_std": 0.439416766166687, "rewards/reward_func": 0.19378361385315657, "step": 208 }, { "completion_length": 186.359375, "epoch": 0.02890405459654757, "grad_norm": 4.15625, "kl": 0.0004315389560360927, "learning_rate": 9.710959454034525e-07, "loss": 0.0, "reward": 0.2533123311586678, "reward_std": 0.6090654768049717, "rewards/reward_func": 0.2533123311586678, "step": 216 }, { "completion_length": 206.7265625, "epoch": 0.029974575137160443, "grad_norm": 3.8125, "kl": 0.00036658769749919884, "learning_rate": 9.700254248628396e-07, "loss": 0.0, "reward": 0.11443387717008591, "reward_std": 0.6023523053154349, "rewards/reward_func": 0.11443387717008591, "step": 224 }, { "completion_length": 189.0703125, "epoch": 0.03104509567777332, "grad_norm": 4.78125, "kl": 0.00041832886927295476, "learning_rate": 9.689549043222266e-07, "loss": 0.0, "reward": -0.07620369084179401, "reward_std": 0.6309537254273891, "rewards/reward_func": -0.07620369084179401, "step": 232 }, { "completion_length": 193.34375, "epoch": 0.03211561621838619, "grad_norm": 5.03125, "kl": 0.00045970915380166844, "learning_rate": 9.678843837816137e-07, "loss": 0.0, "reward": 0.13946556020528078, "reward_std": 0.5310352686792612, "rewards/reward_func": 0.13946556020528078, "step": 240 }, { "completion_length": 198.046875, "epoch": 0.03318613675899906, "grad_norm": 4.34375, "kl": 0.00045460829278454185, "learning_rate": 9.66813863241001e-07, "loss": 0.0, "reward": 0.20321442000567913, "reward_std": 0.7352660372853279, "rewards/reward_func": 0.20321442000567913, "step": 248 }, { "completion_length": 197.2265625, "epoch": 0.034256657299611935, "grad_norm": 3.21875, "kl": 0.0004281356050341856, "learning_rate": 9.65743342700388e-07, "loss": 0.0, "reward": 0.20208348147571087, "reward_std": 0.5523553621023893, "rewards/reward_func": 0.20208348147571087, "step": 256 }, { "completion_length": 199.7109375, "epoch": 0.03532717784022481, "grad_norm": 5.15625, "kl": 0.000508931974763982, "learning_rate": 9.646728221597751e-07, "loss": 0.0, "reward": 0.15087968483567238, "reward_std": 0.6751584373414516, "rewards/reward_func": 0.15087968483567238, "step": 264 }, { "completion_length": 181.7578125, "epoch": 0.03639769838083768, "grad_norm": 3.109375, "kl": 0.0004673556577472482, "learning_rate": 9.636023016191622e-07, "loss": 0.0, "reward": 0.4762549586594105, "reward_std": 0.5292778257280588, "rewards/reward_func": 0.4762549586594105, "step": 272 }, { "completion_length": 172.3046875, "epoch": 0.03746821892145055, "grad_norm": 3.515625, "kl": 0.00042566236152197234, "learning_rate": 9.625317810785495e-07, "loss": 0.0, "reward": 0.36441371217370033, "reward_std": 0.4376997593790293, "rewards/reward_func": 0.36441371217370033, "step": 280 }, { "completion_length": 174.234375, "epoch": 0.03853873946206343, "grad_norm": 4.875, "kl": 0.0004668605834012851, "learning_rate": 9.614612605379365e-07, "loss": 0.0, "reward": 0.17927304655313492, "reward_std": 0.5315880142152309, "rewards/reward_func": 0.17927304655313492, "step": 288 }, { "completion_length": 152.8671875, "epoch": 0.039609260002676304, "grad_norm": 3.078125, "kl": 0.0005132910300744697, "learning_rate": 9.603907399973236e-07, "loss": 0.0, "reward": 0.5605849623680115, "reward_std": 0.5153817608952522, "rewards/reward_func": 0.5605849623680115, "step": 296 }, { "completion_length": 203.5390625, "epoch": 0.040679780543289176, "grad_norm": 3.796875, "kl": 0.00048365409747930244, "learning_rate": 9.593202194567109e-07, "loss": 0.0, "reward": -0.10374991549178958, "reward_std": 0.5484482925385237, "rewards/reward_func": -0.10374991549178958, "step": 304 }, { "completion_length": 203.1015625, "epoch": 0.04175030108390205, "grad_norm": 3.28125, "kl": 0.0005178198443900328, "learning_rate": 9.58249698916098e-07, "loss": 0.0, "reward": 0.15655188029631972, "reward_std": 0.6044143028557301, "rewards/reward_func": 0.15655188029631972, "step": 312 }, { "completion_length": 193.34375, "epoch": 0.04282082162451492, "grad_norm": 3.34375, "kl": 0.0005464391106215771, "learning_rate": 9.57179178375485e-07, "loss": 0.0, "reward": 0.20653630187734962, "reward_std": 0.637122736312449, "rewards/reward_func": 0.20653630187734962, "step": 320 }, { "completion_length": 182.390625, "epoch": 0.04389134216512779, "grad_norm": 3.953125, "kl": 0.0005447999064926989, "learning_rate": 9.56108657834872e-07, "loss": 0.0, "reward": 0.0629437193274498, "reward_std": 0.6482261158525944, "rewards/reward_func": 0.0629437193274498, "step": 328 }, { "completion_length": 210.1875, "epoch": 0.044961862705740666, "grad_norm": 3.796875, "kl": 0.0005637894355459139, "learning_rate": 9.550381372942594e-07, "loss": 0.0, "reward": 0.005680203437805176, "reward_std": 0.5875861989334226, "rewards/reward_func": 0.005680203437805176, "step": 336 }, { "completion_length": 179.8671875, "epoch": 0.04603238324635354, "grad_norm": 3.625, "kl": 0.00048511310160392895, "learning_rate": 9.539676167536464e-07, "loss": 0.0, "reward": 0.41209501400589943, "reward_std": 0.49328203592449427, "rewards/reward_func": 0.41209501400589943, "step": 344 }, { "completion_length": 223.6875, "epoch": 0.04710290378696641, "grad_norm": 3.765625, "kl": 0.0005501342748175375, "learning_rate": 9.528970962130335e-07, "loss": 0.0, "reward": -0.01573021337389946, "reward_std": 0.6180381271988153, "rewards/reward_func": -0.01573021337389946, "step": 352 }, { "completion_length": 179.4609375, "epoch": 0.04817342432757928, "grad_norm": 4.1875, "kl": 0.0005429566881502979, "learning_rate": 9.518265756724207e-07, "loss": 0.0, "reward": 0.3368812333792448, "reward_std": 0.42025264725089073, "rewards/reward_func": 0.3368812333792448, "step": 360 }, { "completion_length": 217.0234375, "epoch": 0.04924394486819216, "grad_norm": 3.640625, "kl": 0.0005859209413756616, "learning_rate": 9.507560551318078e-07, "loss": 0.0, "reward": 0.07761704362928867, "reward_std": 0.539461400359869, "rewards/reward_func": 0.07761704362928867, "step": 368 }, { "completion_length": 184.6484375, "epoch": 0.050314465408805034, "grad_norm": 3.46875, "kl": 0.0006783130666008219, "learning_rate": 9.496855345911949e-07, "loss": 0.0, "reward": 0.3878909517079592, "reward_std": 0.5879320036619902, "rewards/reward_func": 0.3878909517079592, "step": 376 }, { "completion_length": 237.3125, "epoch": 0.051384985949417906, "grad_norm": 2.828125, "kl": 0.0005360892773751402, "learning_rate": 9.486150140505821e-07, "loss": 0.0, "reward": 0.2408028580248356, "reward_std": 0.6049665845930576, "rewards/reward_func": 0.2408028580248356, "step": 384 }, { "completion_length": 181.765625, "epoch": 0.05245550649003078, "grad_norm": 4.1875, "kl": 0.0007057523835101165, "learning_rate": 9.475444935099693e-07, "loss": 0.0, "reward": 0.1389563176780939, "reward_std": 0.5876767132431269, "rewards/reward_func": 0.1389563176780939, "step": 392 }, { "completion_length": 167.578125, "epoch": 0.05352602703064365, "grad_norm": 3.859375, "kl": 0.0006887019626447, "learning_rate": 9.464739729693562e-07, "loss": 0.0, "reward": 0.6259515974670649, "reward_std": 0.3476364128291607, "rewards/reward_func": 0.6259515974670649, "step": 400 }, { "completion_length": 197.25, "epoch": 0.05459654757125652, "grad_norm": 3.03125, "kl": 0.0006218861890374683, "learning_rate": 9.454034524287434e-07, "loss": 0.0, "reward": 0.05700792092829943, "reward_std": 0.428726595826447, "rewards/reward_func": 0.05700792092829943, "step": 408 }, { "completion_length": 185.859375, "epoch": 0.055667068111869396, "grad_norm": 3.390625, "kl": 0.0005967724355286919, "learning_rate": 9.443329318881306e-07, "loss": 0.0, "reward": 0.2252086065709591, "reward_std": 0.5162075459957123, "rewards/reward_func": 0.2252086065709591, "step": 416 }, { "completion_length": 162.9296875, "epoch": 0.05673758865248227, "grad_norm": 4.28125, "kl": 0.0008201822929549962, "learning_rate": 9.432624113475178e-07, "loss": 0.0, "reward": 0.322255807928741, "reward_std": 0.6453814581036568, "rewards/reward_func": 0.322255807928741, "step": 424 }, { "completion_length": 211.1015625, "epoch": 0.05780810919309514, "grad_norm": 4.0, "kl": 0.0006929989831405692, "learning_rate": 9.421918908069048e-07, "loss": 0.0, "reward": 0.015094950795173645, "reward_std": 0.5742807984352112, "rewards/reward_func": 0.015094950795173645, "step": 432 }, { "completion_length": 190.5625, "epoch": 0.05887862973370801, "grad_norm": 2.421875, "kl": 0.0007785350171616301, "learning_rate": 9.411213702662919e-07, "loss": 0.0, "reward": 0.2807863000780344, "reward_std": 0.38556696847081184, "rewards/reward_func": 0.2807863000780344, "step": 440 }, { "completion_length": 205.90625, "epoch": 0.059949150274320885, "grad_norm": 3.34375, "kl": 0.000683286452840548, "learning_rate": 9.400508497256791e-07, "loss": 0.0, "reward": 0.17150266654789448, "reward_std": 0.6842659376561642, "rewards/reward_func": 0.17150266654789448, "step": 448 }, { "completion_length": 170.46875, "epoch": 0.061019670814933764, "grad_norm": 4.84375, "kl": 0.0007365654601017013, "learning_rate": 9.389803291850661e-07, "loss": 0.0, "reward": 0.28804031014442444, "reward_std": 0.5351038463413715, "rewards/reward_func": 0.28804031014442444, "step": 456 }, { "completion_length": 159.1875, "epoch": 0.06209019135554664, "grad_norm": 5.8125, "kl": 0.0008180349177564494, "learning_rate": 9.379098086444533e-07, "loss": 0.0, "reward": 0.3410092554986477, "reward_std": 0.652816615998745, "rewards/reward_func": 0.3410092554986477, "step": 464 }, { "completion_length": 164.8125, "epoch": 0.0631607118961595, "grad_norm": 5.21875, "kl": 0.0009274725816794671, "learning_rate": 9.368392881038405e-07, "loss": 0.0, "reward": 0.42266903538256884, "reward_std": 0.5641947891563177, "rewards/reward_func": 0.42266903538256884, "step": 472 }, { "completion_length": 194.984375, "epoch": 0.06423123243677238, "grad_norm": 3.078125, "kl": 0.0007724944334768225, "learning_rate": 9.357687675632276e-07, "loss": 0.0, "reward": 0.14429602678865194, "reward_std": 0.6491441205143929, "rewards/reward_func": 0.14429602678865194, "step": 480 }, { "completion_length": 162.75, "epoch": 0.06530175297738525, "grad_norm": 3.0625, "kl": 0.0008644247645861469, "learning_rate": 9.346982470226146e-07, "loss": 0.0, "reward": 0.4321166332811117, "reward_std": 0.595779299736023, "rewards/reward_func": 0.4321166332811117, "step": 488 }, { "completion_length": 206.21875, "epoch": 0.06637227351799813, "grad_norm": 5.3125, "kl": 0.0008183796162484214, "learning_rate": 9.336277264820018e-07, "loss": 0.0, "reward": 0.15042403992265463, "reward_std": 0.6641352027654648, "rewards/reward_func": 0.15042403992265463, "step": 496 }, { "completion_length": 169.53125, "epoch": 0.067442794058611, "grad_norm": 3.359375, "kl": 0.0007982932875165716, "learning_rate": 9.32557205941389e-07, "loss": 0.0, "reward": 0.5449392115697265, "reward_std": 0.4078510096296668, "rewards/reward_func": 0.5449392115697265, "step": 504 }, { "completion_length": 217.1328125, "epoch": 0.06851331459922387, "grad_norm": 4.46875, "kl": 0.0007571115165774245, "learning_rate": 9.314866854007762e-07, "loss": 0.0, "reward": -0.14380409568548203, "reward_std": 0.42090473882853985, "rewards/reward_func": -0.14380409568548203, "step": 512 }, { "completion_length": 188.7578125, "epoch": 0.06958383513983675, "grad_norm": 3.96875, "kl": 0.0009429744168301113, "learning_rate": 9.304161648601631e-07, "loss": 0.0, "reward": 0.2523565851151943, "reward_std": 0.5918965879827738, "rewards/reward_func": 0.2523565851151943, "step": 520 }, { "completion_length": 166.8671875, "epoch": 0.07065435568044962, "grad_norm": 3.796875, "kl": 0.0008449588422081433, "learning_rate": 9.293456443195503e-07, "loss": 0.0, "reward": 0.23421072773635387, "reward_std": 0.5638821180909872, "rewards/reward_func": 0.23421072773635387, "step": 528 }, { "completion_length": 162.359375, "epoch": 0.0717248762210625, "grad_norm": 4.03125, "kl": 0.0010499468044145033, "learning_rate": 9.282751237789375e-07, "loss": 0.0, "reward": 0.14197909273207188, "reward_std": 0.5628513153642416, "rewards/reward_func": 0.14197909273207188, "step": 536 }, { "completion_length": 149.3203125, "epoch": 0.07279539676167536, "grad_norm": 4.3125, "kl": 0.0009130838298005983, "learning_rate": 9.272046032383246e-07, "loss": 0.0, "reward": 0.42748846486210823, "reward_std": 0.4888880178332329, "rewards/reward_func": 0.42748846486210823, "step": 544 }, { "completion_length": 168.25, "epoch": 0.07386591730228824, "grad_norm": 3.09375, "kl": 0.00098653764143819, "learning_rate": 9.261340826977117e-07, "loss": 0.0, "reward": 0.32451459113508463, "reward_std": 0.5782719142735004, "rewards/reward_func": 0.32451459113508463, "step": 552 }, { "completion_length": 186.59375, "epoch": 0.0749364378429011, "grad_norm": 4.34375, "kl": 0.0009752021069289185, "learning_rate": 9.250635621570988e-07, "loss": 0.0, "reward": 0.20521394163370132, "reward_std": 0.5094065079465508, "rewards/reward_func": 0.20521394163370132, "step": 560 }, { "completion_length": 175.78125, "epoch": 0.07600695838351398, "grad_norm": 4.25, "kl": 0.0010090179930557497, "learning_rate": 9.23993041616486e-07, "loss": 0.0, "reward": 0.5578707046806812, "reward_std": 0.4845643825829029, "rewards/reward_func": 0.5578707046806812, "step": 568 }, { "completion_length": 158.234375, "epoch": 0.07707747892412686, "grad_norm": 4.6875, "kl": 0.0011502801644382998, "learning_rate": 9.229225210758731e-07, "loss": 0.0, "reward": 0.4859929271042347, "reward_std": 0.5507038980722427, "rewards/reward_func": 0.4859929271042347, "step": 576 }, { "completion_length": 200.0234375, "epoch": 0.07814799946473973, "grad_norm": 3.9375, "kl": 0.0011561861392692663, "learning_rate": 9.218520005352602e-07, "loss": 0.0, "reward": 0.16512918565422297, "reward_std": 0.5027751969173551, "rewards/reward_func": 0.16512918565422297, "step": 584 }, { "completion_length": 158.3515625, "epoch": 0.07921852000535261, "grad_norm": 3.390625, "kl": 0.0012836234309361316, "learning_rate": 9.207814799946474e-07, "loss": 0.0001, "reward": 0.4137600362300873, "reward_std": 0.5193404145538807, "rewards/reward_func": 0.4137600362300873, "step": 592 }, { "completion_length": 162.8828125, "epoch": 0.08028904054596547, "grad_norm": 2.125, "kl": 0.0011869178197230212, "learning_rate": 9.197109594540344e-07, "loss": 0.0, "reward": 0.39771614968776703, "reward_std": 0.6107706986367702, "rewards/reward_func": 0.39771614968776703, "step": 600 }, { "completion_length": 176.65625, "epoch": 0.08135956108657835, "grad_norm": 4.78125, "kl": 0.0011795180544140749, "learning_rate": 9.186404389134216e-07, "loss": 0.0, "reward": 0.0783949107863009, "reward_std": 0.6460004411637783, "rewards/reward_func": 0.0783949107863009, "step": 608 }, { "completion_length": 158.5390625, "epoch": 0.08243008162719122, "grad_norm": 3.59375, "kl": 0.0013605851854663342, "learning_rate": 9.175699183728087e-07, "loss": 0.0001, "reward": 0.3015612084418535, "reward_std": 0.46242015063762665, "rewards/reward_func": 0.3015612084418535, "step": 616 }, { "completion_length": 192.0, "epoch": 0.0835006021678041, "grad_norm": 6.0625, "kl": 0.001107029449485708, "learning_rate": 9.164993978321959e-07, "loss": 0.0, "reward": -0.052582718431949615, "reward_std": 0.521589694544673, "rewards/reward_func": -0.052582718431949615, "step": 624 }, { "completion_length": 167.3828125, "epoch": 0.08457112270841696, "grad_norm": 3.1875, "kl": 0.0013894213043386117, "learning_rate": 9.15428877291583e-07, "loss": 0.0001, "reward": 0.2013978809118271, "reward_std": 0.6684001944959164, "rewards/reward_func": 0.2013978809118271, "step": 632 }, { "completion_length": 186.6640625, "epoch": 0.08564164324902984, "grad_norm": 3.921875, "kl": 0.0010865220101550221, "learning_rate": 9.143583567509702e-07, "loss": 0.0, "reward": 0.6091820821166039, "reward_std": 0.49955446273088455, "rewards/reward_func": 0.6091820821166039, "step": 640 }, { "completion_length": 202.4609375, "epoch": 0.08671216378964271, "grad_norm": 3.6875, "kl": 0.0012401975327520631, "learning_rate": 9.132878362103572e-07, "loss": 0.0, "reward": 0.14060556702315807, "reward_std": 0.6868433952331543, "rewards/reward_func": 0.14060556702315807, "step": 648 }, { "completion_length": 157.953125, "epoch": 0.08778268433025559, "grad_norm": 4.21875, "kl": 0.0014552801876561716, "learning_rate": 9.122173156697443e-07, "loss": 0.0001, "reward": 0.27967471070587635, "reward_std": 0.5355266528204083, "rewards/reward_func": 0.27967471070587635, "step": 656 }, { "completion_length": 188.9609375, "epoch": 0.08885320487086847, "grad_norm": 2.8125, "kl": 0.0012782855046680197, "learning_rate": 9.111467951291315e-07, "loss": 0.0001, "reward": 0.2866704575717449, "reward_std": 0.46457840129733086, "rewards/reward_func": 0.2866704575717449, "step": 664 }, { "completion_length": 202.0078125, "epoch": 0.08992372541148133, "grad_norm": 3.625, "kl": 0.0010314229875802994, "learning_rate": 9.100762745885187e-07, "loss": 0.0, "reward": 0.21837860718369484, "reward_std": 0.5863924492150545, "rewards/reward_func": 0.21837860718369484, "step": 672 }, { "completion_length": 181.328125, "epoch": 0.09099424595209421, "grad_norm": 4.25, "kl": 0.0011778115513152443, "learning_rate": 9.090057540479058e-07, "loss": 0.0, "reward": 0.17519081057980657, "reward_std": 0.5138188861310482, "rewards/reward_func": 0.17519081057980657, "step": 680 }, { "completion_length": 196.46875, "epoch": 0.09206476649270708, "grad_norm": 4.84375, "kl": 0.0013976221380289644, "learning_rate": 9.079352335072928e-07, "loss": 0.0001, "reward": 0.07826100569218397, "reward_std": 0.6565159633755684, "rewards/reward_func": 0.07826100569218397, "step": 688 }, { "completion_length": 142.6171875, "epoch": 0.09313528703331996, "grad_norm": 4.3125, "kl": 0.0014903126284480095, "learning_rate": 9.0686471296668e-07, "loss": 0.0001, "reward": 0.4409363344311714, "reward_std": 0.6269242819398642, "rewards/reward_func": 0.4409363344311714, "step": 696 }, { "completion_length": 164.75, "epoch": 0.09420580757393282, "grad_norm": 2.859375, "kl": 0.001257821699255146, "learning_rate": 9.057941924260672e-07, "loss": 0.0001, "reward": 0.4695241190493107, "reward_std": 0.4753529988229275, "rewards/reward_func": 0.4695241190493107, "step": 704 }, { "completion_length": 175.765625, "epoch": 0.0952763281145457, "grad_norm": 3.5625, "kl": 0.0015129576058825478, "learning_rate": 9.047236718854542e-07, "loss": 0.0001, "reward": 0.02202584408223629, "reward_std": 0.6471435278654099, "rewards/reward_func": 0.02202584408223629, "step": 712 }, { "completion_length": 178.46875, "epoch": 0.09634684865515857, "grad_norm": 2.671875, "kl": 0.0014852698805043474, "learning_rate": 9.036531513448414e-07, "loss": 0.0001, "reward": 0.1112822787836194, "reward_std": 0.6299657188355923, "rewards/reward_func": 0.1112822787836194, "step": 720 }, { "completion_length": 179.171875, "epoch": 0.09741736919577144, "grad_norm": 3.65625, "kl": 0.001357251821900718, "learning_rate": 9.025826308042285e-07, "loss": 0.0001, "reward": 0.03050302341580391, "reward_std": 0.4494458809494972, "rewards/reward_func": 0.03050302341580391, "step": 728 }, { "completion_length": 179.171875, "epoch": 0.09848788973638432, "grad_norm": 3.21875, "kl": 0.0012782294361386448, "learning_rate": 9.015121102636157e-07, "loss": 0.0001, "reward": 0.07521175127476454, "reward_std": 0.5754083581268787, "rewards/reward_func": 0.07521175127476454, "step": 736 }, { "completion_length": 170.8046875, "epoch": 0.09955841027699719, "grad_norm": 3.640625, "kl": 0.0014729191461810842, "learning_rate": 9.004415897230027e-07, "loss": 0.0001, "reward": 0.19647281896322966, "reward_std": 0.5569281429052353, "rewards/reward_func": 0.19647281896322966, "step": 744 }, { "completion_length": 170.9609375, "epoch": 0.10062893081761007, "grad_norm": 3.390625, "kl": 0.0015401854761876166, "learning_rate": 8.993710691823899e-07, "loss": 0.0001, "reward": 0.3724030330777168, "reward_std": 0.3632662743330002, "rewards/reward_func": 0.3724030330777168, "step": 752 }, { "completion_length": 221.6171875, "epoch": 0.10169945135822293, "grad_norm": 4.15625, "kl": 0.0013893990762881003, "learning_rate": 8.983005486417771e-07, "loss": 0.0001, "reward": -0.09233509004116058, "reward_std": 0.48617786914110184, "rewards/reward_func": -0.09233509004116058, "step": 760 }, { "completion_length": 202.3828125, "epoch": 0.10276997189883581, "grad_norm": 4.375, "kl": 0.0012778284071828239, "learning_rate": 8.972300281011642e-07, "loss": 0.0001, "reward": 0.2091209925711155, "reward_std": 0.6527585946023464, "rewards/reward_func": 0.2091209925711155, "step": 768 }, { "completion_length": 193.1640625, "epoch": 0.10384049243944868, "grad_norm": 3.109375, "kl": 0.0013181737012928352, "learning_rate": 8.961595075605512e-07, "loss": 0.0001, "reward": 0.3330334695056081, "reward_std": 0.541321462020278, "rewards/reward_func": 0.3330334695056081, "step": 776 }, { "completion_length": 152.453125, "epoch": 0.10491101298006156, "grad_norm": 3.359375, "kl": 0.0017155654932139441, "learning_rate": 8.950889870199384e-07, "loss": 0.0001, "reward": 0.5970601001754403, "reward_std": 0.4666150966659188, "rewards/reward_func": 0.5970601001754403, "step": 784 }, { "completion_length": 169.125, "epoch": 0.10598153352067442, "grad_norm": 3.578125, "kl": 0.0017139802366727963, "learning_rate": 8.940184664793256e-07, "loss": 0.0001, "reward": 0.3862100951373577, "reward_std": 0.4441477656364441, "rewards/reward_func": 0.3862100951373577, "step": 792 }, { "completion_length": 155.3828125, "epoch": 0.1070520540612873, "grad_norm": 3.15625, "kl": 0.0015526109855272807, "learning_rate": 8.929479459387127e-07, "loss": 0.0001, "reward": 0.2984956353902817, "reward_std": 0.586381059139967, "rewards/reward_func": 0.2984956353902817, "step": 800 }, { "completion_length": 196.40625, "epoch": 0.10812257460190017, "grad_norm": 4.90625, "kl": 0.0014231447203201242, "learning_rate": 8.918774253980997e-07, "loss": 0.0001, "reward": 0.21418001921847463, "reward_std": 0.6311414241790771, "rewards/reward_func": 0.21418001921847463, "step": 808 }, { "completion_length": 138.2109375, "epoch": 0.10919309514251305, "grad_norm": 4.03125, "kl": 0.0017638935969443992, "learning_rate": 8.908069048574869e-07, "loss": 0.0001, "reward": 0.5397277176380157, "reward_std": 0.5305888652801514, "rewards/reward_func": 0.5397277176380157, "step": 816 }, { "completion_length": 178.0703125, "epoch": 0.11026361568312593, "grad_norm": 3.84375, "kl": 0.0016187937144422904, "learning_rate": 8.897363843168741e-07, "loss": 0.0001, "reward": 0.3325108243152499, "reward_std": 0.5717135239392519, "rewards/reward_func": 0.3325108243152499, "step": 824 }, { "completion_length": 148.953125, "epoch": 0.11133413622373879, "grad_norm": 5.0, "kl": 0.0018482063169358298, "learning_rate": 8.886658637762612e-07, "loss": 0.0001, "reward": 0.2947835847735405, "reward_std": 0.4330580784007907, "rewards/reward_func": 0.2947835847735405, "step": 832 }, { "completion_length": 164.9375, "epoch": 0.11240465676435167, "grad_norm": 4.09375, "kl": 0.0016665154980728403, "learning_rate": 8.875953432356483e-07, "loss": 0.0001, "reward": 0.4421437568962574, "reward_std": 0.6194501928985119, "rewards/reward_func": 0.4421437568962574, "step": 840 }, { "completion_length": 151.4140625, "epoch": 0.11347517730496454, "grad_norm": 3.0, "kl": 0.0017541930938023143, "learning_rate": 8.865248226950354e-07, "loss": 0.0001, "reward": 0.634210865944624, "reward_std": 0.43934057652950287, "rewards/reward_func": 0.634210865944624, "step": 848 }, { "completion_length": 163.0390625, "epoch": 0.11454569784557742, "grad_norm": 4.1875, "kl": 0.001597623537236359, "learning_rate": 8.854543021544225e-07, "loss": 0.0001, "reward": 0.49650172144174576, "reward_std": 0.5100172646343708, "rewards/reward_func": 0.49650172144174576, "step": 856 }, { "completion_length": 175.515625, "epoch": 0.11561621838619028, "grad_norm": 4.28125, "kl": 0.0016097126208478585, "learning_rate": 8.843837816138097e-07, "loss": 0.0001, "reward": 0.300532303750515, "reward_std": 0.5050319191068411, "rewards/reward_func": 0.300532303750515, "step": 864 }, { "completion_length": 166.609375, "epoch": 0.11668673892680316, "grad_norm": 3.375, "kl": 0.001671285106567666, "learning_rate": 8.833132610731968e-07, "loss": 0.0001, "reward": 0.3196424636989832, "reward_std": 0.6211317032575607, "rewards/reward_func": 0.3196424636989832, "step": 872 }, { "completion_length": 199.15625, "epoch": 0.11775725946741603, "grad_norm": 4.46875, "kl": 0.0015437143010785803, "learning_rate": 8.82242740532584e-07, "loss": 0.0001, "reward": -0.04952175496146083, "reward_std": 0.6928350441157818, "rewards/reward_func": -0.04952175496146083, "step": 880 }, { "completion_length": 195.078125, "epoch": 0.1188277800080289, "grad_norm": 3.625, "kl": 0.0014302593117463402, "learning_rate": 8.811722199919711e-07, "loss": 0.0001, "reward": 0.2765323193743825, "reward_std": 0.5081478040665388, "rewards/reward_func": 0.2765323193743825, "step": 888 }, { "completion_length": 162.1640625, "epoch": 0.11989830054864177, "grad_norm": 4.3125, "kl": 0.002093081347993575, "learning_rate": 8.801016994513581e-07, "loss": 0.0001, "reward": 0.43932087533175945, "reward_std": 0.6151396594941616, "rewards/reward_func": 0.43932087533175945, "step": 896 }, { "completion_length": 188.265625, "epoch": 0.12096882108925465, "grad_norm": 4.21875, "kl": 0.0015162140916800126, "learning_rate": 8.790311789107453e-07, "loss": 0.0001, "reward": 0.30038960836827755, "reward_std": 0.6118085775524378, "rewards/reward_func": 0.30038960836827755, "step": 904 }, { "completion_length": 185.625, "epoch": 0.12203934162986753, "grad_norm": 4.21875, "kl": 0.0020139318803558126, "learning_rate": 8.779606583701324e-07, "loss": 0.0001, "reward": 0.09830181300640106, "reward_std": 0.4306083731353283, "rewards/reward_func": 0.09830181300640106, "step": 912 }, { "completion_length": 184.1875, "epoch": 0.1231098621704804, "grad_norm": 3.15625, "kl": 0.0016747360059525818, "learning_rate": 8.768901378295196e-07, "loss": 0.0001, "reward": 0.14476243034005165, "reward_std": 0.5790487378835678, "rewards/reward_func": 0.14476243034005165, "step": 920 }, { "completion_length": 159.734375, "epoch": 0.12418038271109327, "grad_norm": 3.71875, "kl": 0.0016227394880843349, "learning_rate": 8.758196172889067e-07, "loss": 0.0001, "reward": 0.43369535729289055, "reward_std": 0.6066659651696682, "rewards/reward_func": 0.43369535729289055, "step": 928 }, { "completion_length": 197.9375, "epoch": 0.12525090325170615, "grad_norm": 4.5, "kl": 0.0014731917763128877, "learning_rate": 8.747490967482938e-07, "loss": 0.0001, "reward": 0.2775337900966406, "reward_std": 0.4666815670207143, "rewards/reward_func": 0.2775337900966406, "step": 936 }, { "completion_length": 190.015625, "epoch": 0.126321423792319, "grad_norm": 4.21875, "kl": 0.0017878647340694442, "learning_rate": 8.736785762076809e-07, "loss": 0.0001, "reward": 0.12123087048530579, "reward_std": 0.5715042147785425, "rewards/reward_func": 0.12123087048530579, "step": 944 }, { "completion_length": 167.1171875, "epoch": 0.12739194433293188, "grad_norm": 5.25, "kl": 0.0019433694251347333, "learning_rate": 8.726080556670681e-07, "loss": 0.0001, "reward": 0.45328211411833763, "reward_std": 0.5355701018124819, "rewards/reward_func": 0.45328211411833763, "step": 952 }, { "completion_length": 161.625, "epoch": 0.12846246487354476, "grad_norm": 5.0, "kl": 0.002197007488575764, "learning_rate": 8.715375351264552e-07, "loss": 0.0001, "reward": 0.3187681008130312, "reward_std": 0.552251516841352, "rewards/reward_func": 0.3187681008130312, "step": 960 }, { "completion_length": 210.890625, "epoch": 0.12953298541415764, "grad_norm": 5.46875, "kl": 0.0015157314774114639, "learning_rate": 8.704670145858424e-07, "loss": 0.0001, "reward": 0.28820460522547364, "reward_std": 0.6035197824239731, "rewards/reward_func": 0.28820460522547364, "step": 968 }, { "completion_length": 166.625, "epoch": 0.1306035059547705, "grad_norm": 3.765625, "kl": 0.0017258078005397692, "learning_rate": 8.693964940452294e-07, "loss": 0.0001, "reward": 0.5467304401099682, "reward_std": 0.5410985443741083, "rewards/reward_func": 0.5467304401099682, "step": 976 }, { "completion_length": 207.4765625, "epoch": 0.13167402649538337, "grad_norm": 2.9375, "kl": 0.0016618163790553808, "learning_rate": 8.683259735046166e-07, "loss": 0.0001, "reward": 0.24847363959997892, "reward_std": 0.5600069649517536, "rewards/reward_func": 0.24847363959997892, "step": 984 }, { "completion_length": 185.703125, "epoch": 0.13274454703599625, "grad_norm": 3.515625, "kl": 0.0019170493469573557, "learning_rate": 8.672554529640037e-07, "loss": 0.0001, "reward": 0.15675952192395926, "reward_std": 0.4530050400644541, "rewards/reward_func": 0.15675952192395926, "step": 992 }, { "completion_length": 185.6484375, "epoch": 0.13381506757660913, "grad_norm": 4.71875, "kl": 0.0018418136023683473, "learning_rate": 8.661849324233908e-07, "loss": 0.0001, "reward": 0.3646358111873269, "reward_std": 0.5811815112829208, "rewards/reward_func": 0.3646358111873269, "step": 1000 }, { "completion_length": 203.6171875, "epoch": 0.134885588117222, "grad_norm": 3.5625, "kl": 0.0020316866575740278, "learning_rate": 8.65114411882778e-07, "loss": 0.0001, "reward": 0.15310932788997889, "reward_std": 0.5124245472252369, "rewards/reward_func": 0.15310932788997889, "step": 1008 }, { "completion_length": 146.1484375, "epoch": 0.13595610865783486, "grad_norm": 3.65625, "kl": 0.002132126915967092, "learning_rate": 8.640438913421651e-07, "loss": 0.0001, "reward": 0.5445789489895105, "reward_std": 0.5516379848122597, "rewards/reward_func": 0.5445789489895105, "step": 1016 }, { "completion_length": 197.9375, "epoch": 0.13702662919844774, "grad_norm": 4.0625, "kl": 0.001812848830013536, "learning_rate": 8.629733708015521e-07, "loss": 0.0001, "reward": 0.07804312836378813, "reward_std": 0.5468557141721249, "rewards/reward_func": 0.07804312836378813, "step": 1024 }, { "completion_length": 160.8515625, "epoch": 0.13809714973906062, "grad_norm": 3.75, "kl": 0.0020080953399883583, "learning_rate": 8.619028502609393e-07, "loss": 0.0001, "reward": 0.3609808227047324, "reward_std": 0.4902635822072625, "rewards/reward_func": 0.3609808227047324, "step": 1032 }, { "completion_length": 166.921875, "epoch": 0.1391676702796735, "grad_norm": 3.953125, "kl": 0.0021710527944378555, "learning_rate": 8.608323297203265e-07, "loss": 0.0001, "reward": 0.04567475710064173, "reward_std": 0.639631874859333, "rewards/reward_func": 0.04567475710064173, "step": 1040 }, { "completion_length": 137.265625, "epoch": 0.14023819082028635, "grad_norm": 4.46875, "kl": 0.0022375187691068277, "learning_rate": 8.597618091797137e-07, "loss": 0.0001, "reward": 0.6706136465072632, "reward_std": 0.470423087477684, "rewards/reward_func": 0.6706136465072632, "step": 1048 }, { "completion_length": 174.9609375, "epoch": 0.14130871136089923, "grad_norm": 4.8125, "kl": 0.002100168538163416, "learning_rate": 8.586912886391006e-07, "loss": 0.0001, "reward": 0.4244903214275837, "reward_std": 0.4562762314453721, "rewards/reward_func": 0.4244903214275837, "step": 1056 }, { "completion_length": 147.9296875, "epoch": 0.1423792319015121, "grad_norm": 3.40625, "kl": 0.0022357639973051846, "learning_rate": 8.576207680984878e-07, "loss": 0.0001, "reward": 0.5205270126461983, "reward_std": 0.483647458255291, "rewards/reward_func": 0.5205270126461983, "step": 1064 }, { "completion_length": 212.625, "epoch": 0.143449752442125, "grad_norm": 3.46875, "kl": 0.0017438856302760541, "learning_rate": 8.56550247557875e-07, "loss": 0.0001, "reward": 0.04320483095943928, "reward_std": 0.6948880217969418, "rewards/reward_func": 0.04320483095943928, "step": 1072 }, { "completion_length": 181.5390625, "epoch": 0.14452027298273787, "grad_norm": 4.375, "kl": 0.0018758865917334333, "learning_rate": 8.554797270172622e-07, "loss": 0.0001, "reward": 0.30002590641379356, "reward_std": 0.5271559292450547, "rewards/reward_func": 0.30002590641379356, "step": 1080 }, { "completion_length": 176.609375, "epoch": 0.14559079352335072, "grad_norm": 3.59375, "kl": 0.0016780206933617592, "learning_rate": 8.544092064766492e-07, "loss": 0.0001, "reward": 0.29282525181770325, "reward_std": 0.48009985871613026, "rewards/reward_func": 0.29282525181770325, "step": 1088 }, { "completion_length": 218.75, "epoch": 0.1466613140639636, "grad_norm": 3.1875, "kl": 0.002029647948802449, "learning_rate": 8.533386859360363e-07, "loss": 0.0001, "reward": -0.1532103894278407, "reward_std": 0.4770786985754967, "rewards/reward_func": -0.1532103894278407, "step": 1096 }, { "completion_length": 184.4296875, "epoch": 0.14773183460457648, "grad_norm": 3.203125, "kl": 0.0017034321062965319, "learning_rate": 8.522681653954235e-07, "loss": 0.0001, "reward": 0.44974952936172485, "reward_std": 0.5446614529937506, "rewards/reward_func": 0.44974952936172485, "step": 1104 }, { "completion_length": 196.8203125, "epoch": 0.14880235514518936, "grad_norm": 5.0, "kl": 0.001964334660442546, "learning_rate": 8.511976448548106e-07, "loss": 0.0001, "reward": 0.056189559400081635, "reward_std": 0.468637160025537, "rewards/reward_func": 0.056189559400081635, "step": 1112 }, { "completion_length": 169.4921875, "epoch": 0.1498728756858022, "grad_norm": 4.15625, "kl": 0.002173921908251941, "learning_rate": 8.501271243141977e-07, "loss": 0.0001, "reward": 0.23756458796560764, "reward_std": 0.6810929477214813, "rewards/reward_func": 0.23756458796560764, "step": 1120 }, { "completion_length": 155.75, "epoch": 0.1509433962264151, "grad_norm": 4.65625, "kl": 0.00248186280077789, "learning_rate": 8.490566037735849e-07, "loss": 0.0001, "reward": 0.28897845139726996, "reward_std": 0.5369884418323636, "rewards/reward_func": 0.28897845139726996, "step": 1128 }, { "completion_length": 186.2109375, "epoch": 0.15201391676702797, "grad_norm": 3.21875, "kl": 0.0022306487226160243, "learning_rate": 8.479860832329721e-07, "loss": 0.0001, "reward": 0.10482135927304626, "reward_std": 0.6790419593453407, "rewards/reward_func": 0.10482135927304626, "step": 1136 }, { "completion_length": 192.546875, "epoch": 0.15308443730764085, "grad_norm": 3.71875, "kl": 0.0019263384310761467, "learning_rate": 8.469155626923591e-07, "loss": 0.0001, "reward": 0.14666470140218735, "reward_std": 0.5422503855079412, "rewards/reward_func": 0.14666470140218735, "step": 1144 }, { "completion_length": 184.40625, "epoch": 0.15415495784825373, "grad_norm": 2.734375, "kl": 0.002109996523358859, "learning_rate": 8.458450421517462e-07, "loss": 0.0001, "reward": 0.4111117944121361, "reward_std": 0.4935786770656705, "rewards/reward_func": 0.4111117944121361, "step": 1152 }, { "completion_length": 175.953125, "epoch": 0.15522547838886658, "grad_norm": 2.875, "kl": 0.0024219048937084153, "learning_rate": 8.447745216111334e-07, "loss": 0.0001, "reward": 0.26550869084894657, "reward_std": 0.4649670384824276, "rewards/reward_func": 0.26550869084894657, "step": 1160 }, { "completion_length": 161.7578125, "epoch": 0.15629599892947946, "grad_norm": 4.34375, "kl": 0.0023637667181901634, "learning_rate": 8.437040010705205e-07, "loss": 0.0001, "reward": 0.4199391510337591, "reward_std": 0.5549349021166563, "rewards/reward_func": 0.4199391510337591, "step": 1168 }, { "completion_length": 169.859375, "epoch": 0.15736651947009234, "grad_norm": 3.765625, "kl": 0.002736452064709738, "learning_rate": 8.426334805299077e-07, "loss": 0.0001, "reward": 0.09132032562047243, "reward_std": 0.5800180211663246, "rewards/reward_func": 0.09132032562047243, "step": 1176 }, { "completion_length": 165.875, "epoch": 0.15843704001070522, "grad_norm": 2.9375, "kl": 0.0026032868336187676, "learning_rate": 8.415629599892947e-07, "loss": 0.0001, "reward": 0.39741448499262333, "reward_std": 0.553530789911747, "rewards/reward_func": 0.39741448499262333, "step": 1184 }, { "completion_length": 152.6796875, "epoch": 0.15950756055131807, "grad_norm": 4.1875, "kl": 0.0026048235449707136, "learning_rate": 8.404924394486819e-07, "loss": 0.0001, "reward": 0.526178702712059, "reward_std": 0.4392085522413254, "rewards/reward_func": 0.526178702712059, "step": 1192 }, { "completion_length": 180.640625, "epoch": 0.16057808109193095, "grad_norm": 4.46875, "kl": 0.0021871782082598656, "learning_rate": 8.39421918908069e-07, "loss": 0.0001, "reward": 0.18006896087899804, "reward_std": 0.6525123100727797, "rewards/reward_func": 0.18006896087899804, "step": 1200 }, { "completion_length": 153.7734375, "epoch": 0.16164860163254383, "grad_norm": 2.75, "kl": 0.0022942414943827316, "learning_rate": 8.383513983674562e-07, "loss": 0.0001, "reward": 0.37028289400041103, "reward_std": 0.49791209399700165, "rewards/reward_func": 0.37028289400041103, "step": 1208 }, { "completion_length": 172.2109375, "epoch": 0.1627191221731567, "grad_norm": 5.03125, "kl": 0.002093525734380819, "learning_rate": 8.372808778268433e-07, "loss": 0.0001, "reward": 0.13176708482205868, "reward_std": 0.6455358900129795, "rewards/reward_func": 0.13176708482205868, "step": 1216 }, { "completion_length": 169.609375, "epoch": 0.16378964271376958, "grad_norm": 4.75, "kl": 0.0022360333387041464, "learning_rate": 8.362103572862303e-07, "loss": 0.0001, "reward": 0.3072196710854769, "reward_std": 0.5846256157383323, "rewards/reward_func": 0.3072196710854769, "step": 1224 }, { "completion_length": 166.1484375, "epoch": 0.16486016325438244, "grad_norm": 3.890625, "kl": 0.002241482841782272, "learning_rate": 8.351398367456175e-07, "loss": 0.0001, "reward": 0.3591133989393711, "reward_std": 0.4736274667084217, "rewards/reward_func": 0.3591133989393711, "step": 1232 }, { "completion_length": 199.203125, "epoch": 0.16593068379499532, "grad_norm": 3.328125, "kl": 0.00244250099058263, "learning_rate": 8.340693162050047e-07, "loss": 0.0001, "reward": 0.006516195833683014, "reward_std": 0.6199562083929777, "rewards/reward_func": 0.006516195833683014, "step": 1240 }, { "completion_length": 203.1328125, "epoch": 0.1670012043356082, "grad_norm": 4.65625, "kl": 0.0022358261194312945, "learning_rate": 8.329987956643918e-07, "loss": 0.0001, "reward": 0.10224719159305096, "reward_std": 0.6809590011835098, "rewards/reward_func": 0.10224719159305096, "step": 1248 }, { "completion_length": 148.3125, "epoch": 0.16807172487622107, "grad_norm": 4.375, "kl": 0.0025668047892395407, "learning_rate": 8.319282751237789e-07, "loss": 0.0001, "reward": 0.49531039223074913, "reward_std": 0.4778098724782467, "rewards/reward_func": 0.49531039223074913, "step": 1256 }, { "completion_length": 153.2578125, "epoch": 0.16914224541683393, "grad_norm": 3.34375, "kl": 0.0024423423456028104, "learning_rate": 8.30857754583166e-07, "loss": 0.0001, "reward": 0.35373237170279026, "reward_std": 0.4996814336627722, "rewards/reward_func": 0.35373237170279026, "step": 1264 }, { "completion_length": 171.234375, "epoch": 0.1702127659574468, "grad_norm": 3.1875, "kl": 0.0021556682913796976, "learning_rate": 8.297872340425532e-07, "loss": 0.0001, "reward": 0.28696669451892376, "reward_std": 0.5421474725008011, "rewards/reward_func": 0.28696669451892376, "step": 1272 }, { "completion_length": 188.515625, "epoch": 0.17128328649805968, "grad_norm": 3.703125, "kl": 0.0021073912794236094, "learning_rate": 8.287167135019402e-07, "loss": 0.0001, "reward": 0.21668443083763123, "reward_std": 0.419855872169137, "rewards/reward_func": 0.21668443083763123, "step": 1280 }, { "completion_length": 163.34375, "epoch": 0.17235380703867256, "grad_norm": 4.125, "kl": 0.002413511203485541, "learning_rate": 8.276461929613274e-07, "loss": 0.0001, "reward": 0.3908206336200237, "reward_std": 0.5146115329116583, "rewards/reward_func": 0.3908206336200237, "step": 1288 }, { "completion_length": 195.21875, "epoch": 0.17342432757928541, "grad_norm": 2.953125, "kl": 0.002010537078604102, "learning_rate": 8.265756724207146e-07, "loss": 0.0001, "reward": 0.2040023533627391, "reward_std": 0.5783168002963066, "rewards/reward_func": 0.2040023533627391, "step": 1296 }, { "completion_length": 147.5546875, "epoch": 0.1744948481198983, "grad_norm": 3.515625, "kl": 0.0029850091959815472, "learning_rate": 8.255051518801016e-07, "loss": 0.0001, "reward": 0.4321533404290676, "reward_std": 0.3191776555031538, "rewards/reward_func": 0.4321533404290676, "step": 1304 }, { "completion_length": 174.9921875, "epoch": 0.17556536866051117, "grad_norm": 3.84375, "kl": 0.0024711176374694332, "learning_rate": 8.244346313394887e-07, "loss": 0.0001, "reward": 0.31474856473505497, "reward_std": 0.5751422699540854, "rewards/reward_func": 0.31474856473505497, "step": 1312 }, { "completion_length": 180.5703125, "epoch": 0.17663588920112405, "grad_norm": 3.515625, "kl": 0.002760413888609037, "learning_rate": 8.233641107988759e-07, "loss": 0.0001, "reward": 0.2904138704761863, "reward_std": 0.3093845183029771, "rewards/reward_func": 0.2904138704761863, "step": 1320 }, { "completion_length": 171.0703125, "epoch": 0.17770640974173693, "grad_norm": 4.125, "kl": 0.002526076335925609, "learning_rate": 8.222935902582631e-07, "loss": 0.0001, "reward": 0.3407979141920805, "reward_std": 0.6505857929587364, "rewards/reward_func": 0.3407979141920805, "step": 1328 }, { "completion_length": 194.09375, "epoch": 0.17877693028234978, "grad_norm": 3.140625, "kl": 0.002656547527294606, "learning_rate": 8.212230697176503e-07, "loss": 0.0001, "reward": 0.16334644611924887, "reward_std": 0.5975025221705437, "rewards/reward_func": 0.16334644611924887, "step": 1336 }, { "completion_length": 169.984375, "epoch": 0.17984745082296266, "grad_norm": 6.1875, "kl": 0.0022961402573855594, "learning_rate": 8.201525491770372e-07, "loss": 0.0001, "reward": 0.11132130306214094, "reward_std": 0.6224425416439772, "rewards/reward_func": 0.11132130306214094, "step": 1344 }, { "completion_length": 174.171875, "epoch": 0.18091797136357554, "grad_norm": 3.6875, "kl": 0.002571267934399657, "learning_rate": 8.190820286364244e-07, "loss": 0.0001, "reward": 0.3750305436551571, "reward_std": 0.6532670613378286, "rewards/reward_func": 0.3750305436551571, "step": 1352 }, { "completion_length": 175.390625, "epoch": 0.18198849190418842, "grad_norm": 4.09375, "kl": 0.0026113019848708063, "learning_rate": 8.180115080958116e-07, "loss": 0.0001, "reward": 0.23261917755007744, "reward_std": 0.5395109131932259, "rewards/reward_func": 0.23261917755007744, "step": 1360 }, { "completion_length": 218.90625, "epoch": 0.18305901244480127, "grad_norm": 3.0625, "kl": 0.002512662627850659, "learning_rate": 8.169409875551986e-07, "loss": 0.0001, "reward": -0.04414751287549734, "reward_std": 0.49756659008562565, "rewards/reward_func": -0.04414751287549734, "step": 1368 }, { "completion_length": 203.4453125, "epoch": 0.18412953298541415, "grad_norm": 2.84375, "kl": 0.0023657960555283353, "learning_rate": 8.158704670145858e-07, "loss": 0.0001, "reward": 0.1807372528128326, "reward_std": 0.6098357774317265, "rewards/reward_func": 0.1807372528128326, "step": 1376 }, { "completion_length": 148.9609375, "epoch": 0.18520005352602703, "grad_norm": 4.03125, "kl": 0.0027785369311459363, "learning_rate": 8.14799946473973e-07, "loss": 0.0001, "reward": 0.587528869509697, "reward_std": 0.4399991165846586, "rewards/reward_func": 0.587528869509697, "step": 1384 }, { "completion_length": 166.234375, "epoch": 0.1862705740666399, "grad_norm": 4.4375, "kl": 0.002863895075279288, "learning_rate": 8.137294259333601e-07, "loss": 0.0001, "reward": 0.2870405614376068, "reward_std": 0.5268293377012014, "rewards/reward_func": 0.2870405614376068, "step": 1392 }, { "completion_length": 164.875, "epoch": 0.1873410946072528, "grad_norm": 4.3125, "kl": 0.0027437864046078175, "learning_rate": 8.126589053927471e-07, "loss": 0.0001, "reward": 0.27717010863125324, "reward_std": 0.6858110204339027, "rewards/reward_func": 0.27717010863125324, "step": 1400 }, { "completion_length": 160.875, "epoch": 0.18841161514786564, "grad_norm": 3.375, "kl": 0.002744226367212832, "learning_rate": 8.115883848521343e-07, "loss": 0.0001, "reward": 0.3531609745696187, "reward_std": 0.41381734795868397, "rewards/reward_func": 0.3531609745696187, "step": 1408 }, { "completion_length": 202.7734375, "epoch": 0.18948213568847852, "grad_norm": 2.859375, "kl": 0.002227893375675194, "learning_rate": 8.105178643115215e-07, "loss": 0.0001, "reward": 0.05168680660426617, "reward_std": 0.5793404262512922, "rewards/reward_func": 0.05168680660426617, "step": 1416 }, { "completion_length": 190.796875, "epoch": 0.1905526562290914, "grad_norm": 2.765625, "kl": 0.002415237744571641, "learning_rate": 8.094473437709086e-07, "loss": 0.0001, "reward": -0.012714797630906105, "reward_std": 0.6679329574108124, "rewards/reward_func": -0.012714797630906105, "step": 1424 }, { "completion_length": 160.0, "epoch": 0.19162317676970428, "grad_norm": 3.140625, "kl": 0.0027612125559244305, "learning_rate": 8.083768232302956e-07, "loss": 0.0001, "reward": 0.5069613344967365, "reward_std": 0.5272765178233385, "rewards/reward_func": 0.5069613344967365, "step": 1432 }, { "completion_length": 177.1328125, "epoch": 0.19269369731031713, "grad_norm": 3.9375, "kl": 0.002587508424767293, "learning_rate": 8.073063026896828e-07, "loss": 0.0001, "reward": 0.07287294790148735, "reward_std": 0.3514406271278858, "rewards/reward_func": 0.07287294790148735, "step": 1440 }, { "completion_length": 138.4609375, "epoch": 0.19376421785093, "grad_norm": 3.71875, "kl": 0.002967173932120204, "learning_rate": 8.0623578214907e-07, "loss": 0.0001, "reward": 0.40755754709243774, "reward_std": 0.48442143853753805, "rewards/reward_func": 0.40755754709243774, "step": 1448 }, { "completion_length": 160.5234375, "epoch": 0.1948347383915429, "grad_norm": 3.78125, "kl": 0.0028059011965524405, "learning_rate": 8.051652616084571e-07, "loss": 0.0001, "reward": 0.3703090399503708, "reward_std": 0.4106726851314306, "rewards/reward_func": 0.3703090399503708, "step": 1456 }, { "completion_length": 171.15625, "epoch": 0.19590525893215577, "grad_norm": 3.25, "kl": 0.0026552542112767696, "learning_rate": 8.040947410678442e-07, "loss": 0.0001, "reward": 0.3305620066821575, "reward_std": 0.6117083020508289, "rewards/reward_func": 0.3305620066821575, "step": 1464 }, { "completion_length": 155.4296875, "epoch": 0.19697577947276865, "grad_norm": 4.15625, "kl": 0.003170755269820802, "learning_rate": 8.030242205272313e-07, "loss": 0.0001, "reward": 0.6180750611238182, "reward_std": 0.40046251006424427, "rewards/reward_func": 0.6180750611238182, "step": 1472 }, { "completion_length": 180.125, "epoch": 0.1980463000133815, "grad_norm": 5.40625, "kl": 0.002536381929530762, "learning_rate": 8.019536999866184e-07, "loss": 0.0001, "reward": 0.2231542430818081, "reward_std": 0.4958275035023689, "rewards/reward_func": 0.2231542430818081, "step": 1480 }, { "completion_length": 180.4375, "epoch": 0.19911682055399438, "grad_norm": 4.875, "kl": 0.0024219011975219473, "learning_rate": 8.008831794460056e-07, "loss": 0.0001, "reward": 0.1133259404450655, "reward_std": 0.48838030360639095, "rewards/reward_func": 0.1133259404450655, "step": 1488 }, { "completion_length": 147.6171875, "epoch": 0.20018734109460726, "grad_norm": 5.125, "kl": 0.0031812663073651493, "learning_rate": 7.998126589053927e-07, "loss": 0.0001, "reward": 0.4617150817066431, "reward_std": 0.32284008618444204, "rewards/reward_func": 0.4617150817066431, "step": 1496 }, { "completion_length": 167.2109375, "epoch": 0.20125786163522014, "grad_norm": 4.09375, "kl": 0.0028175316692795604, "learning_rate": 7.987421383647799e-07, "loss": 0.0001, "reward": 0.3847576631233096, "reward_std": 0.6411111112684011, "rewards/reward_func": 0.3847576631233096, "step": 1504 }, { "completion_length": 177.4453125, "epoch": 0.202328382175833, "grad_norm": 2.796875, "kl": 0.0026287745859008282, "learning_rate": 7.976716178241669e-07, "loss": 0.0001, "reward": 0.5649865288287401, "reward_std": 0.5727164149284363, "rewards/reward_func": 0.5649865288287401, "step": 1512 }, { "completion_length": 172.1875, "epoch": 0.20339890271644587, "grad_norm": 3.8125, "kl": 0.002526555268559605, "learning_rate": 7.966010972835541e-07, "loss": 0.0001, "reward": 0.014871623367071152, "reward_std": 0.7178319171071053, "rewards/reward_func": 0.014871623367071152, "step": 1520 }, { "completion_length": 190.1328125, "epoch": 0.20446942325705875, "grad_norm": 3.484375, "kl": 0.002442143566440791, "learning_rate": 7.955305767429412e-07, "loss": 0.0001, "reward": -0.07442041672766209, "reward_std": 0.4888562625274062, "rewards/reward_func": -0.07442041672766209, "step": 1528 }, { "completion_length": 207.6328125, "epoch": 0.20553994379767163, "grad_norm": 3.359375, "kl": 0.0033009210601449013, "learning_rate": 7.944600562023284e-07, "loss": 0.0001, "reward": -0.009969270788133144, "reward_std": 0.6862461306154728, "rewards/reward_func": -0.009969270788133144, "step": 1536 }, { "completion_length": 162.59375, "epoch": 0.20661046433828448, "grad_norm": 4.65625, "kl": 0.002739378687692806, "learning_rate": 7.933895356617155e-07, "loss": 0.0001, "reward": 0.18637081049382687, "reward_std": 0.5968187265098095, "rewards/reward_func": 0.18637081049382687, "step": 1544 }, { "completion_length": 177.9453125, "epoch": 0.20768098487889736, "grad_norm": 3.53125, "kl": 0.003128286494757049, "learning_rate": 7.923190151211026e-07, "loss": 0.0001, "reward": 0.22664616536349058, "reward_std": 0.6607938874512911, "rewards/reward_func": 0.22664616536349058, "step": 1552 }, { "completion_length": 172.1328125, "epoch": 0.20875150541951024, "grad_norm": 3.953125, "kl": 0.003004254394909367, "learning_rate": 7.912484945804897e-07, "loss": 0.0001, "reward": 0.2532934434711933, "reward_std": 0.45475378446280956, "rewards/reward_func": 0.2532934434711933, "step": 1560 }, { "completion_length": 187.9609375, "epoch": 0.20982202596012312, "grad_norm": 3.890625, "kl": 0.0027564516640268266, "learning_rate": 7.901779740398768e-07, "loss": 0.0001, "reward": -0.02841023448854685, "reward_std": 0.51457286067307, "rewards/reward_func": -0.02841023448854685, "step": 1568 }, { "completion_length": 167.59375, "epoch": 0.210892546500736, "grad_norm": 2.515625, "kl": 0.0030282980733318254, "learning_rate": 7.89107453499264e-07, "loss": 0.0001, "reward": 0.4444689229130745, "reward_std": 0.5939295422285795, "rewards/reward_func": 0.4444689229130745, "step": 1576 }, { "completion_length": 170.5078125, "epoch": 0.21196306704134885, "grad_norm": 4.28125, "kl": 0.0030873965588398278, "learning_rate": 7.880369329586512e-07, "loss": 0.0001, "reward": 0.380419734865427, "reward_std": 0.47338528744876385, "rewards/reward_func": 0.380419734865427, "step": 1584 }, { "completion_length": 211.0234375, "epoch": 0.21303358758196173, "grad_norm": 2.75, "kl": 0.002254050428746268, "learning_rate": 7.869664124180381e-07, "loss": 0.0001, "reward": 0.2378121637739241, "reward_std": 0.4477904764935374, "rewards/reward_func": 0.2378121637739241, "step": 1592 }, { "completion_length": 172.953125, "epoch": 0.2141041081225746, "grad_norm": 3.296875, "kl": 0.0025954252487281337, "learning_rate": 7.858958918774253e-07, "loss": 0.0001, "reward": 0.2678052484989166, "reward_std": 0.5658102091401815, "rewards/reward_func": 0.2678052484989166, "step": 1600 }, { "completion_length": 186.3125, "epoch": 0.21517462866318748, "grad_norm": 3.59375, "kl": 0.0029648117488250136, "learning_rate": 7.848253713368125e-07, "loss": 0.0001, "reward": -0.019576035905629396, "reward_std": 0.6440879367291927, "rewards/reward_func": -0.019576035905629396, "step": 1608 }, { "completion_length": 172.1015625, "epoch": 0.21624514920380034, "grad_norm": 5.34375, "kl": 0.0030590661335736513, "learning_rate": 7.837548507961997e-07, "loss": 0.0001, "reward": 0.14971440564841032, "reward_std": 0.7218026369810104, "rewards/reward_func": 0.14971440564841032, "step": 1616 }, { "completion_length": 194.1796875, "epoch": 0.21731566974441321, "grad_norm": 3.859375, "kl": 0.0026531801267992705, "learning_rate": 7.826843302555867e-07, "loss": 0.0001, "reward": 0.10876535065472126, "reward_std": 0.5477734114974737, "rewards/reward_func": 0.10876535065472126, "step": 1624 }, { "completion_length": 171.3828125, "epoch": 0.2183861902850261, "grad_norm": 3.15625, "kl": 0.0031873490661382675, "learning_rate": 7.816138097149738e-07, "loss": 0.0001, "reward": 0.45664553716778755, "reward_std": 0.41215432807803154, "rewards/reward_func": 0.45664553716778755, "step": 1632 }, { "completion_length": 180.3515625, "epoch": 0.21945671082563897, "grad_norm": 2.953125, "kl": 0.0028260272229090333, "learning_rate": 7.80543289174361e-07, "loss": 0.0001, "reward": 0.12724297121167183, "reward_std": 0.5571443336084485, "rewards/reward_func": 0.12724297121167183, "step": 1640 }, { "completion_length": 171.859375, "epoch": 0.22052723136625185, "grad_norm": 4.125, "kl": 0.003236800170270726, "learning_rate": 7.794727686337482e-07, "loss": 0.0001, "reward": 0.3013367038220167, "reward_std": 0.47213477827608585, "rewards/reward_func": 0.3013367038220167, "step": 1648 }, { "completion_length": 170.015625, "epoch": 0.2215977519068647, "grad_norm": 4.125, "kl": 0.0037978598556946963, "learning_rate": 7.784022480931352e-07, "loss": 0.0002, "reward": 0.15545153710991144, "reward_std": 0.4550722800195217, "rewards/reward_func": 0.15545153710991144, "step": 1656 }, { "completion_length": 169.140625, "epoch": 0.22266827244747758, "grad_norm": 5.0, "kl": 0.0032820345077198, "learning_rate": 7.773317275525224e-07, "loss": 0.0001, "reward": 0.4377444460988045, "reward_std": 0.3618372976779938, "rewards/reward_func": 0.4377444460988045, "step": 1664 }, { "completion_length": 157.703125, "epoch": 0.22373879298809046, "grad_norm": 3.90625, "kl": 0.0029510459426091984, "learning_rate": 7.762612070119096e-07, "loss": 0.0001, "reward": 0.3989291125908494, "reward_std": 0.6332539850845933, "rewards/reward_func": 0.3989291125908494, "step": 1672 }, { "completion_length": 162.046875, "epoch": 0.22480931352870334, "grad_norm": 6.59375, "kl": 0.0033083032321883366, "learning_rate": 7.751906864712966e-07, "loss": 0.0001, "reward": 0.209829643368721, "reward_std": 0.6311223246157169, "rewards/reward_func": 0.209829643368721, "step": 1680 }, { "completion_length": 170.15625, "epoch": 0.2258798340693162, "grad_norm": 3.796875, "kl": 0.0031797273695701733, "learning_rate": 7.741201659306837e-07, "loss": 0.0001, "reward": 0.3279075580649078, "reward_std": 0.5484324526041746, "rewards/reward_func": 0.3279075580649078, "step": 1688 }, { "completion_length": 162.9921875, "epoch": 0.22695035460992907, "grad_norm": 4.03125, "kl": 0.0034852683020289987, "learning_rate": 7.730496453900709e-07, "loss": 0.0001, "reward": 0.4082569610327482, "reward_std": 0.6405626218765974, "rewards/reward_func": 0.4082569610327482, "step": 1696 }, { "completion_length": 181.515625, "epoch": 0.22802087515054195, "grad_norm": 4.40625, "kl": 0.002649015310453251, "learning_rate": 7.719791248494581e-07, "loss": 0.0001, "reward": 0.369276593439281, "reward_std": 0.6541860643774271, "rewards/reward_func": 0.369276593439281, "step": 1704 }, { "completion_length": 178.90625, "epoch": 0.22909139569115483, "grad_norm": 4.09375, "kl": 0.003142255067359656, "learning_rate": 7.709086043088452e-07, "loss": 0.0001, "reward": -0.047080494463443756, "reward_std": 0.5225706771016121, "rewards/reward_func": -0.047080494463443756, "step": 1712 }, { "completion_length": 186.4375, "epoch": 0.2301619162317677, "grad_norm": 3.53125, "kl": 0.002581312000984326, "learning_rate": 7.698380837682322e-07, "loss": 0.0001, "reward": 0.38728183694183826, "reward_std": 0.5371669437736273, "rewards/reward_func": 0.38728183694183826, "step": 1720 }, { "completion_length": 186.484375, "epoch": 0.23123243677238056, "grad_norm": 4.59375, "kl": 0.0032444458920508623, "learning_rate": 7.687675632276194e-07, "loss": 0.0001, "reward": 0.04241009894758463, "reward_std": 0.571906641125679, "rewards/reward_func": 0.04241009894758463, "step": 1728 }, { "completion_length": 162.625, "epoch": 0.23230295731299344, "grad_norm": 3.3125, "kl": 0.003708757780259475, "learning_rate": 7.676970426870065e-07, "loss": 0.0001, "reward": 0.44313428178429604, "reward_std": 0.49149057269096375, "rewards/reward_func": 0.44313428178429604, "step": 1736 }, { "completion_length": 152.2578125, "epoch": 0.23337347785360632, "grad_norm": 6.78125, "kl": 0.004187669139355421, "learning_rate": 7.666265221463937e-07, "loss": 0.0002, "reward": 0.28122030571103096, "reward_std": 0.66860780864954, "rewards/reward_func": 0.28122030571103096, "step": 1744 }, { "completion_length": 171.765625, "epoch": 0.2344439983942192, "grad_norm": 4.3125, "kl": 0.0036364277184475213, "learning_rate": 7.655560016057808e-07, "loss": 0.0001, "reward": 0.19042097311466932, "reward_std": 0.6095598358660936, "rewards/reward_func": 0.19042097311466932, "step": 1752 }, { "completion_length": 157.0, "epoch": 0.23551451893483205, "grad_norm": 5.0, "kl": 0.003160024934913963, "learning_rate": 7.644854810651679e-07, "loss": 0.0001, "reward": 0.4550087433308363, "reward_std": 0.4861539136618376, "rewards/reward_func": 0.4550087433308363, "step": 1760 }, { "completion_length": 166.6328125, "epoch": 0.23658503947544493, "grad_norm": 4.375, "kl": 0.003758498263778165, "learning_rate": 7.63414960524555e-07, "loss": 0.0002, "reward": 0.3313362691551447, "reward_std": 0.6437762156128883, "rewards/reward_func": 0.3313362691551447, "step": 1768 }, { "completion_length": 151.0859375, "epoch": 0.2376555600160578, "grad_norm": 4.5, "kl": 0.0032230821962002665, "learning_rate": 7.623444399839421e-07, "loss": 0.0001, "reward": 0.22410259768366814, "reward_std": 0.5125870034098625, "rewards/reward_func": 0.22410259768366814, "step": 1776 }, { "completion_length": 168.3984375, "epoch": 0.2387260805566707, "grad_norm": 8.0625, "kl": 0.0032083308324217796, "learning_rate": 7.612739194433293e-07, "loss": 0.0001, "reward": 0.33182543236762285, "reward_std": 0.6010603215545416, "rewards/reward_func": 0.33182543236762285, "step": 1784 }, { "completion_length": 207.2265625, "epoch": 0.23979660109728354, "grad_norm": 3.34375, "kl": 0.002648265421157703, "learning_rate": 7.602033989027165e-07, "loss": 0.0001, "reward": 0.03860955499112606, "reward_std": 0.5622509643435478, "rewards/reward_func": 0.03860955499112606, "step": 1792 }, { "completion_length": 176.15625, "epoch": 0.24086712163789642, "grad_norm": 4.8125, "kl": 0.003711075463797897, "learning_rate": 7.591328783621035e-07, "loss": 0.0001, "reward": 0.30269888415932655, "reward_std": 0.5988016724586487, "rewards/reward_func": 0.30269888415932655, "step": 1800 }, { "completion_length": 185.546875, "epoch": 0.2419376421785093, "grad_norm": 2.734375, "kl": 0.0029471274465322495, "learning_rate": 7.580623578214906e-07, "loss": 0.0001, "reward": 0.29960334673523903, "reward_std": 0.5849093981087208, "rewards/reward_func": 0.29960334673523903, "step": 1808 }, { "completion_length": 167.2109375, "epoch": 0.24300816271912218, "grad_norm": 4.96875, "kl": 0.003220759332180023, "learning_rate": 7.569918372808778e-07, "loss": 0.0001, "reward": 0.1449232380837202, "reward_std": 0.6299447380006313, "rewards/reward_func": 0.1449232380837202, "step": 1816 }, { "completion_length": 171.4375, "epoch": 0.24407868325973506, "grad_norm": 3.46875, "kl": 0.003584496444091201, "learning_rate": 7.559213167402649e-07, "loss": 0.0001, "reward": 0.1334919836372137, "reward_std": 0.5379905849695206, "rewards/reward_func": 0.1334919836372137, "step": 1824 }, { "completion_length": 176.0703125, "epoch": 0.2451492038003479, "grad_norm": 3.734375, "kl": 0.0033595635613892227, "learning_rate": 7.548507961996521e-07, "loss": 0.0001, "reward": 0.2927993945777416, "reward_std": 0.48013297095894814, "rewards/reward_func": 0.2927993945777416, "step": 1832 }, { "completion_length": 156.171875, "epoch": 0.2462197243409608, "grad_norm": 3.953125, "kl": 0.0032605840533506125, "learning_rate": 7.537802756590391e-07, "loss": 0.0001, "reward": 0.4554846081882715, "reward_std": 0.46569772996008396, "rewards/reward_func": 0.4554846081882715, "step": 1840 }, { "completion_length": 154.8515625, "epoch": 0.24729024488157367, "grad_norm": 5.09375, "kl": 0.0038292294193524867, "learning_rate": 7.527097551184263e-07, "loss": 0.0002, "reward": 0.2707878933288157, "reward_std": 0.6204773802310228, "rewards/reward_func": 0.2707878933288157, "step": 1848 }, { "completion_length": 173.046875, "epoch": 0.24836076542218655, "grad_norm": 3.671875, "kl": 0.003133324411464855, "learning_rate": 7.516392345778134e-07, "loss": 0.0001, "reward": 0.16592059656977654, "reward_std": 0.5527506861835718, "rewards/reward_func": 0.16592059656977654, "step": 1856 }, { "completion_length": 167.9375, "epoch": 0.2494312859627994, "grad_norm": 4.125, "kl": 0.0033082127920351923, "learning_rate": 7.505687140372006e-07, "loss": 0.0001, "reward": 0.20683408807963133, "reward_std": 0.5755761060863733, "rewards/reward_func": 0.20683408807963133, "step": 1864 }, { "completion_length": 184.7890625, "epoch": 0.2505018065034123, "grad_norm": 4.125, "kl": 0.0028639852243941277, "learning_rate": 7.494981934965877e-07, "loss": 0.0001, "reward": 0.22647499293088913, "reward_std": 0.5401746807619929, "rewards/reward_func": 0.22647499293088913, "step": 1872 }, { "completion_length": 177.9921875, "epoch": 0.25157232704402516, "grad_norm": 4.75, "kl": 0.0032885581313166767, "learning_rate": 7.484276729559747e-07, "loss": 0.0001, "reward": 0.19369017332792282, "reward_std": 0.7017679810523987, "rewards/reward_func": 0.19369017332792282, "step": 1880 }, { "completion_length": 165.171875, "epoch": 0.252642847584638, "grad_norm": 4.125, "kl": 0.0038394963194150478, "learning_rate": 7.473571524153619e-07, "loss": 0.0002, "reward": 0.4050522642210126, "reward_std": 0.49607561621814966, "rewards/reward_func": 0.4050522642210126, "step": 1888 }, { "completion_length": 212.78125, "epoch": 0.2537133681252509, "grad_norm": 4.625, "kl": 0.003014246642123908, "learning_rate": 7.462866318747491e-07, "loss": 0.0001, "reward": -0.03455093875527382, "reward_std": 0.5755152553319931, "rewards/reward_func": -0.03455093875527382, "step": 1896 }, { "completion_length": 154.8125, "epoch": 0.25478388866586377, "grad_norm": 5.5, "kl": 0.0037700315297115594, "learning_rate": 7.452161113341362e-07, "loss": 0.0002, "reward": 0.5173049904406071, "reward_std": 0.5908821411430836, "rewards/reward_func": 0.5173049904406071, "step": 1904 }, { "completion_length": 175.0703125, "epoch": 0.2558544092064767, "grad_norm": 3.734375, "kl": 0.003177426100592129, "learning_rate": 7.441455907935233e-07, "loss": 0.0001, "reward": 0.3084242893382907, "reward_std": 0.5546926856040955, "rewards/reward_func": 0.3084242893382907, "step": 1912 }, { "completion_length": 153.734375, "epoch": 0.2569249297470895, "grad_norm": 4.65625, "kl": 0.0028014232811983675, "learning_rate": 7.430750702529105e-07, "loss": 0.0001, "reward": 0.4781934395432472, "reward_std": 0.3370926305651665, "rewards/reward_func": 0.4781934395432472, "step": 1920 }, { "completion_length": 174.8203125, "epoch": 0.2579954502877024, "grad_norm": 3.703125, "kl": 0.0027142605104018003, "learning_rate": 7.420045497122976e-07, "loss": 0.0001, "reward": 0.34992816112935543, "reward_std": 0.5419151671230793, "rewards/reward_func": 0.34992816112935543, "step": 1928 }, { "completion_length": 164.3671875, "epoch": 0.2590659708283153, "grad_norm": 4.53125, "kl": 0.003608020633691922, "learning_rate": 7.409340291716846e-07, "loss": 0.0001, "reward": 0.23989262245595455, "reward_std": 0.6161230951547623, "rewards/reward_func": 0.23989262245595455, "step": 1936 }, { "completion_length": 176.71875, "epoch": 0.26013649136892814, "grad_norm": 3.46875, "kl": 0.0028162887319922447, "learning_rate": 7.398635086310718e-07, "loss": 0.0001, "reward": 0.2081197015941143, "reward_std": 0.6199641041457653, "rewards/reward_func": 0.2081197015941143, "step": 1944 }, { "completion_length": 188.7890625, "epoch": 0.261207011909541, "grad_norm": 3.9375, "kl": 0.0030088693019934, "learning_rate": 7.38792988090459e-07, "loss": 0.0001, "reward": 0.43774592503905296, "reward_std": 0.5932074896991253, "rewards/reward_func": 0.43774592503905296, "step": 1952 }, { "completion_length": 183.203125, "epoch": 0.2622775324501539, "grad_norm": 2.671875, "kl": 0.0032375668233726174, "learning_rate": 7.377224675498462e-07, "loss": 0.0001, "reward": 0.14370151609182358, "reward_std": 0.6233563013374805, "rewards/reward_func": 0.14370151609182358, "step": 1960 }, { "completion_length": 188.3984375, "epoch": 0.26334805299076675, "grad_norm": 3.640625, "kl": 0.003420975699555129, "learning_rate": 7.366519470092331e-07, "loss": 0.0001, "reward": 0.3819169942289591, "reward_std": 0.4839291740208864, "rewards/reward_func": 0.3819169942289591, "step": 1968 }, { "completion_length": 168.9140625, "epoch": 0.26441857353137965, "grad_norm": 3.84375, "kl": 0.0034208787546958774, "learning_rate": 7.355814264686203e-07, "loss": 0.0001, "reward": 0.17081641219556332, "reward_std": 0.6360666044056416, "rewards/reward_func": 0.17081641219556332, "step": 1976 }, { "completion_length": 147.734375, "epoch": 0.2654890940719925, "grad_norm": 3.890625, "kl": 0.0038502227107528597, "learning_rate": 7.345109059280075e-07, "loss": 0.0002, "reward": 0.5051426645368338, "reward_std": 0.539917191490531, "rewards/reward_func": 0.5051426645368338, "step": 1984 }, { "completion_length": 144.2421875, "epoch": 0.26655961461260536, "grad_norm": 4.40625, "kl": 0.004192993917968124, "learning_rate": 7.334403853873946e-07, "loss": 0.0002, "reward": 0.5341411675326526, "reward_std": 0.4230798315256834, "rewards/reward_func": 0.5341411675326526, "step": 1992 }, { "completion_length": 147.1953125, "epoch": 0.26763013515321826, "grad_norm": 4.65625, "kl": 0.00358515654806979, "learning_rate": 7.323698648467817e-07, "loss": 0.0001, "reward": 0.33829054702073336, "reward_std": 0.48450249992311, "rewards/reward_func": 0.33829054702073336, "step": 2000 }, { "completion_length": 166.828125, "epoch": 0.2687006556938311, "grad_norm": 2.21875, "kl": 0.003072334686294198, "learning_rate": 7.312993443061688e-07, "loss": 0.0001, "reward": 0.358464740216732, "reward_std": 0.5359793957322836, "rewards/reward_func": 0.358464740216732, "step": 2008 }, { "completion_length": 187.921875, "epoch": 0.269771176234444, "grad_norm": 2.984375, "kl": 0.003413002035813406, "learning_rate": 7.30228823765556e-07, "loss": 0.0001, "reward": 0.12693564407527447, "reward_std": 0.5908421259373426, "rewards/reward_func": 0.12693564407527447, "step": 2016 }, { "completion_length": 162.4453125, "epoch": 0.2708416967750569, "grad_norm": 3.859375, "kl": 0.0031307056196965277, "learning_rate": 7.291583032249431e-07, "loss": 0.0001, "reward": 0.38590772822499275, "reward_std": 0.48073394782841206, "rewards/reward_func": 0.38590772822499275, "step": 2024 }, { "completion_length": 192.0234375, "epoch": 0.2719122173156697, "grad_norm": 3.546875, "kl": 0.0032598864345345646, "learning_rate": 7.280877826843302e-07, "loss": 0.0001, "reward": 0.18051442131400108, "reward_std": 0.6438321061432362, "rewards/reward_func": 0.18051442131400108, "step": 2032 }, { "completion_length": 148.3984375, "epoch": 0.27298273785628263, "grad_norm": 4.09375, "kl": 0.0036882674612570554, "learning_rate": 7.270172621437174e-07, "loss": 0.0001, "reward": 0.4433805178850889, "reward_std": 0.615590687841177, "rewards/reward_func": 0.4433805178850889, "step": 2040 }, { "completion_length": 161.9375, "epoch": 0.2740532583968955, "grad_norm": 3.828125, "kl": 0.00417991288122721, "learning_rate": 7.259467416031044e-07, "loss": 0.0002, "reward": 0.21279390715062618, "reward_std": 0.5614198036491871, "rewards/reward_func": 0.21279390715062618, "step": 2048 }, { "completion_length": 157.0234375, "epoch": 0.2751237789375084, "grad_norm": 3.359375, "kl": 0.0038566369330510497, "learning_rate": 7.248762210624916e-07, "loss": 0.0002, "reward": 0.539018552750349, "reward_std": 0.5531130824238062, "rewards/reward_func": 0.539018552750349, "step": 2056 }, { "completion_length": 188.140625, "epoch": 0.27619429947812124, "grad_norm": 4.0, "kl": 0.0034979561460204422, "learning_rate": 7.238057005218787e-07, "loss": 0.0001, "reward": 0.11634453199803829, "reward_std": 0.599401269108057, "rewards/reward_func": 0.11634453199803829, "step": 2064 }, { "completion_length": 180.5625, "epoch": 0.2772648200187341, "grad_norm": 4.75, "kl": 0.0037427434872370213, "learning_rate": 7.227351799812659e-07, "loss": 0.0001, "reward": 0.2462961538694799, "reward_std": 0.5912914611399174, "rewards/reward_func": 0.2462961538694799, "step": 2072 }, { "completion_length": 169.1484375, "epoch": 0.278335340559347, "grad_norm": 3.859375, "kl": 0.003938340552849695, "learning_rate": 7.21664659440653e-07, "loss": 0.0002, "reward": 0.21049488708376884, "reward_std": 0.513383561745286, "rewards/reward_func": 0.21049488708376884, "step": 2080 }, { "completion_length": 163.03125, "epoch": 0.27940586109995985, "grad_norm": 3.90625, "kl": 0.003593464905861765, "learning_rate": 7.205941389000401e-07, "loss": 0.0001, "reward": 0.42887144535779953, "reward_std": 0.5823842044919729, "rewards/reward_func": 0.42887144535779953, "step": 2088 }, { "completion_length": 154.2421875, "epoch": 0.2804763816405727, "grad_norm": 5.71875, "kl": 0.004313324898248538, "learning_rate": 7.195236183594272e-07, "loss": 0.0002, "reward": 0.48943471536040306, "reward_std": 0.5817722771316767, "rewards/reward_func": 0.48943471536040306, "step": 2096 }, { "completion_length": 153.3984375, "epoch": 0.2815469021811856, "grad_norm": 4.875, "kl": 0.003974846331402659, "learning_rate": 7.184530978188144e-07, "loss": 0.0002, "reward": 0.4710603700950742, "reward_std": 0.47509198915213346, "rewards/reward_func": 0.4710603700950742, "step": 2104 }, { "completion_length": 152.15625, "epoch": 0.28261742272179846, "grad_norm": 3.390625, "kl": 0.003992282261606306, "learning_rate": 7.173825772782015e-07, "loss": 0.0002, "reward": 0.27247738372534513, "reward_std": 0.5785027798265219, "rewards/reward_func": 0.27247738372534513, "step": 2112 }, { "completion_length": 187.140625, "epoch": 0.28368794326241137, "grad_norm": 5.46875, "kl": 0.003529240610077977, "learning_rate": 7.163120567375887e-07, "loss": 0.0001, "reward": -0.031075291335582733, "reward_std": 0.5039861313998699, "rewards/reward_func": -0.031075291335582733, "step": 2120 }, { "completion_length": 179.78125, "epoch": 0.2847584638030242, "grad_norm": 5.0, "kl": 0.0030008777684997767, "learning_rate": 7.152415361969757e-07, "loss": 0.0001, "reward": 0.11691426858305931, "reward_std": 0.5304726148024201, "rewards/reward_func": 0.11691426858305931, "step": 2128 }, { "completion_length": 195.9765625, "epoch": 0.28582898434363707, "grad_norm": 3.4375, "kl": 0.0033693104924168438, "learning_rate": 7.141710156563628e-07, "loss": 0.0001, "reward": 0.1879887394607067, "reward_std": 0.5351051315665245, "rewards/reward_func": 0.1879887394607067, "step": 2136 }, { "completion_length": 184.78125, "epoch": 0.28689950488425, "grad_norm": 3.359375, "kl": 0.003082483890466392, "learning_rate": 7.1310049511575e-07, "loss": 0.0001, "reward": 0.16520871315151453, "reward_std": 0.3897149385884404, "rewards/reward_func": 0.16520871315151453, "step": 2144 }, { "completion_length": 161.625, "epoch": 0.28797002542486283, "grad_norm": 3.28125, "kl": 0.0033815766510087997, "learning_rate": 7.120299745751372e-07, "loss": 0.0001, "reward": 0.2989609017968178, "reward_std": 0.6460195314139128, "rewards/reward_func": 0.2989609017968178, "step": 2152 }, { "completion_length": 158.734375, "epoch": 0.28904054596547574, "grad_norm": 3.65625, "kl": 0.003546297753928229, "learning_rate": 7.109594540345243e-07, "loss": 0.0001, "reward": 0.18108150828629732, "reward_std": 0.5971081778407097, "rewards/reward_func": 0.18108150828629732, "step": 2160 }, { "completion_length": 162.09375, "epoch": 0.2901110665060886, "grad_norm": 4.1875, "kl": 0.003662400442408398, "learning_rate": 7.098889334939114e-07, "loss": 0.0001, "reward": 0.18057992309331894, "reward_std": 0.5010180473327637, "rewards/reward_func": 0.18057992309331894, "step": 2168 }, { "completion_length": 168.703125, "epoch": 0.29118158704670144, "grad_norm": 3.5625, "kl": 0.0036488809855654836, "learning_rate": 7.088184129532985e-07, "loss": 0.0001, "reward": 0.3672813940793276, "reward_std": 0.5888884011656046, "rewards/reward_func": 0.3672813940793276, "step": 2176 }, { "completion_length": 160.6015625, "epoch": 0.29225210758731435, "grad_norm": 4.03125, "kl": 0.0035606406745500863, "learning_rate": 7.077478924126857e-07, "loss": 0.0001, "reward": 0.34612463414669037, "reward_std": 0.5678670313209295, "rewards/reward_func": 0.34612463414669037, "step": 2184 }, { "completion_length": 186.1640625, "epoch": 0.2933226281279272, "grad_norm": 3.546875, "kl": 0.0036609756061807275, "learning_rate": 7.066773718720727e-07, "loss": 0.0001, "reward": 0.12424429133534431, "reward_std": 0.5589212942868471, "rewards/reward_func": 0.12424429133534431, "step": 2192 }, { "completion_length": 158.578125, "epoch": 0.29439314866854005, "grad_norm": 4.09375, "kl": 0.003958662680815905, "learning_rate": 7.056068513314599e-07, "loss": 0.0002, "reward": 0.2399831861257553, "reward_std": 0.6095044985413551, "rewards/reward_func": 0.2399831861257553, "step": 2200 }, { "completion_length": 150.6953125, "epoch": 0.29546366920915296, "grad_norm": 3.109375, "kl": 0.004275305866030976, "learning_rate": 7.045363307908471e-07, "loss": 0.0002, "reward": 0.42568245250731707, "reward_std": 0.5035090297460556, "rewards/reward_func": 0.42568245250731707, "step": 2208 }, { "completion_length": 198.25, "epoch": 0.2965341897497658, "grad_norm": 4.75, "kl": 0.0037761297717224807, "learning_rate": 7.034658102502341e-07, "loss": 0.0002, "reward": 0.20382929779589176, "reward_std": 0.6004747971892357, "rewards/reward_func": 0.20382929779589176, "step": 2216 }, { "completion_length": 166.953125, "epoch": 0.2976047102903787, "grad_norm": 5.59375, "kl": 0.004744472214952111, "learning_rate": 7.023952897096212e-07, "loss": 0.0002, "reward": 0.12384441681206226, "reward_std": 0.5781398452818394, "rewards/reward_func": 0.12384441681206226, "step": 2224 }, { "completion_length": 151.3125, "epoch": 0.29867523083099157, "grad_norm": 6.46875, "kl": 0.004519026639172807, "learning_rate": 7.013247691690084e-07, "loss": 0.0002, "reward": 0.36841049790382385, "reward_std": 0.3033979944884777, "rewards/reward_func": 0.36841049790382385, "step": 2232 }, { "completion_length": 166.9375, "epoch": 0.2997457513716044, "grad_norm": 4.03125, "kl": 0.003987667616456747, "learning_rate": 7.002542486283956e-07, "loss": 0.0002, "reward": -0.1171187162399292, "reward_std": 0.42854547686874866, "rewards/reward_func": -0.1171187162399292, "step": 2240 }, { "completion_length": 183.328125, "epoch": 0.3008162719122173, "grad_norm": 3.515625, "kl": 0.0033717694896040484, "learning_rate": 6.991837280877828e-07, "loss": 0.0001, "reward": 0.22414767649024725, "reward_std": 0.6165672689676285, "rewards/reward_func": 0.22414767649024725, "step": 2248 }, { "completion_length": 193.2421875, "epoch": 0.3018867924528302, "grad_norm": 3.140625, "kl": 0.0031051966943778098, "learning_rate": 6.981132075471697e-07, "loss": 0.0001, "reward": 0.3812776654958725, "reward_std": 0.5649162493646145, "rewards/reward_func": 0.3812776654958725, "step": 2256 }, { "completion_length": 183.9765625, "epoch": 0.3029573129934431, "grad_norm": 4.4375, "kl": 0.003768587455851957, "learning_rate": 6.970426870065569e-07, "loss": 0.0002, "reward": 0.12498392723500729, "reward_std": 0.4842628054320812, "rewards/reward_func": 0.12498392723500729, "step": 2264 }, { "completion_length": 200.5390625, "epoch": 0.30402783353405594, "grad_norm": 4.3125, "kl": 0.0034181236114818603, "learning_rate": 6.959721664659441e-07, "loss": 0.0001, "reward": -0.04165226221084595, "reward_std": 0.5646636541932821, "rewards/reward_func": -0.04165226221084595, "step": 2272 }, { "completion_length": 153.8125, "epoch": 0.3050983540746688, "grad_norm": 3.5625, "kl": 0.0040518031746614724, "learning_rate": 6.949016459253311e-07, "loss": 0.0002, "reward": 0.35562361404299736, "reward_std": 0.44176073744893074, "rewards/reward_func": 0.35562361404299736, "step": 2280 }, { "completion_length": 179.8046875, "epoch": 0.3061688746152817, "grad_norm": 4.53125, "kl": 0.0034489443933125585, "learning_rate": 6.938311253847183e-07, "loss": 0.0001, "reward": 0.30737813375890255, "reward_std": 0.5192515105009079, "rewards/reward_func": 0.30737813375890255, "step": 2288 }, { "completion_length": 175.1015625, "epoch": 0.30723939515589455, "grad_norm": 4.4375, "kl": 0.003332895546918735, "learning_rate": 6.927606048441054e-07, "loss": 0.0001, "reward": 0.16119882743805647, "reward_std": 0.6122260540723801, "rewards/reward_func": 0.16119882743805647, "step": 2296 }, { "completion_length": 166.7421875, "epoch": 0.30830991569650745, "grad_norm": 2.65625, "kl": 0.0034852146345656365, "learning_rate": 6.916900843034925e-07, "loss": 0.0001, "reward": 0.22000440582633018, "reward_std": 0.5837470442056656, "rewards/reward_func": 0.22000440582633018, "step": 2304 }, { "completion_length": 146.2421875, "epoch": 0.3093804362371203, "grad_norm": 3.59375, "kl": 0.003727599134435877, "learning_rate": 6.906195637628796e-07, "loss": 0.0001, "reward": 0.18992659822106361, "reward_std": 0.5706657655537128, "rewards/reward_func": 0.18992659822106361, "step": 2312 }, { "completion_length": 152.09375, "epoch": 0.31045095677773316, "grad_norm": 3.03125, "kl": 0.004237443121382967, "learning_rate": 6.895490432222668e-07, "loss": 0.0002, "reward": 0.5161734204739332, "reward_std": 0.5621990244835615, "rewards/reward_func": 0.5161734204739332, "step": 2320 }, { "completion_length": 139.578125, "epoch": 0.31152147731834606, "grad_norm": 3.546875, "kl": 0.0043363839504309, "learning_rate": 6.88478522681654e-07, "loss": 0.0002, "reward": 0.3602239452302456, "reward_std": 0.6682011783123016, "rewards/reward_func": 0.3602239452302456, "step": 2328 }, { "completion_length": 159.1171875, "epoch": 0.3125919978589589, "grad_norm": 3.296875, "kl": 0.005018363182898611, "learning_rate": 6.87408002141041e-07, "loss": 0.0002, "reward": 0.18990419153124094, "reward_std": 0.38154869619756937, "rewards/reward_func": 0.18990419153124094, "step": 2336 }, { "completion_length": 186.171875, "epoch": 0.31366251839957177, "grad_norm": 3.75, "kl": 0.0034995676251128316, "learning_rate": 6.863374816004281e-07, "loss": 0.0001, "reward": 0.28119928389787674, "reward_std": 0.6371741183102131, "rewards/reward_func": 0.28119928389787674, "step": 2344 }, { "completion_length": 144.5390625, "epoch": 0.3147330389401847, "grad_norm": 3.109375, "kl": 0.003701402310980484, "learning_rate": 6.852669610598153e-07, "loss": 0.0001, "reward": 0.2914201710373163, "reward_std": 0.5679418547078967, "rewards/reward_func": 0.2914201710373163, "step": 2352 }, { "completion_length": 158.453125, "epoch": 0.3158035594807975, "grad_norm": 4.59375, "kl": 0.003569768596207723, "learning_rate": 6.841964405192025e-07, "loss": 0.0001, "reward": 0.38121682219207287, "reward_std": 0.5718358978629112, "rewards/reward_func": 0.38121682219207287, "step": 2360 }, { "completion_length": 172.234375, "epoch": 0.31687408002141043, "grad_norm": 4.09375, "kl": 0.003890137653797865, "learning_rate": 6.831259199785896e-07, "loss": 0.0002, "reward": 0.19206082820892334, "reward_std": 0.5519562661647797, "rewards/reward_func": 0.19206082820892334, "step": 2368 }, { "completion_length": 136.1171875, "epoch": 0.3179446005620233, "grad_norm": 3.953125, "kl": 0.004021885659312829, "learning_rate": 6.820553994379766e-07, "loss": 0.0002, "reward": 0.43440112797543406, "reward_std": 0.5649959053844213, "rewards/reward_func": 0.43440112797543406, "step": 2376 }, { "completion_length": 189.59375, "epoch": 0.31901512110263613, "grad_norm": 7.4375, "kl": 0.0037745212903246284, "learning_rate": 6.809848788973638e-07, "loss": 0.0002, "reward": 0.08486939128488302, "reward_std": 0.5615943241864443, "rewards/reward_func": 0.08486939128488302, "step": 2384 }, { "completion_length": 145.40625, "epoch": 0.32008564164324904, "grad_norm": 6.15625, "kl": 0.004177739087026566, "learning_rate": 6.799143583567509e-07, "loss": 0.0002, "reward": 0.03109552478417754, "reward_std": 0.6218379884958267, "rewards/reward_func": 0.03109552478417754, "step": 2392 }, { "completion_length": 160.3203125, "epoch": 0.3211561621838619, "grad_norm": 4.71875, "kl": 0.004120006924495101, "learning_rate": 6.788438378161381e-07, "loss": 0.0002, "reward": 0.33427711576223373, "reward_std": 0.5099399294704199, "rewards/reward_func": 0.33427711576223373, "step": 2400 }, { "completion_length": 165.9765625, "epoch": 0.3222266827244748, "grad_norm": 3.859375, "kl": 0.0034675312926992774, "learning_rate": 6.777733172755252e-07, "loss": 0.0001, "reward": 0.4284206023439765, "reward_std": 0.5410223100334406, "rewards/reward_func": 0.4284206023439765, "step": 2408 }, { "completion_length": 203.5234375, "epoch": 0.32329720326508765, "grad_norm": 2.828125, "kl": 0.003464344044914469, "learning_rate": 6.767027967349124e-07, "loss": 0.0001, "reward": 0.32477592676877975, "reward_std": 0.5011547729372978, "rewards/reward_func": 0.32477592676877975, "step": 2416 }, { "completion_length": 150.3984375, "epoch": 0.3243677238057005, "grad_norm": 3.296875, "kl": 0.003587738669011742, "learning_rate": 6.756322761942994e-07, "loss": 0.0001, "reward": 0.44885979406535625, "reward_std": 0.5460297726094723, "rewards/reward_func": 0.44885979406535625, "step": 2424 }, { "completion_length": 169.2890625, "epoch": 0.3254382443463134, "grad_norm": 3.296875, "kl": 0.003916321613360196, "learning_rate": 6.745617556536866e-07, "loss": 0.0002, "reward": 0.12248068256303668, "reward_std": 0.5732488930225372, "rewards/reward_func": 0.12248068256303668, "step": 2432 }, { "completion_length": 199.9609375, "epoch": 0.32650876488692626, "grad_norm": 3.5625, "kl": 0.0033171565155498683, "learning_rate": 6.734912351130737e-07, "loss": 0.0001, "reward": 0.2281382903456688, "reward_std": 0.5701967515051365, "rewards/reward_func": 0.2281382903456688, "step": 2440 }, { "completion_length": 191.703125, "epoch": 0.32757928542753917, "grad_norm": 4.4375, "kl": 0.0034852577664423734, "learning_rate": 6.724207145724608e-07, "loss": 0.0001, "reward": 0.21391855087131262, "reward_std": 0.6829859614372253, "rewards/reward_func": 0.21391855087131262, "step": 2448 }, { "completion_length": 186.296875, "epoch": 0.328649805968152, "grad_norm": 3.765625, "kl": 0.004041536885779351, "learning_rate": 6.71350194031848e-07, "loss": 0.0002, "reward": 0.16749184112995863, "reward_std": 0.5975307431071997, "rewards/reward_func": 0.16749184112995863, "step": 2456 }, { "completion_length": 169.34375, "epoch": 0.32972032650876487, "grad_norm": 3.890625, "kl": 0.003387822740478441, "learning_rate": 6.702796734912351e-07, "loss": 0.0001, "reward": 0.4501562397927046, "reward_std": 0.4912100899964571, "rewards/reward_func": 0.4501562397927046, "step": 2464 }, { "completion_length": 147.0, "epoch": 0.3307908470493778, "grad_norm": 3.125, "kl": 0.0038211781647987664, "learning_rate": 6.692091529506222e-07, "loss": 0.0002, "reward": 0.10390966571867466, "reward_std": 0.4674555938690901, "rewards/reward_func": 0.10390966571867466, "step": 2472 }, { "completion_length": 165.9375, "epoch": 0.33186136758999063, "grad_norm": 5.96875, "kl": 0.0037551842688117176, "learning_rate": 6.681386324100093e-07, "loss": 0.0002, "reward": 0.3239047722890973, "reward_std": 0.544673465192318, "rewards/reward_func": 0.3239047722890973, "step": 2480 }, { "completion_length": 170.4375, "epoch": 0.3329318881306035, "grad_norm": 3.015625, "kl": 0.004028416806249879, "learning_rate": 6.670681118693965e-07, "loss": 0.0002, "reward": 0.16679776646196842, "reward_std": 0.4815365634858608, "rewards/reward_func": 0.16679776646196842, "step": 2488 }, { "completion_length": 162.2265625, "epoch": 0.3340024086712164, "grad_norm": 3.734375, "kl": 0.003879312367644161, "learning_rate": 6.659975913287837e-07, "loss": 0.0002, "reward": 0.4071632297709584, "reward_std": 0.5334546230733395, "rewards/reward_func": 0.4071632297709584, "step": 2496 }, { "completion_length": 154.6953125, "epoch": 0.33507292921182924, "grad_norm": 3.84375, "kl": 0.0042274416191503406, "learning_rate": 6.649270707881706e-07, "loss": 0.0002, "reward": 0.17819023504853249, "reward_std": 0.49565806053578854, "rewards/reward_func": 0.17819023504853249, "step": 2504 }, { "completion_length": 192.609375, "epoch": 0.33614344975244215, "grad_norm": 4.0625, "kl": 0.0035822324571199715, "learning_rate": 6.638565502475578e-07, "loss": 0.0001, "reward": -0.004483510740101337, "reward_std": 0.443414025940001, "rewards/reward_func": -0.004483510740101337, "step": 2512 }, { "completion_length": 171.921875, "epoch": 0.337213970293055, "grad_norm": 4.65625, "kl": 0.004149035812588409, "learning_rate": 6.62786029706945e-07, "loss": 0.0002, "reward": 0.08967352751642466, "reward_std": 0.5806238334625959, "rewards/reward_func": 0.08967352751642466, "step": 2520 }, { "completion_length": 153.5703125, "epoch": 0.33828449083366785, "grad_norm": 4.375, "kl": 0.004091008595423773, "learning_rate": 6.617155091663322e-07, "loss": 0.0002, "reward": 0.32766120694577694, "reward_std": 0.5018663741648197, "rewards/reward_func": 0.32766120694577694, "step": 2528 }, { "completion_length": 180.65625, "epoch": 0.33935501137428076, "grad_norm": 4.0625, "kl": 0.003188255534041673, "learning_rate": 6.606449886257192e-07, "loss": 0.0001, "reward": 0.09142577461898327, "reward_std": 0.6621435023844242, "rewards/reward_func": 0.09142577461898327, "step": 2536 }, { "completion_length": 175.28125, "epoch": 0.3404255319148936, "grad_norm": 4.46875, "kl": 0.003918278380297124, "learning_rate": 6.595744680851063e-07, "loss": 0.0002, "reward": 0.26254068687558174, "reward_std": 0.4977311482653022, "rewards/reward_func": 0.26254068687558174, "step": 2544 }, { "completion_length": 178.5859375, "epoch": 0.3414960524555065, "grad_norm": 2.78125, "kl": 0.0037679201050195843, "learning_rate": 6.585039475444935e-07, "loss": 0.0002, "reward": 0.2359130820259452, "reward_std": 0.6390624288469553, "rewards/reward_func": 0.2359130820259452, "step": 2552 }, { "completion_length": 185.7421875, "epoch": 0.34256657299611937, "grad_norm": 4.0625, "kl": 0.003888906561769545, "learning_rate": 6.574334270038807e-07, "loss": 0.0002, "reward": 0.06335067562758923, "reward_std": 0.5590885141864419, "rewards/reward_func": 0.06335067562758923, "step": 2560 }, { "completion_length": 156.1953125, "epoch": 0.3436370935367322, "grad_norm": 4.03125, "kl": 0.004304436064558104, "learning_rate": 6.563629064632677e-07, "loss": 0.0002, "reward": 0.20302090607583523, "reward_std": 0.5702759772539139, "rewards/reward_func": 0.20302090607583523, "step": 2568 }, { "completion_length": 132.640625, "epoch": 0.3447076140773451, "grad_norm": 4.34375, "kl": 0.004362121399026364, "learning_rate": 6.552923859226549e-07, "loss": 0.0002, "reward": 0.6651312373578548, "reward_std": 0.3866056464612484, "rewards/reward_func": 0.6651312373578548, "step": 2576 }, { "completion_length": 153.0078125, "epoch": 0.345778134617958, "grad_norm": 3.109375, "kl": 0.0041726555791683495, "learning_rate": 6.54221865382042e-07, "loss": 0.0002, "reward": 0.2306189425289631, "reward_std": 0.4977853484451771, "rewards/reward_func": 0.2306189425289631, "step": 2584 }, { "completion_length": 166.15625, "epoch": 0.34684865515857083, "grad_norm": 3.59375, "kl": 0.0034308232716284692, "learning_rate": 6.531513448414291e-07, "loss": 0.0001, "reward": 0.11097771301865578, "reward_std": 0.6078328117728233, "rewards/reward_func": 0.11097771301865578, "step": 2592 }, { "completion_length": 153.0, "epoch": 0.34791917569918374, "grad_norm": 4.15625, "kl": 0.003664735675556585, "learning_rate": 6.520808243008162e-07, "loss": 0.0001, "reward": 0.49361006263643503, "reward_std": 0.5971282683312893, "rewards/reward_func": 0.49361006263643503, "step": 2600 }, { "completion_length": 164.8828125, "epoch": 0.3489896962397966, "grad_norm": 5.0, "kl": 0.00437125310418196, "learning_rate": 6.510103037602034e-07, "loss": 0.0002, "reward": 0.1592898964881897, "reward_std": 0.5312252482399344, "rewards/reward_func": 0.1592898964881897, "step": 2608 }, { "completion_length": 165.421875, "epoch": 0.3500602167804095, "grad_norm": 4.34375, "kl": 0.0037221178063191473, "learning_rate": 6.499397832195906e-07, "loss": 0.0001, "reward": 0.41797424480319023, "reward_std": 0.5129497703164816, "rewards/reward_func": 0.41797424480319023, "step": 2616 }, { "completion_length": 182.96875, "epoch": 0.35113073732102235, "grad_norm": 3.65625, "kl": 0.004204686090815812, "learning_rate": 6.488692626789775e-07, "loss": 0.0002, "reward": 0.24847618490457535, "reward_std": 0.5075008701533079, "rewards/reward_func": 0.24847618490457535, "step": 2624 }, { "completion_length": 150.1875, "epoch": 0.3522012578616352, "grad_norm": 4.53125, "kl": 0.0036759270005859435, "learning_rate": 6.477987421383647e-07, "loss": 0.0001, "reward": 0.37406357005238533, "reward_std": 0.43564846366643906, "rewards/reward_func": 0.37406357005238533, "step": 2632 }, { "completion_length": 179.859375, "epoch": 0.3532717784022481, "grad_norm": 4.15625, "kl": 0.003904950339347124, "learning_rate": 6.467282215977519e-07, "loss": 0.0002, "reward": 0.30777904158458114, "reward_std": 0.5255319569259882, "rewards/reward_func": 0.30777904158458114, "step": 2640 }, { "completion_length": 153.1171875, "epoch": 0.35434229894286096, "grad_norm": 4.71875, "kl": 0.004152281413553283, "learning_rate": 6.45657701057139e-07, "loss": 0.0002, "reward": 0.25166825857013464, "reward_std": 0.5060204975306988, "rewards/reward_func": 0.25166825857013464, "step": 2648 }, { "completion_length": 179.9140625, "epoch": 0.35541281948347386, "grad_norm": 5.09375, "kl": 0.0037966810341458768, "learning_rate": 6.445871805165262e-07, "loss": 0.0002, "reward": 0.048941366374492645, "reward_std": 0.5631339196115732, "rewards/reward_func": 0.048941366374492645, "step": 2656 }, { "completion_length": 144.828125, "epoch": 0.3564833400240867, "grad_norm": 4.3125, "kl": 0.004289341013645753, "learning_rate": 6.435166599759133e-07, "loss": 0.0002, "reward": 0.2979842973873019, "reward_std": 0.5271645337343216, "rewards/reward_func": 0.2979842973873019, "step": 2664 }, { "completion_length": 171.3984375, "epoch": 0.35755386056469957, "grad_norm": 4.59375, "kl": 0.0042981151200365275, "learning_rate": 6.424461394353004e-07, "loss": 0.0002, "reward": 0.17236249335110188, "reward_std": 0.6582519998773932, "rewards/reward_func": 0.17236249335110188, "step": 2672 }, { "completion_length": 174.0390625, "epoch": 0.3586243811053125, "grad_norm": 3.828125, "kl": 0.004372917755972594, "learning_rate": 6.413756188946875e-07, "loss": 0.0002, "reward": 0.1233967412263155, "reward_std": 0.5646042246371508, "rewards/reward_func": 0.1233967412263155, "step": 2680 }, { "completion_length": 215.0859375, "epoch": 0.3596949016459253, "grad_norm": 3.265625, "kl": 0.0034220777451992035, "learning_rate": 6.403050983540746e-07, "loss": 0.0001, "reward": 0.022455230355262756, "reward_std": 0.4265636382624507, "rewards/reward_func": 0.022455230355262756, "step": 2688 }, { "completion_length": 165.6640625, "epoch": 0.36076542218653823, "grad_norm": 3.3125, "kl": 0.0036657150485552847, "learning_rate": 6.392345778134618e-07, "loss": 0.0001, "reward": 0.1397247351706028, "reward_std": 0.6145136766135693, "rewards/reward_func": 0.1397247351706028, "step": 2696 }, { "completion_length": 150.046875, "epoch": 0.3618359427271511, "grad_norm": 4.40625, "kl": 0.0038523364637512714, "learning_rate": 6.381640572728489e-07, "loss": 0.0002, "reward": 0.09757146798074245, "reward_std": 0.5506066791713238, "rewards/reward_func": 0.09757146798074245, "step": 2704 }, { "completion_length": 185.109375, "epoch": 0.36290646326776393, "grad_norm": 3.28125, "kl": 0.0035323128395248204, "learning_rate": 6.37093536732236e-07, "loss": 0.0001, "reward": -0.08146252483129501, "reward_std": 0.4336923873052001, "rewards/reward_func": -0.08146252483129501, "step": 2712 }, { "completion_length": 196.1015625, "epoch": 0.36397698380837684, "grad_norm": 2.65625, "kl": 0.003823416627710685, "learning_rate": 6.360230161916231e-07, "loss": 0.0002, "reward": 0.3048650873824954, "reward_std": 0.6732109598815441, "rewards/reward_func": 0.3048650873824954, "step": 2720 }, { "completion_length": 160.6640625, "epoch": 0.3650475043489897, "grad_norm": 4.15625, "kl": 0.00394167794729583, "learning_rate": 6.349524956510103e-07, "loss": 0.0002, "reward": 0.3657691851258278, "reward_std": 0.6270986460149288, "rewards/reward_func": 0.3657691851258278, "step": 2728 }, { "completion_length": 158.0625, "epoch": 0.36611802488960254, "grad_norm": 4.34375, "kl": 0.004382914863526821, "learning_rate": 6.338819751103974e-07, "loss": 0.0002, "reward": 0.23460809141397476, "reward_std": 0.5130759598687291, "rewards/reward_func": 0.23460809141397476, "step": 2736 }, { "completion_length": 185.953125, "epoch": 0.36718854543021545, "grad_norm": 5.65625, "kl": 0.004101004218682647, "learning_rate": 6.328114545697846e-07, "loss": 0.0002, "reward": 0.1987906889989972, "reward_std": 0.6350626721978188, "rewards/reward_func": 0.1987906889989972, "step": 2744 }, { "completion_length": 206.921875, "epoch": 0.3682590659708283, "grad_norm": 3.140625, "kl": 0.0035723625624086708, "learning_rate": 6.317409340291716e-07, "loss": 0.0001, "reward": 0.05194275360554457, "reward_std": 0.5546863917261362, "rewards/reward_func": 0.05194275360554457, "step": 2752 }, { "completion_length": 189.2578125, "epoch": 0.3693295865114412, "grad_norm": 5.0, "kl": 0.004587263334542513, "learning_rate": 6.306704134885587e-07, "loss": 0.0002, "reward": 0.1652057245373726, "reward_std": 0.6709528639912605, "rewards/reward_func": 0.1652057245373726, "step": 2760 }, { "completion_length": 178.6796875, "epoch": 0.37040010705205406, "grad_norm": 3.890625, "kl": 0.004105961037566885, "learning_rate": 6.295998929479459e-07, "loss": 0.0002, "reward": 0.2511095069348812, "reward_std": 0.48893540259450674, "rewards/reward_func": 0.2511095069348812, "step": 2768 }, { "completion_length": 202.7578125, "epoch": 0.3714706275926669, "grad_norm": 3.890625, "kl": 0.003215010277926922, "learning_rate": 6.285293724073331e-07, "loss": 0.0001, "reward": 0.1104801157489419, "reward_std": 0.5324361100792885, "rewards/reward_func": 0.1104801157489419, "step": 2776 }, { "completion_length": 216.71875, "epoch": 0.3725411481332798, "grad_norm": 3.546875, "kl": 0.003063723910599947, "learning_rate": 6.274588518667202e-07, "loss": 0.0001, "reward": 0.1753014111891389, "reward_std": 0.5819915365427732, "rewards/reward_func": 0.1753014111891389, "step": 2784 }, { "completion_length": 159.84375, "epoch": 0.37361166867389267, "grad_norm": 3.34375, "kl": 0.004174454777967185, "learning_rate": 6.263883313261072e-07, "loss": 0.0002, "reward": 0.45437810756266117, "reward_std": 0.4158835466951132, "rewards/reward_func": 0.45437810756266117, "step": 2792 }, { "completion_length": 208.875, "epoch": 0.3746821892145056, "grad_norm": 3.296875, "kl": 0.0031874127162154764, "learning_rate": 6.253178107854944e-07, "loss": 0.0001, "reward": 0.3352484591305256, "reward_std": 0.5119593776762486, "rewards/reward_func": 0.3352484591305256, "step": 2800 }, { "completion_length": 197.4765625, "epoch": 0.37575270975511843, "grad_norm": 3.28125, "kl": 0.0036684646474896, "learning_rate": 6.242472902448816e-07, "loss": 0.0001, "reward": 0.27760483510792255, "reward_std": 0.6436055637896061, "rewards/reward_func": 0.27760483510792255, "step": 2808 }, { "completion_length": 152.84375, "epoch": 0.3768232302957313, "grad_norm": 4.8125, "kl": 0.004549535195110366, "learning_rate": 6.231767697042686e-07, "loss": 0.0002, "reward": 0.46624478977173567, "reward_std": 0.6175453588366508, "rewards/reward_func": 0.46624478977173567, "step": 2816 }, { "completion_length": 144.796875, "epoch": 0.3778937508363442, "grad_norm": 3.421875, "kl": 0.0038801982591394335, "learning_rate": 6.221062491636558e-07, "loss": 0.0002, "reward": 0.3716530613601208, "reward_std": 0.5027909129858017, "rewards/reward_func": 0.3716530613601208, "step": 2824 }, { "completion_length": 189.3984375, "epoch": 0.37896427137695704, "grad_norm": 4.25, "kl": 0.003531339403707534, "learning_rate": 6.210357286230429e-07, "loss": 0.0001, "reward": 0.0588820856064558, "reward_std": 0.5144321415573359, "rewards/reward_func": 0.0588820856064558, "step": 2832 }, { "completion_length": 151.71875, "epoch": 0.3800347919175699, "grad_norm": 4.40625, "kl": 0.004161316930549219, "learning_rate": 6.199652080824301e-07, "loss": 0.0002, "reward": 0.21836877800524235, "reward_std": 0.6778084672987461, "rewards/reward_func": 0.21836877800524235, "step": 2840 }, { "completion_length": 156.9140625, "epoch": 0.3811053124581828, "grad_norm": 5.03125, "kl": 0.0043344263976905495, "learning_rate": 6.188946875418171e-07, "loss": 0.0002, "reward": 0.43052874132990837, "reward_std": 0.5890010427683592, "rewards/reward_func": 0.43052874132990837, "step": 2848 }, { "completion_length": 165.859375, "epoch": 0.38217583299879565, "grad_norm": 4.71875, "kl": 0.0040927641675807536, "learning_rate": 6.178241670012043e-07, "loss": 0.0002, "reward": 0.09390930086374283, "reward_std": 0.4254406839609146, "rewards/reward_func": 0.09390930086374283, "step": 2856 }, { "completion_length": 151.796875, "epoch": 0.38324635353940856, "grad_norm": 4.6875, "kl": 0.004312922974349931, "learning_rate": 6.167536464605915e-07, "loss": 0.0002, "reward": 0.16593856737017632, "reward_std": 0.6011241041123867, "rewards/reward_func": 0.16593856737017632, "step": 2864 }, { "completion_length": 147.5859375, "epoch": 0.3843168740800214, "grad_norm": 4.78125, "kl": 0.004850049444939941, "learning_rate": 6.156831259199785e-07, "loss": 0.0002, "reward": 0.3436956908553839, "reward_std": 0.4854423590004444, "rewards/reward_func": 0.3436956908553839, "step": 2872 }, { "completion_length": 144.6015625, "epoch": 0.38538739462063426, "grad_norm": 3.453125, "kl": 0.00466370303183794, "learning_rate": 6.146126053793656e-07, "loss": 0.0002, "reward": 0.3992779180407524, "reward_std": 0.5042364671826363, "rewards/reward_func": 0.3992779180407524, "step": 2880 }, { "completion_length": 144.2578125, "epoch": 0.38645791516124717, "grad_norm": 5.15625, "kl": 0.004223753814585507, "learning_rate": 6.135420848387528e-07, "loss": 0.0002, "reward": -0.015506982803344727, "reward_std": 0.6913204118609428, "rewards/reward_func": -0.015506982803344727, "step": 2888 }, { "completion_length": 201.4453125, "epoch": 0.38752843570186, "grad_norm": 3.84375, "kl": 0.00345133469090797, "learning_rate": 6.1247156429814e-07, "loss": 0.0001, "reward": -0.23047319240868092, "reward_std": 0.5747088566422462, "rewards/reward_func": -0.23047319240868092, "step": 2896 }, { "completion_length": 180.2890625, "epoch": 0.3885989562424729, "grad_norm": 3.125, "kl": 0.004600081476382911, "learning_rate": 6.114010437575271e-07, "loss": 0.0002, "reward": -0.17729684710502625, "reward_std": 0.3335055038332939, "rewards/reward_func": -0.17729684710502625, "step": 2904 }, { "completion_length": 153.8046875, "epoch": 0.3896694767830858, "grad_norm": 4.09375, "kl": 0.003945650125388056, "learning_rate": 6.103305232169142e-07, "loss": 0.0002, "reward": 0.2731490605510771, "reward_std": 0.573139002546668, "rewards/reward_func": 0.2731490605510771, "step": 2912 }, { "completion_length": 150.1171875, "epoch": 0.39073999732369863, "grad_norm": 3.90625, "kl": 0.004451691260328516, "learning_rate": 6.092600026763013e-07, "loss": 0.0002, "reward": 0.1808023676276207, "reward_std": 0.5803926577791572, "rewards/reward_func": 0.1808023676276207, "step": 2920 }, { "completion_length": 199.84375, "epoch": 0.39181051786431154, "grad_norm": 3.5625, "kl": 0.0031813042878638953, "learning_rate": 6.081894821356885e-07, "loss": 0.0001, "reward": 0.22766825137659907, "reward_std": 0.6164026372134686, "rewards/reward_func": 0.22766825137659907, "step": 2928 }, { "completion_length": 199.2734375, "epoch": 0.3928810384049244, "grad_norm": 3.28125, "kl": 0.0038126638100948185, "learning_rate": 6.071189615950756e-07, "loss": 0.0002, "reward": 0.16099986899644136, "reward_std": 0.7263254784047604, "rewards/reward_func": 0.16099986899644136, "step": 2936 }, { "completion_length": 157.765625, "epoch": 0.3939515589455373, "grad_norm": 3.5, "kl": 0.004354376200353727, "learning_rate": 6.060484410544627e-07, "loss": 0.0002, "reward": 0.31015807017683983, "reward_std": 0.6586577072739601, "rewards/reward_func": 0.31015807017683983, "step": 2944 }, { "completion_length": 155.671875, "epoch": 0.39502207948615015, "grad_norm": 3.609375, "kl": 0.004368482739664614, "learning_rate": 6.049779205138499e-07, "loss": 0.0002, "reward": 0.5017051734030247, "reward_std": 0.42575474083423615, "rewards/reward_func": 0.5017051734030247, "step": 2952 }, { "completion_length": 129.375, "epoch": 0.396092600026763, "grad_norm": 4.6875, "kl": 0.004999362543458119, "learning_rate": 6.039073999732369e-07, "loss": 0.0002, "reward": 0.5034809075295925, "reward_std": 0.5035946983844042, "rewards/reward_func": 0.5034809075295925, "step": 2960 }, { "completion_length": 171.203125, "epoch": 0.3971631205673759, "grad_norm": 3.09375, "kl": 0.003695404506288469, "learning_rate": 6.028368794326241e-07, "loss": 0.0001, "reward": 0.3904507216066122, "reward_std": 0.5394172128289938, "rewards/reward_func": 0.3904507216066122, "step": 2968 }, { "completion_length": 179.3515625, "epoch": 0.39823364110798876, "grad_norm": 2.796875, "kl": 0.004422289348440245, "learning_rate": 6.017663588920112e-07, "loss": 0.0002, "reward": 0.3067244812846184, "reward_std": 0.48167256638407707, "rewards/reward_func": 0.3067244812846184, "step": 2976 }, { "completion_length": 241.015625, "epoch": 0.3993041616486016, "grad_norm": 3.40625, "kl": 0.0026670149818528444, "learning_rate": 6.006958383513984e-07, "loss": 0.0001, "reward": -0.14665643870830536, "reward_std": 0.43440048210322857, "rewards/reward_func": -0.14665643870830536, "step": 2984 }, { "completion_length": 182.59375, "epoch": 0.4003746821892145, "grad_norm": 4.78125, "kl": 0.004074128781212494, "learning_rate": 5.996253178107855e-07, "loss": 0.0002, "reward": 0.2561218962073326, "reward_std": 0.5822538835927844, "rewards/reward_func": 0.2561218962073326, "step": 2992 }, { "completion_length": 193.3515625, "epoch": 0.40144520272982737, "grad_norm": 4.1875, "kl": 0.00386466141208075, "learning_rate": 5.985547972701726e-07, "loss": 0.0002, "reward": 0.21375904511660337, "reward_std": 0.39464170206338167, "rewards/reward_func": 0.21375904511660337, "step": 3000 }, { "completion_length": 159.328125, "epoch": 0.4025157232704403, "grad_norm": 3.734375, "kl": 0.00333388164290227, "learning_rate": 5.974842767295597e-07, "loss": 0.0001, "reward": 0.5523056299425662, "reward_std": 0.47927504777908325, "rewards/reward_func": 0.5523056299425662, "step": 3008 }, { "completion_length": 164.9921875, "epoch": 0.4035862438110531, "grad_norm": 3.625, "kl": 0.004161383403697982, "learning_rate": 5.964137561889468e-07, "loss": 0.0002, "reward": 0.17364376038312912, "reward_std": 0.5346489679068327, "rewards/reward_func": 0.17364376038312912, "step": 3016 }, { "completion_length": 159.7578125, "epoch": 0.404656764351666, "grad_norm": 4.0625, "kl": 0.003841915662633255, "learning_rate": 5.95343235648334e-07, "loss": 0.0002, "reward": 0.4289398565888405, "reward_std": 0.47436373494565487, "rewards/reward_func": 0.4289398565888405, "step": 3024 }, { "completion_length": 182.9609375, "epoch": 0.4057272848922789, "grad_norm": 2.453125, "kl": 0.004264735238393769, "learning_rate": 5.942727151077212e-07, "loss": 0.0002, "reward": 0.021798385307192802, "reward_std": 0.5079176230356097, "rewards/reward_func": 0.021798385307192802, "step": 3032 }, { "completion_length": 160.546875, "epoch": 0.40679780543289173, "grad_norm": 3.0, "kl": 0.005132144928211346, "learning_rate": 5.932021945671082e-07, "loss": 0.0002, "reward": 0.5438925623893738, "reward_std": 0.42197058349847794, "rewards/reward_func": 0.5438925623893738, "step": 3040 }, { "completion_length": 156.0859375, "epoch": 0.40786832597350464, "grad_norm": 4.1875, "kl": 0.003920425719115883, "learning_rate": 5.921316740264953e-07, "loss": 0.0002, "reward": 0.3435197048820555, "reward_std": 0.584853507578373, "rewards/reward_func": 0.3435197048820555, "step": 3048 }, { "completion_length": 148.8359375, "epoch": 0.4089388465141175, "grad_norm": 1.90625, "kl": 0.004158479205216281, "learning_rate": 5.910611534858825e-07, "loss": 0.0002, "reward": 0.3984090769663453, "reward_std": 0.4647171348333359, "rewards/reward_func": 0.3984090769663453, "step": 3056 }, { "completion_length": 175.2109375, "epoch": 0.41000936705473034, "grad_norm": 3.984375, "kl": 0.003644221549620852, "learning_rate": 5.899906329452697e-07, "loss": 0.0001, "reward": 0.16703728586435318, "reward_std": 0.5931989103555679, "rewards/reward_func": 0.16703728586435318, "step": 3064 }, { "completion_length": 173.90625, "epoch": 0.41107988759534325, "grad_norm": 3.21875, "kl": 0.004099360230611637, "learning_rate": 5.889201124046567e-07, "loss": 0.0002, "reward": -0.03924668487161398, "reward_std": 0.6728265807032585, "rewards/reward_func": -0.03924668487161398, "step": 3072 }, { "completion_length": 164.953125, "epoch": 0.4121504081359561, "grad_norm": 3.34375, "kl": 0.004925543296849355, "learning_rate": 5.878495918640438e-07, "loss": 0.0002, "reward": 0.3322628792375326, "reward_std": 0.5970859546214342, "rewards/reward_func": 0.3322628792375326, "step": 3080 }, { "completion_length": 162.0234375, "epoch": 0.41322092867656895, "grad_norm": 4.0625, "kl": 0.004399422614369541, "learning_rate": 5.86779071323431e-07, "loss": 0.0002, "reward": 0.36330926418304443, "reward_std": 0.45976690761744976, "rewards/reward_func": 0.36330926418304443, "step": 3088 }, { "completion_length": 138.796875, "epoch": 0.41429144921718186, "grad_norm": 3.140625, "kl": 0.004546679730992764, "learning_rate": 5.857085507828181e-07, "loss": 0.0002, "reward": 0.4072983153164387, "reward_std": 0.600585313513875, "rewards/reward_func": 0.4072983153164387, "step": 3096 }, { "completion_length": 214.953125, "epoch": 0.4153619697577947, "grad_norm": 2.9375, "kl": 0.00350517057813704, "learning_rate": 5.846380302422052e-07, "loss": 0.0001, "reward": 0.2474349234253168, "reward_std": 0.498451117426157, "rewards/reward_func": 0.2474349234253168, "step": 3104 }, { "completion_length": 139.453125, "epoch": 0.4164324902984076, "grad_norm": 4.1875, "kl": 0.00461685229674913, "learning_rate": 5.835675097015924e-07, "loss": 0.0002, "reward": 0.471061285585165, "reward_std": 0.45820480585098267, "rewards/reward_func": 0.471061285585165, "step": 3112 }, { "completion_length": 144.4453125, "epoch": 0.41750301083902047, "grad_norm": 4.6875, "kl": 0.004540506488410756, "learning_rate": 5.824969891609795e-07, "loss": 0.0002, "reward": 0.4106574021279812, "reward_std": 0.47535229101777077, "rewards/reward_func": 0.4106574021279812, "step": 3120 }, { "completion_length": 161.71875, "epoch": 0.4185735313796333, "grad_norm": 4.78125, "kl": 0.004575909668346867, "learning_rate": 5.814264686203665e-07, "loss": 0.0002, "reward": -0.04112925007939339, "reward_std": 0.4119059517979622, "rewards/reward_func": -0.04112925007939339, "step": 3128 }, { "completion_length": 196.5703125, "epoch": 0.41964405192024623, "grad_norm": 4.90625, "kl": 0.003955840336857364, "learning_rate": 5.803559480797537e-07, "loss": 0.0002, "reward": 0.024920357391238213, "reward_std": 0.5616731429472566, "rewards/reward_func": 0.024920357391238213, "step": 3136 }, { "completion_length": 204.4296875, "epoch": 0.4207145724608591, "grad_norm": 4.96875, "kl": 0.0033879343245644122, "learning_rate": 5.792854275391409e-07, "loss": 0.0001, "reward": 0.2112936358898878, "reward_std": 0.5542439222335815, "rewards/reward_func": 0.2112936358898878, "step": 3144 }, { "completion_length": 188.40625, "epoch": 0.421785093001472, "grad_norm": 4.40625, "kl": 0.003975967440055683, "learning_rate": 5.782149069985281e-07, "loss": 0.0002, "reward": -0.049715520814061165, "reward_std": 0.6521423272788525, "rewards/reward_func": -0.049715520814061165, "step": 3152 }, { "completion_length": 163.4375, "epoch": 0.42285561354208484, "grad_norm": 2.65625, "kl": 0.004166945233009756, "learning_rate": 5.771443864579151e-07, "loss": 0.0002, "reward": 0.41062634997069836, "reward_std": 0.4943850450217724, "rewards/reward_func": 0.41062634997069836, "step": 3160 }, { "completion_length": 130.859375, "epoch": 0.4239261340826977, "grad_norm": 3.859375, "kl": 0.005400074122007936, "learning_rate": 5.760738659173022e-07, "loss": 0.0002, "reward": 0.43160221725702286, "reward_std": 0.5389326587319374, "rewards/reward_func": 0.43160221725702286, "step": 3168 }, { "completion_length": 172.703125, "epoch": 0.4249966546233106, "grad_norm": 4.53125, "kl": 0.005152460333192721, "learning_rate": 5.750033453766894e-07, "loss": 0.0002, "reward": 0.06674006022512913, "reward_std": 0.5032580755650997, "rewards/reward_func": 0.06674006022512913, "step": 3176 }, { "completion_length": 154.6875, "epoch": 0.42606717516392345, "grad_norm": 4.125, "kl": 0.004849692864809185, "learning_rate": 5.739328248360766e-07, "loss": 0.0002, "reward": 0.33508316054940224, "reward_std": 0.5568934958428144, "rewards/reward_func": 0.33508316054940224, "step": 3184 }, { "completion_length": 149.921875, "epoch": 0.42713769570453636, "grad_norm": 3.5, "kl": 0.004149941669311374, "learning_rate": 5.728623042954636e-07, "loss": 0.0002, "reward": 0.560466131195426, "reward_std": 0.4996361844241619, "rewards/reward_func": 0.560466131195426, "step": 3192 }, { "completion_length": 162.3515625, "epoch": 0.4282082162451492, "grad_norm": 2.375, "kl": 0.00443269161041826, "learning_rate": 5.717917837548508e-07, "loss": 0.0002, "reward": 0.4073672443628311, "reward_std": 0.4750672820955515, "rewards/reward_func": 0.4073672443628311, "step": 3200 }, { "completion_length": 173.3203125, "epoch": 0.42927873678576206, "grad_norm": 4.125, "kl": 0.0039921577263157815, "learning_rate": 5.707212632142379e-07, "loss": 0.0002, "reward": -0.03342257114127278, "reward_std": 0.6672232635319233, "rewards/reward_func": -0.03342257114127278, "step": 3208 }, { "completion_length": 155.9375, "epoch": 0.43034925732637497, "grad_norm": 5.78125, "kl": 0.004704885970568284, "learning_rate": 5.69650742673625e-07, "loss": 0.0002, "reward": 0.3005738127976656, "reward_std": 0.5849708952009678, "rewards/reward_func": 0.3005738127976656, "step": 3216 }, { "completion_length": 185.84375, "epoch": 0.4314197778669878, "grad_norm": 2.890625, "kl": 0.0036070215137442574, "learning_rate": 5.685802221330121e-07, "loss": 0.0001, "reward": -0.018972497433423996, "reward_std": 0.5354725271463394, "rewards/reward_func": -0.018972497433423996, "step": 3224 }, { "completion_length": 178.546875, "epoch": 0.43249029840760067, "grad_norm": 2.625, "kl": 0.004298602405469865, "learning_rate": 5.675097015923993e-07, "loss": 0.0002, "reward": 0.3231694786809385, "reward_std": 0.4985707551240921, "rewards/reward_func": 0.3231694786809385, "step": 3232 }, { "completion_length": 162.3203125, "epoch": 0.4335608189482136, "grad_norm": 3.59375, "kl": 0.0039607091166544706, "learning_rate": 5.664391810517865e-07, "loss": 0.0002, "reward": 0.11156550701707602, "reward_std": 0.7430830076336861, "rewards/reward_func": 0.11156550701707602, "step": 3240 }, { "completion_length": 150.40625, "epoch": 0.43463133948882643, "grad_norm": 3.3125, "kl": 0.0049680424854159355, "learning_rate": 5.653686605111735e-07, "loss": 0.0002, "reward": 0.36818648502230644, "reward_std": 0.4827171713113785, "rewards/reward_func": 0.36818648502230644, "step": 3248 }, { "completion_length": 148.8046875, "epoch": 0.43570186002943934, "grad_norm": 5.40625, "kl": 0.004517415567534044, "learning_rate": 5.642981399705606e-07, "loss": 0.0002, "reward": 0.5136874578893185, "reward_std": 0.4101978652179241, "rewards/reward_func": 0.5136874578893185, "step": 3256 }, { "completion_length": 159.90625, "epoch": 0.4367723805700522, "grad_norm": 4.5, "kl": 0.005192397540668026, "learning_rate": 5.632276194299478e-07, "loss": 0.0002, "reward": 0.36011555418372154, "reward_std": 0.590018224902451, "rewards/reward_func": 0.36011555418372154, "step": 3264 }, { "completion_length": 165.8359375, "epoch": 0.43784290111066504, "grad_norm": 5.78125, "kl": 0.004310069081839174, "learning_rate": 5.621570988893349e-07, "loss": 0.0002, "reward": 0.44862041622400284, "reward_std": 0.5028228275477886, "rewards/reward_func": 0.44862041622400284, "step": 3272 }, { "completion_length": 163.8203125, "epoch": 0.43891342165127795, "grad_norm": 3.609375, "kl": 0.004184526915196329, "learning_rate": 5.610865783487221e-07, "loss": 0.0002, "reward": 0.3917035781778395, "reward_std": 0.5541238645091653, "rewards/reward_func": 0.3917035781778395, "step": 3280 }, { "completion_length": 186.40625, "epoch": 0.4399839421918908, "grad_norm": 4.125, "kl": 0.003184476459864527, "learning_rate": 5.600160578081091e-07, "loss": 0.0001, "reward": 0.12950839288532734, "reward_std": 0.5569676849991083, "rewards/reward_func": 0.12950839288532734, "step": 3288 }, { "completion_length": 139.9765625, "epoch": 0.4410544627325037, "grad_norm": 3.9375, "kl": 0.004262359958374873, "learning_rate": 5.589455372674963e-07, "loss": 0.0002, "reward": 0.28253707475960255, "reward_std": 0.44188484735786915, "rewards/reward_func": 0.28253707475960255, "step": 3296 }, { "completion_length": 174.6171875, "epoch": 0.44212498327311656, "grad_norm": 4.59375, "kl": 0.004584858979796991, "learning_rate": 5.578750167268834e-07, "loss": 0.0002, "reward": 0.10256939753890038, "reward_std": 0.47010411880910397, "rewards/reward_func": 0.10256939753890038, "step": 3304 }, { "completion_length": 151.453125, "epoch": 0.4431955038137294, "grad_norm": 3.3125, "kl": 0.004451150889508426, "learning_rate": 5.568044961862706e-07, "loss": 0.0002, "reward": 0.48164689540863037, "reward_std": 0.4186716293916106, "rewards/reward_func": 0.48164689540863037, "step": 3312 }, { "completion_length": 187.1796875, "epoch": 0.4442660243543423, "grad_norm": 5.3125, "kl": 0.003924098331481218, "learning_rate": 5.557339756456577e-07, "loss": 0.0002, "reward": 0.13027670048177242, "reward_std": 0.5701944110915065, "rewards/reward_func": 0.13027670048177242, "step": 3320 }, { "completion_length": 175.734375, "epoch": 0.44533654489495517, "grad_norm": 3.890625, "kl": 0.004432059795362875, "learning_rate": 5.546634551050447e-07, "loss": 0.0002, "reward": 0.15545489452779293, "reward_std": 0.42713499814271927, "rewards/reward_func": 0.15545489452779293, "step": 3328 }, { "completion_length": 159.7109375, "epoch": 0.446407065435568, "grad_norm": 3.953125, "kl": 0.0041090622544288635, "learning_rate": 5.535929345644319e-07, "loss": 0.0002, "reward": 0.5048990547657013, "reward_std": 0.3645612169057131, "rewards/reward_func": 0.5048990547657013, "step": 3336 }, { "completion_length": 170.46875, "epoch": 0.4474775859761809, "grad_norm": 3.109375, "kl": 0.00416830470203422, "learning_rate": 5.525224140238191e-07, "loss": 0.0002, "reward": 0.0841824202798307, "reward_std": 0.5541789922863245, "rewards/reward_func": 0.0841824202798307, "step": 3344 }, { "completion_length": 177.8984375, "epoch": 0.4485481065167938, "grad_norm": 2.90625, "kl": 0.004115153366001323, "learning_rate": 5.514518934832062e-07, "loss": 0.0002, "reward": 0.25281552597880363, "reward_std": 0.5773179177194834, "rewards/reward_func": 0.25281552597880363, "step": 3352 }, { "completion_length": 167.546875, "epoch": 0.4496186270574067, "grad_norm": 3.046875, "kl": 0.004800075956154615, "learning_rate": 5.503813729425933e-07, "loss": 0.0002, "reward": 0.15400892263278365, "reward_std": 0.613510686904192, "rewards/reward_func": 0.15400892263278365, "step": 3360 }, { "completion_length": 173.875, "epoch": 0.45068914759801953, "grad_norm": 5.125, "kl": 0.004234513093251735, "learning_rate": 5.493108524019804e-07, "loss": 0.0002, "reward": 0.14482227806001902, "reward_std": 0.6577083393931389, "rewards/reward_func": 0.14482227806001902, "step": 3368 }, { "completion_length": 196.0078125, "epoch": 0.4517596681386324, "grad_norm": 3.28125, "kl": 0.003691094840178266, "learning_rate": 5.482403318613676e-07, "loss": 0.0001, "reward": 0.20943116396665573, "reward_std": 0.6141318120062351, "rewards/reward_func": 0.20943116396665573, "step": 3376 }, { "completion_length": 189.9375, "epoch": 0.4528301886792453, "grad_norm": 3.625, "kl": 0.004188001621514559, "learning_rate": 5.471698113207546e-07, "loss": 0.0002, "reward": 0.12066240888088942, "reward_std": 0.6333566196262836, "rewards/reward_func": 0.12066240888088942, "step": 3384 }, { "completion_length": 224.203125, "epoch": 0.45390070921985815, "grad_norm": 3.46875, "kl": 0.0038036782352719456, "learning_rate": 5.460992907801418e-07, "loss": 0.0002, "reward": 0.026676064357161522, "reward_std": 0.5244584791362286, "rewards/reward_func": 0.026676064357161522, "step": 3392 }, { "completion_length": 135.8125, "epoch": 0.45497122976047105, "grad_norm": 3.78125, "kl": 0.005447168223327026, "learning_rate": 5.45028770239529e-07, "loss": 0.0002, "reward": 0.3792672948911786, "reward_std": 0.5337657146155834, "rewards/reward_func": 0.3792672948911786, "step": 3400 }, { "completion_length": 180.4765625, "epoch": 0.4560417503010839, "grad_norm": 3.84375, "kl": 0.0039006134611554444, "learning_rate": 5.439582496989162e-07, "loss": 0.0002, "reward": 0.2967074029147625, "reward_std": 0.5028974749147892, "rewards/reward_func": 0.2967074029147625, "step": 3408 }, { "completion_length": 173.46875, "epoch": 0.45711227084169676, "grad_norm": 3.59375, "kl": 0.004641034756787121, "learning_rate": 5.428877291583031e-07, "loss": 0.0002, "reward": 0.04013548418879509, "reward_std": 0.647808875888586, "rewards/reward_func": 0.04013548418879509, "step": 3416 }, { "completion_length": 182.6953125, "epoch": 0.45818279138230966, "grad_norm": 3.265625, "kl": 0.003926090226741508, "learning_rate": 5.418172086176903e-07, "loss": 0.0002, "reward": -0.02354210428893566, "reward_std": 0.46280941739678383, "rewards/reward_func": -0.02354210428893566, "step": 3424 }, { "completion_length": 171.9140625, "epoch": 0.4592533119229225, "grad_norm": 4.3125, "kl": 0.004544450406683609, "learning_rate": 5.407466880770775e-07, "loss": 0.0002, "reward": 0.2378298337571323, "reward_std": 0.5396788232028484, "rewards/reward_func": 0.2378298337571323, "step": 3432 }, { "completion_length": 168.5625, "epoch": 0.4603238324635354, "grad_norm": 4.125, "kl": 0.003944898169720545, "learning_rate": 5.396761675364647e-07, "loss": 0.0002, "reward": 0.3150383196771145, "reward_std": 0.53007797524333, "rewards/reward_func": 0.3150383196771145, "step": 3440 }, { "completion_length": 153.21875, "epoch": 0.46139435300414827, "grad_norm": 3.515625, "kl": 0.004431029927218333, "learning_rate": 5.386056469958517e-07, "loss": 0.0002, "reward": 0.10777561087161303, "reward_std": 0.5590555854141712, "rewards/reward_func": 0.10777561087161303, "step": 3448 }, { "completion_length": 156.1015625, "epoch": 0.4624648735447611, "grad_norm": 5.59375, "kl": 0.004066320398123935, "learning_rate": 5.375351264552388e-07, "loss": 0.0002, "reward": 0.49741687439382076, "reward_std": 0.3438666444271803, "rewards/reward_func": 0.49741687439382076, "step": 3456 }, { "completion_length": 180.890625, "epoch": 0.46353539408537403, "grad_norm": 3.5, "kl": 0.004092392831807956, "learning_rate": 5.36464605914626e-07, "loss": 0.0002, "reward": 0.261587081477046, "reward_std": 0.4724911078810692, "rewards/reward_func": 0.261587081477046, "step": 3464 }, { "completion_length": 188.34375, "epoch": 0.4646059146259869, "grad_norm": 4.5625, "kl": 0.004102788210730068, "learning_rate": 5.353940853740131e-07, "loss": 0.0002, "reward": 0.3170028403401375, "reward_std": 0.5411418545991182, "rewards/reward_func": 0.3170028403401375, "step": 3472 }, { "completion_length": 147.0078125, "epoch": 0.46567643516659973, "grad_norm": 3.90625, "kl": 0.004658064717659727, "learning_rate": 5.343235648334002e-07, "loss": 0.0002, "reward": 0.42856106348335743, "reward_std": 0.45429209433496, "rewards/reward_func": 0.42856106348335743, "step": 3480 }, { "completion_length": 185.578125, "epoch": 0.46674695570721264, "grad_norm": 3.515625, "kl": 0.004181814001640305, "learning_rate": 5.332530442927874e-07, "loss": 0.0002, "reward": 0.1980421096086502, "reward_std": 0.46522266045212746, "rewards/reward_func": 0.1980421096086502, "step": 3488 }, { "completion_length": 147.6015625, "epoch": 0.4678174762478255, "grad_norm": 2.875, "kl": 0.005186378140933812, "learning_rate": 5.321825237521745e-07, "loss": 0.0002, "reward": 0.33479253202676773, "reward_std": 0.3981231078505516, "rewards/reward_func": 0.33479253202676773, "step": 3496 }, { "completion_length": 196.953125, "epoch": 0.4688879967884384, "grad_norm": 2.109375, "kl": 0.003901872376445681, "learning_rate": 5.311120032115616e-07, "loss": 0.0002, "reward": -0.1805968815460801, "reward_std": 0.5539918430149555, "rewards/reward_func": -0.1805968815460801, "step": 3504 }, { "completion_length": 172.09375, "epoch": 0.46995851732905125, "grad_norm": 4.375, "kl": 0.004344145359937102, "learning_rate": 5.300414826709487e-07, "loss": 0.0002, "reward": 0.24764186749234796, "reward_std": 0.5220493152737617, "rewards/reward_func": 0.24764186749234796, "step": 3512 }, { "completion_length": 165.484375, "epoch": 0.4710290378696641, "grad_norm": 3.03125, "kl": 0.004431087261764333, "learning_rate": 5.289709621303359e-07, "loss": 0.0002, "reward": 0.207328287884593, "reward_std": 0.621040590107441, "rewards/reward_func": 0.207328287884593, "step": 3520 }, { "completion_length": 187.7734375, "epoch": 0.472099558410277, "grad_norm": 3.5, "kl": 0.004218856105580926, "learning_rate": 5.27900441589723e-07, "loss": 0.0002, "reward": 0.07554451934993267, "reward_std": 0.6108374260365963, "rewards/reward_func": 0.07554451934993267, "step": 3528 }, { "completion_length": 168.265625, "epoch": 0.47317007895088986, "grad_norm": 5.0625, "kl": 0.004120910074561834, "learning_rate": 5.2682992104911e-07, "loss": 0.0002, "reward": 0.03414946049451828, "reward_std": 0.6455099135637283, "rewards/reward_func": 0.03414946049451828, "step": 3536 }, { "completion_length": 187.5234375, "epoch": 0.47424059949150277, "grad_norm": 3.640625, "kl": 0.003985106013715267, "learning_rate": 5.257594005084972e-07, "loss": 0.0002, "reward": 0.2925253491848707, "reward_std": 0.6335334703326225, "rewards/reward_func": 0.2925253491848707, "step": 3544 }, { "completion_length": 163.46875, "epoch": 0.4753111200321156, "grad_norm": 4.125, "kl": 0.00473158826935105, "learning_rate": 5.246888799678844e-07, "loss": 0.0002, "reward": 0.3663984229788184, "reward_std": 0.558024113997817, "rewards/reward_func": 0.3663984229788184, "step": 3552 }, { "completion_length": 152.03125, "epoch": 0.47638164057272847, "grad_norm": 4.1875, "kl": 0.004815980733837932, "learning_rate": 5.236183594272715e-07, "loss": 0.0002, "reward": 0.13753212243318558, "reward_std": 0.5678570009768009, "rewards/reward_func": 0.13753212243318558, "step": 3560 }, { "completion_length": 178.7421875, "epoch": 0.4774521611133414, "grad_norm": 3.625, "kl": 0.004101649799849838, "learning_rate": 5.225478388866587e-07, "loss": 0.0002, "reward": 0.2735243234783411, "reward_std": 0.6148385126143694, "rewards/reward_func": 0.2735243234783411, "step": 3568 }, { "completion_length": 192.1484375, "epoch": 0.47852268165395423, "grad_norm": 4.84375, "kl": 0.004463646182557568, "learning_rate": 5.214773183460457e-07, "loss": 0.0002, "reward": 0.009237892925739288, "reward_std": 0.4207034735009074, "rewards/reward_func": 0.009237892925739288, "step": 3576 }, { "completion_length": 174.046875, "epoch": 0.4795932021945671, "grad_norm": 4.125, "kl": 0.0036158739821985364, "learning_rate": 5.204067978054328e-07, "loss": 0.0001, "reward": 0.26391329150646925, "reward_std": 0.586926780641079, "rewards/reward_func": 0.26391329150646925, "step": 3584 }, { "completion_length": 175.4375, "epoch": 0.48066372273518, "grad_norm": 3.578125, "kl": 0.0043216931517235935, "learning_rate": 5.1933627726482e-07, "loss": 0.0002, "reward": 0.28290559723973274, "reward_std": 0.6564907301217318, "rewards/reward_func": 0.28290559723973274, "step": 3592 }, { "completion_length": 178.8671875, "epoch": 0.48173424327579284, "grad_norm": 2.671875, "kl": 0.004414036084199324, "learning_rate": 5.182657567242071e-07, "loss": 0.0002, "reward": 0.3407918275333941, "reward_std": 0.5300383027642965, "rewards/reward_func": 0.3407918275333941, "step": 3600 }, { "completion_length": 168.0078125, "epoch": 0.48280476381640575, "grad_norm": 3.828125, "kl": 0.004355661425506696, "learning_rate": 5.171952361835943e-07, "loss": 0.0002, "reward": 0.15907337237149477, "reward_std": 0.6283294912427664, "rewards/reward_func": 0.15907337237149477, "step": 3608 }, { "completion_length": 169.671875, "epoch": 0.4838752843570186, "grad_norm": 4.40625, "kl": 0.0041847134416457266, "learning_rate": 5.161247156429813e-07, "loss": 0.0002, "reward": 0.394026106223464, "reward_std": 0.5191534291952848, "rewards/reward_func": 0.394026106223464, "step": 3616 }, { "completion_length": 187.9765625, "epoch": 0.48494580489763145, "grad_norm": 4.25, "kl": 0.004289885691832751, "learning_rate": 5.150541951023685e-07, "loss": 0.0002, "reward": 0.2409443873912096, "reward_std": 0.714960128068924, "rewards/reward_func": 0.2409443873912096, "step": 3624 }, { "completion_length": 167.8984375, "epoch": 0.48601632543824436, "grad_norm": 3.515625, "kl": 0.004622265987563878, "learning_rate": 5.139836745617556e-07, "loss": 0.0002, "reward": 0.3250633031129837, "reward_std": 0.3942791158333421, "rewards/reward_func": 0.3250633031129837, "step": 3632 }, { "completion_length": 181.3984375, "epoch": 0.4870868459788572, "grad_norm": 3.125, "kl": 0.00506105026579462, "learning_rate": 5.129131540211427e-07, "loss": 0.0002, "reward": 0.1745797097682953, "reward_std": 0.5199177237227559, "rewards/reward_func": 0.1745797097682953, "step": 3640 }, { "completion_length": 177.828125, "epoch": 0.4881573665194701, "grad_norm": 5.125, "kl": 0.00410176973673515, "learning_rate": 5.118426334805299e-07, "loss": 0.0002, "reward": 0.1287559773772955, "reward_std": 0.5036085527390242, "rewards/reward_func": 0.1287559773772955, "step": 3648 }, { "completion_length": 179.125, "epoch": 0.48922788706008297, "grad_norm": 4.0, "kl": 0.004322856722865254, "learning_rate": 5.107721129399171e-07, "loss": 0.0002, "reward": 0.07798391906544566, "reward_std": 0.6537183858454227, "rewards/reward_func": 0.07798391906544566, "step": 3656 }, { "completion_length": 154.4375, "epoch": 0.4902984076006958, "grad_norm": 3.03125, "kl": 0.003943322895793244, "learning_rate": 5.097015923993041e-07, "loss": 0.0002, "reward": 0.1709643267095089, "reward_std": 0.5507702603936195, "rewards/reward_func": 0.1709643267095089, "step": 3664 }, { "completion_length": 168.6875, "epoch": 0.4913689281413087, "grad_norm": 3.125, "kl": 0.004549846984446049, "learning_rate": 5.086310718586912e-07, "loss": 0.0002, "reward": 0.2190579893067479, "reward_std": 0.5686514582484961, "rewards/reward_func": 0.2190579893067479, "step": 3672 }, { "completion_length": 155.828125, "epoch": 0.4924394486819216, "grad_norm": 3.703125, "kl": 0.00439119475777261, "learning_rate": 5.075605513180784e-07, "loss": 0.0002, "reward": 0.45011513587087393, "reward_std": 0.5558454534038901, "rewards/reward_func": 0.45011513587087393, "step": 3680 }, { "completion_length": 158.1796875, "epoch": 0.4935099692225345, "grad_norm": 5.625, "kl": 0.004681064456235617, "learning_rate": 5.064900307774656e-07, "loss": 0.0002, "reward": 0.22979869320988655, "reward_std": 0.3713596798479557, "rewards/reward_func": 0.22979869320988655, "step": 3688 }, { "completion_length": 160.0234375, "epoch": 0.49458048976314734, "grad_norm": 4.375, "kl": 0.00535585597390309, "learning_rate": 5.054195102368527e-07, "loss": 0.0002, "reward": 0.051803894340991974, "reward_std": 0.6633851379156113, "rewards/reward_func": 0.051803894340991974, "step": 3696 }, { "completion_length": 175.9609375, "epoch": 0.4956510103037602, "grad_norm": 4.125, "kl": 0.004008779738796875, "learning_rate": 5.043489896962397e-07, "loss": 0.0002, "reward": 0.2548919077962637, "reward_std": 0.5479347966611385, "rewards/reward_func": 0.2548919077962637, "step": 3704 }, { "completion_length": 168.3671875, "epoch": 0.4967215308443731, "grad_norm": 3.78125, "kl": 0.004640541330445558, "learning_rate": 5.032784691556269e-07, "loss": 0.0002, "reward": 0.2829543873667717, "reward_std": 0.40924315620213747, "rewards/reward_func": 0.2829543873667717, "step": 3712 }, { "completion_length": 182.4921875, "epoch": 0.49779205138498595, "grad_norm": 3.015625, "kl": 0.003496495133731514, "learning_rate": 5.022079486150141e-07, "loss": 0.0001, "reward": 0.3831252008676529, "reward_std": 0.4445470869541168, "rewards/reward_func": 0.3831252008676529, "step": 3720 }, { "completion_length": 162.03125, "epoch": 0.4988625719255988, "grad_norm": 4.3125, "kl": 0.004948699788656086, "learning_rate": 5.011374280744011e-07, "loss": 0.0002, "reward": 0.32710376754403114, "reward_std": 0.47466727904975414, "rewards/reward_func": 0.32710376754403114, "step": 3728 }, { "completion_length": 176.5859375, "epoch": 0.4999330924662117, "grad_norm": 3.640625, "kl": 0.004679859790485352, "learning_rate": 5.000669075337883e-07, "loss": 0.0002, "reward": 0.13110784254968166, "reward_std": 0.44122389145195484, "rewards/reward_func": 0.13110784254968166, "step": 3736 }, { "completion_length": 167.71875, "epoch": 0.5010036130068246, "grad_norm": 4.78125, "kl": 0.004513267427682877, "learning_rate": 4.989963869931754e-07, "loss": 0.0002, "reward": 0.15786111541092396, "reward_std": 0.606589537113905, "rewards/reward_func": 0.15786111541092396, "step": 3744 }, { "completion_length": 150.421875, "epoch": 0.5020741335474375, "grad_norm": 5.3125, "kl": 0.004389044945128262, "learning_rate": 4.979258664525626e-07, "loss": 0.0002, "reward": 0.4018897293135524, "reward_std": 0.44968966394662857, "rewards/reward_func": 0.4018897293135524, "step": 3752 }, { "completion_length": 157.28125, "epoch": 0.5031446540880503, "grad_norm": 3.109375, "kl": 0.004845765855861828, "learning_rate": 4.968553459119496e-07, "loss": 0.0002, "reward": 0.5019057989120483, "reward_std": 0.43940271995961666, "rewards/reward_func": 0.5019057989120483, "step": 3760 }, { "completion_length": 183.84375, "epoch": 0.5042151746286632, "grad_norm": 2.890625, "kl": 0.004980318364687264, "learning_rate": 4.957848253713368e-07, "loss": 0.0002, "reward": 0.10100116580724716, "reward_std": 0.5983940260484815, "rewards/reward_func": 0.10100116580724716, "step": 3768 }, { "completion_length": 148.984375, "epoch": 0.505285695169276, "grad_norm": 2.859375, "kl": 0.0051506354357115924, "learning_rate": 4.947143048307239e-07, "loss": 0.0002, "reward": 0.2997464369982481, "reward_std": 0.6431192979216576, "rewards/reward_func": 0.2997464369982481, "step": 3776 }, { "completion_length": 148.5703125, "epoch": 0.506356215709889, "grad_norm": 3.890625, "kl": 0.004320590727729723, "learning_rate": 4.93643784290111e-07, "loss": 0.0002, "reward": 0.14957408607006073, "reward_std": 0.5004684673622251, "rewards/reward_func": 0.14957408607006073, "step": 3784 }, { "completion_length": 170.7734375, "epoch": 0.5074267362505018, "grad_norm": 4.96875, "kl": 0.00426993565633893, "learning_rate": 4.925732637494981e-07, "loss": 0.0002, "reward": 0.1513789612799883, "reward_std": 0.6300474852323532, "rewards/reward_func": 0.1513789612799883, "step": 3792 }, { "completion_length": 132.6796875, "epoch": 0.5084972567911147, "grad_norm": 3.65625, "kl": 0.00518818135606125, "learning_rate": 4.915027432088853e-07, "loss": 0.0002, "reward": 0.2980203665792942, "reward_std": 0.39504921436309814, "rewards/reward_func": 0.2980203665792942, "step": 3800 }, { "completion_length": 143.1875, "epoch": 0.5095677773317275, "grad_norm": 4.5625, "kl": 0.004469432285986841, "learning_rate": 4.904322226682725e-07, "loss": 0.0002, "reward": 0.4323331117630005, "reward_std": 0.5411158930510283, "rewards/reward_func": 0.4323331117630005, "step": 3808 }, { "completion_length": 204.7578125, "epoch": 0.5106382978723404, "grad_norm": 4.71875, "kl": 0.003944508789572865, "learning_rate": 4.893617021276595e-07, "loss": 0.0002, "reward": 0.06451552081853151, "reward_std": 0.6014019660651684, "rewards/reward_func": 0.06451552081853151, "step": 3816 }, { "completion_length": 171.0625, "epoch": 0.5117088184129533, "grad_norm": 6.53125, "kl": 0.0044076822232455015, "learning_rate": 4.882911815870467e-07, "loss": 0.0002, "reward": 0.26693916134536266, "reward_std": 0.5402739644050598, "rewards/reward_func": 0.26693916134536266, "step": 3824 }, { "completion_length": 160.0703125, "epoch": 0.5127793389535662, "grad_norm": 3.734375, "kl": 0.004957833531079814, "learning_rate": 4.872206610464339e-07, "loss": 0.0002, "reward": 0.2441606866195798, "reward_std": 0.6625313609838486, "rewards/reward_func": 0.2441606866195798, "step": 3832 }, { "completion_length": 155.8515625, "epoch": 0.513849859494179, "grad_norm": 3.640625, "kl": 0.004840250330744311, "learning_rate": 4.861501405058209e-07, "loss": 0.0002, "reward": 0.3202288933098316, "reward_std": 0.6590756271034479, "rewards/reward_func": 0.3202288933098316, "step": 3840 }, { "completion_length": 170.21875, "epoch": 0.5149203800347919, "grad_norm": 4.5625, "kl": 0.005241601204033941, "learning_rate": 4.850796199652081e-07, "loss": 0.0002, "reward": 0.11097644921392202, "reward_std": 0.6563504040241241, "rewards/reward_func": 0.11097644921392202, "step": 3848 }, { "completion_length": 172.3359375, "epoch": 0.5159909005754048, "grad_norm": 4.71875, "kl": 0.0044063644600100815, "learning_rate": 4.840090994245952e-07, "loss": 0.0002, "reward": 0.26450240099802613, "reward_std": 0.6473797373473644, "rewards/reward_func": 0.26450240099802613, "step": 3856 }, { "completion_length": 188.34375, "epoch": 0.5170614211160177, "grad_norm": 3.703125, "kl": 0.004124164639506489, "learning_rate": 4.829385788839824e-07, "loss": 0.0002, "reward": 0.09523116052150726, "reward_std": 0.5340174566954374, "rewards/reward_func": 0.09523116052150726, "step": 3864 }, { "completion_length": 157.3046875, "epoch": 0.5181319416566306, "grad_norm": 4.5, "kl": 0.004781241004820913, "learning_rate": 4.818680583433694e-07, "loss": 0.0002, "reward": 0.3139430582523346, "reward_std": 0.5873579885810614, "rewards/reward_func": 0.3139430582523346, "step": 3872 }, { "completion_length": 153.1015625, "epoch": 0.5192024621972434, "grad_norm": 4.28125, "kl": 0.0045044064754620194, "learning_rate": 4.807975378027566e-07, "loss": 0.0002, "reward": 0.24596689827740192, "reward_std": 0.5791397895663977, "rewards/reward_func": 0.24596689827740192, "step": 3880 }, { "completion_length": 166.8671875, "epoch": 0.5202729827378563, "grad_norm": 4.8125, "kl": 0.004427089152159169, "learning_rate": 4.797270172621437e-07, "loss": 0.0002, "reward": 0.3911690888926387, "reward_std": 0.5238520000129938, "rewards/reward_func": 0.3911690888926387, "step": 3888 }, { "completion_length": 182.296875, "epoch": 0.5213435032784691, "grad_norm": 3.6875, "kl": 0.00470818518078886, "learning_rate": 4.786564967215308e-07, "loss": 0.0002, "reward": -0.06911014439538121, "reward_std": 0.6354586593806744, "rewards/reward_func": -0.06911014439538121, "step": 3896 }, { "completion_length": 151.5859375, "epoch": 0.522414023819082, "grad_norm": 4.65625, "kl": 0.004992738307919353, "learning_rate": 4.775859761809179e-07, "loss": 0.0002, "reward": 0.441136134788394, "reward_std": 0.5409799609333277, "rewards/reward_func": 0.441136134788394, "step": 3904 }, { "completion_length": 158.1875, "epoch": 0.5234845443596949, "grad_norm": 3.921875, "kl": 0.004533803061349317, "learning_rate": 4.765154556403051e-07, "loss": 0.0002, "reward": 0.36645470559597015, "reward_std": 0.5416577542200685, "rewards/reward_func": 0.36645470559597015, "step": 3912 }, { "completion_length": 177.9140625, "epoch": 0.5245550649003078, "grad_norm": 2.78125, "kl": 0.004515117674600333, "learning_rate": 4.754449350996922e-07, "loss": 0.0002, "reward": 0.11683559231460094, "reward_std": 0.5318781770765781, "rewards/reward_func": 0.11683559231460094, "step": 3920 }, { "completion_length": 162.484375, "epoch": 0.5256255854409206, "grad_norm": 2.78125, "kl": 0.00410384067799896, "learning_rate": 4.7437441455907934e-07, "loss": 0.0002, "reward": 0.5109116761013865, "reward_std": 0.389411685988307, "rewards/reward_func": 0.5109116761013865, "step": 3928 }, { "completion_length": 179.484375, "epoch": 0.5266961059815335, "grad_norm": 4.5, "kl": 0.004382628481835127, "learning_rate": 4.7330389401846646e-07, "loss": 0.0002, "reward": 0.12338575161993504, "reward_std": 0.49865792877972126, "rewards/reward_func": 0.12338575161993504, "step": 3936 }, { "completion_length": 168.8203125, "epoch": 0.5277666265221463, "grad_norm": 3.78125, "kl": 0.004615213518263772, "learning_rate": 4.722333734778536e-07, "loss": 0.0002, "reward": 0.2909085564315319, "reward_std": 0.44954105466604233, "rewards/reward_func": 0.2909085564315319, "step": 3944 }, { "completion_length": 186.40625, "epoch": 0.5288371470627593, "grad_norm": 3.703125, "kl": 0.003957096429076046, "learning_rate": 4.7116285293724075e-07, "loss": 0.0002, "reward": 0.35753502883017063, "reward_std": 0.5898796916007996, "rewards/reward_func": 0.35753502883017063, "step": 3952 }, { "completion_length": 165.03125, "epoch": 0.5299076676033722, "grad_norm": 3.25, "kl": 0.0045530806528404355, "learning_rate": 4.700923323966278e-07, "loss": 0.0002, "reward": 0.2869006171822548, "reward_std": 0.4535912126302719, "rewards/reward_func": 0.2869006171822548, "step": 3960 }, { "completion_length": 148.8203125, "epoch": 0.530978188143985, "grad_norm": 4.25, "kl": 0.00460378042771481, "learning_rate": 4.69021811856015e-07, "loss": 0.0002, "reward": 0.48801288567483425, "reward_std": 0.4225266771391034, "rewards/reward_func": 0.48801288567483425, "step": 3968 }, { "completion_length": 174.203125, "epoch": 0.5320487086845979, "grad_norm": 2.65625, "kl": 0.004049515846418217, "learning_rate": 4.679512913154021e-07, "loss": 0.0002, "reward": 0.418088311329484, "reward_std": 0.5685894265770912, "rewards/reward_func": 0.418088311329484, "step": 3976 }, { "completion_length": 165.2578125, "epoch": 0.5331192292252107, "grad_norm": 3.25, "kl": 0.00501069356687367, "learning_rate": 4.668807707747892e-07, "loss": 0.0002, "reward": 0.31565719842910767, "reward_std": 0.6409982740879059, "rewards/reward_func": 0.31565719842910767, "step": 3984 }, { "completion_length": 162.015625, "epoch": 0.5341897497658237, "grad_norm": 3.671875, "kl": 0.0046659239451400936, "learning_rate": 4.6581025023417636e-07, "loss": 0.0002, "reward": -0.0461183600127697, "reward_std": 0.7044645324349403, "rewards/reward_func": -0.0461183600127697, "step": 3992 }, { "completion_length": 142.703125, "epoch": 0.5352602703064365, "grad_norm": 3.84375, "kl": 0.004710770619567484, "learning_rate": 4.6473972969356343e-07, "loss": 0.0002, "reward": 0.5219798712059855, "reward_std": 0.4946548119187355, "rewards/reward_func": 0.5219798712059855, "step": 4000 }, { "completion_length": 146.3203125, "epoch": 0.5363307908470494, "grad_norm": 3.21875, "kl": 0.005039886600570753, "learning_rate": 4.636692091529506e-07, "loss": 0.0002, "reward": 0.420873555354774, "reward_std": 0.4259900487959385, "rewards/reward_func": 0.420873555354774, "step": 4008 }, { "completion_length": 168.2109375, "epoch": 0.5374013113876622, "grad_norm": 4.96875, "kl": 0.004895551188383251, "learning_rate": 4.625986886123377e-07, "loss": 0.0002, "reward": 0.3381440285593271, "reward_std": 0.5715998597443104, "rewards/reward_func": 0.3381440285593271, "step": 4016 }, { "completion_length": 160.578125, "epoch": 0.5384718319282751, "grad_norm": 3.625, "kl": 0.00470035380567424, "learning_rate": 4.6152816807172485e-07, "loss": 0.0002, "reward": 0.3439123351126909, "reward_std": 0.4550882736220956, "rewards/reward_func": 0.3439123351126909, "step": 4024 }, { "completion_length": 159.9453125, "epoch": 0.539542352468888, "grad_norm": 4.375, "kl": 0.00492598774144426, "learning_rate": 4.6045764753111197e-07, "loss": 0.0002, "reward": 0.2067430024035275, "reward_std": 0.5162056926637888, "rewards/reward_func": 0.2067430024035275, "step": 4032 }, { "completion_length": 166.1015625, "epoch": 0.5406128730095009, "grad_norm": 3.0625, "kl": 0.0042450258624739945, "learning_rate": 4.593871269904991e-07, "loss": 0.0002, "reward": 0.3529038140550256, "reward_std": 0.4770152699202299, "rewards/reward_func": 0.3529038140550256, "step": 4040 }, { "completion_length": 178.7109375, "epoch": 0.5416833935501137, "grad_norm": 4.5625, "kl": 0.005025087855756283, "learning_rate": 4.583166064498862e-07, "loss": 0.0002, "reward": -0.081031309440732, "reward_std": 0.4695176286622882, "rewards/reward_func": -0.081031309440732, "step": 4048 }, { "completion_length": 165.65625, "epoch": 0.5427539140907266, "grad_norm": 4.4375, "kl": 0.0055138085735961795, "learning_rate": 4.572460859092734e-07, "loss": 0.0002, "reward": -0.007585156708955765, "reward_std": 0.5119953658431768, "rewards/reward_func": -0.007585156708955765, "step": 4056 }, { "completion_length": 156.34375, "epoch": 0.5438244346313394, "grad_norm": 3.796875, "kl": 0.0043381388823036104, "learning_rate": 4.5617556536866045e-07, "loss": 0.0002, "reward": 0.13799802958965302, "reward_std": 0.6221343949437141, "rewards/reward_func": 0.13799802958965302, "step": 4064 }, { "completion_length": 191.2734375, "epoch": 0.5448949551719524, "grad_norm": 4.71875, "kl": 0.004081014514667913, "learning_rate": 4.5510504482804763e-07, "loss": 0.0002, "reward": -0.10252122208476067, "reward_std": 0.5134240631014109, "rewards/reward_func": -0.10252122208476067, "step": 4072 }, { "completion_length": 149.390625, "epoch": 0.5459654757125653, "grad_norm": 3.640625, "kl": 0.004793624917510897, "learning_rate": 4.540345242874347e-07, "loss": 0.0002, "reward": 0.42647568974643946, "reward_std": 0.6049776747822762, "rewards/reward_func": 0.42647568974643946, "step": 4080 }, { "completion_length": 169.515625, "epoch": 0.5470359962531781, "grad_norm": 5.375, "kl": 0.005105009535327554, "learning_rate": 4.5296400374682187e-07, "loss": 0.0002, "reward": 0.14484626054763794, "reward_std": 0.715711385011673, "rewards/reward_func": 0.14484626054763794, "step": 4088 }, { "completion_length": 184.5859375, "epoch": 0.548106516793791, "grad_norm": 2.21875, "kl": 0.00399865786312148, "learning_rate": 4.51893483206209e-07, "loss": 0.0002, "reward": 0.27984373830258846, "reward_std": 0.6154143176972866, "rewards/reward_func": 0.27984373830258846, "step": 4096 }, { "completion_length": 148.9765625, "epoch": 0.5491770373344038, "grad_norm": 4.8125, "kl": 0.005411504651419818, "learning_rate": 4.508229626655961e-07, "loss": 0.0002, "reward": 0.3810861259698868, "reward_std": 0.6340535804629326, "rewards/reward_func": 0.3810861259698868, "step": 4104 }, { "completion_length": 181.171875, "epoch": 0.5502475578750168, "grad_norm": 3.734375, "kl": 0.003742568180314265, "learning_rate": 4.4975244212498324e-07, "loss": 0.0001, "reward": 0.314508281648159, "reward_std": 0.5607537031173706, "rewards/reward_func": 0.314508281648159, "step": 4112 }, { "completion_length": 131.109375, "epoch": 0.5513180784156296, "grad_norm": 6.5625, "kl": 0.005468558054417372, "learning_rate": 4.486819215843704e-07, "loss": 0.0002, "reward": 0.43094983510673046, "reward_std": 0.39848934579640627, "rewards/reward_func": 0.43094983510673046, "step": 4120 }, { "completion_length": 144.765625, "epoch": 0.5523885989562425, "grad_norm": 5.15625, "kl": 0.005072243511676788, "learning_rate": 4.476114010437575e-07, "loss": 0.0002, "reward": 0.16479766555130482, "reward_std": 0.624469917267561, "rewards/reward_func": 0.16479766555130482, "step": 4128 }, { "completion_length": 150.828125, "epoch": 0.5534591194968553, "grad_norm": 5.5625, "kl": 0.004988896253053099, "learning_rate": 4.4654088050314465e-07, "loss": 0.0002, "reward": 0.23024853132665157, "reward_std": 0.5588976237922907, "rewards/reward_func": 0.23024853132665157, "step": 4136 }, { "completion_length": 162.78125, "epoch": 0.5545296400374682, "grad_norm": 6.53125, "kl": 0.004712989641120657, "learning_rate": 4.454703599625317e-07, "loss": 0.0002, "reward": 0.27441484900191426, "reward_std": 0.4914160780608654, "rewards/reward_func": 0.27441484900191426, "step": 4144 }, { "completion_length": 203.6171875, "epoch": 0.555600160578081, "grad_norm": 3.6875, "kl": 0.003505587810650468, "learning_rate": 4.443998394219189e-07, "loss": 0.0001, "reward": 0.009393353015184402, "reward_std": 0.6114509087055922, "rewards/reward_func": 0.009393353015184402, "step": 4152 }, { "completion_length": 166.7578125, "epoch": 0.556670681118694, "grad_norm": 4.78125, "kl": 0.004777590365847573, "learning_rate": 4.43329318881306e-07, "loss": 0.0002, "reward": 0.11833875393494964, "reward_std": 0.5748403836041689, "rewards/reward_func": 0.11833875393494964, "step": 4160 }, { "completion_length": 150.4140625, "epoch": 0.5577412016593069, "grad_norm": 3.28125, "kl": 0.0049895147094503045, "learning_rate": 4.4225879834069314e-07, "loss": 0.0002, "reward": 0.2932877875864506, "reward_std": 0.6367702716961503, "rewards/reward_func": 0.2932877875864506, "step": 4168 }, { "completion_length": 156.7109375, "epoch": 0.5588117221999197, "grad_norm": 5.09375, "kl": 0.005086433404358104, "learning_rate": 4.4118827780008026e-07, "loss": 0.0002, "reward": 0.14813962019979954, "reward_std": 0.5115363541990519, "rewards/reward_func": 0.14813962019979954, "step": 4176 }, { "completion_length": 166.015625, "epoch": 0.5598822427405326, "grad_norm": 3.953125, "kl": 0.004459643067093566, "learning_rate": 4.401177572594674e-07, "loss": 0.0002, "reward": 0.17805076017975807, "reward_std": 0.7240184545516968, "rewards/reward_func": 0.17805076017975807, "step": 4184 }, { "completion_length": 149.1328125, "epoch": 0.5609527632811454, "grad_norm": 3.8125, "kl": 0.004602790024364367, "learning_rate": 4.390472367188545e-07, "loss": 0.0002, "reward": 0.46490050479769707, "reward_std": 0.4432865995913744, "rewards/reward_func": 0.46490050479769707, "step": 4192 }, { "completion_length": 183.953125, "epoch": 0.5620232838217584, "grad_norm": 3.625, "kl": 0.00446239989832975, "learning_rate": 4.379767161782417e-07, "loss": 0.0002, "reward": 0.19806094001978636, "reward_std": 0.6545614078640938, "rewards/reward_func": 0.19806094001978636, "step": 4200 }, { "completion_length": 195.46875, "epoch": 0.5630938043623712, "grad_norm": 3.5625, "kl": 0.003972954727942124, "learning_rate": 4.3690619563762875e-07, "loss": 0.0002, "reward": -0.12718784296885133, "reward_std": 0.5749151539057493, "rewards/reward_func": -0.12718784296885133, "step": 4208 }, { "completion_length": 137.4453125, "epoch": 0.5641643249029841, "grad_norm": 4.25, "kl": 0.004893360834103078, "learning_rate": 4.358356750970159e-07, "loss": 0.0002, "reward": 0.24862979911267757, "reward_std": 0.6906272917985916, "rewards/reward_func": 0.24862979911267757, "step": 4216 }, { "completion_length": 153.109375, "epoch": 0.5652348454435969, "grad_norm": 3.578125, "kl": 0.0049685456906445324, "learning_rate": 4.3476515455640304e-07, "loss": 0.0002, "reward": 0.41499729454517365, "reward_std": 0.4691876629367471, "rewards/reward_func": 0.41499729454517365, "step": 4224 }, { "completion_length": 149.8515625, "epoch": 0.5663053659842098, "grad_norm": 5.59375, "kl": 0.004520065325777978, "learning_rate": 4.3369463401579017e-07, "loss": 0.0002, "reward": 0.318800778593868, "reward_std": 0.6351992357522249, "rewards/reward_func": 0.318800778593868, "step": 4232 }, { "completion_length": 142.421875, "epoch": 0.5673758865248227, "grad_norm": 4.59375, "kl": 0.005744964553741738, "learning_rate": 4.326241134751773e-07, "loss": 0.0002, "reward": 0.4124446418136358, "reward_std": 0.5395534262061119, "rewards/reward_func": 0.4124446418136358, "step": 4240 }, { "completion_length": 163.6953125, "epoch": 0.5684464070654356, "grad_norm": 4.6875, "kl": 0.004186704114545137, "learning_rate": 4.315535929345644e-07, "loss": 0.0002, "reward": 0.35636366717517376, "reward_std": 0.6417583487927914, "rewards/reward_func": 0.35636366717517376, "step": 4248 }, { "completion_length": 185.578125, "epoch": 0.5695169276060484, "grad_norm": 3.84375, "kl": 0.004251696169376373, "learning_rate": 4.3048307239395153e-07, "loss": 0.0002, "reward": 0.30847467109560966, "reward_std": 0.44796227291226387, "rewards/reward_func": 0.30847467109560966, "step": 4256 }, { "completion_length": 211.21875, "epoch": 0.5705874481466613, "grad_norm": 2.3125, "kl": 0.004341925901826471, "learning_rate": 4.294125518533387e-07, "loss": 0.0002, "reward": 0.2743415031582117, "reward_std": 0.45934509858489037, "rewards/reward_func": 0.2743415031582117, "step": 4264 }, { "completion_length": 171.7265625, "epoch": 0.5716579686872741, "grad_norm": 1.9453125, "kl": 0.004344686400145292, "learning_rate": 4.2834203131272577e-07, "loss": 0.0002, "reward": 0.2994256131350994, "reward_std": 0.6268932148814201, "rewards/reward_func": 0.2994256131350994, "step": 4272 }, { "completion_length": 138.3984375, "epoch": 0.5727284892278871, "grad_norm": 3.171875, "kl": 0.006193301291204989, "learning_rate": 4.2727151077211295e-07, "loss": 0.0002, "reward": 0.3124155914410949, "reward_std": 0.49435919895768166, "rewards/reward_func": 0.3124155914410949, "step": 4280 }, { "completion_length": 175.421875, "epoch": 0.5737990097685, "grad_norm": 3.609375, "kl": 0.004771079227793962, "learning_rate": 4.262009902315e-07, "loss": 0.0002, "reward": 0.27722545340657234, "reward_std": 0.5442187786102295, "rewards/reward_func": 0.27722545340657234, "step": 4288 }, { "completion_length": 224.7265625, "epoch": 0.5748695303091128, "grad_norm": 3.703125, "kl": 0.0033396084618289024, "learning_rate": 4.251304696908872e-07, "loss": 0.0001, "reward": -0.16931618377566338, "reward_std": 0.5313975028693676, "rewards/reward_func": -0.16931618377566338, "step": 4296 }, { "completion_length": 186.859375, "epoch": 0.5759400508497257, "grad_norm": 4.75, "kl": 0.0042559900030028075, "learning_rate": 4.240599491502743e-07, "loss": 0.0002, "reward": 0.13033189252018929, "reward_std": 0.3756987228989601, "rewards/reward_func": 0.13033189252018929, "step": 4304 }, { "completion_length": 142.578125, "epoch": 0.5770105713903385, "grad_norm": 3.9375, "kl": 0.005709152843337506, "learning_rate": 4.2298942860966143e-07, "loss": 0.0002, "reward": 0.3865806292742491, "reward_std": 0.6126521602272987, "rewards/reward_func": 0.3865806292742491, "step": 4312 }, { "completion_length": 164.7890625, "epoch": 0.5780810919309515, "grad_norm": 3.125, "kl": 0.0045434608473442495, "learning_rate": 4.2191890806904856e-07, "loss": 0.0002, "reward": 0.3333674664609134, "reward_std": 0.6179927475750446, "rewards/reward_func": 0.3333674664609134, "step": 4320 }, { "completion_length": 118.8515625, "epoch": 0.5791516124715643, "grad_norm": 3.703125, "kl": 0.004566928721033037, "learning_rate": 4.208483875284357e-07, "loss": 0.0002, "reward": 0.6828272566199303, "reward_std": 0.41035995725542307, "rewards/reward_func": 0.6828272566199303, "step": 4328 }, { "completion_length": 141.640625, "epoch": 0.5802221330121772, "grad_norm": 4.125, "kl": 0.004908986215014011, "learning_rate": 4.197778669878228e-07, "loss": 0.0002, "reward": 0.4755503498017788, "reward_std": 0.5121949464082718, "rewards/reward_func": 0.4755503498017788, "step": 4336 }, { "completion_length": 143.0625, "epoch": 0.58129265355279, "grad_norm": 4.09375, "kl": 0.004488215548917651, "learning_rate": 4.1870734644720997e-07, "loss": 0.0002, "reward": 0.5884530800394714, "reward_std": 0.44153958186507225, "rewards/reward_func": 0.5884530800394714, "step": 4344 }, { "completion_length": 148.1796875, "epoch": 0.5823631740934029, "grad_norm": 3.0, "kl": 0.0046905699709896, "learning_rate": 4.1763682590659704e-07, "loss": 0.0002, "reward": 0.47682703845202923, "reward_std": 0.4733074624091387, "rewards/reward_func": 0.47682703845202923, "step": 4352 }, { "completion_length": 150.296875, "epoch": 0.5834336946340158, "grad_norm": 4.28125, "kl": 0.004917474143439904, "learning_rate": 4.165663053659842e-07, "loss": 0.0002, "reward": 0.2567645199596882, "reward_std": 0.6055057626217604, "rewards/reward_func": 0.2567645199596882, "step": 4360 }, { "completion_length": 205.234375, "epoch": 0.5845042151746287, "grad_norm": 4.09375, "kl": 0.00432168306724634, "learning_rate": 4.1549578482537134e-07, "loss": 0.0002, "reward": 0.04042044514790177, "reward_std": 0.5906463749706745, "rewards/reward_func": 0.04042044514790177, "step": 4368 }, { "completion_length": 147.5625, "epoch": 0.5855747357152415, "grad_norm": 3.796875, "kl": 0.005373828811571002, "learning_rate": 4.1442526428475846e-07, "loss": 0.0002, "reward": 0.3917626924812794, "reward_std": 0.5444907881319523, "rewards/reward_func": 0.3917626924812794, "step": 4376 }, { "completion_length": 203.15625, "epoch": 0.5866452562558544, "grad_norm": 2.296875, "kl": 0.0036935079260729253, "learning_rate": 4.133547437441456e-07, "loss": 0.0001, "reward": 0.09531690180301666, "reward_std": 0.5034721679985523, "rewards/reward_func": 0.09531690180301666, "step": 4384 }, { "completion_length": 148.1328125, "epoch": 0.5877157767964672, "grad_norm": 6.0, "kl": 0.0050933739403262734, "learning_rate": 4.122842232035327e-07, "loss": 0.0002, "reward": 0.11112022027373314, "reward_std": 0.5623177271336317, "rewards/reward_func": 0.11112022027373314, "step": 4392 }, { "completion_length": 185.1796875, "epoch": 0.5887862973370801, "grad_norm": 2.796875, "kl": 0.003696839907206595, "learning_rate": 4.112137026629198e-07, "loss": 0.0001, "reward": 0.37225864082574844, "reward_std": 0.6047016642987728, "rewards/reward_func": 0.37225864082574844, "step": 4400 }, { "completion_length": 156.125, "epoch": 0.5898568178776931, "grad_norm": 5.5, "kl": 0.004960794060025364, "learning_rate": 4.10143182122307e-07, "loss": 0.0002, "reward": 0.4381309971213341, "reward_std": 0.3526679091155529, "rewards/reward_func": 0.4381309971213341, "step": 4408 }, { "completion_length": 173.9453125, "epoch": 0.5909273384183059, "grad_norm": 5.40625, "kl": 0.0046878808352630585, "learning_rate": 4.0907266158169407e-07, "loss": 0.0002, "reward": 0.11471654986962676, "reward_std": 0.7081250138580799, "rewards/reward_func": 0.11471654986962676, "step": 4416 }, { "completion_length": 164.6875, "epoch": 0.5919978589589188, "grad_norm": 3.875, "kl": 0.004589978780131787, "learning_rate": 4.0800214104108124e-07, "loss": 0.0002, "reward": 0.242947518825531, "reward_std": 0.511182009242475, "rewards/reward_func": 0.242947518825531, "step": 4424 }, { "completion_length": 170.96875, "epoch": 0.5930683794995316, "grad_norm": 3.203125, "kl": 0.004504337441176176, "learning_rate": 4.069316205004683e-07, "loss": 0.0002, "reward": 0.13327412493526936, "reward_std": 0.7021276205778122, "rewards/reward_func": 0.13327412493526936, "step": 4432 }, { "completion_length": 166.96875, "epoch": 0.5941389000401445, "grad_norm": 3.65625, "kl": 0.004282756824977696, "learning_rate": 4.0586109995985543e-07, "loss": 0.0002, "reward": 0.2241785153746605, "reward_std": 0.5987379960715771, "rewards/reward_func": 0.2241785153746605, "step": 4440 }, { "completion_length": 174.84375, "epoch": 0.5952094205807574, "grad_norm": 3.3125, "kl": 0.004621970321750268, "learning_rate": 4.047905794192426e-07, "loss": 0.0002, "reward": 0.356457632035017, "reward_std": 0.5185628831386566, "rewards/reward_func": 0.356457632035017, "step": 4448 }, { "completion_length": 151.5859375, "epoch": 0.5962799411213703, "grad_norm": 5.46875, "kl": 0.004531825426965952, "learning_rate": 4.037200588786297e-07, "loss": 0.0002, "reward": 0.41319750994443893, "reward_std": 0.47001610416918993, "rewards/reward_func": 0.41319750994443893, "step": 4456 }, { "completion_length": 155.9375, "epoch": 0.5973504616619831, "grad_norm": 3.875, "kl": 0.005843940074555576, "learning_rate": 4.0264953833801685e-07, "loss": 0.0002, "reward": 0.16792790032923222, "reward_std": 0.45045214518904686, "rewards/reward_func": 0.16792790032923222, "step": 4464 }, { "completion_length": 204.1484375, "epoch": 0.598420982202596, "grad_norm": 3.734375, "kl": 0.0037444017361849546, "learning_rate": 4.0157901779740397e-07, "loss": 0.0001, "reward": 0.11755906883627176, "reward_std": 0.5679098833352327, "rewards/reward_func": 0.11755906883627176, "step": 4472 }, { "completion_length": 153.7734375, "epoch": 0.5994915027432088, "grad_norm": 5.78125, "kl": 0.004829052748391405, "learning_rate": 4.005084972567911e-07, "loss": 0.0002, "reward": 0.19808213412761688, "reward_std": 0.3854016624391079, "rewards/reward_func": 0.19808213412761688, "step": 4480 }, { "completion_length": 203.421875, "epoch": 0.6005620232838218, "grad_norm": 5.9375, "kl": 0.003742009517736733, "learning_rate": 3.994379767161782e-07, "loss": 0.0001, "reward": 0.232608491089195, "reward_std": 0.5558800995349884, "rewards/reward_func": 0.232608491089195, "step": 4488 }, { "completion_length": 154.265625, "epoch": 0.6016325438244347, "grad_norm": 3.96875, "kl": 0.004953162686433643, "learning_rate": 3.9836745617556534e-07, "loss": 0.0002, "reward": 0.4288094639778137, "reward_std": 0.416667815297842, "rewards/reward_func": 0.4288094639778137, "step": 4496 }, { "completion_length": 166.2890625, "epoch": 0.6027030643650475, "grad_norm": 4.40625, "kl": 0.004299461928894743, "learning_rate": 3.9729693563495246e-07, "loss": 0.0002, "reward": 0.12346869148313999, "reward_std": 0.6307908529415727, "rewards/reward_func": 0.12346869148313999, "step": 4504 }, { "completion_length": 157.078125, "epoch": 0.6037735849056604, "grad_norm": 3.796875, "kl": 0.004897929902654141, "learning_rate": 3.9622641509433963e-07, "loss": 0.0002, "reward": 0.3149998290464282, "reward_std": 0.5927382819354534, "rewards/reward_func": 0.3149998290464282, "step": 4512 }, { "completion_length": 168.9296875, "epoch": 0.6048441054462732, "grad_norm": 5.3125, "kl": 0.004709256027126685, "learning_rate": 3.951558945537267e-07, "loss": 0.0002, "reward": 0.23328473046422005, "reward_std": 0.633372450247407, "rewards/reward_func": 0.23328473046422005, "step": 4520 }, { "completion_length": 182.09375, "epoch": 0.6059146259868862, "grad_norm": 2.8125, "kl": 0.0044458308548200876, "learning_rate": 3.940853740131139e-07, "loss": 0.0002, "reward": 0.04303564690053463, "reward_std": 0.46831536665558815, "rewards/reward_func": 0.04303564690053463, "step": 4528 }, { "completion_length": 150.3671875, "epoch": 0.606985146527499, "grad_norm": 4.25, "kl": 0.004751139786094427, "learning_rate": 3.9301485347250094e-07, "loss": 0.0002, "reward": 0.6235604397952557, "reward_std": 0.43624259904026985, "rewards/reward_func": 0.6235604397952557, "step": 4536 }, { "completion_length": 168.140625, "epoch": 0.6080556670681119, "grad_norm": 4.15625, "kl": 0.004754378751385957, "learning_rate": 3.919443329318881e-07, "loss": 0.0002, "reward": 0.07649134658277035, "reward_std": 0.6275423960760236, "rewards/reward_func": 0.07649134658277035, "step": 4544 }, { "completion_length": 145.2109375, "epoch": 0.6091261876087247, "grad_norm": 4.15625, "kl": 0.004508535988861695, "learning_rate": 3.9087381239127524e-07, "loss": 0.0002, "reward": 0.09985450841486454, "reward_std": 0.6514963954687119, "rewards/reward_func": 0.09985450841486454, "step": 4552 }, { "completion_length": 152.9921875, "epoch": 0.6101967081493376, "grad_norm": 3.9375, "kl": 0.004148939304286614, "learning_rate": 3.8980329185066236e-07, "loss": 0.0002, "reward": 0.45872488245368004, "reward_std": 0.43476785998791456, "rewards/reward_func": 0.45872488245368004, "step": 4560 }, { "completion_length": 130.296875, "epoch": 0.6112672286899505, "grad_norm": 5.84375, "kl": 0.005563508428167552, "learning_rate": 3.887327713100495e-07, "loss": 0.0002, "reward": 0.6556578651070595, "reward_std": 0.44750246591866016, "rewards/reward_func": 0.6556578651070595, "step": 4568 }, { "completion_length": 197.5078125, "epoch": 0.6123377492305634, "grad_norm": 3.734375, "kl": 0.003922436822904274, "learning_rate": 3.876622507694366e-07, "loss": 0.0002, "reward": -0.01122634531930089, "reward_std": 0.5639722682535648, "rewards/reward_func": -0.01122634531930089, "step": 4576 }, { "completion_length": 152.25, "epoch": 0.6134082697711762, "grad_norm": 3.53125, "kl": 0.0060851232265122235, "learning_rate": 3.865917302288237e-07, "loss": 0.0002, "reward": 0.5405775438994169, "reward_std": 0.5025825463235378, "rewards/reward_func": 0.5405775438994169, "step": 4584 }, { "completion_length": 165.8671875, "epoch": 0.6144787903117891, "grad_norm": 4.5, "kl": 0.0056036147580016404, "learning_rate": 3.855212096882109e-07, "loss": 0.0002, "reward": 0.015333790332078934, "reward_std": 0.49498444236814976, "rewards/reward_func": 0.015333790332078934, "step": 4592 }, { "completion_length": 185.2421875, "epoch": 0.6155493108524019, "grad_norm": 2.953125, "kl": 0.003879066207446158, "learning_rate": 3.8445068914759797e-07, "loss": 0.0002, "reward": 0.19957604305818677, "reward_std": 0.5595576763153076, "rewards/reward_func": 0.19957604305818677, "step": 4600 }, { "completion_length": 186.2578125, "epoch": 0.6166198313930149, "grad_norm": 3.0, "kl": 0.004655700788134709, "learning_rate": 3.8338016860698514e-07, "loss": 0.0002, "reward": 0.25618747901171446, "reward_std": 0.5953042004257441, "rewards/reward_func": 0.25618747901171446, "step": 4608 }, { "completion_length": 144.6640625, "epoch": 0.6176903519336278, "grad_norm": 3.703125, "kl": 0.004998624965082854, "learning_rate": 3.8230964806637226e-07, "loss": 0.0002, "reward": 0.5813037822954357, "reward_std": 0.4772760821506381, "rewards/reward_func": 0.5813037822954357, "step": 4616 }, { "completion_length": 197.1015625, "epoch": 0.6187608724742406, "grad_norm": 3.71875, "kl": 0.004184005607385188, "learning_rate": 3.812391275257594e-07, "loss": 0.0002, "reward": 0.07982266321778297, "reward_std": 0.5760688092559576, "rewards/reward_func": 0.07982266321778297, "step": 4624 }, { "completion_length": 174.1796875, "epoch": 0.6198313930148535, "grad_norm": 3.34375, "kl": 0.004289099859306589, "learning_rate": 3.801686069851465e-07, "loss": 0.0002, "reward": 0.24364805966615677, "reward_std": 0.5018207374960184, "rewards/reward_func": 0.24364805966615677, "step": 4632 }, { "completion_length": 164.1640625, "epoch": 0.6209019135554663, "grad_norm": 3.375, "kl": 0.004735152295324951, "learning_rate": 3.7909808644453363e-07, "loss": 0.0002, "reward": 0.34338719584047794, "reward_std": 0.5795671846717596, "rewards/reward_func": 0.34338719584047794, "step": 4640 }, { "completion_length": 133.390625, "epoch": 0.6219724340960792, "grad_norm": 5.25, "kl": 0.006147218053229153, "learning_rate": 3.7802756590392075e-07, "loss": 0.0002, "reward": 0.38445473089814186, "reward_std": 0.49531611800193787, "rewards/reward_func": 0.38445473089814186, "step": 4648 }, { "completion_length": 153.3984375, "epoch": 0.6230429546366921, "grad_norm": 4.34375, "kl": 0.004714462149422616, "learning_rate": 3.769570453633079e-07, "loss": 0.0002, "reward": 0.30648303404450417, "reward_std": 0.4680379256606102, "rewards/reward_func": 0.30648303404450417, "step": 4656 }, { "completion_length": 166.171875, "epoch": 0.624113475177305, "grad_norm": 3.75, "kl": 0.004526323958998546, "learning_rate": 3.75886524822695e-07, "loss": 0.0002, "reward": 0.35779890790581703, "reward_std": 0.6776364631950855, "rewards/reward_func": 0.35779890790581703, "step": 4664 }, { "completion_length": 179.2890625, "epoch": 0.6251839957179178, "grad_norm": 3.46875, "kl": 0.004177290364168584, "learning_rate": 3.7481600428208217e-07, "loss": 0.0002, "reward": 0.30811624182388186, "reward_std": 0.6211207360029221, "rewards/reward_func": 0.30811624182388186, "step": 4672 }, { "completion_length": 182.0234375, "epoch": 0.6262545162585307, "grad_norm": 3.140625, "kl": 0.004683909472078085, "learning_rate": 3.7374548374146924e-07, "loss": 0.0002, "reward": 0.008799735456705093, "reward_std": 0.4996862728148699, "rewards/reward_func": 0.008799735456705093, "step": 4680 }, { "completion_length": 148.359375, "epoch": 0.6273250367991435, "grad_norm": 6.59375, "kl": 0.0059346232446841896, "learning_rate": 3.726749632008564e-07, "loss": 0.0002, "reward": 0.1749492734670639, "reward_std": 0.4117685044184327, "rewards/reward_func": 0.1749492734670639, "step": 4688 }, { "completion_length": 164.515625, "epoch": 0.6283955573397565, "grad_norm": 3.28125, "kl": 0.00426359154516831, "learning_rate": 3.7160444266024353e-07, "loss": 0.0002, "reward": 0.10608149319887161, "reward_std": 0.7313233427703381, "rewards/reward_func": 0.10608149319887161, "step": 4696 }, { "completion_length": 162.1640625, "epoch": 0.6294660778803693, "grad_norm": 3.859375, "kl": 0.004711132904049009, "learning_rate": 3.7053392211963065e-07, "loss": 0.0002, "reward": 0.13499032519757748, "reward_std": 0.6217631548643112, "rewards/reward_func": 0.13499032519757748, "step": 4704 }, { "completion_length": 160.1015625, "epoch": 0.6305365984209822, "grad_norm": 4.5, "kl": 0.004943192150676623, "learning_rate": 3.694634015790178e-07, "loss": 0.0002, "reward": 0.20859276875853539, "reward_std": 0.43620782624930143, "rewards/reward_func": 0.20859276875853539, "step": 4712 }, { "completion_length": 179.40625, "epoch": 0.631607118961595, "grad_norm": 6.46875, "kl": 0.004646303132176399, "learning_rate": 3.6839288103840495e-07, "loss": 0.0002, "reward": 0.20350963808596134, "reward_std": 0.6433871760964394, "rewards/reward_func": 0.20350963808596134, "step": 4720 }, { "completion_length": 176.1953125, "epoch": 0.6326776395022079, "grad_norm": 4.71875, "kl": 0.00438886127085425, "learning_rate": 3.67322360497792e-07, "loss": 0.0002, "reward": 0.36116465739905834, "reward_std": 0.6595458313822746, "rewards/reward_func": 0.36116465739905834, "step": 4728 }, { "completion_length": 132.4765625, "epoch": 0.6337481600428209, "grad_norm": 4.09375, "kl": 0.005916833528317511, "learning_rate": 3.662518399571792e-07, "loss": 0.0002, "reward": 0.5307797193527222, "reward_std": 0.43096242286264896, "rewards/reward_func": 0.5307797193527222, "step": 4736 }, { "completion_length": 163.1953125, "epoch": 0.6348186805834337, "grad_norm": 3.65625, "kl": 0.0043211055162828416, "learning_rate": 3.6518131941656626e-07, "loss": 0.0002, "reward": 0.3800085699185729, "reward_std": 0.6451602801680565, "rewards/reward_func": 0.3800085699185729, "step": 4744 }, { "completion_length": 157.9609375, "epoch": 0.6358892011240466, "grad_norm": 3.671875, "kl": 0.004449906060472131, "learning_rate": 3.6411079887595344e-07, "loss": 0.0002, "reward": 0.17367325257509947, "reward_std": 0.5829105107113719, "rewards/reward_func": 0.17367325257509947, "step": 4752 }, { "completion_length": 159.6953125, "epoch": 0.6369597216646594, "grad_norm": 4.09375, "kl": 0.0045379805960692465, "learning_rate": 3.6304027833534056e-07, "loss": 0.0002, "reward": 0.4868684080429375, "reward_std": 0.4918051455169916, "rewards/reward_func": 0.4868684080429375, "step": 4760 }, { "completion_length": 192.5234375, "epoch": 0.6380302422052723, "grad_norm": 2.984375, "kl": 0.003665678290417418, "learning_rate": 3.619697577947277e-07, "loss": 0.0001, "reward": -0.02173500368371606, "reward_std": 0.5965735167264938, "rewards/reward_func": -0.02173500368371606, "step": 4768 }, { "completion_length": 162.7109375, "epoch": 0.6391007627458852, "grad_norm": 4.0625, "kl": 0.005193614459130913, "learning_rate": 3.608992372541148e-07, "loss": 0.0002, "reward": 0.3424297422170639, "reward_std": 0.519868329167366, "rewards/reward_func": 0.3424297422170639, "step": 4776 }, { "completion_length": 167.8046875, "epoch": 0.6401712832864981, "grad_norm": 4.21875, "kl": 0.004302638117223978, "learning_rate": 3.598287167135019e-07, "loss": 0.0002, "reward": 0.14556433307006955, "reward_std": 0.7161346226930618, "rewards/reward_func": 0.14556433307006955, "step": 4784 }, { "completion_length": 186.265625, "epoch": 0.6412418038271109, "grad_norm": 4.4375, "kl": 0.004954680422088131, "learning_rate": 3.5875819617288904e-07, "loss": 0.0002, "reward": 0.2309811543673277, "reward_std": 0.6467564664781094, "rewards/reward_func": 0.2309811543673277, "step": 4792 }, { "completion_length": 168.7890625, "epoch": 0.6423123243677238, "grad_norm": 4.65625, "kl": 0.0042629605159163475, "learning_rate": 3.576876756322762e-07, "loss": 0.0002, "reward": 0.052162475883960724, "reward_std": 0.5783581472933292, "rewards/reward_func": 0.052162475883960724, "step": 4800 }, { "completion_length": 148.328125, "epoch": 0.6433828449083366, "grad_norm": 3.546875, "kl": 0.005317616189131513, "learning_rate": 3.566171550916633e-07, "loss": 0.0002, "reward": 0.40866485610604286, "reward_std": 0.5010849069803953, "rewards/reward_func": 0.40866485610604286, "step": 4808 }, { "completion_length": 166.921875, "epoch": 0.6444533654489496, "grad_norm": 4.5, "kl": 0.004913818440400064, "learning_rate": 3.5554663455105046e-07, "loss": 0.0002, "reward": 0.2594773005694151, "reward_std": 0.6219961307942867, "rewards/reward_func": 0.2594773005694151, "step": 4816 }, { "completion_length": 186.9921875, "epoch": 0.6455238859895625, "grad_norm": 4.9375, "kl": 0.004318988474551588, "learning_rate": 3.5447611401043753e-07, "loss": 0.0002, "reward": 0.2544400542974472, "reward_std": 0.7044170759618282, "rewards/reward_func": 0.2544400542974472, "step": 4824 }, { "completion_length": 181.1953125, "epoch": 0.6465944065301753, "grad_norm": 2.625, "kl": 0.00468209947575815, "learning_rate": 3.534055934698247e-07, "loss": 0.0002, "reward": 0.19006637297570705, "reward_std": 0.4856133693829179, "rewards/reward_func": 0.19006637297570705, "step": 4832 }, { "completion_length": 148.046875, "epoch": 0.6476649270707882, "grad_norm": 4.5625, "kl": 0.00532404551631771, "learning_rate": 3.5233507292921183e-07, "loss": 0.0002, "reward": 0.5334251541644335, "reward_std": 0.469427278265357, "rewards/reward_func": 0.5334251541644335, "step": 4840 }, { "completion_length": 156.171875, "epoch": 0.648735447611401, "grad_norm": 5.0625, "kl": 0.005237195349764079, "learning_rate": 3.5126455238859895e-07, "loss": 0.0002, "reward": 0.33213027007877827, "reward_std": 0.43815805949270725, "rewards/reward_func": 0.33213027007877827, "step": 4848 }, { "completion_length": 124.3359375, "epoch": 0.649805968152014, "grad_norm": 3.890625, "kl": 0.00520428063464351, "learning_rate": 3.5019403184798607e-07, "loss": 0.0002, "reward": 0.6764433234930038, "reward_std": 0.30982979480177164, "rewards/reward_func": 0.6764433234930038, "step": 4856 }, { "completion_length": 148.2421875, "epoch": 0.6508764886926268, "grad_norm": 3.9375, "kl": 0.005578657321166247, "learning_rate": 3.4912351130737324e-07, "loss": 0.0002, "reward": 0.10781971551477909, "reward_std": 0.4713937286287546, "rewards/reward_func": 0.10781971551477909, "step": 4864 }, { "completion_length": 171.9453125, "epoch": 0.6519470092332397, "grad_norm": 7.0625, "kl": 0.004915560552035458, "learning_rate": 3.480529907667603e-07, "loss": 0.0002, "reward": 0.25978787057101727, "reward_std": 0.534579697996378, "rewards/reward_func": 0.25978787057101727, "step": 4872 }, { "completion_length": 173.890625, "epoch": 0.6530175297738525, "grad_norm": 3.578125, "kl": 0.004566041403450072, "learning_rate": 3.469824702261475e-07, "loss": 0.0002, "reward": 0.01513567566871643, "reward_std": 0.39855797588825226, "rewards/reward_func": 0.01513567566871643, "step": 4880 }, { "completion_length": 155.9921875, "epoch": 0.6540880503144654, "grad_norm": 2.90625, "kl": 0.005055926798377186, "learning_rate": 3.4591194968553456e-07, "loss": 0.0002, "reward": 0.29762477427721024, "reward_std": 0.4921109788119793, "rewards/reward_func": 0.29762477427721024, "step": 4888 }, { "completion_length": 183.1875, "epoch": 0.6551585708550783, "grad_norm": 3.1875, "kl": 0.0038829974364489317, "learning_rate": 3.448414291449217e-07, "loss": 0.0002, "reward": 0.15376039780676365, "reward_std": 0.6010817158967257, "rewards/reward_func": 0.15376039780676365, "step": 4896 }, { "completion_length": 143.6640625, "epoch": 0.6562290913956912, "grad_norm": 4.25, "kl": 0.005372069776058197, "learning_rate": 3.4377090860430885e-07, "loss": 0.0002, "reward": 0.45785857178270817, "reward_std": 0.5780720338225365, "rewards/reward_func": 0.45785857178270817, "step": 4904 }, { "completion_length": 170.703125, "epoch": 0.657299611936304, "grad_norm": 3.5, "kl": 0.004123226040974259, "learning_rate": 3.427003880636959e-07, "loss": 0.0002, "reward": 0.3207322843372822, "reward_std": 0.5435153748840094, "rewards/reward_func": 0.3207322843372822, "step": 4912 }, { "completion_length": 170.8515625, "epoch": 0.6583701324769169, "grad_norm": 2.296875, "kl": 0.004442213830770925, "learning_rate": 3.416298675230831e-07, "loss": 0.0002, "reward": 0.2148810252547264, "reward_std": 0.3938889876008034, "rewards/reward_func": 0.2148810252547264, "step": 4920 }, { "completion_length": 172.0546875, "epoch": 0.6594406530175297, "grad_norm": 3.515625, "kl": 0.004354791803052649, "learning_rate": 3.4055934698247016e-07, "loss": 0.0002, "reward": 0.1703500747680664, "reward_std": 0.6057061813771725, "rewards/reward_func": 0.1703500747680664, "step": 4928 }, { "completion_length": 175.7421875, "epoch": 0.6605111735581426, "grad_norm": 3.875, "kl": 0.003823021659627557, "learning_rate": 3.3948882644185734e-07, "loss": 0.0002, "reward": 0.4399567134678364, "reward_std": 0.2992268856614828, "rewards/reward_func": 0.4399567134678364, "step": 4936 }, { "completion_length": 168.5390625, "epoch": 0.6615816940987556, "grad_norm": 4.90625, "kl": 0.0046115216973703355, "learning_rate": 3.3841830590124446e-07, "loss": 0.0002, "reward": 0.21755497064441442, "reward_std": 0.6609915122389793, "rewards/reward_func": 0.21755497064441442, "step": 4944 }, { "completion_length": 152.484375, "epoch": 0.6626522146393684, "grad_norm": 4.125, "kl": 0.004345663794083521, "learning_rate": 3.373477853606316e-07, "loss": 0.0002, "reward": 0.5310599412769079, "reward_std": 0.5352654401212931, "rewards/reward_func": 0.5310599412769079, "step": 4952 }, { "completion_length": 147.4140625, "epoch": 0.6637227351799813, "grad_norm": 4.28125, "kl": 0.004903295426629484, "learning_rate": 3.362772648200187e-07, "loss": 0.0002, "reward": 0.47527459636330605, "reward_std": 0.4394548684358597, "rewards/reward_func": 0.47527459636330605, "step": 4960 }, { "completion_length": 128.90625, "epoch": 0.6647932557205941, "grad_norm": 4.59375, "kl": 0.006024273345246911, "learning_rate": 3.352067442794059e-07, "loss": 0.0002, "reward": 0.2654110789299011, "reward_std": 0.5651892945170403, "rewards/reward_func": 0.2654110789299011, "step": 4968 }, { "completion_length": 148.0546875, "epoch": 0.665863776261207, "grad_norm": 3.53125, "kl": 0.004379941092338413, "learning_rate": 3.3413622373879295e-07, "loss": 0.0002, "reward": 0.5237122774124146, "reward_std": 0.5490029491484165, "rewards/reward_func": 0.5237122774124146, "step": 4976 }, { "completion_length": 191.8046875, "epoch": 0.6669342968018199, "grad_norm": 3.09375, "kl": 0.004273373109754175, "learning_rate": 3.330657031981801e-07, "loss": 0.0002, "reward": 0.47103837318718433, "reward_std": 0.45740975998342037, "rewards/reward_func": 0.47103837318718433, "step": 4984 }, { "completion_length": 173.75, "epoch": 0.6680048173424328, "grad_norm": 2.796875, "kl": 0.004224992386298254, "learning_rate": 3.319951826575672e-07, "loss": 0.0002, "reward": 0.33773920126259327, "reward_std": 0.6048417650163174, "rewards/reward_func": 0.33773920126259327, "step": 4992 }, { "completion_length": 150.1015625, "epoch": 0.6690753378830456, "grad_norm": 5.6875, "kl": 0.004986172774806619, "learning_rate": 3.3092466211695436e-07, "loss": 0.0002, "reward": 0.26249578036367893, "reward_std": 0.49397554993629456, "rewards/reward_func": 0.26249578036367893, "step": 5000 }, { "completion_length": 168.59375, "epoch": 0.6701458584236585, "grad_norm": 3.59375, "kl": 0.00485606407164596, "learning_rate": 3.298541415763415e-07, "loss": 0.0002, "reward": 0.1886943932622671, "reward_std": 0.6403144299983978, "rewards/reward_func": 0.1886943932622671, "step": 5008 }, { "completion_length": 169.3125, "epoch": 0.6712163789642713, "grad_norm": 4.3125, "kl": 0.004554765066131949, "learning_rate": 3.287836210357286e-07, "loss": 0.0002, "reward": 0.20580013655126095, "reward_std": 0.5662092342972755, "rewards/reward_func": 0.20580013655126095, "step": 5016 }, { "completion_length": 159.5390625, "epoch": 0.6722868995048843, "grad_norm": 5.46875, "kl": 0.004217549023451284, "learning_rate": 3.2771310049511573e-07, "loss": 0.0002, "reward": 0.4258319865912199, "reward_std": 0.5163000021129847, "rewards/reward_func": 0.4258319865912199, "step": 5024 }, { "completion_length": 171.4140625, "epoch": 0.6733574200454971, "grad_norm": 3.53125, "kl": 0.004711526213213801, "learning_rate": 3.2664257995450285e-07, "loss": 0.0002, "reward": 0.12669032951816916, "reward_std": 0.6522959657013416, "rewards/reward_func": 0.12669032951816916, "step": 5032 }, { "completion_length": 143.515625, "epoch": 0.67442794058611, "grad_norm": 3.6875, "kl": 0.005328927683876827, "learning_rate": 3.2557205941388997e-07, "loss": 0.0002, "reward": 0.25894895382225513, "reward_std": 0.6157816741615534, "rewards/reward_func": 0.25894895382225513, "step": 5040 }, { "completion_length": 169.09375, "epoch": 0.6754984611267228, "grad_norm": 4.15625, "kl": 0.004591718839947134, "learning_rate": 3.2450153887327715e-07, "loss": 0.0002, "reward": 0.1223931759595871, "reward_std": 0.7310500293970108, "rewards/reward_func": 0.1223931759595871, "step": 5048 }, { "completion_length": 158.8984375, "epoch": 0.6765689816673357, "grad_norm": 4.375, "kl": 0.00482406112132594, "learning_rate": 3.234310183326642e-07, "loss": 0.0002, "reward": 0.30884232465177774, "reward_std": 0.5993989063426852, "rewards/reward_func": 0.30884232465177774, "step": 5056 }, { "completion_length": 153.9921875, "epoch": 0.6776395022079487, "grad_norm": 6.84375, "kl": 0.0044682007865048945, "learning_rate": 3.223604977920514e-07, "loss": 0.0002, "reward": 0.23793572932481766, "reward_std": 0.47554378490895033, "rewards/reward_func": 0.23793572932481766, "step": 5064 }, { "completion_length": 171.5390625, "epoch": 0.6787100227485615, "grad_norm": 6.90625, "kl": 0.0044736934069078416, "learning_rate": 3.2128997725143846e-07, "loss": 0.0002, "reward": 0.37867590319365263, "reward_std": 0.49610742926597595, "rewards/reward_func": 0.37867590319365263, "step": 5072 }, { "completion_length": 148.2890625, "epoch": 0.6797805432891744, "grad_norm": 4.625, "kl": 0.004754859022796154, "learning_rate": 3.2021945671082563e-07, "loss": 0.0002, "reward": 0.517847141250968, "reward_std": 0.5063638836145401, "rewards/reward_func": 0.517847141250968, "step": 5080 }, { "completion_length": 156.71875, "epoch": 0.6808510638297872, "grad_norm": 5.59375, "kl": 0.005674656480550766, "learning_rate": 3.1914893617021275e-07, "loss": 0.0002, "reward": 0.34883139841258526, "reward_std": 0.33303822576999664, "rewards/reward_func": 0.34883139841258526, "step": 5088 }, { "completion_length": 178.890625, "epoch": 0.6819215843704001, "grad_norm": 3.40625, "kl": 0.0046264427655842155, "learning_rate": 3.180784156295999e-07, "loss": 0.0002, "reward": 0.47927757538855076, "reward_std": 0.5571104716509581, "rewards/reward_func": 0.47927757538855076, "step": 5096 }, { "completion_length": 144.3359375, "epoch": 0.682992104911013, "grad_norm": 3.890625, "kl": 0.004523319890722632, "learning_rate": 3.17007895088987e-07, "loss": 0.0002, "reward": 0.34390855580568314, "reward_std": 0.5760727934539318, "rewards/reward_func": 0.34390855580568314, "step": 5104 }, { "completion_length": 159.6484375, "epoch": 0.6840626254516259, "grad_norm": 4.65625, "kl": 0.004729281121399254, "learning_rate": 3.1593737454837417e-07, "loss": 0.0002, "reward": 0.38299885392189026, "reward_std": 0.3037977972999215, "rewards/reward_func": 0.38299885392189026, "step": 5112 }, { "completion_length": 150.53125, "epoch": 0.6851331459922387, "grad_norm": 2.96875, "kl": 0.005811055249068886, "learning_rate": 3.1486685400776124e-07, "loss": 0.0002, "reward": 0.4124348498880863, "reward_std": 0.5133458133786917, "rewards/reward_func": 0.4124348498880863, "step": 5120 }, { "completion_length": 146.6875, "epoch": 0.6862036665328516, "grad_norm": 5.15625, "kl": 0.004858777509070933, "learning_rate": 3.137963334671484e-07, "loss": 0.0002, "reward": 0.1230292096734047, "reward_std": 0.4463986298069358, "rewards/reward_func": 0.1230292096734047, "step": 5128 }, { "completion_length": 160.390625, "epoch": 0.6872741870734644, "grad_norm": 2.96875, "kl": 0.004541641887044534, "learning_rate": 3.127258129265355e-07, "loss": 0.0002, "reward": 0.05217524245381355, "reward_std": 0.45026756450533867, "rewards/reward_func": 0.05217524245381355, "step": 5136 }, { "completion_length": 144.0703125, "epoch": 0.6883447076140774, "grad_norm": 8.6875, "kl": 0.005810694128740579, "learning_rate": 3.1165529238592266e-07, "loss": 0.0002, "reward": 0.31892623007297516, "reward_std": 0.4961309377104044, "rewards/reward_func": 0.31892623007297516, "step": 5144 }, { "completion_length": 202.375, "epoch": 0.6894152281546903, "grad_norm": 3.125, "kl": 0.004103525396203622, "learning_rate": 3.105847718453098e-07, "loss": 0.0002, "reward": 0.35768837202340364, "reward_std": 0.5502582993358374, "rewards/reward_func": 0.35768837202340364, "step": 5152 }, { "completion_length": 173.1484375, "epoch": 0.6904857486953031, "grad_norm": 3.40625, "kl": 0.004345653491327539, "learning_rate": 3.095142513046969e-07, "loss": 0.0002, "reward": 0.30987947806715965, "reward_std": 0.5077685210853815, "rewards/reward_func": 0.30987947806715965, "step": 5160 }, { "completion_length": 176.890625, "epoch": 0.691556269235916, "grad_norm": 3.515625, "kl": 0.0047625836450606585, "learning_rate": 3.08443730764084e-07, "loss": 0.0002, "reward": 0.37025075126439333, "reward_std": 0.47811376582831144, "rewards/reward_func": 0.37025075126439333, "step": 5168 }, { "completion_length": 156.3125, "epoch": 0.6926267897765288, "grad_norm": 3.375, "kl": 0.004461723641725257, "learning_rate": 3.0737321022347114e-07, "loss": 0.0002, "reward": 0.4771025739610195, "reward_std": 0.4133305884897709, "rewards/reward_func": 0.4771025739610195, "step": 5176 }, { "completion_length": 175.3359375, "epoch": 0.6936973103171417, "grad_norm": 3.71875, "kl": 0.004555776889901608, "learning_rate": 3.0630268968285827e-07, "loss": 0.0002, "reward": 0.21876542083919048, "reward_std": 0.5979834999889135, "rewards/reward_func": 0.21876542083919048, "step": 5184 }, { "completion_length": 143.2734375, "epoch": 0.6947678308577546, "grad_norm": 4.1875, "kl": 0.006049849558621645, "learning_rate": 3.0523216914224544e-07, "loss": 0.0002, "reward": 0.3804114758968353, "reward_std": 0.43962680641561747, "rewards/reward_func": 0.3804114758968353, "step": 5192 }, { "completion_length": 176.09375, "epoch": 0.6958383513983675, "grad_norm": 4.1875, "kl": 0.004219004331389442, "learning_rate": 3.041616486016325e-07, "loss": 0.0002, "reward": 0.033823274075984955, "reward_std": 0.5529468916356564, "rewards/reward_func": 0.033823274075984955, "step": 5200 }, { "completion_length": 167.1328125, "epoch": 0.6969088719389803, "grad_norm": 3.453125, "kl": 0.0044156058866064996, "learning_rate": 3.030911280610197e-07, "loss": 0.0002, "reward": 0.35997615940868855, "reward_std": 0.6205689832568169, "rewards/reward_func": 0.35997615940868855, "step": 5208 }, { "completion_length": 134.3515625, "epoch": 0.6979793924795932, "grad_norm": 6.03125, "kl": 0.005597625044174492, "learning_rate": 3.020206075204068e-07, "loss": 0.0002, "reward": 0.5491457581520081, "reward_std": 0.5092198746278882, "rewards/reward_func": 0.5491457581520081, "step": 5216 }, { "completion_length": 161.3515625, "epoch": 0.699049913020206, "grad_norm": 2.984375, "kl": 0.005356652429327369, "learning_rate": 3.009500869797939e-07, "loss": 0.0002, "reward": 0.4664277071133256, "reward_std": 0.5567853916436434, "rewards/reward_func": 0.4664277071133256, "step": 5224 }, { "completion_length": 169.203125, "epoch": 0.700120433560819, "grad_norm": 4.1875, "kl": 0.0043607138795778155, "learning_rate": 2.9987956643918105e-07, "loss": 0.0002, "reward": 0.34521481581032276, "reward_std": 0.6393520161509514, "rewards/reward_func": 0.34521481581032276, "step": 5232 }, { "completion_length": 176.28125, "epoch": 0.7011909541014318, "grad_norm": 3.921875, "kl": 0.004437842464540154, "learning_rate": 2.9880904589856817e-07, "loss": 0.0002, "reward": -0.07886990532279015, "reward_std": 0.6460573114454746, "rewards/reward_func": -0.07886990532279015, "step": 5240 }, { "completion_length": 212.4375, "epoch": 0.7022614746420447, "grad_norm": 3.640625, "kl": 0.004008949821582064, "learning_rate": 2.977385253579553e-07, "loss": 0.0002, "reward": 0.012970509007573128, "reward_std": 0.5811912510544062, "rewards/reward_func": 0.012970509007573128, "step": 5248 }, { "completion_length": 182.1328125, "epoch": 0.7033319951826575, "grad_norm": 4.4375, "kl": 0.004463888035388663, "learning_rate": 2.9666800481734247e-07, "loss": 0.0002, "reward": 0.295044606551528, "reward_std": 0.5268499422818422, "rewards/reward_func": 0.295044606551528, "step": 5256 }, { "completion_length": 158.8515625, "epoch": 0.7044025157232704, "grad_norm": 3.796875, "kl": 0.005019562435336411, "learning_rate": 2.9559748427672953e-07, "loss": 0.0002, "reward": 0.2960619358345866, "reward_std": 0.5362240988761187, "rewards/reward_func": 0.2960619358345866, "step": 5264 }, { "completion_length": 158.328125, "epoch": 0.7054730362638834, "grad_norm": 4.40625, "kl": 0.004566931165754795, "learning_rate": 2.945269637361167e-07, "loss": 0.0002, "reward": 0.5046307481825352, "reward_std": 0.45363772846758366, "rewards/reward_func": 0.5046307481825352, "step": 5272 }, { "completion_length": 184.515625, "epoch": 0.7065435568044962, "grad_norm": 7.125, "kl": 0.004289998818421736, "learning_rate": 2.934564431955038e-07, "loss": 0.0002, "reward": 0.4870417043566704, "reward_std": 0.4741673758253455, "rewards/reward_func": 0.4870417043566704, "step": 5280 }, { "completion_length": 161.359375, "epoch": 0.7076140773451091, "grad_norm": 5.53125, "kl": 0.0042855191277340055, "learning_rate": 2.9238592265489095e-07, "loss": 0.0002, "reward": 0.37416786467656493, "reward_std": 0.5148907378315926, "rewards/reward_func": 0.37416786467656493, "step": 5288 }, { "completion_length": 159.1484375, "epoch": 0.7086845978857219, "grad_norm": 4.03125, "kl": 0.005468921910505742, "learning_rate": 2.9131540211427807e-07, "loss": 0.0002, "reward": 0.2032206254079938, "reward_std": 0.5835869964212179, "rewards/reward_func": 0.2032206254079938, "step": 5296 }, { "completion_length": 178.046875, "epoch": 0.7097551184263348, "grad_norm": 5.46875, "kl": 0.004879669373622164, "learning_rate": 2.9024488157366514e-07, "loss": 0.0002, "reward": 0.07407154329121113, "reward_std": 0.5483472738415003, "rewards/reward_func": 0.07407154329121113, "step": 5304 }, { "completion_length": 172.4296875, "epoch": 0.7108256389669477, "grad_norm": 3.078125, "kl": 0.004936008132062852, "learning_rate": 2.891743610330523e-07, "loss": 0.0002, "reward": 0.1570496652275324, "reward_std": 0.6552108749747276, "rewards/reward_func": 0.1570496652275324, "step": 5312 }, { "completion_length": 183.265625, "epoch": 0.7118961595075606, "grad_norm": 3.15625, "kl": 0.004192218388197944, "learning_rate": 2.881038404924394e-07, "loss": 0.0002, "reward": 0.2290868228301406, "reward_std": 0.6626240387558937, "rewards/reward_func": 0.2290868228301406, "step": 5320 }, { "completion_length": 129.171875, "epoch": 0.7129666800481734, "grad_norm": 3.859375, "kl": 0.00642680426244624, "learning_rate": 2.8703331995182656e-07, "loss": 0.0003, "reward": 0.395254772156477, "reward_std": 0.4721956867724657, "rewards/reward_func": 0.395254772156477, "step": 5328 }, { "completion_length": 157.6640625, "epoch": 0.7140372005887863, "grad_norm": 3.234375, "kl": 0.004553045437205583, "learning_rate": 2.859627994112137e-07, "loss": 0.0002, "reward": 0.44277836102992296, "reward_std": 0.5288186706602573, "rewards/reward_func": 0.44277836102992296, "step": 5336 }, { "completion_length": 174.7578125, "epoch": 0.7151077211293991, "grad_norm": 3.578125, "kl": 0.004835324827581644, "learning_rate": 2.848922788706008e-07, "loss": 0.0002, "reward": 0.2129112258553505, "reward_std": 0.518157972022891, "rewards/reward_func": 0.2129112258553505, "step": 5344 }, { "completion_length": 183.0625, "epoch": 0.7161782416700121, "grad_norm": 4.40625, "kl": 0.004699339595390484, "learning_rate": 2.838217583299879e-07, "loss": 0.0002, "reward": -0.12015869608148932, "reward_std": 0.6651497483253479, "rewards/reward_func": -0.12015869608148932, "step": 5352 }, { "completion_length": 167.125, "epoch": 0.717248762210625, "grad_norm": 3.8125, "kl": 0.004674197465647012, "learning_rate": 2.827512377893751e-07, "loss": 0.0002, "reward": 0.08481440320611, "reward_std": 0.6426707338541746, "rewards/reward_func": 0.08481440320611, "step": 5360 }, { "completion_length": 187.109375, "epoch": 0.7183192827512378, "grad_norm": 3.359375, "kl": 0.004370440146885812, "learning_rate": 2.8168071724876217e-07, "loss": 0.0002, "reward": 0.10261328518390656, "reward_std": 0.33446657191962004, "rewards/reward_func": 0.10261328518390656, "step": 5368 }, { "completion_length": 169.8828125, "epoch": 0.7193898032918506, "grad_norm": 4.46875, "kl": 0.005003685160772875, "learning_rate": 2.8061019670814934e-07, "loss": 0.0002, "reward": 0.35325085651129484, "reward_std": 0.5368635784834623, "rewards/reward_func": 0.35325085651129484, "step": 5376 }, { "completion_length": 127.6328125, "epoch": 0.7204603238324635, "grad_norm": 5.0, "kl": 0.006041952816303819, "learning_rate": 2.795396761675364e-07, "loss": 0.0002, "reward": 0.31289495434612036, "reward_std": 0.5351240076124668, "rewards/reward_func": 0.31289495434612036, "step": 5384 }, { "completion_length": 132.1484375, "epoch": 0.7215308443730765, "grad_norm": 3.890625, "kl": 0.004856948013184592, "learning_rate": 2.784691556269236e-07, "loss": 0.0002, "reward": 0.5194568559527397, "reward_std": 0.4919391795992851, "rewards/reward_func": 0.5194568559527397, "step": 5392 }, { "completion_length": 172.6796875, "epoch": 0.7226013649136893, "grad_norm": 3.71875, "kl": 0.004867620766162872, "learning_rate": 2.773986350863107e-07, "loss": 0.0002, "reward": 0.1840124912559986, "reward_std": 0.6040789932012558, "rewards/reward_func": 0.1840124912559986, "step": 5400 }, { "completion_length": 162.0078125, "epoch": 0.7236718854543022, "grad_norm": 5.4375, "kl": 0.0043890359229408205, "learning_rate": 2.7632811454569783e-07, "loss": 0.0002, "reward": 0.40340816229581833, "reward_std": 0.46000672224909067, "rewards/reward_func": 0.40340816229581833, "step": 5408 }, { "completion_length": 178.796875, "epoch": 0.724742405994915, "grad_norm": 3.671875, "kl": 0.004453314671991393, "learning_rate": 2.7525759400508495e-07, "loss": 0.0002, "reward": 0.1911243163049221, "reward_std": 0.5930454572662711, "rewards/reward_func": 0.1911243163049221, "step": 5416 }, { "completion_length": 166.296875, "epoch": 0.7258129265355279, "grad_norm": 3.890625, "kl": 0.004950450966134667, "learning_rate": 2.7418707346447207e-07, "loss": 0.0002, "reward": 0.2432717476040125, "reward_std": 0.4679036773741245, "rewards/reward_func": 0.2432717476040125, "step": 5424 }, { "completion_length": 204.453125, "epoch": 0.7268834470761407, "grad_norm": 3.6875, "kl": 0.004144096135860309, "learning_rate": 2.731165529238592e-07, "loss": 0.0002, "reward": -0.028832857497036457, "reward_std": 0.5423443503677845, "rewards/reward_func": -0.028832857497036457, "step": 5432 }, { "completion_length": 144.3671875, "epoch": 0.7279539676167537, "grad_norm": 5.46875, "kl": 0.0068962293735239655, "learning_rate": 2.7204603238324637e-07, "loss": 0.0003, "reward": 0.5656752809882164, "reward_std": 0.37680432945489883, "rewards/reward_func": 0.5656752809882164, "step": 5440 }, { "completion_length": 156.8203125, "epoch": 0.7290244881573665, "grad_norm": 3.21875, "kl": 0.005207971204072237, "learning_rate": 2.7097551184263344e-07, "loss": 0.0002, "reward": 0.46704378351569176, "reward_std": 0.4321159301325679, "rewards/reward_func": 0.46704378351569176, "step": 5448 }, { "completion_length": 170.1953125, "epoch": 0.7300950086979794, "grad_norm": 4.28125, "kl": 0.004335955512942746, "learning_rate": 2.699049913020206e-07, "loss": 0.0002, "reward": 0.15294395573437214, "reward_std": 0.6653651669621468, "rewards/reward_func": 0.15294395573437214, "step": 5456 }, { "completion_length": 149.6015625, "epoch": 0.7311655292385922, "grad_norm": 3.90625, "kl": 0.005089007405331358, "learning_rate": 2.6883447076140773e-07, "loss": 0.0002, "reward": 0.2426714487373829, "reward_std": 0.48969776928424835, "rewards/reward_func": 0.2426714487373829, "step": 5464 }, { "completion_length": 171.625, "epoch": 0.7322360497792051, "grad_norm": 3.03125, "kl": 0.004394051560666412, "learning_rate": 2.6776395022079485e-07, "loss": 0.0002, "reward": 0.10768201760947704, "reward_std": 0.4550578175112605, "rewards/reward_func": 0.10768201760947704, "step": 5472 }, { "completion_length": 159.9609375, "epoch": 0.733306570319818, "grad_norm": 3.59375, "kl": 0.005018858646508306, "learning_rate": 2.66693429680182e-07, "loss": 0.0002, "reward": 0.2529556443914771, "reward_std": 0.6179038770496845, "rewards/reward_func": 0.2529556443914771, "step": 5480 }, { "completion_length": 154.8125, "epoch": 0.7343770908604309, "grad_norm": 5.0, "kl": 0.005219785525696352, "learning_rate": 2.656229091395691e-07, "loss": 0.0002, "reward": 0.3117452962324023, "reward_std": 0.5476666176691651, "rewards/reward_func": 0.3117452962324023, "step": 5488 }, { "completion_length": 140.671875, "epoch": 0.7354476114010438, "grad_norm": 4.78125, "kl": 0.006368768343236297, "learning_rate": 2.645523885989562e-07, "loss": 0.0003, "reward": 0.5569799374789, "reward_std": 0.4139596875756979, "rewards/reward_func": 0.5569799374789, "step": 5496 }, { "completion_length": 208.4375, "epoch": 0.7365181319416566, "grad_norm": 3.171875, "kl": 0.004221481096465141, "learning_rate": 2.634818680583434e-07, "loss": 0.0002, "reward": 0.13646352104842663, "reward_std": 0.674302838742733, "rewards/reward_func": 0.13646352104842663, "step": 5504 }, { "completion_length": 177.7578125, "epoch": 0.7375886524822695, "grad_norm": 4.09375, "kl": 0.004927775065880269, "learning_rate": 2.6241134751773046e-07, "loss": 0.0002, "reward": 0.179019657894969, "reward_std": 0.4836566299200058, "rewards/reward_func": 0.179019657894969, "step": 5512 }, { "completion_length": 168.359375, "epoch": 0.7386591730228824, "grad_norm": 3.71875, "kl": 0.004563187627354637, "learning_rate": 2.6134082697711764e-07, "loss": 0.0002, "reward": 0.0944369975477457, "reward_std": 0.6901743151247501, "rewards/reward_func": 0.0944369975477457, "step": 5520 }, { "completion_length": 141.9765625, "epoch": 0.7397296935634953, "grad_norm": 4.5, "kl": 0.005385736672906205, "learning_rate": 2.602703064365047e-07, "loss": 0.0002, "reward": 0.3004543990828097, "reward_std": 0.6421327739953995, "rewards/reward_func": 0.3004543990828097, "step": 5528 }, { "completion_length": 161.1015625, "epoch": 0.7408002141041081, "grad_norm": 4.53125, "kl": 0.005307289626216516, "learning_rate": 2.591997858958919e-07, "loss": 0.0002, "reward": 0.4302559047937393, "reward_std": 0.296412231400609, "rewards/reward_func": 0.4302559047937393, "step": 5536 }, { "completion_length": 160.15625, "epoch": 0.741870734644721, "grad_norm": 5.21875, "kl": 0.00511023830040358, "learning_rate": 2.58129265355279e-07, "loss": 0.0002, "reward": 0.39244108088314533, "reward_std": 0.5584999155253172, "rewards/reward_func": 0.39244108088314533, "step": 5544 }, { "completion_length": 161.0546875, "epoch": 0.7429412551853338, "grad_norm": 3.46875, "kl": 0.004729041800601408, "learning_rate": 2.570587448146661e-07, "loss": 0.0002, "reward": 0.30113553907722235, "reward_std": 0.6211994774639606, "rewards/reward_func": 0.30113553907722235, "step": 5552 }, { "completion_length": 176.578125, "epoch": 0.7440117757259468, "grad_norm": 3.28125, "kl": 0.004635761812096462, "learning_rate": 2.5598822427405324e-07, "loss": 0.0002, "reward": 0.3027530014514923, "reward_std": 0.34483792912214994, "rewards/reward_func": 0.3027530014514923, "step": 5560 }, { "completion_length": 159.375, "epoch": 0.7450822962665596, "grad_norm": 5.53125, "kl": 0.005176402977667749, "learning_rate": 2.5491770373344036e-07, "loss": 0.0002, "reward": 0.17298301681876183, "reward_std": 0.584480419754982, "rewards/reward_func": 0.17298301681876183, "step": 5568 }, { "completion_length": 157.3671875, "epoch": 0.7461528168071725, "grad_norm": 6.5625, "kl": 0.005517173325642943, "learning_rate": 2.538471831928275e-07, "loss": 0.0002, "reward": 0.17840459011495113, "reward_std": 0.6545839756727219, "rewards/reward_func": 0.17840459011495113, "step": 5576 }, { "completion_length": 163.765625, "epoch": 0.7472233373477853, "grad_norm": 3.296875, "kl": 0.005615679023321718, "learning_rate": 2.5277666265221466e-07, "loss": 0.0002, "reward": 0.3282418688759208, "reward_std": 0.4674977771937847, "rewards/reward_func": 0.3282418688759208, "step": 5584 }, { "completion_length": 199.1171875, "epoch": 0.7482938578883982, "grad_norm": 4.96875, "kl": 0.004132435307838023, "learning_rate": 2.5170614211160173e-07, "loss": 0.0002, "reward": 0.041125981137156487, "reward_std": 0.5962537340819836, "rewards/reward_func": 0.041125981137156487, "step": 5592 }, { "completion_length": 195.3828125, "epoch": 0.7493643784290112, "grad_norm": 4.0625, "kl": 0.003978644759627059, "learning_rate": 2.506356215709889e-07, "loss": 0.0002, "reward": 0.13140291906893253, "reward_std": 0.44777560979127884, "rewards/reward_func": 0.13140291906893253, "step": 5600 }, { "completion_length": 173.609375, "epoch": 0.750434898969624, "grad_norm": 2.796875, "kl": 0.004251972888596356, "learning_rate": 2.49565101030376e-07, "loss": 0.0002, "reward": 0.20013932138681412, "reward_std": 0.6238753385841846, "rewards/reward_func": 0.20013932138681412, "step": 5608 }, { "completion_length": 165.1328125, "epoch": 0.7515054195102369, "grad_norm": 7.53125, "kl": 0.004290038690669462, "learning_rate": 2.4849458048976315e-07, "loss": 0.0002, "reward": 0.2280603777617216, "reward_std": 0.4963626991957426, "rewards/reward_func": 0.2280603777617216, "step": 5616 }, { "completion_length": 141.671875, "epoch": 0.7525759400508497, "grad_norm": 4.46875, "kl": 0.00585965282516554, "learning_rate": 2.4742405994915027e-07, "loss": 0.0002, "reward": 0.4678545705974102, "reward_std": 0.43725813180208206, "rewards/reward_func": 0.4678545705974102, "step": 5624 }, { "completion_length": 160.421875, "epoch": 0.7536464605914626, "grad_norm": 4.96875, "kl": 0.005591863940935582, "learning_rate": 2.463535394085374e-07, "loss": 0.0002, "reward": 0.24596700817346573, "reward_std": 0.4220298836007714, "rewards/reward_func": 0.24596700817346573, "step": 5632 }, { "completion_length": 156.6171875, "epoch": 0.7547169811320755, "grad_norm": 3.171875, "kl": 0.004576119041303173, "learning_rate": 2.452830188679245e-07, "loss": 0.0002, "reward": 0.3924466483294964, "reward_std": 0.6098343282938004, "rewards/reward_func": 0.3924466483294964, "step": 5640 }, { "completion_length": 176.890625, "epoch": 0.7557875016726884, "grad_norm": 3.4375, "kl": 0.003426549636060372, "learning_rate": 2.4421249832731163e-07, "loss": 0.0001, "reward": 0.31396659277379513, "reward_std": 0.507732754573226, "rewards/reward_func": 0.31396659277379513, "step": 5648 }, { "completion_length": 155.109375, "epoch": 0.7568580222133012, "grad_norm": 5.0625, "kl": 0.004432518238900229, "learning_rate": 2.4314197778669875e-07, "loss": 0.0002, "reward": 0.3897492587566376, "reward_std": 0.472976541146636, "rewards/reward_func": 0.3897492587566376, "step": 5656 }, { "completion_length": 178.7421875, "epoch": 0.7579285427539141, "grad_norm": 1.96875, "kl": 0.004251753707649186, "learning_rate": 2.4207145724608593e-07, "loss": 0.0002, "reward": 0.08406687900424004, "reward_std": 0.4810841968283057, "rewards/reward_func": 0.08406687900424004, "step": 5664 }, { "completion_length": 166.5859375, "epoch": 0.7589990632945269, "grad_norm": 4.875, "kl": 0.005223593441769481, "learning_rate": 2.4100093670547305e-07, "loss": 0.0002, "reward": 0.3817774336785078, "reward_std": 0.6594663038849831, "rewards/reward_func": 0.3817774336785078, "step": 5672 }, { "completion_length": 161.3671875, "epoch": 0.7600695838351398, "grad_norm": 5.625, "kl": 0.0044474324968177825, "learning_rate": 2.3993041616486017e-07, "loss": 0.0002, "reward": 0.23454780131578445, "reward_std": 0.37179601565003395, "rewards/reward_func": 0.23454780131578445, "step": 5680 }, { "completion_length": 179.796875, "epoch": 0.7611401043757527, "grad_norm": 3.609375, "kl": 0.00496278639184311, "learning_rate": 2.388598956242473e-07, "loss": 0.0002, "reward": 0.10904507525265217, "reward_std": 0.5533247627317905, "rewards/reward_func": 0.10904507525265217, "step": 5688 }, { "completion_length": 160.4296875, "epoch": 0.7622106249163656, "grad_norm": 4.53125, "kl": 0.005624369368888438, "learning_rate": 2.3778937508363441e-07, "loss": 0.0002, "reward": 0.32307033240795135, "reward_std": 0.3578721797093749, "rewards/reward_func": 0.32307033240795135, "step": 5696 }, { "completion_length": 181.2578125, "epoch": 0.7632811454569784, "grad_norm": 3.09375, "kl": 0.004423889273311943, "learning_rate": 2.3671885454302154e-07, "loss": 0.0002, "reward": 0.2661805059760809, "reward_std": 0.433091813698411, "rewards/reward_func": 0.2661805059760809, "step": 5704 }, { "completion_length": 145.40625, "epoch": 0.7643516659975913, "grad_norm": 3.53125, "kl": 0.005560883553698659, "learning_rate": 2.3564833400240866e-07, "loss": 0.0002, "reward": 0.38319743797183037, "reward_std": 0.5694666914641857, "rewards/reward_func": 0.38319743797183037, "step": 5712 }, { "completion_length": 185.6640625, "epoch": 0.7654221865382042, "grad_norm": 3.375, "kl": 0.004699640907347202, "learning_rate": 2.3457781346179578e-07, "loss": 0.0002, "reward": 0.11784735321998596, "reward_std": 0.5145326796919107, "rewards/reward_func": 0.11784735321998596, "step": 5720 }, { "completion_length": 202.7265625, "epoch": 0.7664927070788171, "grad_norm": 3.65625, "kl": 0.0042415427742525935, "learning_rate": 2.335072929211829e-07, "loss": 0.0002, "reward": -0.14664648659527302, "reward_std": 0.6029860116541386, "rewards/reward_func": -0.14664648659527302, "step": 5728 }, { "completion_length": 176.671875, "epoch": 0.76756322761943, "grad_norm": 3.671875, "kl": 0.005350680381525308, "learning_rate": 2.3243677238057005e-07, "loss": 0.0002, "reward": 0.28928207233548164, "reward_std": 0.49851767159998417, "rewards/reward_func": 0.28928207233548164, "step": 5736 }, { "completion_length": 168.3828125, "epoch": 0.7686337481600428, "grad_norm": 4.3125, "kl": 0.005111474136356264, "learning_rate": 2.3136625183995717e-07, "loss": 0.0002, "reward": 0.20785732567310333, "reward_std": 0.4819117970764637, "rewards/reward_func": 0.20785732567310333, "step": 5744 }, { "completion_length": 161.5625, "epoch": 0.7697042687006557, "grad_norm": 5.75, "kl": 0.0046100525360088795, "learning_rate": 2.302957312993443e-07, "loss": 0.0002, "reward": 0.3344459980726242, "reward_std": 0.48296352848410606, "rewards/reward_func": 0.3344459980726242, "step": 5752 }, { "completion_length": 159.8203125, "epoch": 0.7707747892412685, "grad_norm": 3.84375, "kl": 0.005216164543526247, "learning_rate": 2.2922521075873141e-07, "loss": 0.0002, "reward": 0.4968814216554165, "reward_std": 0.5169591847807169, "rewards/reward_func": 0.4968814216554165, "step": 5760 }, { "completion_length": 181.0, "epoch": 0.7718453097818815, "grad_norm": 3.859375, "kl": 0.0039515624375781044, "learning_rate": 2.2815469021811856e-07, "loss": 0.0002, "reward": 0.1462385654449463, "reward_std": 0.5148510783910751, "rewards/reward_func": 0.1462385654449463, "step": 5768 }, { "completion_length": 186.953125, "epoch": 0.7729158303224943, "grad_norm": 3.234375, "kl": 0.00495643715839833, "learning_rate": 2.2708416967750568e-07, "loss": 0.0002, "reward": -0.019661023281514645, "reward_std": 0.4568687481805682, "rewards/reward_func": -0.019661023281514645, "step": 5776 }, { "completion_length": 152.9453125, "epoch": 0.7739863508631072, "grad_norm": 4.125, "kl": 0.005363121483242139, "learning_rate": 2.260136491368928e-07, "loss": 0.0002, "reward": 0.3975646123290062, "reward_std": 0.5788163132965565, "rewards/reward_func": 0.3975646123290062, "step": 5784 }, { "completion_length": 150.8046875, "epoch": 0.77505687140372, "grad_norm": 3.25, "kl": 0.0049289112794213, "learning_rate": 2.2494312859627993e-07, "loss": 0.0002, "reward": 0.29290657490491867, "reward_std": 0.6054155379533768, "rewards/reward_func": 0.29290657490491867, "step": 5792 }, { "completion_length": 149.1640625, "epoch": 0.7761273919443329, "grad_norm": 2.546875, "kl": 0.006072040821891278, "learning_rate": 2.2387260805566705e-07, "loss": 0.0002, "reward": 0.234967946074903, "reward_std": 0.5344967059791088, "rewards/reward_func": 0.234967946074903, "step": 5800 }, { "completion_length": 158.625, "epoch": 0.7771979124849459, "grad_norm": 4.0625, "kl": 0.004590392898535356, "learning_rate": 2.228020875150542e-07, "loss": 0.0002, "reward": 0.419980987906456, "reward_std": 0.4606306320056319, "rewards/reward_func": 0.419980987906456, "step": 5808 }, { "completion_length": 138.7265625, "epoch": 0.7782684330255587, "grad_norm": 3.8125, "kl": 0.004858676256844774, "learning_rate": 2.2173156697444132e-07, "loss": 0.0002, "reward": 0.5591896008700132, "reward_std": 0.5148359183222055, "rewards/reward_func": 0.5591896008700132, "step": 5816 }, { "completion_length": 191.6328125, "epoch": 0.7793389535661716, "grad_norm": 7.0625, "kl": 0.004010791366454214, "learning_rate": 2.2066104643382844e-07, "loss": 0.0002, "reward": 0.07061274722218513, "reward_std": 0.6621855795383453, "rewards/reward_func": 0.07061274722218513, "step": 5824 }, { "completion_length": 151.5546875, "epoch": 0.7804094741067844, "grad_norm": 3.5625, "kl": 0.004385879990877584, "learning_rate": 2.1959052589321556e-07, "loss": 0.0002, "reward": 0.4282612316310406, "reward_std": 0.5311172138899565, "rewards/reward_func": 0.4282612316310406, "step": 5832 }, { "completion_length": 183.8671875, "epoch": 0.7814799946473973, "grad_norm": 4.0, "kl": 0.004209680715575814, "learning_rate": 2.185200053526027e-07, "loss": 0.0002, "reward": 0.1611488163471222, "reward_std": 0.5946944504976273, "rewards/reward_func": 0.1611488163471222, "step": 5840 }, { "completion_length": 137.359375, "epoch": 0.7825505151880102, "grad_norm": 4.3125, "kl": 0.004825499141588807, "learning_rate": 2.1744948481198983e-07, "loss": 0.0002, "reward": 0.5471408823505044, "reward_std": 0.5473849456757307, "rewards/reward_func": 0.5471408823505044, "step": 5848 }, { "completion_length": 159.34375, "epoch": 0.7836210357286231, "grad_norm": 3.4375, "kl": 0.005440732988063246, "learning_rate": 2.1637896427137695e-07, "loss": 0.0002, "reward": 0.4683985644951463, "reward_std": 0.5685102045536041, "rewards/reward_func": 0.4683985644951463, "step": 5856 }, { "completion_length": 161.59375, "epoch": 0.7846915562692359, "grad_norm": 4.5625, "kl": 0.004569044103845954, "learning_rate": 2.1530844373076407e-07, "loss": 0.0002, "reward": 0.0613291235640645, "reward_std": 0.48243121802806854, "rewards/reward_func": 0.0613291235640645, "step": 5864 }, { "completion_length": 170.6328125, "epoch": 0.7857620768098488, "grad_norm": 4.21875, "kl": 0.004493650107178837, "learning_rate": 2.1423792319015122e-07, "loss": 0.0002, "reward": 0.373017355799675, "reward_std": 0.5189967537298799, "rewards/reward_func": 0.373017355799675, "step": 5872 }, { "completion_length": 217.6640625, "epoch": 0.7868325973504616, "grad_norm": 3.5, "kl": 0.003907823265763, "learning_rate": 2.1316740264953834e-07, "loss": 0.0002, "reward": -0.019582286477088928, "reward_std": 0.5519250631332397, "rewards/reward_func": -0.019582286477088928, "step": 5880 }, { "completion_length": 177.4765625, "epoch": 0.7879031178910746, "grad_norm": 4.40625, "kl": 0.004457623173948377, "learning_rate": 2.1209688210892546e-07, "loss": 0.0002, "reward": 0.17010945454239845, "reward_std": 0.5244961641728878, "rewards/reward_func": 0.17010945454239845, "step": 5888 }, { "completion_length": 177.25, "epoch": 0.7889736384316874, "grad_norm": 4.9375, "kl": 0.0046152446011547, "learning_rate": 2.1102636156831259e-07, "loss": 0.0002, "reward": 0.20693709515035152, "reward_std": 0.602562677115202, "rewards/reward_func": 0.20693709515035152, "step": 5896 }, { "completion_length": 165.15625, "epoch": 0.7900441589723003, "grad_norm": 3.546875, "kl": 0.004225551267154515, "learning_rate": 2.099558410276997e-07, "loss": 0.0002, "reward": 0.3072157595306635, "reward_std": 0.522355480119586, "rewards/reward_func": 0.3072157595306635, "step": 5904 }, { "completion_length": 170.8125, "epoch": 0.7911146795129131, "grad_norm": 4.625, "kl": 0.005012799199903384, "learning_rate": 2.0888532048708686e-07, "loss": 0.0002, "reward": 0.320420335046947, "reward_std": 0.43850363977253437, "rewards/reward_func": 0.320420335046947, "step": 5912 }, { "completion_length": 158.8203125, "epoch": 0.792185200053526, "grad_norm": 5.78125, "kl": 0.005518296180525795, "learning_rate": 2.0781479994647398e-07, "loss": 0.0002, "reward": 0.12541838502511382, "reward_std": 0.4963842146098614, "rewards/reward_func": 0.12541838502511382, "step": 5920 }, { "completion_length": 165.421875, "epoch": 0.7932557205941388, "grad_norm": 3.5, "kl": 0.004109891131520271, "learning_rate": 2.067442794058611e-07, "loss": 0.0002, "reward": 0.48840315639972687, "reward_std": 0.5170729719102383, "rewards/reward_func": 0.48840315639972687, "step": 5928 }, { "completion_length": 206.5, "epoch": 0.7943262411347518, "grad_norm": 2.671875, "kl": 0.004218890477204695, "learning_rate": 2.0567375886524822e-07, "loss": 0.0002, "reward": 0.017320919781923294, "reward_std": 0.5469899624586105, "rewards/reward_func": 0.017320919781923294, "step": 5936 }, { "completion_length": 189.0390625, "epoch": 0.7953967616753647, "grad_norm": 3.5, "kl": 0.004204195429338142, "learning_rate": 2.0460323832463537e-07, "loss": 0.0002, "reward": 0.05206027068197727, "reward_std": 0.5685999430716038, "rewards/reward_func": 0.05206027068197727, "step": 5944 }, { "completion_length": 182.0, "epoch": 0.7964672822159775, "grad_norm": 3.140625, "kl": 0.004214008251437917, "learning_rate": 2.035327177840225e-07, "loss": 0.0002, "reward": 0.04203222133219242, "reward_std": 0.5610231403261423, "rewards/reward_func": 0.04203222133219242, "step": 5952 }, { "completion_length": 179.5078125, "epoch": 0.7975378027565904, "grad_norm": 3.84375, "kl": 0.004752454871777445, "learning_rate": 2.024621972434096e-07, "loss": 0.0002, "reward": -0.10599182732403278, "reward_std": 0.6240234952419996, "rewards/reward_func": -0.10599182732403278, "step": 5960 }, { "completion_length": 166.1875, "epoch": 0.7986083232972032, "grad_norm": 4.96875, "kl": 0.004443921585334465, "learning_rate": 2.0139167670279673e-07, "loss": 0.0002, "reward": 0.329925112426281, "reward_std": 0.42279865965247154, "rewards/reward_func": 0.329925112426281, "step": 5968 }, { "completion_length": 152.3515625, "epoch": 0.7996788438378162, "grad_norm": 3.21875, "kl": 0.004863968148129061, "learning_rate": 2.0032115616218383e-07, "loss": 0.0002, "reward": 0.2605556510388851, "reward_std": 0.46567713283002377, "rewards/reward_func": 0.2605556510388851, "step": 5976 }, { "completion_length": 171.7578125, "epoch": 0.800749364378429, "grad_norm": 4.09375, "kl": 0.004715076414868236, "learning_rate": 1.99250635621571e-07, "loss": 0.0002, "reward": -0.13541333191096783, "reward_std": 0.6721258126199245, "rewards/reward_func": -0.13541333191096783, "step": 5984 }, { "completion_length": 143.6640625, "epoch": 0.8018198849190419, "grad_norm": 4.5, "kl": 0.0056079100468195975, "learning_rate": 1.981801150809581e-07, "loss": 0.0002, "reward": 0.3414863357320428, "reward_std": 0.47020469419658184, "rewards/reward_func": 0.3414863357320428, "step": 5992 }, { "completion_length": 168.5078125, "epoch": 0.8028904054596547, "grad_norm": 3.46875, "kl": 0.004616849677404389, "learning_rate": 1.9710959454034522e-07, "loss": 0.0002, "reward": 0.24850520677864552, "reward_std": 0.5514967441558838, "rewards/reward_func": 0.24850520677864552, "step": 6000 }, { "completion_length": 168.734375, "epoch": 0.8039609260002676, "grad_norm": 2.578125, "kl": 0.004905187961412594, "learning_rate": 1.9603907399973234e-07, "loss": 0.0002, "reward": 0.29693731665611267, "reward_std": 0.5282188858836889, "rewards/reward_func": 0.29693731665611267, "step": 6008 }, { "completion_length": 172.6328125, "epoch": 0.8050314465408805, "grad_norm": 3.625, "kl": 0.00480208353837952, "learning_rate": 1.949685534591195e-07, "loss": 0.0002, "reward": 0.40039923787117004, "reward_std": 0.5602267645299435, "rewards/reward_func": 0.40039923787117004, "step": 6016 }, { "completion_length": 148.671875, "epoch": 0.8061019670814934, "grad_norm": 6.78125, "kl": 0.005018723517423496, "learning_rate": 1.938980329185066e-07, "loss": 0.0002, "reward": 0.3693223036825657, "reward_std": 0.4133735718205571, "rewards/reward_func": 0.3693223036825657, "step": 6024 }, { "completion_length": 168.359375, "epoch": 0.8071724876221062, "grad_norm": 5.03125, "kl": 0.004699432494817302, "learning_rate": 1.9282751237789373e-07, "loss": 0.0002, "reward": 0.29968111030757427, "reward_std": 0.5234957840293646, "rewards/reward_func": 0.29968111030757427, "step": 6032 }, { "completion_length": 172.1171875, "epoch": 0.8082430081627191, "grad_norm": 4.40625, "kl": 0.0047337598516605794, "learning_rate": 1.9175699183728085e-07, "loss": 0.0002, "reward": -0.13689319603145123, "reward_std": 0.5464825332164764, "rewards/reward_func": -0.13689319603145123, "step": 6040 }, { "completion_length": 187.53125, "epoch": 0.809313528703332, "grad_norm": 4.71875, "kl": 0.0043419343419373035, "learning_rate": 1.9068647129666797e-07, "loss": 0.0002, "reward": 0.12531755585223436, "reward_std": 0.7370849475264549, "rewards/reward_func": 0.12531755585223436, "step": 6048 }, { "completion_length": 173.359375, "epoch": 0.8103840492439449, "grad_norm": 3.09375, "kl": 0.004423053003847599, "learning_rate": 1.8961595075605512e-07, "loss": 0.0002, "reward": 0.45560589246451855, "reward_std": 0.3819491732865572, "rewards/reward_func": 0.45560589246451855, "step": 6056 }, { "completion_length": 173.4296875, "epoch": 0.8114545697845578, "grad_norm": 4.8125, "kl": 0.004857113177422434, "learning_rate": 1.8854543021544224e-07, "loss": 0.0002, "reward": 0.08866522740572691, "reward_std": 0.5376447830349207, "rewards/reward_func": 0.08866522740572691, "step": 6064 }, { "completion_length": 164.6015625, "epoch": 0.8125250903251706, "grad_norm": 3.328125, "kl": 0.004501277348026633, "learning_rate": 1.8747490967482937e-07, "loss": 0.0002, "reward": 0.2745439810678363, "reward_std": 0.4785211766138673, "rewards/reward_func": 0.2745439810678363, "step": 6072 }, { "completion_length": 160.1796875, "epoch": 0.8135956108657835, "grad_norm": 4.78125, "kl": 0.004779946495546028, "learning_rate": 1.864043891342165e-07, "loss": 0.0002, "reward": 0.22340465802699327, "reward_std": 0.557499123737216, "rewards/reward_func": 0.22340465802699327, "step": 6080 }, { "completion_length": 160.6171875, "epoch": 0.8146661314063963, "grad_norm": 6.59375, "kl": 0.005315470625646412, "learning_rate": 1.8533386859360364e-07, "loss": 0.0002, "reward": 0.12089579226449132, "reward_std": 0.6152683198451996, "rewards/reward_func": 0.12089579226449132, "step": 6088 }, { "completion_length": 180.1171875, "epoch": 0.8157366519470093, "grad_norm": 4.90625, "kl": 0.0042492037755437195, "learning_rate": 1.8426334805299076e-07, "loss": 0.0002, "reward": 0.18059484660625458, "reward_std": 0.5923185907304287, "rewards/reward_func": 0.18059484660625458, "step": 6096 }, { "completion_length": 149.1796875, "epoch": 0.8168071724876221, "grad_norm": 5.125, "kl": 0.00574629902257584, "learning_rate": 1.8319282751237788e-07, "loss": 0.0002, "reward": 0.2305867071263492, "reward_std": 0.47459197975695133, "rewards/reward_func": 0.2305867071263492, "step": 6104 }, { "completion_length": 160.0703125, "epoch": 0.817877693028235, "grad_norm": 4.65625, "kl": 0.005720962421037257, "learning_rate": 1.82122306971765e-07, "loss": 0.0002, "reward": 0.4231163961812854, "reward_std": 0.49531901255249977, "rewards/reward_func": 0.4231163961812854, "step": 6112 }, { "completion_length": 179.625, "epoch": 0.8189482135688478, "grad_norm": 3.609375, "kl": 0.004402774036861956, "learning_rate": 1.8105178643115212e-07, "loss": 0.0002, "reward": -0.09616492129862309, "reward_std": 0.5352960834279656, "rewards/reward_func": -0.09616492129862309, "step": 6120 }, { "completion_length": 197.9921875, "epoch": 0.8200187341094607, "grad_norm": 3.96875, "kl": 0.004230510094203055, "learning_rate": 1.7998126589053927e-07, "loss": 0.0002, "reward": 0.09585804212838411, "reward_std": 0.6544227637350559, "rewards/reward_func": 0.09585804212838411, "step": 6128 }, { "completion_length": 174.0390625, "epoch": 0.8210892546500737, "grad_norm": 3.8125, "kl": 0.004287142743123695, "learning_rate": 1.789107453499264e-07, "loss": 0.0002, "reward": 0.2640516827814281, "reward_std": 0.5714995982125401, "rewards/reward_func": 0.2640516827814281, "step": 6136 }, { "completion_length": 171.5234375, "epoch": 0.8221597751906865, "grad_norm": 2.703125, "kl": 0.004413856513565406, "learning_rate": 1.778402248093135e-07, "loss": 0.0002, "reward": 0.32004706375300884, "reward_std": 0.6919787935912609, "rewards/reward_func": 0.32004706375300884, "step": 6144 }, { "completion_length": 164.6640625, "epoch": 0.8232302957312994, "grad_norm": 3.40625, "kl": 0.004383451188914478, "learning_rate": 1.7676970426870063e-07, "loss": 0.0002, "reward": 0.3686336353421211, "reward_std": 0.5524613773450255, "rewards/reward_func": 0.3686336353421211, "step": 6152 }, { "completion_length": 196.8203125, "epoch": 0.8243008162719122, "grad_norm": 3.078125, "kl": 0.0042349822469986975, "learning_rate": 1.7569918372808778e-07, "loss": 0.0002, "reward": 0.04516376554965973, "reward_std": 0.5202826540917158, "rewards/reward_func": 0.04516376554965973, "step": 6160 }, { "completion_length": 150.421875, "epoch": 0.8253713368125251, "grad_norm": 2.890625, "kl": 0.004983038583304733, "learning_rate": 1.746286631874749e-07, "loss": 0.0002, "reward": 0.320843068882823, "reward_std": 0.4479983486235142, "rewards/reward_func": 0.320843068882823, "step": 6168 }, { "completion_length": 180.46875, "epoch": 0.8264418573531379, "grad_norm": 4.71875, "kl": 0.004280634428141639, "learning_rate": 1.7355814264686203e-07, "loss": 0.0002, "reward": 0.42457358445972204, "reward_std": 0.6277044154703617, "rewards/reward_func": 0.42457358445972204, "step": 6176 }, { "completion_length": 177.421875, "epoch": 0.8275123778937509, "grad_norm": 4.3125, "kl": 0.003841431171167642, "learning_rate": 1.7248762210624915e-07, "loss": 0.0002, "reward": 0.4124793987721205, "reward_std": 0.5699762850999832, "rewards/reward_func": 0.4124793987721205, "step": 6184 }, { "completion_length": 165.5625, "epoch": 0.8285828984343637, "grad_norm": 3.53125, "kl": 0.0045668908569496125, "learning_rate": 1.714171015656363e-07, "loss": 0.0002, "reward": 0.24390191398561, "reward_std": 0.5946025252342224, "rewards/reward_func": 0.24390191398561, "step": 6192 }, { "completion_length": 144.796875, "epoch": 0.8296534189749766, "grad_norm": 6.15625, "kl": 0.005119076173286885, "learning_rate": 1.7034658102502342e-07, "loss": 0.0002, "reward": 0.37631342001259327, "reward_std": 0.54392384365201, "rewards/reward_func": 0.37631342001259327, "step": 6200 }, { "completion_length": 162.703125, "epoch": 0.8307239395155894, "grad_norm": 3.65625, "kl": 0.004819765774300322, "learning_rate": 1.6927606048441054e-07, "loss": 0.0002, "reward": 0.3171768644824624, "reward_std": 0.6571879032999277, "rewards/reward_func": 0.3171768644824624, "step": 6208 }, { "completion_length": 176.15625, "epoch": 0.8317944600562023, "grad_norm": 4.4375, "kl": 0.004749486513901502, "learning_rate": 1.6820553994379766e-07, "loss": 0.0002, "reward": 0.32477567065507174, "reward_std": 0.584898017346859, "rewards/reward_func": 0.32477567065507174, "step": 6216 }, { "completion_length": 164.546875, "epoch": 0.8328649805968152, "grad_norm": 4.96875, "kl": 0.005459955689730123, "learning_rate": 1.6713501940318478e-07, "loss": 0.0002, "reward": 0.3247902784496546, "reward_std": 0.6046720538288355, "rewards/reward_func": 0.3247902784496546, "step": 6224 }, { "completion_length": 173.3828125, "epoch": 0.8339355011374281, "grad_norm": 3.671875, "kl": 0.005025158607168123, "learning_rate": 1.6606449886257193e-07, "loss": 0.0002, "reward": 0.438681710511446, "reward_std": 0.41498881857842207, "rewards/reward_func": 0.438681710511446, "step": 6232 }, { "completion_length": 149.4453125, "epoch": 0.8350060216780409, "grad_norm": 4.125, "kl": 0.0048881605616770685, "learning_rate": 1.6499397832195905e-07, "loss": 0.0002, "reward": 0.3972213324159384, "reward_std": 0.522408589720726, "rewards/reward_func": 0.3972213324159384, "step": 6240 }, { "completion_length": 142.03125, "epoch": 0.8360765422186538, "grad_norm": 5.9375, "kl": 0.00615677481982857, "learning_rate": 1.6392345778134617e-07, "loss": 0.0002, "reward": 0.5399059653282166, "reward_std": 0.5134044801816344, "rewards/reward_func": 0.5399059653282166, "step": 6248 }, { "completion_length": 139.890625, "epoch": 0.8371470627592666, "grad_norm": 4.40625, "kl": 0.005498810496646911, "learning_rate": 1.628529372407333e-07, "loss": 0.0002, "reward": 0.26559029519557953, "reward_std": 0.7036202065646648, "rewards/reward_func": 0.26559029519557953, "step": 6256 }, { "completion_length": 167.046875, "epoch": 0.8382175832998796, "grad_norm": 3.8125, "kl": 0.005057969567133114, "learning_rate": 1.6178241670012044e-07, "loss": 0.0002, "reward": 0.26883680559694767, "reward_std": 0.6107715517282486, "rewards/reward_func": 0.26883680559694767, "step": 6264 }, { "completion_length": 183.6015625, "epoch": 0.8392881038404925, "grad_norm": 3.78125, "kl": 0.00433379874448292, "learning_rate": 1.6071189615950756e-07, "loss": 0.0002, "reward": 0.08581209369003773, "reward_std": 0.601530484855175, "rewards/reward_func": 0.08581209369003773, "step": 6272 }, { "completion_length": 177.2265625, "epoch": 0.8403586243811053, "grad_norm": 4.71875, "kl": 0.0044705503969453275, "learning_rate": 1.5964137561889469e-07, "loss": 0.0002, "reward": 0.23431246215477586, "reward_std": 0.5960433762520552, "rewards/reward_func": 0.23431246215477586, "step": 6280 }, { "completion_length": 152.078125, "epoch": 0.8414291449217182, "grad_norm": 4.8125, "kl": 0.005255370575468987, "learning_rate": 1.585708550782818e-07, "loss": 0.0002, "reward": 0.37102524004876614, "reward_std": 0.6371021419763565, "rewards/reward_func": 0.37102524004876614, "step": 6288 }, { "completion_length": 217.2734375, "epoch": 0.842499665462331, "grad_norm": 4.375, "kl": 0.003196624806150794, "learning_rate": 1.5750033453766893e-07, "loss": 0.0001, "reward": -0.045689786318689585, "reward_std": 0.4852413050830364, "rewards/reward_func": -0.045689786318689585, "step": 6296 }, { "completion_length": 227.7265625, "epoch": 0.843570186002944, "grad_norm": 3.71875, "kl": 0.004137254873057827, "learning_rate": 1.5642981399705608e-07, "loss": 0.0002, "reward": 6.577186286449432e-05, "reward_std": 0.4605599669739604, "rewards/reward_func": 6.577186286449432e-05, "step": 6304 }, { "completion_length": 176.59375, "epoch": 0.8446407065435568, "grad_norm": 3.25, "kl": 0.004376317374408245, "learning_rate": 1.553592934564432e-07, "loss": 0.0002, "reward": 0.12455911561846733, "reward_std": 0.6250845305621624, "rewards/reward_func": 0.12455911561846733, "step": 6312 }, { "completion_length": 150.9609375, "epoch": 0.8457112270841697, "grad_norm": 5.0625, "kl": 0.004825094016268849, "learning_rate": 1.5428877291583032e-07, "loss": 0.0002, "reward": 0.37383434921503067, "reward_std": 0.6138091459870338, "rewards/reward_func": 0.37383434921503067, "step": 6320 }, { "completion_length": 179.15625, "epoch": 0.8467817476247825, "grad_norm": 3.984375, "kl": 0.004287428979296237, "learning_rate": 1.5321825237521744e-07, "loss": 0.0002, "reward": 0.3161802035756409, "reward_std": 0.5376028679311275, "rewards/reward_func": 0.3161802035756409, "step": 6328 }, { "completion_length": 145.8828125, "epoch": 0.8478522681653954, "grad_norm": 3.171875, "kl": 0.0048953695222735405, "learning_rate": 1.521477318346046e-07, "loss": 0.0002, "reward": 0.27396881859749556, "reward_std": 0.6160639338195324, "rewards/reward_func": 0.27396881859749556, "step": 6336 }, { "completion_length": 170.7578125, "epoch": 0.8489227887060083, "grad_norm": 3.171875, "kl": 0.004435895767528564, "learning_rate": 1.510772112939917e-07, "loss": 0.0002, "reward": 0.34336171485483646, "reward_std": 0.6223765797913074, "rewards/reward_func": 0.34336171485483646, "step": 6344 }, { "completion_length": 134.5, "epoch": 0.8499933092466212, "grad_norm": 4.4375, "kl": 0.005276092153508216, "learning_rate": 1.5000669075337883e-07, "loss": 0.0002, "reward": 0.4353441474959254, "reward_std": 0.5333261359483004, "rewards/reward_func": 0.4353441474959254, "step": 6352 }, { "completion_length": 137.9921875, "epoch": 0.851063829787234, "grad_norm": 5.21875, "kl": 0.0056113199389074, "learning_rate": 1.4893617021276595e-07, "loss": 0.0002, "reward": 0.11142583098262548, "reward_std": 0.6482997722923756, "rewards/reward_func": 0.11142583098262548, "step": 6360 }, { "completion_length": 212.765625, "epoch": 0.8521343503278469, "grad_norm": 4.625, "kl": 0.004180938733043149, "learning_rate": 1.4786564967215308e-07, "loss": 0.0002, "reward": -0.04978405591100454, "reward_std": 0.6307705044746399, "rewards/reward_func": -0.04978405591100454, "step": 6368 }, { "completion_length": 156.7265625, "epoch": 0.8532048708684598, "grad_norm": 2.640625, "kl": 0.005362185067497194, "learning_rate": 1.4679512913154022e-07, "loss": 0.0002, "reward": -0.013063086196780205, "reward_std": 0.5464332979172468, "rewards/reward_func": -0.013063086196780205, "step": 6376 }, { "completion_length": 173.03125, "epoch": 0.8542753914090727, "grad_norm": 3.921875, "kl": 0.004632528842194006, "learning_rate": 1.4572460859092734e-07, "loss": 0.0002, "reward": 0.34282067604362965, "reward_std": 0.622871071100235, "rewards/reward_func": 0.34282067604362965, "step": 6384 }, { "completion_length": 169.6796875, "epoch": 0.8553459119496856, "grad_norm": 4.0, "kl": 0.004290186625439674, "learning_rate": 1.4465408805031447e-07, "loss": 0.0002, "reward": 0.2828236762434244, "reward_std": 0.4671051539480686, "rewards/reward_func": 0.2828236762434244, "step": 6392 }, { "completion_length": 163.65625, "epoch": 0.8564164324902984, "grad_norm": 3.9375, "kl": 0.005294958682497963, "learning_rate": 1.435835675097016e-07, "loss": 0.0002, "reward": 0.46301793679594994, "reward_std": 0.5075423391535878, "rewards/reward_func": 0.46301793679594994, "step": 6400 }, { "completion_length": 157.46875, "epoch": 0.8574869530309113, "grad_norm": 3.703125, "kl": 0.004041396110551432, "learning_rate": 1.4251304696908874e-07, "loss": 0.0002, "reward": 0.48526691645383835, "reward_std": 0.3719025030732155, "rewards/reward_func": 0.48526691645383835, "step": 6408 }, { "completion_length": 146.40625, "epoch": 0.8585574735715241, "grad_norm": 5.5, "kl": 0.005460718472022563, "learning_rate": 1.4144252642847586e-07, "loss": 0.0002, "reward": 0.2532842471264303, "reward_std": 0.44650126062333584, "rewards/reward_func": 0.2532842471264303, "step": 6416 }, { "completion_length": 183.3203125, "epoch": 0.859627994112137, "grad_norm": 4.09375, "kl": 0.004378183133667335, "learning_rate": 1.4037200588786295e-07, "loss": 0.0002, "reward": 0.13037376385182142, "reward_std": 0.5470245387405157, "rewards/reward_func": 0.13037376385182142, "step": 6424 }, { "completion_length": 169.9921875, "epoch": 0.8606985146527499, "grad_norm": 3.890625, "kl": 0.0042949684138875455, "learning_rate": 1.3930148534725007e-07, "loss": 0.0002, "reward": 0.1133667528629303, "reward_std": 0.5052687106654048, "rewards/reward_func": 0.1133667528629303, "step": 6432 }, { "completion_length": 149.46875, "epoch": 0.8617690351933628, "grad_norm": 3.203125, "kl": 0.0049638144264463335, "learning_rate": 1.3823096480663722e-07, "loss": 0.0002, "reward": 0.3671250296756625, "reward_std": 0.7020149789750576, "rewards/reward_func": 0.3671250296756625, "step": 6440 }, { "completion_length": 177.7578125, "epoch": 0.8628395557339756, "grad_norm": 3.15625, "kl": 0.004953681491315365, "learning_rate": 1.3716044426602434e-07, "loss": 0.0002, "reward": -0.07036676816642284, "reward_std": 0.5223548822104931, "rewards/reward_func": -0.07036676816642284, "step": 6448 }, { "completion_length": 153.890625, "epoch": 0.8639100762745885, "grad_norm": 4.0625, "kl": 0.004841944552026689, "learning_rate": 1.3608992372541147e-07, "loss": 0.0002, "reward": 0.2930979495868087, "reward_std": 0.6658169776201248, "rewards/reward_func": 0.2930979495868087, "step": 6456 }, { "completion_length": 163.359375, "epoch": 0.8649805968152013, "grad_norm": 3.796875, "kl": 0.004769285937072709, "learning_rate": 1.3501940318479859e-07, "loss": 0.0002, "reward": 0.48341894522309303, "reward_std": 0.4925485821440816, "rewards/reward_func": 0.48341894522309303, "step": 6464 }, { "completion_length": 160.4765625, "epoch": 0.8660511173558143, "grad_norm": 5.5625, "kl": 0.005216082121478394, "learning_rate": 1.339488826441857e-07, "loss": 0.0002, "reward": 0.26688177324831486, "reward_std": 0.5798533223569393, "rewards/reward_func": 0.26688177324831486, "step": 6472 }, { "completion_length": 180.109375, "epoch": 0.8671216378964272, "grad_norm": 4.34375, "kl": 0.005117598222568631, "learning_rate": 1.3287836210357286e-07, "loss": 0.0002, "reward": 0.0437483387067914, "reward_std": 0.5821977593004704, "rewards/reward_func": 0.0437483387067914, "step": 6480 }, { "completion_length": 203.53125, "epoch": 0.86819215843704, "grad_norm": 3.65625, "kl": 0.004146075778407976, "learning_rate": 1.3180784156295998e-07, "loss": 0.0002, "reward": -0.04123528301715851, "reward_std": 0.5794482082128525, "rewards/reward_func": -0.04123528301715851, "step": 6488 }, { "completion_length": 174.515625, "epoch": 0.8692626789776529, "grad_norm": 3.046875, "kl": 0.004450612410437316, "learning_rate": 1.307373210223471e-07, "loss": 0.0002, "reward": 0.17205783817917109, "reward_std": 0.5600821115076542, "rewards/reward_func": 0.17205783817917109, "step": 6496 }, { "completion_length": 160.5078125, "epoch": 0.8703331995182657, "grad_norm": 4.40625, "kl": 0.005854069750057533, "learning_rate": 1.2966680048173422e-07, "loss": 0.0002, "reward": 0.5184466666541994, "reward_std": 0.43986151926219463, "rewards/reward_func": 0.5184466666541994, "step": 6504 }, { "completion_length": 164.9296875, "epoch": 0.8714037200588787, "grad_norm": 3.484375, "kl": 0.005087268742499873, "learning_rate": 1.2859627994112137e-07, "loss": 0.0002, "reward": 0.15493404306471348, "reward_std": 0.5500355400145054, "rewards/reward_func": 0.15493404306471348, "step": 6512 }, { "completion_length": 166.6953125, "epoch": 0.8724742405994915, "grad_norm": 2.40625, "kl": 0.005023477482609451, "learning_rate": 1.275257594005085e-07, "loss": 0.0002, "reward": 0.21355824172496796, "reward_std": 0.695870652794838, "rewards/reward_func": 0.21355824172496796, "step": 6520 }, { "completion_length": 179.40625, "epoch": 0.8735447611401044, "grad_norm": 3.671875, "kl": 0.0044980833772569895, "learning_rate": 1.264552388598956e-07, "loss": 0.0002, "reward": 0.31162807578220963, "reward_std": 0.49335628002882004, "rewards/reward_func": 0.31162807578220963, "step": 6528 }, { "completion_length": 170.890625, "epoch": 0.8746152816807172, "grad_norm": 4.34375, "kl": 0.004584902344504371, "learning_rate": 1.2538471831928273e-07, "loss": 0.0002, "reward": 0.39711445942521095, "reward_std": 0.4620585907250643, "rewards/reward_func": 0.39711445942521095, "step": 6536 }, { "completion_length": 196.34375, "epoch": 0.8756858022213301, "grad_norm": 6.03125, "kl": 0.004008692951174453, "learning_rate": 1.2431419777866988e-07, "loss": 0.0002, "reward": 0.14655437879264355, "reward_std": 0.5025924574583769, "rewards/reward_func": 0.14655437879264355, "step": 6544 }, { "completion_length": 157.1328125, "epoch": 0.876756322761943, "grad_norm": 4.5, "kl": 0.005272853362839669, "learning_rate": 1.23243677238057e-07, "loss": 0.0002, "reward": 0.16074330359697342, "reward_std": 0.4522952139377594, "rewards/reward_func": 0.16074330359697342, "step": 6552 }, { "completion_length": 170.0390625, "epoch": 0.8778268433025559, "grad_norm": 3.71875, "kl": 0.0051819840737152845, "learning_rate": 1.2217315669744412e-07, "loss": 0.0002, "reward": 0.3139108493924141, "reward_std": 0.4983799997717142, "rewards/reward_func": 0.3139108493924141, "step": 6560 }, { "completion_length": 160.1953125, "epoch": 0.8788973638431687, "grad_norm": 2.5, "kl": 0.004580837674438953, "learning_rate": 1.2110263615683125e-07, "loss": 0.0002, "reward": 0.35805173218250275, "reward_std": 0.4121380029246211, "rewards/reward_func": 0.35805173218250275, "step": 6568 }, { "completion_length": 166.125, "epoch": 0.8799678843837816, "grad_norm": 3.890625, "kl": 0.005891179316677153, "learning_rate": 1.200321156162184e-07, "loss": 0.0002, "reward": 0.3935977406799793, "reward_std": 0.4564328156411648, "rewards/reward_func": 0.3935977406799793, "step": 6576 }, { "completion_length": 161.828125, "epoch": 0.8810384049243944, "grad_norm": 4.03125, "kl": 0.004937338293530047, "learning_rate": 1.189615950756055e-07, "loss": 0.0002, "reward": 0.3541194014251232, "reward_std": 0.7327413186430931, "rewards/reward_func": 0.3541194014251232, "step": 6584 }, { "completion_length": 166.890625, "epoch": 0.8821089254650074, "grad_norm": 4.0, "kl": 0.004368811612948775, "learning_rate": 1.1789107453499264e-07, "loss": 0.0002, "reward": 0.43425997346639633, "reward_std": 0.5963248610496521, "rewards/reward_func": 0.43425997346639633, "step": 6592 }, { "completion_length": 139.5, "epoch": 0.8831794460056203, "grad_norm": 3.546875, "kl": 0.006153674854431301, "learning_rate": 1.1682055399437976e-07, "loss": 0.0002, "reward": 0.4587271837517619, "reward_std": 0.5946958791464567, "rewards/reward_func": 0.4587271837517619, "step": 6600 }, { "completion_length": 160.2890625, "epoch": 0.8842499665462331, "grad_norm": 3.265625, "kl": 0.004449796746484935, "learning_rate": 1.1575003345376688e-07, "loss": 0.0002, "reward": 0.3748700972646475, "reward_std": 0.5290507553145289, "rewards/reward_func": 0.3748700972646475, "step": 6608 }, { "completion_length": 175.03125, "epoch": 0.885320487086846, "grad_norm": 3.4375, "kl": 0.0046757735253777355, "learning_rate": 1.1467951291315402e-07, "loss": 0.0002, "reward": 0.36219143867492676, "reward_std": 0.5348459035158157, "rewards/reward_func": 0.36219143867492676, "step": 6616 }, { "completion_length": 159.0078125, "epoch": 0.8863910076274588, "grad_norm": 4.375, "kl": 0.005130204517627135, "learning_rate": 1.1360899237254114e-07, "loss": 0.0002, "reward": 0.40300269052386284, "reward_std": 0.5223680902272463, "rewards/reward_func": 0.40300269052386284, "step": 6624 }, { "completion_length": 168.6328125, "epoch": 0.8874615281680718, "grad_norm": 4.03125, "kl": 0.00497715815436095, "learning_rate": 1.1253847183192827e-07, "loss": 0.0002, "reward": 0.3870235029608011, "reward_std": 0.6206906009465456, "rewards/reward_func": 0.3870235029608011, "step": 6632 }, { "completion_length": 167.6328125, "epoch": 0.8885320487086846, "grad_norm": 2.75, "kl": 0.004565039882436395, "learning_rate": 1.1146795129131539e-07, "loss": 0.0002, "reward": 0.19453393667936325, "reward_std": 0.43898776825517416, "rewards/reward_func": 0.19453393667936325, "step": 6640 }, { "completion_length": 155.34375, "epoch": 0.8896025692492975, "grad_norm": 2.140625, "kl": 0.004799488058779389, "learning_rate": 1.1039743075070253e-07, "loss": 0.0002, "reward": 0.44809896126389503, "reward_std": 0.476587675511837, "rewards/reward_func": 0.44809896126389503, "step": 6648 }, { "completion_length": 182.6640625, "epoch": 0.8906730897899103, "grad_norm": 3.328125, "kl": 0.0043890890083275735, "learning_rate": 1.0932691021008965e-07, "loss": 0.0002, "reward": 0.289157398045063, "reward_std": 0.5493966788053513, "rewards/reward_func": 0.289157398045063, "step": 6656 }, { "completion_length": 150.7421875, "epoch": 0.8917436103305232, "grad_norm": 3.234375, "kl": 0.004936346551403403, "learning_rate": 1.0825638966947678e-07, "loss": 0.0002, "reward": 0.5019879713654518, "reward_std": 0.550602201372385, "rewards/reward_func": 0.5019879713654518, "step": 6664 }, { "completion_length": 141.7265625, "epoch": 0.892814130871136, "grad_norm": 5.34375, "kl": 0.0059457606403157115, "learning_rate": 1.071858691288639e-07, "loss": 0.0002, "reward": 0.4372959118336439, "reward_std": 0.46707610227167606, "rewards/reward_func": 0.4372959118336439, "step": 6672 }, { "completion_length": 170.8046875, "epoch": 0.893884651411749, "grad_norm": 3.703125, "kl": 0.004617019789293408, "learning_rate": 1.0611534858825104e-07, "loss": 0.0002, "reward": 0.06940314406529069, "reward_std": 0.6122306901961565, "rewards/reward_func": 0.06940314406529069, "step": 6680 }, { "completion_length": 177.7265625, "epoch": 0.8949551719523618, "grad_norm": 2.578125, "kl": 0.004395580617710948, "learning_rate": 1.0504482804763816e-07, "loss": 0.0002, "reward": 0.09883344545960426, "reward_std": 0.5640975758433342, "rewards/reward_func": 0.09883344545960426, "step": 6688 }, { "completion_length": 179.0859375, "epoch": 0.8960256924929747, "grad_norm": 4.40625, "kl": 0.004834166058572009, "learning_rate": 1.0397430750702528e-07, "loss": 0.0002, "reward": 0.2342253029346466, "reward_std": 0.6865712143480778, "rewards/reward_func": 0.2342253029346466, "step": 6696 }, { "completion_length": 161.90625, "epoch": 0.8970962130335876, "grad_norm": 5.90625, "kl": 0.004615583224222064, "learning_rate": 1.0290378696641242e-07, "loss": 0.0002, "reward": 0.18776031211018562, "reward_std": 0.5335487443953753, "rewards/reward_func": 0.18776031211018562, "step": 6704 }, { "completion_length": 191.4453125, "epoch": 0.8981667335742004, "grad_norm": 4.46875, "kl": 0.004502045980188996, "learning_rate": 1.0183326642579954e-07, "loss": 0.0002, "reward": 0.22535105049610138, "reward_std": 0.4681578129529953, "rewards/reward_func": 0.22535105049610138, "step": 6712 }, { "completion_length": 187.609375, "epoch": 0.8992372541148134, "grad_norm": 4.875, "kl": 0.004247891949489713, "learning_rate": 1.0076274588518667e-07, "loss": 0.0002, "reward": 0.1750158555805683, "reward_std": 0.6813812926411629, "rewards/reward_func": 0.1750158555805683, "step": 6720 }, { "completion_length": 162.8203125, "epoch": 0.9003077746554262, "grad_norm": 4.25, "kl": 0.004933495947625488, "learning_rate": 9.96922253445738e-08, "loss": 0.0002, "reward": 0.23774974327534437, "reward_std": 0.48002783581614494, "rewards/reward_func": 0.23774974327534437, "step": 6728 }, { "completion_length": 162.859375, "epoch": 0.9013782951960391, "grad_norm": 4.5625, "kl": 0.005467957467772067, "learning_rate": 9.862170480396093e-08, "loss": 0.0002, "reward": -0.0023063644766807556, "reward_std": 0.564548920840025, "rewards/reward_func": -0.0023063644766807556, "step": 6736 }, { "completion_length": 187.5078125, "epoch": 0.9024488157366519, "grad_norm": 5.84375, "kl": 0.004037181934108958, "learning_rate": 9.755118426334805e-08, "loss": 0.0002, "reward": 0.27846864983439445, "reward_std": 0.5254440493881702, "rewards/reward_func": 0.27846864983439445, "step": 6744 }, { "completion_length": 174.0, "epoch": 0.9035193362772648, "grad_norm": 3.046875, "kl": 0.004285787290427834, "learning_rate": 9.648066372273519e-08, "loss": 0.0002, "reward": 0.22628629952669144, "reward_std": 0.49990267865359783, "rewards/reward_func": 0.22628629952669144, "step": 6752 }, { "completion_length": 164.0, "epoch": 0.9045898568178777, "grad_norm": 4.15625, "kl": 0.005695787549484521, "learning_rate": 9.54101431821223e-08, "loss": 0.0002, "reward": 0.26332173496484756, "reward_std": 0.6010408755391836, "rewards/reward_func": 0.26332173496484756, "step": 6760 }, { "completion_length": 165.484375, "epoch": 0.9056603773584906, "grad_norm": 2.65625, "kl": 0.0045514948724303395, "learning_rate": 9.433962264150943e-08, "loss": 0.0002, "reward": 0.1936313882470131, "reward_std": 0.6337927635759115, "rewards/reward_func": 0.1936313882470131, "step": 6768 }, { "completion_length": 161.5078125, "epoch": 0.9067308978991034, "grad_norm": 4.5, "kl": 0.0048291504790540785, "learning_rate": 9.326910210089655e-08, "loss": 0.0002, "reward": 0.137207493185997, "reward_std": 0.5269910991191864, "rewards/reward_func": 0.137207493185997, "step": 6776 }, { "completion_length": 152.515625, "epoch": 0.9078014184397163, "grad_norm": 4.34375, "kl": 0.0052593986911233515, "learning_rate": 9.219858156028367e-08, "loss": 0.0002, "reward": 0.5747008826583624, "reward_std": 0.4051123149693012, "rewards/reward_func": 0.5747008826583624, "step": 6784 }, { "completion_length": 182.9296875, "epoch": 0.9088719389803291, "grad_norm": 6.6875, "kl": 0.004641913692466915, "learning_rate": 9.112806101967081e-08, "loss": 0.0002, "reward": 0.21289030835032463, "reward_std": 0.48070234432816505, "rewards/reward_func": 0.21289030835032463, "step": 6792 }, { "completion_length": 174.640625, "epoch": 0.9099424595209421, "grad_norm": 3.25, "kl": 0.004628196998964995, "learning_rate": 9.005754047905793e-08, "loss": 0.0002, "reward": 0.1559174619615078, "reward_std": 0.6800275854766369, "rewards/reward_func": 0.1559174619615078, "step": 6800 }, { "completion_length": 157.203125, "epoch": 0.911012980061555, "grad_norm": 3.40625, "kl": 0.006008526281220838, "learning_rate": 8.898701993844506e-08, "loss": 0.0002, "reward": 0.12123461440205574, "reward_std": 0.5197535315528512, "rewards/reward_func": 0.12123461440205574, "step": 6808 }, { "completion_length": 164.4609375, "epoch": 0.9120835006021678, "grad_norm": 3.734375, "kl": 0.004909445357043296, "learning_rate": 8.791649939783219e-08, "loss": 0.0002, "reward": 0.4351821830496192, "reward_std": 0.5581427849829197, "rewards/reward_func": 0.4351821830496192, "step": 6816 }, { "completion_length": 157.296875, "epoch": 0.9131540211427807, "grad_norm": 2.625, "kl": 0.004735041700769216, "learning_rate": 8.684597885721932e-08, "loss": 0.0002, "reward": 0.209370581433177, "reward_std": 0.5503848614171147, "rewards/reward_func": 0.209370581433177, "step": 6824 }, { "completion_length": 161.8046875, "epoch": 0.9142245416833935, "grad_norm": 4.78125, "kl": 0.004196583904558793, "learning_rate": 8.577545831660644e-08, "loss": 0.0002, "reward": 0.29275982081890106, "reward_std": 0.4565849918872118, "rewards/reward_func": 0.29275982081890106, "step": 6832 }, { "completion_length": 166.5625, "epoch": 0.9152950622240065, "grad_norm": 4.09375, "kl": 0.004536589724011719, "learning_rate": 8.470493777599358e-08, "loss": 0.0002, "reward": 0.19659875519573689, "reward_std": 0.5807360988110304, "rewards/reward_func": 0.19659875519573689, "step": 6840 }, { "completion_length": 172.1328125, "epoch": 0.9163655827646193, "grad_norm": 3.921875, "kl": 0.004410766297951341, "learning_rate": 8.36344172353807e-08, "loss": 0.0002, "reward": 0.03568706847727299, "reward_std": 0.6016153171658516, "rewards/reward_func": 0.03568706847727299, "step": 6848 }, { "completion_length": 178.8203125, "epoch": 0.9174361033052322, "grad_norm": 3.484375, "kl": 0.00475726873264648, "learning_rate": 8.256389669476782e-08, "loss": 0.0002, "reward": 0.035759665071964264, "reward_std": 0.5136286579072475, "rewards/reward_func": 0.035759665071964264, "step": 6856 }, { "completion_length": 158.8671875, "epoch": 0.918506623845845, "grad_norm": 4.1875, "kl": 0.004829802084714174, "learning_rate": 8.149337615415496e-08, "loss": 0.0002, "reward": 0.43006047047674656, "reward_std": 0.5332435881718993, "rewards/reward_func": 0.43006047047674656, "step": 6864 }, { "completion_length": 158.5078125, "epoch": 0.9195771443864579, "grad_norm": 2.75, "kl": 0.004407216591062024, "learning_rate": 8.042285561354208e-08, "loss": 0.0002, "reward": 0.42078845389187336, "reward_std": 0.5134297851473093, "rewards/reward_func": 0.42078845389187336, "step": 6872 }, { "completion_length": 135.34375, "epoch": 0.9206476649270708, "grad_norm": 6.0625, "kl": 0.006246095523238182, "learning_rate": 7.935233507292921e-08, "loss": 0.0002, "reward": 0.2924302965402603, "reward_std": 0.5908289672806859, "rewards/reward_func": 0.2924302965402603, "step": 6880 }, { "completion_length": 152.2734375, "epoch": 0.9217181854676837, "grad_norm": 4.65625, "kl": 0.005424696602858603, "learning_rate": 7.828181453231633e-08, "loss": 0.0002, "reward": 0.3303174478933215, "reward_std": 0.4598999507725239, "rewards/reward_func": 0.3303174478933215, "step": 6888 }, { "completion_length": 126.9375, "epoch": 0.9227887060082965, "grad_norm": 4.1875, "kl": 0.005991748097585514, "learning_rate": 7.721129399170347e-08, "loss": 0.0002, "reward": 0.5790990553796291, "reward_std": 0.4186181202530861, "rewards/reward_func": 0.5790990553796291, "step": 6896 }, { "completion_length": 165.5859375, "epoch": 0.9238592265489094, "grad_norm": 4.21875, "kl": 0.004846252937568352, "learning_rate": 7.614077345109059e-08, "loss": 0.0002, "reward": 0.21630746312439442, "reward_std": 0.5224413331598043, "rewards/reward_func": 0.21630746312439442, "step": 6904 }, { "completion_length": 150.71875, "epoch": 0.9249297470895222, "grad_norm": 4.4375, "kl": 0.004703165264800191, "learning_rate": 7.507025291047772e-08, "loss": 0.0002, "reward": 0.49177973717451096, "reward_std": 0.46361699141561985, "rewards/reward_func": 0.49177973717451096, "step": 6912 }, { "completion_length": 182.15625, "epoch": 0.9260002676301351, "grad_norm": 4.75, "kl": 0.004313376499339938, "learning_rate": 7.399973236986485e-08, "loss": 0.0002, "reward": 0.12689837673678994, "reward_std": 0.6647001150995493, "rewards/reward_func": 0.12689837673678994, "step": 6920 }, { "completion_length": 155.71875, "epoch": 0.9270707881707481, "grad_norm": 3.203125, "kl": 0.004443499754415825, "learning_rate": 7.292921182925198e-08, "loss": 0.0002, "reward": 0.29615641478449106, "reward_std": 0.5568899232894182, "rewards/reward_func": 0.29615641478449106, "step": 6928 }, { "completion_length": 155.15625, "epoch": 0.9281413087113609, "grad_norm": 3.796875, "kl": 0.0046441941522061825, "learning_rate": 7.18586912886391e-08, "loss": 0.0002, "reward": 0.28125396044924855, "reward_std": 0.4789434429258108, "rewards/reward_func": 0.28125396044924855, "step": 6936 }, { "completion_length": 172.03125, "epoch": 0.9292118292519738, "grad_norm": 4.84375, "kl": 0.0055302626569755375, "learning_rate": 7.078817074802622e-08, "loss": 0.0002, "reward": 0.3774759713560343, "reward_std": 0.5277713388204575, "rewards/reward_func": 0.3774759713560343, "step": 6944 }, { "completion_length": 165.890625, "epoch": 0.9302823497925866, "grad_norm": 3.015625, "kl": 0.004302407876821235, "learning_rate": 6.971765020741336e-08, "loss": 0.0002, "reward": 0.35936339199543, "reward_std": 0.5036177840083838, "rewards/reward_func": 0.35936339199543, "step": 6952 }, { "completion_length": 163.0234375, "epoch": 0.9313528703331995, "grad_norm": 3.78125, "kl": 0.0055159886833280325, "learning_rate": 6.864712966680048e-08, "loss": 0.0002, "reward": 0.5663758469745517, "reward_std": 0.4252478200942278, "rewards/reward_func": 0.5663758469745517, "step": 6960 }, { "completion_length": 151.8984375, "epoch": 0.9324233908738124, "grad_norm": 4.625, "kl": 0.005510843213414773, "learning_rate": 6.757660912618761e-08, "loss": 0.0002, "reward": 0.3682685000821948, "reward_std": 0.5349069032818079, "rewards/reward_func": 0.3682685000821948, "step": 6968 }, { "completion_length": 150.671875, "epoch": 0.9334939114144253, "grad_norm": 4.5, "kl": 0.005221706640440971, "learning_rate": 6.650608858557472e-08, "loss": 0.0002, "reward": 0.48502959311008453, "reward_std": 0.39637486822903156, "rewards/reward_func": 0.48502959311008453, "step": 6976 }, { "completion_length": 170.3359375, "epoch": 0.9345644319550381, "grad_norm": 5.53125, "kl": 0.004806717770406976, "learning_rate": 6.543556804496186e-08, "loss": 0.0002, "reward": 0.20024515688419342, "reward_std": 0.3813412329182029, "rewards/reward_func": 0.20024515688419342, "step": 6984 }, { "completion_length": 165.296875, "epoch": 0.935634952495651, "grad_norm": 4.21875, "kl": 0.0052095072460360825, "learning_rate": 6.436504750434898e-08, "loss": 0.0002, "reward": 0.17343932949006557, "reward_std": 0.542176740244031, "rewards/reward_func": 0.17343932949006557, "step": 6992 }, { "completion_length": 169.6953125, "epoch": 0.9367054730362638, "grad_norm": 3.6875, "kl": 0.00422157411230728, "learning_rate": 6.329452696373611e-08, "loss": 0.0002, "reward": 0.06634609401226044, "reward_std": 0.6044113449752331, "rewards/reward_func": 0.06634609401226044, "step": 7000 }, { "completion_length": 146.5234375, "epoch": 0.9377759935768768, "grad_norm": 5.0625, "kl": 0.005492849391885102, "learning_rate": 6.222400642312324e-08, "loss": 0.0002, "reward": 0.23497827351093292, "reward_std": 0.42981395684182644, "rewards/reward_func": 0.23497827351093292, "step": 7008 }, { "completion_length": 170.328125, "epoch": 0.9388465141174896, "grad_norm": 4.65625, "kl": 0.004708502208814025, "learning_rate": 6.115348588251037e-08, "loss": 0.0002, "reward": -0.04050692915916443, "reward_std": 0.6017125463113189, "rewards/reward_func": -0.04050692915916443, "step": 7016 }, { "completion_length": 166.53125, "epoch": 0.9399170346581025, "grad_norm": 3.78125, "kl": 0.0050715115503408015, "learning_rate": 6.008296534189749e-08, "loss": 0.0002, "reward": 0.3130027763545513, "reward_std": 0.47869889438152313, "rewards/reward_func": 0.3130027763545513, "step": 7024 }, { "completion_length": 194.546875, "epoch": 0.9409875551987154, "grad_norm": 3.0625, "kl": 0.003658687841380015, "learning_rate": 5.901244480128462e-08, "loss": 0.0001, "reward": 0.33295007050037384, "reward_std": 0.4185595214366913, "rewards/reward_func": 0.33295007050037384, "step": 7032 }, { "completion_length": 180.828125, "epoch": 0.9420580757393282, "grad_norm": 2.59375, "kl": 0.005314617330441251, "learning_rate": 5.794192426067175e-08, "loss": 0.0002, "reward": 0.2711464911699295, "reward_std": 0.529150040820241, "rewards/reward_func": 0.2711464911699295, "step": 7040 }, { "completion_length": 166.5390625, "epoch": 0.9431285962799412, "grad_norm": 4.3125, "kl": 0.005095012194942683, "learning_rate": 5.6871403720058877e-08, "loss": 0.0002, "reward": 0.38804778829216957, "reward_std": 0.6022228971123695, "rewards/reward_func": 0.38804778829216957, "step": 7048 }, { "completion_length": 166.375, "epoch": 0.944199116820554, "grad_norm": 3.921875, "kl": 0.005058724695118144, "learning_rate": 5.5800883179446e-08, "loss": 0.0002, "reward": 0.201092598028481, "reward_std": 0.5487896800041199, "rewards/reward_func": 0.201092598028481, "step": 7056 }, { "completion_length": 175.8203125, "epoch": 0.9452696373611669, "grad_norm": 5.59375, "kl": 0.0046728674788028, "learning_rate": 5.4730362638833126e-08, "loss": 0.0002, "reward": 0.21519318595528603, "reward_std": 0.6764262039214373, "rewards/reward_func": 0.21519318595528603, "step": 7064 }, { "completion_length": 170.0625, "epoch": 0.9463401579017797, "grad_norm": 3.65625, "kl": 0.00383082203916274, "learning_rate": 5.3659842098220254e-08, "loss": 0.0002, "reward": 0.2285008803009987, "reward_std": 0.47355389036238194, "rewards/reward_func": 0.2285008803009987, "step": 7072 }, { "completion_length": 160.890625, "epoch": 0.9474106784423926, "grad_norm": 3.5, "kl": 0.004751811851747334, "learning_rate": 5.258932155760738e-08, "loss": 0.0002, "reward": 0.4270824361592531, "reward_std": 0.41622067615389824, "rewards/reward_func": 0.4270824361592531, "step": 7080 }, { "completion_length": 157.25, "epoch": 0.9484811989830055, "grad_norm": 4.875, "kl": 0.005698841763660312, "learning_rate": 5.151880101699451e-08, "loss": 0.0002, "reward": 0.30552778858691454, "reward_std": 0.522463321685791, "rewards/reward_func": 0.30552778858691454, "step": 7088 }, { "completion_length": 156.6796875, "epoch": 0.9495517195236184, "grad_norm": 3.890625, "kl": 0.005308831227011979, "learning_rate": 5.044828047638164e-08, "loss": 0.0002, "reward": 0.2509817620739341, "reward_std": 0.5911254324018955, "rewards/reward_func": 0.2509817620739341, "step": 7096 }, { "completion_length": 172.4609375, "epoch": 0.9506222400642312, "grad_norm": 3.40625, "kl": 0.004089270456461236, "learning_rate": 4.937775993576877e-08, "loss": 0.0002, "reward": 0.1337134689092636, "reward_std": 0.34575022105127573, "rewards/reward_func": 0.1337134689092636, "step": 7104 }, { "completion_length": 165.0625, "epoch": 0.9516927606048441, "grad_norm": 3.609375, "kl": 0.004878541512880474, "learning_rate": 4.8307239395155895e-08, "loss": 0.0002, "reward": 0.28754607075825334, "reward_std": 0.49302330799400806, "rewards/reward_func": 0.28754607075825334, "step": 7112 }, { "completion_length": 190.7109375, "epoch": 0.9527632811454569, "grad_norm": 3.265625, "kl": 0.003856517461827025, "learning_rate": 4.7236718854543023e-08, "loss": 0.0002, "reward": 0.05854572542011738, "reward_std": 0.4840726386755705, "rewards/reward_func": 0.05854572542011738, "step": 7120 }, { "completion_length": 178.7421875, "epoch": 0.9538338016860699, "grad_norm": 5.09375, "kl": 0.005262946098810062, "learning_rate": 4.616619831393015e-08, "loss": 0.0002, "reward": 0.1380448378622532, "reward_std": 0.44803581573069096, "rewards/reward_func": 0.1380448378622532, "step": 7128 }, { "completion_length": 162.75, "epoch": 0.9549043222266828, "grad_norm": 4.1875, "kl": 0.0052962955087423325, "learning_rate": 4.509567777331728e-08, "loss": 0.0002, "reward": 0.4494497813284397, "reward_std": 0.4952176222577691, "rewards/reward_func": 0.4494497813284397, "step": 7136 }, { "completion_length": 148.8671875, "epoch": 0.9559748427672956, "grad_norm": 3.703125, "kl": 0.004964547406416386, "learning_rate": 4.4025157232704395e-08, "loss": 0.0002, "reward": 0.4687324403785169, "reward_std": 0.40094813890755177, "rewards/reward_func": 0.4687324403785169, "step": 7144 }, { "completion_length": 197.28125, "epoch": 0.9570453633079085, "grad_norm": 4.09375, "kl": 0.0037281967815943062, "learning_rate": 4.295463669209152e-08, "loss": 0.0001, "reward": 0.02254125289618969, "reward_std": 0.5664320774376392, "rewards/reward_func": 0.02254125289618969, "step": 7152 }, { "completion_length": 148.703125, "epoch": 0.9581158838485213, "grad_norm": 3.796875, "kl": 0.004984479019185528, "learning_rate": 4.188411615147865e-08, "loss": 0.0002, "reward": 0.325860820710659, "reward_std": 0.41767950914800167, "rewards/reward_func": 0.325860820710659, "step": 7160 }, { "completion_length": 153.265625, "epoch": 0.9591864043891342, "grad_norm": 2.9375, "kl": 0.005220895051024854, "learning_rate": 4.081359561086578e-08, "loss": 0.0002, "reward": 0.4472418650984764, "reward_std": 0.4130860110744834, "rewards/reward_func": 0.4472418650984764, "step": 7168 }, { "completion_length": 169.0625, "epoch": 0.9602569249297471, "grad_norm": 4.5625, "kl": 0.004276268708053976, "learning_rate": 3.974307507025291e-08, "loss": 0.0002, "reward": 0.15905702486634254, "reward_std": 0.4423768687993288, "rewards/reward_func": 0.15905702486634254, "step": 7176 }, { "completion_length": 179.3125, "epoch": 0.96132744547036, "grad_norm": 3.234375, "kl": 0.0035234860552009195, "learning_rate": 3.8672554529640036e-08, "loss": 0.0001, "reward": 0.3482946362346411, "reward_std": 0.6113561438396573, "rewards/reward_func": 0.3482946362346411, "step": 7184 }, { "completion_length": 172.6796875, "epoch": 0.9623979660109728, "grad_norm": 3.890625, "kl": 0.005020510870963335, "learning_rate": 3.7602033989027164e-08, "loss": 0.0002, "reward": 0.12693638168275356, "reward_std": 0.5951482262462378, "rewards/reward_func": 0.12693638168275356, "step": 7192 }, { "completion_length": 161.015625, "epoch": 0.9634684865515857, "grad_norm": 2.765625, "kl": 0.004839012573938817, "learning_rate": 3.653151344841429e-08, "loss": 0.0002, "reward": 0.22009205259382725, "reward_std": 0.607865285128355, "rewards/reward_func": 0.22009205259382725, "step": 7200 }, { "completion_length": 158.515625, "epoch": 0.9645390070921985, "grad_norm": 4.71875, "kl": 0.004412859241710976, "learning_rate": 3.546099290780142e-08, "loss": 0.0002, "reward": 0.17766493232920766, "reward_std": 0.6588699370622635, "rewards/reward_func": 0.17766493232920766, "step": 7208 }, { "completion_length": 168.8828125, "epoch": 0.9656095276328115, "grad_norm": 3.859375, "kl": 0.00440784459351562, "learning_rate": 3.439047236718855e-08, "loss": 0.0002, "reward": 0.31805921625345945, "reward_std": 0.5728737730532885, "rewards/reward_func": 0.31805921625345945, "step": 7216 }, { "completion_length": 169.84375, "epoch": 0.9666800481734243, "grad_norm": 4.96875, "kl": 0.004156895098276436, "learning_rate": 3.331995182657567e-08, "loss": 0.0002, "reward": 0.13793382793664932, "reward_std": 0.6552281193435192, "rewards/reward_func": 0.13793382793664932, "step": 7224 }, { "completion_length": 140.65625, "epoch": 0.9677505687140372, "grad_norm": 4.21875, "kl": 0.00553873396711424, "learning_rate": 3.22494312859628e-08, "loss": 0.0002, "reward": 0.35460880724713206, "reward_std": 0.3868136703968048, "rewards/reward_func": 0.35460880724713206, "step": 7232 }, { "completion_length": 198.5546875, "epoch": 0.96882108925465, "grad_norm": 3.4375, "kl": 0.004783908079843968, "learning_rate": 3.1178910745349926e-08, "loss": 0.0002, "reward": 0.1244197292253375, "reward_std": 0.5501943584531546, "rewards/reward_func": 0.1244197292253375, "step": 7240 }, { "completion_length": 187.1875, "epoch": 0.9698916097952629, "grad_norm": 3.703125, "kl": 0.004152452602284029, "learning_rate": 3.0108390204737054e-08, "loss": 0.0002, "reward": 0.17286342615261674, "reward_std": 0.461435928940773, "rewards/reward_func": 0.17286342615261674, "step": 7248 }, { "completion_length": 122.7421875, "epoch": 0.9709621303358759, "grad_norm": 5.25, "kl": 0.0060864063561894, "learning_rate": 2.903786966412418e-08, "loss": 0.0002, "reward": 0.5303192976862192, "reward_std": 0.4443682935088873, "rewards/reward_func": 0.5303192976862192, "step": 7256 }, { "completion_length": 174.3984375, "epoch": 0.9720326508764887, "grad_norm": 3.546875, "kl": 0.004882953886408359, "learning_rate": 2.7967349123511307e-08, "loss": 0.0002, "reward": 0.40086287446320057, "reward_std": 0.41974346339702606, "rewards/reward_func": 0.40086287446320057, "step": 7264 }, { "completion_length": 191.515625, "epoch": 0.9731031714171016, "grad_norm": 2.28125, "kl": 0.004744857433252037, "learning_rate": 2.6896828582898435e-08, "loss": 0.0002, "reward": -0.035455760546028614, "reward_std": 0.5775532089173794, "rewards/reward_func": -0.035455760546028614, "step": 7272 }, { "completion_length": 181.6484375, "epoch": 0.9741736919577144, "grad_norm": 4.21875, "kl": 0.004760511888889596, "learning_rate": 2.5826308042285557e-08, "loss": 0.0002, "reward": 0.09731801599264145, "reward_std": 0.5751747917383909, "rewards/reward_func": 0.09731801599264145, "step": 7280 }, { "completion_length": 195.15625, "epoch": 0.9752442124983273, "grad_norm": 3.296875, "kl": 0.004167939972830936, "learning_rate": 2.4755787501672685e-08, "loss": 0.0002, "reward": -0.007298767566680908, "reward_std": 0.47776357643306255, "rewards/reward_func": -0.007298767566680908, "step": 7288 }, { "completion_length": 143.59375, "epoch": 0.9763147330389402, "grad_norm": 4.25, "kl": 0.005093816755106673, "learning_rate": 2.3685266961059813e-08, "loss": 0.0002, "reward": 0.3966046618297696, "reward_std": 0.4395467219874263, "rewards/reward_func": 0.3966046618297696, "step": 7296 }, { "completion_length": 196.1015625, "epoch": 0.9773852535795531, "grad_norm": 4.96875, "kl": 0.003976713371230289, "learning_rate": 2.261474642044694e-08, "loss": 0.0002, "reward": 0.05790833756327629, "reward_std": 0.45180133171379566, "rewards/reward_func": 0.05790833756327629, "step": 7304 }, { "completion_length": 158.6171875, "epoch": 0.9784557741201659, "grad_norm": 4.125, "kl": 0.004875800863374025, "learning_rate": 2.154422587983407e-08, "loss": 0.0002, "reward": 0.3239698866382241, "reward_std": 0.6013830993324518, "rewards/reward_func": 0.3239698866382241, "step": 7312 }, { "completion_length": 137.140625, "epoch": 0.9795262946607788, "grad_norm": 2.796875, "kl": 0.0066487987351138145, "learning_rate": 2.0473705339221198e-08, "loss": 0.0003, "reward": 0.42458152025938034, "reward_std": 0.4684657920151949, "rewards/reward_func": 0.42458152025938034, "step": 7320 }, { "completion_length": 184.9609375, "epoch": 0.9805968152013916, "grad_norm": 4.0625, "kl": 0.003812081238720566, "learning_rate": 1.9403184798608323e-08, "loss": 0.0002, "reward": 0.2897532992064953, "reward_std": 0.6526387594640255, "rewards/reward_func": 0.2897532992064953, "step": 7328 }, { "completion_length": 170.515625, "epoch": 0.9816673357420046, "grad_norm": 2.625, "kl": 0.004741923592519015, "learning_rate": 1.8332664257995448e-08, "loss": 0.0002, "reward": 0.1217675432562828, "reward_std": 0.5977188646793365, "rewards/reward_func": 0.1217675432562828, "step": 7336 }, { "completion_length": 215.9609375, "epoch": 0.9827378562826175, "grad_norm": 2.90625, "kl": 0.0035596474481280893, "learning_rate": 1.7262143717382576e-08, "loss": 0.0001, "reward": 0.14845915883779526, "reward_std": 0.5360017623752356, "rewards/reward_func": 0.14845915883779526, "step": 7344 }, { "completion_length": 219.90625, "epoch": 0.9838083768232303, "grad_norm": 4.5, "kl": 0.00427751979441382, "learning_rate": 1.6191623176769704e-08, "loss": 0.0002, "reward": -0.1681067142635584, "reward_std": 0.5721786804497242, "rewards/reward_func": -0.1681067142635584, "step": 7352 }, { "completion_length": 175.6953125, "epoch": 0.9848788973638432, "grad_norm": 3.890625, "kl": 0.0041801958286669105, "learning_rate": 1.5121102636156832e-08, "loss": 0.0002, "reward": 0.06939083803445101, "reward_std": 0.6783208139240742, "rewards/reward_func": 0.06939083803445101, "step": 7360 }, { "completion_length": 144.1875, "epoch": 0.985949417904456, "grad_norm": 3.828125, "kl": 0.004889452655334026, "learning_rate": 1.4050582095543959e-08, "loss": 0.0002, "reward": 0.4226034879684448, "reward_std": 0.5269232532009482, "rewards/reward_func": 0.4226034879684448, "step": 7368 }, { "completion_length": 163.59375, "epoch": 0.987019938445069, "grad_norm": 4.375, "kl": 0.004188012710073963, "learning_rate": 1.2980061554931083e-08, "loss": 0.0002, "reward": 0.39765281416475773, "reward_std": 0.6691529527306557, "rewards/reward_func": 0.39765281416475773, "step": 7376 }, { "completion_length": 171.4765625, "epoch": 0.9880904589856818, "grad_norm": 4.34375, "kl": 0.004243878676788881, "learning_rate": 1.1909541014318212e-08, "loss": 0.0002, "reward": 0.02129072230309248, "reward_std": 0.5695307403802872, "rewards/reward_func": 0.02129072230309248, "step": 7384 }, { "completion_length": 174.0625, "epoch": 0.9891609795262947, "grad_norm": 6.34375, "kl": 0.004543175353319384, "learning_rate": 1.0839020473705338e-08, "loss": 0.0002, "reward": 0.19244904909282923, "reward_std": 0.6081365495920181, "rewards/reward_func": 0.19244904909282923, "step": 7392 }, { "completion_length": 168.640625, "epoch": 0.9902315000669075, "grad_norm": 4.09375, "kl": 0.004472158325370401, "learning_rate": 9.768499933092466e-09, "loss": 0.0002, "reward": 0.23385661654174328, "reward_std": 0.5880191251635551, "rewards/reward_func": 0.23385661654174328, "step": 7400 }, { "completion_length": 167.7265625, "epoch": 0.9913020206075204, "grad_norm": 3.703125, "kl": 0.004636053316062316, "learning_rate": 8.697979392479593e-09, "loss": 0.0002, "reward": 0.1343773351982236, "reward_std": 0.4601830244064331, "rewards/reward_func": 0.1343773351982236, "step": 7408 }, { "completion_length": 181.546875, "epoch": 0.9923725411481332, "grad_norm": 3.515625, "kl": 0.004200820723781362, "learning_rate": 7.627458851866721e-09, "loss": 0.0002, "reward": 0.4208897929638624, "reward_std": 0.5252015050500631, "rewards/reward_func": 0.4208897929638624, "step": 7416 }, { "completion_length": 188.3671875, "epoch": 0.9934430616887462, "grad_norm": 3.90625, "kl": 0.00420898012816906, "learning_rate": 6.5569383112538474e-09, "loss": 0.0002, "reward": 0.01703132875263691, "reward_std": 0.659230999648571, "rewards/reward_func": 0.01703132875263691, "step": 7424 }, { "completion_length": 180.5546875, "epoch": 0.994513582229359, "grad_norm": 3.203125, "kl": 0.004538079345365986, "learning_rate": 5.486417770640974e-09, "loss": 0.0002, "reward": 0.2158992402255535, "reward_std": 0.6045026630163193, "rewards/reward_func": 0.2158992402255535, "step": 7432 }, { "completion_length": 162.9453125, "epoch": 0.9955841027699719, "grad_norm": 3.734375, "kl": 0.004193991771899164, "learning_rate": 4.4158972300281005e-09, "loss": 0.0002, "reward": 0.1715514180250466, "reward_std": 0.6823503784835339, "rewards/reward_func": 0.1715514180250466, "step": 7440 }, { "completion_length": 173.65625, "epoch": 0.9966546233105847, "grad_norm": 6.21875, "kl": 0.004881884902715683, "learning_rate": 3.345376689415228e-09, "loss": 0.0002, "reward": 0.003267081454396248, "reward_std": 0.6868100538849831, "rewards/reward_func": 0.003267081454396248, "step": 7448 }, { "completion_length": 141.2421875, "epoch": 0.9977251438511976, "grad_norm": 4.53125, "kl": 0.005532538751140237, "learning_rate": 2.2748561488023547e-09, "loss": 0.0002, "reward": 0.541567288339138, "reward_std": 0.31684359908103943, "rewards/reward_func": 0.541567288339138, "step": 7456 }, { "completion_length": 157.5546875, "epoch": 0.9987956643918106, "grad_norm": 3.953125, "kl": 0.004744258592836559, "learning_rate": 1.2043356081894823e-09, "loss": 0.0002, "reward": 0.255828570574522, "reward_std": 0.6103123240172863, "rewards/reward_func": 0.255828570574522, "step": 7464 }, { "completion_length": 151.453125, "epoch": 0.9998661849324234, "grad_norm": 3.484375, "kl": 0.004390858637634665, "learning_rate": 1.338150675766091e-10, "loss": 0.0002, "reward": 0.26983874663710594, "reward_std": 0.5870513431727886, "rewards/reward_func": 0.26983874663710594, "step": 7472 } ], "logging_steps": 8, "max_steps": 7473, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1868, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }