| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 7473, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 196.4453125, |
| "epoch": 0.001070520540612873, |
| "grad_norm": 5.03125, |
| "kl": 0.00023896918719401583, |
| "learning_rate": 9.98929479459387e-07, |
| "loss": 0.0, |
| "reward": 0.06715917773544788, |
| "reward_std": 0.6129379905760288, |
| "rewards/reward_func": 0.06715917773544788, |
| "step": 8 |
| }, |
| { |
| "completion_length": 177.9296875, |
| "epoch": 0.002141041081225746, |
| "grad_norm": 4.15625, |
| "kl": 0.00027647913702821825, |
| "learning_rate": 9.978589589187743e-07, |
| "loss": 0.0, |
| "reward": 0.02951172273606062, |
| "reward_std": 0.5923765227198601, |
| "rewards/reward_func": 0.02951172273606062, |
| "step": 16 |
| }, |
| { |
| "completion_length": 173.359375, |
| "epoch": 0.003211561621838619, |
| "grad_norm": 3.34375, |
| "kl": 0.0002405872492090566, |
| "learning_rate": 9.967884383781614e-07, |
| "loss": 0.0, |
| "reward": 0.12952731922268867, |
| "reward_std": 0.6536133792251348, |
| "rewards/reward_func": 0.12952731922268867, |
| "step": 24 |
| }, |
| { |
| "completion_length": 170.0859375, |
| "epoch": 0.004282082162451492, |
| "grad_norm": 3.21875, |
| "kl": 0.0002442408094793791, |
| "learning_rate": 9.957179178375484e-07, |
| "loss": 0.0, |
| "reward": 0.3293330520391464, |
| "reward_std": 0.5732803735882044, |
| "rewards/reward_func": 0.3293330520391464, |
| "step": 32 |
| }, |
| { |
| "completion_length": 190.3828125, |
| "epoch": 0.005352602703064365, |
| "grad_norm": 4.15625, |
| "kl": 0.00024058844792307355, |
| "learning_rate": 9.946473972969355e-07, |
| "loss": 0.0, |
| "reward": 0.36058899760246277, |
| "reward_std": 0.5922529064118862, |
| "rewards/reward_func": 0.36058899760246277, |
| "step": 40 |
| }, |
| { |
| "completion_length": 228.4453125, |
| "epoch": 0.006423123243677238, |
| "grad_norm": 2.765625, |
| "kl": 0.00029027340133325197, |
| "learning_rate": 9.935768767563228e-07, |
| "loss": 0.0, |
| "reward": 0.026657558977603912, |
| "reward_std": 0.48614570777863264, |
| "rewards/reward_func": 0.026657558977603912, |
| "step": 48 |
| }, |
| { |
| "completion_length": 182.453125, |
| "epoch": 0.007493643784290111, |
| "grad_norm": 4.40625, |
| "kl": 0.00024143315386027098, |
| "learning_rate": 9.925063562157099e-07, |
| "loss": 0.0, |
| "reward": 0.3887675404548645, |
| "reward_std": 0.53492078371346, |
| "rewards/reward_func": 0.3887675404548645, |
| "step": 56 |
| }, |
| { |
| "completion_length": 223.5703125, |
| "epoch": 0.008564164324902984, |
| "grad_norm": 2.578125, |
| "kl": 0.000258006857620785, |
| "learning_rate": 9.91435835675097e-07, |
| "loss": 0.0, |
| "reward": 0.31422215048223734, |
| "reward_std": 0.527774453163147, |
| "rewards/reward_func": 0.31422215048223734, |
| "step": 64 |
| }, |
| { |
| "completion_length": 207.1953125, |
| "epoch": 0.009634684865515858, |
| "grad_norm": 4.03125, |
| "kl": 0.0002873431112675462, |
| "learning_rate": 9.90365315134484e-07, |
| "loss": 0.0, |
| "reward": 0.23160290531814098, |
| "reward_std": 0.5312692299485207, |
| "rewards/reward_func": 0.23160290531814098, |
| "step": 72 |
| }, |
| { |
| "completion_length": 170.234375, |
| "epoch": 0.01070520540612873, |
| "grad_norm": 5.09375, |
| "kl": 0.0002804081450449303, |
| "learning_rate": 9.892947945938713e-07, |
| "loss": 0.0, |
| "reward": 0.2816220009699464, |
| "reward_std": 0.600088307633996, |
| "rewards/reward_func": 0.2816220009699464, |
| "step": 80 |
| }, |
| { |
| "completion_length": 175.84375, |
| "epoch": 0.011775725946741603, |
| "grad_norm": 4.125, |
| "kl": 0.00024894280068110675, |
| "learning_rate": 9.882242740532583e-07, |
| "loss": 0.0, |
| "reward": 0.42427181545645, |
| "reward_std": 0.5529402792453766, |
| "rewards/reward_func": 0.42427181545645, |
| "step": 88 |
| }, |
| { |
| "completion_length": 258.6484375, |
| "epoch": 0.012846246487354477, |
| "grad_norm": 4.03125, |
| "kl": 0.00030983560463937465, |
| "learning_rate": 9.871537535126454e-07, |
| "loss": 0.0, |
| "reward": -0.342121371999383, |
| "reward_std": 0.5698626078665257, |
| "rewards/reward_func": -0.342121371999383, |
| "step": 96 |
| }, |
| { |
| "completion_length": 225.90625, |
| "epoch": 0.013916767027967349, |
| "grad_norm": 3.484375, |
| "kl": 0.0002933137984655332, |
| "learning_rate": 9.860832329720325e-07, |
| "loss": 0.0, |
| "reward": 0.12576067401096225, |
| "reward_std": 0.6368702314794064, |
| "rewards/reward_func": 0.12576067401096225, |
| "step": 104 |
| }, |
| { |
| "completion_length": 168.5703125, |
| "epoch": 0.014987287568580221, |
| "grad_norm": 3.765625, |
| "kl": 0.0003037826609215699, |
| "learning_rate": 9.850127124314198e-07, |
| "loss": 0.0, |
| "reward": 0.17080755531787872, |
| "reward_std": 0.5442508868873119, |
| "rewards/reward_func": 0.17080755531787872, |
| "step": 112 |
| }, |
| { |
| "completion_length": 179.1640625, |
| "epoch": 0.016057808109193095, |
| "grad_norm": 3.390625, |
| "kl": 0.0002824857710947981, |
| "learning_rate": 9.839421918908068e-07, |
| "loss": 0.0, |
| "reward": 0.18145466595888138, |
| "reward_std": 0.5059491451829672, |
| "rewards/reward_func": 0.18145466595888138, |
| "step": 120 |
| }, |
| { |
| "completion_length": 191.78125, |
| "epoch": 0.017128328649805968, |
| "grad_norm": 3.75, |
| "kl": 0.00031089498952496797, |
| "learning_rate": 9.82871671350194e-07, |
| "loss": 0.0, |
| "reward": 0.2814232921227813, |
| "reward_std": 0.5498775225132704, |
| "rewards/reward_func": 0.2814232921227813, |
| "step": 128 |
| }, |
| { |
| "completion_length": 154.2578125, |
| "epoch": 0.01819884919041884, |
| "grad_norm": 3.140625, |
| "kl": 0.0003297478033346124, |
| "learning_rate": 9.818011508095812e-07, |
| "loss": 0.0, |
| "reward": 0.46491820737719536, |
| "reward_std": 0.48998717963695526, |
| "rewards/reward_func": 0.46491820737719536, |
| "step": 136 |
| }, |
| { |
| "completion_length": 180.1171875, |
| "epoch": 0.019269369731031716, |
| "grad_norm": 4.53125, |
| "kl": 0.00035076765561825596, |
| "learning_rate": 9.807306302689682e-07, |
| "loss": 0.0, |
| "reward": 0.27495551854372025, |
| "reward_std": 0.6060709934681654, |
| "rewards/reward_func": 0.27495551854372025, |
| "step": 144 |
| }, |
| { |
| "completion_length": 219.390625, |
| "epoch": 0.020339890271644588, |
| "grad_norm": 3.203125, |
| "kl": 0.00032744045893196017, |
| "learning_rate": 9.796601097283553e-07, |
| "loss": 0.0, |
| "reward": -0.04555501043796539, |
| "reward_std": 0.557650757022202, |
| "rewards/reward_func": -0.04555501043796539, |
| "step": 152 |
| }, |
| { |
| "completion_length": 197.6875, |
| "epoch": 0.02141041081225746, |
| "grad_norm": 4.1875, |
| "kl": 0.0003747515474969987, |
| "learning_rate": 9.785895891877424e-07, |
| "loss": 0.0, |
| "reward": 0.04799163248389959, |
| "reward_std": 0.7354221493005753, |
| "rewards/reward_func": 0.04799163248389959, |
| "step": 160 |
| }, |
| { |
| "completion_length": 158.921875, |
| "epoch": 0.022480931352870333, |
| "grad_norm": 4.71875, |
| "kl": 0.00034361303187324665, |
| "learning_rate": 9.775190686471297e-07, |
| "loss": 0.0, |
| "reward": 0.46762343868613243, |
| "reward_std": 0.5215234383940697, |
| "rewards/reward_func": 0.46762343868613243, |
| "step": 168 |
| }, |
| { |
| "completion_length": 179.4296875, |
| "epoch": 0.023551451893483205, |
| "grad_norm": 2.390625, |
| "kl": 0.0003124878894595895, |
| "learning_rate": 9.764485481065167e-07, |
| "loss": 0.0, |
| "reward": 0.15805792342871428, |
| "reward_std": 0.6787187531590462, |
| "rewards/reward_func": 0.15805792342871428, |
| "step": 176 |
| }, |
| { |
| "completion_length": 185.5546875, |
| "epoch": 0.02462197243409608, |
| "grad_norm": 3.078125, |
| "kl": 0.0003605757010518573, |
| "learning_rate": 9.75378027565904e-07, |
| "loss": 0.0, |
| "reward": 0.30135733261704445, |
| "reward_std": 0.4861781559884548, |
| "rewards/reward_func": 0.30135733261704445, |
| "step": 184 |
| }, |
| { |
| "completion_length": 176.8984375, |
| "epoch": 0.025692492974708953, |
| "grad_norm": 2.671875, |
| "kl": 0.0003968792916566599, |
| "learning_rate": 9.743075070252909e-07, |
| "loss": 0.0, |
| "reward": 0.18422270519658923, |
| "reward_std": 0.5276681184768677, |
| "rewards/reward_func": 0.18422270519658923, |
| "step": 192 |
| }, |
| { |
| "completion_length": 196.234375, |
| "epoch": 0.026763013515321826, |
| "grad_norm": 3.8125, |
| "kl": 0.0003164229347021319, |
| "learning_rate": 9.732369864846782e-07, |
| "loss": 0.0, |
| "reward": 0.09587159566581249, |
| "reward_std": 0.5885980241000652, |
| "rewards/reward_func": 0.09587159566581249, |
| "step": 200 |
| }, |
| { |
| "completion_length": 187.15625, |
| "epoch": 0.027833534055934698, |
| "grad_norm": 5.65625, |
| "kl": 0.00039002683661237825, |
| "learning_rate": 9.721664659440652e-07, |
| "loss": 0.0, |
| "reward": 0.19378361385315657, |
| "reward_std": 0.439416766166687, |
| "rewards/reward_func": 0.19378361385315657, |
| "step": 208 |
| }, |
| { |
| "completion_length": 186.359375, |
| "epoch": 0.02890405459654757, |
| "grad_norm": 4.15625, |
| "kl": 0.0004315389560360927, |
| "learning_rate": 9.710959454034525e-07, |
| "loss": 0.0, |
| "reward": 0.2533123311586678, |
| "reward_std": 0.6090654768049717, |
| "rewards/reward_func": 0.2533123311586678, |
| "step": 216 |
| }, |
| { |
| "completion_length": 206.7265625, |
| "epoch": 0.029974575137160443, |
| "grad_norm": 3.8125, |
| "kl": 0.00036658769749919884, |
| "learning_rate": 9.700254248628396e-07, |
| "loss": 0.0, |
| "reward": 0.11443387717008591, |
| "reward_std": 0.6023523053154349, |
| "rewards/reward_func": 0.11443387717008591, |
| "step": 224 |
| }, |
| { |
| "completion_length": 189.0703125, |
| "epoch": 0.03104509567777332, |
| "grad_norm": 4.78125, |
| "kl": 0.00041832886927295476, |
| "learning_rate": 9.689549043222266e-07, |
| "loss": 0.0, |
| "reward": -0.07620369084179401, |
| "reward_std": 0.6309537254273891, |
| "rewards/reward_func": -0.07620369084179401, |
| "step": 232 |
| }, |
| { |
| "completion_length": 193.34375, |
| "epoch": 0.03211561621838619, |
| "grad_norm": 5.03125, |
| "kl": 0.00045970915380166844, |
| "learning_rate": 9.678843837816137e-07, |
| "loss": 0.0, |
| "reward": 0.13946556020528078, |
| "reward_std": 0.5310352686792612, |
| "rewards/reward_func": 0.13946556020528078, |
| "step": 240 |
| }, |
| { |
| "completion_length": 198.046875, |
| "epoch": 0.03318613675899906, |
| "grad_norm": 4.34375, |
| "kl": 0.00045460829278454185, |
| "learning_rate": 9.66813863241001e-07, |
| "loss": 0.0, |
| "reward": 0.20321442000567913, |
| "reward_std": 0.7352660372853279, |
| "rewards/reward_func": 0.20321442000567913, |
| "step": 248 |
| }, |
| { |
| "completion_length": 197.2265625, |
| "epoch": 0.034256657299611935, |
| "grad_norm": 3.21875, |
| "kl": 0.0004281356050341856, |
| "learning_rate": 9.65743342700388e-07, |
| "loss": 0.0, |
| "reward": 0.20208348147571087, |
| "reward_std": 0.5523553621023893, |
| "rewards/reward_func": 0.20208348147571087, |
| "step": 256 |
| }, |
| { |
| "completion_length": 199.7109375, |
| "epoch": 0.03532717784022481, |
| "grad_norm": 5.15625, |
| "kl": 0.000508931974763982, |
| "learning_rate": 9.646728221597751e-07, |
| "loss": 0.0, |
| "reward": 0.15087968483567238, |
| "reward_std": 0.6751584373414516, |
| "rewards/reward_func": 0.15087968483567238, |
| "step": 264 |
| }, |
| { |
| "completion_length": 181.7578125, |
| "epoch": 0.03639769838083768, |
| "grad_norm": 3.109375, |
| "kl": 0.0004673556577472482, |
| "learning_rate": 9.636023016191622e-07, |
| "loss": 0.0, |
| "reward": 0.4762549586594105, |
| "reward_std": 0.5292778257280588, |
| "rewards/reward_func": 0.4762549586594105, |
| "step": 272 |
| }, |
| { |
| "completion_length": 172.3046875, |
| "epoch": 0.03746821892145055, |
| "grad_norm": 3.515625, |
| "kl": 0.00042566236152197234, |
| "learning_rate": 9.625317810785495e-07, |
| "loss": 0.0, |
| "reward": 0.36441371217370033, |
| "reward_std": 0.4376997593790293, |
| "rewards/reward_func": 0.36441371217370033, |
| "step": 280 |
| }, |
| { |
| "completion_length": 174.234375, |
| "epoch": 0.03853873946206343, |
| "grad_norm": 4.875, |
| "kl": 0.0004668605834012851, |
| "learning_rate": 9.614612605379365e-07, |
| "loss": 0.0, |
| "reward": 0.17927304655313492, |
| "reward_std": 0.5315880142152309, |
| "rewards/reward_func": 0.17927304655313492, |
| "step": 288 |
| }, |
| { |
| "completion_length": 152.8671875, |
| "epoch": 0.039609260002676304, |
| "grad_norm": 3.078125, |
| "kl": 0.0005132910300744697, |
| "learning_rate": 9.603907399973236e-07, |
| "loss": 0.0, |
| "reward": 0.5605849623680115, |
| "reward_std": 0.5153817608952522, |
| "rewards/reward_func": 0.5605849623680115, |
| "step": 296 |
| }, |
| { |
| "completion_length": 203.5390625, |
| "epoch": 0.040679780543289176, |
| "grad_norm": 3.796875, |
| "kl": 0.00048365409747930244, |
| "learning_rate": 9.593202194567109e-07, |
| "loss": 0.0, |
| "reward": -0.10374991549178958, |
| "reward_std": 0.5484482925385237, |
| "rewards/reward_func": -0.10374991549178958, |
| "step": 304 |
| }, |
| { |
| "completion_length": 203.1015625, |
| "epoch": 0.04175030108390205, |
| "grad_norm": 3.28125, |
| "kl": 0.0005178198443900328, |
| "learning_rate": 9.58249698916098e-07, |
| "loss": 0.0, |
| "reward": 0.15655188029631972, |
| "reward_std": 0.6044143028557301, |
| "rewards/reward_func": 0.15655188029631972, |
| "step": 312 |
| }, |
| { |
| "completion_length": 193.34375, |
| "epoch": 0.04282082162451492, |
| "grad_norm": 3.34375, |
| "kl": 0.0005464391106215771, |
| "learning_rate": 9.57179178375485e-07, |
| "loss": 0.0, |
| "reward": 0.20653630187734962, |
| "reward_std": 0.637122736312449, |
| "rewards/reward_func": 0.20653630187734962, |
| "step": 320 |
| }, |
| { |
| "completion_length": 182.390625, |
| "epoch": 0.04389134216512779, |
| "grad_norm": 3.953125, |
| "kl": 0.0005447999064926989, |
| "learning_rate": 9.56108657834872e-07, |
| "loss": 0.0, |
| "reward": 0.0629437193274498, |
| "reward_std": 0.6482261158525944, |
| "rewards/reward_func": 0.0629437193274498, |
| "step": 328 |
| }, |
| { |
| "completion_length": 210.1875, |
| "epoch": 0.044961862705740666, |
| "grad_norm": 3.796875, |
| "kl": 0.0005637894355459139, |
| "learning_rate": 9.550381372942594e-07, |
| "loss": 0.0, |
| "reward": 0.005680203437805176, |
| "reward_std": 0.5875861989334226, |
| "rewards/reward_func": 0.005680203437805176, |
| "step": 336 |
| }, |
| { |
| "completion_length": 179.8671875, |
| "epoch": 0.04603238324635354, |
| "grad_norm": 3.625, |
| "kl": 0.00048511310160392895, |
| "learning_rate": 9.539676167536464e-07, |
| "loss": 0.0, |
| "reward": 0.41209501400589943, |
| "reward_std": 0.49328203592449427, |
| "rewards/reward_func": 0.41209501400589943, |
| "step": 344 |
| }, |
| { |
| "completion_length": 223.6875, |
| "epoch": 0.04710290378696641, |
| "grad_norm": 3.765625, |
| "kl": 0.0005501342748175375, |
| "learning_rate": 9.528970962130335e-07, |
| "loss": 0.0, |
| "reward": -0.01573021337389946, |
| "reward_std": 0.6180381271988153, |
| "rewards/reward_func": -0.01573021337389946, |
| "step": 352 |
| }, |
| { |
| "completion_length": 179.4609375, |
| "epoch": 0.04817342432757928, |
| "grad_norm": 4.1875, |
| "kl": 0.0005429566881502979, |
| "learning_rate": 9.518265756724207e-07, |
| "loss": 0.0, |
| "reward": 0.3368812333792448, |
| "reward_std": 0.42025264725089073, |
| "rewards/reward_func": 0.3368812333792448, |
| "step": 360 |
| }, |
| { |
| "completion_length": 217.0234375, |
| "epoch": 0.04924394486819216, |
| "grad_norm": 3.640625, |
| "kl": 0.0005859209413756616, |
| "learning_rate": 9.507560551318078e-07, |
| "loss": 0.0, |
| "reward": 0.07761704362928867, |
| "reward_std": 0.539461400359869, |
| "rewards/reward_func": 0.07761704362928867, |
| "step": 368 |
| }, |
| { |
| "completion_length": 184.6484375, |
| "epoch": 0.050314465408805034, |
| "grad_norm": 3.46875, |
| "kl": 0.0006783130666008219, |
| "learning_rate": 9.496855345911949e-07, |
| "loss": 0.0, |
| "reward": 0.3878909517079592, |
| "reward_std": 0.5879320036619902, |
| "rewards/reward_func": 0.3878909517079592, |
| "step": 376 |
| }, |
| { |
| "completion_length": 237.3125, |
| "epoch": 0.051384985949417906, |
| "grad_norm": 2.828125, |
| "kl": 0.0005360892773751402, |
| "learning_rate": 9.486150140505821e-07, |
| "loss": 0.0, |
| "reward": 0.2408028580248356, |
| "reward_std": 0.6049665845930576, |
| "rewards/reward_func": 0.2408028580248356, |
| "step": 384 |
| }, |
| { |
| "completion_length": 181.765625, |
| "epoch": 0.05245550649003078, |
| "grad_norm": 4.1875, |
| "kl": 0.0007057523835101165, |
| "learning_rate": 9.475444935099693e-07, |
| "loss": 0.0, |
| "reward": 0.1389563176780939, |
| "reward_std": 0.5876767132431269, |
| "rewards/reward_func": 0.1389563176780939, |
| "step": 392 |
| }, |
| { |
| "completion_length": 167.578125, |
| "epoch": 0.05352602703064365, |
| "grad_norm": 3.859375, |
| "kl": 0.0006887019626447, |
| "learning_rate": 9.464739729693562e-07, |
| "loss": 0.0, |
| "reward": 0.6259515974670649, |
| "reward_std": 0.3476364128291607, |
| "rewards/reward_func": 0.6259515974670649, |
| "step": 400 |
| }, |
| { |
| "completion_length": 197.25, |
| "epoch": 0.05459654757125652, |
| "grad_norm": 3.03125, |
| "kl": 0.0006218861890374683, |
| "learning_rate": 9.454034524287434e-07, |
| "loss": 0.0, |
| "reward": 0.05700792092829943, |
| "reward_std": 0.428726595826447, |
| "rewards/reward_func": 0.05700792092829943, |
| "step": 408 |
| }, |
| { |
| "completion_length": 185.859375, |
| "epoch": 0.055667068111869396, |
| "grad_norm": 3.390625, |
| "kl": 0.0005967724355286919, |
| "learning_rate": 9.443329318881306e-07, |
| "loss": 0.0, |
| "reward": 0.2252086065709591, |
| "reward_std": 0.5162075459957123, |
| "rewards/reward_func": 0.2252086065709591, |
| "step": 416 |
| }, |
| { |
| "completion_length": 162.9296875, |
| "epoch": 0.05673758865248227, |
| "grad_norm": 4.28125, |
| "kl": 0.0008201822929549962, |
| "learning_rate": 9.432624113475178e-07, |
| "loss": 0.0, |
| "reward": 0.322255807928741, |
| "reward_std": 0.6453814581036568, |
| "rewards/reward_func": 0.322255807928741, |
| "step": 424 |
| }, |
| { |
| "completion_length": 211.1015625, |
| "epoch": 0.05780810919309514, |
| "grad_norm": 4.0, |
| "kl": 0.0006929989831405692, |
| "learning_rate": 9.421918908069048e-07, |
| "loss": 0.0, |
| "reward": 0.015094950795173645, |
| "reward_std": 0.5742807984352112, |
| "rewards/reward_func": 0.015094950795173645, |
| "step": 432 |
| }, |
| { |
| "completion_length": 190.5625, |
| "epoch": 0.05887862973370801, |
| "grad_norm": 2.421875, |
| "kl": 0.0007785350171616301, |
| "learning_rate": 9.411213702662919e-07, |
| "loss": 0.0, |
| "reward": 0.2807863000780344, |
| "reward_std": 0.38556696847081184, |
| "rewards/reward_func": 0.2807863000780344, |
| "step": 440 |
| }, |
| { |
| "completion_length": 205.90625, |
| "epoch": 0.059949150274320885, |
| "grad_norm": 3.34375, |
| "kl": 0.000683286452840548, |
| "learning_rate": 9.400508497256791e-07, |
| "loss": 0.0, |
| "reward": 0.17150266654789448, |
| "reward_std": 0.6842659376561642, |
| "rewards/reward_func": 0.17150266654789448, |
| "step": 448 |
| }, |
| { |
| "completion_length": 170.46875, |
| "epoch": 0.061019670814933764, |
| "grad_norm": 4.84375, |
| "kl": 0.0007365654601017013, |
| "learning_rate": 9.389803291850661e-07, |
| "loss": 0.0, |
| "reward": 0.28804031014442444, |
| "reward_std": 0.5351038463413715, |
| "rewards/reward_func": 0.28804031014442444, |
| "step": 456 |
| }, |
| { |
| "completion_length": 159.1875, |
| "epoch": 0.06209019135554664, |
| "grad_norm": 5.8125, |
| "kl": 0.0008180349177564494, |
| "learning_rate": 9.379098086444533e-07, |
| "loss": 0.0, |
| "reward": 0.3410092554986477, |
| "reward_std": 0.652816615998745, |
| "rewards/reward_func": 0.3410092554986477, |
| "step": 464 |
| }, |
| { |
| "completion_length": 164.8125, |
| "epoch": 0.0631607118961595, |
| "grad_norm": 5.21875, |
| "kl": 0.0009274725816794671, |
| "learning_rate": 9.368392881038405e-07, |
| "loss": 0.0, |
| "reward": 0.42266903538256884, |
| "reward_std": 0.5641947891563177, |
| "rewards/reward_func": 0.42266903538256884, |
| "step": 472 |
| }, |
| { |
| "completion_length": 194.984375, |
| "epoch": 0.06423123243677238, |
| "grad_norm": 3.078125, |
| "kl": 0.0007724944334768225, |
| "learning_rate": 9.357687675632276e-07, |
| "loss": 0.0, |
| "reward": 0.14429602678865194, |
| "reward_std": 0.6491441205143929, |
| "rewards/reward_func": 0.14429602678865194, |
| "step": 480 |
| }, |
| { |
| "completion_length": 162.75, |
| "epoch": 0.06530175297738525, |
| "grad_norm": 3.0625, |
| "kl": 0.0008644247645861469, |
| "learning_rate": 9.346982470226146e-07, |
| "loss": 0.0, |
| "reward": 0.4321166332811117, |
| "reward_std": 0.595779299736023, |
| "rewards/reward_func": 0.4321166332811117, |
| "step": 488 |
| }, |
| { |
| "completion_length": 206.21875, |
| "epoch": 0.06637227351799813, |
| "grad_norm": 5.3125, |
| "kl": 0.0008183796162484214, |
| "learning_rate": 9.336277264820018e-07, |
| "loss": 0.0, |
| "reward": 0.15042403992265463, |
| "reward_std": 0.6641352027654648, |
| "rewards/reward_func": 0.15042403992265463, |
| "step": 496 |
| }, |
| { |
| "completion_length": 169.53125, |
| "epoch": 0.067442794058611, |
| "grad_norm": 3.359375, |
| "kl": 0.0007982932875165716, |
| "learning_rate": 9.32557205941389e-07, |
| "loss": 0.0, |
| "reward": 0.5449392115697265, |
| "reward_std": 0.4078510096296668, |
| "rewards/reward_func": 0.5449392115697265, |
| "step": 504 |
| }, |
| { |
| "completion_length": 217.1328125, |
| "epoch": 0.06851331459922387, |
| "grad_norm": 4.46875, |
| "kl": 0.0007571115165774245, |
| "learning_rate": 9.314866854007762e-07, |
| "loss": 0.0, |
| "reward": -0.14380409568548203, |
| "reward_std": 0.42090473882853985, |
| "rewards/reward_func": -0.14380409568548203, |
| "step": 512 |
| }, |
| { |
| "completion_length": 188.7578125, |
| "epoch": 0.06958383513983675, |
| "grad_norm": 3.96875, |
| "kl": 0.0009429744168301113, |
| "learning_rate": 9.304161648601631e-07, |
| "loss": 0.0, |
| "reward": 0.2523565851151943, |
| "reward_std": 0.5918965879827738, |
| "rewards/reward_func": 0.2523565851151943, |
| "step": 520 |
| }, |
| { |
| "completion_length": 166.8671875, |
| "epoch": 0.07065435568044962, |
| "grad_norm": 3.796875, |
| "kl": 0.0008449588422081433, |
| "learning_rate": 9.293456443195503e-07, |
| "loss": 0.0, |
| "reward": 0.23421072773635387, |
| "reward_std": 0.5638821180909872, |
| "rewards/reward_func": 0.23421072773635387, |
| "step": 528 |
| }, |
| { |
| "completion_length": 162.359375, |
| "epoch": 0.0717248762210625, |
| "grad_norm": 4.03125, |
| "kl": 0.0010499468044145033, |
| "learning_rate": 9.282751237789375e-07, |
| "loss": 0.0, |
| "reward": 0.14197909273207188, |
| "reward_std": 0.5628513153642416, |
| "rewards/reward_func": 0.14197909273207188, |
| "step": 536 |
| }, |
| { |
| "completion_length": 149.3203125, |
| "epoch": 0.07279539676167536, |
| "grad_norm": 4.3125, |
| "kl": 0.0009130838298005983, |
| "learning_rate": 9.272046032383246e-07, |
| "loss": 0.0, |
| "reward": 0.42748846486210823, |
| "reward_std": 0.4888880178332329, |
| "rewards/reward_func": 0.42748846486210823, |
| "step": 544 |
| }, |
| { |
| "completion_length": 168.25, |
| "epoch": 0.07386591730228824, |
| "grad_norm": 3.09375, |
| "kl": 0.00098653764143819, |
| "learning_rate": 9.261340826977117e-07, |
| "loss": 0.0, |
| "reward": 0.32451459113508463, |
| "reward_std": 0.5782719142735004, |
| "rewards/reward_func": 0.32451459113508463, |
| "step": 552 |
| }, |
| { |
| "completion_length": 186.59375, |
| "epoch": 0.0749364378429011, |
| "grad_norm": 4.34375, |
| "kl": 0.0009752021069289185, |
| "learning_rate": 9.250635621570988e-07, |
| "loss": 0.0, |
| "reward": 0.20521394163370132, |
| "reward_std": 0.5094065079465508, |
| "rewards/reward_func": 0.20521394163370132, |
| "step": 560 |
| }, |
| { |
| "completion_length": 175.78125, |
| "epoch": 0.07600695838351398, |
| "grad_norm": 4.25, |
| "kl": 0.0010090179930557497, |
| "learning_rate": 9.23993041616486e-07, |
| "loss": 0.0, |
| "reward": 0.5578707046806812, |
| "reward_std": 0.4845643825829029, |
| "rewards/reward_func": 0.5578707046806812, |
| "step": 568 |
| }, |
| { |
| "completion_length": 158.234375, |
| "epoch": 0.07707747892412686, |
| "grad_norm": 4.6875, |
| "kl": 0.0011502801644382998, |
| "learning_rate": 9.229225210758731e-07, |
| "loss": 0.0, |
| "reward": 0.4859929271042347, |
| "reward_std": 0.5507038980722427, |
| "rewards/reward_func": 0.4859929271042347, |
| "step": 576 |
| }, |
| { |
| "completion_length": 200.0234375, |
| "epoch": 0.07814799946473973, |
| "grad_norm": 3.9375, |
| "kl": 0.0011561861392692663, |
| "learning_rate": 9.218520005352602e-07, |
| "loss": 0.0, |
| "reward": 0.16512918565422297, |
| "reward_std": 0.5027751969173551, |
| "rewards/reward_func": 0.16512918565422297, |
| "step": 584 |
| }, |
| { |
| "completion_length": 158.3515625, |
| "epoch": 0.07921852000535261, |
| "grad_norm": 3.390625, |
| "kl": 0.0012836234309361316, |
| "learning_rate": 9.207814799946474e-07, |
| "loss": 0.0001, |
| "reward": 0.4137600362300873, |
| "reward_std": 0.5193404145538807, |
| "rewards/reward_func": 0.4137600362300873, |
| "step": 592 |
| }, |
| { |
| "completion_length": 162.8828125, |
| "epoch": 0.08028904054596547, |
| "grad_norm": 2.125, |
| "kl": 0.0011869178197230212, |
| "learning_rate": 9.197109594540344e-07, |
| "loss": 0.0, |
| "reward": 0.39771614968776703, |
| "reward_std": 0.6107706986367702, |
| "rewards/reward_func": 0.39771614968776703, |
| "step": 600 |
| }, |
| { |
| "completion_length": 176.65625, |
| "epoch": 0.08135956108657835, |
| "grad_norm": 4.78125, |
| "kl": 0.0011795180544140749, |
| "learning_rate": 9.186404389134216e-07, |
| "loss": 0.0, |
| "reward": 0.0783949107863009, |
| "reward_std": 0.6460004411637783, |
| "rewards/reward_func": 0.0783949107863009, |
| "step": 608 |
| }, |
| { |
| "completion_length": 158.5390625, |
| "epoch": 0.08243008162719122, |
| "grad_norm": 3.59375, |
| "kl": 0.0013605851854663342, |
| "learning_rate": 9.175699183728087e-07, |
| "loss": 0.0001, |
| "reward": 0.3015612084418535, |
| "reward_std": 0.46242015063762665, |
| "rewards/reward_func": 0.3015612084418535, |
| "step": 616 |
| }, |
| { |
| "completion_length": 192.0, |
| "epoch": 0.0835006021678041, |
| "grad_norm": 6.0625, |
| "kl": 0.001107029449485708, |
| "learning_rate": 9.164993978321959e-07, |
| "loss": 0.0, |
| "reward": -0.052582718431949615, |
| "reward_std": 0.521589694544673, |
| "rewards/reward_func": -0.052582718431949615, |
| "step": 624 |
| }, |
| { |
| "completion_length": 167.3828125, |
| "epoch": 0.08457112270841696, |
| "grad_norm": 3.1875, |
| "kl": 0.0013894213043386117, |
| "learning_rate": 9.15428877291583e-07, |
| "loss": 0.0001, |
| "reward": 0.2013978809118271, |
| "reward_std": 0.6684001944959164, |
| "rewards/reward_func": 0.2013978809118271, |
| "step": 632 |
| }, |
| { |
| "completion_length": 186.6640625, |
| "epoch": 0.08564164324902984, |
| "grad_norm": 3.921875, |
| "kl": 0.0010865220101550221, |
| "learning_rate": 9.143583567509702e-07, |
| "loss": 0.0, |
| "reward": 0.6091820821166039, |
| "reward_std": 0.49955446273088455, |
| "rewards/reward_func": 0.6091820821166039, |
| "step": 640 |
| }, |
| { |
| "completion_length": 202.4609375, |
| "epoch": 0.08671216378964271, |
| "grad_norm": 3.6875, |
| "kl": 0.0012401975327520631, |
| "learning_rate": 9.132878362103572e-07, |
| "loss": 0.0, |
| "reward": 0.14060556702315807, |
| "reward_std": 0.6868433952331543, |
| "rewards/reward_func": 0.14060556702315807, |
| "step": 648 |
| }, |
| { |
| "completion_length": 157.953125, |
| "epoch": 0.08778268433025559, |
| "grad_norm": 4.21875, |
| "kl": 0.0014552801876561716, |
| "learning_rate": 9.122173156697443e-07, |
| "loss": 0.0001, |
| "reward": 0.27967471070587635, |
| "reward_std": 0.5355266528204083, |
| "rewards/reward_func": 0.27967471070587635, |
| "step": 656 |
| }, |
| { |
| "completion_length": 188.9609375, |
| "epoch": 0.08885320487086847, |
| "grad_norm": 2.8125, |
| "kl": 0.0012782855046680197, |
| "learning_rate": 9.111467951291315e-07, |
| "loss": 0.0001, |
| "reward": 0.2866704575717449, |
| "reward_std": 0.46457840129733086, |
| "rewards/reward_func": 0.2866704575717449, |
| "step": 664 |
| }, |
| { |
| "completion_length": 202.0078125, |
| "epoch": 0.08992372541148133, |
| "grad_norm": 3.625, |
| "kl": 0.0010314229875802994, |
| "learning_rate": 9.100762745885187e-07, |
| "loss": 0.0, |
| "reward": 0.21837860718369484, |
| "reward_std": 0.5863924492150545, |
| "rewards/reward_func": 0.21837860718369484, |
| "step": 672 |
| }, |
| { |
| "completion_length": 181.328125, |
| "epoch": 0.09099424595209421, |
| "grad_norm": 4.25, |
| "kl": 0.0011778115513152443, |
| "learning_rate": 9.090057540479058e-07, |
| "loss": 0.0, |
| "reward": 0.17519081057980657, |
| "reward_std": 0.5138188861310482, |
| "rewards/reward_func": 0.17519081057980657, |
| "step": 680 |
| }, |
| { |
| "completion_length": 196.46875, |
| "epoch": 0.09206476649270708, |
| "grad_norm": 4.84375, |
| "kl": 0.0013976221380289644, |
| "learning_rate": 9.079352335072928e-07, |
| "loss": 0.0001, |
| "reward": 0.07826100569218397, |
| "reward_std": 0.6565159633755684, |
| "rewards/reward_func": 0.07826100569218397, |
| "step": 688 |
| }, |
| { |
| "completion_length": 142.6171875, |
| "epoch": 0.09313528703331996, |
| "grad_norm": 4.3125, |
| "kl": 0.0014903126284480095, |
| "learning_rate": 9.0686471296668e-07, |
| "loss": 0.0001, |
| "reward": 0.4409363344311714, |
| "reward_std": 0.6269242819398642, |
| "rewards/reward_func": 0.4409363344311714, |
| "step": 696 |
| }, |
| { |
| "completion_length": 164.75, |
| "epoch": 0.09420580757393282, |
| "grad_norm": 2.859375, |
| "kl": 0.001257821699255146, |
| "learning_rate": 9.057941924260672e-07, |
| "loss": 0.0001, |
| "reward": 0.4695241190493107, |
| "reward_std": 0.4753529988229275, |
| "rewards/reward_func": 0.4695241190493107, |
| "step": 704 |
| }, |
| { |
| "completion_length": 175.765625, |
| "epoch": 0.0952763281145457, |
| "grad_norm": 3.5625, |
| "kl": 0.0015129576058825478, |
| "learning_rate": 9.047236718854542e-07, |
| "loss": 0.0001, |
| "reward": 0.02202584408223629, |
| "reward_std": 0.6471435278654099, |
| "rewards/reward_func": 0.02202584408223629, |
| "step": 712 |
| }, |
| { |
| "completion_length": 178.46875, |
| "epoch": 0.09634684865515857, |
| "grad_norm": 2.671875, |
| "kl": 0.0014852698805043474, |
| "learning_rate": 9.036531513448414e-07, |
| "loss": 0.0001, |
| "reward": 0.1112822787836194, |
| "reward_std": 0.6299657188355923, |
| "rewards/reward_func": 0.1112822787836194, |
| "step": 720 |
| }, |
| { |
| "completion_length": 179.171875, |
| "epoch": 0.09741736919577144, |
| "grad_norm": 3.65625, |
| "kl": 0.001357251821900718, |
| "learning_rate": 9.025826308042285e-07, |
| "loss": 0.0001, |
| "reward": 0.03050302341580391, |
| "reward_std": 0.4494458809494972, |
| "rewards/reward_func": 0.03050302341580391, |
| "step": 728 |
| }, |
| { |
| "completion_length": 179.171875, |
| "epoch": 0.09848788973638432, |
| "grad_norm": 3.21875, |
| "kl": 0.0012782294361386448, |
| "learning_rate": 9.015121102636157e-07, |
| "loss": 0.0001, |
| "reward": 0.07521175127476454, |
| "reward_std": 0.5754083581268787, |
| "rewards/reward_func": 0.07521175127476454, |
| "step": 736 |
| }, |
| { |
| "completion_length": 170.8046875, |
| "epoch": 0.09955841027699719, |
| "grad_norm": 3.640625, |
| "kl": 0.0014729191461810842, |
| "learning_rate": 9.004415897230027e-07, |
| "loss": 0.0001, |
| "reward": 0.19647281896322966, |
| "reward_std": 0.5569281429052353, |
| "rewards/reward_func": 0.19647281896322966, |
| "step": 744 |
| }, |
| { |
| "completion_length": 170.9609375, |
| "epoch": 0.10062893081761007, |
| "grad_norm": 3.390625, |
| "kl": 0.0015401854761876166, |
| "learning_rate": 8.993710691823899e-07, |
| "loss": 0.0001, |
| "reward": 0.3724030330777168, |
| "reward_std": 0.3632662743330002, |
| "rewards/reward_func": 0.3724030330777168, |
| "step": 752 |
| }, |
| { |
| "completion_length": 221.6171875, |
| "epoch": 0.10169945135822293, |
| "grad_norm": 4.15625, |
| "kl": 0.0013893990762881003, |
| "learning_rate": 8.983005486417771e-07, |
| "loss": 0.0001, |
| "reward": -0.09233509004116058, |
| "reward_std": 0.48617786914110184, |
| "rewards/reward_func": -0.09233509004116058, |
| "step": 760 |
| }, |
| { |
| "completion_length": 202.3828125, |
| "epoch": 0.10276997189883581, |
| "grad_norm": 4.375, |
| "kl": 0.0012778284071828239, |
| "learning_rate": 8.972300281011642e-07, |
| "loss": 0.0001, |
| "reward": 0.2091209925711155, |
| "reward_std": 0.6527585946023464, |
| "rewards/reward_func": 0.2091209925711155, |
| "step": 768 |
| }, |
| { |
| "completion_length": 193.1640625, |
| "epoch": 0.10384049243944868, |
| "grad_norm": 3.109375, |
| "kl": 0.0013181737012928352, |
| "learning_rate": 8.961595075605512e-07, |
| "loss": 0.0001, |
| "reward": 0.3330334695056081, |
| "reward_std": 0.541321462020278, |
| "rewards/reward_func": 0.3330334695056081, |
| "step": 776 |
| }, |
| { |
| "completion_length": 152.453125, |
| "epoch": 0.10491101298006156, |
| "grad_norm": 3.359375, |
| "kl": 0.0017155654932139441, |
| "learning_rate": 8.950889870199384e-07, |
| "loss": 0.0001, |
| "reward": 0.5970601001754403, |
| "reward_std": 0.4666150966659188, |
| "rewards/reward_func": 0.5970601001754403, |
| "step": 784 |
| }, |
| { |
| "completion_length": 169.125, |
| "epoch": 0.10598153352067442, |
| "grad_norm": 3.578125, |
| "kl": 0.0017139802366727963, |
| "learning_rate": 8.940184664793256e-07, |
| "loss": 0.0001, |
| "reward": 0.3862100951373577, |
| "reward_std": 0.4441477656364441, |
| "rewards/reward_func": 0.3862100951373577, |
| "step": 792 |
| }, |
| { |
| "completion_length": 155.3828125, |
| "epoch": 0.1070520540612873, |
| "grad_norm": 3.15625, |
| "kl": 0.0015526109855272807, |
| "learning_rate": 8.929479459387127e-07, |
| "loss": 0.0001, |
| "reward": 0.2984956353902817, |
| "reward_std": 0.586381059139967, |
| "rewards/reward_func": 0.2984956353902817, |
| "step": 800 |
| }, |
| { |
| "completion_length": 196.40625, |
| "epoch": 0.10812257460190017, |
| "grad_norm": 4.90625, |
| "kl": 0.0014231447203201242, |
| "learning_rate": 8.918774253980997e-07, |
| "loss": 0.0001, |
| "reward": 0.21418001921847463, |
| "reward_std": 0.6311414241790771, |
| "rewards/reward_func": 0.21418001921847463, |
| "step": 808 |
| }, |
| { |
| "completion_length": 138.2109375, |
| "epoch": 0.10919309514251305, |
| "grad_norm": 4.03125, |
| "kl": 0.0017638935969443992, |
| "learning_rate": 8.908069048574869e-07, |
| "loss": 0.0001, |
| "reward": 0.5397277176380157, |
| "reward_std": 0.5305888652801514, |
| "rewards/reward_func": 0.5397277176380157, |
| "step": 816 |
| }, |
| { |
| "completion_length": 178.0703125, |
| "epoch": 0.11026361568312593, |
| "grad_norm": 3.84375, |
| "kl": 0.0016187937144422904, |
| "learning_rate": 8.897363843168741e-07, |
| "loss": 0.0001, |
| "reward": 0.3325108243152499, |
| "reward_std": 0.5717135239392519, |
| "rewards/reward_func": 0.3325108243152499, |
| "step": 824 |
| }, |
| { |
| "completion_length": 148.953125, |
| "epoch": 0.11133413622373879, |
| "grad_norm": 5.0, |
| "kl": 0.0018482063169358298, |
| "learning_rate": 8.886658637762612e-07, |
| "loss": 0.0001, |
| "reward": 0.2947835847735405, |
| "reward_std": 0.4330580784007907, |
| "rewards/reward_func": 0.2947835847735405, |
| "step": 832 |
| }, |
| { |
| "completion_length": 164.9375, |
| "epoch": 0.11240465676435167, |
| "grad_norm": 4.09375, |
| "kl": 0.0016665154980728403, |
| "learning_rate": 8.875953432356483e-07, |
| "loss": 0.0001, |
| "reward": 0.4421437568962574, |
| "reward_std": 0.6194501928985119, |
| "rewards/reward_func": 0.4421437568962574, |
| "step": 840 |
| }, |
| { |
| "completion_length": 151.4140625, |
| "epoch": 0.11347517730496454, |
| "grad_norm": 3.0, |
| "kl": 0.0017541930938023143, |
| "learning_rate": 8.865248226950354e-07, |
| "loss": 0.0001, |
| "reward": 0.634210865944624, |
| "reward_std": 0.43934057652950287, |
| "rewards/reward_func": 0.634210865944624, |
| "step": 848 |
| }, |
| { |
| "completion_length": 163.0390625, |
| "epoch": 0.11454569784557742, |
| "grad_norm": 4.1875, |
| "kl": 0.001597623537236359, |
| "learning_rate": 8.854543021544225e-07, |
| "loss": 0.0001, |
| "reward": 0.49650172144174576, |
| "reward_std": 0.5100172646343708, |
| "rewards/reward_func": 0.49650172144174576, |
| "step": 856 |
| }, |
| { |
| "completion_length": 175.515625, |
| "epoch": 0.11561621838619028, |
| "grad_norm": 4.28125, |
| "kl": 0.0016097126208478585, |
| "learning_rate": 8.843837816138097e-07, |
| "loss": 0.0001, |
| "reward": 0.300532303750515, |
| "reward_std": 0.5050319191068411, |
| "rewards/reward_func": 0.300532303750515, |
| "step": 864 |
| }, |
| { |
| "completion_length": 166.609375, |
| "epoch": 0.11668673892680316, |
| "grad_norm": 3.375, |
| "kl": 0.001671285106567666, |
| "learning_rate": 8.833132610731968e-07, |
| "loss": 0.0001, |
| "reward": 0.3196424636989832, |
| "reward_std": 0.6211317032575607, |
| "rewards/reward_func": 0.3196424636989832, |
| "step": 872 |
| }, |
| { |
| "completion_length": 199.15625, |
| "epoch": 0.11775725946741603, |
| "grad_norm": 4.46875, |
| "kl": 0.0015437143010785803, |
| "learning_rate": 8.82242740532584e-07, |
| "loss": 0.0001, |
| "reward": -0.04952175496146083, |
| "reward_std": 0.6928350441157818, |
| "rewards/reward_func": -0.04952175496146083, |
| "step": 880 |
| }, |
| { |
| "completion_length": 195.078125, |
| "epoch": 0.1188277800080289, |
| "grad_norm": 3.625, |
| "kl": 0.0014302593117463402, |
| "learning_rate": 8.811722199919711e-07, |
| "loss": 0.0001, |
| "reward": 0.2765323193743825, |
| "reward_std": 0.5081478040665388, |
| "rewards/reward_func": 0.2765323193743825, |
| "step": 888 |
| }, |
| { |
| "completion_length": 162.1640625, |
| "epoch": 0.11989830054864177, |
| "grad_norm": 4.3125, |
| "kl": 0.002093081347993575, |
| "learning_rate": 8.801016994513581e-07, |
| "loss": 0.0001, |
| "reward": 0.43932087533175945, |
| "reward_std": 0.6151396594941616, |
| "rewards/reward_func": 0.43932087533175945, |
| "step": 896 |
| }, |
| { |
| "completion_length": 188.265625, |
| "epoch": 0.12096882108925465, |
| "grad_norm": 4.21875, |
| "kl": 0.0015162140916800126, |
| "learning_rate": 8.790311789107453e-07, |
| "loss": 0.0001, |
| "reward": 0.30038960836827755, |
| "reward_std": 0.6118085775524378, |
| "rewards/reward_func": 0.30038960836827755, |
| "step": 904 |
| }, |
| { |
| "completion_length": 185.625, |
| "epoch": 0.12203934162986753, |
| "grad_norm": 4.21875, |
| "kl": 0.0020139318803558126, |
| "learning_rate": 8.779606583701324e-07, |
| "loss": 0.0001, |
| "reward": 0.09830181300640106, |
| "reward_std": 0.4306083731353283, |
| "rewards/reward_func": 0.09830181300640106, |
| "step": 912 |
| }, |
| { |
| "completion_length": 184.1875, |
| "epoch": 0.1231098621704804, |
| "grad_norm": 3.15625, |
| "kl": 0.0016747360059525818, |
| "learning_rate": 8.768901378295196e-07, |
| "loss": 0.0001, |
| "reward": 0.14476243034005165, |
| "reward_std": 0.5790487378835678, |
| "rewards/reward_func": 0.14476243034005165, |
| "step": 920 |
| }, |
| { |
| "completion_length": 159.734375, |
| "epoch": 0.12418038271109327, |
| "grad_norm": 3.71875, |
| "kl": 0.0016227394880843349, |
| "learning_rate": 8.758196172889067e-07, |
| "loss": 0.0001, |
| "reward": 0.43369535729289055, |
| "reward_std": 0.6066659651696682, |
| "rewards/reward_func": 0.43369535729289055, |
| "step": 928 |
| }, |
| { |
| "completion_length": 197.9375, |
| "epoch": 0.12525090325170615, |
| "grad_norm": 4.5, |
| "kl": 0.0014731917763128877, |
| "learning_rate": 8.747490967482938e-07, |
| "loss": 0.0001, |
| "reward": 0.2775337900966406, |
| "reward_std": 0.4666815670207143, |
| "rewards/reward_func": 0.2775337900966406, |
| "step": 936 |
| }, |
| { |
| "completion_length": 190.015625, |
| "epoch": 0.126321423792319, |
| "grad_norm": 4.21875, |
| "kl": 0.0017878647340694442, |
| "learning_rate": 8.736785762076809e-07, |
| "loss": 0.0001, |
| "reward": 0.12123087048530579, |
| "reward_std": 0.5715042147785425, |
| "rewards/reward_func": 0.12123087048530579, |
| "step": 944 |
| }, |
| { |
| "completion_length": 167.1171875, |
| "epoch": 0.12739194433293188, |
| "grad_norm": 5.25, |
| "kl": 0.0019433694251347333, |
| "learning_rate": 8.726080556670681e-07, |
| "loss": 0.0001, |
| "reward": 0.45328211411833763, |
| "reward_std": 0.5355701018124819, |
| "rewards/reward_func": 0.45328211411833763, |
| "step": 952 |
| }, |
| { |
| "completion_length": 161.625, |
| "epoch": 0.12846246487354476, |
| "grad_norm": 5.0, |
| "kl": 0.002197007488575764, |
| "learning_rate": 8.715375351264552e-07, |
| "loss": 0.0001, |
| "reward": 0.3187681008130312, |
| "reward_std": 0.552251516841352, |
| "rewards/reward_func": 0.3187681008130312, |
| "step": 960 |
| }, |
| { |
| "completion_length": 210.890625, |
| "epoch": 0.12953298541415764, |
| "grad_norm": 5.46875, |
| "kl": 0.0015157314774114639, |
| "learning_rate": 8.704670145858424e-07, |
| "loss": 0.0001, |
| "reward": 0.28820460522547364, |
| "reward_std": 0.6035197824239731, |
| "rewards/reward_func": 0.28820460522547364, |
| "step": 968 |
| }, |
| { |
| "completion_length": 166.625, |
| "epoch": 0.1306035059547705, |
| "grad_norm": 3.765625, |
| "kl": 0.0017258078005397692, |
| "learning_rate": 8.693964940452294e-07, |
| "loss": 0.0001, |
| "reward": 0.5467304401099682, |
| "reward_std": 0.5410985443741083, |
| "rewards/reward_func": 0.5467304401099682, |
| "step": 976 |
| }, |
| { |
| "completion_length": 207.4765625, |
| "epoch": 0.13167402649538337, |
| "grad_norm": 2.9375, |
| "kl": 0.0016618163790553808, |
| "learning_rate": 8.683259735046166e-07, |
| "loss": 0.0001, |
| "reward": 0.24847363959997892, |
| "reward_std": 0.5600069649517536, |
| "rewards/reward_func": 0.24847363959997892, |
| "step": 984 |
| }, |
| { |
| "completion_length": 185.703125, |
| "epoch": 0.13274454703599625, |
| "grad_norm": 3.515625, |
| "kl": 0.0019170493469573557, |
| "learning_rate": 8.672554529640037e-07, |
| "loss": 0.0001, |
| "reward": 0.15675952192395926, |
| "reward_std": 0.4530050400644541, |
| "rewards/reward_func": 0.15675952192395926, |
| "step": 992 |
| }, |
| { |
| "completion_length": 185.6484375, |
| "epoch": 0.13381506757660913, |
| "grad_norm": 4.71875, |
| "kl": 0.0018418136023683473, |
| "learning_rate": 8.661849324233908e-07, |
| "loss": 0.0001, |
| "reward": 0.3646358111873269, |
| "reward_std": 0.5811815112829208, |
| "rewards/reward_func": 0.3646358111873269, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 203.6171875, |
| "epoch": 0.134885588117222, |
| "grad_norm": 3.5625, |
| "kl": 0.0020316866575740278, |
| "learning_rate": 8.65114411882778e-07, |
| "loss": 0.0001, |
| "reward": 0.15310932788997889, |
| "reward_std": 0.5124245472252369, |
| "rewards/reward_func": 0.15310932788997889, |
| "step": 1008 |
| }, |
| { |
| "completion_length": 146.1484375, |
| "epoch": 0.13595610865783486, |
| "grad_norm": 3.65625, |
| "kl": 0.002132126915967092, |
| "learning_rate": 8.640438913421651e-07, |
| "loss": 0.0001, |
| "reward": 0.5445789489895105, |
| "reward_std": 0.5516379848122597, |
| "rewards/reward_func": 0.5445789489895105, |
| "step": 1016 |
| }, |
| { |
| "completion_length": 197.9375, |
| "epoch": 0.13702662919844774, |
| "grad_norm": 4.0625, |
| "kl": 0.001812848830013536, |
| "learning_rate": 8.629733708015521e-07, |
| "loss": 0.0001, |
| "reward": 0.07804312836378813, |
| "reward_std": 0.5468557141721249, |
| "rewards/reward_func": 0.07804312836378813, |
| "step": 1024 |
| }, |
| { |
| "completion_length": 160.8515625, |
| "epoch": 0.13809714973906062, |
| "grad_norm": 3.75, |
| "kl": 0.0020080953399883583, |
| "learning_rate": 8.619028502609393e-07, |
| "loss": 0.0001, |
| "reward": 0.3609808227047324, |
| "reward_std": 0.4902635822072625, |
| "rewards/reward_func": 0.3609808227047324, |
| "step": 1032 |
| }, |
| { |
| "completion_length": 166.921875, |
| "epoch": 0.1391676702796735, |
| "grad_norm": 3.953125, |
| "kl": 0.0021710527944378555, |
| "learning_rate": 8.608323297203265e-07, |
| "loss": 0.0001, |
| "reward": 0.04567475710064173, |
| "reward_std": 0.639631874859333, |
| "rewards/reward_func": 0.04567475710064173, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 137.265625, |
| "epoch": 0.14023819082028635, |
| "grad_norm": 4.46875, |
| "kl": 0.0022375187691068277, |
| "learning_rate": 8.597618091797137e-07, |
| "loss": 0.0001, |
| "reward": 0.6706136465072632, |
| "reward_std": 0.470423087477684, |
| "rewards/reward_func": 0.6706136465072632, |
| "step": 1048 |
| }, |
| { |
| "completion_length": 174.9609375, |
| "epoch": 0.14130871136089923, |
| "grad_norm": 4.8125, |
| "kl": 0.002100168538163416, |
| "learning_rate": 8.586912886391006e-07, |
| "loss": 0.0001, |
| "reward": 0.4244903214275837, |
| "reward_std": 0.4562762314453721, |
| "rewards/reward_func": 0.4244903214275837, |
| "step": 1056 |
| }, |
| { |
| "completion_length": 147.9296875, |
| "epoch": 0.1423792319015121, |
| "grad_norm": 3.40625, |
| "kl": 0.0022357639973051846, |
| "learning_rate": 8.576207680984878e-07, |
| "loss": 0.0001, |
| "reward": 0.5205270126461983, |
| "reward_std": 0.483647458255291, |
| "rewards/reward_func": 0.5205270126461983, |
| "step": 1064 |
| }, |
| { |
| "completion_length": 212.625, |
| "epoch": 0.143449752442125, |
| "grad_norm": 3.46875, |
| "kl": 0.0017438856302760541, |
| "learning_rate": 8.56550247557875e-07, |
| "loss": 0.0001, |
| "reward": 0.04320483095943928, |
| "reward_std": 0.6948880217969418, |
| "rewards/reward_func": 0.04320483095943928, |
| "step": 1072 |
| }, |
| { |
| "completion_length": 181.5390625, |
| "epoch": 0.14452027298273787, |
| "grad_norm": 4.375, |
| "kl": 0.0018758865917334333, |
| "learning_rate": 8.554797270172622e-07, |
| "loss": 0.0001, |
| "reward": 0.30002590641379356, |
| "reward_std": 0.5271559292450547, |
| "rewards/reward_func": 0.30002590641379356, |
| "step": 1080 |
| }, |
| { |
| "completion_length": 176.609375, |
| "epoch": 0.14559079352335072, |
| "grad_norm": 3.59375, |
| "kl": 0.0016780206933617592, |
| "learning_rate": 8.544092064766492e-07, |
| "loss": 0.0001, |
| "reward": 0.29282525181770325, |
| "reward_std": 0.48009985871613026, |
| "rewards/reward_func": 0.29282525181770325, |
| "step": 1088 |
| }, |
| { |
| "completion_length": 218.75, |
| "epoch": 0.1466613140639636, |
| "grad_norm": 3.1875, |
| "kl": 0.002029647948802449, |
| "learning_rate": 8.533386859360363e-07, |
| "loss": 0.0001, |
| "reward": -0.1532103894278407, |
| "reward_std": 0.4770786985754967, |
| "rewards/reward_func": -0.1532103894278407, |
| "step": 1096 |
| }, |
| { |
| "completion_length": 184.4296875, |
| "epoch": 0.14773183460457648, |
| "grad_norm": 3.203125, |
| "kl": 0.0017034321062965319, |
| "learning_rate": 8.522681653954235e-07, |
| "loss": 0.0001, |
| "reward": 0.44974952936172485, |
| "reward_std": 0.5446614529937506, |
| "rewards/reward_func": 0.44974952936172485, |
| "step": 1104 |
| }, |
| { |
| "completion_length": 196.8203125, |
| "epoch": 0.14880235514518936, |
| "grad_norm": 5.0, |
| "kl": 0.001964334660442546, |
| "learning_rate": 8.511976448548106e-07, |
| "loss": 0.0001, |
| "reward": 0.056189559400081635, |
| "reward_std": 0.468637160025537, |
| "rewards/reward_func": 0.056189559400081635, |
| "step": 1112 |
| }, |
| { |
| "completion_length": 169.4921875, |
| "epoch": 0.1498728756858022, |
| "grad_norm": 4.15625, |
| "kl": 0.002173921908251941, |
| "learning_rate": 8.501271243141977e-07, |
| "loss": 0.0001, |
| "reward": 0.23756458796560764, |
| "reward_std": 0.6810929477214813, |
| "rewards/reward_func": 0.23756458796560764, |
| "step": 1120 |
| }, |
| { |
| "completion_length": 155.75, |
| "epoch": 0.1509433962264151, |
| "grad_norm": 4.65625, |
| "kl": 0.00248186280077789, |
| "learning_rate": 8.490566037735849e-07, |
| "loss": 0.0001, |
| "reward": 0.28897845139726996, |
| "reward_std": 0.5369884418323636, |
| "rewards/reward_func": 0.28897845139726996, |
| "step": 1128 |
| }, |
| { |
| "completion_length": 186.2109375, |
| "epoch": 0.15201391676702797, |
| "grad_norm": 3.21875, |
| "kl": 0.0022306487226160243, |
| "learning_rate": 8.479860832329721e-07, |
| "loss": 0.0001, |
| "reward": 0.10482135927304626, |
| "reward_std": 0.6790419593453407, |
| "rewards/reward_func": 0.10482135927304626, |
| "step": 1136 |
| }, |
| { |
| "completion_length": 192.546875, |
| "epoch": 0.15308443730764085, |
| "grad_norm": 3.71875, |
| "kl": 0.0019263384310761467, |
| "learning_rate": 8.469155626923591e-07, |
| "loss": 0.0001, |
| "reward": 0.14666470140218735, |
| "reward_std": 0.5422503855079412, |
| "rewards/reward_func": 0.14666470140218735, |
| "step": 1144 |
| }, |
| { |
| "completion_length": 184.40625, |
| "epoch": 0.15415495784825373, |
| "grad_norm": 2.734375, |
| "kl": 0.002109996523358859, |
| "learning_rate": 8.458450421517462e-07, |
| "loss": 0.0001, |
| "reward": 0.4111117944121361, |
| "reward_std": 0.4935786770656705, |
| "rewards/reward_func": 0.4111117944121361, |
| "step": 1152 |
| }, |
| { |
| "completion_length": 175.953125, |
| "epoch": 0.15522547838886658, |
| "grad_norm": 2.875, |
| "kl": 0.0024219048937084153, |
| "learning_rate": 8.447745216111334e-07, |
| "loss": 0.0001, |
| "reward": 0.26550869084894657, |
| "reward_std": 0.4649670384824276, |
| "rewards/reward_func": 0.26550869084894657, |
| "step": 1160 |
| }, |
| { |
| "completion_length": 161.7578125, |
| "epoch": 0.15629599892947946, |
| "grad_norm": 4.34375, |
| "kl": 0.0023637667181901634, |
| "learning_rate": 8.437040010705205e-07, |
| "loss": 0.0001, |
| "reward": 0.4199391510337591, |
| "reward_std": 0.5549349021166563, |
| "rewards/reward_func": 0.4199391510337591, |
| "step": 1168 |
| }, |
| { |
| "completion_length": 169.859375, |
| "epoch": 0.15736651947009234, |
| "grad_norm": 3.765625, |
| "kl": 0.002736452064709738, |
| "learning_rate": 8.426334805299077e-07, |
| "loss": 0.0001, |
| "reward": 0.09132032562047243, |
| "reward_std": 0.5800180211663246, |
| "rewards/reward_func": 0.09132032562047243, |
| "step": 1176 |
| }, |
| { |
| "completion_length": 165.875, |
| "epoch": 0.15843704001070522, |
| "grad_norm": 2.9375, |
| "kl": 0.0026032868336187676, |
| "learning_rate": 8.415629599892947e-07, |
| "loss": 0.0001, |
| "reward": 0.39741448499262333, |
| "reward_std": 0.553530789911747, |
| "rewards/reward_func": 0.39741448499262333, |
| "step": 1184 |
| }, |
| { |
| "completion_length": 152.6796875, |
| "epoch": 0.15950756055131807, |
| "grad_norm": 4.1875, |
| "kl": 0.0026048235449707136, |
| "learning_rate": 8.404924394486819e-07, |
| "loss": 0.0001, |
| "reward": 0.526178702712059, |
| "reward_std": 0.4392085522413254, |
| "rewards/reward_func": 0.526178702712059, |
| "step": 1192 |
| }, |
| { |
| "completion_length": 180.640625, |
| "epoch": 0.16057808109193095, |
| "grad_norm": 4.46875, |
| "kl": 0.0021871782082598656, |
| "learning_rate": 8.39421918908069e-07, |
| "loss": 0.0001, |
| "reward": 0.18006896087899804, |
| "reward_std": 0.6525123100727797, |
| "rewards/reward_func": 0.18006896087899804, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 153.7734375, |
| "epoch": 0.16164860163254383, |
| "grad_norm": 2.75, |
| "kl": 0.0022942414943827316, |
| "learning_rate": 8.383513983674562e-07, |
| "loss": 0.0001, |
| "reward": 0.37028289400041103, |
| "reward_std": 0.49791209399700165, |
| "rewards/reward_func": 0.37028289400041103, |
| "step": 1208 |
| }, |
| { |
| "completion_length": 172.2109375, |
| "epoch": 0.1627191221731567, |
| "grad_norm": 5.03125, |
| "kl": 0.002093525734380819, |
| "learning_rate": 8.372808778268433e-07, |
| "loss": 0.0001, |
| "reward": 0.13176708482205868, |
| "reward_std": 0.6455358900129795, |
| "rewards/reward_func": 0.13176708482205868, |
| "step": 1216 |
| }, |
| { |
| "completion_length": 169.609375, |
| "epoch": 0.16378964271376958, |
| "grad_norm": 4.75, |
| "kl": 0.0022360333387041464, |
| "learning_rate": 8.362103572862303e-07, |
| "loss": 0.0001, |
| "reward": 0.3072196710854769, |
| "reward_std": 0.5846256157383323, |
| "rewards/reward_func": 0.3072196710854769, |
| "step": 1224 |
| }, |
| { |
| "completion_length": 166.1484375, |
| "epoch": 0.16486016325438244, |
| "grad_norm": 3.890625, |
| "kl": 0.002241482841782272, |
| "learning_rate": 8.351398367456175e-07, |
| "loss": 0.0001, |
| "reward": 0.3591133989393711, |
| "reward_std": 0.4736274667084217, |
| "rewards/reward_func": 0.3591133989393711, |
| "step": 1232 |
| }, |
| { |
| "completion_length": 199.203125, |
| "epoch": 0.16593068379499532, |
| "grad_norm": 3.328125, |
| "kl": 0.00244250099058263, |
| "learning_rate": 8.340693162050047e-07, |
| "loss": 0.0001, |
| "reward": 0.006516195833683014, |
| "reward_std": 0.6199562083929777, |
| "rewards/reward_func": 0.006516195833683014, |
| "step": 1240 |
| }, |
| { |
| "completion_length": 203.1328125, |
| "epoch": 0.1670012043356082, |
| "grad_norm": 4.65625, |
| "kl": 0.0022358261194312945, |
| "learning_rate": 8.329987956643918e-07, |
| "loss": 0.0001, |
| "reward": 0.10224719159305096, |
| "reward_std": 0.6809590011835098, |
| "rewards/reward_func": 0.10224719159305096, |
| "step": 1248 |
| }, |
| { |
| "completion_length": 148.3125, |
| "epoch": 0.16807172487622107, |
| "grad_norm": 4.375, |
| "kl": 0.0025668047892395407, |
| "learning_rate": 8.319282751237789e-07, |
| "loss": 0.0001, |
| "reward": 0.49531039223074913, |
| "reward_std": 0.4778098724782467, |
| "rewards/reward_func": 0.49531039223074913, |
| "step": 1256 |
| }, |
| { |
| "completion_length": 153.2578125, |
| "epoch": 0.16914224541683393, |
| "grad_norm": 3.34375, |
| "kl": 0.0024423423456028104, |
| "learning_rate": 8.30857754583166e-07, |
| "loss": 0.0001, |
| "reward": 0.35373237170279026, |
| "reward_std": 0.4996814336627722, |
| "rewards/reward_func": 0.35373237170279026, |
| "step": 1264 |
| }, |
| { |
| "completion_length": 171.234375, |
| "epoch": 0.1702127659574468, |
| "grad_norm": 3.1875, |
| "kl": 0.0021556682913796976, |
| "learning_rate": 8.297872340425532e-07, |
| "loss": 0.0001, |
| "reward": 0.28696669451892376, |
| "reward_std": 0.5421474725008011, |
| "rewards/reward_func": 0.28696669451892376, |
| "step": 1272 |
| }, |
| { |
| "completion_length": 188.515625, |
| "epoch": 0.17128328649805968, |
| "grad_norm": 3.703125, |
| "kl": 0.0021073912794236094, |
| "learning_rate": 8.287167135019402e-07, |
| "loss": 0.0001, |
| "reward": 0.21668443083763123, |
| "reward_std": 0.419855872169137, |
| "rewards/reward_func": 0.21668443083763123, |
| "step": 1280 |
| }, |
| { |
| "completion_length": 163.34375, |
| "epoch": 0.17235380703867256, |
| "grad_norm": 4.125, |
| "kl": 0.002413511203485541, |
| "learning_rate": 8.276461929613274e-07, |
| "loss": 0.0001, |
| "reward": 0.3908206336200237, |
| "reward_std": 0.5146115329116583, |
| "rewards/reward_func": 0.3908206336200237, |
| "step": 1288 |
| }, |
| { |
| "completion_length": 195.21875, |
| "epoch": 0.17342432757928541, |
| "grad_norm": 2.953125, |
| "kl": 0.002010537078604102, |
| "learning_rate": 8.265756724207146e-07, |
| "loss": 0.0001, |
| "reward": 0.2040023533627391, |
| "reward_std": 0.5783168002963066, |
| "rewards/reward_func": 0.2040023533627391, |
| "step": 1296 |
| }, |
| { |
| "completion_length": 147.5546875, |
| "epoch": 0.1744948481198983, |
| "grad_norm": 3.515625, |
| "kl": 0.0029850091959815472, |
| "learning_rate": 8.255051518801016e-07, |
| "loss": 0.0001, |
| "reward": 0.4321533404290676, |
| "reward_std": 0.3191776555031538, |
| "rewards/reward_func": 0.4321533404290676, |
| "step": 1304 |
| }, |
| { |
| "completion_length": 174.9921875, |
| "epoch": 0.17556536866051117, |
| "grad_norm": 3.84375, |
| "kl": 0.0024711176374694332, |
| "learning_rate": 8.244346313394887e-07, |
| "loss": 0.0001, |
| "reward": 0.31474856473505497, |
| "reward_std": 0.5751422699540854, |
| "rewards/reward_func": 0.31474856473505497, |
| "step": 1312 |
| }, |
| { |
| "completion_length": 180.5703125, |
| "epoch": 0.17663588920112405, |
| "grad_norm": 3.515625, |
| "kl": 0.002760413888609037, |
| "learning_rate": 8.233641107988759e-07, |
| "loss": 0.0001, |
| "reward": 0.2904138704761863, |
| "reward_std": 0.3093845183029771, |
| "rewards/reward_func": 0.2904138704761863, |
| "step": 1320 |
| }, |
| { |
| "completion_length": 171.0703125, |
| "epoch": 0.17770640974173693, |
| "grad_norm": 4.125, |
| "kl": 0.002526076335925609, |
| "learning_rate": 8.222935902582631e-07, |
| "loss": 0.0001, |
| "reward": 0.3407979141920805, |
| "reward_std": 0.6505857929587364, |
| "rewards/reward_func": 0.3407979141920805, |
| "step": 1328 |
| }, |
| { |
| "completion_length": 194.09375, |
| "epoch": 0.17877693028234978, |
| "grad_norm": 3.140625, |
| "kl": 0.002656547527294606, |
| "learning_rate": 8.212230697176503e-07, |
| "loss": 0.0001, |
| "reward": 0.16334644611924887, |
| "reward_std": 0.5975025221705437, |
| "rewards/reward_func": 0.16334644611924887, |
| "step": 1336 |
| }, |
| { |
| "completion_length": 169.984375, |
| "epoch": 0.17984745082296266, |
| "grad_norm": 6.1875, |
| "kl": 0.0022961402573855594, |
| "learning_rate": 8.201525491770372e-07, |
| "loss": 0.0001, |
| "reward": 0.11132130306214094, |
| "reward_std": 0.6224425416439772, |
| "rewards/reward_func": 0.11132130306214094, |
| "step": 1344 |
| }, |
| { |
| "completion_length": 174.171875, |
| "epoch": 0.18091797136357554, |
| "grad_norm": 3.6875, |
| "kl": 0.002571267934399657, |
| "learning_rate": 8.190820286364244e-07, |
| "loss": 0.0001, |
| "reward": 0.3750305436551571, |
| "reward_std": 0.6532670613378286, |
| "rewards/reward_func": 0.3750305436551571, |
| "step": 1352 |
| }, |
| { |
| "completion_length": 175.390625, |
| "epoch": 0.18198849190418842, |
| "grad_norm": 4.09375, |
| "kl": 0.0026113019848708063, |
| "learning_rate": 8.180115080958116e-07, |
| "loss": 0.0001, |
| "reward": 0.23261917755007744, |
| "reward_std": 0.5395109131932259, |
| "rewards/reward_func": 0.23261917755007744, |
| "step": 1360 |
| }, |
| { |
| "completion_length": 218.90625, |
| "epoch": 0.18305901244480127, |
| "grad_norm": 3.0625, |
| "kl": 0.002512662627850659, |
| "learning_rate": 8.169409875551986e-07, |
| "loss": 0.0001, |
| "reward": -0.04414751287549734, |
| "reward_std": 0.49756659008562565, |
| "rewards/reward_func": -0.04414751287549734, |
| "step": 1368 |
| }, |
| { |
| "completion_length": 203.4453125, |
| "epoch": 0.18412953298541415, |
| "grad_norm": 2.84375, |
| "kl": 0.0023657960555283353, |
| "learning_rate": 8.158704670145858e-07, |
| "loss": 0.0001, |
| "reward": 0.1807372528128326, |
| "reward_std": 0.6098357774317265, |
| "rewards/reward_func": 0.1807372528128326, |
| "step": 1376 |
| }, |
| { |
| "completion_length": 148.9609375, |
| "epoch": 0.18520005352602703, |
| "grad_norm": 4.03125, |
| "kl": 0.0027785369311459363, |
| "learning_rate": 8.14799946473973e-07, |
| "loss": 0.0001, |
| "reward": 0.587528869509697, |
| "reward_std": 0.4399991165846586, |
| "rewards/reward_func": 0.587528869509697, |
| "step": 1384 |
| }, |
| { |
| "completion_length": 166.234375, |
| "epoch": 0.1862705740666399, |
| "grad_norm": 4.4375, |
| "kl": 0.002863895075279288, |
| "learning_rate": 8.137294259333601e-07, |
| "loss": 0.0001, |
| "reward": 0.2870405614376068, |
| "reward_std": 0.5268293377012014, |
| "rewards/reward_func": 0.2870405614376068, |
| "step": 1392 |
| }, |
| { |
| "completion_length": 164.875, |
| "epoch": 0.1873410946072528, |
| "grad_norm": 4.3125, |
| "kl": 0.0027437864046078175, |
| "learning_rate": 8.126589053927471e-07, |
| "loss": 0.0001, |
| "reward": 0.27717010863125324, |
| "reward_std": 0.6858110204339027, |
| "rewards/reward_func": 0.27717010863125324, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 160.875, |
| "epoch": 0.18841161514786564, |
| "grad_norm": 3.375, |
| "kl": 0.002744226367212832, |
| "learning_rate": 8.115883848521343e-07, |
| "loss": 0.0001, |
| "reward": 0.3531609745696187, |
| "reward_std": 0.41381734795868397, |
| "rewards/reward_func": 0.3531609745696187, |
| "step": 1408 |
| }, |
| { |
| "completion_length": 202.7734375, |
| "epoch": 0.18948213568847852, |
| "grad_norm": 2.859375, |
| "kl": 0.002227893375675194, |
| "learning_rate": 8.105178643115215e-07, |
| "loss": 0.0001, |
| "reward": 0.05168680660426617, |
| "reward_std": 0.5793404262512922, |
| "rewards/reward_func": 0.05168680660426617, |
| "step": 1416 |
| }, |
| { |
| "completion_length": 190.796875, |
| "epoch": 0.1905526562290914, |
| "grad_norm": 2.765625, |
| "kl": 0.002415237744571641, |
| "learning_rate": 8.094473437709086e-07, |
| "loss": 0.0001, |
| "reward": -0.012714797630906105, |
| "reward_std": 0.6679329574108124, |
| "rewards/reward_func": -0.012714797630906105, |
| "step": 1424 |
| }, |
| { |
| "completion_length": 160.0, |
| "epoch": 0.19162317676970428, |
| "grad_norm": 3.140625, |
| "kl": 0.0027612125559244305, |
| "learning_rate": 8.083768232302956e-07, |
| "loss": 0.0001, |
| "reward": 0.5069613344967365, |
| "reward_std": 0.5272765178233385, |
| "rewards/reward_func": 0.5069613344967365, |
| "step": 1432 |
| }, |
| { |
| "completion_length": 177.1328125, |
| "epoch": 0.19269369731031713, |
| "grad_norm": 3.9375, |
| "kl": 0.002587508424767293, |
| "learning_rate": 8.073063026896828e-07, |
| "loss": 0.0001, |
| "reward": 0.07287294790148735, |
| "reward_std": 0.3514406271278858, |
| "rewards/reward_func": 0.07287294790148735, |
| "step": 1440 |
| }, |
| { |
| "completion_length": 138.4609375, |
| "epoch": 0.19376421785093, |
| "grad_norm": 3.71875, |
| "kl": 0.002967173932120204, |
| "learning_rate": 8.0623578214907e-07, |
| "loss": 0.0001, |
| "reward": 0.40755754709243774, |
| "reward_std": 0.48442143853753805, |
| "rewards/reward_func": 0.40755754709243774, |
| "step": 1448 |
| }, |
| { |
| "completion_length": 160.5234375, |
| "epoch": 0.1948347383915429, |
| "grad_norm": 3.78125, |
| "kl": 0.0028059011965524405, |
| "learning_rate": 8.051652616084571e-07, |
| "loss": 0.0001, |
| "reward": 0.3703090399503708, |
| "reward_std": 0.4106726851314306, |
| "rewards/reward_func": 0.3703090399503708, |
| "step": 1456 |
| }, |
| { |
| "completion_length": 171.15625, |
| "epoch": 0.19590525893215577, |
| "grad_norm": 3.25, |
| "kl": 0.0026552542112767696, |
| "learning_rate": 8.040947410678442e-07, |
| "loss": 0.0001, |
| "reward": 0.3305620066821575, |
| "reward_std": 0.6117083020508289, |
| "rewards/reward_func": 0.3305620066821575, |
| "step": 1464 |
| }, |
| { |
| "completion_length": 155.4296875, |
| "epoch": 0.19697577947276865, |
| "grad_norm": 4.15625, |
| "kl": 0.003170755269820802, |
| "learning_rate": 8.030242205272313e-07, |
| "loss": 0.0001, |
| "reward": 0.6180750611238182, |
| "reward_std": 0.40046251006424427, |
| "rewards/reward_func": 0.6180750611238182, |
| "step": 1472 |
| }, |
| { |
| "completion_length": 180.125, |
| "epoch": 0.1980463000133815, |
| "grad_norm": 5.40625, |
| "kl": 0.002536381929530762, |
| "learning_rate": 8.019536999866184e-07, |
| "loss": 0.0001, |
| "reward": 0.2231542430818081, |
| "reward_std": 0.4958275035023689, |
| "rewards/reward_func": 0.2231542430818081, |
| "step": 1480 |
| }, |
| { |
| "completion_length": 180.4375, |
| "epoch": 0.19911682055399438, |
| "grad_norm": 4.875, |
| "kl": 0.0024219011975219473, |
| "learning_rate": 8.008831794460056e-07, |
| "loss": 0.0001, |
| "reward": 0.1133259404450655, |
| "reward_std": 0.48838030360639095, |
| "rewards/reward_func": 0.1133259404450655, |
| "step": 1488 |
| }, |
| { |
| "completion_length": 147.6171875, |
| "epoch": 0.20018734109460726, |
| "grad_norm": 5.125, |
| "kl": 0.0031812663073651493, |
| "learning_rate": 7.998126589053927e-07, |
| "loss": 0.0001, |
| "reward": 0.4617150817066431, |
| "reward_std": 0.32284008618444204, |
| "rewards/reward_func": 0.4617150817066431, |
| "step": 1496 |
| }, |
| { |
| "completion_length": 167.2109375, |
| "epoch": 0.20125786163522014, |
| "grad_norm": 4.09375, |
| "kl": 0.0028175316692795604, |
| "learning_rate": 7.987421383647799e-07, |
| "loss": 0.0001, |
| "reward": 0.3847576631233096, |
| "reward_std": 0.6411111112684011, |
| "rewards/reward_func": 0.3847576631233096, |
| "step": 1504 |
| }, |
| { |
| "completion_length": 177.4453125, |
| "epoch": 0.202328382175833, |
| "grad_norm": 2.796875, |
| "kl": 0.0026287745859008282, |
| "learning_rate": 7.976716178241669e-07, |
| "loss": 0.0001, |
| "reward": 0.5649865288287401, |
| "reward_std": 0.5727164149284363, |
| "rewards/reward_func": 0.5649865288287401, |
| "step": 1512 |
| }, |
| { |
| "completion_length": 172.1875, |
| "epoch": 0.20339890271644587, |
| "grad_norm": 3.8125, |
| "kl": 0.002526555268559605, |
| "learning_rate": 7.966010972835541e-07, |
| "loss": 0.0001, |
| "reward": 0.014871623367071152, |
| "reward_std": 0.7178319171071053, |
| "rewards/reward_func": 0.014871623367071152, |
| "step": 1520 |
| }, |
| { |
| "completion_length": 190.1328125, |
| "epoch": 0.20446942325705875, |
| "grad_norm": 3.484375, |
| "kl": 0.002442143566440791, |
| "learning_rate": 7.955305767429412e-07, |
| "loss": 0.0001, |
| "reward": -0.07442041672766209, |
| "reward_std": 0.4888562625274062, |
| "rewards/reward_func": -0.07442041672766209, |
| "step": 1528 |
| }, |
| { |
| "completion_length": 207.6328125, |
| "epoch": 0.20553994379767163, |
| "grad_norm": 3.359375, |
| "kl": 0.0033009210601449013, |
| "learning_rate": 7.944600562023284e-07, |
| "loss": 0.0001, |
| "reward": -0.009969270788133144, |
| "reward_std": 0.6862461306154728, |
| "rewards/reward_func": -0.009969270788133144, |
| "step": 1536 |
| }, |
| { |
| "completion_length": 162.59375, |
| "epoch": 0.20661046433828448, |
| "grad_norm": 4.65625, |
| "kl": 0.002739378687692806, |
| "learning_rate": 7.933895356617155e-07, |
| "loss": 0.0001, |
| "reward": 0.18637081049382687, |
| "reward_std": 0.5968187265098095, |
| "rewards/reward_func": 0.18637081049382687, |
| "step": 1544 |
| }, |
| { |
| "completion_length": 177.9453125, |
| "epoch": 0.20768098487889736, |
| "grad_norm": 3.53125, |
| "kl": 0.003128286494757049, |
| "learning_rate": 7.923190151211026e-07, |
| "loss": 0.0001, |
| "reward": 0.22664616536349058, |
| "reward_std": 0.6607938874512911, |
| "rewards/reward_func": 0.22664616536349058, |
| "step": 1552 |
| }, |
| { |
| "completion_length": 172.1328125, |
| "epoch": 0.20875150541951024, |
| "grad_norm": 3.953125, |
| "kl": 0.003004254394909367, |
| "learning_rate": 7.912484945804897e-07, |
| "loss": 0.0001, |
| "reward": 0.2532934434711933, |
| "reward_std": 0.45475378446280956, |
| "rewards/reward_func": 0.2532934434711933, |
| "step": 1560 |
| }, |
| { |
| "completion_length": 187.9609375, |
| "epoch": 0.20982202596012312, |
| "grad_norm": 3.890625, |
| "kl": 0.0027564516640268266, |
| "learning_rate": 7.901779740398768e-07, |
| "loss": 0.0001, |
| "reward": -0.02841023448854685, |
| "reward_std": 0.51457286067307, |
| "rewards/reward_func": -0.02841023448854685, |
| "step": 1568 |
| }, |
| { |
| "completion_length": 167.59375, |
| "epoch": 0.210892546500736, |
| "grad_norm": 2.515625, |
| "kl": 0.0030282980733318254, |
| "learning_rate": 7.89107453499264e-07, |
| "loss": 0.0001, |
| "reward": 0.4444689229130745, |
| "reward_std": 0.5939295422285795, |
| "rewards/reward_func": 0.4444689229130745, |
| "step": 1576 |
| }, |
| { |
| "completion_length": 170.5078125, |
| "epoch": 0.21196306704134885, |
| "grad_norm": 4.28125, |
| "kl": 0.0030873965588398278, |
| "learning_rate": 7.880369329586512e-07, |
| "loss": 0.0001, |
| "reward": 0.380419734865427, |
| "reward_std": 0.47338528744876385, |
| "rewards/reward_func": 0.380419734865427, |
| "step": 1584 |
| }, |
| { |
| "completion_length": 211.0234375, |
| "epoch": 0.21303358758196173, |
| "grad_norm": 2.75, |
| "kl": 0.002254050428746268, |
| "learning_rate": 7.869664124180381e-07, |
| "loss": 0.0001, |
| "reward": 0.2378121637739241, |
| "reward_std": 0.4477904764935374, |
| "rewards/reward_func": 0.2378121637739241, |
| "step": 1592 |
| }, |
| { |
| "completion_length": 172.953125, |
| "epoch": 0.2141041081225746, |
| "grad_norm": 3.296875, |
| "kl": 0.0025954252487281337, |
| "learning_rate": 7.858958918774253e-07, |
| "loss": 0.0001, |
| "reward": 0.2678052484989166, |
| "reward_std": 0.5658102091401815, |
| "rewards/reward_func": 0.2678052484989166, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 186.3125, |
| "epoch": 0.21517462866318748, |
| "grad_norm": 3.59375, |
| "kl": 0.0029648117488250136, |
| "learning_rate": 7.848253713368125e-07, |
| "loss": 0.0001, |
| "reward": -0.019576035905629396, |
| "reward_std": 0.6440879367291927, |
| "rewards/reward_func": -0.019576035905629396, |
| "step": 1608 |
| }, |
| { |
| "completion_length": 172.1015625, |
| "epoch": 0.21624514920380034, |
| "grad_norm": 5.34375, |
| "kl": 0.0030590661335736513, |
| "learning_rate": 7.837548507961997e-07, |
| "loss": 0.0001, |
| "reward": 0.14971440564841032, |
| "reward_std": 0.7218026369810104, |
| "rewards/reward_func": 0.14971440564841032, |
| "step": 1616 |
| }, |
| { |
| "completion_length": 194.1796875, |
| "epoch": 0.21731566974441321, |
| "grad_norm": 3.859375, |
| "kl": 0.0026531801267992705, |
| "learning_rate": 7.826843302555867e-07, |
| "loss": 0.0001, |
| "reward": 0.10876535065472126, |
| "reward_std": 0.5477734114974737, |
| "rewards/reward_func": 0.10876535065472126, |
| "step": 1624 |
| }, |
| { |
| "completion_length": 171.3828125, |
| "epoch": 0.2183861902850261, |
| "grad_norm": 3.15625, |
| "kl": 0.0031873490661382675, |
| "learning_rate": 7.816138097149738e-07, |
| "loss": 0.0001, |
| "reward": 0.45664553716778755, |
| "reward_std": 0.41215432807803154, |
| "rewards/reward_func": 0.45664553716778755, |
| "step": 1632 |
| }, |
| { |
| "completion_length": 180.3515625, |
| "epoch": 0.21945671082563897, |
| "grad_norm": 2.953125, |
| "kl": 0.0028260272229090333, |
| "learning_rate": 7.80543289174361e-07, |
| "loss": 0.0001, |
| "reward": 0.12724297121167183, |
| "reward_std": 0.5571443336084485, |
| "rewards/reward_func": 0.12724297121167183, |
| "step": 1640 |
| }, |
| { |
| "completion_length": 171.859375, |
| "epoch": 0.22052723136625185, |
| "grad_norm": 4.125, |
| "kl": 0.003236800170270726, |
| "learning_rate": 7.794727686337482e-07, |
| "loss": 0.0001, |
| "reward": 0.3013367038220167, |
| "reward_std": 0.47213477827608585, |
| "rewards/reward_func": 0.3013367038220167, |
| "step": 1648 |
| }, |
| { |
| "completion_length": 170.015625, |
| "epoch": 0.2215977519068647, |
| "grad_norm": 4.125, |
| "kl": 0.0037978598556946963, |
| "learning_rate": 7.784022480931352e-07, |
| "loss": 0.0002, |
| "reward": 0.15545153710991144, |
| "reward_std": 0.4550722800195217, |
| "rewards/reward_func": 0.15545153710991144, |
| "step": 1656 |
| }, |
| { |
| "completion_length": 169.140625, |
| "epoch": 0.22266827244747758, |
| "grad_norm": 5.0, |
| "kl": 0.0032820345077198, |
| "learning_rate": 7.773317275525224e-07, |
| "loss": 0.0001, |
| "reward": 0.4377444460988045, |
| "reward_std": 0.3618372976779938, |
| "rewards/reward_func": 0.4377444460988045, |
| "step": 1664 |
| }, |
| { |
| "completion_length": 157.703125, |
| "epoch": 0.22373879298809046, |
| "grad_norm": 3.90625, |
| "kl": 0.0029510459426091984, |
| "learning_rate": 7.762612070119096e-07, |
| "loss": 0.0001, |
| "reward": 0.3989291125908494, |
| "reward_std": 0.6332539850845933, |
| "rewards/reward_func": 0.3989291125908494, |
| "step": 1672 |
| }, |
| { |
| "completion_length": 162.046875, |
| "epoch": 0.22480931352870334, |
| "grad_norm": 6.59375, |
| "kl": 0.0033083032321883366, |
| "learning_rate": 7.751906864712966e-07, |
| "loss": 0.0001, |
| "reward": 0.209829643368721, |
| "reward_std": 0.6311223246157169, |
| "rewards/reward_func": 0.209829643368721, |
| "step": 1680 |
| }, |
| { |
| "completion_length": 170.15625, |
| "epoch": 0.2258798340693162, |
| "grad_norm": 3.796875, |
| "kl": 0.0031797273695701733, |
| "learning_rate": 7.741201659306837e-07, |
| "loss": 0.0001, |
| "reward": 0.3279075580649078, |
| "reward_std": 0.5484324526041746, |
| "rewards/reward_func": 0.3279075580649078, |
| "step": 1688 |
| }, |
| { |
| "completion_length": 162.9921875, |
| "epoch": 0.22695035460992907, |
| "grad_norm": 4.03125, |
| "kl": 0.0034852683020289987, |
| "learning_rate": 7.730496453900709e-07, |
| "loss": 0.0001, |
| "reward": 0.4082569610327482, |
| "reward_std": 0.6405626218765974, |
| "rewards/reward_func": 0.4082569610327482, |
| "step": 1696 |
| }, |
| { |
| "completion_length": 181.515625, |
| "epoch": 0.22802087515054195, |
| "grad_norm": 4.40625, |
| "kl": 0.002649015310453251, |
| "learning_rate": 7.719791248494581e-07, |
| "loss": 0.0001, |
| "reward": 0.369276593439281, |
| "reward_std": 0.6541860643774271, |
| "rewards/reward_func": 0.369276593439281, |
| "step": 1704 |
| }, |
| { |
| "completion_length": 178.90625, |
| "epoch": 0.22909139569115483, |
| "grad_norm": 4.09375, |
| "kl": 0.003142255067359656, |
| "learning_rate": 7.709086043088452e-07, |
| "loss": 0.0001, |
| "reward": -0.047080494463443756, |
| "reward_std": 0.5225706771016121, |
| "rewards/reward_func": -0.047080494463443756, |
| "step": 1712 |
| }, |
| { |
| "completion_length": 186.4375, |
| "epoch": 0.2301619162317677, |
| "grad_norm": 3.53125, |
| "kl": 0.002581312000984326, |
| "learning_rate": 7.698380837682322e-07, |
| "loss": 0.0001, |
| "reward": 0.38728183694183826, |
| "reward_std": 0.5371669437736273, |
| "rewards/reward_func": 0.38728183694183826, |
| "step": 1720 |
| }, |
| { |
| "completion_length": 186.484375, |
| "epoch": 0.23123243677238056, |
| "grad_norm": 4.59375, |
| "kl": 0.0032444458920508623, |
| "learning_rate": 7.687675632276194e-07, |
| "loss": 0.0001, |
| "reward": 0.04241009894758463, |
| "reward_std": 0.571906641125679, |
| "rewards/reward_func": 0.04241009894758463, |
| "step": 1728 |
| }, |
| { |
| "completion_length": 162.625, |
| "epoch": 0.23230295731299344, |
| "grad_norm": 3.3125, |
| "kl": 0.003708757780259475, |
| "learning_rate": 7.676970426870065e-07, |
| "loss": 0.0001, |
| "reward": 0.44313428178429604, |
| "reward_std": 0.49149057269096375, |
| "rewards/reward_func": 0.44313428178429604, |
| "step": 1736 |
| }, |
| { |
| "completion_length": 152.2578125, |
| "epoch": 0.23337347785360632, |
| "grad_norm": 6.78125, |
| "kl": 0.004187669139355421, |
| "learning_rate": 7.666265221463937e-07, |
| "loss": 0.0002, |
| "reward": 0.28122030571103096, |
| "reward_std": 0.66860780864954, |
| "rewards/reward_func": 0.28122030571103096, |
| "step": 1744 |
| }, |
| { |
| "completion_length": 171.765625, |
| "epoch": 0.2344439983942192, |
| "grad_norm": 4.3125, |
| "kl": 0.0036364277184475213, |
| "learning_rate": 7.655560016057808e-07, |
| "loss": 0.0001, |
| "reward": 0.19042097311466932, |
| "reward_std": 0.6095598358660936, |
| "rewards/reward_func": 0.19042097311466932, |
| "step": 1752 |
| }, |
| { |
| "completion_length": 157.0, |
| "epoch": 0.23551451893483205, |
| "grad_norm": 5.0, |
| "kl": 0.003160024934913963, |
| "learning_rate": 7.644854810651679e-07, |
| "loss": 0.0001, |
| "reward": 0.4550087433308363, |
| "reward_std": 0.4861539136618376, |
| "rewards/reward_func": 0.4550087433308363, |
| "step": 1760 |
| }, |
| { |
| "completion_length": 166.6328125, |
| "epoch": 0.23658503947544493, |
| "grad_norm": 4.375, |
| "kl": 0.003758498263778165, |
| "learning_rate": 7.63414960524555e-07, |
| "loss": 0.0002, |
| "reward": 0.3313362691551447, |
| "reward_std": 0.6437762156128883, |
| "rewards/reward_func": 0.3313362691551447, |
| "step": 1768 |
| }, |
| { |
| "completion_length": 151.0859375, |
| "epoch": 0.2376555600160578, |
| "grad_norm": 4.5, |
| "kl": 0.0032230821962002665, |
| "learning_rate": 7.623444399839421e-07, |
| "loss": 0.0001, |
| "reward": 0.22410259768366814, |
| "reward_std": 0.5125870034098625, |
| "rewards/reward_func": 0.22410259768366814, |
| "step": 1776 |
| }, |
| { |
| "completion_length": 168.3984375, |
| "epoch": 0.2387260805566707, |
| "grad_norm": 8.0625, |
| "kl": 0.0032083308324217796, |
| "learning_rate": 7.612739194433293e-07, |
| "loss": 0.0001, |
| "reward": 0.33182543236762285, |
| "reward_std": 0.6010603215545416, |
| "rewards/reward_func": 0.33182543236762285, |
| "step": 1784 |
| }, |
| { |
| "completion_length": 207.2265625, |
| "epoch": 0.23979660109728354, |
| "grad_norm": 3.34375, |
| "kl": 0.002648265421157703, |
| "learning_rate": 7.602033989027165e-07, |
| "loss": 0.0001, |
| "reward": 0.03860955499112606, |
| "reward_std": 0.5622509643435478, |
| "rewards/reward_func": 0.03860955499112606, |
| "step": 1792 |
| }, |
| { |
| "completion_length": 176.15625, |
| "epoch": 0.24086712163789642, |
| "grad_norm": 4.8125, |
| "kl": 0.003711075463797897, |
| "learning_rate": 7.591328783621035e-07, |
| "loss": 0.0001, |
| "reward": 0.30269888415932655, |
| "reward_std": 0.5988016724586487, |
| "rewards/reward_func": 0.30269888415932655, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 185.546875, |
| "epoch": 0.2419376421785093, |
| "grad_norm": 2.734375, |
| "kl": 0.0029471274465322495, |
| "learning_rate": 7.580623578214906e-07, |
| "loss": 0.0001, |
| "reward": 0.29960334673523903, |
| "reward_std": 0.5849093981087208, |
| "rewards/reward_func": 0.29960334673523903, |
| "step": 1808 |
| }, |
| { |
| "completion_length": 167.2109375, |
| "epoch": 0.24300816271912218, |
| "grad_norm": 4.96875, |
| "kl": 0.003220759332180023, |
| "learning_rate": 7.569918372808778e-07, |
| "loss": 0.0001, |
| "reward": 0.1449232380837202, |
| "reward_std": 0.6299447380006313, |
| "rewards/reward_func": 0.1449232380837202, |
| "step": 1816 |
| }, |
| { |
| "completion_length": 171.4375, |
| "epoch": 0.24407868325973506, |
| "grad_norm": 3.46875, |
| "kl": 0.003584496444091201, |
| "learning_rate": 7.559213167402649e-07, |
| "loss": 0.0001, |
| "reward": 0.1334919836372137, |
| "reward_std": 0.5379905849695206, |
| "rewards/reward_func": 0.1334919836372137, |
| "step": 1824 |
| }, |
| { |
| "completion_length": 176.0703125, |
| "epoch": 0.2451492038003479, |
| "grad_norm": 3.734375, |
| "kl": 0.0033595635613892227, |
| "learning_rate": 7.548507961996521e-07, |
| "loss": 0.0001, |
| "reward": 0.2927993945777416, |
| "reward_std": 0.48013297095894814, |
| "rewards/reward_func": 0.2927993945777416, |
| "step": 1832 |
| }, |
| { |
| "completion_length": 156.171875, |
| "epoch": 0.2462197243409608, |
| "grad_norm": 3.953125, |
| "kl": 0.0032605840533506125, |
| "learning_rate": 7.537802756590391e-07, |
| "loss": 0.0001, |
| "reward": 0.4554846081882715, |
| "reward_std": 0.46569772996008396, |
| "rewards/reward_func": 0.4554846081882715, |
| "step": 1840 |
| }, |
| { |
| "completion_length": 154.8515625, |
| "epoch": 0.24729024488157367, |
| "grad_norm": 5.09375, |
| "kl": 0.0038292294193524867, |
| "learning_rate": 7.527097551184263e-07, |
| "loss": 0.0002, |
| "reward": 0.2707878933288157, |
| "reward_std": 0.6204773802310228, |
| "rewards/reward_func": 0.2707878933288157, |
| "step": 1848 |
| }, |
| { |
| "completion_length": 173.046875, |
| "epoch": 0.24836076542218655, |
| "grad_norm": 3.671875, |
| "kl": 0.003133324411464855, |
| "learning_rate": 7.516392345778134e-07, |
| "loss": 0.0001, |
| "reward": 0.16592059656977654, |
| "reward_std": 0.5527506861835718, |
| "rewards/reward_func": 0.16592059656977654, |
| "step": 1856 |
| }, |
| { |
| "completion_length": 167.9375, |
| "epoch": 0.2494312859627994, |
| "grad_norm": 4.125, |
| "kl": 0.0033082127920351923, |
| "learning_rate": 7.505687140372006e-07, |
| "loss": 0.0001, |
| "reward": 0.20683408807963133, |
| "reward_std": 0.5755761060863733, |
| "rewards/reward_func": 0.20683408807963133, |
| "step": 1864 |
| }, |
| { |
| "completion_length": 184.7890625, |
| "epoch": 0.2505018065034123, |
| "grad_norm": 4.125, |
| "kl": 0.0028639852243941277, |
| "learning_rate": 7.494981934965877e-07, |
| "loss": 0.0001, |
| "reward": 0.22647499293088913, |
| "reward_std": 0.5401746807619929, |
| "rewards/reward_func": 0.22647499293088913, |
| "step": 1872 |
| }, |
| { |
| "completion_length": 177.9921875, |
| "epoch": 0.25157232704402516, |
| "grad_norm": 4.75, |
| "kl": 0.0032885581313166767, |
| "learning_rate": 7.484276729559747e-07, |
| "loss": 0.0001, |
| "reward": 0.19369017332792282, |
| "reward_std": 0.7017679810523987, |
| "rewards/reward_func": 0.19369017332792282, |
| "step": 1880 |
| }, |
| { |
| "completion_length": 165.171875, |
| "epoch": 0.252642847584638, |
| "grad_norm": 4.125, |
| "kl": 0.0038394963194150478, |
| "learning_rate": 7.473571524153619e-07, |
| "loss": 0.0002, |
| "reward": 0.4050522642210126, |
| "reward_std": 0.49607561621814966, |
| "rewards/reward_func": 0.4050522642210126, |
| "step": 1888 |
| }, |
| { |
| "completion_length": 212.78125, |
| "epoch": 0.2537133681252509, |
| "grad_norm": 4.625, |
| "kl": 0.003014246642123908, |
| "learning_rate": 7.462866318747491e-07, |
| "loss": 0.0001, |
| "reward": -0.03455093875527382, |
| "reward_std": 0.5755152553319931, |
| "rewards/reward_func": -0.03455093875527382, |
| "step": 1896 |
| }, |
| { |
| "completion_length": 154.8125, |
| "epoch": 0.25478388866586377, |
| "grad_norm": 5.5, |
| "kl": 0.0037700315297115594, |
| "learning_rate": 7.452161113341362e-07, |
| "loss": 0.0002, |
| "reward": 0.5173049904406071, |
| "reward_std": 0.5908821411430836, |
| "rewards/reward_func": 0.5173049904406071, |
| "step": 1904 |
| }, |
| { |
| "completion_length": 175.0703125, |
| "epoch": 0.2558544092064767, |
| "grad_norm": 3.734375, |
| "kl": 0.003177426100592129, |
| "learning_rate": 7.441455907935233e-07, |
| "loss": 0.0001, |
| "reward": 0.3084242893382907, |
| "reward_std": 0.5546926856040955, |
| "rewards/reward_func": 0.3084242893382907, |
| "step": 1912 |
| }, |
| { |
| "completion_length": 153.734375, |
| "epoch": 0.2569249297470895, |
| "grad_norm": 4.65625, |
| "kl": 0.0028014232811983675, |
| "learning_rate": 7.430750702529105e-07, |
| "loss": 0.0001, |
| "reward": 0.4781934395432472, |
| "reward_std": 0.3370926305651665, |
| "rewards/reward_func": 0.4781934395432472, |
| "step": 1920 |
| }, |
| { |
| "completion_length": 174.8203125, |
| "epoch": 0.2579954502877024, |
| "grad_norm": 3.703125, |
| "kl": 0.0027142605104018003, |
| "learning_rate": 7.420045497122976e-07, |
| "loss": 0.0001, |
| "reward": 0.34992816112935543, |
| "reward_std": 0.5419151671230793, |
| "rewards/reward_func": 0.34992816112935543, |
| "step": 1928 |
| }, |
| { |
| "completion_length": 164.3671875, |
| "epoch": 0.2590659708283153, |
| "grad_norm": 4.53125, |
| "kl": 0.003608020633691922, |
| "learning_rate": 7.409340291716846e-07, |
| "loss": 0.0001, |
| "reward": 0.23989262245595455, |
| "reward_std": 0.6161230951547623, |
| "rewards/reward_func": 0.23989262245595455, |
| "step": 1936 |
| }, |
| { |
| "completion_length": 176.71875, |
| "epoch": 0.26013649136892814, |
| "grad_norm": 3.46875, |
| "kl": 0.0028162887319922447, |
| "learning_rate": 7.398635086310718e-07, |
| "loss": 0.0001, |
| "reward": 0.2081197015941143, |
| "reward_std": 0.6199641041457653, |
| "rewards/reward_func": 0.2081197015941143, |
| "step": 1944 |
| }, |
| { |
| "completion_length": 188.7890625, |
| "epoch": 0.261207011909541, |
| "grad_norm": 3.9375, |
| "kl": 0.0030088693019934, |
| "learning_rate": 7.38792988090459e-07, |
| "loss": 0.0001, |
| "reward": 0.43774592503905296, |
| "reward_std": 0.5932074896991253, |
| "rewards/reward_func": 0.43774592503905296, |
| "step": 1952 |
| }, |
| { |
| "completion_length": 183.203125, |
| "epoch": 0.2622775324501539, |
| "grad_norm": 2.671875, |
| "kl": 0.0032375668233726174, |
| "learning_rate": 7.377224675498462e-07, |
| "loss": 0.0001, |
| "reward": 0.14370151609182358, |
| "reward_std": 0.6233563013374805, |
| "rewards/reward_func": 0.14370151609182358, |
| "step": 1960 |
| }, |
| { |
| "completion_length": 188.3984375, |
| "epoch": 0.26334805299076675, |
| "grad_norm": 3.640625, |
| "kl": 0.003420975699555129, |
| "learning_rate": 7.366519470092331e-07, |
| "loss": 0.0001, |
| "reward": 0.3819169942289591, |
| "reward_std": 0.4839291740208864, |
| "rewards/reward_func": 0.3819169942289591, |
| "step": 1968 |
| }, |
| { |
| "completion_length": 168.9140625, |
| "epoch": 0.26441857353137965, |
| "grad_norm": 3.84375, |
| "kl": 0.0034208787546958774, |
| "learning_rate": 7.355814264686203e-07, |
| "loss": 0.0001, |
| "reward": 0.17081641219556332, |
| "reward_std": 0.6360666044056416, |
| "rewards/reward_func": 0.17081641219556332, |
| "step": 1976 |
| }, |
| { |
| "completion_length": 147.734375, |
| "epoch": 0.2654890940719925, |
| "grad_norm": 3.890625, |
| "kl": 0.0038502227107528597, |
| "learning_rate": 7.345109059280075e-07, |
| "loss": 0.0002, |
| "reward": 0.5051426645368338, |
| "reward_std": 0.539917191490531, |
| "rewards/reward_func": 0.5051426645368338, |
| "step": 1984 |
| }, |
| { |
| "completion_length": 144.2421875, |
| "epoch": 0.26655961461260536, |
| "grad_norm": 4.40625, |
| "kl": 0.004192993917968124, |
| "learning_rate": 7.334403853873946e-07, |
| "loss": 0.0002, |
| "reward": 0.5341411675326526, |
| "reward_std": 0.4230798315256834, |
| "rewards/reward_func": 0.5341411675326526, |
| "step": 1992 |
| }, |
| { |
| "completion_length": 147.1953125, |
| "epoch": 0.26763013515321826, |
| "grad_norm": 4.65625, |
| "kl": 0.00358515654806979, |
| "learning_rate": 7.323698648467817e-07, |
| "loss": 0.0001, |
| "reward": 0.33829054702073336, |
| "reward_std": 0.48450249992311, |
| "rewards/reward_func": 0.33829054702073336, |
| "step": 2000 |
| }, |
| { |
| "completion_length": 166.828125, |
| "epoch": 0.2687006556938311, |
| "grad_norm": 2.21875, |
| "kl": 0.003072334686294198, |
| "learning_rate": 7.312993443061688e-07, |
| "loss": 0.0001, |
| "reward": 0.358464740216732, |
| "reward_std": 0.5359793957322836, |
| "rewards/reward_func": 0.358464740216732, |
| "step": 2008 |
| }, |
| { |
| "completion_length": 187.921875, |
| "epoch": 0.269771176234444, |
| "grad_norm": 2.984375, |
| "kl": 0.003413002035813406, |
| "learning_rate": 7.30228823765556e-07, |
| "loss": 0.0001, |
| "reward": 0.12693564407527447, |
| "reward_std": 0.5908421259373426, |
| "rewards/reward_func": 0.12693564407527447, |
| "step": 2016 |
| }, |
| { |
| "completion_length": 162.4453125, |
| "epoch": 0.2708416967750569, |
| "grad_norm": 3.859375, |
| "kl": 0.0031307056196965277, |
| "learning_rate": 7.291583032249431e-07, |
| "loss": 0.0001, |
| "reward": 0.38590772822499275, |
| "reward_std": 0.48073394782841206, |
| "rewards/reward_func": 0.38590772822499275, |
| "step": 2024 |
| }, |
| { |
| "completion_length": 192.0234375, |
| "epoch": 0.2719122173156697, |
| "grad_norm": 3.546875, |
| "kl": 0.0032598864345345646, |
| "learning_rate": 7.280877826843302e-07, |
| "loss": 0.0001, |
| "reward": 0.18051442131400108, |
| "reward_std": 0.6438321061432362, |
| "rewards/reward_func": 0.18051442131400108, |
| "step": 2032 |
| }, |
| { |
| "completion_length": 148.3984375, |
| "epoch": 0.27298273785628263, |
| "grad_norm": 4.09375, |
| "kl": 0.0036882674612570554, |
| "learning_rate": 7.270172621437174e-07, |
| "loss": 0.0001, |
| "reward": 0.4433805178850889, |
| "reward_std": 0.615590687841177, |
| "rewards/reward_func": 0.4433805178850889, |
| "step": 2040 |
| }, |
| { |
| "completion_length": 161.9375, |
| "epoch": 0.2740532583968955, |
| "grad_norm": 3.828125, |
| "kl": 0.00417991288122721, |
| "learning_rate": 7.259467416031044e-07, |
| "loss": 0.0002, |
| "reward": 0.21279390715062618, |
| "reward_std": 0.5614198036491871, |
| "rewards/reward_func": 0.21279390715062618, |
| "step": 2048 |
| }, |
| { |
| "completion_length": 157.0234375, |
| "epoch": 0.2751237789375084, |
| "grad_norm": 3.359375, |
| "kl": 0.0038566369330510497, |
| "learning_rate": 7.248762210624916e-07, |
| "loss": 0.0002, |
| "reward": 0.539018552750349, |
| "reward_std": 0.5531130824238062, |
| "rewards/reward_func": 0.539018552750349, |
| "step": 2056 |
| }, |
| { |
| "completion_length": 188.140625, |
| "epoch": 0.27619429947812124, |
| "grad_norm": 4.0, |
| "kl": 0.0034979561460204422, |
| "learning_rate": 7.238057005218787e-07, |
| "loss": 0.0001, |
| "reward": 0.11634453199803829, |
| "reward_std": 0.599401269108057, |
| "rewards/reward_func": 0.11634453199803829, |
| "step": 2064 |
| }, |
| { |
| "completion_length": 180.5625, |
| "epoch": 0.2772648200187341, |
| "grad_norm": 4.75, |
| "kl": 0.0037427434872370213, |
| "learning_rate": 7.227351799812659e-07, |
| "loss": 0.0001, |
| "reward": 0.2462961538694799, |
| "reward_std": 0.5912914611399174, |
| "rewards/reward_func": 0.2462961538694799, |
| "step": 2072 |
| }, |
| { |
| "completion_length": 169.1484375, |
| "epoch": 0.278335340559347, |
| "grad_norm": 3.859375, |
| "kl": 0.003938340552849695, |
| "learning_rate": 7.21664659440653e-07, |
| "loss": 0.0002, |
| "reward": 0.21049488708376884, |
| "reward_std": 0.513383561745286, |
| "rewards/reward_func": 0.21049488708376884, |
| "step": 2080 |
| }, |
| { |
| "completion_length": 163.03125, |
| "epoch": 0.27940586109995985, |
| "grad_norm": 3.90625, |
| "kl": 0.003593464905861765, |
| "learning_rate": 7.205941389000401e-07, |
| "loss": 0.0001, |
| "reward": 0.42887144535779953, |
| "reward_std": 0.5823842044919729, |
| "rewards/reward_func": 0.42887144535779953, |
| "step": 2088 |
| }, |
| { |
| "completion_length": 154.2421875, |
| "epoch": 0.2804763816405727, |
| "grad_norm": 5.71875, |
| "kl": 0.004313324898248538, |
| "learning_rate": 7.195236183594272e-07, |
| "loss": 0.0002, |
| "reward": 0.48943471536040306, |
| "reward_std": 0.5817722771316767, |
| "rewards/reward_func": 0.48943471536040306, |
| "step": 2096 |
| }, |
| { |
| "completion_length": 153.3984375, |
| "epoch": 0.2815469021811856, |
| "grad_norm": 4.875, |
| "kl": 0.003974846331402659, |
| "learning_rate": 7.184530978188144e-07, |
| "loss": 0.0002, |
| "reward": 0.4710603700950742, |
| "reward_std": 0.47509198915213346, |
| "rewards/reward_func": 0.4710603700950742, |
| "step": 2104 |
| }, |
| { |
| "completion_length": 152.15625, |
| "epoch": 0.28261742272179846, |
| "grad_norm": 3.390625, |
| "kl": 0.003992282261606306, |
| "learning_rate": 7.173825772782015e-07, |
| "loss": 0.0002, |
| "reward": 0.27247738372534513, |
| "reward_std": 0.5785027798265219, |
| "rewards/reward_func": 0.27247738372534513, |
| "step": 2112 |
| }, |
| { |
| "completion_length": 187.140625, |
| "epoch": 0.28368794326241137, |
| "grad_norm": 5.46875, |
| "kl": 0.003529240610077977, |
| "learning_rate": 7.163120567375887e-07, |
| "loss": 0.0001, |
| "reward": -0.031075291335582733, |
| "reward_std": 0.5039861313998699, |
| "rewards/reward_func": -0.031075291335582733, |
| "step": 2120 |
| }, |
| { |
| "completion_length": 179.78125, |
| "epoch": 0.2847584638030242, |
| "grad_norm": 5.0, |
| "kl": 0.0030008777684997767, |
| "learning_rate": 7.152415361969757e-07, |
| "loss": 0.0001, |
| "reward": 0.11691426858305931, |
| "reward_std": 0.5304726148024201, |
| "rewards/reward_func": 0.11691426858305931, |
| "step": 2128 |
| }, |
| { |
| "completion_length": 195.9765625, |
| "epoch": 0.28582898434363707, |
| "grad_norm": 3.4375, |
| "kl": 0.0033693104924168438, |
| "learning_rate": 7.141710156563628e-07, |
| "loss": 0.0001, |
| "reward": 0.1879887394607067, |
| "reward_std": 0.5351051315665245, |
| "rewards/reward_func": 0.1879887394607067, |
| "step": 2136 |
| }, |
| { |
| "completion_length": 184.78125, |
| "epoch": 0.28689950488425, |
| "grad_norm": 3.359375, |
| "kl": 0.003082483890466392, |
| "learning_rate": 7.1310049511575e-07, |
| "loss": 0.0001, |
| "reward": 0.16520871315151453, |
| "reward_std": 0.3897149385884404, |
| "rewards/reward_func": 0.16520871315151453, |
| "step": 2144 |
| }, |
| { |
| "completion_length": 161.625, |
| "epoch": 0.28797002542486283, |
| "grad_norm": 3.28125, |
| "kl": 0.0033815766510087997, |
| "learning_rate": 7.120299745751372e-07, |
| "loss": 0.0001, |
| "reward": 0.2989609017968178, |
| "reward_std": 0.6460195314139128, |
| "rewards/reward_func": 0.2989609017968178, |
| "step": 2152 |
| }, |
| { |
| "completion_length": 158.734375, |
| "epoch": 0.28904054596547574, |
| "grad_norm": 3.65625, |
| "kl": 0.003546297753928229, |
| "learning_rate": 7.109594540345243e-07, |
| "loss": 0.0001, |
| "reward": 0.18108150828629732, |
| "reward_std": 0.5971081778407097, |
| "rewards/reward_func": 0.18108150828629732, |
| "step": 2160 |
| }, |
| { |
| "completion_length": 162.09375, |
| "epoch": 0.2901110665060886, |
| "grad_norm": 4.1875, |
| "kl": 0.003662400442408398, |
| "learning_rate": 7.098889334939114e-07, |
| "loss": 0.0001, |
| "reward": 0.18057992309331894, |
| "reward_std": 0.5010180473327637, |
| "rewards/reward_func": 0.18057992309331894, |
| "step": 2168 |
| }, |
| { |
| "completion_length": 168.703125, |
| "epoch": 0.29118158704670144, |
| "grad_norm": 3.5625, |
| "kl": 0.0036488809855654836, |
| "learning_rate": 7.088184129532985e-07, |
| "loss": 0.0001, |
| "reward": 0.3672813940793276, |
| "reward_std": 0.5888884011656046, |
| "rewards/reward_func": 0.3672813940793276, |
| "step": 2176 |
| }, |
| { |
| "completion_length": 160.6015625, |
| "epoch": 0.29225210758731435, |
| "grad_norm": 4.03125, |
| "kl": 0.0035606406745500863, |
| "learning_rate": 7.077478924126857e-07, |
| "loss": 0.0001, |
| "reward": 0.34612463414669037, |
| "reward_std": 0.5678670313209295, |
| "rewards/reward_func": 0.34612463414669037, |
| "step": 2184 |
| }, |
| { |
| "completion_length": 186.1640625, |
| "epoch": 0.2933226281279272, |
| "grad_norm": 3.546875, |
| "kl": 0.0036609756061807275, |
| "learning_rate": 7.066773718720727e-07, |
| "loss": 0.0001, |
| "reward": 0.12424429133534431, |
| "reward_std": 0.5589212942868471, |
| "rewards/reward_func": 0.12424429133534431, |
| "step": 2192 |
| }, |
| { |
| "completion_length": 158.578125, |
| "epoch": 0.29439314866854005, |
| "grad_norm": 4.09375, |
| "kl": 0.003958662680815905, |
| "learning_rate": 7.056068513314599e-07, |
| "loss": 0.0002, |
| "reward": 0.2399831861257553, |
| "reward_std": 0.6095044985413551, |
| "rewards/reward_func": 0.2399831861257553, |
| "step": 2200 |
| }, |
| { |
| "completion_length": 150.6953125, |
| "epoch": 0.29546366920915296, |
| "grad_norm": 3.109375, |
| "kl": 0.004275305866030976, |
| "learning_rate": 7.045363307908471e-07, |
| "loss": 0.0002, |
| "reward": 0.42568245250731707, |
| "reward_std": 0.5035090297460556, |
| "rewards/reward_func": 0.42568245250731707, |
| "step": 2208 |
| }, |
| { |
| "completion_length": 198.25, |
| "epoch": 0.2965341897497658, |
| "grad_norm": 4.75, |
| "kl": 0.0037761297717224807, |
| "learning_rate": 7.034658102502341e-07, |
| "loss": 0.0002, |
| "reward": 0.20382929779589176, |
| "reward_std": 0.6004747971892357, |
| "rewards/reward_func": 0.20382929779589176, |
| "step": 2216 |
| }, |
| { |
| "completion_length": 166.953125, |
| "epoch": 0.2976047102903787, |
| "grad_norm": 5.59375, |
| "kl": 0.004744472214952111, |
| "learning_rate": 7.023952897096212e-07, |
| "loss": 0.0002, |
| "reward": 0.12384441681206226, |
| "reward_std": 0.5781398452818394, |
| "rewards/reward_func": 0.12384441681206226, |
| "step": 2224 |
| }, |
| { |
| "completion_length": 151.3125, |
| "epoch": 0.29867523083099157, |
| "grad_norm": 6.46875, |
| "kl": 0.004519026639172807, |
| "learning_rate": 7.013247691690084e-07, |
| "loss": 0.0002, |
| "reward": 0.36841049790382385, |
| "reward_std": 0.3033979944884777, |
| "rewards/reward_func": 0.36841049790382385, |
| "step": 2232 |
| }, |
| { |
| "completion_length": 166.9375, |
| "epoch": 0.2997457513716044, |
| "grad_norm": 4.03125, |
| "kl": 0.003987667616456747, |
| "learning_rate": 7.002542486283956e-07, |
| "loss": 0.0002, |
| "reward": -0.1171187162399292, |
| "reward_std": 0.42854547686874866, |
| "rewards/reward_func": -0.1171187162399292, |
| "step": 2240 |
| }, |
| { |
| "completion_length": 183.328125, |
| "epoch": 0.3008162719122173, |
| "grad_norm": 3.515625, |
| "kl": 0.0033717694896040484, |
| "learning_rate": 6.991837280877828e-07, |
| "loss": 0.0001, |
| "reward": 0.22414767649024725, |
| "reward_std": 0.6165672689676285, |
| "rewards/reward_func": 0.22414767649024725, |
| "step": 2248 |
| }, |
| { |
| "completion_length": 193.2421875, |
| "epoch": 0.3018867924528302, |
| "grad_norm": 3.140625, |
| "kl": 0.0031051966943778098, |
| "learning_rate": 6.981132075471697e-07, |
| "loss": 0.0001, |
| "reward": 0.3812776654958725, |
| "reward_std": 0.5649162493646145, |
| "rewards/reward_func": 0.3812776654958725, |
| "step": 2256 |
| }, |
| { |
| "completion_length": 183.9765625, |
| "epoch": 0.3029573129934431, |
| "grad_norm": 4.4375, |
| "kl": 0.003768587455851957, |
| "learning_rate": 6.970426870065569e-07, |
| "loss": 0.0002, |
| "reward": 0.12498392723500729, |
| "reward_std": 0.4842628054320812, |
| "rewards/reward_func": 0.12498392723500729, |
| "step": 2264 |
| }, |
| { |
| "completion_length": 200.5390625, |
| "epoch": 0.30402783353405594, |
| "grad_norm": 4.3125, |
| "kl": 0.0034181236114818603, |
| "learning_rate": 6.959721664659441e-07, |
| "loss": 0.0001, |
| "reward": -0.04165226221084595, |
| "reward_std": 0.5646636541932821, |
| "rewards/reward_func": -0.04165226221084595, |
| "step": 2272 |
| }, |
| { |
| "completion_length": 153.8125, |
| "epoch": 0.3050983540746688, |
| "grad_norm": 3.5625, |
| "kl": 0.0040518031746614724, |
| "learning_rate": 6.949016459253311e-07, |
| "loss": 0.0002, |
| "reward": 0.35562361404299736, |
| "reward_std": 0.44176073744893074, |
| "rewards/reward_func": 0.35562361404299736, |
| "step": 2280 |
| }, |
| { |
| "completion_length": 179.8046875, |
| "epoch": 0.3061688746152817, |
| "grad_norm": 4.53125, |
| "kl": 0.0034489443933125585, |
| "learning_rate": 6.938311253847183e-07, |
| "loss": 0.0001, |
| "reward": 0.30737813375890255, |
| "reward_std": 0.5192515105009079, |
| "rewards/reward_func": 0.30737813375890255, |
| "step": 2288 |
| }, |
| { |
| "completion_length": 175.1015625, |
| "epoch": 0.30723939515589455, |
| "grad_norm": 4.4375, |
| "kl": 0.003332895546918735, |
| "learning_rate": 6.927606048441054e-07, |
| "loss": 0.0001, |
| "reward": 0.16119882743805647, |
| "reward_std": 0.6122260540723801, |
| "rewards/reward_func": 0.16119882743805647, |
| "step": 2296 |
| }, |
| { |
| "completion_length": 166.7421875, |
| "epoch": 0.30830991569650745, |
| "grad_norm": 2.65625, |
| "kl": 0.0034852146345656365, |
| "learning_rate": 6.916900843034925e-07, |
| "loss": 0.0001, |
| "reward": 0.22000440582633018, |
| "reward_std": 0.5837470442056656, |
| "rewards/reward_func": 0.22000440582633018, |
| "step": 2304 |
| }, |
| { |
| "completion_length": 146.2421875, |
| "epoch": 0.3093804362371203, |
| "grad_norm": 3.59375, |
| "kl": 0.003727599134435877, |
| "learning_rate": 6.906195637628796e-07, |
| "loss": 0.0001, |
| "reward": 0.18992659822106361, |
| "reward_std": 0.5706657655537128, |
| "rewards/reward_func": 0.18992659822106361, |
| "step": 2312 |
| }, |
| { |
| "completion_length": 152.09375, |
| "epoch": 0.31045095677773316, |
| "grad_norm": 3.03125, |
| "kl": 0.004237443121382967, |
| "learning_rate": 6.895490432222668e-07, |
| "loss": 0.0002, |
| "reward": 0.5161734204739332, |
| "reward_std": 0.5621990244835615, |
| "rewards/reward_func": 0.5161734204739332, |
| "step": 2320 |
| }, |
| { |
| "completion_length": 139.578125, |
| "epoch": 0.31152147731834606, |
| "grad_norm": 3.546875, |
| "kl": 0.0043363839504309, |
| "learning_rate": 6.88478522681654e-07, |
| "loss": 0.0002, |
| "reward": 0.3602239452302456, |
| "reward_std": 0.6682011783123016, |
| "rewards/reward_func": 0.3602239452302456, |
| "step": 2328 |
| }, |
| { |
| "completion_length": 159.1171875, |
| "epoch": 0.3125919978589589, |
| "grad_norm": 3.296875, |
| "kl": 0.005018363182898611, |
| "learning_rate": 6.87408002141041e-07, |
| "loss": 0.0002, |
| "reward": 0.18990419153124094, |
| "reward_std": 0.38154869619756937, |
| "rewards/reward_func": 0.18990419153124094, |
| "step": 2336 |
| }, |
| { |
| "completion_length": 186.171875, |
| "epoch": 0.31366251839957177, |
| "grad_norm": 3.75, |
| "kl": 0.0034995676251128316, |
| "learning_rate": 6.863374816004281e-07, |
| "loss": 0.0001, |
| "reward": 0.28119928389787674, |
| "reward_std": 0.6371741183102131, |
| "rewards/reward_func": 0.28119928389787674, |
| "step": 2344 |
| }, |
| { |
| "completion_length": 144.5390625, |
| "epoch": 0.3147330389401847, |
| "grad_norm": 3.109375, |
| "kl": 0.003701402310980484, |
| "learning_rate": 6.852669610598153e-07, |
| "loss": 0.0001, |
| "reward": 0.2914201710373163, |
| "reward_std": 0.5679418547078967, |
| "rewards/reward_func": 0.2914201710373163, |
| "step": 2352 |
| }, |
| { |
| "completion_length": 158.453125, |
| "epoch": 0.3158035594807975, |
| "grad_norm": 4.59375, |
| "kl": 0.003569768596207723, |
| "learning_rate": 6.841964405192025e-07, |
| "loss": 0.0001, |
| "reward": 0.38121682219207287, |
| "reward_std": 0.5718358978629112, |
| "rewards/reward_func": 0.38121682219207287, |
| "step": 2360 |
| }, |
| { |
| "completion_length": 172.234375, |
| "epoch": 0.31687408002141043, |
| "grad_norm": 4.09375, |
| "kl": 0.003890137653797865, |
| "learning_rate": 6.831259199785896e-07, |
| "loss": 0.0002, |
| "reward": 0.19206082820892334, |
| "reward_std": 0.5519562661647797, |
| "rewards/reward_func": 0.19206082820892334, |
| "step": 2368 |
| }, |
| { |
| "completion_length": 136.1171875, |
| "epoch": 0.3179446005620233, |
| "grad_norm": 3.953125, |
| "kl": 0.004021885659312829, |
| "learning_rate": 6.820553994379766e-07, |
| "loss": 0.0002, |
| "reward": 0.43440112797543406, |
| "reward_std": 0.5649959053844213, |
| "rewards/reward_func": 0.43440112797543406, |
| "step": 2376 |
| }, |
| { |
| "completion_length": 189.59375, |
| "epoch": 0.31901512110263613, |
| "grad_norm": 7.4375, |
| "kl": 0.0037745212903246284, |
| "learning_rate": 6.809848788973638e-07, |
| "loss": 0.0002, |
| "reward": 0.08486939128488302, |
| "reward_std": 0.5615943241864443, |
| "rewards/reward_func": 0.08486939128488302, |
| "step": 2384 |
| }, |
| { |
| "completion_length": 145.40625, |
| "epoch": 0.32008564164324904, |
| "grad_norm": 6.15625, |
| "kl": 0.004177739087026566, |
| "learning_rate": 6.799143583567509e-07, |
| "loss": 0.0002, |
| "reward": 0.03109552478417754, |
| "reward_std": 0.6218379884958267, |
| "rewards/reward_func": 0.03109552478417754, |
| "step": 2392 |
| }, |
| { |
| "completion_length": 160.3203125, |
| "epoch": 0.3211561621838619, |
| "grad_norm": 4.71875, |
| "kl": 0.004120006924495101, |
| "learning_rate": 6.788438378161381e-07, |
| "loss": 0.0002, |
| "reward": 0.33427711576223373, |
| "reward_std": 0.5099399294704199, |
| "rewards/reward_func": 0.33427711576223373, |
| "step": 2400 |
| }, |
| { |
| "completion_length": 165.9765625, |
| "epoch": 0.3222266827244748, |
| "grad_norm": 3.859375, |
| "kl": 0.0034675312926992774, |
| "learning_rate": 6.777733172755252e-07, |
| "loss": 0.0001, |
| "reward": 0.4284206023439765, |
| "reward_std": 0.5410223100334406, |
| "rewards/reward_func": 0.4284206023439765, |
| "step": 2408 |
| }, |
| { |
| "completion_length": 203.5234375, |
| "epoch": 0.32329720326508765, |
| "grad_norm": 2.828125, |
| "kl": 0.003464344044914469, |
| "learning_rate": 6.767027967349124e-07, |
| "loss": 0.0001, |
| "reward": 0.32477592676877975, |
| "reward_std": 0.5011547729372978, |
| "rewards/reward_func": 0.32477592676877975, |
| "step": 2416 |
| }, |
| { |
| "completion_length": 150.3984375, |
| "epoch": 0.3243677238057005, |
| "grad_norm": 3.296875, |
| "kl": 0.003587738669011742, |
| "learning_rate": 6.756322761942994e-07, |
| "loss": 0.0001, |
| "reward": 0.44885979406535625, |
| "reward_std": 0.5460297726094723, |
| "rewards/reward_func": 0.44885979406535625, |
| "step": 2424 |
| }, |
| { |
| "completion_length": 169.2890625, |
| "epoch": 0.3254382443463134, |
| "grad_norm": 3.296875, |
| "kl": 0.003916321613360196, |
| "learning_rate": 6.745617556536866e-07, |
| "loss": 0.0002, |
| "reward": 0.12248068256303668, |
| "reward_std": 0.5732488930225372, |
| "rewards/reward_func": 0.12248068256303668, |
| "step": 2432 |
| }, |
| { |
| "completion_length": 199.9609375, |
| "epoch": 0.32650876488692626, |
| "grad_norm": 3.5625, |
| "kl": 0.0033171565155498683, |
| "learning_rate": 6.734912351130737e-07, |
| "loss": 0.0001, |
| "reward": 0.2281382903456688, |
| "reward_std": 0.5701967515051365, |
| "rewards/reward_func": 0.2281382903456688, |
| "step": 2440 |
| }, |
| { |
| "completion_length": 191.703125, |
| "epoch": 0.32757928542753917, |
| "grad_norm": 4.4375, |
| "kl": 0.0034852577664423734, |
| "learning_rate": 6.724207145724608e-07, |
| "loss": 0.0001, |
| "reward": 0.21391855087131262, |
| "reward_std": 0.6829859614372253, |
| "rewards/reward_func": 0.21391855087131262, |
| "step": 2448 |
| }, |
| { |
| "completion_length": 186.296875, |
| "epoch": 0.328649805968152, |
| "grad_norm": 3.765625, |
| "kl": 0.004041536885779351, |
| "learning_rate": 6.71350194031848e-07, |
| "loss": 0.0002, |
| "reward": 0.16749184112995863, |
| "reward_std": 0.5975307431071997, |
| "rewards/reward_func": 0.16749184112995863, |
| "step": 2456 |
| }, |
| { |
| "completion_length": 169.34375, |
| "epoch": 0.32972032650876487, |
| "grad_norm": 3.890625, |
| "kl": 0.003387822740478441, |
| "learning_rate": 6.702796734912351e-07, |
| "loss": 0.0001, |
| "reward": 0.4501562397927046, |
| "reward_std": 0.4912100899964571, |
| "rewards/reward_func": 0.4501562397927046, |
| "step": 2464 |
| }, |
| { |
| "completion_length": 147.0, |
| "epoch": 0.3307908470493778, |
| "grad_norm": 3.125, |
| "kl": 0.0038211781647987664, |
| "learning_rate": 6.692091529506222e-07, |
| "loss": 0.0002, |
| "reward": 0.10390966571867466, |
| "reward_std": 0.4674555938690901, |
| "rewards/reward_func": 0.10390966571867466, |
| "step": 2472 |
| }, |
| { |
| "completion_length": 165.9375, |
| "epoch": 0.33186136758999063, |
| "grad_norm": 5.96875, |
| "kl": 0.0037551842688117176, |
| "learning_rate": 6.681386324100093e-07, |
| "loss": 0.0002, |
| "reward": 0.3239047722890973, |
| "reward_std": 0.544673465192318, |
| "rewards/reward_func": 0.3239047722890973, |
| "step": 2480 |
| }, |
| { |
| "completion_length": 170.4375, |
| "epoch": 0.3329318881306035, |
| "grad_norm": 3.015625, |
| "kl": 0.004028416806249879, |
| "learning_rate": 6.670681118693965e-07, |
| "loss": 0.0002, |
| "reward": 0.16679776646196842, |
| "reward_std": 0.4815365634858608, |
| "rewards/reward_func": 0.16679776646196842, |
| "step": 2488 |
| }, |
| { |
| "completion_length": 162.2265625, |
| "epoch": 0.3340024086712164, |
| "grad_norm": 3.734375, |
| "kl": 0.003879312367644161, |
| "learning_rate": 6.659975913287837e-07, |
| "loss": 0.0002, |
| "reward": 0.4071632297709584, |
| "reward_std": 0.5334546230733395, |
| "rewards/reward_func": 0.4071632297709584, |
| "step": 2496 |
| }, |
| { |
| "completion_length": 154.6953125, |
| "epoch": 0.33507292921182924, |
| "grad_norm": 3.84375, |
| "kl": 0.0042274416191503406, |
| "learning_rate": 6.649270707881706e-07, |
| "loss": 0.0002, |
| "reward": 0.17819023504853249, |
| "reward_std": 0.49565806053578854, |
| "rewards/reward_func": 0.17819023504853249, |
| "step": 2504 |
| }, |
| { |
| "completion_length": 192.609375, |
| "epoch": 0.33614344975244215, |
| "grad_norm": 4.0625, |
| "kl": 0.0035822324571199715, |
| "learning_rate": 6.638565502475578e-07, |
| "loss": 0.0001, |
| "reward": -0.004483510740101337, |
| "reward_std": 0.443414025940001, |
| "rewards/reward_func": -0.004483510740101337, |
| "step": 2512 |
| }, |
| { |
| "completion_length": 171.921875, |
| "epoch": 0.337213970293055, |
| "grad_norm": 4.65625, |
| "kl": 0.004149035812588409, |
| "learning_rate": 6.62786029706945e-07, |
| "loss": 0.0002, |
| "reward": 0.08967352751642466, |
| "reward_std": 0.5806238334625959, |
| "rewards/reward_func": 0.08967352751642466, |
| "step": 2520 |
| }, |
| { |
| "completion_length": 153.5703125, |
| "epoch": 0.33828449083366785, |
| "grad_norm": 4.375, |
| "kl": 0.004091008595423773, |
| "learning_rate": 6.617155091663322e-07, |
| "loss": 0.0002, |
| "reward": 0.32766120694577694, |
| "reward_std": 0.5018663741648197, |
| "rewards/reward_func": 0.32766120694577694, |
| "step": 2528 |
| }, |
| { |
| "completion_length": 180.65625, |
| "epoch": 0.33935501137428076, |
| "grad_norm": 4.0625, |
| "kl": 0.003188255534041673, |
| "learning_rate": 6.606449886257192e-07, |
| "loss": 0.0001, |
| "reward": 0.09142577461898327, |
| "reward_std": 0.6621435023844242, |
| "rewards/reward_func": 0.09142577461898327, |
| "step": 2536 |
| }, |
| { |
| "completion_length": 175.28125, |
| "epoch": 0.3404255319148936, |
| "grad_norm": 4.46875, |
| "kl": 0.003918278380297124, |
| "learning_rate": 6.595744680851063e-07, |
| "loss": 0.0002, |
| "reward": 0.26254068687558174, |
| "reward_std": 0.4977311482653022, |
| "rewards/reward_func": 0.26254068687558174, |
| "step": 2544 |
| }, |
| { |
| "completion_length": 178.5859375, |
| "epoch": 0.3414960524555065, |
| "grad_norm": 2.78125, |
| "kl": 0.0037679201050195843, |
| "learning_rate": 6.585039475444935e-07, |
| "loss": 0.0002, |
| "reward": 0.2359130820259452, |
| "reward_std": 0.6390624288469553, |
| "rewards/reward_func": 0.2359130820259452, |
| "step": 2552 |
| }, |
| { |
| "completion_length": 185.7421875, |
| "epoch": 0.34256657299611937, |
| "grad_norm": 4.0625, |
| "kl": 0.003888906561769545, |
| "learning_rate": 6.574334270038807e-07, |
| "loss": 0.0002, |
| "reward": 0.06335067562758923, |
| "reward_std": 0.5590885141864419, |
| "rewards/reward_func": 0.06335067562758923, |
| "step": 2560 |
| }, |
| { |
| "completion_length": 156.1953125, |
| "epoch": 0.3436370935367322, |
| "grad_norm": 4.03125, |
| "kl": 0.004304436064558104, |
| "learning_rate": 6.563629064632677e-07, |
| "loss": 0.0002, |
| "reward": 0.20302090607583523, |
| "reward_std": 0.5702759772539139, |
| "rewards/reward_func": 0.20302090607583523, |
| "step": 2568 |
| }, |
| { |
| "completion_length": 132.640625, |
| "epoch": 0.3447076140773451, |
| "grad_norm": 4.34375, |
| "kl": 0.004362121399026364, |
| "learning_rate": 6.552923859226549e-07, |
| "loss": 0.0002, |
| "reward": 0.6651312373578548, |
| "reward_std": 0.3866056464612484, |
| "rewards/reward_func": 0.6651312373578548, |
| "step": 2576 |
| }, |
| { |
| "completion_length": 153.0078125, |
| "epoch": 0.345778134617958, |
| "grad_norm": 3.109375, |
| "kl": 0.0041726555791683495, |
| "learning_rate": 6.54221865382042e-07, |
| "loss": 0.0002, |
| "reward": 0.2306189425289631, |
| "reward_std": 0.4977853484451771, |
| "rewards/reward_func": 0.2306189425289631, |
| "step": 2584 |
| }, |
| { |
| "completion_length": 166.15625, |
| "epoch": 0.34684865515857083, |
| "grad_norm": 3.59375, |
| "kl": 0.0034308232716284692, |
| "learning_rate": 6.531513448414291e-07, |
| "loss": 0.0001, |
| "reward": 0.11097771301865578, |
| "reward_std": 0.6078328117728233, |
| "rewards/reward_func": 0.11097771301865578, |
| "step": 2592 |
| }, |
| { |
| "completion_length": 153.0, |
| "epoch": 0.34791917569918374, |
| "grad_norm": 4.15625, |
| "kl": 0.003664735675556585, |
| "learning_rate": 6.520808243008162e-07, |
| "loss": 0.0001, |
| "reward": 0.49361006263643503, |
| "reward_std": 0.5971282683312893, |
| "rewards/reward_func": 0.49361006263643503, |
| "step": 2600 |
| }, |
| { |
| "completion_length": 164.8828125, |
| "epoch": 0.3489896962397966, |
| "grad_norm": 5.0, |
| "kl": 0.00437125310418196, |
| "learning_rate": 6.510103037602034e-07, |
| "loss": 0.0002, |
| "reward": 0.1592898964881897, |
| "reward_std": 0.5312252482399344, |
| "rewards/reward_func": 0.1592898964881897, |
| "step": 2608 |
| }, |
| { |
| "completion_length": 165.421875, |
| "epoch": 0.3500602167804095, |
| "grad_norm": 4.34375, |
| "kl": 0.0037221178063191473, |
| "learning_rate": 6.499397832195906e-07, |
| "loss": 0.0001, |
| "reward": 0.41797424480319023, |
| "reward_std": 0.5129497703164816, |
| "rewards/reward_func": 0.41797424480319023, |
| "step": 2616 |
| }, |
| { |
| "completion_length": 182.96875, |
| "epoch": 0.35113073732102235, |
| "grad_norm": 3.65625, |
| "kl": 0.004204686090815812, |
| "learning_rate": 6.488692626789775e-07, |
| "loss": 0.0002, |
| "reward": 0.24847618490457535, |
| "reward_std": 0.5075008701533079, |
| "rewards/reward_func": 0.24847618490457535, |
| "step": 2624 |
| }, |
| { |
| "completion_length": 150.1875, |
| "epoch": 0.3522012578616352, |
| "grad_norm": 4.53125, |
| "kl": 0.0036759270005859435, |
| "learning_rate": 6.477987421383647e-07, |
| "loss": 0.0001, |
| "reward": 0.37406357005238533, |
| "reward_std": 0.43564846366643906, |
| "rewards/reward_func": 0.37406357005238533, |
| "step": 2632 |
| }, |
| { |
| "completion_length": 179.859375, |
| "epoch": 0.3532717784022481, |
| "grad_norm": 4.15625, |
| "kl": 0.003904950339347124, |
| "learning_rate": 6.467282215977519e-07, |
| "loss": 0.0002, |
| "reward": 0.30777904158458114, |
| "reward_std": 0.5255319569259882, |
| "rewards/reward_func": 0.30777904158458114, |
| "step": 2640 |
| }, |
| { |
| "completion_length": 153.1171875, |
| "epoch": 0.35434229894286096, |
| "grad_norm": 4.71875, |
| "kl": 0.004152281413553283, |
| "learning_rate": 6.45657701057139e-07, |
| "loss": 0.0002, |
| "reward": 0.25166825857013464, |
| "reward_std": 0.5060204975306988, |
| "rewards/reward_func": 0.25166825857013464, |
| "step": 2648 |
| }, |
| { |
| "completion_length": 179.9140625, |
| "epoch": 0.35541281948347386, |
| "grad_norm": 5.09375, |
| "kl": 0.0037966810341458768, |
| "learning_rate": 6.445871805165262e-07, |
| "loss": 0.0002, |
| "reward": 0.048941366374492645, |
| "reward_std": 0.5631339196115732, |
| "rewards/reward_func": 0.048941366374492645, |
| "step": 2656 |
| }, |
| { |
| "completion_length": 144.828125, |
| "epoch": 0.3564833400240867, |
| "grad_norm": 4.3125, |
| "kl": 0.004289341013645753, |
| "learning_rate": 6.435166599759133e-07, |
| "loss": 0.0002, |
| "reward": 0.2979842973873019, |
| "reward_std": 0.5271645337343216, |
| "rewards/reward_func": 0.2979842973873019, |
| "step": 2664 |
| }, |
| { |
| "completion_length": 171.3984375, |
| "epoch": 0.35755386056469957, |
| "grad_norm": 4.59375, |
| "kl": 0.0042981151200365275, |
| "learning_rate": 6.424461394353004e-07, |
| "loss": 0.0002, |
| "reward": 0.17236249335110188, |
| "reward_std": 0.6582519998773932, |
| "rewards/reward_func": 0.17236249335110188, |
| "step": 2672 |
| }, |
| { |
| "completion_length": 174.0390625, |
| "epoch": 0.3586243811053125, |
| "grad_norm": 3.828125, |
| "kl": 0.004372917755972594, |
| "learning_rate": 6.413756188946875e-07, |
| "loss": 0.0002, |
| "reward": 0.1233967412263155, |
| "reward_std": 0.5646042246371508, |
| "rewards/reward_func": 0.1233967412263155, |
| "step": 2680 |
| }, |
| { |
| "completion_length": 215.0859375, |
| "epoch": 0.3596949016459253, |
| "grad_norm": 3.265625, |
| "kl": 0.0034220777451992035, |
| "learning_rate": 6.403050983540746e-07, |
| "loss": 0.0001, |
| "reward": 0.022455230355262756, |
| "reward_std": 0.4265636382624507, |
| "rewards/reward_func": 0.022455230355262756, |
| "step": 2688 |
| }, |
| { |
| "completion_length": 165.6640625, |
| "epoch": 0.36076542218653823, |
| "grad_norm": 3.3125, |
| "kl": 0.0036657150485552847, |
| "learning_rate": 6.392345778134618e-07, |
| "loss": 0.0001, |
| "reward": 0.1397247351706028, |
| "reward_std": 0.6145136766135693, |
| "rewards/reward_func": 0.1397247351706028, |
| "step": 2696 |
| }, |
| { |
| "completion_length": 150.046875, |
| "epoch": 0.3618359427271511, |
| "grad_norm": 4.40625, |
| "kl": 0.0038523364637512714, |
| "learning_rate": 6.381640572728489e-07, |
| "loss": 0.0002, |
| "reward": 0.09757146798074245, |
| "reward_std": 0.5506066791713238, |
| "rewards/reward_func": 0.09757146798074245, |
| "step": 2704 |
| }, |
| { |
| "completion_length": 185.109375, |
| "epoch": 0.36290646326776393, |
| "grad_norm": 3.28125, |
| "kl": 0.0035323128395248204, |
| "learning_rate": 6.37093536732236e-07, |
| "loss": 0.0001, |
| "reward": -0.08146252483129501, |
| "reward_std": 0.4336923873052001, |
| "rewards/reward_func": -0.08146252483129501, |
| "step": 2712 |
| }, |
| { |
| "completion_length": 196.1015625, |
| "epoch": 0.36397698380837684, |
| "grad_norm": 2.65625, |
| "kl": 0.003823416627710685, |
| "learning_rate": 6.360230161916231e-07, |
| "loss": 0.0002, |
| "reward": 0.3048650873824954, |
| "reward_std": 0.6732109598815441, |
| "rewards/reward_func": 0.3048650873824954, |
| "step": 2720 |
| }, |
| { |
| "completion_length": 160.6640625, |
| "epoch": 0.3650475043489897, |
| "grad_norm": 4.15625, |
| "kl": 0.00394167794729583, |
| "learning_rate": 6.349524956510103e-07, |
| "loss": 0.0002, |
| "reward": 0.3657691851258278, |
| "reward_std": 0.6270986460149288, |
| "rewards/reward_func": 0.3657691851258278, |
| "step": 2728 |
| }, |
| { |
| "completion_length": 158.0625, |
| "epoch": 0.36611802488960254, |
| "grad_norm": 4.34375, |
| "kl": 0.004382914863526821, |
| "learning_rate": 6.338819751103974e-07, |
| "loss": 0.0002, |
| "reward": 0.23460809141397476, |
| "reward_std": 0.5130759598687291, |
| "rewards/reward_func": 0.23460809141397476, |
| "step": 2736 |
| }, |
| { |
| "completion_length": 185.953125, |
| "epoch": 0.36718854543021545, |
| "grad_norm": 5.65625, |
| "kl": 0.004101004218682647, |
| "learning_rate": 6.328114545697846e-07, |
| "loss": 0.0002, |
| "reward": 0.1987906889989972, |
| "reward_std": 0.6350626721978188, |
| "rewards/reward_func": 0.1987906889989972, |
| "step": 2744 |
| }, |
| { |
| "completion_length": 206.921875, |
| "epoch": 0.3682590659708283, |
| "grad_norm": 3.140625, |
| "kl": 0.0035723625624086708, |
| "learning_rate": 6.317409340291716e-07, |
| "loss": 0.0001, |
| "reward": 0.05194275360554457, |
| "reward_std": 0.5546863917261362, |
| "rewards/reward_func": 0.05194275360554457, |
| "step": 2752 |
| }, |
| { |
| "completion_length": 189.2578125, |
| "epoch": 0.3693295865114412, |
| "grad_norm": 5.0, |
| "kl": 0.004587263334542513, |
| "learning_rate": 6.306704134885587e-07, |
| "loss": 0.0002, |
| "reward": 0.1652057245373726, |
| "reward_std": 0.6709528639912605, |
| "rewards/reward_func": 0.1652057245373726, |
| "step": 2760 |
| }, |
| { |
| "completion_length": 178.6796875, |
| "epoch": 0.37040010705205406, |
| "grad_norm": 3.890625, |
| "kl": 0.004105961037566885, |
| "learning_rate": 6.295998929479459e-07, |
| "loss": 0.0002, |
| "reward": 0.2511095069348812, |
| "reward_std": 0.48893540259450674, |
| "rewards/reward_func": 0.2511095069348812, |
| "step": 2768 |
| }, |
| { |
| "completion_length": 202.7578125, |
| "epoch": 0.3714706275926669, |
| "grad_norm": 3.890625, |
| "kl": 0.003215010277926922, |
| "learning_rate": 6.285293724073331e-07, |
| "loss": 0.0001, |
| "reward": 0.1104801157489419, |
| "reward_std": 0.5324361100792885, |
| "rewards/reward_func": 0.1104801157489419, |
| "step": 2776 |
| }, |
| { |
| "completion_length": 216.71875, |
| "epoch": 0.3725411481332798, |
| "grad_norm": 3.546875, |
| "kl": 0.003063723910599947, |
| "learning_rate": 6.274588518667202e-07, |
| "loss": 0.0001, |
| "reward": 0.1753014111891389, |
| "reward_std": 0.5819915365427732, |
| "rewards/reward_func": 0.1753014111891389, |
| "step": 2784 |
| }, |
| { |
| "completion_length": 159.84375, |
| "epoch": 0.37361166867389267, |
| "grad_norm": 3.34375, |
| "kl": 0.004174454777967185, |
| "learning_rate": 6.263883313261072e-07, |
| "loss": 0.0002, |
| "reward": 0.45437810756266117, |
| "reward_std": 0.4158835466951132, |
| "rewards/reward_func": 0.45437810756266117, |
| "step": 2792 |
| }, |
| { |
| "completion_length": 208.875, |
| "epoch": 0.3746821892145056, |
| "grad_norm": 3.296875, |
| "kl": 0.0031874127162154764, |
| "learning_rate": 6.253178107854944e-07, |
| "loss": 0.0001, |
| "reward": 0.3352484591305256, |
| "reward_std": 0.5119593776762486, |
| "rewards/reward_func": 0.3352484591305256, |
| "step": 2800 |
| }, |
| { |
| "completion_length": 197.4765625, |
| "epoch": 0.37575270975511843, |
| "grad_norm": 3.28125, |
| "kl": 0.0036684646474896, |
| "learning_rate": 6.242472902448816e-07, |
| "loss": 0.0001, |
| "reward": 0.27760483510792255, |
| "reward_std": 0.6436055637896061, |
| "rewards/reward_func": 0.27760483510792255, |
| "step": 2808 |
| }, |
| { |
| "completion_length": 152.84375, |
| "epoch": 0.3768232302957313, |
| "grad_norm": 4.8125, |
| "kl": 0.004549535195110366, |
| "learning_rate": 6.231767697042686e-07, |
| "loss": 0.0002, |
| "reward": 0.46624478977173567, |
| "reward_std": 0.6175453588366508, |
| "rewards/reward_func": 0.46624478977173567, |
| "step": 2816 |
| }, |
| { |
| "completion_length": 144.796875, |
| "epoch": 0.3778937508363442, |
| "grad_norm": 3.421875, |
| "kl": 0.0038801982591394335, |
| "learning_rate": 6.221062491636558e-07, |
| "loss": 0.0002, |
| "reward": 0.3716530613601208, |
| "reward_std": 0.5027909129858017, |
| "rewards/reward_func": 0.3716530613601208, |
| "step": 2824 |
| }, |
| { |
| "completion_length": 189.3984375, |
| "epoch": 0.37896427137695704, |
| "grad_norm": 4.25, |
| "kl": 0.003531339403707534, |
| "learning_rate": 6.210357286230429e-07, |
| "loss": 0.0001, |
| "reward": 0.0588820856064558, |
| "reward_std": 0.5144321415573359, |
| "rewards/reward_func": 0.0588820856064558, |
| "step": 2832 |
| }, |
| { |
| "completion_length": 151.71875, |
| "epoch": 0.3800347919175699, |
| "grad_norm": 4.40625, |
| "kl": 0.004161316930549219, |
| "learning_rate": 6.199652080824301e-07, |
| "loss": 0.0002, |
| "reward": 0.21836877800524235, |
| "reward_std": 0.6778084672987461, |
| "rewards/reward_func": 0.21836877800524235, |
| "step": 2840 |
| }, |
| { |
| "completion_length": 156.9140625, |
| "epoch": 0.3811053124581828, |
| "grad_norm": 5.03125, |
| "kl": 0.0043344263976905495, |
| "learning_rate": 6.188946875418171e-07, |
| "loss": 0.0002, |
| "reward": 0.43052874132990837, |
| "reward_std": 0.5890010427683592, |
| "rewards/reward_func": 0.43052874132990837, |
| "step": 2848 |
| }, |
| { |
| "completion_length": 165.859375, |
| "epoch": 0.38217583299879565, |
| "grad_norm": 4.71875, |
| "kl": 0.0040927641675807536, |
| "learning_rate": 6.178241670012043e-07, |
| "loss": 0.0002, |
| "reward": 0.09390930086374283, |
| "reward_std": 0.4254406839609146, |
| "rewards/reward_func": 0.09390930086374283, |
| "step": 2856 |
| }, |
| { |
| "completion_length": 151.796875, |
| "epoch": 0.38324635353940856, |
| "grad_norm": 4.6875, |
| "kl": 0.004312922974349931, |
| "learning_rate": 6.167536464605915e-07, |
| "loss": 0.0002, |
| "reward": 0.16593856737017632, |
| "reward_std": 0.6011241041123867, |
| "rewards/reward_func": 0.16593856737017632, |
| "step": 2864 |
| }, |
| { |
| "completion_length": 147.5859375, |
| "epoch": 0.3843168740800214, |
| "grad_norm": 4.78125, |
| "kl": 0.004850049444939941, |
| "learning_rate": 6.156831259199785e-07, |
| "loss": 0.0002, |
| "reward": 0.3436956908553839, |
| "reward_std": 0.4854423590004444, |
| "rewards/reward_func": 0.3436956908553839, |
| "step": 2872 |
| }, |
| { |
| "completion_length": 144.6015625, |
| "epoch": 0.38538739462063426, |
| "grad_norm": 3.453125, |
| "kl": 0.00466370303183794, |
| "learning_rate": 6.146126053793656e-07, |
| "loss": 0.0002, |
| "reward": 0.3992779180407524, |
| "reward_std": 0.5042364671826363, |
| "rewards/reward_func": 0.3992779180407524, |
| "step": 2880 |
| }, |
| { |
| "completion_length": 144.2578125, |
| "epoch": 0.38645791516124717, |
| "grad_norm": 5.15625, |
| "kl": 0.004223753814585507, |
| "learning_rate": 6.135420848387528e-07, |
| "loss": 0.0002, |
| "reward": -0.015506982803344727, |
| "reward_std": 0.6913204118609428, |
| "rewards/reward_func": -0.015506982803344727, |
| "step": 2888 |
| }, |
| { |
| "completion_length": 201.4453125, |
| "epoch": 0.38752843570186, |
| "grad_norm": 3.84375, |
| "kl": 0.00345133469090797, |
| "learning_rate": 6.1247156429814e-07, |
| "loss": 0.0001, |
| "reward": -0.23047319240868092, |
| "reward_std": 0.5747088566422462, |
| "rewards/reward_func": -0.23047319240868092, |
| "step": 2896 |
| }, |
| { |
| "completion_length": 180.2890625, |
| "epoch": 0.3885989562424729, |
| "grad_norm": 3.125, |
| "kl": 0.004600081476382911, |
| "learning_rate": 6.114010437575271e-07, |
| "loss": 0.0002, |
| "reward": -0.17729684710502625, |
| "reward_std": 0.3335055038332939, |
| "rewards/reward_func": -0.17729684710502625, |
| "step": 2904 |
| }, |
| { |
| "completion_length": 153.8046875, |
| "epoch": 0.3896694767830858, |
| "grad_norm": 4.09375, |
| "kl": 0.003945650125388056, |
| "learning_rate": 6.103305232169142e-07, |
| "loss": 0.0002, |
| "reward": 0.2731490605510771, |
| "reward_std": 0.573139002546668, |
| "rewards/reward_func": 0.2731490605510771, |
| "step": 2912 |
| }, |
| { |
| "completion_length": 150.1171875, |
| "epoch": 0.39073999732369863, |
| "grad_norm": 3.90625, |
| "kl": 0.004451691260328516, |
| "learning_rate": 6.092600026763013e-07, |
| "loss": 0.0002, |
| "reward": 0.1808023676276207, |
| "reward_std": 0.5803926577791572, |
| "rewards/reward_func": 0.1808023676276207, |
| "step": 2920 |
| }, |
| { |
| "completion_length": 199.84375, |
| "epoch": 0.39181051786431154, |
| "grad_norm": 3.5625, |
| "kl": 0.0031813042878638953, |
| "learning_rate": 6.081894821356885e-07, |
| "loss": 0.0001, |
| "reward": 0.22766825137659907, |
| "reward_std": 0.6164026372134686, |
| "rewards/reward_func": 0.22766825137659907, |
| "step": 2928 |
| }, |
| { |
| "completion_length": 199.2734375, |
| "epoch": 0.3928810384049244, |
| "grad_norm": 3.28125, |
| "kl": 0.0038126638100948185, |
| "learning_rate": 6.071189615950756e-07, |
| "loss": 0.0002, |
| "reward": 0.16099986899644136, |
| "reward_std": 0.7263254784047604, |
| "rewards/reward_func": 0.16099986899644136, |
| "step": 2936 |
| }, |
| { |
| "completion_length": 157.765625, |
| "epoch": 0.3939515589455373, |
| "grad_norm": 3.5, |
| "kl": 0.004354376200353727, |
| "learning_rate": 6.060484410544627e-07, |
| "loss": 0.0002, |
| "reward": 0.31015807017683983, |
| "reward_std": 0.6586577072739601, |
| "rewards/reward_func": 0.31015807017683983, |
| "step": 2944 |
| }, |
| { |
| "completion_length": 155.671875, |
| "epoch": 0.39502207948615015, |
| "grad_norm": 3.609375, |
| "kl": 0.004368482739664614, |
| "learning_rate": 6.049779205138499e-07, |
| "loss": 0.0002, |
| "reward": 0.5017051734030247, |
| "reward_std": 0.42575474083423615, |
| "rewards/reward_func": 0.5017051734030247, |
| "step": 2952 |
| }, |
| { |
| "completion_length": 129.375, |
| "epoch": 0.396092600026763, |
| "grad_norm": 4.6875, |
| "kl": 0.004999362543458119, |
| "learning_rate": 6.039073999732369e-07, |
| "loss": 0.0002, |
| "reward": 0.5034809075295925, |
| "reward_std": 0.5035946983844042, |
| "rewards/reward_func": 0.5034809075295925, |
| "step": 2960 |
| }, |
| { |
| "completion_length": 171.203125, |
| "epoch": 0.3971631205673759, |
| "grad_norm": 3.09375, |
| "kl": 0.003695404506288469, |
| "learning_rate": 6.028368794326241e-07, |
| "loss": 0.0001, |
| "reward": 0.3904507216066122, |
| "reward_std": 0.5394172128289938, |
| "rewards/reward_func": 0.3904507216066122, |
| "step": 2968 |
| }, |
| { |
| "completion_length": 179.3515625, |
| "epoch": 0.39823364110798876, |
| "grad_norm": 2.796875, |
| "kl": 0.004422289348440245, |
| "learning_rate": 6.017663588920112e-07, |
| "loss": 0.0002, |
| "reward": 0.3067244812846184, |
| "reward_std": 0.48167256638407707, |
| "rewards/reward_func": 0.3067244812846184, |
| "step": 2976 |
| }, |
| { |
| "completion_length": 241.015625, |
| "epoch": 0.3993041616486016, |
| "grad_norm": 3.40625, |
| "kl": 0.0026670149818528444, |
| "learning_rate": 6.006958383513984e-07, |
| "loss": 0.0001, |
| "reward": -0.14665643870830536, |
| "reward_std": 0.43440048210322857, |
| "rewards/reward_func": -0.14665643870830536, |
| "step": 2984 |
| }, |
| { |
| "completion_length": 182.59375, |
| "epoch": 0.4003746821892145, |
| "grad_norm": 4.78125, |
| "kl": 0.004074128781212494, |
| "learning_rate": 5.996253178107855e-07, |
| "loss": 0.0002, |
| "reward": 0.2561218962073326, |
| "reward_std": 0.5822538835927844, |
| "rewards/reward_func": 0.2561218962073326, |
| "step": 2992 |
| }, |
| { |
| "completion_length": 193.3515625, |
| "epoch": 0.40144520272982737, |
| "grad_norm": 4.1875, |
| "kl": 0.00386466141208075, |
| "learning_rate": 5.985547972701726e-07, |
| "loss": 0.0002, |
| "reward": 0.21375904511660337, |
| "reward_std": 0.39464170206338167, |
| "rewards/reward_func": 0.21375904511660337, |
| "step": 3000 |
| }, |
| { |
| "completion_length": 159.328125, |
| "epoch": 0.4025157232704403, |
| "grad_norm": 3.734375, |
| "kl": 0.00333388164290227, |
| "learning_rate": 5.974842767295597e-07, |
| "loss": 0.0001, |
| "reward": 0.5523056299425662, |
| "reward_std": 0.47927504777908325, |
| "rewards/reward_func": 0.5523056299425662, |
| "step": 3008 |
| }, |
| { |
| "completion_length": 164.9921875, |
| "epoch": 0.4035862438110531, |
| "grad_norm": 3.625, |
| "kl": 0.004161383403697982, |
| "learning_rate": 5.964137561889468e-07, |
| "loss": 0.0002, |
| "reward": 0.17364376038312912, |
| "reward_std": 0.5346489679068327, |
| "rewards/reward_func": 0.17364376038312912, |
| "step": 3016 |
| }, |
| { |
| "completion_length": 159.7578125, |
| "epoch": 0.404656764351666, |
| "grad_norm": 4.0625, |
| "kl": 0.003841915662633255, |
| "learning_rate": 5.95343235648334e-07, |
| "loss": 0.0002, |
| "reward": 0.4289398565888405, |
| "reward_std": 0.47436373494565487, |
| "rewards/reward_func": 0.4289398565888405, |
| "step": 3024 |
| }, |
| { |
| "completion_length": 182.9609375, |
| "epoch": 0.4057272848922789, |
| "grad_norm": 2.453125, |
| "kl": 0.004264735238393769, |
| "learning_rate": 5.942727151077212e-07, |
| "loss": 0.0002, |
| "reward": 0.021798385307192802, |
| "reward_std": 0.5079176230356097, |
| "rewards/reward_func": 0.021798385307192802, |
| "step": 3032 |
| }, |
| { |
| "completion_length": 160.546875, |
| "epoch": 0.40679780543289173, |
| "grad_norm": 3.0, |
| "kl": 0.005132144928211346, |
| "learning_rate": 5.932021945671082e-07, |
| "loss": 0.0002, |
| "reward": 0.5438925623893738, |
| "reward_std": 0.42197058349847794, |
| "rewards/reward_func": 0.5438925623893738, |
| "step": 3040 |
| }, |
| { |
| "completion_length": 156.0859375, |
| "epoch": 0.40786832597350464, |
| "grad_norm": 4.1875, |
| "kl": 0.003920425719115883, |
| "learning_rate": 5.921316740264953e-07, |
| "loss": 0.0002, |
| "reward": 0.3435197048820555, |
| "reward_std": 0.584853507578373, |
| "rewards/reward_func": 0.3435197048820555, |
| "step": 3048 |
| }, |
| { |
| "completion_length": 148.8359375, |
| "epoch": 0.4089388465141175, |
| "grad_norm": 1.90625, |
| "kl": 0.004158479205216281, |
| "learning_rate": 5.910611534858825e-07, |
| "loss": 0.0002, |
| "reward": 0.3984090769663453, |
| "reward_std": 0.4647171348333359, |
| "rewards/reward_func": 0.3984090769663453, |
| "step": 3056 |
| }, |
| { |
| "completion_length": 175.2109375, |
| "epoch": 0.41000936705473034, |
| "grad_norm": 3.984375, |
| "kl": 0.003644221549620852, |
| "learning_rate": 5.899906329452697e-07, |
| "loss": 0.0001, |
| "reward": 0.16703728586435318, |
| "reward_std": 0.5931989103555679, |
| "rewards/reward_func": 0.16703728586435318, |
| "step": 3064 |
| }, |
| { |
| "completion_length": 173.90625, |
| "epoch": 0.41107988759534325, |
| "grad_norm": 3.21875, |
| "kl": 0.004099360230611637, |
| "learning_rate": 5.889201124046567e-07, |
| "loss": 0.0002, |
| "reward": -0.03924668487161398, |
| "reward_std": 0.6728265807032585, |
| "rewards/reward_func": -0.03924668487161398, |
| "step": 3072 |
| }, |
| { |
| "completion_length": 164.953125, |
| "epoch": 0.4121504081359561, |
| "grad_norm": 3.34375, |
| "kl": 0.004925543296849355, |
| "learning_rate": 5.878495918640438e-07, |
| "loss": 0.0002, |
| "reward": 0.3322628792375326, |
| "reward_std": 0.5970859546214342, |
| "rewards/reward_func": 0.3322628792375326, |
| "step": 3080 |
| }, |
| { |
| "completion_length": 162.0234375, |
| "epoch": 0.41322092867656895, |
| "grad_norm": 4.0625, |
| "kl": 0.004399422614369541, |
| "learning_rate": 5.86779071323431e-07, |
| "loss": 0.0002, |
| "reward": 0.36330926418304443, |
| "reward_std": 0.45976690761744976, |
| "rewards/reward_func": 0.36330926418304443, |
| "step": 3088 |
| }, |
| { |
| "completion_length": 138.796875, |
| "epoch": 0.41429144921718186, |
| "grad_norm": 3.140625, |
| "kl": 0.004546679730992764, |
| "learning_rate": 5.857085507828181e-07, |
| "loss": 0.0002, |
| "reward": 0.4072983153164387, |
| "reward_std": 0.600585313513875, |
| "rewards/reward_func": 0.4072983153164387, |
| "step": 3096 |
| }, |
| { |
| "completion_length": 214.953125, |
| "epoch": 0.4153619697577947, |
| "grad_norm": 2.9375, |
| "kl": 0.00350517057813704, |
| "learning_rate": 5.846380302422052e-07, |
| "loss": 0.0001, |
| "reward": 0.2474349234253168, |
| "reward_std": 0.498451117426157, |
| "rewards/reward_func": 0.2474349234253168, |
| "step": 3104 |
| }, |
| { |
| "completion_length": 139.453125, |
| "epoch": 0.4164324902984076, |
| "grad_norm": 4.1875, |
| "kl": 0.00461685229674913, |
| "learning_rate": 5.835675097015924e-07, |
| "loss": 0.0002, |
| "reward": 0.471061285585165, |
| "reward_std": 0.45820480585098267, |
| "rewards/reward_func": 0.471061285585165, |
| "step": 3112 |
| }, |
| { |
| "completion_length": 144.4453125, |
| "epoch": 0.41750301083902047, |
| "grad_norm": 4.6875, |
| "kl": 0.004540506488410756, |
| "learning_rate": 5.824969891609795e-07, |
| "loss": 0.0002, |
| "reward": 0.4106574021279812, |
| "reward_std": 0.47535229101777077, |
| "rewards/reward_func": 0.4106574021279812, |
| "step": 3120 |
| }, |
| { |
| "completion_length": 161.71875, |
| "epoch": 0.4185735313796333, |
| "grad_norm": 4.78125, |
| "kl": 0.004575909668346867, |
| "learning_rate": 5.814264686203665e-07, |
| "loss": 0.0002, |
| "reward": -0.04112925007939339, |
| "reward_std": 0.4119059517979622, |
| "rewards/reward_func": -0.04112925007939339, |
| "step": 3128 |
| }, |
| { |
| "completion_length": 196.5703125, |
| "epoch": 0.41964405192024623, |
| "grad_norm": 4.90625, |
| "kl": 0.003955840336857364, |
| "learning_rate": 5.803559480797537e-07, |
| "loss": 0.0002, |
| "reward": 0.024920357391238213, |
| "reward_std": 0.5616731429472566, |
| "rewards/reward_func": 0.024920357391238213, |
| "step": 3136 |
| }, |
| { |
| "completion_length": 204.4296875, |
| "epoch": 0.4207145724608591, |
| "grad_norm": 4.96875, |
| "kl": 0.0033879343245644122, |
| "learning_rate": 5.792854275391409e-07, |
| "loss": 0.0001, |
| "reward": 0.2112936358898878, |
| "reward_std": 0.5542439222335815, |
| "rewards/reward_func": 0.2112936358898878, |
| "step": 3144 |
| }, |
| { |
| "completion_length": 188.40625, |
| "epoch": 0.421785093001472, |
| "grad_norm": 4.40625, |
| "kl": 0.003975967440055683, |
| "learning_rate": 5.782149069985281e-07, |
| "loss": 0.0002, |
| "reward": -0.049715520814061165, |
| "reward_std": 0.6521423272788525, |
| "rewards/reward_func": -0.049715520814061165, |
| "step": 3152 |
| }, |
| { |
| "completion_length": 163.4375, |
| "epoch": 0.42285561354208484, |
| "grad_norm": 2.65625, |
| "kl": 0.004166945233009756, |
| "learning_rate": 5.771443864579151e-07, |
| "loss": 0.0002, |
| "reward": 0.41062634997069836, |
| "reward_std": 0.4943850450217724, |
| "rewards/reward_func": 0.41062634997069836, |
| "step": 3160 |
| }, |
| { |
| "completion_length": 130.859375, |
| "epoch": 0.4239261340826977, |
| "grad_norm": 3.859375, |
| "kl": 0.005400074122007936, |
| "learning_rate": 5.760738659173022e-07, |
| "loss": 0.0002, |
| "reward": 0.43160221725702286, |
| "reward_std": 0.5389326587319374, |
| "rewards/reward_func": 0.43160221725702286, |
| "step": 3168 |
| }, |
| { |
| "completion_length": 172.703125, |
| "epoch": 0.4249966546233106, |
| "grad_norm": 4.53125, |
| "kl": 0.005152460333192721, |
| "learning_rate": 5.750033453766894e-07, |
| "loss": 0.0002, |
| "reward": 0.06674006022512913, |
| "reward_std": 0.5032580755650997, |
| "rewards/reward_func": 0.06674006022512913, |
| "step": 3176 |
| }, |
| { |
| "completion_length": 154.6875, |
| "epoch": 0.42606717516392345, |
| "grad_norm": 4.125, |
| "kl": 0.004849692864809185, |
| "learning_rate": 5.739328248360766e-07, |
| "loss": 0.0002, |
| "reward": 0.33508316054940224, |
| "reward_std": 0.5568934958428144, |
| "rewards/reward_func": 0.33508316054940224, |
| "step": 3184 |
| }, |
| { |
| "completion_length": 149.921875, |
| "epoch": 0.42713769570453636, |
| "grad_norm": 3.5, |
| "kl": 0.004149941669311374, |
| "learning_rate": 5.728623042954636e-07, |
| "loss": 0.0002, |
| "reward": 0.560466131195426, |
| "reward_std": 0.4996361844241619, |
| "rewards/reward_func": 0.560466131195426, |
| "step": 3192 |
| }, |
| { |
| "completion_length": 162.3515625, |
| "epoch": 0.4282082162451492, |
| "grad_norm": 2.375, |
| "kl": 0.00443269161041826, |
| "learning_rate": 5.717917837548508e-07, |
| "loss": 0.0002, |
| "reward": 0.4073672443628311, |
| "reward_std": 0.4750672820955515, |
| "rewards/reward_func": 0.4073672443628311, |
| "step": 3200 |
| }, |
| { |
| "completion_length": 173.3203125, |
| "epoch": 0.42927873678576206, |
| "grad_norm": 4.125, |
| "kl": 0.0039921577263157815, |
| "learning_rate": 5.707212632142379e-07, |
| "loss": 0.0002, |
| "reward": -0.03342257114127278, |
| "reward_std": 0.6672232635319233, |
| "rewards/reward_func": -0.03342257114127278, |
| "step": 3208 |
| }, |
| { |
| "completion_length": 155.9375, |
| "epoch": 0.43034925732637497, |
| "grad_norm": 5.78125, |
| "kl": 0.004704885970568284, |
| "learning_rate": 5.69650742673625e-07, |
| "loss": 0.0002, |
| "reward": 0.3005738127976656, |
| "reward_std": 0.5849708952009678, |
| "rewards/reward_func": 0.3005738127976656, |
| "step": 3216 |
| }, |
| { |
| "completion_length": 185.84375, |
| "epoch": 0.4314197778669878, |
| "grad_norm": 2.890625, |
| "kl": 0.0036070215137442574, |
| "learning_rate": 5.685802221330121e-07, |
| "loss": 0.0001, |
| "reward": -0.018972497433423996, |
| "reward_std": 0.5354725271463394, |
| "rewards/reward_func": -0.018972497433423996, |
| "step": 3224 |
| }, |
| { |
| "completion_length": 178.546875, |
| "epoch": 0.43249029840760067, |
| "grad_norm": 2.625, |
| "kl": 0.004298602405469865, |
| "learning_rate": 5.675097015923993e-07, |
| "loss": 0.0002, |
| "reward": 0.3231694786809385, |
| "reward_std": 0.4985707551240921, |
| "rewards/reward_func": 0.3231694786809385, |
| "step": 3232 |
| }, |
| { |
| "completion_length": 162.3203125, |
| "epoch": 0.4335608189482136, |
| "grad_norm": 3.59375, |
| "kl": 0.0039607091166544706, |
| "learning_rate": 5.664391810517865e-07, |
| "loss": 0.0002, |
| "reward": 0.11156550701707602, |
| "reward_std": 0.7430830076336861, |
| "rewards/reward_func": 0.11156550701707602, |
| "step": 3240 |
| }, |
| { |
| "completion_length": 150.40625, |
| "epoch": 0.43463133948882643, |
| "grad_norm": 3.3125, |
| "kl": 0.0049680424854159355, |
| "learning_rate": 5.653686605111735e-07, |
| "loss": 0.0002, |
| "reward": 0.36818648502230644, |
| "reward_std": 0.4827171713113785, |
| "rewards/reward_func": 0.36818648502230644, |
| "step": 3248 |
| }, |
| { |
| "completion_length": 148.8046875, |
| "epoch": 0.43570186002943934, |
| "grad_norm": 5.40625, |
| "kl": 0.004517415567534044, |
| "learning_rate": 5.642981399705606e-07, |
| "loss": 0.0002, |
| "reward": 0.5136874578893185, |
| "reward_std": 0.4101978652179241, |
| "rewards/reward_func": 0.5136874578893185, |
| "step": 3256 |
| }, |
| { |
| "completion_length": 159.90625, |
| "epoch": 0.4367723805700522, |
| "grad_norm": 4.5, |
| "kl": 0.005192397540668026, |
| "learning_rate": 5.632276194299478e-07, |
| "loss": 0.0002, |
| "reward": 0.36011555418372154, |
| "reward_std": 0.590018224902451, |
| "rewards/reward_func": 0.36011555418372154, |
| "step": 3264 |
| }, |
| { |
| "completion_length": 165.8359375, |
| "epoch": 0.43784290111066504, |
| "grad_norm": 5.78125, |
| "kl": 0.004310069081839174, |
| "learning_rate": 5.621570988893349e-07, |
| "loss": 0.0002, |
| "reward": 0.44862041622400284, |
| "reward_std": 0.5028228275477886, |
| "rewards/reward_func": 0.44862041622400284, |
| "step": 3272 |
| }, |
| { |
| "completion_length": 163.8203125, |
| "epoch": 0.43891342165127795, |
| "grad_norm": 3.609375, |
| "kl": 0.004184526915196329, |
| "learning_rate": 5.610865783487221e-07, |
| "loss": 0.0002, |
| "reward": 0.3917035781778395, |
| "reward_std": 0.5541238645091653, |
| "rewards/reward_func": 0.3917035781778395, |
| "step": 3280 |
| }, |
| { |
| "completion_length": 186.40625, |
| "epoch": 0.4399839421918908, |
| "grad_norm": 4.125, |
| "kl": 0.003184476459864527, |
| "learning_rate": 5.600160578081091e-07, |
| "loss": 0.0001, |
| "reward": 0.12950839288532734, |
| "reward_std": 0.5569676849991083, |
| "rewards/reward_func": 0.12950839288532734, |
| "step": 3288 |
| }, |
| { |
| "completion_length": 139.9765625, |
| "epoch": 0.4410544627325037, |
| "grad_norm": 3.9375, |
| "kl": 0.004262359958374873, |
| "learning_rate": 5.589455372674963e-07, |
| "loss": 0.0002, |
| "reward": 0.28253707475960255, |
| "reward_std": 0.44188484735786915, |
| "rewards/reward_func": 0.28253707475960255, |
| "step": 3296 |
| }, |
| { |
| "completion_length": 174.6171875, |
| "epoch": 0.44212498327311656, |
| "grad_norm": 4.59375, |
| "kl": 0.004584858979796991, |
| "learning_rate": 5.578750167268834e-07, |
| "loss": 0.0002, |
| "reward": 0.10256939753890038, |
| "reward_std": 0.47010411880910397, |
| "rewards/reward_func": 0.10256939753890038, |
| "step": 3304 |
| }, |
| { |
| "completion_length": 151.453125, |
| "epoch": 0.4431955038137294, |
| "grad_norm": 3.3125, |
| "kl": 0.004451150889508426, |
| "learning_rate": 5.568044961862706e-07, |
| "loss": 0.0002, |
| "reward": 0.48164689540863037, |
| "reward_std": 0.4186716293916106, |
| "rewards/reward_func": 0.48164689540863037, |
| "step": 3312 |
| }, |
| { |
| "completion_length": 187.1796875, |
| "epoch": 0.4442660243543423, |
| "grad_norm": 5.3125, |
| "kl": 0.003924098331481218, |
| "learning_rate": 5.557339756456577e-07, |
| "loss": 0.0002, |
| "reward": 0.13027670048177242, |
| "reward_std": 0.5701944110915065, |
| "rewards/reward_func": 0.13027670048177242, |
| "step": 3320 |
| }, |
| { |
| "completion_length": 175.734375, |
| "epoch": 0.44533654489495517, |
| "grad_norm": 3.890625, |
| "kl": 0.004432059795362875, |
| "learning_rate": 5.546634551050447e-07, |
| "loss": 0.0002, |
| "reward": 0.15545489452779293, |
| "reward_std": 0.42713499814271927, |
| "rewards/reward_func": 0.15545489452779293, |
| "step": 3328 |
| }, |
| { |
| "completion_length": 159.7109375, |
| "epoch": 0.446407065435568, |
| "grad_norm": 3.953125, |
| "kl": 0.0041090622544288635, |
| "learning_rate": 5.535929345644319e-07, |
| "loss": 0.0002, |
| "reward": 0.5048990547657013, |
| "reward_std": 0.3645612169057131, |
| "rewards/reward_func": 0.5048990547657013, |
| "step": 3336 |
| }, |
| { |
| "completion_length": 170.46875, |
| "epoch": 0.4474775859761809, |
| "grad_norm": 3.109375, |
| "kl": 0.00416830470203422, |
| "learning_rate": 5.525224140238191e-07, |
| "loss": 0.0002, |
| "reward": 0.0841824202798307, |
| "reward_std": 0.5541789922863245, |
| "rewards/reward_func": 0.0841824202798307, |
| "step": 3344 |
| }, |
| { |
| "completion_length": 177.8984375, |
| "epoch": 0.4485481065167938, |
| "grad_norm": 2.90625, |
| "kl": 0.004115153366001323, |
| "learning_rate": 5.514518934832062e-07, |
| "loss": 0.0002, |
| "reward": 0.25281552597880363, |
| "reward_std": 0.5773179177194834, |
| "rewards/reward_func": 0.25281552597880363, |
| "step": 3352 |
| }, |
| { |
| "completion_length": 167.546875, |
| "epoch": 0.4496186270574067, |
| "grad_norm": 3.046875, |
| "kl": 0.004800075956154615, |
| "learning_rate": 5.503813729425933e-07, |
| "loss": 0.0002, |
| "reward": 0.15400892263278365, |
| "reward_std": 0.613510686904192, |
| "rewards/reward_func": 0.15400892263278365, |
| "step": 3360 |
| }, |
| { |
| "completion_length": 173.875, |
| "epoch": 0.45068914759801953, |
| "grad_norm": 5.125, |
| "kl": 0.004234513093251735, |
| "learning_rate": 5.493108524019804e-07, |
| "loss": 0.0002, |
| "reward": 0.14482227806001902, |
| "reward_std": 0.6577083393931389, |
| "rewards/reward_func": 0.14482227806001902, |
| "step": 3368 |
| }, |
| { |
| "completion_length": 196.0078125, |
| "epoch": 0.4517596681386324, |
| "grad_norm": 3.28125, |
| "kl": 0.003691094840178266, |
| "learning_rate": 5.482403318613676e-07, |
| "loss": 0.0001, |
| "reward": 0.20943116396665573, |
| "reward_std": 0.6141318120062351, |
| "rewards/reward_func": 0.20943116396665573, |
| "step": 3376 |
| }, |
| { |
| "completion_length": 189.9375, |
| "epoch": 0.4528301886792453, |
| "grad_norm": 3.625, |
| "kl": 0.004188001621514559, |
| "learning_rate": 5.471698113207546e-07, |
| "loss": 0.0002, |
| "reward": 0.12066240888088942, |
| "reward_std": 0.6333566196262836, |
| "rewards/reward_func": 0.12066240888088942, |
| "step": 3384 |
| }, |
| { |
| "completion_length": 224.203125, |
| "epoch": 0.45390070921985815, |
| "grad_norm": 3.46875, |
| "kl": 0.0038036782352719456, |
| "learning_rate": 5.460992907801418e-07, |
| "loss": 0.0002, |
| "reward": 0.026676064357161522, |
| "reward_std": 0.5244584791362286, |
| "rewards/reward_func": 0.026676064357161522, |
| "step": 3392 |
| }, |
| { |
| "completion_length": 135.8125, |
| "epoch": 0.45497122976047105, |
| "grad_norm": 3.78125, |
| "kl": 0.005447168223327026, |
| "learning_rate": 5.45028770239529e-07, |
| "loss": 0.0002, |
| "reward": 0.3792672948911786, |
| "reward_std": 0.5337657146155834, |
| "rewards/reward_func": 0.3792672948911786, |
| "step": 3400 |
| }, |
| { |
| "completion_length": 180.4765625, |
| "epoch": 0.4560417503010839, |
| "grad_norm": 3.84375, |
| "kl": 0.0039006134611554444, |
| "learning_rate": 5.439582496989162e-07, |
| "loss": 0.0002, |
| "reward": 0.2967074029147625, |
| "reward_std": 0.5028974749147892, |
| "rewards/reward_func": 0.2967074029147625, |
| "step": 3408 |
| }, |
| { |
| "completion_length": 173.46875, |
| "epoch": 0.45711227084169676, |
| "grad_norm": 3.59375, |
| "kl": 0.004641034756787121, |
| "learning_rate": 5.428877291583031e-07, |
| "loss": 0.0002, |
| "reward": 0.04013548418879509, |
| "reward_std": 0.647808875888586, |
| "rewards/reward_func": 0.04013548418879509, |
| "step": 3416 |
| }, |
| { |
| "completion_length": 182.6953125, |
| "epoch": 0.45818279138230966, |
| "grad_norm": 3.265625, |
| "kl": 0.003926090226741508, |
| "learning_rate": 5.418172086176903e-07, |
| "loss": 0.0002, |
| "reward": -0.02354210428893566, |
| "reward_std": 0.46280941739678383, |
| "rewards/reward_func": -0.02354210428893566, |
| "step": 3424 |
| }, |
| { |
| "completion_length": 171.9140625, |
| "epoch": 0.4592533119229225, |
| "grad_norm": 4.3125, |
| "kl": 0.004544450406683609, |
| "learning_rate": 5.407466880770775e-07, |
| "loss": 0.0002, |
| "reward": 0.2378298337571323, |
| "reward_std": 0.5396788232028484, |
| "rewards/reward_func": 0.2378298337571323, |
| "step": 3432 |
| }, |
| { |
| "completion_length": 168.5625, |
| "epoch": 0.4603238324635354, |
| "grad_norm": 4.125, |
| "kl": 0.003944898169720545, |
| "learning_rate": 5.396761675364647e-07, |
| "loss": 0.0002, |
| "reward": 0.3150383196771145, |
| "reward_std": 0.53007797524333, |
| "rewards/reward_func": 0.3150383196771145, |
| "step": 3440 |
| }, |
| { |
| "completion_length": 153.21875, |
| "epoch": 0.46139435300414827, |
| "grad_norm": 3.515625, |
| "kl": 0.004431029927218333, |
| "learning_rate": 5.386056469958517e-07, |
| "loss": 0.0002, |
| "reward": 0.10777561087161303, |
| "reward_std": 0.5590555854141712, |
| "rewards/reward_func": 0.10777561087161303, |
| "step": 3448 |
| }, |
| { |
| "completion_length": 156.1015625, |
| "epoch": 0.4624648735447611, |
| "grad_norm": 5.59375, |
| "kl": 0.004066320398123935, |
| "learning_rate": 5.375351264552388e-07, |
| "loss": 0.0002, |
| "reward": 0.49741687439382076, |
| "reward_std": 0.3438666444271803, |
| "rewards/reward_func": 0.49741687439382076, |
| "step": 3456 |
| }, |
| { |
| "completion_length": 180.890625, |
| "epoch": 0.46353539408537403, |
| "grad_norm": 3.5, |
| "kl": 0.004092392831807956, |
| "learning_rate": 5.36464605914626e-07, |
| "loss": 0.0002, |
| "reward": 0.261587081477046, |
| "reward_std": 0.4724911078810692, |
| "rewards/reward_func": 0.261587081477046, |
| "step": 3464 |
| }, |
| { |
| "completion_length": 188.34375, |
| "epoch": 0.4646059146259869, |
| "grad_norm": 4.5625, |
| "kl": 0.004102788210730068, |
| "learning_rate": 5.353940853740131e-07, |
| "loss": 0.0002, |
| "reward": 0.3170028403401375, |
| "reward_std": 0.5411418545991182, |
| "rewards/reward_func": 0.3170028403401375, |
| "step": 3472 |
| }, |
| { |
| "completion_length": 147.0078125, |
| "epoch": 0.46567643516659973, |
| "grad_norm": 3.90625, |
| "kl": 0.004658064717659727, |
| "learning_rate": 5.343235648334002e-07, |
| "loss": 0.0002, |
| "reward": 0.42856106348335743, |
| "reward_std": 0.45429209433496, |
| "rewards/reward_func": 0.42856106348335743, |
| "step": 3480 |
| }, |
| { |
| "completion_length": 185.578125, |
| "epoch": 0.46674695570721264, |
| "grad_norm": 3.515625, |
| "kl": 0.004181814001640305, |
| "learning_rate": 5.332530442927874e-07, |
| "loss": 0.0002, |
| "reward": 0.1980421096086502, |
| "reward_std": 0.46522266045212746, |
| "rewards/reward_func": 0.1980421096086502, |
| "step": 3488 |
| }, |
| { |
| "completion_length": 147.6015625, |
| "epoch": 0.4678174762478255, |
| "grad_norm": 2.875, |
| "kl": 0.005186378140933812, |
| "learning_rate": 5.321825237521745e-07, |
| "loss": 0.0002, |
| "reward": 0.33479253202676773, |
| "reward_std": 0.3981231078505516, |
| "rewards/reward_func": 0.33479253202676773, |
| "step": 3496 |
| }, |
| { |
| "completion_length": 196.953125, |
| "epoch": 0.4688879967884384, |
| "grad_norm": 2.109375, |
| "kl": 0.003901872376445681, |
| "learning_rate": 5.311120032115616e-07, |
| "loss": 0.0002, |
| "reward": -0.1805968815460801, |
| "reward_std": 0.5539918430149555, |
| "rewards/reward_func": -0.1805968815460801, |
| "step": 3504 |
| }, |
| { |
| "completion_length": 172.09375, |
| "epoch": 0.46995851732905125, |
| "grad_norm": 4.375, |
| "kl": 0.004344145359937102, |
| "learning_rate": 5.300414826709487e-07, |
| "loss": 0.0002, |
| "reward": 0.24764186749234796, |
| "reward_std": 0.5220493152737617, |
| "rewards/reward_func": 0.24764186749234796, |
| "step": 3512 |
| }, |
| { |
| "completion_length": 165.484375, |
| "epoch": 0.4710290378696641, |
| "grad_norm": 3.03125, |
| "kl": 0.004431087261764333, |
| "learning_rate": 5.289709621303359e-07, |
| "loss": 0.0002, |
| "reward": 0.207328287884593, |
| "reward_std": 0.621040590107441, |
| "rewards/reward_func": 0.207328287884593, |
| "step": 3520 |
| }, |
| { |
| "completion_length": 187.7734375, |
| "epoch": 0.472099558410277, |
| "grad_norm": 3.5, |
| "kl": 0.004218856105580926, |
| "learning_rate": 5.27900441589723e-07, |
| "loss": 0.0002, |
| "reward": 0.07554451934993267, |
| "reward_std": 0.6108374260365963, |
| "rewards/reward_func": 0.07554451934993267, |
| "step": 3528 |
| }, |
| { |
| "completion_length": 168.265625, |
| "epoch": 0.47317007895088986, |
| "grad_norm": 5.0625, |
| "kl": 0.004120910074561834, |
| "learning_rate": 5.2682992104911e-07, |
| "loss": 0.0002, |
| "reward": 0.03414946049451828, |
| "reward_std": 0.6455099135637283, |
| "rewards/reward_func": 0.03414946049451828, |
| "step": 3536 |
| }, |
| { |
| "completion_length": 187.5234375, |
| "epoch": 0.47424059949150277, |
| "grad_norm": 3.640625, |
| "kl": 0.003985106013715267, |
| "learning_rate": 5.257594005084972e-07, |
| "loss": 0.0002, |
| "reward": 0.2925253491848707, |
| "reward_std": 0.6335334703326225, |
| "rewards/reward_func": 0.2925253491848707, |
| "step": 3544 |
| }, |
| { |
| "completion_length": 163.46875, |
| "epoch": 0.4753111200321156, |
| "grad_norm": 4.125, |
| "kl": 0.00473158826935105, |
| "learning_rate": 5.246888799678844e-07, |
| "loss": 0.0002, |
| "reward": 0.3663984229788184, |
| "reward_std": 0.558024113997817, |
| "rewards/reward_func": 0.3663984229788184, |
| "step": 3552 |
| }, |
| { |
| "completion_length": 152.03125, |
| "epoch": 0.47638164057272847, |
| "grad_norm": 4.1875, |
| "kl": 0.004815980733837932, |
| "learning_rate": 5.236183594272715e-07, |
| "loss": 0.0002, |
| "reward": 0.13753212243318558, |
| "reward_std": 0.5678570009768009, |
| "rewards/reward_func": 0.13753212243318558, |
| "step": 3560 |
| }, |
| { |
| "completion_length": 178.7421875, |
| "epoch": 0.4774521611133414, |
| "grad_norm": 3.625, |
| "kl": 0.004101649799849838, |
| "learning_rate": 5.225478388866587e-07, |
| "loss": 0.0002, |
| "reward": 0.2735243234783411, |
| "reward_std": 0.6148385126143694, |
| "rewards/reward_func": 0.2735243234783411, |
| "step": 3568 |
| }, |
| { |
| "completion_length": 192.1484375, |
| "epoch": 0.47852268165395423, |
| "grad_norm": 4.84375, |
| "kl": 0.004463646182557568, |
| "learning_rate": 5.214773183460457e-07, |
| "loss": 0.0002, |
| "reward": 0.009237892925739288, |
| "reward_std": 0.4207034735009074, |
| "rewards/reward_func": 0.009237892925739288, |
| "step": 3576 |
| }, |
| { |
| "completion_length": 174.046875, |
| "epoch": 0.4795932021945671, |
| "grad_norm": 4.125, |
| "kl": 0.0036158739821985364, |
| "learning_rate": 5.204067978054328e-07, |
| "loss": 0.0001, |
| "reward": 0.26391329150646925, |
| "reward_std": 0.586926780641079, |
| "rewards/reward_func": 0.26391329150646925, |
| "step": 3584 |
| }, |
| { |
| "completion_length": 175.4375, |
| "epoch": 0.48066372273518, |
| "grad_norm": 3.578125, |
| "kl": 0.0043216931517235935, |
| "learning_rate": 5.1933627726482e-07, |
| "loss": 0.0002, |
| "reward": 0.28290559723973274, |
| "reward_std": 0.6564907301217318, |
| "rewards/reward_func": 0.28290559723973274, |
| "step": 3592 |
| }, |
| { |
| "completion_length": 178.8671875, |
| "epoch": 0.48173424327579284, |
| "grad_norm": 2.671875, |
| "kl": 0.004414036084199324, |
| "learning_rate": 5.182657567242071e-07, |
| "loss": 0.0002, |
| "reward": 0.3407918275333941, |
| "reward_std": 0.5300383027642965, |
| "rewards/reward_func": 0.3407918275333941, |
| "step": 3600 |
| }, |
| { |
| "completion_length": 168.0078125, |
| "epoch": 0.48280476381640575, |
| "grad_norm": 3.828125, |
| "kl": 0.004355661425506696, |
| "learning_rate": 5.171952361835943e-07, |
| "loss": 0.0002, |
| "reward": 0.15907337237149477, |
| "reward_std": 0.6283294912427664, |
| "rewards/reward_func": 0.15907337237149477, |
| "step": 3608 |
| }, |
| { |
| "completion_length": 169.671875, |
| "epoch": 0.4838752843570186, |
| "grad_norm": 4.40625, |
| "kl": 0.0041847134416457266, |
| "learning_rate": 5.161247156429813e-07, |
| "loss": 0.0002, |
| "reward": 0.394026106223464, |
| "reward_std": 0.5191534291952848, |
| "rewards/reward_func": 0.394026106223464, |
| "step": 3616 |
| }, |
| { |
| "completion_length": 187.9765625, |
| "epoch": 0.48494580489763145, |
| "grad_norm": 4.25, |
| "kl": 0.004289885691832751, |
| "learning_rate": 5.150541951023685e-07, |
| "loss": 0.0002, |
| "reward": 0.2409443873912096, |
| "reward_std": 0.714960128068924, |
| "rewards/reward_func": 0.2409443873912096, |
| "step": 3624 |
| }, |
| { |
| "completion_length": 167.8984375, |
| "epoch": 0.48601632543824436, |
| "grad_norm": 3.515625, |
| "kl": 0.004622265987563878, |
| "learning_rate": 5.139836745617556e-07, |
| "loss": 0.0002, |
| "reward": 0.3250633031129837, |
| "reward_std": 0.3942791158333421, |
| "rewards/reward_func": 0.3250633031129837, |
| "step": 3632 |
| }, |
| { |
| "completion_length": 181.3984375, |
| "epoch": 0.4870868459788572, |
| "grad_norm": 3.125, |
| "kl": 0.00506105026579462, |
| "learning_rate": 5.129131540211427e-07, |
| "loss": 0.0002, |
| "reward": 0.1745797097682953, |
| "reward_std": 0.5199177237227559, |
| "rewards/reward_func": 0.1745797097682953, |
| "step": 3640 |
| }, |
| { |
| "completion_length": 177.828125, |
| "epoch": 0.4881573665194701, |
| "grad_norm": 5.125, |
| "kl": 0.00410176973673515, |
| "learning_rate": 5.118426334805299e-07, |
| "loss": 0.0002, |
| "reward": 0.1287559773772955, |
| "reward_std": 0.5036085527390242, |
| "rewards/reward_func": 0.1287559773772955, |
| "step": 3648 |
| }, |
| { |
| "completion_length": 179.125, |
| "epoch": 0.48922788706008297, |
| "grad_norm": 4.0, |
| "kl": 0.004322856722865254, |
| "learning_rate": 5.107721129399171e-07, |
| "loss": 0.0002, |
| "reward": 0.07798391906544566, |
| "reward_std": 0.6537183858454227, |
| "rewards/reward_func": 0.07798391906544566, |
| "step": 3656 |
| }, |
| { |
| "completion_length": 154.4375, |
| "epoch": 0.4902984076006958, |
| "grad_norm": 3.03125, |
| "kl": 0.003943322895793244, |
| "learning_rate": 5.097015923993041e-07, |
| "loss": 0.0002, |
| "reward": 0.1709643267095089, |
| "reward_std": 0.5507702603936195, |
| "rewards/reward_func": 0.1709643267095089, |
| "step": 3664 |
| }, |
| { |
| "completion_length": 168.6875, |
| "epoch": 0.4913689281413087, |
| "grad_norm": 3.125, |
| "kl": 0.004549846984446049, |
| "learning_rate": 5.086310718586912e-07, |
| "loss": 0.0002, |
| "reward": 0.2190579893067479, |
| "reward_std": 0.5686514582484961, |
| "rewards/reward_func": 0.2190579893067479, |
| "step": 3672 |
| }, |
| { |
| "completion_length": 155.828125, |
| "epoch": 0.4924394486819216, |
| "grad_norm": 3.703125, |
| "kl": 0.00439119475777261, |
| "learning_rate": 5.075605513180784e-07, |
| "loss": 0.0002, |
| "reward": 0.45011513587087393, |
| "reward_std": 0.5558454534038901, |
| "rewards/reward_func": 0.45011513587087393, |
| "step": 3680 |
| }, |
| { |
| "completion_length": 158.1796875, |
| "epoch": 0.4935099692225345, |
| "grad_norm": 5.625, |
| "kl": 0.004681064456235617, |
| "learning_rate": 5.064900307774656e-07, |
| "loss": 0.0002, |
| "reward": 0.22979869320988655, |
| "reward_std": 0.3713596798479557, |
| "rewards/reward_func": 0.22979869320988655, |
| "step": 3688 |
| }, |
| { |
| "completion_length": 160.0234375, |
| "epoch": 0.49458048976314734, |
| "grad_norm": 4.375, |
| "kl": 0.00535585597390309, |
| "learning_rate": 5.054195102368527e-07, |
| "loss": 0.0002, |
| "reward": 0.051803894340991974, |
| "reward_std": 0.6633851379156113, |
| "rewards/reward_func": 0.051803894340991974, |
| "step": 3696 |
| }, |
| { |
| "completion_length": 175.9609375, |
| "epoch": 0.4956510103037602, |
| "grad_norm": 4.125, |
| "kl": 0.004008779738796875, |
| "learning_rate": 5.043489896962397e-07, |
| "loss": 0.0002, |
| "reward": 0.2548919077962637, |
| "reward_std": 0.5479347966611385, |
| "rewards/reward_func": 0.2548919077962637, |
| "step": 3704 |
| }, |
| { |
| "completion_length": 168.3671875, |
| "epoch": 0.4967215308443731, |
| "grad_norm": 3.78125, |
| "kl": 0.004640541330445558, |
| "learning_rate": 5.032784691556269e-07, |
| "loss": 0.0002, |
| "reward": 0.2829543873667717, |
| "reward_std": 0.40924315620213747, |
| "rewards/reward_func": 0.2829543873667717, |
| "step": 3712 |
| }, |
| { |
| "completion_length": 182.4921875, |
| "epoch": 0.49779205138498595, |
| "grad_norm": 3.015625, |
| "kl": 0.003496495133731514, |
| "learning_rate": 5.022079486150141e-07, |
| "loss": 0.0001, |
| "reward": 0.3831252008676529, |
| "reward_std": 0.4445470869541168, |
| "rewards/reward_func": 0.3831252008676529, |
| "step": 3720 |
| }, |
| { |
| "completion_length": 162.03125, |
| "epoch": 0.4988625719255988, |
| "grad_norm": 4.3125, |
| "kl": 0.004948699788656086, |
| "learning_rate": 5.011374280744011e-07, |
| "loss": 0.0002, |
| "reward": 0.32710376754403114, |
| "reward_std": 0.47466727904975414, |
| "rewards/reward_func": 0.32710376754403114, |
| "step": 3728 |
| }, |
| { |
| "completion_length": 176.5859375, |
| "epoch": 0.4999330924662117, |
| "grad_norm": 3.640625, |
| "kl": 0.004679859790485352, |
| "learning_rate": 5.000669075337883e-07, |
| "loss": 0.0002, |
| "reward": 0.13110784254968166, |
| "reward_std": 0.44122389145195484, |
| "rewards/reward_func": 0.13110784254968166, |
| "step": 3736 |
| }, |
| { |
| "completion_length": 167.71875, |
| "epoch": 0.5010036130068246, |
| "grad_norm": 4.78125, |
| "kl": 0.004513267427682877, |
| "learning_rate": 4.989963869931754e-07, |
| "loss": 0.0002, |
| "reward": 0.15786111541092396, |
| "reward_std": 0.606589537113905, |
| "rewards/reward_func": 0.15786111541092396, |
| "step": 3744 |
| }, |
| { |
| "completion_length": 150.421875, |
| "epoch": 0.5020741335474375, |
| "grad_norm": 5.3125, |
| "kl": 0.004389044945128262, |
| "learning_rate": 4.979258664525626e-07, |
| "loss": 0.0002, |
| "reward": 0.4018897293135524, |
| "reward_std": 0.44968966394662857, |
| "rewards/reward_func": 0.4018897293135524, |
| "step": 3752 |
| }, |
| { |
| "completion_length": 157.28125, |
| "epoch": 0.5031446540880503, |
| "grad_norm": 3.109375, |
| "kl": 0.004845765855861828, |
| "learning_rate": 4.968553459119496e-07, |
| "loss": 0.0002, |
| "reward": 0.5019057989120483, |
| "reward_std": 0.43940271995961666, |
| "rewards/reward_func": 0.5019057989120483, |
| "step": 3760 |
| }, |
| { |
| "completion_length": 183.84375, |
| "epoch": 0.5042151746286632, |
| "grad_norm": 2.890625, |
| "kl": 0.004980318364687264, |
| "learning_rate": 4.957848253713368e-07, |
| "loss": 0.0002, |
| "reward": 0.10100116580724716, |
| "reward_std": 0.5983940260484815, |
| "rewards/reward_func": 0.10100116580724716, |
| "step": 3768 |
| }, |
| { |
| "completion_length": 148.984375, |
| "epoch": 0.505285695169276, |
| "grad_norm": 2.859375, |
| "kl": 0.0051506354357115924, |
| "learning_rate": 4.947143048307239e-07, |
| "loss": 0.0002, |
| "reward": 0.2997464369982481, |
| "reward_std": 0.6431192979216576, |
| "rewards/reward_func": 0.2997464369982481, |
| "step": 3776 |
| }, |
| { |
| "completion_length": 148.5703125, |
| "epoch": 0.506356215709889, |
| "grad_norm": 3.890625, |
| "kl": 0.004320590727729723, |
| "learning_rate": 4.93643784290111e-07, |
| "loss": 0.0002, |
| "reward": 0.14957408607006073, |
| "reward_std": 0.5004684673622251, |
| "rewards/reward_func": 0.14957408607006073, |
| "step": 3784 |
| }, |
| { |
| "completion_length": 170.7734375, |
| "epoch": 0.5074267362505018, |
| "grad_norm": 4.96875, |
| "kl": 0.00426993565633893, |
| "learning_rate": 4.925732637494981e-07, |
| "loss": 0.0002, |
| "reward": 0.1513789612799883, |
| "reward_std": 0.6300474852323532, |
| "rewards/reward_func": 0.1513789612799883, |
| "step": 3792 |
| }, |
| { |
| "completion_length": 132.6796875, |
| "epoch": 0.5084972567911147, |
| "grad_norm": 3.65625, |
| "kl": 0.00518818135606125, |
| "learning_rate": 4.915027432088853e-07, |
| "loss": 0.0002, |
| "reward": 0.2980203665792942, |
| "reward_std": 0.39504921436309814, |
| "rewards/reward_func": 0.2980203665792942, |
| "step": 3800 |
| }, |
| { |
| "completion_length": 143.1875, |
| "epoch": 0.5095677773317275, |
| "grad_norm": 4.5625, |
| "kl": 0.004469432285986841, |
| "learning_rate": 4.904322226682725e-07, |
| "loss": 0.0002, |
| "reward": 0.4323331117630005, |
| "reward_std": 0.5411158930510283, |
| "rewards/reward_func": 0.4323331117630005, |
| "step": 3808 |
| }, |
| { |
| "completion_length": 204.7578125, |
| "epoch": 0.5106382978723404, |
| "grad_norm": 4.71875, |
| "kl": 0.003944508789572865, |
| "learning_rate": 4.893617021276595e-07, |
| "loss": 0.0002, |
| "reward": 0.06451552081853151, |
| "reward_std": 0.6014019660651684, |
| "rewards/reward_func": 0.06451552081853151, |
| "step": 3816 |
| }, |
| { |
| "completion_length": 171.0625, |
| "epoch": 0.5117088184129533, |
| "grad_norm": 6.53125, |
| "kl": 0.0044076822232455015, |
| "learning_rate": 4.882911815870467e-07, |
| "loss": 0.0002, |
| "reward": 0.26693916134536266, |
| "reward_std": 0.5402739644050598, |
| "rewards/reward_func": 0.26693916134536266, |
| "step": 3824 |
| }, |
| { |
| "completion_length": 160.0703125, |
| "epoch": 0.5127793389535662, |
| "grad_norm": 3.734375, |
| "kl": 0.004957833531079814, |
| "learning_rate": 4.872206610464339e-07, |
| "loss": 0.0002, |
| "reward": 0.2441606866195798, |
| "reward_std": 0.6625313609838486, |
| "rewards/reward_func": 0.2441606866195798, |
| "step": 3832 |
| }, |
| { |
| "completion_length": 155.8515625, |
| "epoch": 0.513849859494179, |
| "grad_norm": 3.640625, |
| "kl": 0.004840250330744311, |
| "learning_rate": 4.861501405058209e-07, |
| "loss": 0.0002, |
| "reward": 0.3202288933098316, |
| "reward_std": 0.6590756271034479, |
| "rewards/reward_func": 0.3202288933098316, |
| "step": 3840 |
| }, |
| { |
| "completion_length": 170.21875, |
| "epoch": 0.5149203800347919, |
| "grad_norm": 4.5625, |
| "kl": 0.005241601204033941, |
| "learning_rate": 4.850796199652081e-07, |
| "loss": 0.0002, |
| "reward": 0.11097644921392202, |
| "reward_std": 0.6563504040241241, |
| "rewards/reward_func": 0.11097644921392202, |
| "step": 3848 |
| }, |
| { |
| "completion_length": 172.3359375, |
| "epoch": 0.5159909005754048, |
| "grad_norm": 4.71875, |
| "kl": 0.0044063644600100815, |
| "learning_rate": 4.840090994245952e-07, |
| "loss": 0.0002, |
| "reward": 0.26450240099802613, |
| "reward_std": 0.6473797373473644, |
| "rewards/reward_func": 0.26450240099802613, |
| "step": 3856 |
| }, |
| { |
| "completion_length": 188.34375, |
| "epoch": 0.5170614211160177, |
| "grad_norm": 3.703125, |
| "kl": 0.004124164639506489, |
| "learning_rate": 4.829385788839824e-07, |
| "loss": 0.0002, |
| "reward": 0.09523116052150726, |
| "reward_std": 0.5340174566954374, |
| "rewards/reward_func": 0.09523116052150726, |
| "step": 3864 |
| }, |
| { |
| "completion_length": 157.3046875, |
| "epoch": 0.5181319416566306, |
| "grad_norm": 4.5, |
| "kl": 0.004781241004820913, |
| "learning_rate": 4.818680583433694e-07, |
| "loss": 0.0002, |
| "reward": 0.3139430582523346, |
| "reward_std": 0.5873579885810614, |
| "rewards/reward_func": 0.3139430582523346, |
| "step": 3872 |
| }, |
| { |
| "completion_length": 153.1015625, |
| "epoch": 0.5192024621972434, |
| "grad_norm": 4.28125, |
| "kl": 0.0045044064754620194, |
| "learning_rate": 4.807975378027566e-07, |
| "loss": 0.0002, |
| "reward": 0.24596689827740192, |
| "reward_std": 0.5791397895663977, |
| "rewards/reward_func": 0.24596689827740192, |
| "step": 3880 |
| }, |
| { |
| "completion_length": 166.8671875, |
| "epoch": 0.5202729827378563, |
| "grad_norm": 4.8125, |
| "kl": 0.004427089152159169, |
| "learning_rate": 4.797270172621437e-07, |
| "loss": 0.0002, |
| "reward": 0.3911690888926387, |
| "reward_std": 0.5238520000129938, |
| "rewards/reward_func": 0.3911690888926387, |
| "step": 3888 |
| }, |
| { |
| "completion_length": 182.296875, |
| "epoch": 0.5213435032784691, |
| "grad_norm": 3.6875, |
| "kl": 0.00470818518078886, |
| "learning_rate": 4.786564967215308e-07, |
| "loss": 0.0002, |
| "reward": -0.06911014439538121, |
| "reward_std": 0.6354586593806744, |
| "rewards/reward_func": -0.06911014439538121, |
| "step": 3896 |
| }, |
| { |
| "completion_length": 151.5859375, |
| "epoch": 0.522414023819082, |
| "grad_norm": 4.65625, |
| "kl": 0.004992738307919353, |
| "learning_rate": 4.775859761809179e-07, |
| "loss": 0.0002, |
| "reward": 0.441136134788394, |
| "reward_std": 0.5409799609333277, |
| "rewards/reward_func": 0.441136134788394, |
| "step": 3904 |
| }, |
| { |
| "completion_length": 158.1875, |
| "epoch": 0.5234845443596949, |
| "grad_norm": 3.921875, |
| "kl": 0.004533803061349317, |
| "learning_rate": 4.765154556403051e-07, |
| "loss": 0.0002, |
| "reward": 0.36645470559597015, |
| "reward_std": 0.5416577542200685, |
| "rewards/reward_func": 0.36645470559597015, |
| "step": 3912 |
| }, |
| { |
| "completion_length": 177.9140625, |
| "epoch": 0.5245550649003078, |
| "grad_norm": 2.78125, |
| "kl": 0.004515117674600333, |
| "learning_rate": 4.754449350996922e-07, |
| "loss": 0.0002, |
| "reward": 0.11683559231460094, |
| "reward_std": 0.5318781770765781, |
| "rewards/reward_func": 0.11683559231460094, |
| "step": 3920 |
| }, |
| { |
| "completion_length": 162.484375, |
| "epoch": 0.5256255854409206, |
| "grad_norm": 2.78125, |
| "kl": 0.00410384067799896, |
| "learning_rate": 4.7437441455907934e-07, |
| "loss": 0.0002, |
| "reward": 0.5109116761013865, |
| "reward_std": 0.389411685988307, |
| "rewards/reward_func": 0.5109116761013865, |
| "step": 3928 |
| }, |
| { |
| "completion_length": 179.484375, |
| "epoch": 0.5266961059815335, |
| "grad_norm": 4.5, |
| "kl": 0.004382628481835127, |
| "learning_rate": 4.7330389401846646e-07, |
| "loss": 0.0002, |
| "reward": 0.12338575161993504, |
| "reward_std": 0.49865792877972126, |
| "rewards/reward_func": 0.12338575161993504, |
| "step": 3936 |
| }, |
| { |
| "completion_length": 168.8203125, |
| "epoch": 0.5277666265221463, |
| "grad_norm": 3.78125, |
| "kl": 0.004615213518263772, |
| "learning_rate": 4.722333734778536e-07, |
| "loss": 0.0002, |
| "reward": 0.2909085564315319, |
| "reward_std": 0.44954105466604233, |
| "rewards/reward_func": 0.2909085564315319, |
| "step": 3944 |
| }, |
| { |
| "completion_length": 186.40625, |
| "epoch": 0.5288371470627593, |
| "grad_norm": 3.703125, |
| "kl": 0.003957096429076046, |
| "learning_rate": 4.7116285293724075e-07, |
| "loss": 0.0002, |
| "reward": 0.35753502883017063, |
| "reward_std": 0.5898796916007996, |
| "rewards/reward_func": 0.35753502883017063, |
| "step": 3952 |
| }, |
| { |
| "completion_length": 165.03125, |
| "epoch": 0.5299076676033722, |
| "grad_norm": 3.25, |
| "kl": 0.0045530806528404355, |
| "learning_rate": 4.700923323966278e-07, |
| "loss": 0.0002, |
| "reward": 0.2869006171822548, |
| "reward_std": 0.4535912126302719, |
| "rewards/reward_func": 0.2869006171822548, |
| "step": 3960 |
| }, |
| { |
| "completion_length": 148.8203125, |
| "epoch": 0.530978188143985, |
| "grad_norm": 4.25, |
| "kl": 0.00460378042771481, |
| "learning_rate": 4.69021811856015e-07, |
| "loss": 0.0002, |
| "reward": 0.48801288567483425, |
| "reward_std": 0.4225266771391034, |
| "rewards/reward_func": 0.48801288567483425, |
| "step": 3968 |
| }, |
| { |
| "completion_length": 174.203125, |
| "epoch": 0.5320487086845979, |
| "grad_norm": 2.65625, |
| "kl": 0.004049515846418217, |
| "learning_rate": 4.679512913154021e-07, |
| "loss": 0.0002, |
| "reward": 0.418088311329484, |
| "reward_std": 0.5685894265770912, |
| "rewards/reward_func": 0.418088311329484, |
| "step": 3976 |
| }, |
| { |
| "completion_length": 165.2578125, |
| "epoch": 0.5331192292252107, |
| "grad_norm": 3.25, |
| "kl": 0.00501069356687367, |
| "learning_rate": 4.668807707747892e-07, |
| "loss": 0.0002, |
| "reward": 0.31565719842910767, |
| "reward_std": 0.6409982740879059, |
| "rewards/reward_func": 0.31565719842910767, |
| "step": 3984 |
| }, |
| { |
| "completion_length": 162.015625, |
| "epoch": 0.5341897497658237, |
| "grad_norm": 3.671875, |
| "kl": 0.0046659239451400936, |
| "learning_rate": 4.6581025023417636e-07, |
| "loss": 0.0002, |
| "reward": -0.0461183600127697, |
| "reward_std": 0.7044645324349403, |
| "rewards/reward_func": -0.0461183600127697, |
| "step": 3992 |
| }, |
| { |
| "completion_length": 142.703125, |
| "epoch": 0.5352602703064365, |
| "grad_norm": 3.84375, |
| "kl": 0.004710770619567484, |
| "learning_rate": 4.6473972969356343e-07, |
| "loss": 0.0002, |
| "reward": 0.5219798712059855, |
| "reward_std": 0.4946548119187355, |
| "rewards/reward_func": 0.5219798712059855, |
| "step": 4000 |
| }, |
| { |
| "completion_length": 146.3203125, |
| "epoch": 0.5363307908470494, |
| "grad_norm": 3.21875, |
| "kl": 0.005039886600570753, |
| "learning_rate": 4.636692091529506e-07, |
| "loss": 0.0002, |
| "reward": 0.420873555354774, |
| "reward_std": 0.4259900487959385, |
| "rewards/reward_func": 0.420873555354774, |
| "step": 4008 |
| }, |
| { |
| "completion_length": 168.2109375, |
| "epoch": 0.5374013113876622, |
| "grad_norm": 4.96875, |
| "kl": 0.004895551188383251, |
| "learning_rate": 4.625986886123377e-07, |
| "loss": 0.0002, |
| "reward": 0.3381440285593271, |
| "reward_std": 0.5715998597443104, |
| "rewards/reward_func": 0.3381440285593271, |
| "step": 4016 |
| }, |
| { |
| "completion_length": 160.578125, |
| "epoch": 0.5384718319282751, |
| "grad_norm": 3.625, |
| "kl": 0.00470035380567424, |
| "learning_rate": 4.6152816807172485e-07, |
| "loss": 0.0002, |
| "reward": 0.3439123351126909, |
| "reward_std": 0.4550882736220956, |
| "rewards/reward_func": 0.3439123351126909, |
| "step": 4024 |
| }, |
| { |
| "completion_length": 159.9453125, |
| "epoch": 0.539542352468888, |
| "grad_norm": 4.375, |
| "kl": 0.00492598774144426, |
| "learning_rate": 4.6045764753111197e-07, |
| "loss": 0.0002, |
| "reward": 0.2067430024035275, |
| "reward_std": 0.5162056926637888, |
| "rewards/reward_func": 0.2067430024035275, |
| "step": 4032 |
| }, |
| { |
| "completion_length": 166.1015625, |
| "epoch": 0.5406128730095009, |
| "grad_norm": 3.0625, |
| "kl": 0.0042450258624739945, |
| "learning_rate": 4.593871269904991e-07, |
| "loss": 0.0002, |
| "reward": 0.3529038140550256, |
| "reward_std": 0.4770152699202299, |
| "rewards/reward_func": 0.3529038140550256, |
| "step": 4040 |
| }, |
| { |
| "completion_length": 178.7109375, |
| "epoch": 0.5416833935501137, |
| "grad_norm": 4.5625, |
| "kl": 0.005025087855756283, |
| "learning_rate": 4.583166064498862e-07, |
| "loss": 0.0002, |
| "reward": -0.081031309440732, |
| "reward_std": 0.4695176286622882, |
| "rewards/reward_func": -0.081031309440732, |
| "step": 4048 |
| }, |
| { |
| "completion_length": 165.65625, |
| "epoch": 0.5427539140907266, |
| "grad_norm": 4.4375, |
| "kl": 0.0055138085735961795, |
| "learning_rate": 4.572460859092734e-07, |
| "loss": 0.0002, |
| "reward": -0.007585156708955765, |
| "reward_std": 0.5119953658431768, |
| "rewards/reward_func": -0.007585156708955765, |
| "step": 4056 |
| }, |
| { |
| "completion_length": 156.34375, |
| "epoch": 0.5438244346313394, |
| "grad_norm": 3.796875, |
| "kl": 0.0043381388823036104, |
| "learning_rate": 4.5617556536866045e-07, |
| "loss": 0.0002, |
| "reward": 0.13799802958965302, |
| "reward_std": 0.6221343949437141, |
| "rewards/reward_func": 0.13799802958965302, |
| "step": 4064 |
| }, |
| { |
| "completion_length": 191.2734375, |
| "epoch": 0.5448949551719524, |
| "grad_norm": 4.71875, |
| "kl": 0.004081014514667913, |
| "learning_rate": 4.5510504482804763e-07, |
| "loss": 0.0002, |
| "reward": -0.10252122208476067, |
| "reward_std": 0.5134240631014109, |
| "rewards/reward_func": -0.10252122208476067, |
| "step": 4072 |
| }, |
| { |
| "completion_length": 149.390625, |
| "epoch": 0.5459654757125653, |
| "grad_norm": 3.640625, |
| "kl": 0.004793624917510897, |
| "learning_rate": 4.540345242874347e-07, |
| "loss": 0.0002, |
| "reward": 0.42647568974643946, |
| "reward_std": 0.6049776747822762, |
| "rewards/reward_func": 0.42647568974643946, |
| "step": 4080 |
| }, |
| { |
| "completion_length": 169.515625, |
| "epoch": 0.5470359962531781, |
| "grad_norm": 5.375, |
| "kl": 0.005105009535327554, |
| "learning_rate": 4.5296400374682187e-07, |
| "loss": 0.0002, |
| "reward": 0.14484626054763794, |
| "reward_std": 0.715711385011673, |
| "rewards/reward_func": 0.14484626054763794, |
| "step": 4088 |
| }, |
| { |
| "completion_length": 184.5859375, |
| "epoch": 0.548106516793791, |
| "grad_norm": 2.21875, |
| "kl": 0.00399865786312148, |
| "learning_rate": 4.51893483206209e-07, |
| "loss": 0.0002, |
| "reward": 0.27984373830258846, |
| "reward_std": 0.6154143176972866, |
| "rewards/reward_func": 0.27984373830258846, |
| "step": 4096 |
| }, |
| { |
| "completion_length": 148.9765625, |
| "epoch": 0.5491770373344038, |
| "grad_norm": 4.8125, |
| "kl": 0.005411504651419818, |
| "learning_rate": 4.508229626655961e-07, |
| "loss": 0.0002, |
| "reward": 0.3810861259698868, |
| "reward_std": 0.6340535804629326, |
| "rewards/reward_func": 0.3810861259698868, |
| "step": 4104 |
| }, |
| { |
| "completion_length": 181.171875, |
| "epoch": 0.5502475578750168, |
| "grad_norm": 3.734375, |
| "kl": 0.003742568180314265, |
| "learning_rate": 4.4975244212498324e-07, |
| "loss": 0.0001, |
| "reward": 0.314508281648159, |
| "reward_std": 0.5607537031173706, |
| "rewards/reward_func": 0.314508281648159, |
| "step": 4112 |
| }, |
| { |
| "completion_length": 131.109375, |
| "epoch": 0.5513180784156296, |
| "grad_norm": 6.5625, |
| "kl": 0.005468558054417372, |
| "learning_rate": 4.486819215843704e-07, |
| "loss": 0.0002, |
| "reward": 0.43094983510673046, |
| "reward_std": 0.39848934579640627, |
| "rewards/reward_func": 0.43094983510673046, |
| "step": 4120 |
| }, |
| { |
| "completion_length": 144.765625, |
| "epoch": 0.5523885989562425, |
| "grad_norm": 5.15625, |
| "kl": 0.005072243511676788, |
| "learning_rate": 4.476114010437575e-07, |
| "loss": 0.0002, |
| "reward": 0.16479766555130482, |
| "reward_std": 0.624469917267561, |
| "rewards/reward_func": 0.16479766555130482, |
| "step": 4128 |
| }, |
| { |
| "completion_length": 150.828125, |
| "epoch": 0.5534591194968553, |
| "grad_norm": 5.5625, |
| "kl": 0.004988896253053099, |
| "learning_rate": 4.4654088050314465e-07, |
| "loss": 0.0002, |
| "reward": 0.23024853132665157, |
| "reward_std": 0.5588976237922907, |
| "rewards/reward_func": 0.23024853132665157, |
| "step": 4136 |
| }, |
| { |
| "completion_length": 162.78125, |
| "epoch": 0.5545296400374682, |
| "grad_norm": 6.53125, |
| "kl": 0.004712989641120657, |
| "learning_rate": 4.454703599625317e-07, |
| "loss": 0.0002, |
| "reward": 0.27441484900191426, |
| "reward_std": 0.4914160780608654, |
| "rewards/reward_func": 0.27441484900191426, |
| "step": 4144 |
| }, |
| { |
| "completion_length": 203.6171875, |
| "epoch": 0.555600160578081, |
| "grad_norm": 3.6875, |
| "kl": 0.003505587810650468, |
| "learning_rate": 4.443998394219189e-07, |
| "loss": 0.0001, |
| "reward": 0.009393353015184402, |
| "reward_std": 0.6114509087055922, |
| "rewards/reward_func": 0.009393353015184402, |
| "step": 4152 |
| }, |
| { |
| "completion_length": 166.7578125, |
| "epoch": 0.556670681118694, |
| "grad_norm": 4.78125, |
| "kl": 0.004777590365847573, |
| "learning_rate": 4.43329318881306e-07, |
| "loss": 0.0002, |
| "reward": 0.11833875393494964, |
| "reward_std": 0.5748403836041689, |
| "rewards/reward_func": 0.11833875393494964, |
| "step": 4160 |
| }, |
| { |
| "completion_length": 150.4140625, |
| "epoch": 0.5577412016593069, |
| "grad_norm": 3.28125, |
| "kl": 0.0049895147094503045, |
| "learning_rate": 4.4225879834069314e-07, |
| "loss": 0.0002, |
| "reward": 0.2932877875864506, |
| "reward_std": 0.6367702716961503, |
| "rewards/reward_func": 0.2932877875864506, |
| "step": 4168 |
| }, |
| { |
| "completion_length": 156.7109375, |
| "epoch": 0.5588117221999197, |
| "grad_norm": 5.09375, |
| "kl": 0.005086433404358104, |
| "learning_rate": 4.4118827780008026e-07, |
| "loss": 0.0002, |
| "reward": 0.14813962019979954, |
| "reward_std": 0.5115363541990519, |
| "rewards/reward_func": 0.14813962019979954, |
| "step": 4176 |
| }, |
| { |
| "completion_length": 166.015625, |
| "epoch": 0.5598822427405326, |
| "grad_norm": 3.953125, |
| "kl": 0.004459643067093566, |
| "learning_rate": 4.401177572594674e-07, |
| "loss": 0.0002, |
| "reward": 0.17805076017975807, |
| "reward_std": 0.7240184545516968, |
| "rewards/reward_func": 0.17805076017975807, |
| "step": 4184 |
| }, |
| { |
| "completion_length": 149.1328125, |
| "epoch": 0.5609527632811454, |
| "grad_norm": 3.8125, |
| "kl": 0.004602790024364367, |
| "learning_rate": 4.390472367188545e-07, |
| "loss": 0.0002, |
| "reward": 0.46490050479769707, |
| "reward_std": 0.4432865995913744, |
| "rewards/reward_func": 0.46490050479769707, |
| "step": 4192 |
| }, |
| { |
| "completion_length": 183.953125, |
| "epoch": 0.5620232838217584, |
| "grad_norm": 3.625, |
| "kl": 0.00446239989832975, |
| "learning_rate": 4.379767161782417e-07, |
| "loss": 0.0002, |
| "reward": 0.19806094001978636, |
| "reward_std": 0.6545614078640938, |
| "rewards/reward_func": 0.19806094001978636, |
| "step": 4200 |
| }, |
| { |
| "completion_length": 195.46875, |
| "epoch": 0.5630938043623712, |
| "grad_norm": 3.5625, |
| "kl": 0.003972954727942124, |
| "learning_rate": 4.3690619563762875e-07, |
| "loss": 0.0002, |
| "reward": -0.12718784296885133, |
| "reward_std": 0.5749151539057493, |
| "rewards/reward_func": -0.12718784296885133, |
| "step": 4208 |
| }, |
| { |
| "completion_length": 137.4453125, |
| "epoch": 0.5641643249029841, |
| "grad_norm": 4.25, |
| "kl": 0.004893360834103078, |
| "learning_rate": 4.358356750970159e-07, |
| "loss": 0.0002, |
| "reward": 0.24862979911267757, |
| "reward_std": 0.6906272917985916, |
| "rewards/reward_func": 0.24862979911267757, |
| "step": 4216 |
| }, |
| { |
| "completion_length": 153.109375, |
| "epoch": 0.5652348454435969, |
| "grad_norm": 3.578125, |
| "kl": 0.0049685456906445324, |
| "learning_rate": 4.3476515455640304e-07, |
| "loss": 0.0002, |
| "reward": 0.41499729454517365, |
| "reward_std": 0.4691876629367471, |
| "rewards/reward_func": 0.41499729454517365, |
| "step": 4224 |
| }, |
| { |
| "completion_length": 149.8515625, |
| "epoch": 0.5663053659842098, |
| "grad_norm": 5.59375, |
| "kl": 0.004520065325777978, |
| "learning_rate": 4.3369463401579017e-07, |
| "loss": 0.0002, |
| "reward": 0.318800778593868, |
| "reward_std": 0.6351992357522249, |
| "rewards/reward_func": 0.318800778593868, |
| "step": 4232 |
| }, |
| { |
| "completion_length": 142.421875, |
| "epoch": 0.5673758865248227, |
| "grad_norm": 4.59375, |
| "kl": 0.005744964553741738, |
| "learning_rate": 4.326241134751773e-07, |
| "loss": 0.0002, |
| "reward": 0.4124446418136358, |
| "reward_std": 0.5395534262061119, |
| "rewards/reward_func": 0.4124446418136358, |
| "step": 4240 |
| }, |
| { |
| "completion_length": 163.6953125, |
| "epoch": 0.5684464070654356, |
| "grad_norm": 4.6875, |
| "kl": 0.004186704114545137, |
| "learning_rate": 4.315535929345644e-07, |
| "loss": 0.0002, |
| "reward": 0.35636366717517376, |
| "reward_std": 0.6417583487927914, |
| "rewards/reward_func": 0.35636366717517376, |
| "step": 4248 |
| }, |
| { |
| "completion_length": 185.578125, |
| "epoch": 0.5695169276060484, |
| "grad_norm": 3.84375, |
| "kl": 0.004251696169376373, |
| "learning_rate": 4.3048307239395153e-07, |
| "loss": 0.0002, |
| "reward": 0.30847467109560966, |
| "reward_std": 0.44796227291226387, |
| "rewards/reward_func": 0.30847467109560966, |
| "step": 4256 |
| }, |
| { |
| "completion_length": 211.21875, |
| "epoch": 0.5705874481466613, |
| "grad_norm": 2.3125, |
| "kl": 0.004341925901826471, |
| "learning_rate": 4.294125518533387e-07, |
| "loss": 0.0002, |
| "reward": 0.2743415031582117, |
| "reward_std": 0.45934509858489037, |
| "rewards/reward_func": 0.2743415031582117, |
| "step": 4264 |
| }, |
| { |
| "completion_length": 171.7265625, |
| "epoch": 0.5716579686872741, |
| "grad_norm": 1.9453125, |
| "kl": 0.004344686400145292, |
| "learning_rate": 4.2834203131272577e-07, |
| "loss": 0.0002, |
| "reward": 0.2994256131350994, |
| "reward_std": 0.6268932148814201, |
| "rewards/reward_func": 0.2994256131350994, |
| "step": 4272 |
| }, |
| { |
| "completion_length": 138.3984375, |
| "epoch": 0.5727284892278871, |
| "grad_norm": 3.171875, |
| "kl": 0.006193301291204989, |
| "learning_rate": 4.2727151077211295e-07, |
| "loss": 0.0002, |
| "reward": 0.3124155914410949, |
| "reward_std": 0.49435919895768166, |
| "rewards/reward_func": 0.3124155914410949, |
| "step": 4280 |
| }, |
| { |
| "completion_length": 175.421875, |
| "epoch": 0.5737990097685, |
| "grad_norm": 3.609375, |
| "kl": 0.004771079227793962, |
| "learning_rate": 4.262009902315e-07, |
| "loss": 0.0002, |
| "reward": 0.27722545340657234, |
| "reward_std": 0.5442187786102295, |
| "rewards/reward_func": 0.27722545340657234, |
| "step": 4288 |
| }, |
| { |
| "completion_length": 224.7265625, |
| "epoch": 0.5748695303091128, |
| "grad_norm": 3.703125, |
| "kl": 0.0033396084618289024, |
| "learning_rate": 4.251304696908872e-07, |
| "loss": 0.0001, |
| "reward": -0.16931618377566338, |
| "reward_std": 0.5313975028693676, |
| "rewards/reward_func": -0.16931618377566338, |
| "step": 4296 |
| }, |
| { |
| "completion_length": 186.859375, |
| "epoch": 0.5759400508497257, |
| "grad_norm": 4.75, |
| "kl": 0.0042559900030028075, |
| "learning_rate": 4.240599491502743e-07, |
| "loss": 0.0002, |
| "reward": 0.13033189252018929, |
| "reward_std": 0.3756987228989601, |
| "rewards/reward_func": 0.13033189252018929, |
| "step": 4304 |
| }, |
| { |
| "completion_length": 142.578125, |
| "epoch": 0.5770105713903385, |
| "grad_norm": 3.9375, |
| "kl": 0.005709152843337506, |
| "learning_rate": 4.2298942860966143e-07, |
| "loss": 0.0002, |
| "reward": 0.3865806292742491, |
| "reward_std": 0.6126521602272987, |
| "rewards/reward_func": 0.3865806292742491, |
| "step": 4312 |
| }, |
| { |
| "completion_length": 164.7890625, |
| "epoch": 0.5780810919309515, |
| "grad_norm": 3.125, |
| "kl": 0.0045434608473442495, |
| "learning_rate": 4.2191890806904856e-07, |
| "loss": 0.0002, |
| "reward": 0.3333674664609134, |
| "reward_std": 0.6179927475750446, |
| "rewards/reward_func": 0.3333674664609134, |
| "step": 4320 |
| }, |
| { |
| "completion_length": 118.8515625, |
| "epoch": 0.5791516124715643, |
| "grad_norm": 3.703125, |
| "kl": 0.004566928721033037, |
| "learning_rate": 4.208483875284357e-07, |
| "loss": 0.0002, |
| "reward": 0.6828272566199303, |
| "reward_std": 0.41035995725542307, |
| "rewards/reward_func": 0.6828272566199303, |
| "step": 4328 |
| }, |
| { |
| "completion_length": 141.640625, |
| "epoch": 0.5802221330121772, |
| "grad_norm": 4.125, |
| "kl": 0.004908986215014011, |
| "learning_rate": 4.197778669878228e-07, |
| "loss": 0.0002, |
| "reward": 0.4755503498017788, |
| "reward_std": 0.5121949464082718, |
| "rewards/reward_func": 0.4755503498017788, |
| "step": 4336 |
| }, |
| { |
| "completion_length": 143.0625, |
| "epoch": 0.58129265355279, |
| "grad_norm": 4.09375, |
| "kl": 0.004488215548917651, |
| "learning_rate": 4.1870734644720997e-07, |
| "loss": 0.0002, |
| "reward": 0.5884530800394714, |
| "reward_std": 0.44153958186507225, |
| "rewards/reward_func": 0.5884530800394714, |
| "step": 4344 |
| }, |
| { |
| "completion_length": 148.1796875, |
| "epoch": 0.5823631740934029, |
| "grad_norm": 3.0, |
| "kl": 0.0046905699709896, |
| "learning_rate": 4.1763682590659704e-07, |
| "loss": 0.0002, |
| "reward": 0.47682703845202923, |
| "reward_std": 0.4733074624091387, |
| "rewards/reward_func": 0.47682703845202923, |
| "step": 4352 |
| }, |
| { |
| "completion_length": 150.296875, |
| "epoch": 0.5834336946340158, |
| "grad_norm": 4.28125, |
| "kl": 0.004917474143439904, |
| "learning_rate": 4.165663053659842e-07, |
| "loss": 0.0002, |
| "reward": 0.2567645199596882, |
| "reward_std": 0.6055057626217604, |
| "rewards/reward_func": 0.2567645199596882, |
| "step": 4360 |
| }, |
| { |
| "completion_length": 205.234375, |
| "epoch": 0.5845042151746287, |
| "grad_norm": 4.09375, |
| "kl": 0.00432168306724634, |
| "learning_rate": 4.1549578482537134e-07, |
| "loss": 0.0002, |
| "reward": 0.04042044514790177, |
| "reward_std": 0.5906463749706745, |
| "rewards/reward_func": 0.04042044514790177, |
| "step": 4368 |
| }, |
| { |
| "completion_length": 147.5625, |
| "epoch": 0.5855747357152415, |
| "grad_norm": 3.796875, |
| "kl": 0.005373828811571002, |
| "learning_rate": 4.1442526428475846e-07, |
| "loss": 0.0002, |
| "reward": 0.3917626924812794, |
| "reward_std": 0.5444907881319523, |
| "rewards/reward_func": 0.3917626924812794, |
| "step": 4376 |
| }, |
| { |
| "completion_length": 203.15625, |
| "epoch": 0.5866452562558544, |
| "grad_norm": 2.296875, |
| "kl": 0.0036935079260729253, |
| "learning_rate": 4.133547437441456e-07, |
| "loss": 0.0001, |
| "reward": 0.09531690180301666, |
| "reward_std": 0.5034721679985523, |
| "rewards/reward_func": 0.09531690180301666, |
| "step": 4384 |
| }, |
| { |
| "completion_length": 148.1328125, |
| "epoch": 0.5877157767964672, |
| "grad_norm": 6.0, |
| "kl": 0.0050933739403262734, |
| "learning_rate": 4.122842232035327e-07, |
| "loss": 0.0002, |
| "reward": 0.11112022027373314, |
| "reward_std": 0.5623177271336317, |
| "rewards/reward_func": 0.11112022027373314, |
| "step": 4392 |
| }, |
| { |
| "completion_length": 185.1796875, |
| "epoch": 0.5887862973370801, |
| "grad_norm": 2.796875, |
| "kl": 0.003696839907206595, |
| "learning_rate": 4.112137026629198e-07, |
| "loss": 0.0001, |
| "reward": 0.37225864082574844, |
| "reward_std": 0.6047016642987728, |
| "rewards/reward_func": 0.37225864082574844, |
| "step": 4400 |
| }, |
| { |
| "completion_length": 156.125, |
| "epoch": 0.5898568178776931, |
| "grad_norm": 5.5, |
| "kl": 0.004960794060025364, |
| "learning_rate": 4.10143182122307e-07, |
| "loss": 0.0002, |
| "reward": 0.4381309971213341, |
| "reward_std": 0.3526679091155529, |
| "rewards/reward_func": 0.4381309971213341, |
| "step": 4408 |
| }, |
| { |
| "completion_length": 173.9453125, |
| "epoch": 0.5909273384183059, |
| "grad_norm": 5.40625, |
| "kl": 0.0046878808352630585, |
| "learning_rate": 4.0907266158169407e-07, |
| "loss": 0.0002, |
| "reward": 0.11471654986962676, |
| "reward_std": 0.7081250138580799, |
| "rewards/reward_func": 0.11471654986962676, |
| "step": 4416 |
| }, |
| { |
| "completion_length": 164.6875, |
| "epoch": 0.5919978589589188, |
| "grad_norm": 3.875, |
| "kl": 0.004589978780131787, |
| "learning_rate": 4.0800214104108124e-07, |
| "loss": 0.0002, |
| "reward": 0.242947518825531, |
| "reward_std": 0.511182009242475, |
| "rewards/reward_func": 0.242947518825531, |
| "step": 4424 |
| }, |
| { |
| "completion_length": 170.96875, |
| "epoch": 0.5930683794995316, |
| "grad_norm": 3.203125, |
| "kl": 0.004504337441176176, |
| "learning_rate": 4.069316205004683e-07, |
| "loss": 0.0002, |
| "reward": 0.13327412493526936, |
| "reward_std": 0.7021276205778122, |
| "rewards/reward_func": 0.13327412493526936, |
| "step": 4432 |
| }, |
| { |
| "completion_length": 166.96875, |
| "epoch": 0.5941389000401445, |
| "grad_norm": 3.65625, |
| "kl": 0.004282756824977696, |
| "learning_rate": 4.0586109995985543e-07, |
| "loss": 0.0002, |
| "reward": 0.2241785153746605, |
| "reward_std": 0.5987379960715771, |
| "rewards/reward_func": 0.2241785153746605, |
| "step": 4440 |
| }, |
| { |
| "completion_length": 174.84375, |
| "epoch": 0.5952094205807574, |
| "grad_norm": 3.3125, |
| "kl": 0.004621970321750268, |
| "learning_rate": 4.047905794192426e-07, |
| "loss": 0.0002, |
| "reward": 0.356457632035017, |
| "reward_std": 0.5185628831386566, |
| "rewards/reward_func": 0.356457632035017, |
| "step": 4448 |
| }, |
| { |
| "completion_length": 151.5859375, |
| "epoch": 0.5962799411213703, |
| "grad_norm": 5.46875, |
| "kl": 0.004531825426965952, |
| "learning_rate": 4.037200588786297e-07, |
| "loss": 0.0002, |
| "reward": 0.41319750994443893, |
| "reward_std": 0.47001610416918993, |
| "rewards/reward_func": 0.41319750994443893, |
| "step": 4456 |
| }, |
| { |
| "completion_length": 155.9375, |
| "epoch": 0.5973504616619831, |
| "grad_norm": 3.875, |
| "kl": 0.005843940074555576, |
| "learning_rate": 4.0264953833801685e-07, |
| "loss": 0.0002, |
| "reward": 0.16792790032923222, |
| "reward_std": 0.45045214518904686, |
| "rewards/reward_func": 0.16792790032923222, |
| "step": 4464 |
| }, |
| { |
| "completion_length": 204.1484375, |
| "epoch": 0.598420982202596, |
| "grad_norm": 3.734375, |
| "kl": 0.0037444017361849546, |
| "learning_rate": 4.0157901779740397e-07, |
| "loss": 0.0001, |
| "reward": 0.11755906883627176, |
| "reward_std": 0.5679098833352327, |
| "rewards/reward_func": 0.11755906883627176, |
| "step": 4472 |
| }, |
| { |
| "completion_length": 153.7734375, |
| "epoch": 0.5994915027432088, |
| "grad_norm": 5.78125, |
| "kl": 0.004829052748391405, |
| "learning_rate": 4.005084972567911e-07, |
| "loss": 0.0002, |
| "reward": 0.19808213412761688, |
| "reward_std": 0.3854016624391079, |
| "rewards/reward_func": 0.19808213412761688, |
| "step": 4480 |
| }, |
| { |
| "completion_length": 203.421875, |
| "epoch": 0.6005620232838218, |
| "grad_norm": 5.9375, |
| "kl": 0.003742009517736733, |
| "learning_rate": 3.994379767161782e-07, |
| "loss": 0.0001, |
| "reward": 0.232608491089195, |
| "reward_std": 0.5558800995349884, |
| "rewards/reward_func": 0.232608491089195, |
| "step": 4488 |
| }, |
| { |
| "completion_length": 154.265625, |
| "epoch": 0.6016325438244347, |
| "grad_norm": 3.96875, |
| "kl": 0.004953162686433643, |
| "learning_rate": 3.9836745617556534e-07, |
| "loss": 0.0002, |
| "reward": 0.4288094639778137, |
| "reward_std": 0.416667815297842, |
| "rewards/reward_func": 0.4288094639778137, |
| "step": 4496 |
| }, |
| { |
| "completion_length": 166.2890625, |
| "epoch": 0.6027030643650475, |
| "grad_norm": 4.40625, |
| "kl": 0.004299461928894743, |
| "learning_rate": 3.9729693563495246e-07, |
| "loss": 0.0002, |
| "reward": 0.12346869148313999, |
| "reward_std": 0.6307908529415727, |
| "rewards/reward_func": 0.12346869148313999, |
| "step": 4504 |
| }, |
| { |
| "completion_length": 157.078125, |
| "epoch": 0.6037735849056604, |
| "grad_norm": 3.796875, |
| "kl": 0.004897929902654141, |
| "learning_rate": 3.9622641509433963e-07, |
| "loss": 0.0002, |
| "reward": 0.3149998290464282, |
| "reward_std": 0.5927382819354534, |
| "rewards/reward_func": 0.3149998290464282, |
| "step": 4512 |
| }, |
| { |
| "completion_length": 168.9296875, |
| "epoch": 0.6048441054462732, |
| "grad_norm": 5.3125, |
| "kl": 0.004709256027126685, |
| "learning_rate": 3.951558945537267e-07, |
| "loss": 0.0002, |
| "reward": 0.23328473046422005, |
| "reward_std": 0.633372450247407, |
| "rewards/reward_func": 0.23328473046422005, |
| "step": 4520 |
| }, |
| { |
| "completion_length": 182.09375, |
| "epoch": 0.6059146259868862, |
| "grad_norm": 2.8125, |
| "kl": 0.0044458308548200876, |
| "learning_rate": 3.940853740131139e-07, |
| "loss": 0.0002, |
| "reward": 0.04303564690053463, |
| "reward_std": 0.46831536665558815, |
| "rewards/reward_func": 0.04303564690053463, |
| "step": 4528 |
| }, |
| { |
| "completion_length": 150.3671875, |
| "epoch": 0.606985146527499, |
| "grad_norm": 4.25, |
| "kl": 0.004751139786094427, |
| "learning_rate": 3.9301485347250094e-07, |
| "loss": 0.0002, |
| "reward": 0.6235604397952557, |
| "reward_std": 0.43624259904026985, |
| "rewards/reward_func": 0.6235604397952557, |
| "step": 4536 |
| }, |
| { |
| "completion_length": 168.140625, |
| "epoch": 0.6080556670681119, |
| "grad_norm": 4.15625, |
| "kl": 0.004754378751385957, |
| "learning_rate": 3.919443329318881e-07, |
| "loss": 0.0002, |
| "reward": 0.07649134658277035, |
| "reward_std": 0.6275423960760236, |
| "rewards/reward_func": 0.07649134658277035, |
| "step": 4544 |
| }, |
| { |
| "completion_length": 145.2109375, |
| "epoch": 0.6091261876087247, |
| "grad_norm": 4.15625, |
| "kl": 0.004508535988861695, |
| "learning_rate": 3.9087381239127524e-07, |
| "loss": 0.0002, |
| "reward": 0.09985450841486454, |
| "reward_std": 0.6514963954687119, |
| "rewards/reward_func": 0.09985450841486454, |
| "step": 4552 |
| }, |
| { |
| "completion_length": 152.9921875, |
| "epoch": 0.6101967081493376, |
| "grad_norm": 3.9375, |
| "kl": 0.004148939304286614, |
| "learning_rate": 3.8980329185066236e-07, |
| "loss": 0.0002, |
| "reward": 0.45872488245368004, |
| "reward_std": 0.43476785998791456, |
| "rewards/reward_func": 0.45872488245368004, |
| "step": 4560 |
| }, |
| { |
| "completion_length": 130.296875, |
| "epoch": 0.6112672286899505, |
| "grad_norm": 5.84375, |
| "kl": 0.005563508428167552, |
| "learning_rate": 3.887327713100495e-07, |
| "loss": 0.0002, |
| "reward": 0.6556578651070595, |
| "reward_std": 0.44750246591866016, |
| "rewards/reward_func": 0.6556578651070595, |
| "step": 4568 |
| }, |
| { |
| "completion_length": 197.5078125, |
| "epoch": 0.6123377492305634, |
| "grad_norm": 3.734375, |
| "kl": 0.003922436822904274, |
| "learning_rate": 3.876622507694366e-07, |
| "loss": 0.0002, |
| "reward": -0.01122634531930089, |
| "reward_std": 0.5639722682535648, |
| "rewards/reward_func": -0.01122634531930089, |
| "step": 4576 |
| }, |
| { |
| "completion_length": 152.25, |
| "epoch": 0.6134082697711762, |
| "grad_norm": 3.53125, |
| "kl": 0.0060851232265122235, |
| "learning_rate": 3.865917302288237e-07, |
| "loss": 0.0002, |
| "reward": 0.5405775438994169, |
| "reward_std": 0.5025825463235378, |
| "rewards/reward_func": 0.5405775438994169, |
| "step": 4584 |
| }, |
| { |
| "completion_length": 165.8671875, |
| "epoch": 0.6144787903117891, |
| "grad_norm": 4.5, |
| "kl": 0.0056036147580016404, |
| "learning_rate": 3.855212096882109e-07, |
| "loss": 0.0002, |
| "reward": 0.015333790332078934, |
| "reward_std": 0.49498444236814976, |
| "rewards/reward_func": 0.015333790332078934, |
| "step": 4592 |
| }, |
| { |
| "completion_length": 185.2421875, |
| "epoch": 0.6155493108524019, |
| "grad_norm": 2.953125, |
| "kl": 0.003879066207446158, |
| "learning_rate": 3.8445068914759797e-07, |
| "loss": 0.0002, |
| "reward": 0.19957604305818677, |
| "reward_std": 0.5595576763153076, |
| "rewards/reward_func": 0.19957604305818677, |
| "step": 4600 |
| }, |
| { |
| "completion_length": 186.2578125, |
| "epoch": 0.6166198313930149, |
| "grad_norm": 3.0, |
| "kl": 0.004655700788134709, |
| "learning_rate": 3.8338016860698514e-07, |
| "loss": 0.0002, |
| "reward": 0.25618747901171446, |
| "reward_std": 0.5953042004257441, |
| "rewards/reward_func": 0.25618747901171446, |
| "step": 4608 |
| }, |
| { |
| "completion_length": 144.6640625, |
| "epoch": 0.6176903519336278, |
| "grad_norm": 3.703125, |
| "kl": 0.004998624965082854, |
| "learning_rate": 3.8230964806637226e-07, |
| "loss": 0.0002, |
| "reward": 0.5813037822954357, |
| "reward_std": 0.4772760821506381, |
| "rewards/reward_func": 0.5813037822954357, |
| "step": 4616 |
| }, |
| { |
| "completion_length": 197.1015625, |
| "epoch": 0.6187608724742406, |
| "grad_norm": 3.71875, |
| "kl": 0.004184005607385188, |
| "learning_rate": 3.812391275257594e-07, |
| "loss": 0.0002, |
| "reward": 0.07982266321778297, |
| "reward_std": 0.5760688092559576, |
| "rewards/reward_func": 0.07982266321778297, |
| "step": 4624 |
| }, |
| { |
| "completion_length": 174.1796875, |
| "epoch": 0.6198313930148535, |
| "grad_norm": 3.34375, |
| "kl": 0.004289099859306589, |
| "learning_rate": 3.801686069851465e-07, |
| "loss": 0.0002, |
| "reward": 0.24364805966615677, |
| "reward_std": 0.5018207374960184, |
| "rewards/reward_func": 0.24364805966615677, |
| "step": 4632 |
| }, |
| { |
| "completion_length": 164.1640625, |
| "epoch": 0.6209019135554663, |
| "grad_norm": 3.375, |
| "kl": 0.004735152295324951, |
| "learning_rate": 3.7909808644453363e-07, |
| "loss": 0.0002, |
| "reward": 0.34338719584047794, |
| "reward_std": 0.5795671846717596, |
| "rewards/reward_func": 0.34338719584047794, |
| "step": 4640 |
| }, |
| { |
| "completion_length": 133.390625, |
| "epoch": 0.6219724340960792, |
| "grad_norm": 5.25, |
| "kl": 0.006147218053229153, |
| "learning_rate": 3.7802756590392075e-07, |
| "loss": 0.0002, |
| "reward": 0.38445473089814186, |
| "reward_std": 0.49531611800193787, |
| "rewards/reward_func": 0.38445473089814186, |
| "step": 4648 |
| }, |
| { |
| "completion_length": 153.3984375, |
| "epoch": 0.6230429546366921, |
| "grad_norm": 4.34375, |
| "kl": 0.004714462149422616, |
| "learning_rate": 3.769570453633079e-07, |
| "loss": 0.0002, |
| "reward": 0.30648303404450417, |
| "reward_std": 0.4680379256606102, |
| "rewards/reward_func": 0.30648303404450417, |
| "step": 4656 |
| }, |
| { |
| "completion_length": 166.171875, |
| "epoch": 0.624113475177305, |
| "grad_norm": 3.75, |
| "kl": 0.004526323958998546, |
| "learning_rate": 3.75886524822695e-07, |
| "loss": 0.0002, |
| "reward": 0.35779890790581703, |
| "reward_std": 0.6776364631950855, |
| "rewards/reward_func": 0.35779890790581703, |
| "step": 4664 |
| }, |
| { |
| "completion_length": 179.2890625, |
| "epoch": 0.6251839957179178, |
| "grad_norm": 3.46875, |
| "kl": 0.004177290364168584, |
| "learning_rate": 3.7481600428208217e-07, |
| "loss": 0.0002, |
| "reward": 0.30811624182388186, |
| "reward_std": 0.6211207360029221, |
| "rewards/reward_func": 0.30811624182388186, |
| "step": 4672 |
| }, |
| { |
| "completion_length": 182.0234375, |
| "epoch": 0.6262545162585307, |
| "grad_norm": 3.140625, |
| "kl": 0.004683909472078085, |
| "learning_rate": 3.7374548374146924e-07, |
| "loss": 0.0002, |
| "reward": 0.008799735456705093, |
| "reward_std": 0.4996862728148699, |
| "rewards/reward_func": 0.008799735456705093, |
| "step": 4680 |
| }, |
| { |
| "completion_length": 148.359375, |
| "epoch": 0.6273250367991435, |
| "grad_norm": 6.59375, |
| "kl": 0.0059346232446841896, |
| "learning_rate": 3.726749632008564e-07, |
| "loss": 0.0002, |
| "reward": 0.1749492734670639, |
| "reward_std": 0.4117685044184327, |
| "rewards/reward_func": 0.1749492734670639, |
| "step": 4688 |
| }, |
| { |
| "completion_length": 164.515625, |
| "epoch": 0.6283955573397565, |
| "grad_norm": 3.28125, |
| "kl": 0.00426359154516831, |
| "learning_rate": 3.7160444266024353e-07, |
| "loss": 0.0002, |
| "reward": 0.10608149319887161, |
| "reward_std": 0.7313233427703381, |
| "rewards/reward_func": 0.10608149319887161, |
| "step": 4696 |
| }, |
| { |
| "completion_length": 162.1640625, |
| "epoch": 0.6294660778803693, |
| "grad_norm": 3.859375, |
| "kl": 0.004711132904049009, |
| "learning_rate": 3.7053392211963065e-07, |
| "loss": 0.0002, |
| "reward": 0.13499032519757748, |
| "reward_std": 0.6217631548643112, |
| "rewards/reward_func": 0.13499032519757748, |
| "step": 4704 |
| }, |
| { |
| "completion_length": 160.1015625, |
| "epoch": 0.6305365984209822, |
| "grad_norm": 4.5, |
| "kl": 0.004943192150676623, |
| "learning_rate": 3.694634015790178e-07, |
| "loss": 0.0002, |
| "reward": 0.20859276875853539, |
| "reward_std": 0.43620782624930143, |
| "rewards/reward_func": 0.20859276875853539, |
| "step": 4712 |
| }, |
| { |
| "completion_length": 179.40625, |
| "epoch": 0.631607118961595, |
| "grad_norm": 6.46875, |
| "kl": 0.004646303132176399, |
| "learning_rate": 3.6839288103840495e-07, |
| "loss": 0.0002, |
| "reward": 0.20350963808596134, |
| "reward_std": 0.6433871760964394, |
| "rewards/reward_func": 0.20350963808596134, |
| "step": 4720 |
| }, |
| { |
| "completion_length": 176.1953125, |
| "epoch": 0.6326776395022079, |
| "grad_norm": 4.71875, |
| "kl": 0.00438886127085425, |
| "learning_rate": 3.67322360497792e-07, |
| "loss": 0.0002, |
| "reward": 0.36116465739905834, |
| "reward_std": 0.6595458313822746, |
| "rewards/reward_func": 0.36116465739905834, |
| "step": 4728 |
| }, |
| { |
| "completion_length": 132.4765625, |
| "epoch": 0.6337481600428209, |
| "grad_norm": 4.09375, |
| "kl": 0.005916833528317511, |
| "learning_rate": 3.662518399571792e-07, |
| "loss": 0.0002, |
| "reward": 0.5307797193527222, |
| "reward_std": 0.43096242286264896, |
| "rewards/reward_func": 0.5307797193527222, |
| "step": 4736 |
| }, |
| { |
| "completion_length": 163.1953125, |
| "epoch": 0.6348186805834337, |
| "grad_norm": 3.65625, |
| "kl": 0.0043211055162828416, |
| "learning_rate": 3.6518131941656626e-07, |
| "loss": 0.0002, |
| "reward": 0.3800085699185729, |
| "reward_std": 0.6451602801680565, |
| "rewards/reward_func": 0.3800085699185729, |
| "step": 4744 |
| }, |
| { |
| "completion_length": 157.9609375, |
| "epoch": 0.6358892011240466, |
| "grad_norm": 3.671875, |
| "kl": 0.004449906060472131, |
| "learning_rate": 3.6411079887595344e-07, |
| "loss": 0.0002, |
| "reward": 0.17367325257509947, |
| "reward_std": 0.5829105107113719, |
| "rewards/reward_func": 0.17367325257509947, |
| "step": 4752 |
| }, |
| { |
| "completion_length": 159.6953125, |
| "epoch": 0.6369597216646594, |
| "grad_norm": 4.09375, |
| "kl": 0.0045379805960692465, |
| "learning_rate": 3.6304027833534056e-07, |
| "loss": 0.0002, |
| "reward": 0.4868684080429375, |
| "reward_std": 0.4918051455169916, |
| "rewards/reward_func": 0.4868684080429375, |
| "step": 4760 |
| }, |
| { |
| "completion_length": 192.5234375, |
| "epoch": 0.6380302422052723, |
| "grad_norm": 2.984375, |
| "kl": 0.003665678290417418, |
| "learning_rate": 3.619697577947277e-07, |
| "loss": 0.0001, |
| "reward": -0.02173500368371606, |
| "reward_std": 0.5965735167264938, |
| "rewards/reward_func": -0.02173500368371606, |
| "step": 4768 |
| }, |
| { |
| "completion_length": 162.7109375, |
| "epoch": 0.6391007627458852, |
| "grad_norm": 4.0625, |
| "kl": 0.005193614459130913, |
| "learning_rate": 3.608992372541148e-07, |
| "loss": 0.0002, |
| "reward": 0.3424297422170639, |
| "reward_std": 0.519868329167366, |
| "rewards/reward_func": 0.3424297422170639, |
| "step": 4776 |
| }, |
| { |
| "completion_length": 167.8046875, |
| "epoch": 0.6401712832864981, |
| "grad_norm": 4.21875, |
| "kl": 0.004302638117223978, |
| "learning_rate": 3.598287167135019e-07, |
| "loss": 0.0002, |
| "reward": 0.14556433307006955, |
| "reward_std": 0.7161346226930618, |
| "rewards/reward_func": 0.14556433307006955, |
| "step": 4784 |
| }, |
| { |
| "completion_length": 186.265625, |
| "epoch": 0.6412418038271109, |
| "grad_norm": 4.4375, |
| "kl": 0.004954680422088131, |
| "learning_rate": 3.5875819617288904e-07, |
| "loss": 0.0002, |
| "reward": 0.2309811543673277, |
| "reward_std": 0.6467564664781094, |
| "rewards/reward_func": 0.2309811543673277, |
| "step": 4792 |
| }, |
| { |
| "completion_length": 168.7890625, |
| "epoch": 0.6423123243677238, |
| "grad_norm": 4.65625, |
| "kl": 0.0042629605159163475, |
| "learning_rate": 3.576876756322762e-07, |
| "loss": 0.0002, |
| "reward": 0.052162475883960724, |
| "reward_std": 0.5783581472933292, |
| "rewards/reward_func": 0.052162475883960724, |
| "step": 4800 |
| }, |
| { |
| "completion_length": 148.328125, |
| "epoch": 0.6433828449083366, |
| "grad_norm": 3.546875, |
| "kl": 0.005317616189131513, |
| "learning_rate": 3.566171550916633e-07, |
| "loss": 0.0002, |
| "reward": 0.40866485610604286, |
| "reward_std": 0.5010849069803953, |
| "rewards/reward_func": 0.40866485610604286, |
| "step": 4808 |
| }, |
| { |
| "completion_length": 166.921875, |
| "epoch": 0.6444533654489496, |
| "grad_norm": 4.5, |
| "kl": 0.004913818440400064, |
| "learning_rate": 3.5554663455105046e-07, |
| "loss": 0.0002, |
| "reward": 0.2594773005694151, |
| "reward_std": 0.6219961307942867, |
| "rewards/reward_func": 0.2594773005694151, |
| "step": 4816 |
| }, |
| { |
| "completion_length": 186.9921875, |
| "epoch": 0.6455238859895625, |
| "grad_norm": 4.9375, |
| "kl": 0.004318988474551588, |
| "learning_rate": 3.5447611401043753e-07, |
| "loss": 0.0002, |
| "reward": 0.2544400542974472, |
| "reward_std": 0.7044170759618282, |
| "rewards/reward_func": 0.2544400542974472, |
| "step": 4824 |
| }, |
| { |
| "completion_length": 181.1953125, |
| "epoch": 0.6465944065301753, |
| "grad_norm": 2.625, |
| "kl": 0.00468209947575815, |
| "learning_rate": 3.534055934698247e-07, |
| "loss": 0.0002, |
| "reward": 0.19006637297570705, |
| "reward_std": 0.4856133693829179, |
| "rewards/reward_func": 0.19006637297570705, |
| "step": 4832 |
| }, |
| { |
| "completion_length": 148.046875, |
| "epoch": 0.6476649270707882, |
| "grad_norm": 4.5625, |
| "kl": 0.00532404551631771, |
| "learning_rate": 3.5233507292921183e-07, |
| "loss": 0.0002, |
| "reward": 0.5334251541644335, |
| "reward_std": 0.469427278265357, |
| "rewards/reward_func": 0.5334251541644335, |
| "step": 4840 |
| }, |
| { |
| "completion_length": 156.171875, |
| "epoch": 0.648735447611401, |
| "grad_norm": 5.0625, |
| "kl": 0.005237195349764079, |
| "learning_rate": 3.5126455238859895e-07, |
| "loss": 0.0002, |
| "reward": 0.33213027007877827, |
| "reward_std": 0.43815805949270725, |
| "rewards/reward_func": 0.33213027007877827, |
| "step": 4848 |
| }, |
| { |
| "completion_length": 124.3359375, |
| "epoch": 0.649805968152014, |
| "grad_norm": 3.890625, |
| "kl": 0.00520428063464351, |
| "learning_rate": 3.5019403184798607e-07, |
| "loss": 0.0002, |
| "reward": 0.6764433234930038, |
| "reward_std": 0.30982979480177164, |
| "rewards/reward_func": 0.6764433234930038, |
| "step": 4856 |
| }, |
| { |
| "completion_length": 148.2421875, |
| "epoch": 0.6508764886926268, |
| "grad_norm": 3.9375, |
| "kl": 0.005578657321166247, |
| "learning_rate": 3.4912351130737324e-07, |
| "loss": 0.0002, |
| "reward": 0.10781971551477909, |
| "reward_std": 0.4713937286287546, |
| "rewards/reward_func": 0.10781971551477909, |
| "step": 4864 |
| }, |
| { |
| "completion_length": 171.9453125, |
| "epoch": 0.6519470092332397, |
| "grad_norm": 7.0625, |
| "kl": 0.004915560552035458, |
| "learning_rate": 3.480529907667603e-07, |
| "loss": 0.0002, |
| "reward": 0.25978787057101727, |
| "reward_std": 0.534579697996378, |
| "rewards/reward_func": 0.25978787057101727, |
| "step": 4872 |
| }, |
| { |
| "completion_length": 173.890625, |
| "epoch": 0.6530175297738525, |
| "grad_norm": 3.578125, |
| "kl": 0.004566041403450072, |
| "learning_rate": 3.469824702261475e-07, |
| "loss": 0.0002, |
| "reward": 0.01513567566871643, |
| "reward_std": 0.39855797588825226, |
| "rewards/reward_func": 0.01513567566871643, |
| "step": 4880 |
| }, |
| { |
| "completion_length": 155.9921875, |
| "epoch": 0.6540880503144654, |
| "grad_norm": 2.90625, |
| "kl": 0.005055926798377186, |
| "learning_rate": 3.4591194968553456e-07, |
| "loss": 0.0002, |
| "reward": 0.29762477427721024, |
| "reward_std": 0.4921109788119793, |
| "rewards/reward_func": 0.29762477427721024, |
| "step": 4888 |
| }, |
| { |
| "completion_length": 183.1875, |
| "epoch": 0.6551585708550783, |
| "grad_norm": 3.1875, |
| "kl": 0.0038829974364489317, |
| "learning_rate": 3.448414291449217e-07, |
| "loss": 0.0002, |
| "reward": 0.15376039780676365, |
| "reward_std": 0.6010817158967257, |
| "rewards/reward_func": 0.15376039780676365, |
| "step": 4896 |
| }, |
| { |
| "completion_length": 143.6640625, |
| "epoch": 0.6562290913956912, |
| "grad_norm": 4.25, |
| "kl": 0.005372069776058197, |
| "learning_rate": 3.4377090860430885e-07, |
| "loss": 0.0002, |
| "reward": 0.45785857178270817, |
| "reward_std": 0.5780720338225365, |
| "rewards/reward_func": 0.45785857178270817, |
| "step": 4904 |
| }, |
| { |
| "completion_length": 170.703125, |
| "epoch": 0.657299611936304, |
| "grad_norm": 3.5, |
| "kl": 0.004123226040974259, |
| "learning_rate": 3.427003880636959e-07, |
| "loss": 0.0002, |
| "reward": 0.3207322843372822, |
| "reward_std": 0.5435153748840094, |
| "rewards/reward_func": 0.3207322843372822, |
| "step": 4912 |
| }, |
| { |
| "completion_length": 170.8515625, |
| "epoch": 0.6583701324769169, |
| "grad_norm": 2.296875, |
| "kl": 0.004442213830770925, |
| "learning_rate": 3.416298675230831e-07, |
| "loss": 0.0002, |
| "reward": 0.2148810252547264, |
| "reward_std": 0.3938889876008034, |
| "rewards/reward_func": 0.2148810252547264, |
| "step": 4920 |
| }, |
| { |
| "completion_length": 172.0546875, |
| "epoch": 0.6594406530175297, |
| "grad_norm": 3.515625, |
| "kl": 0.004354791803052649, |
| "learning_rate": 3.4055934698247016e-07, |
| "loss": 0.0002, |
| "reward": 0.1703500747680664, |
| "reward_std": 0.6057061813771725, |
| "rewards/reward_func": 0.1703500747680664, |
| "step": 4928 |
| }, |
| { |
| "completion_length": 175.7421875, |
| "epoch": 0.6605111735581426, |
| "grad_norm": 3.875, |
| "kl": 0.003823021659627557, |
| "learning_rate": 3.3948882644185734e-07, |
| "loss": 0.0002, |
| "reward": 0.4399567134678364, |
| "reward_std": 0.2992268856614828, |
| "rewards/reward_func": 0.4399567134678364, |
| "step": 4936 |
| }, |
| { |
| "completion_length": 168.5390625, |
| "epoch": 0.6615816940987556, |
| "grad_norm": 4.90625, |
| "kl": 0.0046115216973703355, |
| "learning_rate": 3.3841830590124446e-07, |
| "loss": 0.0002, |
| "reward": 0.21755497064441442, |
| "reward_std": 0.6609915122389793, |
| "rewards/reward_func": 0.21755497064441442, |
| "step": 4944 |
| }, |
| { |
| "completion_length": 152.484375, |
| "epoch": 0.6626522146393684, |
| "grad_norm": 4.125, |
| "kl": 0.004345663794083521, |
| "learning_rate": 3.373477853606316e-07, |
| "loss": 0.0002, |
| "reward": 0.5310599412769079, |
| "reward_std": 0.5352654401212931, |
| "rewards/reward_func": 0.5310599412769079, |
| "step": 4952 |
| }, |
| { |
| "completion_length": 147.4140625, |
| "epoch": 0.6637227351799813, |
| "grad_norm": 4.28125, |
| "kl": 0.004903295426629484, |
| "learning_rate": 3.362772648200187e-07, |
| "loss": 0.0002, |
| "reward": 0.47527459636330605, |
| "reward_std": 0.4394548684358597, |
| "rewards/reward_func": 0.47527459636330605, |
| "step": 4960 |
| }, |
| { |
| "completion_length": 128.90625, |
| "epoch": 0.6647932557205941, |
| "grad_norm": 4.59375, |
| "kl": 0.006024273345246911, |
| "learning_rate": 3.352067442794059e-07, |
| "loss": 0.0002, |
| "reward": 0.2654110789299011, |
| "reward_std": 0.5651892945170403, |
| "rewards/reward_func": 0.2654110789299011, |
| "step": 4968 |
| }, |
| { |
| "completion_length": 148.0546875, |
| "epoch": 0.665863776261207, |
| "grad_norm": 3.53125, |
| "kl": 0.004379941092338413, |
| "learning_rate": 3.3413622373879295e-07, |
| "loss": 0.0002, |
| "reward": 0.5237122774124146, |
| "reward_std": 0.5490029491484165, |
| "rewards/reward_func": 0.5237122774124146, |
| "step": 4976 |
| }, |
| { |
| "completion_length": 191.8046875, |
| "epoch": 0.6669342968018199, |
| "grad_norm": 3.09375, |
| "kl": 0.004273373109754175, |
| "learning_rate": 3.330657031981801e-07, |
| "loss": 0.0002, |
| "reward": 0.47103837318718433, |
| "reward_std": 0.45740975998342037, |
| "rewards/reward_func": 0.47103837318718433, |
| "step": 4984 |
| }, |
| { |
| "completion_length": 173.75, |
| "epoch": 0.6680048173424328, |
| "grad_norm": 2.796875, |
| "kl": 0.004224992386298254, |
| "learning_rate": 3.319951826575672e-07, |
| "loss": 0.0002, |
| "reward": 0.33773920126259327, |
| "reward_std": 0.6048417650163174, |
| "rewards/reward_func": 0.33773920126259327, |
| "step": 4992 |
| }, |
| { |
| "completion_length": 150.1015625, |
| "epoch": 0.6690753378830456, |
| "grad_norm": 5.6875, |
| "kl": 0.004986172774806619, |
| "learning_rate": 3.3092466211695436e-07, |
| "loss": 0.0002, |
| "reward": 0.26249578036367893, |
| "reward_std": 0.49397554993629456, |
| "rewards/reward_func": 0.26249578036367893, |
| "step": 5000 |
| }, |
| { |
| "completion_length": 168.59375, |
| "epoch": 0.6701458584236585, |
| "grad_norm": 3.59375, |
| "kl": 0.00485606407164596, |
| "learning_rate": 3.298541415763415e-07, |
| "loss": 0.0002, |
| "reward": 0.1886943932622671, |
| "reward_std": 0.6403144299983978, |
| "rewards/reward_func": 0.1886943932622671, |
| "step": 5008 |
| }, |
| { |
| "completion_length": 169.3125, |
| "epoch": 0.6712163789642713, |
| "grad_norm": 4.3125, |
| "kl": 0.004554765066131949, |
| "learning_rate": 3.287836210357286e-07, |
| "loss": 0.0002, |
| "reward": 0.20580013655126095, |
| "reward_std": 0.5662092342972755, |
| "rewards/reward_func": 0.20580013655126095, |
| "step": 5016 |
| }, |
| { |
| "completion_length": 159.5390625, |
| "epoch": 0.6722868995048843, |
| "grad_norm": 5.46875, |
| "kl": 0.004217549023451284, |
| "learning_rate": 3.2771310049511573e-07, |
| "loss": 0.0002, |
| "reward": 0.4258319865912199, |
| "reward_std": 0.5163000021129847, |
| "rewards/reward_func": 0.4258319865912199, |
| "step": 5024 |
| }, |
| { |
| "completion_length": 171.4140625, |
| "epoch": 0.6733574200454971, |
| "grad_norm": 3.53125, |
| "kl": 0.004711526213213801, |
| "learning_rate": 3.2664257995450285e-07, |
| "loss": 0.0002, |
| "reward": 0.12669032951816916, |
| "reward_std": 0.6522959657013416, |
| "rewards/reward_func": 0.12669032951816916, |
| "step": 5032 |
| }, |
| { |
| "completion_length": 143.515625, |
| "epoch": 0.67442794058611, |
| "grad_norm": 3.6875, |
| "kl": 0.005328927683876827, |
| "learning_rate": 3.2557205941388997e-07, |
| "loss": 0.0002, |
| "reward": 0.25894895382225513, |
| "reward_std": 0.6157816741615534, |
| "rewards/reward_func": 0.25894895382225513, |
| "step": 5040 |
| }, |
| { |
| "completion_length": 169.09375, |
| "epoch": 0.6754984611267228, |
| "grad_norm": 4.15625, |
| "kl": 0.004591718839947134, |
| "learning_rate": 3.2450153887327715e-07, |
| "loss": 0.0002, |
| "reward": 0.1223931759595871, |
| "reward_std": 0.7310500293970108, |
| "rewards/reward_func": 0.1223931759595871, |
| "step": 5048 |
| }, |
| { |
| "completion_length": 158.8984375, |
| "epoch": 0.6765689816673357, |
| "grad_norm": 4.375, |
| "kl": 0.00482406112132594, |
| "learning_rate": 3.234310183326642e-07, |
| "loss": 0.0002, |
| "reward": 0.30884232465177774, |
| "reward_std": 0.5993989063426852, |
| "rewards/reward_func": 0.30884232465177774, |
| "step": 5056 |
| }, |
| { |
| "completion_length": 153.9921875, |
| "epoch": 0.6776395022079487, |
| "grad_norm": 6.84375, |
| "kl": 0.0044682007865048945, |
| "learning_rate": 3.223604977920514e-07, |
| "loss": 0.0002, |
| "reward": 0.23793572932481766, |
| "reward_std": 0.47554378490895033, |
| "rewards/reward_func": 0.23793572932481766, |
| "step": 5064 |
| }, |
| { |
| "completion_length": 171.5390625, |
| "epoch": 0.6787100227485615, |
| "grad_norm": 6.90625, |
| "kl": 0.0044736934069078416, |
| "learning_rate": 3.2128997725143846e-07, |
| "loss": 0.0002, |
| "reward": 0.37867590319365263, |
| "reward_std": 0.49610742926597595, |
| "rewards/reward_func": 0.37867590319365263, |
| "step": 5072 |
| }, |
| { |
| "completion_length": 148.2890625, |
| "epoch": 0.6797805432891744, |
| "grad_norm": 4.625, |
| "kl": 0.004754859022796154, |
| "learning_rate": 3.2021945671082563e-07, |
| "loss": 0.0002, |
| "reward": 0.517847141250968, |
| "reward_std": 0.5063638836145401, |
| "rewards/reward_func": 0.517847141250968, |
| "step": 5080 |
| }, |
| { |
| "completion_length": 156.71875, |
| "epoch": 0.6808510638297872, |
| "grad_norm": 5.59375, |
| "kl": 0.005674656480550766, |
| "learning_rate": 3.1914893617021275e-07, |
| "loss": 0.0002, |
| "reward": 0.34883139841258526, |
| "reward_std": 0.33303822576999664, |
| "rewards/reward_func": 0.34883139841258526, |
| "step": 5088 |
| }, |
| { |
| "completion_length": 178.890625, |
| "epoch": 0.6819215843704001, |
| "grad_norm": 3.40625, |
| "kl": 0.0046264427655842155, |
| "learning_rate": 3.180784156295999e-07, |
| "loss": 0.0002, |
| "reward": 0.47927757538855076, |
| "reward_std": 0.5571104716509581, |
| "rewards/reward_func": 0.47927757538855076, |
| "step": 5096 |
| }, |
| { |
| "completion_length": 144.3359375, |
| "epoch": 0.682992104911013, |
| "grad_norm": 3.890625, |
| "kl": 0.004523319890722632, |
| "learning_rate": 3.17007895088987e-07, |
| "loss": 0.0002, |
| "reward": 0.34390855580568314, |
| "reward_std": 0.5760727934539318, |
| "rewards/reward_func": 0.34390855580568314, |
| "step": 5104 |
| }, |
| { |
| "completion_length": 159.6484375, |
| "epoch": 0.6840626254516259, |
| "grad_norm": 4.65625, |
| "kl": 0.004729281121399254, |
| "learning_rate": 3.1593737454837417e-07, |
| "loss": 0.0002, |
| "reward": 0.38299885392189026, |
| "reward_std": 0.3037977972999215, |
| "rewards/reward_func": 0.38299885392189026, |
| "step": 5112 |
| }, |
| { |
| "completion_length": 150.53125, |
| "epoch": 0.6851331459922387, |
| "grad_norm": 2.96875, |
| "kl": 0.005811055249068886, |
| "learning_rate": 3.1486685400776124e-07, |
| "loss": 0.0002, |
| "reward": 0.4124348498880863, |
| "reward_std": 0.5133458133786917, |
| "rewards/reward_func": 0.4124348498880863, |
| "step": 5120 |
| }, |
| { |
| "completion_length": 146.6875, |
| "epoch": 0.6862036665328516, |
| "grad_norm": 5.15625, |
| "kl": 0.004858777509070933, |
| "learning_rate": 3.137963334671484e-07, |
| "loss": 0.0002, |
| "reward": 0.1230292096734047, |
| "reward_std": 0.4463986298069358, |
| "rewards/reward_func": 0.1230292096734047, |
| "step": 5128 |
| }, |
| { |
| "completion_length": 160.390625, |
| "epoch": 0.6872741870734644, |
| "grad_norm": 2.96875, |
| "kl": 0.004541641887044534, |
| "learning_rate": 3.127258129265355e-07, |
| "loss": 0.0002, |
| "reward": 0.05217524245381355, |
| "reward_std": 0.45026756450533867, |
| "rewards/reward_func": 0.05217524245381355, |
| "step": 5136 |
| }, |
| { |
| "completion_length": 144.0703125, |
| "epoch": 0.6883447076140774, |
| "grad_norm": 8.6875, |
| "kl": 0.005810694128740579, |
| "learning_rate": 3.1165529238592266e-07, |
| "loss": 0.0002, |
| "reward": 0.31892623007297516, |
| "reward_std": 0.4961309377104044, |
| "rewards/reward_func": 0.31892623007297516, |
| "step": 5144 |
| }, |
| { |
| "completion_length": 202.375, |
| "epoch": 0.6894152281546903, |
| "grad_norm": 3.125, |
| "kl": 0.004103525396203622, |
| "learning_rate": 3.105847718453098e-07, |
| "loss": 0.0002, |
| "reward": 0.35768837202340364, |
| "reward_std": 0.5502582993358374, |
| "rewards/reward_func": 0.35768837202340364, |
| "step": 5152 |
| }, |
| { |
| "completion_length": 173.1484375, |
| "epoch": 0.6904857486953031, |
| "grad_norm": 3.40625, |
| "kl": 0.004345653491327539, |
| "learning_rate": 3.095142513046969e-07, |
| "loss": 0.0002, |
| "reward": 0.30987947806715965, |
| "reward_std": 0.5077685210853815, |
| "rewards/reward_func": 0.30987947806715965, |
| "step": 5160 |
| }, |
| { |
| "completion_length": 176.890625, |
| "epoch": 0.691556269235916, |
| "grad_norm": 3.515625, |
| "kl": 0.0047625836450606585, |
| "learning_rate": 3.08443730764084e-07, |
| "loss": 0.0002, |
| "reward": 0.37025075126439333, |
| "reward_std": 0.47811376582831144, |
| "rewards/reward_func": 0.37025075126439333, |
| "step": 5168 |
| }, |
| { |
| "completion_length": 156.3125, |
| "epoch": 0.6926267897765288, |
| "grad_norm": 3.375, |
| "kl": 0.004461723641725257, |
| "learning_rate": 3.0737321022347114e-07, |
| "loss": 0.0002, |
| "reward": 0.4771025739610195, |
| "reward_std": 0.4133305884897709, |
| "rewards/reward_func": 0.4771025739610195, |
| "step": 5176 |
| }, |
| { |
| "completion_length": 175.3359375, |
| "epoch": 0.6936973103171417, |
| "grad_norm": 3.71875, |
| "kl": 0.004555776889901608, |
| "learning_rate": 3.0630268968285827e-07, |
| "loss": 0.0002, |
| "reward": 0.21876542083919048, |
| "reward_std": 0.5979834999889135, |
| "rewards/reward_func": 0.21876542083919048, |
| "step": 5184 |
| }, |
| { |
| "completion_length": 143.2734375, |
| "epoch": 0.6947678308577546, |
| "grad_norm": 4.1875, |
| "kl": 0.006049849558621645, |
| "learning_rate": 3.0523216914224544e-07, |
| "loss": 0.0002, |
| "reward": 0.3804114758968353, |
| "reward_std": 0.43962680641561747, |
| "rewards/reward_func": 0.3804114758968353, |
| "step": 5192 |
| }, |
| { |
| "completion_length": 176.09375, |
| "epoch": 0.6958383513983675, |
| "grad_norm": 4.1875, |
| "kl": 0.004219004331389442, |
| "learning_rate": 3.041616486016325e-07, |
| "loss": 0.0002, |
| "reward": 0.033823274075984955, |
| "reward_std": 0.5529468916356564, |
| "rewards/reward_func": 0.033823274075984955, |
| "step": 5200 |
| }, |
| { |
| "completion_length": 167.1328125, |
| "epoch": 0.6969088719389803, |
| "grad_norm": 3.453125, |
| "kl": 0.0044156058866064996, |
| "learning_rate": 3.030911280610197e-07, |
| "loss": 0.0002, |
| "reward": 0.35997615940868855, |
| "reward_std": 0.6205689832568169, |
| "rewards/reward_func": 0.35997615940868855, |
| "step": 5208 |
| }, |
| { |
| "completion_length": 134.3515625, |
| "epoch": 0.6979793924795932, |
| "grad_norm": 6.03125, |
| "kl": 0.005597625044174492, |
| "learning_rate": 3.020206075204068e-07, |
| "loss": 0.0002, |
| "reward": 0.5491457581520081, |
| "reward_std": 0.5092198746278882, |
| "rewards/reward_func": 0.5491457581520081, |
| "step": 5216 |
| }, |
| { |
| "completion_length": 161.3515625, |
| "epoch": 0.699049913020206, |
| "grad_norm": 2.984375, |
| "kl": 0.005356652429327369, |
| "learning_rate": 3.009500869797939e-07, |
| "loss": 0.0002, |
| "reward": 0.4664277071133256, |
| "reward_std": 0.5567853916436434, |
| "rewards/reward_func": 0.4664277071133256, |
| "step": 5224 |
| }, |
| { |
| "completion_length": 169.203125, |
| "epoch": 0.700120433560819, |
| "grad_norm": 4.1875, |
| "kl": 0.0043607138795778155, |
| "learning_rate": 2.9987956643918105e-07, |
| "loss": 0.0002, |
| "reward": 0.34521481581032276, |
| "reward_std": 0.6393520161509514, |
| "rewards/reward_func": 0.34521481581032276, |
| "step": 5232 |
| }, |
| { |
| "completion_length": 176.28125, |
| "epoch": 0.7011909541014318, |
| "grad_norm": 3.921875, |
| "kl": 0.004437842464540154, |
| "learning_rate": 2.9880904589856817e-07, |
| "loss": 0.0002, |
| "reward": -0.07886990532279015, |
| "reward_std": 0.6460573114454746, |
| "rewards/reward_func": -0.07886990532279015, |
| "step": 5240 |
| }, |
| { |
| "completion_length": 212.4375, |
| "epoch": 0.7022614746420447, |
| "grad_norm": 3.640625, |
| "kl": 0.004008949821582064, |
| "learning_rate": 2.977385253579553e-07, |
| "loss": 0.0002, |
| "reward": 0.012970509007573128, |
| "reward_std": 0.5811912510544062, |
| "rewards/reward_func": 0.012970509007573128, |
| "step": 5248 |
| }, |
| { |
| "completion_length": 182.1328125, |
| "epoch": 0.7033319951826575, |
| "grad_norm": 4.4375, |
| "kl": 0.004463888035388663, |
| "learning_rate": 2.9666800481734247e-07, |
| "loss": 0.0002, |
| "reward": 0.295044606551528, |
| "reward_std": 0.5268499422818422, |
| "rewards/reward_func": 0.295044606551528, |
| "step": 5256 |
| }, |
| { |
| "completion_length": 158.8515625, |
| "epoch": 0.7044025157232704, |
| "grad_norm": 3.796875, |
| "kl": 0.005019562435336411, |
| "learning_rate": 2.9559748427672953e-07, |
| "loss": 0.0002, |
| "reward": 0.2960619358345866, |
| "reward_std": 0.5362240988761187, |
| "rewards/reward_func": 0.2960619358345866, |
| "step": 5264 |
| }, |
| { |
| "completion_length": 158.328125, |
| "epoch": 0.7054730362638834, |
| "grad_norm": 4.40625, |
| "kl": 0.004566931165754795, |
| "learning_rate": 2.945269637361167e-07, |
| "loss": 0.0002, |
| "reward": 0.5046307481825352, |
| "reward_std": 0.45363772846758366, |
| "rewards/reward_func": 0.5046307481825352, |
| "step": 5272 |
| }, |
| { |
| "completion_length": 184.515625, |
| "epoch": 0.7065435568044962, |
| "grad_norm": 7.125, |
| "kl": 0.004289998818421736, |
| "learning_rate": 2.934564431955038e-07, |
| "loss": 0.0002, |
| "reward": 0.4870417043566704, |
| "reward_std": 0.4741673758253455, |
| "rewards/reward_func": 0.4870417043566704, |
| "step": 5280 |
| }, |
| { |
| "completion_length": 161.359375, |
| "epoch": 0.7076140773451091, |
| "grad_norm": 5.53125, |
| "kl": 0.0042855191277340055, |
| "learning_rate": 2.9238592265489095e-07, |
| "loss": 0.0002, |
| "reward": 0.37416786467656493, |
| "reward_std": 0.5148907378315926, |
| "rewards/reward_func": 0.37416786467656493, |
| "step": 5288 |
| }, |
| { |
| "completion_length": 159.1484375, |
| "epoch": 0.7086845978857219, |
| "grad_norm": 4.03125, |
| "kl": 0.005468921910505742, |
| "learning_rate": 2.9131540211427807e-07, |
| "loss": 0.0002, |
| "reward": 0.2032206254079938, |
| "reward_std": 0.5835869964212179, |
| "rewards/reward_func": 0.2032206254079938, |
| "step": 5296 |
| }, |
| { |
| "completion_length": 178.046875, |
| "epoch": 0.7097551184263348, |
| "grad_norm": 5.46875, |
| "kl": 0.004879669373622164, |
| "learning_rate": 2.9024488157366514e-07, |
| "loss": 0.0002, |
| "reward": 0.07407154329121113, |
| "reward_std": 0.5483472738415003, |
| "rewards/reward_func": 0.07407154329121113, |
| "step": 5304 |
| }, |
| { |
| "completion_length": 172.4296875, |
| "epoch": 0.7108256389669477, |
| "grad_norm": 3.078125, |
| "kl": 0.004936008132062852, |
| "learning_rate": 2.891743610330523e-07, |
| "loss": 0.0002, |
| "reward": 0.1570496652275324, |
| "reward_std": 0.6552108749747276, |
| "rewards/reward_func": 0.1570496652275324, |
| "step": 5312 |
| }, |
| { |
| "completion_length": 183.265625, |
| "epoch": 0.7118961595075606, |
| "grad_norm": 3.15625, |
| "kl": 0.004192218388197944, |
| "learning_rate": 2.881038404924394e-07, |
| "loss": 0.0002, |
| "reward": 0.2290868228301406, |
| "reward_std": 0.6626240387558937, |
| "rewards/reward_func": 0.2290868228301406, |
| "step": 5320 |
| }, |
| { |
| "completion_length": 129.171875, |
| "epoch": 0.7129666800481734, |
| "grad_norm": 3.859375, |
| "kl": 0.00642680426244624, |
| "learning_rate": 2.8703331995182656e-07, |
| "loss": 0.0003, |
| "reward": 0.395254772156477, |
| "reward_std": 0.4721956867724657, |
| "rewards/reward_func": 0.395254772156477, |
| "step": 5328 |
| }, |
| { |
| "completion_length": 157.6640625, |
| "epoch": 0.7140372005887863, |
| "grad_norm": 3.234375, |
| "kl": 0.004553045437205583, |
| "learning_rate": 2.859627994112137e-07, |
| "loss": 0.0002, |
| "reward": 0.44277836102992296, |
| "reward_std": 0.5288186706602573, |
| "rewards/reward_func": 0.44277836102992296, |
| "step": 5336 |
| }, |
| { |
| "completion_length": 174.7578125, |
| "epoch": 0.7151077211293991, |
| "grad_norm": 3.578125, |
| "kl": 0.004835324827581644, |
| "learning_rate": 2.848922788706008e-07, |
| "loss": 0.0002, |
| "reward": 0.2129112258553505, |
| "reward_std": 0.518157972022891, |
| "rewards/reward_func": 0.2129112258553505, |
| "step": 5344 |
| }, |
| { |
| "completion_length": 183.0625, |
| "epoch": 0.7161782416700121, |
| "grad_norm": 4.40625, |
| "kl": 0.004699339595390484, |
| "learning_rate": 2.838217583299879e-07, |
| "loss": 0.0002, |
| "reward": -0.12015869608148932, |
| "reward_std": 0.6651497483253479, |
| "rewards/reward_func": -0.12015869608148932, |
| "step": 5352 |
| }, |
| { |
| "completion_length": 167.125, |
| "epoch": 0.717248762210625, |
| "grad_norm": 3.8125, |
| "kl": 0.004674197465647012, |
| "learning_rate": 2.827512377893751e-07, |
| "loss": 0.0002, |
| "reward": 0.08481440320611, |
| "reward_std": 0.6426707338541746, |
| "rewards/reward_func": 0.08481440320611, |
| "step": 5360 |
| }, |
| { |
| "completion_length": 187.109375, |
| "epoch": 0.7183192827512378, |
| "grad_norm": 3.359375, |
| "kl": 0.004370440146885812, |
| "learning_rate": 2.8168071724876217e-07, |
| "loss": 0.0002, |
| "reward": 0.10261328518390656, |
| "reward_std": 0.33446657191962004, |
| "rewards/reward_func": 0.10261328518390656, |
| "step": 5368 |
| }, |
| { |
| "completion_length": 169.8828125, |
| "epoch": 0.7193898032918506, |
| "grad_norm": 4.46875, |
| "kl": 0.005003685160772875, |
| "learning_rate": 2.8061019670814934e-07, |
| "loss": 0.0002, |
| "reward": 0.35325085651129484, |
| "reward_std": 0.5368635784834623, |
| "rewards/reward_func": 0.35325085651129484, |
| "step": 5376 |
| }, |
| { |
| "completion_length": 127.6328125, |
| "epoch": 0.7204603238324635, |
| "grad_norm": 5.0, |
| "kl": 0.006041952816303819, |
| "learning_rate": 2.795396761675364e-07, |
| "loss": 0.0002, |
| "reward": 0.31289495434612036, |
| "reward_std": 0.5351240076124668, |
| "rewards/reward_func": 0.31289495434612036, |
| "step": 5384 |
| }, |
| { |
| "completion_length": 132.1484375, |
| "epoch": 0.7215308443730765, |
| "grad_norm": 3.890625, |
| "kl": 0.004856948013184592, |
| "learning_rate": 2.784691556269236e-07, |
| "loss": 0.0002, |
| "reward": 0.5194568559527397, |
| "reward_std": 0.4919391795992851, |
| "rewards/reward_func": 0.5194568559527397, |
| "step": 5392 |
| }, |
| { |
| "completion_length": 172.6796875, |
| "epoch": 0.7226013649136893, |
| "grad_norm": 3.71875, |
| "kl": 0.004867620766162872, |
| "learning_rate": 2.773986350863107e-07, |
| "loss": 0.0002, |
| "reward": 0.1840124912559986, |
| "reward_std": 0.6040789932012558, |
| "rewards/reward_func": 0.1840124912559986, |
| "step": 5400 |
| }, |
| { |
| "completion_length": 162.0078125, |
| "epoch": 0.7236718854543022, |
| "grad_norm": 5.4375, |
| "kl": 0.0043890359229408205, |
| "learning_rate": 2.7632811454569783e-07, |
| "loss": 0.0002, |
| "reward": 0.40340816229581833, |
| "reward_std": 0.46000672224909067, |
| "rewards/reward_func": 0.40340816229581833, |
| "step": 5408 |
| }, |
| { |
| "completion_length": 178.796875, |
| "epoch": 0.724742405994915, |
| "grad_norm": 3.671875, |
| "kl": 0.004453314671991393, |
| "learning_rate": 2.7525759400508495e-07, |
| "loss": 0.0002, |
| "reward": 0.1911243163049221, |
| "reward_std": 0.5930454572662711, |
| "rewards/reward_func": 0.1911243163049221, |
| "step": 5416 |
| }, |
| { |
| "completion_length": 166.296875, |
| "epoch": 0.7258129265355279, |
| "grad_norm": 3.890625, |
| "kl": 0.004950450966134667, |
| "learning_rate": 2.7418707346447207e-07, |
| "loss": 0.0002, |
| "reward": 0.2432717476040125, |
| "reward_std": 0.4679036773741245, |
| "rewards/reward_func": 0.2432717476040125, |
| "step": 5424 |
| }, |
| { |
| "completion_length": 204.453125, |
| "epoch": 0.7268834470761407, |
| "grad_norm": 3.6875, |
| "kl": 0.004144096135860309, |
| "learning_rate": 2.731165529238592e-07, |
| "loss": 0.0002, |
| "reward": -0.028832857497036457, |
| "reward_std": 0.5423443503677845, |
| "rewards/reward_func": -0.028832857497036457, |
| "step": 5432 |
| }, |
| { |
| "completion_length": 144.3671875, |
| "epoch": 0.7279539676167537, |
| "grad_norm": 5.46875, |
| "kl": 0.0068962293735239655, |
| "learning_rate": 2.7204603238324637e-07, |
| "loss": 0.0003, |
| "reward": 0.5656752809882164, |
| "reward_std": 0.37680432945489883, |
| "rewards/reward_func": 0.5656752809882164, |
| "step": 5440 |
| }, |
| { |
| "completion_length": 156.8203125, |
| "epoch": 0.7290244881573665, |
| "grad_norm": 3.21875, |
| "kl": 0.005207971204072237, |
| "learning_rate": 2.7097551184263344e-07, |
| "loss": 0.0002, |
| "reward": 0.46704378351569176, |
| "reward_std": 0.4321159301325679, |
| "rewards/reward_func": 0.46704378351569176, |
| "step": 5448 |
| }, |
| { |
| "completion_length": 170.1953125, |
| "epoch": 0.7300950086979794, |
| "grad_norm": 4.28125, |
| "kl": 0.004335955512942746, |
| "learning_rate": 2.699049913020206e-07, |
| "loss": 0.0002, |
| "reward": 0.15294395573437214, |
| "reward_std": 0.6653651669621468, |
| "rewards/reward_func": 0.15294395573437214, |
| "step": 5456 |
| }, |
| { |
| "completion_length": 149.6015625, |
| "epoch": 0.7311655292385922, |
| "grad_norm": 3.90625, |
| "kl": 0.005089007405331358, |
| "learning_rate": 2.6883447076140773e-07, |
| "loss": 0.0002, |
| "reward": 0.2426714487373829, |
| "reward_std": 0.48969776928424835, |
| "rewards/reward_func": 0.2426714487373829, |
| "step": 5464 |
| }, |
| { |
| "completion_length": 171.625, |
| "epoch": 0.7322360497792051, |
| "grad_norm": 3.03125, |
| "kl": 0.004394051560666412, |
| "learning_rate": 2.6776395022079485e-07, |
| "loss": 0.0002, |
| "reward": 0.10768201760947704, |
| "reward_std": 0.4550578175112605, |
| "rewards/reward_func": 0.10768201760947704, |
| "step": 5472 |
| }, |
| { |
| "completion_length": 159.9609375, |
| "epoch": 0.733306570319818, |
| "grad_norm": 3.59375, |
| "kl": 0.005018858646508306, |
| "learning_rate": 2.66693429680182e-07, |
| "loss": 0.0002, |
| "reward": 0.2529556443914771, |
| "reward_std": 0.6179038770496845, |
| "rewards/reward_func": 0.2529556443914771, |
| "step": 5480 |
| }, |
| { |
| "completion_length": 154.8125, |
| "epoch": 0.7343770908604309, |
| "grad_norm": 5.0, |
| "kl": 0.005219785525696352, |
| "learning_rate": 2.656229091395691e-07, |
| "loss": 0.0002, |
| "reward": 0.3117452962324023, |
| "reward_std": 0.5476666176691651, |
| "rewards/reward_func": 0.3117452962324023, |
| "step": 5488 |
| }, |
| { |
| "completion_length": 140.671875, |
| "epoch": 0.7354476114010438, |
| "grad_norm": 4.78125, |
| "kl": 0.006368768343236297, |
| "learning_rate": 2.645523885989562e-07, |
| "loss": 0.0003, |
| "reward": 0.5569799374789, |
| "reward_std": 0.4139596875756979, |
| "rewards/reward_func": 0.5569799374789, |
| "step": 5496 |
| }, |
| { |
| "completion_length": 208.4375, |
| "epoch": 0.7365181319416566, |
| "grad_norm": 3.171875, |
| "kl": 0.004221481096465141, |
| "learning_rate": 2.634818680583434e-07, |
| "loss": 0.0002, |
| "reward": 0.13646352104842663, |
| "reward_std": 0.674302838742733, |
| "rewards/reward_func": 0.13646352104842663, |
| "step": 5504 |
| }, |
| { |
| "completion_length": 177.7578125, |
| "epoch": 0.7375886524822695, |
| "grad_norm": 4.09375, |
| "kl": 0.004927775065880269, |
| "learning_rate": 2.6241134751773046e-07, |
| "loss": 0.0002, |
| "reward": 0.179019657894969, |
| "reward_std": 0.4836566299200058, |
| "rewards/reward_func": 0.179019657894969, |
| "step": 5512 |
| }, |
| { |
| "completion_length": 168.359375, |
| "epoch": 0.7386591730228824, |
| "grad_norm": 3.71875, |
| "kl": 0.004563187627354637, |
| "learning_rate": 2.6134082697711764e-07, |
| "loss": 0.0002, |
| "reward": 0.0944369975477457, |
| "reward_std": 0.6901743151247501, |
| "rewards/reward_func": 0.0944369975477457, |
| "step": 5520 |
| }, |
| { |
| "completion_length": 141.9765625, |
| "epoch": 0.7397296935634953, |
| "grad_norm": 4.5, |
| "kl": 0.005385736672906205, |
| "learning_rate": 2.602703064365047e-07, |
| "loss": 0.0002, |
| "reward": 0.3004543990828097, |
| "reward_std": 0.6421327739953995, |
| "rewards/reward_func": 0.3004543990828097, |
| "step": 5528 |
| }, |
| { |
| "completion_length": 161.1015625, |
| "epoch": 0.7408002141041081, |
| "grad_norm": 4.53125, |
| "kl": 0.005307289626216516, |
| "learning_rate": 2.591997858958919e-07, |
| "loss": 0.0002, |
| "reward": 0.4302559047937393, |
| "reward_std": 0.296412231400609, |
| "rewards/reward_func": 0.4302559047937393, |
| "step": 5536 |
| }, |
| { |
| "completion_length": 160.15625, |
| "epoch": 0.741870734644721, |
| "grad_norm": 5.21875, |
| "kl": 0.00511023830040358, |
| "learning_rate": 2.58129265355279e-07, |
| "loss": 0.0002, |
| "reward": 0.39244108088314533, |
| "reward_std": 0.5584999155253172, |
| "rewards/reward_func": 0.39244108088314533, |
| "step": 5544 |
| }, |
| { |
| "completion_length": 161.0546875, |
| "epoch": 0.7429412551853338, |
| "grad_norm": 3.46875, |
| "kl": 0.004729041800601408, |
| "learning_rate": 2.570587448146661e-07, |
| "loss": 0.0002, |
| "reward": 0.30113553907722235, |
| "reward_std": 0.6211994774639606, |
| "rewards/reward_func": 0.30113553907722235, |
| "step": 5552 |
| }, |
| { |
| "completion_length": 176.578125, |
| "epoch": 0.7440117757259468, |
| "grad_norm": 3.28125, |
| "kl": 0.004635761812096462, |
| "learning_rate": 2.5598822427405324e-07, |
| "loss": 0.0002, |
| "reward": 0.3027530014514923, |
| "reward_std": 0.34483792912214994, |
| "rewards/reward_func": 0.3027530014514923, |
| "step": 5560 |
| }, |
| { |
| "completion_length": 159.375, |
| "epoch": 0.7450822962665596, |
| "grad_norm": 5.53125, |
| "kl": 0.005176402977667749, |
| "learning_rate": 2.5491770373344036e-07, |
| "loss": 0.0002, |
| "reward": 0.17298301681876183, |
| "reward_std": 0.584480419754982, |
| "rewards/reward_func": 0.17298301681876183, |
| "step": 5568 |
| }, |
| { |
| "completion_length": 157.3671875, |
| "epoch": 0.7461528168071725, |
| "grad_norm": 6.5625, |
| "kl": 0.005517173325642943, |
| "learning_rate": 2.538471831928275e-07, |
| "loss": 0.0002, |
| "reward": 0.17840459011495113, |
| "reward_std": 0.6545839756727219, |
| "rewards/reward_func": 0.17840459011495113, |
| "step": 5576 |
| }, |
| { |
| "completion_length": 163.765625, |
| "epoch": 0.7472233373477853, |
| "grad_norm": 3.296875, |
| "kl": 0.005615679023321718, |
| "learning_rate": 2.5277666265221466e-07, |
| "loss": 0.0002, |
| "reward": 0.3282418688759208, |
| "reward_std": 0.4674977771937847, |
| "rewards/reward_func": 0.3282418688759208, |
| "step": 5584 |
| }, |
| { |
| "completion_length": 199.1171875, |
| "epoch": 0.7482938578883982, |
| "grad_norm": 4.96875, |
| "kl": 0.004132435307838023, |
| "learning_rate": 2.5170614211160173e-07, |
| "loss": 0.0002, |
| "reward": 0.041125981137156487, |
| "reward_std": 0.5962537340819836, |
| "rewards/reward_func": 0.041125981137156487, |
| "step": 5592 |
| }, |
| { |
| "completion_length": 195.3828125, |
| "epoch": 0.7493643784290112, |
| "grad_norm": 4.0625, |
| "kl": 0.003978644759627059, |
| "learning_rate": 2.506356215709889e-07, |
| "loss": 0.0002, |
| "reward": 0.13140291906893253, |
| "reward_std": 0.44777560979127884, |
| "rewards/reward_func": 0.13140291906893253, |
| "step": 5600 |
| }, |
| { |
| "completion_length": 173.609375, |
| "epoch": 0.750434898969624, |
| "grad_norm": 2.796875, |
| "kl": 0.004251972888596356, |
| "learning_rate": 2.49565101030376e-07, |
| "loss": 0.0002, |
| "reward": 0.20013932138681412, |
| "reward_std": 0.6238753385841846, |
| "rewards/reward_func": 0.20013932138681412, |
| "step": 5608 |
| }, |
| { |
| "completion_length": 165.1328125, |
| "epoch": 0.7515054195102369, |
| "grad_norm": 7.53125, |
| "kl": 0.004290038690669462, |
| "learning_rate": 2.4849458048976315e-07, |
| "loss": 0.0002, |
| "reward": 0.2280603777617216, |
| "reward_std": 0.4963626991957426, |
| "rewards/reward_func": 0.2280603777617216, |
| "step": 5616 |
| }, |
| { |
| "completion_length": 141.671875, |
| "epoch": 0.7525759400508497, |
| "grad_norm": 4.46875, |
| "kl": 0.00585965282516554, |
| "learning_rate": 2.4742405994915027e-07, |
| "loss": 0.0002, |
| "reward": 0.4678545705974102, |
| "reward_std": 0.43725813180208206, |
| "rewards/reward_func": 0.4678545705974102, |
| "step": 5624 |
| }, |
| { |
| "completion_length": 160.421875, |
| "epoch": 0.7536464605914626, |
| "grad_norm": 4.96875, |
| "kl": 0.005591863940935582, |
| "learning_rate": 2.463535394085374e-07, |
| "loss": 0.0002, |
| "reward": 0.24596700817346573, |
| "reward_std": 0.4220298836007714, |
| "rewards/reward_func": 0.24596700817346573, |
| "step": 5632 |
| }, |
| { |
| "completion_length": 156.6171875, |
| "epoch": 0.7547169811320755, |
| "grad_norm": 3.171875, |
| "kl": 0.004576119041303173, |
| "learning_rate": 2.452830188679245e-07, |
| "loss": 0.0002, |
| "reward": 0.3924466483294964, |
| "reward_std": 0.6098343282938004, |
| "rewards/reward_func": 0.3924466483294964, |
| "step": 5640 |
| }, |
| { |
| "completion_length": 176.890625, |
| "epoch": 0.7557875016726884, |
| "grad_norm": 3.4375, |
| "kl": 0.003426549636060372, |
| "learning_rate": 2.4421249832731163e-07, |
| "loss": 0.0001, |
| "reward": 0.31396659277379513, |
| "reward_std": 0.507732754573226, |
| "rewards/reward_func": 0.31396659277379513, |
| "step": 5648 |
| }, |
| { |
| "completion_length": 155.109375, |
| "epoch": 0.7568580222133012, |
| "grad_norm": 5.0625, |
| "kl": 0.004432518238900229, |
| "learning_rate": 2.4314197778669875e-07, |
| "loss": 0.0002, |
| "reward": 0.3897492587566376, |
| "reward_std": 0.472976541146636, |
| "rewards/reward_func": 0.3897492587566376, |
| "step": 5656 |
| }, |
| { |
| "completion_length": 178.7421875, |
| "epoch": 0.7579285427539141, |
| "grad_norm": 1.96875, |
| "kl": 0.004251753707649186, |
| "learning_rate": 2.4207145724608593e-07, |
| "loss": 0.0002, |
| "reward": 0.08406687900424004, |
| "reward_std": 0.4810841968283057, |
| "rewards/reward_func": 0.08406687900424004, |
| "step": 5664 |
| }, |
| { |
| "completion_length": 166.5859375, |
| "epoch": 0.7589990632945269, |
| "grad_norm": 4.875, |
| "kl": 0.005223593441769481, |
| "learning_rate": 2.4100093670547305e-07, |
| "loss": 0.0002, |
| "reward": 0.3817774336785078, |
| "reward_std": 0.6594663038849831, |
| "rewards/reward_func": 0.3817774336785078, |
| "step": 5672 |
| }, |
| { |
| "completion_length": 161.3671875, |
| "epoch": 0.7600695838351398, |
| "grad_norm": 5.625, |
| "kl": 0.0044474324968177825, |
| "learning_rate": 2.3993041616486017e-07, |
| "loss": 0.0002, |
| "reward": 0.23454780131578445, |
| "reward_std": 0.37179601565003395, |
| "rewards/reward_func": 0.23454780131578445, |
| "step": 5680 |
| }, |
| { |
| "completion_length": 179.796875, |
| "epoch": 0.7611401043757527, |
| "grad_norm": 3.609375, |
| "kl": 0.00496278639184311, |
| "learning_rate": 2.388598956242473e-07, |
| "loss": 0.0002, |
| "reward": 0.10904507525265217, |
| "reward_std": 0.5533247627317905, |
| "rewards/reward_func": 0.10904507525265217, |
| "step": 5688 |
| }, |
| { |
| "completion_length": 160.4296875, |
| "epoch": 0.7622106249163656, |
| "grad_norm": 4.53125, |
| "kl": 0.005624369368888438, |
| "learning_rate": 2.3778937508363441e-07, |
| "loss": 0.0002, |
| "reward": 0.32307033240795135, |
| "reward_std": 0.3578721797093749, |
| "rewards/reward_func": 0.32307033240795135, |
| "step": 5696 |
| }, |
| { |
| "completion_length": 181.2578125, |
| "epoch": 0.7632811454569784, |
| "grad_norm": 3.09375, |
| "kl": 0.004423889273311943, |
| "learning_rate": 2.3671885454302154e-07, |
| "loss": 0.0002, |
| "reward": 0.2661805059760809, |
| "reward_std": 0.433091813698411, |
| "rewards/reward_func": 0.2661805059760809, |
| "step": 5704 |
| }, |
| { |
| "completion_length": 145.40625, |
| "epoch": 0.7643516659975913, |
| "grad_norm": 3.53125, |
| "kl": 0.005560883553698659, |
| "learning_rate": 2.3564833400240866e-07, |
| "loss": 0.0002, |
| "reward": 0.38319743797183037, |
| "reward_std": 0.5694666914641857, |
| "rewards/reward_func": 0.38319743797183037, |
| "step": 5712 |
| }, |
| { |
| "completion_length": 185.6640625, |
| "epoch": 0.7654221865382042, |
| "grad_norm": 3.375, |
| "kl": 0.004699640907347202, |
| "learning_rate": 2.3457781346179578e-07, |
| "loss": 0.0002, |
| "reward": 0.11784735321998596, |
| "reward_std": 0.5145326796919107, |
| "rewards/reward_func": 0.11784735321998596, |
| "step": 5720 |
| }, |
| { |
| "completion_length": 202.7265625, |
| "epoch": 0.7664927070788171, |
| "grad_norm": 3.65625, |
| "kl": 0.0042415427742525935, |
| "learning_rate": 2.335072929211829e-07, |
| "loss": 0.0002, |
| "reward": -0.14664648659527302, |
| "reward_std": 0.6029860116541386, |
| "rewards/reward_func": -0.14664648659527302, |
| "step": 5728 |
| }, |
| { |
| "completion_length": 176.671875, |
| "epoch": 0.76756322761943, |
| "grad_norm": 3.671875, |
| "kl": 0.005350680381525308, |
| "learning_rate": 2.3243677238057005e-07, |
| "loss": 0.0002, |
| "reward": 0.28928207233548164, |
| "reward_std": 0.49851767159998417, |
| "rewards/reward_func": 0.28928207233548164, |
| "step": 5736 |
| }, |
| { |
| "completion_length": 168.3828125, |
| "epoch": 0.7686337481600428, |
| "grad_norm": 4.3125, |
| "kl": 0.005111474136356264, |
| "learning_rate": 2.3136625183995717e-07, |
| "loss": 0.0002, |
| "reward": 0.20785732567310333, |
| "reward_std": 0.4819117970764637, |
| "rewards/reward_func": 0.20785732567310333, |
| "step": 5744 |
| }, |
| { |
| "completion_length": 161.5625, |
| "epoch": 0.7697042687006557, |
| "grad_norm": 5.75, |
| "kl": 0.0046100525360088795, |
| "learning_rate": 2.302957312993443e-07, |
| "loss": 0.0002, |
| "reward": 0.3344459980726242, |
| "reward_std": 0.48296352848410606, |
| "rewards/reward_func": 0.3344459980726242, |
| "step": 5752 |
| }, |
| { |
| "completion_length": 159.8203125, |
| "epoch": 0.7707747892412685, |
| "grad_norm": 3.84375, |
| "kl": 0.005216164543526247, |
| "learning_rate": 2.2922521075873141e-07, |
| "loss": 0.0002, |
| "reward": 0.4968814216554165, |
| "reward_std": 0.5169591847807169, |
| "rewards/reward_func": 0.4968814216554165, |
| "step": 5760 |
| }, |
| { |
| "completion_length": 181.0, |
| "epoch": 0.7718453097818815, |
| "grad_norm": 3.859375, |
| "kl": 0.0039515624375781044, |
| "learning_rate": 2.2815469021811856e-07, |
| "loss": 0.0002, |
| "reward": 0.1462385654449463, |
| "reward_std": 0.5148510783910751, |
| "rewards/reward_func": 0.1462385654449463, |
| "step": 5768 |
| }, |
| { |
| "completion_length": 186.953125, |
| "epoch": 0.7729158303224943, |
| "grad_norm": 3.234375, |
| "kl": 0.00495643715839833, |
| "learning_rate": 2.2708416967750568e-07, |
| "loss": 0.0002, |
| "reward": -0.019661023281514645, |
| "reward_std": 0.4568687481805682, |
| "rewards/reward_func": -0.019661023281514645, |
| "step": 5776 |
| }, |
| { |
| "completion_length": 152.9453125, |
| "epoch": 0.7739863508631072, |
| "grad_norm": 4.125, |
| "kl": 0.005363121483242139, |
| "learning_rate": 2.260136491368928e-07, |
| "loss": 0.0002, |
| "reward": 0.3975646123290062, |
| "reward_std": 0.5788163132965565, |
| "rewards/reward_func": 0.3975646123290062, |
| "step": 5784 |
| }, |
| { |
| "completion_length": 150.8046875, |
| "epoch": 0.77505687140372, |
| "grad_norm": 3.25, |
| "kl": 0.0049289112794213, |
| "learning_rate": 2.2494312859627993e-07, |
| "loss": 0.0002, |
| "reward": 0.29290657490491867, |
| "reward_std": 0.6054155379533768, |
| "rewards/reward_func": 0.29290657490491867, |
| "step": 5792 |
| }, |
| { |
| "completion_length": 149.1640625, |
| "epoch": 0.7761273919443329, |
| "grad_norm": 2.546875, |
| "kl": 0.006072040821891278, |
| "learning_rate": 2.2387260805566705e-07, |
| "loss": 0.0002, |
| "reward": 0.234967946074903, |
| "reward_std": 0.5344967059791088, |
| "rewards/reward_func": 0.234967946074903, |
| "step": 5800 |
| }, |
| { |
| "completion_length": 158.625, |
| "epoch": 0.7771979124849459, |
| "grad_norm": 4.0625, |
| "kl": 0.004590392898535356, |
| "learning_rate": 2.228020875150542e-07, |
| "loss": 0.0002, |
| "reward": 0.419980987906456, |
| "reward_std": 0.4606306320056319, |
| "rewards/reward_func": 0.419980987906456, |
| "step": 5808 |
| }, |
| { |
| "completion_length": 138.7265625, |
| "epoch": 0.7782684330255587, |
| "grad_norm": 3.8125, |
| "kl": 0.004858676256844774, |
| "learning_rate": 2.2173156697444132e-07, |
| "loss": 0.0002, |
| "reward": 0.5591896008700132, |
| "reward_std": 0.5148359183222055, |
| "rewards/reward_func": 0.5591896008700132, |
| "step": 5816 |
| }, |
| { |
| "completion_length": 191.6328125, |
| "epoch": 0.7793389535661716, |
| "grad_norm": 7.0625, |
| "kl": 0.004010791366454214, |
| "learning_rate": 2.2066104643382844e-07, |
| "loss": 0.0002, |
| "reward": 0.07061274722218513, |
| "reward_std": 0.6621855795383453, |
| "rewards/reward_func": 0.07061274722218513, |
| "step": 5824 |
| }, |
| { |
| "completion_length": 151.5546875, |
| "epoch": 0.7804094741067844, |
| "grad_norm": 3.5625, |
| "kl": 0.004385879990877584, |
| "learning_rate": 2.1959052589321556e-07, |
| "loss": 0.0002, |
| "reward": 0.4282612316310406, |
| "reward_std": 0.5311172138899565, |
| "rewards/reward_func": 0.4282612316310406, |
| "step": 5832 |
| }, |
| { |
| "completion_length": 183.8671875, |
| "epoch": 0.7814799946473973, |
| "grad_norm": 4.0, |
| "kl": 0.004209680715575814, |
| "learning_rate": 2.185200053526027e-07, |
| "loss": 0.0002, |
| "reward": 0.1611488163471222, |
| "reward_std": 0.5946944504976273, |
| "rewards/reward_func": 0.1611488163471222, |
| "step": 5840 |
| }, |
| { |
| "completion_length": 137.359375, |
| "epoch": 0.7825505151880102, |
| "grad_norm": 4.3125, |
| "kl": 0.004825499141588807, |
| "learning_rate": 2.1744948481198983e-07, |
| "loss": 0.0002, |
| "reward": 0.5471408823505044, |
| "reward_std": 0.5473849456757307, |
| "rewards/reward_func": 0.5471408823505044, |
| "step": 5848 |
| }, |
| { |
| "completion_length": 159.34375, |
| "epoch": 0.7836210357286231, |
| "grad_norm": 3.4375, |
| "kl": 0.005440732988063246, |
| "learning_rate": 2.1637896427137695e-07, |
| "loss": 0.0002, |
| "reward": 0.4683985644951463, |
| "reward_std": 0.5685102045536041, |
| "rewards/reward_func": 0.4683985644951463, |
| "step": 5856 |
| }, |
| { |
| "completion_length": 161.59375, |
| "epoch": 0.7846915562692359, |
| "grad_norm": 4.5625, |
| "kl": 0.004569044103845954, |
| "learning_rate": 2.1530844373076407e-07, |
| "loss": 0.0002, |
| "reward": 0.0613291235640645, |
| "reward_std": 0.48243121802806854, |
| "rewards/reward_func": 0.0613291235640645, |
| "step": 5864 |
| }, |
| { |
| "completion_length": 170.6328125, |
| "epoch": 0.7857620768098488, |
| "grad_norm": 4.21875, |
| "kl": 0.004493650107178837, |
| "learning_rate": 2.1423792319015122e-07, |
| "loss": 0.0002, |
| "reward": 0.373017355799675, |
| "reward_std": 0.5189967537298799, |
| "rewards/reward_func": 0.373017355799675, |
| "step": 5872 |
| }, |
| { |
| "completion_length": 217.6640625, |
| "epoch": 0.7868325973504616, |
| "grad_norm": 3.5, |
| "kl": 0.003907823265763, |
| "learning_rate": 2.1316740264953834e-07, |
| "loss": 0.0002, |
| "reward": -0.019582286477088928, |
| "reward_std": 0.5519250631332397, |
| "rewards/reward_func": -0.019582286477088928, |
| "step": 5880 |
| }, |
| { |
| "completion_length": 177.4765625, |
| "epoch": 0.7879031178910746, |
| "grad_norm": 4.40625, |
| "kl": 0.004457623173948377, |
| "learning_rate": 2.1209688210892546e-07, |
| "loss": 0.0002, |
| "reward": 0.17010945454239845, |
| "reward_std": 0.5244961641728878, |
| "rewards/reward_func": 0.17010945454239845, |
| "step": 5888 |
| }, |
| { |
| "completion_length": 177.25, |
| "epoch": 0.7889736384316874, |
| "grad_norm": 4.9375, |
| "kl": 0.0046152446011547, |
| "learning_rate": 2.1102636156831259e-07, |
| "loss": 0.0002, |
| "reward": 0.20693709515035152, |
| "reward_std": 0.602562677115202, |
| "rewards/reward_func": 0.20693709515035152, |
| "step": 5896 |
| }, |
| { |
| "completion_length": 165.15625, |
| "epoch": 0.7900441589723003, |
| "grad_norm": 3.546875, |
| "kl": 0.004225551267154515, |
| "learning_rate": 2.099558410276997e-07, |
| "loss": 0.0002, |
| "reward": 0.3072157595306635, |
| "reward_std": 0.522355480119586, |
| "rewards/reward_func": 0.3072157595306635, |
| "step": 5904 |
| }, |
| { |
| "completion_length": 170.8125, |
| "epoch": 0.7911146795129131, |
| "grad_norm": 4.625, |
| "kl": 0.005012799199903384, |
| "learning_rate": 2.0888532048708686e-07, |
| "loss": 0.0002, |
| "reward": 0.320420335046947, |
| "reward_std": 0.43850363977253437, |
| "rewards/reward_func": 0.320420335046947, |
| "step": 5912 |
| }, |
| { |
| "completion_length": 158.8203125, |
| "epoch": 0.792185200053526, |
| "grad_norm": 5.78125, |
| "kl": 0.005518296180525795, |
| "learning_rate": 2.0781479994647398e-07, |
| "loss": 0.0002, |
| "reward": 0.12541838502511382, |
| "reward_std": 0.4963842146098614, |
| "rewards/reward_func": 0.12541838502511382, |
| "step": 5920 |
| }, |
| { |
| "completion_length": 165.421875, |
| "epoch": 0.7932557205941388, |
| "grad_norm": 3.5, |
| "kl": 0.004109891131520271, |
| "learning_rate": 2.067442794058611e-07, |
| "loss": 0.0002, |
| "reward": 0.48840315639972687, |
| "reward_std": 0.5170729719102383, |
| "rewards/reward_func": 0.48840315639972687, |
| "step": 5928 |
| }, |
| { |
| "completion_length": 206.5, |
| "epoch": 0.7943262411347518, |
| "grad_norm": 2.671875, |
| "kl": 0.004218890477204695, |
| "learning_rate": 2.0567375886524822e-07, |
| "loss": 0.0002, |
| "reward": 0.017320919781923294, |
| "reward_std": 0.5469899624586105, |
| "rewards/reward_func": 0.017320919781923294, |
| "step": 5936 |
| }, |
| { |
| "completion_length": 189.0390625, |
| "epoch": 0.7953967616753647, |
| "grad_norm": 3.5, |
| "kl": 0.004204195429338142, |
| "learning_rate": 2.0460323832463537e-07, |
| "loss": 0.0002, |
| "reward": 0.05206027068197727, |
| "reward_std": 0.5685999430716038, |
| "rewards/reward_func": 0.05206027068197727, |
| "step": 5944 |
| }, |
| { |
| "completion_length": 182.0, |
| "epoch": 0.7964672822159775, |
| "grad_norm": 3.140625, |
| "kl": 0.004214008251437917, |
| "learning_rate": 2.035327177840225e-07, |
| "loss": 0.0002, |
| "reward": 0.04203222133219242, |
| "reward_std": 0.5610231403261423, |
| "rewards/reward_func": 0.04203222133219242, |
| "step": 5952 |
| }, |
| { |
| "completion_length": 179.5078125, |
| "epoch": 0.7975378027565904, |
| "grad_norm": 3.84375, |
| "kl": 0.004752454871777445, |
| "learning_rate": 2.024621972434096e-07, |
| "loss": 0.0002, |
| "reward": -0.10599182732403278, |
| "reward_std": 0.6240234952419996, |
| "rewards/reward_func": -0.10599182732403278, |
| "step": 5960 |
| }, |
| { |
| "completion_length": 166.1875, |
| "epoch": 0.7986083232972032, |
| "grad_norm": 4.96875, |
| "kl": 0.004443921585334465, |
| "learning_rate": 2.0139167670279673e-07, |
| "loss": 0.0002, |
| "reward": 0.329925112426281, |
| "reward_std": 0.42279865965247154, |
| "rewards/reward_func": 0.329925112426281, |
| "step": 5968 |
| }, |
| { |
| "completion_length": 152.3515625, |
| "epoch": 0.7996788438378162, |
| "grad_norm": 3.21875, |
| "kl": 0.004863968148129061, |
| "learning_rate": 2.0032115616218383e-07, |
| "loss": 0.0002, |
| "reward": 0.2605556510388851, |
| "reward_std": 0.46567713283002377, |
| "rewards/reward_func": 0.2605556510388851, |
| "step": 5976 |
| }, |
| { |
| "completion_length": 171.7578125, |
| "epoch": 0.800749364378429, |
| "grad_norm": 4.09375, |
| "kl": 0.004715076414868236, |
| "learning_rate": 1.99250635621571e-07, |
| "loss": 0.0002, |
| "reward": -0.13541333191096783, |
| "reward_std": 0.6721258126199245, |
| "rewards/reward_func": -0.13541333191096783, |
| "step": 5984 |
| }, |
| { |
| "completion_length": 143.6640625, |
| "epoch": 0.8018198849190419, |
| "grad_norm": 4.5, |
| "kl": 0.0056079100468195975, |
| "learning_rate": 1.981801150809581e-07, |
| "loss": 0.0002, |
| "reward": 0.3414863357320428, |
| "reward_std": 0.47020469419658184, |
| "rewards/reward_func": 0.3414863357320428, |
| "step": 5992 |
| }, |
| { |
| "completion_length": 168.5078125, |
| "epoch": 0.8028904054596547, |
| "grad_norm": 3.46875, |
| "kl": 0.004616849677404389, |
| "learning_rate": 1.9710959454034522e-07, |
| "loss": 0.0002, |
| "reward": 0.24850520677864552, |
| "reward_std": 0.5514967441558838, |
| "rewards/reward_func": 0.24850520677864552, |
| "step": 6000 |
| }, |
| { |
| "completion_length": 168.734375, |
| "epoch": 0.8039609260002676, |
| "grad_norm": 2.578125, |
| "kl": 0.004905187961412594, |
| "learning_rate": 1.9603907399973234e-07, |
| "loss": 0.0002, |
| "reward": 0.29693731665611267, |
| "reward_std": 0.5282188858836889, |
| "rewards/reward_func": 0.29693731665611267, |
| "step": 6008 |
| }, |
| { |
| "completion_length": 172.6328125, |
| "epoch": 0.8050314465408805, |
| "grad_norm": 3.625, |
| "kl": 0.00480208353837952, |
| "learning_rate": 1.949685534591195e-07, |
| "loss": 0.0002, |
| "reward": 0.40039923787117004, |
| "reward_std": 0.5602267645299435, |
| "rewards/reward_func": 0.40039923787117004, |
| "step": 6016 |
| }, |
| { |
| "completion_length": 148.671875, |
| "epoch": 0.8061019670814934, |
| "grad_norm": 6.78125, |
| "kl": 0.005018723517423496, |
| "learning_rate": 1.938980329185066e-07, |
| "loss": 0.0002, |
| "reward": 0.3693223036825657, |
| "reward_std": 0.4133735718205571, |
| "rewards/reward_func": 0.3693223036825657, |
| "step": 6024 |
| }, |
| { |
| "completion_length": 168.359375, |
| "epoch": 0.8071724876221062, |
| "grad_norm": 5.03125, |
| "kl": 0.004699432494817302, |
| "learning_rate": 1.9282751237789373e-07, |
| "loss": 0.0002, |
| "reward": 0.29968111030757427, |
| "reward_std": 0.5234957840293646, |
| "rewards/reward_func": 0.29968111030757427, |
| "step": 6032 |
| }, |
| { |
| "completion_length": 172.1171875, |
| "epoch": 0.8082430081627191, |
| "grad_norm": 4.40625, |
| "kl": 0.0047337598516605794, |
| "learning_rate": 1.9175699183728085e-07, |
| "loss": 0.0002, |
| "reward": -0.13689319603145123, |
| "reward_std": 0.5464825332164764, |
| "rewards/reward_func": -0.13689319603145123, |
| "step": 6040 |
| }, |
| { |
| "completion_length": 187.53125, |
| "epoch": 0.809313528703332, |
| "grad_norm": 4.71875, |
| "kl": 0.0043419343419373035, |
| "learning_rate": 1.9068647129666797e-07, |
| "loss": 0.0002, |
| "reward": 0.12531755585223436, |
| "reward_std": 0.7370849475264549, |
| "rewards/reward_func": 0.12531755585223436, |
| "step": 6048 |
| }, |
| { |
| "completion_length": 173.359375, |
| "epoch": 0.8103840492439449, |
| "grad_norm": 3.09375, |
| "kl": 0.004423053003847599, |
| "learning_rate": 1.8961595075605512e-07, |
| "loss": 0.0002, |
| "reward": 0.45560589246451855, |
| "reward_std": 0.3819491732865572, |
| "rewards/reward_func": 0.45560589246451855, |
| "step": 6056 |
| }, |
| { |
| "completion_length": 173.4296875, |
| "epoch": 0.8114545697845578, |
| "grad_norm": 4.8125, |
| "kl": 0.004857113177422434, |
| "learning_rate": 1.8854543021544224e-07, |
| "loss": 0.0002, |
| "reward": 0.08866522740572691, |
| "reward_std": 0.5376447830349207, |
| "rewards/reward_func": 0.08866522740572691, |
| "step": 6064 |
| }, |
| { |
| "completion_length": 164.6015625, |
| "epoch": 0.8125250903251706, |
| "grad_norm": 3.328125, |
| "kl": 0.004501277348026633, |
| "learning_rate": 1.8747490967482937e-07, |
| "loss": 0.0002, |
| "reward": 0.2745439810678363, |
| "reward_std": 0.4785211766138673, |
| "rewards/reward_func": 0.2745439810678363, |
| "step": 6072 |
| }, |
| { |
| "completion_length": 160.1796875, |
| "epoch": 0.8135956108657835, |
| "grad_norm": 4.78125, |
| "kl": 0.004779946495546028, |
| "learning_rate": 1.864043891342165e-07, |
| "loss": 0.0002, |
| "reward": 0.22340465802699327, |
| "reward_std": 0.557499123737216, |
| "rewards/reward_func": 0.22340465802699327, |
| "step": 6080 |
| }, |
| { |
| "completion_length": 160.6171875, |
| "epoch": 0.8146661314063963, |
| "grad_norm": 6.59375, |
| "kl": 0.005315470625646412, |
| "learning_rate": 1.8533386859360364e-07, |
| "loss": 0.0002, |
| "reward": 0.12089579226449132, |
| "reward_std": 0.6152683198451996, |
| "rewards/reward_func": 0.12089579226449132, |
| "step": 6088 |
| }, |
| { |
| "completion_length": 180.1171875, |
| "epoch": 0.8157366519470093, |
| "grad_norm": 4.90625, |
| "kl": 0.0042492037755437195, |
| "learning_rate": 1.8426334805299076e-07, |
| "loss": 0.0002, |
| "reward": 0.18059484660625458, |
| "reward_std": 0.5923185907304287, |
| "rewards/reward_func": 0.18059484660625458, |
| "step": 6096 |
| }, |
| { |
| "completion_length": 149.1796875, |
| "epoch": 0.8168071724876221, |
| "grad_norm": 5.125, |
| "kl": 0.00574629902257584, |
| "learning_rate": 1.8319282751237788e-07, |
| "loss": 0.0002, |
| "reward": 0.2305867071263492, |
| "reward_std": 0.47459197975695133, |
| "rewards/reward_func": 0.2305867071263492, |
| "step": 6104 |
| }, |
| { |
| "completion_length": 160.0703125, |
| "epoch": 0.817877693028235, |
| "grad_norm": 4.65625, |
| "kl": 0.005720962421037257, |
| "learning_rate": 1.82122306971765e-07, |
| "loss": 0.0002, |
| "reward": 0.4231163961812854, |
| "reward_std": 0.49531901255249977, |
| "rewards/reward_func": 0.4231163961812854, |
| "step": 6112 |
| }, |
| { |
| "completion_length": 179.625, |
| "epoch": 0.8189482135688478, |
| "grad_norm": 3.609375, |
| "kl": 0.004402774036861956, |
| "learning_rate": 1.8105178643115212e-07, |
| "loss": 0.0002, |
| "reward": -0.09616492129862309, |
| "reward_std": 0.5352960834279656, |
| "rewards/reward_func": -0.09616492129862309, |
| "step": 6120 |
| }, |
| { |
| "completion_length": 197.9921875, |
| "epoch": 0.8200187341094607, |
| "grad_norm": 3.96875, |
| "kl": 0.004230510094203055, |
| "learning_rate": 1.7998126589053927e-07, |
| "loss": 0.0002, |
| "reward": 0.09585804212838411, |
| "reward_std": 0.6544227637350559, |
| "rewards/reward_func": 0.09585804212838411, |
| "step": 6128 |
| }, |
| { |
| "completion_length": 174.0390625, |
| "epoch": 0.8210892546500737, |
| "grad_norm": 3.8125, |
| "kl": 0.004287142743123695, |
| "learning_rate": 1.789107453499264e-07, |
| "loss": 0.0002, |
| "reward": 0.2640516827814281, |
| "reward_std": 0.5714995982125401, |
| "rewards/reward_func": 0.2640516827814281, |
| "step": 6136 |
| }, |
| { |
| "completion_length": 171.5234375, |
| "epoch": 0.8221597751906865, |
| "grad_norm": 2.703125, |
| "kl": 0.004413856513565406, |
| "learning_rate": 1.778402248093135e-07, |
| "loss": 0.0002, |
| "reward": 0.32004706375300884, |
| "reward_std": 0.6919787935912609, |
| "rewards/reward_func": 0.32004706375300884, |
| "step": 6144 |
| }, |
| { |
| "completion_length": 164.6640625, |
| "epoch": 0.8232302957312994, |
| "grad_norm": 3.40625, |
| "kl": 0.004383451188914478, |
| "learning_rate": 1.7676970426870063e-07, |
| "loss": 0.0002, |
| "reward": 0.3686336353421211, |
| "reward_std": 0.5524613773450255, |
| "rewards/reward_func": 0.3686336353421211, |
| "step": 6152 |
| }, |
| { |
| "completion_length": 196.8203125, |
| "epoch": 0.8243008162719122, |
| "grad_norm": 3.078125, |
| "kl": 0.0042349822469986975, |
| "learning_rate": 1.7569918372808778e-07, |
| "loss": 0.0002, |
| "reward": 0.04516376554965973, |
| "reward_std": 0.5202826540917158, |
| "rewards/reward_func": 0.04516376554965973, |
| "step": 6160 |
| }, |
| { |
| "completion_length": 150.421875, |
| "epoch": 0.8253713368125251, |
| "grad_norm": 2.890625, |
| "kl": 0.004983038583304733, |
| "learning_rate": 1.746286631874749e-07, |
| "loss": 0.0002, |
| "reward": 0.320843068882823, |
| "reward_std": 0.4479983486235142, |
| "rewards/reward_func": 0.320843068882823, |
| "step": 6168 |
| }, |
| { |
| "completion_length": 180.46875, |
| "epoch": 0.8264418573531379, |
| "grad_norm": 4.71875, |
| "kl": 0.004280634428141639, |
| "learning_rate": 1.7355814264686203e-07, |
| "loss": 0.0002, |
| "reward": 0.42457358445972204, |
| "reward_std": 0.6277044154703617, |
| "rewards/reward_func": 0.42457358445972204, |
| "step": 6176 |
| }, |
| { |
| "completion_length": 177.421875, |
| "epoch": 0.8275123778937509, |
| "grad_norm": 4.3125, |
| "kl": 0.003841431171167642, |
| "learning_rate": 1.7248762210624915e-07, |
| "loss": 0.0002, |
| "reward": 0.4124793987721205, |
| "reward_std": 0.5699762850999832, |
| "rewards/reward_func": 0.4124793987721205, |
| "step": 6184 |
| }, |
| { |
| "completion_length": 165.5625, |
| "epoch": 0.8285828984343637, |
| "grad_norm": 3.53125, |
| "kl": 0.0045668908569496125, |
| "learning_rate": 1.714171015656363e-07, |
| "loss": 0.0002, |
| "reward": 0.24390191398561, |
| "reward_std": 0.5946025252342224, |
| "rewards/reward_func": 0.24390191398561, |
| "step": 6192 |
| }, |
| { |
| "completion_length": 144.796875, |
| "epoch": 0.8296534189749766, |
| "grad_norm": 6.15625, |
| "kl": 0.005119076173286885, |
| "learning_rate": 1.7034658102502342e-07, |
| "loss": 0.0002, |
| "reward": 0.37631342001259327, |
| "reward_std": 0.54392384365201, |
| "rewards/reward_func": 0.37631342001259327, |
| "step": 6200 |
| }, |
| { |
| "completion_length": 162.703125, |
| "epoch": 0.8307239395155894, |
| "grad_norm": 3.65625, |
| "kl": 0.004819765774300322, |
| "learning_rate": 1.6927606048441054e-07, |
| "loss": 0.0002, |
| "reward": 0.3171768644824624, |
| "reward_std": 0.6571879032999277, |
| "rewards/reward_func": 0.3171768644824624, |
| "step": 6208 |
| }, |
| { |
| "completion_length": 176.15625, |
| "epoch": 0.8317944600562023, |
| "grad_norm": 4.4375, |
| "kl": 0.004749486513901502, |
| "learning_rate": 1.6820553994379766e-07, |
| "loss": 0.0002, |
| "reward": 0.32477567065507174, |
| "reward_std": 0.584898017346859, |
| "rewards/reward_func": 0.32477567065507174, |
| "step": 6216 |
| }, |
| { |
| "completion_length": 164.546875, |
| "epoch": 0.8328649805968152, |
| "grad_norm": 4.96875, |
| "kl": 0.005459955689730123, |
| "learning_rate": 1.6713501940318478e-07, |
| "loss": 0.0002, |
| "reward": 0.3247902784496546, |
| "reward_std": 0.6046720538288355, |
| "rewards/reward_func": 0.3247902784496546, |
| "step": 6224 |
| }, |
| { |
| "completion_length": 173.3828125, |
| "epoch": 0.8339355011374281, |
| "grad_norm": 3.671875, |
| "kl": 0.005025158607168123, |
| "learning_rate": 1.6606449886257193e-07, |
| "loss": 0.0002, |
| "reward": 0.438681710511446, |
| "reward_std": 0.41498881857842207, |
| "rewards/reward_func": 0.438681710511446, |
| "step": 6232 |
| }, |
| { |
| "completion_length": 149.4453125, |
| "epoch": 0.8350060216780409, |
| "grad_norm": 4.125, |
| "kl": 0.0048881605616770685, |
| "learning_rate": 1.6499397832195905e-07, |
| "loss": 0.0002, |
| "reward": 0.3972213324159384, |
| "reward_std": 0.522408589720726, |
| "rewards/reward_func": 0.3972213324159384, |
| "step": 6240 |
| }, |
| { |
| "completion_length": 142.03125, |
| "epoch": 0.8360765422186538, |
| "grad_norm": 5.9375, |
| "kl": 0.00615677481982857, |
| "learning_rate": 1.6392345778134617e-07, |
| "loss": 0.0002, |
| "reward": 0.5399059653282166, |
| "reward_std": 0.5134044801816344, |
| "rewards/reward_func": 0.5399059653282166, |
| "step": 6248 |
| }, |
| { |
| "completion_length": 139.890625, |
| "epoch": 0.8371470627592666, |
| "grad_norm": 4.40625, |
| "kl": 0.005498810496646911, |
| "learning_rate": 1.628529372407333e-07, |
| "loss": 0.0002, |
| "reward": 0.26559029519557953, |
| "reward_std": 0.7036202065646648, |
| "rewards/reward_func": 0.26559029519557953, |
| "step": 6256 |
| }, |
| { |
| "completion_length": 167.046875, |
| "epoch": 0.8382175832998796, |
| "grad_norm": 3.8125, |
| "kl": 0.005057969567133114, |
| "learning_rate": 1.6178241670012044e-07, |
| "loss": 0.0002, |
| "reward": 0.26883680559694767, |
| "reward_std": 0.6107715517282486, |
| "rewards/reward_func": 0.26883680559694767, |
| "step": 6264 |
| }, |
| { |
| "completion_length": 183.6015625, |
| "epoch": 0.8392881038404925, |
| "grad_norm": 3.78125, |
| "kl": 0.00433379874448292, |
| "learning_rate": 1.6071189615950756e-07, |
| "loss": 0.0002, |
| "reward": 0.08581209369003773, |
| "reward_std": 0.601530484855175, |
| "rewards/reward_func": 0.08581209369003773, |
| "step": 6272 |
| }, |
| { |
| "completion_length": 177.2265625, |
| "epoch": 0.8403586243811053, |
| "grad_norm": 4.71875, |
| "kl": 0.0044705503969453275, |
| "learning_rate": 1.5964137561889469e-07, |
| "loss": 0.0002, |
| "reward": 0.23431246215477586, |
| "reward_std": 0.5960433762520552, |
| "rewards/reward_func": 0.23431246215477586, |
| "step": 6280 |
| }, |
| { |
| "completion_length": 152.078125, |
| "epoch": 0.8414291449217182, |
| "grad_norm": 4.8125, |
| "kl": 0.005255370575468987, |
| "learning_rate": 1.585708550782818e-07, |
| "loss": 0.0002, |
| "reward": 0.37102524004876614, |
| "reward_std": 0.6371021419763565, |
| "rewards/reward_func": 0.37102524004876614, |
| "step": 6288 |
| }, |
| { |
| "completion_length": 217.2734375, |
| "epoch": 0.842499665462331, |
| "grad_norm": 4.375, |
| "kl": 0.003196624806150794, |
| "learning_rate": 1.5750033453766893e-07, |
| "loss": 0.0001, |
| "reward": -0.045689786318689585, |
| "reward_std": 0.4852413050830364, |
| "rewards/reward_func": -0.045689786318689585, |
| "step": 6296 |
| }, |
| { |
| "completion_length": 227.7265625, |
| "epoch": 0.843570186002944, |
| "grad_norm": 3.71875, |
| "kl": 0.004137254873057827, |
| "learning_rate": 1.5642981399705608e-07, |
| "loss": 0.0002, |
| "reward": 6.577186286449432e-05, |
| "reward_std": 0.4605599669739604, |
| "rewards/reward_func": 6.577186286449432e-05, |
| "step": 6304 |
| }, |
| { |
| "completion_length": 176.59375, |
| "epoch": 0.8446407065435568, |
| "grad_norm": 3.25, |
| "kl": 0.004376317374408245, |
| "learning_rate": 1.553592934564432e-07, |
| "loss": 0.0002, |
| "reward": 0.12455911561846733, |
| "reward_std": 0.6250845305621624, |
| "rewards/reward_func": 0.12455911561846733, |
| "step": 6312 |
| }, |
| { |
| "completion_length": 150.9609375, |
| "epoch": 0.8457112270841697, |
| "grad_norm": 5.0625, |
| "kl": 0.004825094016268849, |
| "learning_rate": 1.5428877291583032e-07, |
| "loss": 0.0002, |
| "reward": 0.37383434921503067, |
| "reward_std": 0.6138091459870338, |
| "rewards/reward_func": 0.37383434921503067, |
| "step": 6320 |
| }, |
| { |
| "completion_length": 179.15625, |
| "epoch": 0.8467817476247825, |
| "grad_norm": 3.984375, |
| "kl": 0.004287428979296237, |
| "learning_rate": 1.5321825237521744e-07, |
| "loss": 0.0002, |
| "reward": 0.3161802035756409, |
| "reward_std": 0.5376028679311275, |
| "rewards/reward_func": 0.3161802035756409, |
| "step": 6328 |
| }, |
| { |
| "completion_length": 145.8828125, |
| "epoch": 0.8478522681653954, |
| "grad_norm": 3.171875, |
| "kl": 0.0048953695222735405, |
| "learning_rate": 1.521477318346046e-07, |
| "loss": 0.0002, |
| "reward": 0.27396881859749556, |
| "reward_std": 0.6160639338195324, |
| "rewards/reward_func": 0.27396881859749556, |
| "step": 6336 |
| }, |
| { |
| "completion_length": 170.7578125, |
| "epoch": 0.8489227887060083, |
| "grad_norm": 3.171875, |
| "kl": 0.004435895767528564, |
| "learning_rate": 1.510772112939917e-07, |
| "loss": 0.0002, |
| "reward": 0.34336171485483646, |
| "reward_std": 0.6223765797913074, |
| "rewards/reward_func": 0.34336171485483646, |
| "step": 6344 |
| }, |
| { |
| "completion_length": 134.5, |
| "epoch": 0.8499933092466212, |
| "grad_norm": 4.4375, |
| "kl": 0.005276092153508216, |
| "learning_rate": 1.5000669075337883e-07, |
| "loss": 0.0002, |
| "reward": 0.4353441474959254, |
| "reward_std": 0.5333261359483004, |
| "rewards/reward_func": 0.4353441474959254, |
| "step": 6352 |
| }, |
| { |
| "completion_length": 137.9921875, |
| "epoch": 0.851063829787234, |
| "grad_norm": 5.21875, |
| "kl": 0.0056113199389074, |
| "learning_rate": 1.4893617021276595e-07, |
| "loss": 0.0002, |
| "reward": 0.11142583098262548, |
| "reward_std": 0.6482997722923756, |
| "rewards/reward_func": 0.11142583098262548, |
| "step": 6360 |
| }, |
| { |
| "completion_length": 212.765625, |
| "epoch": 0.8521343503278469, |
| "grad_norm": 4.625, |
| "kl": 0.004180938733043149, |
| "learning_rate": 1.4786564967215308e-07, |
| "loss": 0.0002, |
| "reward": -0.04978405591100454, |
| "reward_std": 0.6307705044746399, |
| "rewards/reward_func": -0.04978405591100454, |
| "step": 6368 |
| }, |
| { |
| "completion_length": 156.7265625, |
| "epoch": 0.8532048708684598, |
| "grad_norm": 2.640625, |
| "kl": 0.005362185067497194, |
| "learning_rate": 1.4679512913154022e-07, |
| "loss": 0.0002, |
| "reward": -0.013063086196780205, |
| "reward_std": 0.5464332979172468, |
| "rewards/reward_func": -0.013063086196780205, |
| "step": 6376 |
| }, |
| { |
| "completion_length": 173.03125, |
| "epoch": 0.8542753914090727, |
| "grad_norm": 3.921875, |
| "kl": 0.004632528842194006, |
| "learning_rate": 1.4572460859092734e-07, |
| "loss": 0.0002, |
| "reward": 0.34282067604362965, |
| "reward_std": 0.622871071100235, |
| "rewards/reward_func": 0.34282067604362965, |
| "step": 6384 |
| }, |
| { |
| "completion_length": 169.6796875, |
| "epoch": 0.8553459119496856, |
| "grad_norm": 4.0, |
| "kl": 0.004290186625439674, |
| "learning_rate": 1.4465408805031447e-07, |
| "loss": 0.0002, |
| "reward": 0.2828236762434244, |
| "reward_std": 0.4671051539480686, |
| "rewards/reward_func": 0.2828236762434244, |
| "step": 6392 |
| }, |
| { |
| "completion_length": 163.65625, |
| "epoch": 0.8564164324902984, |
| "grad_norm": 3.9375, |
| "kl": 0.005294958682497963, |
| "learning_rate": 1.435835675097016e-07, |
| "loss": 0.0002, |
| "reward": 0.46301793679594994, |
| "reward_std": 0.5075423391535878, |
| "rewards/reward_func": 0.46301793679594994, |
| "step": 6400 |
| }, |
| { |
| "completion_length": 157.46875, |
| "epoch": 0.8574869530309113, |
| "grad_norm": 3.703125, |
| "kl": 0.004041396110551432, |
| "learning_rate": 1.4251304696908874e-07, |
| "loss": 0.0002, |
| "reward": 0.48526691645383835, |
| "reward_std": 0.3719025030732155, |
| "rewards/reward_func": 0.48526691645383835, |
| "step": 6408 |
| }, |
| { |
| "completion_length": 146.40625, |
| "epoch": 0.8585574735715241, |
| "grad_norm": 5.5, |
| "kl": 0.005460718472022563, |
| "learning_rate": 1.4144252642847586e-07, |
| "loss": 0.0002, |
| "reward": 0.2532842471264303, |
| "reward_std": 0.44650126062333584, |
| "rewards/reward_func": 0.2532842471264303, |
| "step": 6416 |
| }, |
| { |
| "completion_length": 183.3203125, |
| "epoch": 0.859627994112137, |
| "grad_norm": 4.09375, |
| "kl": 0.004378183133667335, |
| "learning_rate": 1.4037200588786295e-07, |
| "loss": 0.0002, |
| "reward": 0.13037376385182142, |
| "reward_std": 0.5470245387405157, |
| "rewards/reward_func": 0.13037376385182142, |
| "step": 6424 |
| }, |
| { |
| "completion_length": 169.9921875, |
| "epoch": 0.8606985146527499, |
| "grad_norm": 3.890625, |
| "kl": 0.0042949684138875455, |
| "learning_rate": 1.3930148534725007e-07, |
| "loss": 0.0002, |
| "reward": 0.1133667528629303, |
| "reward_std": 0.5052687106654048, |
| "rewards/reward_func": 0.1133667528629303, |
| "step": 6432 |
| }, |
| { |
| "completion_length": 149.46875, |
| "epoch": 0.8617690351933628, |
| "grad_norm": 3.203125, |
| "kl": 0.0049638144264463335, |
| "learning_rate": 1.3823096480663722e-07, |
| "loss": 0.0002, |
| "reward": 0.3671250296756625, |
| "reward_std": 0.7020149789750576, |
| "rewards/reward_func": 0.3671250296756625, |
| "step": 6440 |
| }, |
| { |
| "completion_length": 177.7578125, |
| "epoch": 0.8628395557339756, |
| "grad_norm": 3.15625, |
| "kl": 0.004953681491315365, |
| "learning_rate": 1.3716044426602434e-07, |
| "loss": 0.0002, |
| "reward": -0.07036676816642284, |
| "reward_std": 0.5223548822104931, |
| "rewards/reward_func": -0.07036676816642284, |
| "step": 6448 |
| }, |
| { |
| "completion_length": 153.890625, |
| "epoch": 0.8639100762745885, |
| "grad_norm": 4.0625, |
| "kl": 0.004841944552026689, |
| "learning_rate": 1.3608992372541147e-07, |
| "loss": 0.0002, |
| "reward": 0.2930979495868087, |
| "reward_std": 0.6658169776201248, |
| "rewards/reward_func": 0.2930979495868087, |
| "step": 6456 |
| }, |
| { |
| "completion_length": 163.359375, |
| "epoch": 0.8649805968152013, |
| "grad_norm": 3.796875, |
| "kl": 0.004769285937072709, |
| "learning_rate": 1.3501940318479859e-07, |
| "loss": 0.0002, |
| "reward": 0.48341894522309303, |
| "reward_std": 0.4925485821440816, |
| "rewards/reward_func": 0.48341894522309303, |
| "step": 6464 |
| }, |
| { |
| "completion_length": 160.4765625, |
| "epoch": 0.8660511173558143, |
| "grad_norm": 5.5625, |
| "kl": 0.005216082121478394, |
| "learning_rate": 1.339488826441857e-07, |
| "loss": 0.0002, |
| "reward": 0.26688177324831486, |
| "reward_std": 0.5798533223569393, |
| "rewards/reward_func": 0.26688177324831486, |
| "step": 6472 |
| }, |
| { |
| "completion_length": 180.109375, |
| "epoch": 0.8671216378964272, |
| "grad_norm": 4.34375, |
| "kl": 0.005117598222568631, |
| "learning_rate": 1.3287836210357286e-07, |
| "loss": 0.0002, |
| "reward": 0.0437483387067914, |
| "reward_std": 0.5821977593004704, |
| "rewards/reward_func": 0.0437483387067914, |
| "step": 6480 |
| }, |
| { |
| "completion_length": 203.53125, |
| "epoch": 0.86819215843704, |
| "grad_norm": 3.65625, |
| "kl": 0.004146075778407976, |
| "learning_rate": 1.3180784156295998e-07, |
| "loss": 0.0002, |
| "reward": -0.04123528301715851, |
| "reward_std": 0.5794482082128525, |
| "rewards/reward_func": -0.04123528301715851, |
| "step": 6488 |
| }, |
| { |
| "completion_length": 174.515625, |
| "epoch": 0.8692626789776529, |
| "grad_norm": 3.046875, |
| "kl": 0.004450612410437316, |
| "learning_rate": 1.307373210223471e-07, |
| "loss": 0.0002, |
| "reward": 0.17205783817917109, |
| "reward_std": 0.5600821115076542, |
| "rewards/reward_func": 0.17205783817917109, |
| "step": 6496 |
| }, |
| { |
| "completion_length": 160.5078125, |
| "epoch": 0.8703331995182657, |
| "grad_norm": 4.40625, |
| "kl": 0.005854069750057533, |
| "learning_rate": 1.2966680048173422e-07, |
| "loss": 0.0002, |
| "reward": 0.5184466666541994, |
| "reward_std": 0.43986151926219463, |
| "rewards/reward_func": 0.5184466666541994, |
| "step": 6504 |
| }, |
| { |
| "completion_length": 164.9296875, |
| "epoch": 0.8714037200588787, |
| "grad_norm": 3.484375, |
| "kl": 0.005087268742499873, |
| "learning_rate": 1.2859627994112137e-07, |
| "loss": 0.0002, |
| "reward": 0.15493404306471348, |
| "reward_std": 0.5500355400145054, |
| "rewards/reward_func": 0.15493404306471348, |
| "step": 6512 |
| }, |
| { |
| "completion_length": 166.6953125, |
| "epoch": 0.8724742405994915, |
| "grad_norm": 2.40625, |
| "kl": 0.005023477482609451, |
| "learning_rate": 1.275257594005085e-07, |
| "loss": 0.0002, |
| "reward": 0.21355824172496796, |
| "reward_std": 0.695870652794838, |
| "rewards/reward_func": 0.21355824172496796, |
| "step": 6520 |
| }, |
| { |
| "completion_length": 179.40625, |
| "epoch": 0.8735447611401044, |
| "grad_norm": 3.671875, |
| "kl": 0.0044980833772569895, |
| "learning_rate": 1.264552388598956e-07, |
| "loss": 0.0002, |
| "reward": 0.31162807578220963, |
| "reward_std": 0.49335628002882004, |
| "rewards/reward_func": 0.31162807578220963, |
| "step": 6528 |
| }, |
| { |
| "completion_length": 170.890625, |
| "epoch": 0.8746152816807172, |
| "grad_norm": 4.34375, |
| "kl": 0.004584902344504371, |
| "learning_rate": 1.2538471831928273e-07, |
| "loss": 0.0002, |
| "reward": 0.39711445942521095, |
| "reward_std": 0.4620585907250643, |
| "rewards/reward_func": 0.39711445942521095, |
| "step": 6536 |
| }, |
| { |
| "completion_length": 196.34375, |
| "epoch": 0.8756858022213301, |
| "grad_norm": 6.03125, |
| "kl": 0.004008692951174453, |
| "learning_rate": 1.2431419777866988e-07, |
| "loss": 0.0002, |
| "reward": 0.14655437879264355, |
| "reward_std": 0.5025924574583769, |
| "rewards/reward_func": 0.14655437879264355, |
| "step": 6544 |
| }, |
| { |
| "completion_length": 157.1328125, |
| "epoch": 0.876756322761943, |
| "grad_norm": 4.5, |
| "kl": 0.005272853362839669, |
| "learning_rate": 1.23243677238057e-07, |
| "loss": 0.0002, |
| "reward": 0.16074330359697342, |
| "reward_std": 0.4522952139377594, |
| "rewards/reward_func": 0.16074330359697342, |
| "step": 6552 |
| }, |
| { |
| "completion_length": 170.0390625, |
| "epoch": 0.8778268433025559, |
| "grad_norm": 3.71875, |
| "kl": 0.0051819840737152845, |
| "learning_rate": 1.2217315669744412e-07, |
| "loss": 0.0002, |
| "reward": 0.3139108493924141, |
| "reward_std": 0.4983799997717142, |
| "rewards/reward_func": 0.3139108493924141, |
| "step": 6560 |
| }, |
| { |
| "completion_length": 160.1953125, |
| "epoch": 0.8788973638431687, |
| "grad_norm": 2.5, |
| "kl": 0.004580837674438953, |
| "learning_rate": 1.2110263615683125e-07, |
| "loss": 0.0002, |
| "reward": 0.35805173218250275, |
| "reward_std": 0.4121380029246211, |
| "rewards/reward_func": 0.35805173218250275, |
| "step": 6568 |
| }, |
| { |
| "completion_length": 166.125, |
| "epoch": 0.8799678843837816, |
| "grad_norm": 3.890625, |
| "kl": 0.005891179316677153, |
| "learning_rate": 1.200321156162184e-07, |
| "loss": 0.0002, |
| "reward": 0.3935977406799793, |
| "reward_std": 0.4564328156411648, |
| "rewards/reward_func": 0.3935977406799793, |
| "step": 6576 |
| }, |
| { |
| "completion_length": 161.828125, |
| "epoch": 0.8810384049243944, |
| "grad_norm": 4.03125, |
| "kl": 0.004937338293530047, |
| "learning_rate": 1.189615950756055e-07, |
| "loss": 0.0002, |
| "reward": 0.3541194014251232, |
| "reward_std": 0.7327413186430931, |
| "rewards/reward_func": 0.3541194014251232, |
| "step": 6584 |
| }, |
| { |
| "completion_length": 166.890625, |
| "epoch": 0.8821089254650074, |
| "grad_norm": 4.0, |
| "kl": 0.004368811612948775, |
| "learning_rate": 1.1789107453499264e-07, |
| "loss": 0.0002, |
| "reward": 0.43425997346639633, |
| "reward_std": 0.5963248610496521, |
| "rewards/reward_func": 0.43425997346639633, |
| "step": 6592 |
| }, |
| { |
| "completion_length": 139.5, |
| "epoch": 0.8831794460056203, |
| "grad_norm": 3.546875, |
| "kl": 0.006153674854431301, |
| "learning_rate": 1.1682055399437976e-07, |
| "loss": 0.0002, |
| "reward": 0.4587271837517619, |
| "reward_std": 0.5946958791464567, |
| "rewards/reward_func": 0.4587271837517619, |
| "step": 6600 |
| }, |
| { |
| "completion_length": 160.2890625, |
| "epoch": 0.8842499665462331, |
| "grad_norm": 3.265625, |
| "kl": 0.004449796746484935, |
| "learning_rate": 1.1575003345376688e-07, |
| "loss": 0.0002, |
| "reward": 0.3748700972646475, |
| "reward_std": 0.5290507553145289, |
| "rewards/reward_func": 0.3748700972646475, |
| "step": 6608 |
| }, |
| { |
| "completion_length": 175.03125, |
| "epoch": 0.885320487086846, |
| "grad_norm": 3.4375, |
| "kl": 0.0046757735253777355, |
| "learning_rate": 1.1467951291315402e-07, |
| "loss": 0.0002, |
| "reward": 0.36219143867492676, |
| "reward_std": 0.5348459035158157, |
| "rewards/reward_func": 0.36219143867492676, |
| "step": 6616 |
| }, |
| { |
| "completion_length": 159.0078125, |
| "epoch": 0.8863910076274588, |
| "grad_norm": 4.375, |
| "kl": 0.005130204517627135, |
| "learning_rate": 1.1360899237254114e-07, |
| "loss": 0.0002, |
| "reward": 0.40300269052386284, |
| "reward_std": 0.5223680902272463, |
| "rewards/reward_func": 0.40300269052386284, |
| "step": 6624 |
| }, |
| { |
| "completion_length": 168.6328125, |
| "epoch": 0.8874615281680718, |
| "grad_norm": 4.03125, |
| "kl": 0.00497715815436095, |
| "learning_rate": 1.1253847183192827e-07, |
| "loss": 0.0002, |
| "reward": 0.3870235029608011, |
| "reward_std": 0.6206906009465456, |
| "rewards/reward_func": 0.3870235029608011, |
| "step": 6632 |
| }, |
| { |
| "completion_length": 167.6328125, |
| "epoch": 0.8885320487086846, |
| "grad_norm": 2.75, |
| "kl": 0.004565039882436395, |
| "learning_rate": 1.1146795129131539e-07, |
| "loss": 0.0002, |
| "reward": 0.19453393667936325, |
| "reward_std": 0.43898776825517416, |
| "rewards/reward_func": 0.19453393667936325, |
| "step": 6640 |
| }, |
| { |
| "completion_length": 155.34375, |
| "epoch": 0.8896025692492975, |
| "grad_norm": 2.140625, |
| "kl": 0.004799488058779389, |
| "learning_rate": 1.1039743075070253e-07, |
| "loss": 0.0002, |
| "reward": 0.44809896126389503, |
| "reward_std": 0.476587675511837, |
| "rewards/reward_func": 0.44809896126389503, |
| "step": 6648 |
| }, |
| { |
| "completion_length": 182.6640625, |
| "epoch": 0.8906730897899103, |
| "grad_norm": 3.328125, |
| "kl": 0.0043890890083275735, |
| "learning_rate": 1.0932691021008965e-07, |
| "loss": 0.0002, |
| "reward": 0.289157398045063, |
| "reward_std": 0.5493966788053513, |
| "rewards/reward_func": 0.289157398045063, |
| "step": 6656 |
| }, |
| { |
| "completion_length": 150.7421875, |
| "epoch": 0.8917436103305232, |
| "grad_norm": 3.234375, |
| "kl": 0.004936346551403403, |
| "learning_rate": 1.0825638966947678e-07, |
| "loss": 0.0002, |
| "reward": 0.5019879713654518, |
| "reward_std": 0.550602201372385, |
| "rewards/reward_func": 0.5019879713654518, |
| "step": 6664 |
| }, |
| { |
| "completion_length": 141.7265625, |
| "epoch": 0.892814130871136, |
| "grad_norm": 5.34375, |
| "kl": 0.0059457606403157115, |
| "learning_rate": 1.071858691288639e-07, |
| "loss": 0.0002, |
| "reward": 0.4372959118336439, |
| "reward_std": 0.46707610227167606, |
| "rewards/reward_func": 0.4372959118336439, |
| "step": 6672 |
| }, |
| { |
| "completion_length": 170.8046875, |
| "epoch": 0.893884651411749, |
| "grad_norm": 3.703125, |
| "kl": 0.004617019789293408, |
| "learning_rate": 1.0611534858825104e-07, |
| "loss": 0.0002, |
| "reward": 0.06940314406529069, |
| "reward_std": 0.6122306901961565, |
| "rewards/reward_func": 0.06940314406529069, |
| "step": 6680 |
| }, |
| { |
| "completion_length": 177.7265625, |
| "epoch": 0.8949551719523618, |
| "grad_norm": 2.578125, |
| "kl": 0.004395580617710948, |
| "learning_rate": 1.0504482804763816e-07, |
| "loss": 0.0002, |
| "reward": 0.09883344545960426, |
| "reward_std": 0.5640975758433342, |
| "rewards/reward_func": 0.09883344545960426, |
| "step": 6688 |
| }, |
| { |
| "completion_length": 179.0859375, |
| "epoch": 0.8960256924929747, |
| "grad_norm": 4.40625, |
| "kl": 0.004834166058572009, |
| "learning_rate": 1.0397430750702528e-07, |
| "loss": 0.0002, |
| "reward": 0.2342253029346466, |
| "reward_std": 0.6865712143480778, |
| "rewards/reward_func": 0.2342253029346466, |
| "step": 6696 |
| }, |
| { |
| "completion_length": 161.90625, |
| "epoch": 0.8970962130335876, |
| "grad_norm": 5.90625, |
| "kl": 0.004615583224222064, |
| "learning_rate": 1.0290378696641242e-07, |
| "loss": 0.0002, |
| "reward": 0.18776031211018562, |
| "reward_std": 0.5335487443953753, |
| "rewards/reward_func": 0.18776031211018562, |
| "step": 6704 |
| }, |
| { |
| "completion_length": 191.4453125, |
| "epoch": 0.8981667335742004, |
| "grad_norm": 4.46875, |
| "kl": 0.004502045980188996, |
| "learning_rate": 1.0183326642579954e-07, |
| "loss": 0.0002, |
| "reward": 0.22535105049610138, |
| "reward_std": 0.4681578129529953, |
| "rewards/reward_func": 0.22535105049610138, |
| "step": 6712 |
| }, |
| { |
| "completion_length": 187.609375, |
| "epoch": 0.8992372541148134, |
| "grad_norm": 4.875, |
| "kl": 0.004247891949489713, |
| "learning_rate": 1.0076274588518667e-07, |
| "loss": 0.0002, |
| "reward": 0.1750158555805683, |
| "reward_std": 0.6813812926411629, |
| "rewards/reward_func": 0.1750158555805683, |
| "step": 6720 |
| }, |
| { |
| "completion_length": 162.8203125, |
| "epoch": 0.9003077746554262, |
| "grad_norm": 4.25, |
| "kl": 0.004933495947625488, |
| "learning_rate": 9.96922253445738e-08, |
| "loss": 0.0002, |
| "reward": 0.23774974327534437, |
| "reward_std": 0.48002783581614494, |
| "rewards/reward_func": 0.23774974327534437, |
| "step": 6728 |
| }, |
| { |
| "completion_length": 162.859375, |
| "epoch": 0.9013782951960391, |
| "grad_norm": 4.5625, |
| "kl": 0.005467957467772067, |
| "learning_rate": 9.862170480396093e-08, |
| "loss": 0.0002, |
| "reward": -0.0023063644766807556, |
| "reward_std": 0.564548920840025, |
| "rewards/reward_func": -0.0023063644766807556, |
| "step": 6736 |
| }, |
| { |
| "completion_length": 187.5078125, |
| "epoch": 0.9024488157366519, |
| "grad_norm": 5.84375, |
| "kl": 0.004037181934108958, |
| "learning_rate": 9.755118426334805e-08, |
| "loss": 0.0002, |
| "reward": 0.27846864983439445, |
| "reward_std": 0.5254440493881702, |
| "rewards/reward_func": 0.27846864983439445, |
| "step": 6744 |
| }, |
| { |
| "completion_length": 174.0, |
| "epoch": 0.9035193362772648, |
| "grad_norm": 3.046875, |
| "kl": 0.004285787290427834, |
| "learning_rate": 9.648066372273519e-08, |
| "loss": 0.0002, |
| "reward": 0.22628629952669144, |
| "reward_std": 0.49990267865359783, |
| "rewards/reward_func": 0.22628629952669144, |
| "step": 6752 |
| }, |
| { |
| "completion_length": 164.0, |
| "epoch": 0.9045898568178777, |
| "grad_norm": 4.15625, |
| "kl": 0.005695787549484521, |
| "learning_rate": 9.54101431821223e-08, |
| "loss": 0.0002, |
| "reward": 0.26332173496484756, |
| "reward_std": 0.6010408755391836, |
| "rewards/reward_func": 0.26332173496484756, |
| "step": 6760 |
| }, |
| { |
| "completion_length": 165.484375, |
| "epoch": 0.9056603773584906, |
| "grad_norm": 2.65625, |
| "kl": 0.0045514948724303395, |
| "learning_rate": 9.433962264150943e-08, |
| "loss": 0.0002, |
| "reward": 0.1936313882470131, |
| "reward_std": 0.6337927635759115, |
| "rewards/reward_func": 0.1936313882470131, |
| "step": 6768 |
| }, |
| { |
| "completion_length": 161.5078125, |
| "epoch": 0.9067308978991034, |
| "grad_norm": 4.5, |
| "kl": 0.0048291504790540785, |
| "learning_rate": 9.326910210089655e-08, |
| "loss": 0.0002, |
| "reward": 0.137207493185997, |
| "reward_std": 0.5269910991191864, |
| "rewards/reward_func": 0.137207493185997, |
| "step": 6776 |
| }, |
| { |
| "completion_length": 152.515625, |
| "epoch": 0.9078014184397163, |
| "grad_norm": 4.34375, |
| "kl": 0.0052593986911233515, |
| "learning_rate": 9.219858156028367e-08, |
| "loss": 0.0002, |
| "reward": 0.5747008826583624, |
| "reward_std": 0.4051123149693012, |
| "rewards/reward_func": 0.5747008826583624, |
| "step": 6784 |
| }, |
| { |
| "completion_length": 182.9296875, |
| "epoch": 0.9088719389803291, |
| "grad_norm": 6.6875, |
| "kl": 0.004641913692466915, |
| "learning_rate": 9.112806101967081e-08, |
| "loss": 0.0002, |
| "reward": 0.21289030835032463, |
| "reward_std": 0.48070234432816505, |
| "rewards/reward_func": 0.21289030835032463, |
| "step": 6792 |
| }, |
| { |
| "completion_length": 174.640625, |
| "epoch": 0.9099424595209421, |
| "grad_norm": 3.25, |
| "kl": 0.004628196998964995, |
| "learning_rate": 9.005754047905793e-08, |
| "loss": 0.0002, |
| "reward": 0.1559174619615078, |
| "reward_std": 0.6800275854766369, |
| "rewards/reward_func": 0.1559174619615078, |
| "step": 6800 |
| }, |
| { |
| "completion_length": 157.203125, |
| "epoch": 0.911012980061555, |
| "grad_norm": 3.40625, |
| "kl": 0.006008526281220838, |
| "learning_rate": 8.898701993844506e-08, |
| "loss": 0.0002, |
| "reward": 0.12123461440205574, |
| "reward_std": 0.5197535315528512, |
| "rewards/reward_func": 0.12123461440205574, |
| "step": 6808 |
| }, |
| { |
| "completion_length": 164.4609375, |
| "epoch": 0.9120835006021678, |
| "grad_norm": 3.734375, |
| "kl": 0.004909445357043296, |
| "learning_rate": 8.791649939783219e-08, |
| "loss": 0.0002, |
| "reward": 0.4351821830496192, |
| "reward_std": 0.5581427849829197, |
| "rewards/reward_func": 0.4351821830496192, |
| "step": 6816 |
| }, |
| { |
| "completion_length": 157.296875, |
| "epoch": 0.9131540211427807, |
| "grad_norm": 2.625, |
| "kl": 0.004735041700769216, |
| "learning_rate": 8.684597885721932e-08, |
| "loss": 0.0002, |
| "reward": 0.209370581433177, |
| "reward_std": 0.5503848614171147, |
| "rewards/reward_func": 0.209370581433177, |
| "step": 6824 |
| }, |
| { |
| "completion_length": 161.8046875, |
| "epoch": 0.9142245416833935, |
| "grad_norm": 4.78125, |
| "kl": 0.004196583904558793, |
| "learning_rate": 8.577545831660644e-08, |
| "loss": 0.0002, |
| "reward": 0.29275982081890106, |
| "reward_std": 0.4565849918872118, |
| "rewards/reward_func": 0.29275982081890106, |
| "step": 6832 |
| }, |
| { |
| "completion_length": 166.5625, |
| "epoch": 0.9152950622240065, |
| "grad_norm": 4.09375, |
| "kl": 0.004536589724011719, |
| "learning_rate": 8.470493777599358e-08, |
| "loss": 0.0002, |
| "reward": 0.19659875519573689, |
| "reward_std": 0.5807360988110304, |
| "rewards/reward_func": 0.19659875519573689, |
| "step": 6840 |
| }, |
| { |
| "completion_length": 172.1328125, |
| "epoch": 0.9163655827646193, |
| "grad_norm": 3.921875, |
| "kl": 0.004410766297951341, |
| "learning_rate": 8.36344172353807e-08, |
| "loss": 0.0002, |
| "reward": 0.03568706847727299, |
| "reward_std": 0.6016153171658516, |
| "rewards/reward_func": 0.03568706847727299, |
| "step": 6848 |
| }, |
| { |
| "completion_length": 178.8203125, |
| "epoch": 0.9174361033052322, |
| "grad_norm": 3.484375, |
| "kl": 0.00475726873264648, |
| "learning_rate": 8.256389669476782e-08, |
| "loss": 0.0002, |
| "reward": 0.035759665071964264, |
| "reward_std": 0.5136286579072475, |
| "rewards/reward_func": 0.035759665071964264, |
| "step": 6856 |
| }, |
| { |
| "completion_length": 158.8671875, |
| "epoch": 0.918506623845845, |
| "grad_norm": 4.1875, |
| "kl": 0.004829802084714174, |
| "learning_rate": 8.149337615415496e-08, |
| "loss": 0.0002, |
| "reward": 0.43006047047674656, |
| "reward_std": 0.5332435881718993, |
| "rewards/reward_func": 0.43006047047674656, |
| "step": 6864 |
| }, |
| { |
| "completion_length": 158.5078125, |
| "epoch": 0.9195771443864579, |
| "grad_norm": 2.75, |
| "kl": 0.004407216591062024, |
| "learning_rate": 8.042285561354208e-08, |
| "loss": 0.0002, |
| "reward": 0.42078845389187336, |
| "reward_std": 0.5134297851473093, |
| "rewards/reward_func": 0.42078845389187336, |
| "step": 6872 |
| }, |
| { |
| "completion_length": 135.34375, |
| "epoch": 0.9206476649270708, |
| "grad_norm": 6.0625, |
| "kl": 0.006246095523238182, |
| "learning_rate": 7.935233507292921e-08, |
| "loss": 0.0002, |
| "reward": 0.2924302965402603, |
| "reward_std": 0.5908289672806859, |
| "rewards/reward_func": 0.2924302965402603, |
| "step": 6880 |
| }, |
| { |
| "completion_length": 152.2734375, |
| "epoch": 0.9217181854676837, |
| "grad_norm": 4.65625, |
| "kl": 0.005424696602858603, |
| "learning_rate": 7.828181453231633e-08, |
| "loss": 0.0002, |
| "reward": 0.3303174478933215, |
| "reward_std": 0.4598999507725239, |
| "rewards/reward_func": 0.3303174478933215, |
| "step": 6888 |
| }, |
| { |
| "completion_length": 126.9375, |
| "epoch": 0.9227887060082965, |
| "grad_norm": 4.1875, |
| "kl": 0.005991748097585514, |
| "learning_rate": 7.721129399170347e-08, |
| "loss": 0.0002, |
| "reward": 0.5790990553796291, |
| "reward_std": 0.4186181202530861, |
| "rewards/reward_func": 0.5790990553796291, |
| "step": 6896 |
| }, |
| { |
| "completion_length": 165.5859375, |
| "epoch": 0.9238592265489094, |
| "grad_norm": 4.21875, |
| "kl": 0.004846252937568352, |
| "learning_rate": 7.614077345109059e-08, |
| "loss": 0.0002, |
| "reward": 0.21630746312439442, |
| "reward_std": 0.5224413331598043, |
| "rewards/reward_func": 0.21630746312439442, |
| "step": 6904 |
| }, |
| { |
| "completion_length": 150.71875, |
| "epoch": 0.9249297470895222, |
| "grad_norm": 4.4375, |
| "kl": 0.004703165264800191, |
| "learning_rate": 7.507025291047772e-08, |
| "loss": 0.0002, |
| "reward": 0.49177973717451096, |
| "reward_std": 0.46361699141561985, |
| "rewards/reward_func": 0.49177973717451096, |
| "step": 6912 |
| }, |
| { |
| "completion_length": 182.15625, |
| "epoch": 0.9260002676301351, |
| "grad_norm": 4.75, |
| "kl": 0.004313376499339938, |
| "learning_rate": 7.399973236986485e-08, |
| "loss": 0.0002, |
| "reward": 0.12689837673678994, |
| "reward_std": 0.6647001150995493, |
| "rewards/reward_func": 0.12689837673678994, |
| "step": 6920 |
| }, |
| { |
| "completion_length": 155.71875, |
| "epoch": 0.9270707881707481, |
| "grad_norm": 3.203125, |
| "kl": 0.004443499754415825, |
| "learning_rate": 7.292921182925198e-08, |
| "loss": 0.0002, |
| "reward": 0.29615641478449106, |
| "reward_std": 0.5568899232894182, |
| "rewards/reward_func": 0.29615641478449106, |
| "step": 6928 |
| }, |
| { |
| "completion_length": 155.15625, |
| "epoch": 0.9281413087113609, |
| "grad_norm": 3.796875, |
| "kl": 0.0046441941522061825, |
| "learning_rate": 7.18586912886391e-08, |
| "loss": 0.0002, |
| "reward": 0.28125396044924855, |
| "reward_std": 0.4789434429258108, |
| "rewards/reward_func": 0.28125396044924855, |
| "step": 6936 |
| }, |
| { |
| "completion_length": 172.03125, |
| "epoch": 0.9292118292519738, |
| "grad_norm": 4.84375, |
| "kl": 0.0055302626569755375, |
| "learning_rate": 7.078817074802622e-08, |
| "loss": 0.0002, |
| "reward": 0.3774759713560343, |
| "reward_std": 0.5277713388204575, |
| "rewards/reward_func": 0.3774759713560343, |
| "step": 6944 |
| }, |
| { |
| "completion_length": 165.890625, |
| "epoch": 0.9302823497925866, |
| "grad_norm": 3.015625, |
| "kl": 0.004302407876821235, |
| "learning_rate": 6.971765020741336e-08, |
| "loss": 0.0002, |
| "reward": 0.35936339199543, |
| "reward_std": 0.5036177840083838, |
| "rewards/reward_func": 0.35936339199543, |
| "step": 6952 |
| }, |
| { |
| "completion_length": 163.0234375, |
| "epoch": 0.9313528703331995, |
| "grad_norm": 3.78125, |
| "kl": 0.0055159886833280325, |
| "learning_rate": 6.864712966680048e-08, |
| "loss": 0.0002, |
| "reward": 0.5663758469745517, |
| "reward_std": 0.4252478200942278, |
| "rewards/reward_func": 0.5663758469745517, |
| "step": 6960 |
| }, |
| { |
| "completion_length": 151.8984375, |
| "epoch": 0.9324233908738124, |
| "grad_norm": 4.625, |
| "kl": 0.005510843213414773, |
| "learning_rate": 6.757660912618761e-08, |
| "loss": 0.0002, |
| "reward": 0.3682685000821948, |
| "reward_std": 0.5349069032818079, |
| "rewards/reward_func": 0.3682685000821948, |
| "step": 6968 |
| }, |
| { |
| "completion_length": 150.671875, |
| "epoch": 0.9334939114144253, |
| "grad_norm": 4.5, |
| "kl": 0.005221706640440971, |
| "learning_rate": 6.650608858557472e-08, |
| "loss": 0.0002, |
| "reward": 0.48502959311008453, |
| "reward_std": 0.39637486822903156, |
| "rewards/reward_func": 0.48502959311008453, |
| "step": 6976 |
| }, |
| { |
| "completion_length": 170.3359375, |
| "epoch": 0.9345644319550381, |
| "grad_norm": 5.53125, |
| "kl": 0.004806717770406976, |
| "learning_rate": 6.543556804496186e-08, |
| "loss": 0.0002, |
| "reward": 0.20024515688419342, |
| "reward_std": 0.3813412329182029, |
| "rewards/reward_func": 0.20024515688419342, |
| "step": 6984 |
| }, |
| { |
| "completion_length": 165.296875, |
| "epoch": 0.935634952495651, |
| "grad_norm": 4.21875, |
| "kl": 0.0052095072460360825, |
| "learning_rate": 6.436504750434898e-08, |
| "loss": 0.0002, |
| "reward": 0.17343932949006557, |
| "reward_std": 0.542176740244031, |
| "rewards/reward_func": 0.17343932949006557, |
| "step": 6992 |
| }, |
| { |
| "completion_length": 169.6953125, |
| "epoch": 0.9367054730362638, |
| "grad_norm": 3.6875, |
| "kl": 0.00422157411230728, |
| "learning_rate": 6.329452696373611e-08, |
| "loss": 0.0002, |
| "reward": 0.06634609401226044, |
| "reward_std": 0.6044113449752331, |
| "rewards/reward_func": 0.06634609401226044, |
| "step": 7000 |
| }, |
| { |
| "completion_length": 146.5234375, |
| "epoch": 0.9377759935768768, |
| "grad_norm": 5.0625, |
| "kl": 0.005492849391885102, |
| "learning_rate": 6.222400642312324e-08, |
| "loss": 0.0002, |
| "reward": 0.23497827351093292, |
| "reward_std": 0.42981395684182644, |
| "rewards/reward_func": 0.23497827351093292, |
| "step": 7008 |
| }, |
| { |
| "completion_length": 170.328125, |
| "epoch": 0.9388465141174896, |
| "grad_norm": 4.65625, |
| "kl": 0.004708502208814025, |
| "learning_rate": 6.115348588251037e-08, |
| "loss": 0.0002, |
| "reward": -0.04050692915916443, |
| "reward_std": 0.6017125463113189, |
| "rewards/reward_func": -0.04050692915916443, |
| "step": 7016 |
| }, |
| { |
| "completion_length": 166.53125, |
| "epoch": 0.9399170346581025, |
| "grad_norm": 3.78125, |
| "kl": 0.0050715115503408015, |
| "learning_rate": 6.008296534189749e-08, |
| "loss": 0.0002, |
| "reward": 0.3130027763545513, |
| "reward_std": 0.47869889438152313, |
| "rewards/reward_func": 0.3130027763545513, |
| "step": 7024 |
| }, |
| { |
| "completion_length": 194.546875, |
| "epoch": 0.9409875551987154, |
| "grad_norm": 3.0625, |
| "kl": 0.003658687841380015, |
| "learning_rate": 5.901244480128462e-08, |
| "loss": 0.0001, |
| "reward": 0.33295007050037384, |
| "reward_std": 0.4185595214366913, |
| "rewards/reward_func": 0.33295007050037384, |
| "step": 7032 |
| }, |
| { |
| "completion_length": 180.828125, |
| "epoch": 0.9420580757393282, |
| "grad_norm": 2.59375, |
| "kl": 0.005314617330441251, |
| "learning_rate": 5.794192426067175e-08, |
| "loss": 0.0002, |
| "reward": 0.2711464911699295, |
| "reward_std": 0.529150040820241, |
| "rewards/reward_func": 0.2711464911699295, |
| "step": 7040 |
| }, |
| { |
| "completion_length": 166.5390625, |
| "epoch": 0.9431285962799412, |
| "grad_norm": 4.3125, |
| "kl": 0.005095012194942683, |
| "learning_rate": 5.6871403720058877e-08, |
| "loss": 0.0002, |
| "reward": 0.38804778829216957, |
| "reward_std": 0.6022228971123695, |
| "rewards/reward_func": 0.38804778829216957, |
| "step": 7048 |
| }, |
| { |
| "completion_length": 166.375, |
| "epoch": 0.944199116820554, |
| "grad_norm": 3.921875, |
| "kl": 0.005058724695118144, |
| "learning_rate": 5.5800883179446e-08, |
| "loss": 0.0002, |
| "reward": 0.201092598028481, |
| "reward_std": 0.5487896800041199, |
| "rewards/reward_func": 0.201092598028481, |
| "step": 7056 |
| }, |
| { |
| "completion_length": 175.8203125, |
| "epoch": 0.9452696373611669, |
| "grad_norm": 5.59375, |
| "kl": 0.0046728674788028, |
| "learning_rate": 5.4730362638833126e-08, |
| "loss": 0.0002, |
| "reward": 0.21519318595528603, |
| "reward_std": 0.6764262039214373, |
| "rewards/reward_func": 0.21519318595528603, |
| "step": 7064 |
| }, |
| { |
| "completion_length": 170.0625, |
| "epoch": 0.9463401579017797, |
| "grad_norm": 3.65625, |
| "kl": 0.00383082203916274, |
| "learning_rate": 5.3659842098220254e-08, |
| "loss": 0.0002, |
| "reward": 0.2285008803009987, |
| "reward_std": 0.47355389036238194, |
| "rewards/reward_func": 0.2285008803009987, |
| "step": 7072 |
| }, |
| { |
| "completion_length": 160.890625, |
| "epoch": 0.9474106784423926, |
| "grad_norm": 3.5, |
| "kl": 0.004751811851747334, |
| "learning_rate": 5.258932155760738e-08, |
| "loss": 0.0002, |
| "reward": 0.4270824361592531, |
| "reward_std": 0.41622067615389824, |
| "rewards/reward_func": 0.4270824361592531, |
| "step": 7080 |
| }, |
| { |
| "completion_length": 157.25, |
| "epoch": 0.9484811989830055, |
| "grad_norm": 4.875, |
| "kl": 0.005698841763660312, |
| "learning_rate": 5.151880101699451e-08, |
| "loss": 0.0002, |
| "reward": 0.30552778858691454, |
| "reward_std": 0.522463321685791, |
| "rewards/reward_func": 0.30552778858691454, |
| "step": 7088 |
| }, |
| { |
| "completion_length": 156.6796875, |
| "epoch": 0.9495517195236184, |
| "grad_norm": 3.890625, |
| "kl": 0.005308831227011979, |
| "learning_rate": 5.044828047638164e-08, |
| "loss": 0.0002, |
| "reward": 0.2509817620739341, |
| "reward_std": 0.5911254324018955, |
| "rewards/reward_func": 0.2509817620739341, |
| "step": 7096 |
| }, |
| { |
| "completion_length": 172.4609375, |
| "epoch": 0.9506222400642312, |
| "grad_norm": 3.40625, |
| "kl": 0.004089270456461236, |
| "learning_rate": 4.937775993576877e-08, |
| "loss": 0.0002, |
| "reward": 0.1337134689092636, |
| "reward_std": 0.34575022105127573, |
| "rewards/reward_func": 0.1337134689092636, |
| "step": 7104 |
| }, |
| { |
| "completion_length": 165.0625, |
| "epoch": 0.9516927606048441, |
| "grad_norm": 3.609375, |
| "kl": 0.004878541512880474, |
| "learning_rate": 4.8307239395155895e-08, |
| "loss": 0.0002, |
| "reward": 0.28754607075825334, |
| "reward_std": 0.49302330799400806, |
| "rewards/reward_func": 0.28754607075825334, |
| "step": 7112 |
| }, |
| { |
| "completion_length": 190.7109375, |
| "epoch": 0.9527632811454569, |
| "grad_norm": 3.265625, |
| "kl": 0.003856517461827025, |
| "learning_rate": 4.7236718854543023e-08, |
| "loss": 0.0002, |
| "reward": 0.05854572542011738, |
| "reward_std": 0.4840726386755705, |
| "rewards/reward_func": 0.05854572542011738, |
| "step": 7120 |
| }, |
| { |
| "completion_length": 178.7421875, |
| "epoch": 0.9538338016860699, |
| "grad_norm": 5.09375, |
| "kl": 0.005262946098810062, |
| "learning_rate": 4.616619831393015e-08, |
| "loss": 0.0002, |
| "reward": 0.1380448378622532, |
| "reward_std": 0.44803581573069096, |
| "rewards/reward_func": 0.1380448378622532, |
| "step": 7128 |
| }, |
| { |
| "completion_length": 162.75, |
| "epoch": 0.9549043222266828, |
| "grad_norm": 4.1875, |
| "kl": 0.0052962955087423325, |
| "learning_rate": 4.509567777331728e-08, |
| "loss": 0.0002, |
| "reward": 0.4494497813284397, |
| "reward_std": 0.4952176222577691, |
| "rewards/reward_func": 0.4494497813284397, |
| "step": 7136 |
| }, |
| { |
| "completion_length": 148.8671875, |
| "epoch": 0.9559748427672956, |
| "grad_norm": 3.703125, |
| "kl": 0.004964547406416386, |
| "learning_rate": 4.4025157232704395e-08, |
| "loss": 0.0002, |
| "reward": 0.4687324403785169, |
| "reward_std": 0.40094813890755177, |
| "rewards/reward_func": 0.4687324403785169, |
| "step": 7144 |
| }, |
| { |
| "completion_length": 197.28125, |
| "epoch": 0.9570453633079085, |
| "grad_norm": 4.09375, |
| "kl": 0.0037281967815943062, |
| "learning_rate": 4.295463669209152e-08, |
| "loss": 0.0001, |
| "reward": 0.02254125289618969, |
| "reward_std": 0.5664320774376392, |
| "rewards/reward_func": 0.02254125289618969, |
| "step": 7152 |
| }, |
| { |
| "completion_length": 148.703125, |
| "epoch": 0.9581158838485213, |
| "grad_norm": 3.796875, |
| "kl": 0.004984479019185528, |
| "learning_rate": 4.188411615147865e-08, |
| "loss": 0.0002, |
| "reward": 0.325860820710659, |
| "reward_std": 0.41767950914800167, |
| "rewards/reward_func": 0.325860820710659, |
| "step": 7160 |
| }, |
| { |
| "completion_length": 153.265625, |
| "epoch": 0.9591864043891342, |
| "grad_norm": 2.9375, |
| "kl": 0.005220895051024854, |
| "learning_rate": 4.081359561086578e-08, |
| "loss": 0.0002, |
| "reward": 0.4472418650984764, |
| "reward_std": 0.4130860110744834, |
| "rewards/reward_func": 0.4472418650984764, |
| "step": 7168 |
| }, |
| { |
| "completion_length": 169.0625, |
| "epoch": 0.9602569249297471, |
| "grad_norm": 4.5625, |
| "kl": 0.004276268708053976, |
| "learning_rate": 3.974307507025291e-08, |
| "loss": 0.0002, |
| "reward": 0.15905702486634254, |
| "reward_std": 0.4423768687993288, |
| "rewards/reward_func": 0.15905702486634254, |
| "step": 7176 |
| }, |
| { |
| "completion_length": 179.3125, |
| "epoch": 0.96132744547036, |
| "grad_norm": 3.234375, |
| "kl": 0.0035234860552009195, |
| "learning_rate": 3.8672554529640036e-08, |
| "loss": 0.0001, |
| "reward": 0.3482946362346411, |
| "reward_std": 0.6113561438396573, |
| "rewards/reward_func": 0.3482946362346411, |
| "step": 7184 |
| }, |
| { |
| "completion_length": 172.6796875, |
| "epoch": 0.9623979660109728, |
| "grad_norm": 3.890625, |
| "kl": 0.005020510870963335, |
| "learning_rate": 3.7602033989027164e-08, |
| "loss": 0.0002, |
| "reward": 0.12693638168275356, |
| "reward_std": 0.5951482262462378, |
| "rewards/reward_func": 0.12693638168275356, |
| "step": 7192 |
| }, |
| { |
| "completion_length": 161.015625, |
| "epoch": 0.9634684865515857, |
| "grad_norm": 2.765625, |
| "kl": 0.004839012573938817, |
| "learning_rate": 3.653151344841429e-08, |
| "loss": 0.0002, |
| "reward": 0.22009205259382725, |
| "reward_std": 0.607865285128355, |
| "rewards/reward_func": 0.22009205259382725, |
| "step": 7200 |
| }, |
| { |
| "completion_length": 158.515625, |
| "epoch": 0.9645390070921985, |
| "grad_norm": 4.71875, |
| "kl": 0.004412859241710976, |
| "learning_rate": 3.546099290780142e-08, |
| "loss": 0.0002, |
| "reward": 0.17766493232920766, |
| "reward_std": 0.6588699370622635, |
| "rewards/reward_func": 0.17766493232920766, |
| "step": 7208 |
| }, |
| { |
| "completion_length": 168.8828125, |
| "epoch": 0.9656095276328115, |
| "grad_norm": 3.859375, |
| "kl": 0.00440784459351562, |
| "learning_rate": 3.439047236718855e-08, |
| "loss": 0.0002, |
| "reward": 0.31805921625345945, |
| "reward_std": 0.5728737730532885, |
| "rewards/reward_func": 0.31805921625345945, |
| "step": 7216 |
| }, |
| { |
| "completion_length": 169.84375, |
| "epoch": 0.9666800481734243, |
| "grad_norm": 4.96875, |
| "kl": 0.004156895098276436, |
| "learning_rate": 3.331995182657567e-08, |
| "loss": 0.0002, |
| "reward": 0.13793382793664932, |
| "reward_std": 0.6552281193435192, |
| "rewards/reward_func": 0.13793382793664932, |
| "step": 7224 |
| }, |
| { |
| "completion_length": 140.65625, |
| "epoch": 0.9677505687140372, |
| "grad_norm": 4.21875, |
| "kl": 0.00553873396711424, |
| "learning_rate": 3.22494312859628e-08, |
| "loss": 0.0002, |
| "reward": 0.35460880724713206, |
| "reward_std": 0.3868136703968048, |
| "rewards/reward_func": 0.35460880724713206, |
| "step": 7232 |
| }, |
| { |
| "completion_length": 198.5546875, |
| "epoch": 0.96882108925465, |
| "grad_norm": 3.4375, |
| "kl": 0.004783908079843968, |
| "learning_rate": 3.1178910745349926e-08, |
| "loss": 0.0002, |
| "reward": 0.1244197292253375, |
| "reward_std": 0.5501943584531546, |
| "rewards/reward_func": 0.1244197292253375, |
| "step": 7240 |
| }, |
| { |
| "completion_length": 187.1875, |
| "epoch": 0.9698916097952629, |
| "grad_norm": 3.703125, |
| "kl": 0.004152452602284029, |
| "learning_rate": 3.0108390204737054e-08, |
| "loss": 0.0002, |
| "reward": 0.17286342615261674, |
| "reward_std": 0.461435928940773, |
| "rewards/reward_func": 0.17286342615261674, |
| "step": 7248 |
| }, |
| { |
| "completion_length": 122.7421875, |
| "epoch": 0.9709621303358759, |
| "grad_norm": 5.25, |
| "kl": 0.0060864063561894, |
| "learning_rate": 2.903786966412418e-08, |
| "loss": 0.0002, |
| "reward": 0.5303192976862192, |
| "reward_std": 0.4443682935088873, |
| "rewards/reward_func": 0.5303192976862192, |
| "step": 7256 |
| }, |
| { |
| "completion_length": 174.3984375, |
| "epoch": 0.9720326508764887, |
| "grad_norm": 3.546875, |
| "kl": 0.004882953886408359, |
| "learning_rate": 2.7967349123511307e-08, |
| "loss": 0.0002, |
| "reward": 0.40086287446320057, |
| "reward_std": 0.41974346339702606, |
| "rewards/reward_func": 0.40086287446320057, |
| "step": 7264 |
| }, |
| { |
| "completion_length": 191.515625, |
| "epoch": 0.9731031714171016, |
| "grad_norm": 2.28125, |
| "kl": 0.004744857433252037, |
| "learning_rate": 2.6896828582898435e-08, |
| "loss": 0.0002, |
| "reward": -0.035455760546028614, |
| "reward_std": 0.5775532089173794, |
| "rewards/reward_func": -0.035455760546028614, |
| "step": 7272 |
| }, |
| { |
| "completion_length": 181.6484375, |
| "epoch": 0.9741736919577144, |
| "grad_norm": 4.21875, |
| "kl": 0.004760511888889596, |
| "learning_rate": 2.5826308042285557e-08, |
| "loss": 0.0002, |
| "reward": 0.09731801599264145, |
| "reward_std": 0.5751747917383909, |
| "rewards/reward_func": 0.09731801599264145, |
| "step": 7280 |
| }, |
| { |
| "completion_length": 195.15625, |
| "epoch": 0.9752442124983273, |
| "grad_norm": 3.296875, |
| "kl": 0.004167939972830936, |
| "learning_rate": 2.4755787501672685e-08, |
| "loss": 0.0002, |
| "reward": -0.007298767566680908, |
| "reward_std": 0.47776357643306255, |
| "rewards/reward_func": -0.007298767566680908, |
| "step": 7288 |
| }, |
| { |
| "completion_length": 143.59375, |
| "epoch": 0.9763147330389402, |
| "grad_norm": 4.25, |
| "kl": 0.005093816755106673, |
| "learning_rate": 2.3685266961059813e-08, |
| "loss": 0.0002, |
| "reward": 0.3966046618297696, |
| "reward_std": 0.4395467219874263, |
| "rewards/reward_func": 0.3966046618297696, |
| "step": 7296 |
| }, |
| { |
| "completion_length": 196.1015625, |
| "epoch": 0.9773852535795531, |
| "grad_norm": 4.96875, |
| "kl": 0.003976713371230289, |
| "learning_rate": 2.261474642044694e-08, |
| "loss": 0.0002, |
| "reward": 0.05790833756327629, |
| "reward_std": 0.45180133171379566, |
| "rewards/reward_func": 0.05790833756327629, |
| "step": 7304 |
| }, |
| { |
| "completion_length": 158.6171875, |
| "epoch": 0.9784557741201659, |
| "grad_norm": 4.125, |
| "kl": 0.004875800863374025, |
| "learning_rate": 2.154422587983407e-08, |
| "loss": 0.0002, |
| "reward": 0.3239698866382241, |
| "reward_std": 0.6013830993324518, |
| "rewards/reward_func": 0.3239698866382241, |
| "step": 7312 |
| }, |
| { |
| "completion_length": 137.140625, |
| "epoch": 0.9795262946607788, |
| "grad_norm": 2.796875, |
| "kl": 0.0066487987351138145, |
| "learning_rate": 2.0473705339221198e-08, |
| "loss": 0.0003, |
| "reward": 0.42458152025938034, |
| "reward_std": 0.4684657920151949, |
| "rewards/reward_func": 0.42458152025938034, |
| "step": 7320 |
| }, |
| { |
| "completion_length": 184.9609375, |
| "epoch": 0.9805968152013916, |
| "grad_norm": 4.0625, |
| "kl": 0.003812081238720566, |
| "learning_rate": 1.9403184798608323e-08, |
| "loss": 0.0002, |
| "reward": 0.2897532992064953, |
| "reward_std": 0.6526387594640255, |
| "rewards/reward_func": 0.2897532992064953, |
| "step": 7328 |
| }, |
| { |
| "completion_length": 170.515625, |
| "epoch": 0.9816673357420046, |
| "grad_norm": 2.625, |
| "kl": 0.004741923592519015, |
| "learning_rate": 1.8332664257995448e-08, |
| "loss": 0.0002, |
| "reward": 0.1217675432562828, |
| "reward_std": 0.5977188646793365, |
| "rewards/reward_func": 0.1217675432562828, |
| "step": 7336 |
| }, |
| { |
| "completion_length": 215.9609375, |
| "epoch": 0.9827378562826175, |
| "grad_norm": 2.90625, |
| "kl": 0.0035596474481280893, |
| "learning_rate": 1.7262143717382576e-08, |
| "loss": 0.0001, |
| "reward": 0.14845915883779526, |
| "reward_std": 0.5360017623752356, |
| "rewards/reward_func": 0.14845915883779526, |
| "step": 7344 |
| }, |
| { |
| "completion_length": 219.90625, |
| "epoch": 0.9838083768232303, |
| "grad_norm": 4.5, |
| "kl": 0.00427751979441382, |
| "learning_rate": 1.6191623176769704e-08, |
| "loss": 0.0002, |
| "reward": -0.1681067142635584, |
| "reward_std": 0.5721786804497242, |
| "rewards/reward_func": -0.1681067142635584, |
| "step": 7352 |
| }, |
| { |
| "completion_length": 175.6953125, |
| "epoch": 0.9848788973638432, |
| "grad_norm": 3.890625, |
| "kl": 0.0041801958286669105, |
| "learning_rate": 1.5121102636156832e-08, |
| "loss": 0.0002, |
| "reward": 0.06939083803445101, |
| "reward_std": 0.6783208139240742, |
| "rewards/reward_func": 0.06939083803445101, |
| "step": 7360 |
| }, |
| { |
| "completion_length": 144.1875, |
| "epoch": 0.985949417904456, |
| "grad_norm": 3.828125, |
| "kl": 0.004889452655334026, |
| "learning_rate": 1.4050582095543959e-08, |
| "loss": 0.0002, |
| "reward": 0.4226034879684448, |
| "reward_std": 0.5269232532009482, |
| "rewards/reward_func": 0.4226034879684448, |
| "step": 7368 |
| }, |
| { |
| "completion_length": 163.59375, |
| "epoch": 0.987019938445069, |
| "grad_norm": 4.375, |
| "kl": 0.004188012710073963, |
| "learning_rate": 1.2980061554931083e-08, |
| "loss": 0.0002, |
| "reward": 0.39765281416475773, |
| "reward_std": 0.6691529527306557, |
| "rewards/reward_func": 0.39765281416475773, |
| "step": 7376 |
| }, |
| { |
| "completion_length": 171.4765625, |
| "epoch": 0.9880904589856818, |
| "grad_norm": 4.34375, |
| "kl": 0.004243878676788881, |
| "learning_rate": 1.1909541014318212e-08, |
| "loss": 0.0002, |
| "reward": 0.02129072230309248, |
| "reward_std": 0.5695307403802872, |
| "rewards/reward_func": 0.02129072230309248, |
| "step": 7384 |
| }, |
| { |
| "completion_length": 174.0625, |
| "epoch": 0.9891609795262947, |
| "grad_norm": 6.34375, |
| "kl": 0.004543175353319384, |
| "learning_rate": 1.0839020473705338e-08, |
| "loss": 0.0002, |
| "reward": 0.19244904909282923, |
| "reward_std": 0.6081365495920181, |
| "rewards/reward_func": 0.19244904909282923, |
| "step": 7392 |
| }, |
| { |
| "completion_length": 168.640625, |
| "epoch": 0.9902315000669075, |
| "grad_norm": 4.09375, |
| "kl": 0.004472158325370401, |
| "learning_rate": 9.768499933092466e-09, |
| "loss": 0.0002, |
| "reward": 0.23385661654174328, |
| "reward_std": 0.5880191251635551, |
| "rewards/reward_func": 0.23385661654174328, |
| "step": 7400 |
| }, |
| { |
| "completion_length": 167.7265625, |
| "epoch": 0.9913020206075204, |
| "grad_norm": 3.703125, |
| "kl": 0.004636053316062316, |
| "learning_rate": 8.697979392479593e-09, |
| "loss": 0.0002, |
| "reward": 0.1343773351982236, |
| "reward_std": 0.4601830244064331, |
| "rewards/reward_func": 0.1343773351982236, |
| "step": 7408 |
| }, |
| { |
| "completion_length": 181.546875, |
| "epoch": 0.9923725411481332, |
| "grad_norm": 3.515625, |
| "kl": 0.004200820723781362, |
| "learning_rate": 7.627458851866721e-09, |
| "loss": 0.0002, |
| "reward": 0.4208897929638624, |
| "reward_std": 0.5252015050500631, |
| "rewards/reward_func": 0.4208897929638624, |
| "step": 7416 |
| }, |
| { |
| "completion_length": 188.3671875, |
| "epoch": 0.9934430616887462, |
| "grad_norm": 3.90625, |
| "kl": 0.00420898012816906, |
| "learning_rate": 6.5569383112538474e-09, |
| "loss": 0.0002, |
| "reward": 0.01703132875263691, |
| "reward_std": 0.659230999648571, |
| "rewards/reward_func": 0.01703132875263691, |
| "step": 7424 |
| }, |
| { |
| "completion_length": 180.5546875, |
| "epoch": 0.994513582229359, |
| "grad_norm": 3.203125, |
| "kl": 0.004538079345365986, |
| "learning_rate": 5.486417770640974e-09, |
| "loss": 0.0002, |
| "reward": 0.2158992402255535, |
| "reward_std": 0.6045026630163193, |
| "rewards/reward_func": 0.2158992402255535, |
| "step": 7432 |
| }, |
| { |
| "completion_length": 162.9453125, |
| "epoch": 0.9955841027699719, |
| "grad_norm": 3.734375, |
| "kl": 0.004193991771899164, |
| "learning_rate": 4.4158972300281005e-09, |
| "loss": 0.0002, |
| "reward": 0.1715514180250466, |
| "reward_std": 0.6823503784835339, |
| "rewards/reward_func": 0.1715514180250466, |
| "step": 7440 |
| }, |
| { |
| "completion_length": 173.65625, |
| "epoch": 0.9966546233105847, |
| "grad_norm": 6.21875, |
| "kl": 0.004881884902715683, |
| "learning_rate": 3.345376689415228e-09, |
| "loss": 0.0002, |
| "reward": 0.003267081454396248, |
| "reward_std": 0.6868100538849831, |
| "rewards/reward_func": 0.003267081454396248, |
| "step": 7448 |
| }, |
| { |
| "completion_length": 141.2421875, |
| "epoch": 0.9977251438511976, |
| "grad_norm": 4.53125, |
| "kl": 0.005532538751140237, |
| "learning_rate": 2.2748561488023547e-09, |
| "loss": 0.0002, |
| "reward": 0.541567288339138, |
| "reward_std": 0.31684359908103943, |
| "rewards/reward_func": 0.541567288339138, |
| "step": 7456 |
| }, |
| { |
| "completion_length": 157.5546875, |
| "epoch": 0.9987956643918106, |
| "grad_norm": 3.953125, |
| "kl": 0.004744258592836559, |
| "learning_rate": 1.2043356081894823e-09, |
| "loss": 0.0002, |
| "reward": 0.255828570574522, |
| "reward_std": 0.6103123240172863, |
| "rewards/reward_func": 0.255828570574522, |
| "step": 7464 |
| }, |
| { |
| "completion_length": 151.453125, |
| "epoch": 0.9998661849324234, |
| "grad_norm": 3.484375, |
| "kl": 0.004390858637634665, |
| "learning_rate": 1.338150675766091e-10, |
| "loss": 0.0002, |
| "reward": 0.26983874663710594, |
| "reward_std": 0.5870513431727886, |
| "rewards/reward_func": 0.26983874663710594, |
| "step": 7472 |
| } |
| ], |
| "logging_steps": 8, |
| "max_steps": 7473, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1868, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|