concise_grpo / trainer_state.json
SeoulForest's picture
Upload folder using huggingface_hub
0064416 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 7473,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 196.4453125,
"epoch": 0.001070520540612873,
"grad_norm": 5.03125,
"kl": 0.00023896918719401583,
"learning_rate": 9.98929479459387e-07,
"loss": 0.0,
"reward": 0.06715917773544788,
"reward_std": 0.6129379905760288,
"rewards/reward_func": 0.06715917773544788,
"step": 8
},
{
"completion_length": 177.9296875,
"epoch": 0.002141041081225746,
"grad_norm": 4.15625,
"kl": 0.00027647913702821825,
"learning_rate": 9.978589589187743e-07,
"loss": 0.0,
"reward": 0.02951172273606062,
"reward_std": 0.5923765227198601,
"rewards/reward_func": 0.02951172273606062,
"step": 16
},
{
"completion_length": 173.359375,
"epoch": 0.003211561621838619,
"grad_norm": 3.34375,
"kl": 0.0002405872492090566,
"learning_rate": 9.967884383781614e-07,
"loss": 0.0,
"reward": 0.12952731922268867,
"reward_std": 0.6536133792251348,
"rewards/reward_func": 0.12952731922268867,
"step": 24
},
{
"completion_length": 170.0859375,
"epoch": 0.004282082162451492,
"grad_norm": 3.21875,
"kl": 0.0002442408094793791,
"learning_rate": 9.957179178375484e-07,
"loss": 0.0,
"reward": 0.3293330520391464,
"reward_std": 0.5732803735882044,
"rewards/reward_func": 0.3293330520391464,
"step": 32
},
{
"completion_length": 190.3828125,
"epoch": 0.005352602703064365,
"grad_norm": 4.15625,
"kl": 0.00024058844792307355,
"learning_rate": 9.946473972969355e-07,
"loss": 0.0,
"reward": 0.36058899760246277,
"reward_std": 0.5922529064118862,
"rewards/reward_func": 0.36058899760246277,
"step": 40
},
{
"completion_length": 228.4453125,
"epoch": 0.006423123243677238,
"grad_norm": 2.765625,
"kl": 0.00029027340133325197,
"learning_rate": 9.935768767563228e-07,
"loss": 0.0,
"reward": 0.026657558977603912,
"reward_std": 0.48614570777863264,
"rewards/reward_func": 0.026657558977603912,
"step": 48
},
{
"completion_length": 182.453125,
"epoch": 0.007493643784290111,
"grad_norm": 4.40625,
"kl": 0.00024143315386027098,
"learning_rate": 9.925063562157099e-07,
"loss": 0.0,
"reward": 0.3887675404548645,
"reward_std": 0.53492078371346,
"rewards/reward_func": 0.3887675404548645,
"step": 56
},
{
"completion_length": 223.5703125,
"epoch": 0.008564164324902984,
"grad_norm": 2.578125,
"kl": 0.000258006857620785,
"learning_rate": 9.91435835675097e-07,
"loss": 0.0,
"reward": 0.31422215048223734,
"reward_std": 0.527774453163147,
"rewards/reward_func": 0.31422215048223734,
"step": 64
},
{
"completion_length": 207.1953125,
"epoch": 0.009634684865515858,
"grad_norm": 4.03125,
"kl": 0.0002873431112675462,
"learning_rate": 9.90365315134484e-07,
"loss": 0.0,
"reward": 0.23160290531814098,
"reward_std": 0.5312692299485207,
"rewards/reward_func": 0.23160290531814098,
"step": 72
},
{
"completion_length": 170.234375,
"epoch": 0.01070520540612873,
"grad_norm": 5.09375,
"kl": 0.0002804081450449303,
"learning_rate": 9.892947945938713e-07,
"loss": 0.0,
"reward": 0.2816220009699464,
"reward_std": 0.600088307633996,
"rewards/reward_func": 0.2816220009699464,
"step": 80
},
{
"completion_length": 175.84375,
"epoch": 0.011775725946741603,
"grad_norm": 4.125,
"kl": 0.00024894280068110675,
"learning_rate": 9.882242740532583e-07,
"loss": 0.0,
"reward": 0.42427181545645,
"reward_std": 0.5529402792453766,
"rewards/reward_func": 0.42427181545645,
"step": 88
},
{
"completion_length": 258.6484375,
"epoch": 0.012846246487354477,
"grad_norm": 4.03125,
"kl": 0.00030983560463937465,
"learning_rate": 9.871537535126454e-07,
"loss": 0.0,
"reward": -0.342121371999383,
"reward_std": 0.5698626078665257,
"rewards/reward_func": -0.342121371999383,
"step": 96
},
{
"completion_length": 225.90625,
"epoch": 0.013916767027967349,
"grad_norm": 3.484375,
"kl": 0.0002933137984655332,
"learning_rate": 9.860832329720325e-07,
"loss": 0.0,
"reward": 0.12576067401096225,
"reward_std": 0.6368702314794064,
"rewards/reward_func": 0.12576067401096225,
"step": 104
},
{
"completion_length": 168.5703125,
"epoch": 0.014987287568580221,
"grad_norm": 3.765625,
"kl": 0.0003037826609215699,
"learning_rate": 9.850127124314198e-07,
"loss": 0.0,
"reward": 0.17080755531787872,
"reward_std": 0.5442508868873119,
"rewards/reward_func": 0.17080755531787872,
"step": 112
},
{
"completion_length": 179.1640625,
"epoch": 0.016057808109193095,
"grad_norm": 3.390625,
"kl": 0.0002824857710947981,
"learning_rate": 9.839421918908068e-07,
"loss": 0.0,
"reward": 0.18145466595888138,
"reward_std": 0.5059491451829672,
"rewards/reward_func": 0.18145466595888138,
"step": 120
},
{
"completion_length": 191.78125,
"epoch": 0.017128328649805968,
"grad_norm": 3.75,
"kl": 0.00031089498952496797,
"learning_rate": 9.82871671350194e-07,
"loss": 0.0,
"reward": 0.2814232921227813,
"reward_std": 0.5498775225132704,
"rewards/reward_func": 0.2814232921227813,
"step": 128
},
{
"completion_length": 154.2578125,
"epoch": 0.01819884919041884,
"grad_norm": 3.140625,
"kl": 0.0003297478033346124,
"learning_rate": 9.818011508095812e-07,
"loss": 0.0,
"reward": 0.46491820737719536,
"reward_std": 0.48998717963695526,
"rewards/reward_func": 0.46491820737719536,
"step": 136
},
{
"completion_length": 180.1171875,
"epoch": 0.019269369731031716,
"grad_norm": 4.53125,
"kl": 0.00035076765561825596,
"learning_rate": 9.807306302689682e-07,
"loss": 0.0,
"reward": 0.27495551854372025,
"reward_std": 0.6060709934681654,
"rewards/reward_func": 0.27495551854372025,
"step": 144
},
{
"completion_length": 219.390625,
"epoch": 0.020339890271644588,
"grad_norm": 3.203125,
"kl": 0.00032744045893196017,
"learning_rate": 9.796601097283553e-07,
"loss": 0.0,
"reward": -0.04555501043796539,
"reward_std": 0.557650757022202,
"rewards/reward_func": -0.04555501043796539,
"step": 152
},
{
"completion_length": 197.6875,
"epoch": 0.02141041081225746,
"grad_norm": 4.1875,
"kl": 0.0003747515474969987,
"learning_rate": 9.785895891877424e-07,
"loss": 0.0,
"reward": 0.04799163248389959,
"reward_std": 0.7354221493005753,
"rewards/reward_func": 0.04799163248389959,
"step": 160
},
{
"completion_length": 158.921875,
"epoch": 0.022480931352870333,
"grad_norm": 4.71875,
"kl": 0.00034361303187324665,
"learning_rate": 9.775190686471297e-07,
"loss": 0.0,
"reward": 0.46762343868613243,
"reward_std": 0.5215234383940697,
"rewards/reward_func": 0.46762343868613243,
"step": 168
},
{
"completion_length": 179.4296875,
"epoch": 0.023551451893483205,
"grad_norm": 2.390625,
"kl": 0.0003124878894595895,
"learning_rate": 9.764485481065167e-07,
"loss": 0.0,
"reward": 0.15805792342871428,
"reward_std": 0.6787187531590462,
"rewards/reward_func": 0.15805792342871428,
"step": 176
},
{
"completion_length": 185.5546875,
"epoch": 0.02462197243409608,
"grad_norm": 3.078125,
"kl": 0.0003605757010518573,
"learning_rate": 9.75378027565904e-07,
"loss": 0.0,
"reward": 0.30135733261704445,
"reward_std": 0.4861781559884548,
"rewards/reward_func": 0.30135733261704445,
"step": 184
},
{
"completion_length": 176.8984375,
"epoch": 0.025692492974708953,
"grad_norm": 2.671875,
"kl": 0.0003968792916566599,
"learning_rate": 9.743075070252909e-07,
"loss": 0.0,
"reward": 0.18422270519658923,
"reward_std": 0.5276681184768677,
"rewards/reward_func": 0.18422270519658923,
"step": 192
},
{
"completion_length": 196.234375,
"epoch": 0.026763013515321826,
"grad_norm": 3.8125,
"kl": 0.0003164229347021319,
"learning_rate": 9.732369864846782e-07,
"loss": 0.0,
"reward": 0.09587159566581249,
"reward_std": 0.5885980241000652,
"rewards/reward_func": 0.09587159566581249,
"step": 200
},
{
"completion_length": 187.15625,
"epoch": 0.027833534055934698,
"grad_norm": 5.65625,
"kl": 0.00039002683661237825,
"learning_rate": 9.721664659440652e-07,
"loss": 0.0,
"reward": 0.19378361385315657,
"reward_std": 0.439416766166687,
"rewards/reward_func": 0.19378361385315657,
"step": 208
},
{
"completion_length": 186.359375,
"epoch": 0.02890405459654757,
"grad_norm": 4.15625,
"kl": 0.0004315389560360927,
"learning_rate": 9.710959454034525e-07,
"loss": 0.0,
"reward": 0.2533123311586678,
"reward_std": 0.6090654768049717,
"rewards/reward_func": 0.2533123311586678,
"step": 216
},
{
"completion_length": 206.7265625,
"epoch": 0.029974575137160443,
"grad_norm": 3.8125,
"kl": 0.00036658769749919884,
"learning_rate": 9.700254248628396e-07,
"loss": 0.0,
"reward": 0.11443387717008591,
"reward_std": 0.6023523053154349,
"rewards/reward_func": 0.11443387717008591,
"step": 224
},
{
"completion_length": 189.0703125,
"epoch": 0.03104509567777332,
"grad_norm": 4.78125,
"kl": 0.00041832886927295476,
"learning_rate": 9.689549043222266e-07,
"loss": 0.0,
"reward": -0.07620369084179401,
"reward_std": 0.6309537254273891,
"rewards/reward_func": -0.07620369084179401,
"step": 232
},
{
"completion_length": 193.34375,
"epoch": 0.03211561621838619,
"grad_norm": 5.03125,
"kl": 0.00045970915380166844,
"learning_rate": 9.678843837816137e-07,
"loss": 0.0,
"reward": 0.13946556020528078,
"reward_std": 0.5310352686792612,
"rewards/reward_func": 0.13946556020528078,
"step": 240
},
{
"completion_length": 198.046875,
"epoch": 0.03318613675899906,
"grad_norm": 4.34375,
"kl": 0.00045460829278454185,
"learning_rate": 9.66813863241001e-07,
"loss": 0.0,
"reward": 0.20321442000567913,
"reward_std": 0.7352660372853279,
"rewards/reward_func": 0.20321442000567913,
"step": 248
},
{
"completion_length": 197.2265625,
"epoch": 0.034256657299611935,
"grad_norm": 3.21875,
"kl": 0.0004281356050341856,
"learning_rate": 9.65743342700388e-07,
"loss": 0.0,
"reward": 0.20208348147571087,
"reward_std": 0.5523553621023893,
"rewards/reward_func": 0.20208348147571087,
"step": 256
},
{
"completion_length": 199.7109375,
"epoch": 0.03532717784022481,
"grad_norm": 5.15625,
"kl": 0.000508931974763982,
"learning_rate": 9.646728221597751e-07,
"loss": 0.0,
"reward": 0.15087968483567238,
"reward_std": 0.6751584373414516,
"rewards/reward_func": 0.15087968483567238,
"step": 264
},
{
"completion_length": 181.7578125,
"epoch": 0.03639769838083768,
"grad_norm": 3.109375,
"kl": 0.0004673556577472482,
"learning_rate": 9.636023016191622e-07,
"loss": 0.0,
"reward": 0.4762549586594105,
"reward_std": 0.5292778257280588,
"rewards/reward_func": 0.4762549586594105,
"step": 272
},
{
"completion_length": 172.3046875,
"epoch": 0.03746821892145055,
"grad_norm": 3.515625,
"kl": 0.00042566236152197234,
"learning_rate": 9.625317810785495e-07,
"loss": 0.0,
"reward": 0.36441371217370033,
"reward_std": 0.4376997593790293,
"rewards/reward_func": 0.36441371217370033,
"step": 280
},
{
"completion_length": 174.234375,
"epoch": 0.03853873946206343,
"grad_norm": 4.875,
"kl": 0.0004668605834012851,
"learning_rate": 9.614612605379365e-07,
"loss": 0.0,
"reward": 0.17927304655313492,
"reward_std": 0.5315880142152309,
"rewards/reward_func": 0.17927304655313492,
"step": 288
},
{
"completion_length": 152.8671875,
"epoch": 0.039609260002676304,
"grad_norm": 3.078125,
"kl": 0.0005132910300744697,
"learning_rate": 9.603907399973236e-07,
"loss": 0.0,
"reward": 0.5605849623680115,
"reward_std": 0.5153817608952522,
"rewards/reward_func": 0.5605849623680115,
"step": 296
},
{
"completion_length": 203.5390625,
"epoch": 0.040679780543289176,
"grad_norm": 3.796875,
"kl": 0.00048365409747930244,
"learning_rate": 9.593202194567109e-07,
"loss": 0.0,
"reward": -0.10374991549178958,
"reward_std": 0.5484482925385237,
"rewards/reward_func": -0.10374991549178958,
"step": 304
},
{
"completion_length": 203.1015625,
"epoch": 0.04175030108390205,
"grad_norm": 3.28125,
"kl": 0.0005178198443900328,
"learning_rate": 9.58249698916098e-07,
"loss": 0.0,
"reward": 0.15655188029631972,
"reward_std": 0.6044143028557301,
"rewards/reward_func": 0.15655188029631972,
"step": 312
},
{
"completion_length": 193.34375,
"epoch": 0.04282082162451492,
"grad_norm": 3.34375,
"kl": 0.0005464391106215771,
"learning_rate": 9.57179178375485e-07,
"loss": 0.0,
"reward": 0.20653630187734962,
"reward_std": 0.637122736312449,
"rewards/reward_func": 0.20653630187734962,
"step": 320
},
{
"completion_length": 182.390625,
"epoch": 0.04389134216512779,
"grad_norm": 3.953125,
"kl": 0.0005447999064926989,
"learning_rate": 9.56108657834872e-07,
"loss": 0.0,
"reward": 0.0629437193274498,
"reward_std": 0.6482261158525944,
"rewards/reward_func": 0.0629437193274498,
"step": 328
},
{
"completion_length": 210.1875,
"epoch": 0.044961862705740666,
"grad_norm": 3.796875,
"kl": 0.0005637894355459139,
"learning_rate": 9.550381372942594e-07,
"loss": 0.0,
"reward": 0.005680203437805176,
"reward_std": 0.5875861989334226,
"rewards/reward_func": 0.005680203437805176,
"step": 336
},
{
"completion_length": 179.8671875,
"epoch": 0.04603238324635354,
"grad_norm": 3.625,
"kl": 0.00048511310160392895,
"learning_rate": 9.539676167536464e-07,
"loss": 0.0,
"reward": 0.41209501400589943,
"reward_std": 0.49328203592449427,
"rewards/reward_func": 0.41209501400589943,
"step": 344
},
{
"completion_length": 223.6875,
"epoch": 0.04710290378696641,
"grad_norm": 3.765625,
"kl": 0.0005501342748175375,
"learning_rate": 9.528970962130335e-07,
"loss": 0.0,
"reward": -0.01573021337389946,
"reward_std": 0.6180381271988153,
"rewards/reward_func": -0.01573021337389946,
"step": 352
},
{
"completion_length": 179.4609375,
"epoch": 0.04817342432757928,
"grad_norm": 4.1875,
"kl": 0.0005429566881502979,
"learning_rate": 9.518265756724207e-07,
"loss": 0.0,
"reward": 0.3368812333792448,
"reward_std": 0.42025264725089073,
"rewards/reward_func": 0.3368812333792448,
"step": 360
},
{
"completion_length": 217.0234375,
"epoch": 0.04924394486819216,
"grad_norm": 3.640625,
"kl": 0.0005859209413756616,
"learning_rate": 9.507560551318078e-07,
"loss": 0.0,
"reward": 0.07761704362928867,
"reward_std": 0.539461400359869,
"rewards/reward_func": 0.07761704362928867,
"step": 368
},
{
"completion_length": 184.6484375,
"epoch": 0.050314465408805034,
"grad_norm": 3.46875,
"kl": 0.0006783130666008219,
"learning_rate": 9.496855345911949e-07,
"loss": 0.0,
"reward": 0.3878909517079592,
"reward_std": 0.5879320036619902,
"rewards/reward_func": 0.3878909517079592,
"step": 376
},
{
"completion_length": 237.3125,
"epoch": 0.051384985949417906,
"grad_norm": 2.828125,
"kl": 0.0005360892773751402,
"learning_rate": 9.486150140505821e-07,
"loss": 0.0,
"reward": 0.2408028580248356,
"reward_std": 0.6049665845930576,
"rewards/reward_func": 0.2408028580248356,
"step": 384
},
{
"completion_length": 181.765625,
"epoch": 0.05245550649003078,
"grad_norm": 4.1875,
"kl": 0.0007057523835101165,
"learning_rate": 9.475444935099693e-07,
"loss": 0.0,
"reward": 0.1389563176780939,
"reward_std": 0.5876767132431269,
"rewards/reward_func": 0.1389563176780939,
"step": 392
},
{
"completion_length": 167.578125,
"epoch": 0.05352602703064365,
"grad_norm": 3.859375,
"kl": 0.0006887019626447,
"learning_rate": 9.464739729693562e-07,
"loss": 0.0,
"reward": 0.6259515974670649,
"reward_std": 0.3476364128291607,
"rewards/reward_func": 0.6259515974670649,
"step": 400
},
{
"completion_length": 197.25,
"epoch": 0.05459654757125652,
"grad_norm": 3.03125,
"kl": 0.0006218861890374683,
"learning_rate": 9.454034524287434e-07,
"loss": 0.0,
"reward": 0.05700792092829943,
"reward_std": 0.428726595826447,
"rewards/reward_func": 0.05700792092829943,
"step": 408
},
{
"completion_length": 185.859375,
"epoch": 0.055667068111869396,
"grad_norm": 3.390625,
"kl": 0.0005967724355286919,
"learning_rate": 9.443329318881306e-07,
"loss": 0.0,
"reward": 0.2252086065709591,
"reward_std": 0.5162075459957123,
"rewards/reward_func": 0.2252086065709591,
"step": 416
},
{
"completion_length": 162.9296875,
"epoch": 0.05673758865248227,
"grad_norm": 4.28125,
"kl": 0.0008201822929549962,
"learning_rate": 9.432624113475178e-07,
"loss": 0.0,
"reward": 0.322255807928741,
"reward_std": 0.6453814581036568,
"rewards/reward_func": 0.322255807928741,
"step": 424
},
{
"completion_length": 211.1015625,
"epoch": 0.05780810919309514,
"grad_norm": 4.0,
"kl": 0.0006929989831405692,
"learning_rate": 9.421918908069048e-07,
"loss": 0.0,
"reward": 0.015094950795173645,
"reward_std": 0.5742807984352112,
"rewards/reward_func": 0.015094950795173645,
"step": 432
},
{
"completion_length": 190.5625,
"epoch": 0.05887862973370801,
"grad_norm": 2.421875,
"kl": 0.0007785350171616301,
"learning_rate": 9.411213702662919e-07,
"loss": 0.0,
"reward": 0.2807863000780344,
"reward_std": 0.38556696847081184,
"rewards/reward_func": 0.2807863000780344,
"step": 440
},
{
"completion_length": 205.90625,
"epoch": 0.059949150274320885,
"grad_norm": 3.34375,
"kl": 0.000683286452840548,
"learning_rate": 9.400508497256791e-07,
"loss": 0.0,
"reward": 0.17150266654789448,
"reward_std": 0.6842659376561642,
"rewards/reward_func": 0.17150266654789448,
"step": 448
},
{
"completion_length": 170.46875,
"epoch": 0.061019670814933764,
"grad_norm": 4.84375,
"kl": 0.0007365654601017013,
"learning_rate": 9.389803291850661e-07,
"loss": 0.0,
"reward": 0.28804031014442444,
"reward_std": 0.5351038463413715,
"rewards/reward_func": 0.28804031014442444,
"step": 456
},
{
"completion_length": 159.1875,
"epoch": 0.06209019135554664,
"grad_norm": 5.8125,
"kl": 0.0008180349177564494,
"learning_rate": 9.379098086444533e-07,
"loss": 0.0,
"reward": 0.3410092554986477,
"reward_std": 0.652816615998745,
"rewards/reward_func": 0.3410092554986477,
"step": 464
},
{
"completion_length": 164.8125,
"epoch": 0.0631607118961595,
"grad_norm": 5.21875,
"kl": 0.0009274725816794671,
"learning_rate": 9.368392881038405e-07,
"loss": 0.0,
"reward": 0.42266903538256884,
"reward_std": 0.5641947891563177,
"rewards/reward_func": 0.42266903538256884,
"step": 472
},
{
"completion_length": 194.984375,
"epoch": 0.06423123243677238,
"grad_norm": 3.078125,
"kl": 0.0007724944334768225,
"learning_rate": 9.357687675632276e-07,
"loss": 0.0,
"reward": 0.14429602678865194,
"reward_std": 0.6491441205143929,
"rewards/reward_func": 0.14429602678865194,
"step": 480
},
{
"completion_length": 162.75,
"epoch": 0.06530175297738525,
"grad_norm": 3.0625,
"kl": 0.0008644247645861469,
"learning_rate": 9.346982470226146e-07,
"loss": 0.0,
"reward": 0.4321166332811117,
"reward_std": 0.595779299736023,
"rewards/reward_func": 0.4321166332811117,
"step": 488
},
{
"completion_length": 206.21875,
"epoch": 0.06637227351799813,
"grad_norm": 5.3125,
"kl": 0.0008183796162484214,
"learning_rate": 9.336277264820018e-07,
"loss": 0.0,
"reward": 0.15042403992265463,
"reward_std": 0.6641352027654648,
"rewards/reward_func": 0.15042403992265463,
"step": 496
},
{
"completion_length": 169.53125,
"epoch": 0.067442794058611,
"grad_norm": 3.359375,
"kl": 0.0007982932875165716,
"learning_rate": 9.32557205941389e-07,
"loss": 0.0,
"reward": 0.5449392115697265,
"reward_std": 0.4078510096296668,
"rewards/reward_func": 0.5449392115697265,
"step": 504
},
{
"completion_length": 217.1328125,
"epoch": 0.06851331459922387,
"grad_norm": 4.46875,
"kl": 0.0007571115165774245,
"learning_rate": 9.314866854007762e-07,
"loss": 0.0,
"reward": -0.14380409568548203,
"reward_std": 0.42090473882853985,
"rewards/reward_func": -0.14380409568548203,
"step": 512
},
{
"completion_length": 188.7578125,
"epoch": 0.06958383513983675,
"grad_norm": 3.96875,
"kl": 0.0009429744168301113,
"learning_rate": 9.304161648601631e-07,
"loss": 0.0,
"reward": 0.2523565851151943,
"reward_std": 0.5918965879827738,
"rewards/reward_func": 0.2523565851151943,
"step": 520
},
{
"completion_length": 166.8671875,
"epoch": 0.07065435568044962,
"grad_norm": 3.796875,
"kl": 0.0008449588422081433,
"learning_rate": 9.293456443195503e-07,
"loss": 0.0,
"reward": 0.23421072773635387,
"reward_std": 0.5638821180909872,
"rewards/reward_func": 0.23421072773635387,
"step": 528
},
{
"completion_length": 162.359375,
"epoch": 0.0717248762210625,
"grad_norm": 4.03125,
"kl": 0.0010499468044145033,
"learning_rate": 9.282751237789375e-07,
"loss": 0.0,
"reward": 0.14197909273207188,
"reward_std": 0.5628513153642416,
"rewards/reward_func": 0.14197909273207188,
"step": 536
},
{
"completion_length": 149.3203125,
"epoch": 0.07279539676167536,
"grad_norm": 4.3125,
"kl": 0.0009130838298005983,
"learning_rate": 9.272046032383246e-07,
"loss": 0.0,
"reward": 0.42748846486210823,
"reward_std": 0.4888880178332329,
"rewards/reward_func": 0.42748846486210823,
"step": 544
},
{
"completion_length": 168.25,
"epoch": 0.07386591730228824,
"grad_norm": 3.09375,
"kl": 0.00098653764143819,
"learning_rate": 9.261340826977117e-07,
"loss": 0.0,
"reward": 0.32451459113508463,
"reward_std": 0.5782719142735004,
"rewards/reward_func": 0.32451459113508463,
"step": 552
},
{
"completion_length": 186.59375,
"epoch": 0.0749364378429011,
"grad_norm": 4.34375,
"kl": 0.0009752021069289185,
"learning_rate": 9.250635621570988e-07,
"loss": 0.0,
"reward": 0.20521394163370132,
"reward_std": 0.5094065079465508,
"rewards/reward_func": 0.20521394163370132,
"step": 560
},
{
"completion_length": 175.78125,
"epoch": 0.07600695838351398,
"grad_norm": 4.25,
"kl": 0.0010090179930557497,
"learning_rate": 9.23993041616486e-07,
"loss": 0.0,
"reward": 0.5578707046806812,
"reward_std": 0.4845643825829029,
"rewards/reward_func": 0.5578707046806812,
"step": 568
},
{
"completion_length": 158.234375,
"epoch": 0.07707747892412686,
"grad_norm": 4.6875,
"kl": 0.0011502801644382998,
"learning_rate": 9.229225210758731e-07,
"loss": 0.0,
"reward": 0.4859929271042347,
"reward_std": 0.5507038980722427,
"rewards/reward_func": 0.4859929271042347,
"step": 576
},
{
"completion_length": 200.0234375,
"epoch": 0.07814799946473973,
"grad_norm": 3.9375,
"kl": 0.0011561861392692663,
"learning_rate": 9.218520005352602e-07,
"loss": 0.0,
"reward": 0.16512918565422297,
"reward_std": 0.5027751969173551,
"rewards/reward_func": 0.16512918565422297,
"step": 584
},
{
"completion_length": 158.3515625,
"epoch": 0.07921852000535261,
"grad_norm": 3.390625,
"kl": 0.0012836234309361316,
"learning_rate": 9.207814799946474e-07,
"loss": 0.0001,
"reward": 0.4137600362300873,
"reward_std": 0.5193404145538807,
"rewards/reward_func": 0.4137600362300873,
"step": 592
},
{
"completion_length": 162.8828125,
"epoch": 0.08028904054596547,
"grad_norm": 2.125,
"kl": 0.0011869178197230212,
"learning_rate": 9.197109594540344e-07,
"loss": 0.0,
"reward": 0.39771614968776703,
"reward_std": 0.6107706986367702,
"rewards/reward_func": 0.39771614968776703,
"step": 600
},
{
"completion_length": 176.65625,
"epoch": 0.08135956108657835,
"grad_norm": 4.78125,
"kl": 0.0011795180544140749,
"learning_rate": 9.186404389134216e-07,
"loss": 0.0,
"reward": 0.0783949107863009,
"reward_std": 0.6460004411637783,
"rewards/reward_func": 0.0783949107863009,
"step": 608
},
{
"completion_length": 158.5390625,
"epoch": 0.08243008162719122,
"grad_norm": 3.59375,
"kl": 0.0013605851854663342,
"learning_rate": 9.175699183728087e-07,
"loss": 0.0001,
"reward": 0.3015612084418535,
"reward_std": 0.46242015063762665,
"rewards/reward_func": 0.3015612084418535,
"step": 616
},
{
"completion_length": 192.0,
"epoch": 0.0835006021678041,
"grad_norm": 6.0625,
"kl": 0.001107029449485708,
"learning_rate": 9.164993978321959e-07,
"loss": 0.0,
"reward": -0.052582718431949615,
"reward_std": 0.521589694544673,
"rewards/reward_func": -0.052582718431949615,
"step": 624
},
{
"completion_length": 167.3828125,
"epoch": 0.08457112270841696,
"grad_norm": 3.1875,
"kl": 0.0013894213043386117,
"learning_rate": 9.15428877291583e-07,
"loss": 0.0001,
"reward": 0.2013978809118271,
"reward_std": 0.6684001944959164,
"rewards/reward_func": 0.2013978809118271,
"step": 632
},
{
"completion_length": 186.6640625,
"epoch": 0.08564164324902984,
"grad_norm": 3.921875,
"kl": 0.0010865220101550221,
"learning_rate": 9.143583567509702e-07,
"loss": 0.0,
"reward": 0.6091820821166039,
"reward_std": 0.49955446273088455,
"rewards/reward_func": 0.6091820821166039,
"step": 640
},
{
"completion_length": 202.4609375,
"epoch": 0.08671216378964271,
"grad_norm": 3.6875,
"kl": 0.0012401975327520631,
"learning_rate": 9.132878362103572e-07,
"loss": 0.0,
"reward": 0.14060556702315807,
"reward_std": 0.6868433952331543,
"rewards/reward_func": 0.14060556702315807,
"step": 648
},
{
"completion_length": 157.953125,
"epoch": 0.08778268433025559,
"grad_norm": 4.21875,
"kl": 0.0014552801876561716,
"learning_rate": 9.122173156697443e-07,
"loss": 0.0001,
"reward": 0.27967471070587635,
"reward_std": 0.5355266528204083,
"rewards/reward_func": 0.27967471070587635,
"step": 656
},
{
"completion_length": 188.9609375,
"epoch": 0.08885320487086847,
"grad_norm": 2.8125,
"kl": 0.0012782855046680197,
"learning_rate": 9.111467951291315e-07,
"loss": 0.0001,
"reward": 0.2866704575717449,
"reward_std": 0.46457840129733086,
"rewards/reward_func": 0.2866704575717449,
"step": 664
},
{
"completion_length": 202.0078125,
"epoch": 0.08992372541148133,
"grad_norm": 3.625,
"kl": 0.0010314229875802994,
"learning_rate": 9.100762745885187e-07,
"loss": 0.0,
"reward": 0.21837860718369484,
"reward_std": 0.5863924492150545,
"rewards/reward_func": 0.21837860718369484,
"step": 672
},
{
"completion_length": 181.328125,
"epoch": 0.09099424595209421,
"grad_norm": 4.25,
"kl": 0.0011778115513152443,
"learning_rate": 9.090057540479058e-07,
"loss": 0.0,
"reward": 0.17519081057980657,
"reward_std": 0.5138188861310482,
"rewards/reward_func": 0.17519081057980657,
"step": 680
},
{
"completion_length": 196.46875,
"epoch": 0.09206476649270708,
"grad_norm": 4.84375,
"kl": 0.0013976221380289644,
"learning_rate": 9.079352335072928e-07,
"loss": 0.0001,
"reward": 0.07826100569218397,
"reward_std": 0.6565159633755684,
"rewards/reward_func": 0.07826100569218397,
"step": 688
},
{
"completion_length": 142.6171875,
"epoch": 0.09313528703331996,
"grad_norm": 4.3125,
"kl": 0.0014903126284480095,
"learning_rate": 9.0686471296668e-07,
"loss": 0.0001,
"reward": 0.4409363344311714,
"reward_std": 0.6269242819398642,
"rewards/reward_func": 0.4409363344311714,
"step": 696
},
{
"completion_length": 164.75,
"epoch": 0.09420580757393282,
"grad_norm": 2.859375,
"kl": 0.001257821699255146,
"learning_rate": 9.057941924260672e-07,
"loss": 0.0001,
"reward": 0.4695241190493107,
"reward_std": 0.4753529988229275,
"rewards/reward_func": 0.4695241190493107,
"step": 704
},
{
"completion_length": 175.765625,
"epoch": 0.0952763281145457,
"grad_norm": 3.5625,
"kl": 0.0015129576058825478,
"learning_rate": 9.047236718854542e-07,
"loss": 0.0001,
"reward": 0.02202584408223629,
"reward_std": 0.6471435278654099,
"rewards/reward_func": 0.02202584408223629,
"step": 712
},
{
"completion_length": 178.46875,
"epoch": 0.09634684865515857,
"grad_norm": 2.671875,
"kl": 0.0014852698805043474,
"learning_rate": 9.036531513448414e-07,
"loss": 0.0001,
"reward": 0.1112822787836194,
"reward_std": 0.6299657188355923,
"rewards/reward_func": 0.1112822787836194,
"step": 720
},
{
"completion_length": 179.171875,
"epoch": 0.09741736919577144,
"grad_norm": 3.65625,
"kl": 0.001357251821900718,
"learning_rate": 9.025826308042285e-07,
"loss": 0.0001,
"reward": 0.03050302341580391,
"reward_std": 0.4494458809494972,
"rewards/reward_func": 0.03050302341580391,
"step": 728
},
{
"completion_length": 179.171875,
"epoch": 0.09848788973638432,
"grad_norm": 3.21875,
"kl": 0.0012782294361386448,
"learning_rate": 9.015121102636157e-07,
"loss": 0.0001,
"reward": 0.07521175127476454,
"reward_std": 0.5754083581268787,
"rewards/reward_func": 0.07521175127476454,
"step": 736
},
{
"completion_length": 170.8046875,
"epoch": 0.09955841027699719,
"grad_norm": 3.640625,
"kl": 0.0014729191461810842,
"learning_rate": 9.004415897230027e-07,
"loss": 0.0001,
"reward": 0.19647281896322966,
"reward_std": 0.5569281429052353,
"rewards/reward_func": 0.19647281896322966,
"step": 744
},
{
"completion_length": 170.9609375,
"epoch": 0.10062893081761007,
"grad_norm": 3.390625,
"kl": 0.0015401854761876166,
"learning_rate": 8.993710691823899e-07,
"loss": 0.0001,
"reward": 0.3724030330777168,
"reward_std": 0.3632662743330002,
"rewards/reward_func": 0.3724030330777168,
"step": 752
},
{
"completion_length": 221.6171875,
"epoch": 0.10169945135822293,
"grad_norm": 4.15625,
"kl": 0.0013893990762881003,
"learning_rate": 8.983005486417771e-07,
"loss": 0.0001,
"reward": -0.09233509004116058,
"reward_std": 0.48617786914110184,
"rewards/reward_func": -0.09233509004116058,
"step": 760
},
{
"completion_length": 202.3828125,
"epoch": 0.10276997189883581,
"grad_norm": 4.375,
"kl": 0.0012778284071828239,
"learning_rate": 8.972300281011642e-07,
"loss": 0.0001,
"reward": 0.2091209925711155,
"reward_std": 0.6527585946023464,
"rewards/reward_func": 0.2091209925711155,
"step": 768
},
{
"completion_length": 193.1640625,
"epoch": 0.10384049243944868,
"grad_norm": 3.109375,
"kl": 0.0013181737012928352,
"learning_rate": 8.961595075605512e-07,
"loss": 0.0001,
"reward": 0.3330334695056081,
"reward_std": 0.541321462020278,
"rewards/reward_func": 0.3330334695056081,
"step": 776
},
{
"completion_length": 152.453125,
"epoch": 0.10491101298006156,
"grad_norm": 3.359375,
"kl": 0.0017155654932139441,
"learning_rate": 8.950889870199384e-07,
"loss": 0.0001,
"reward": 0.5970601001754403,
"reward_std": 0.4666150966659188,
"rewards/reward_func": 0.5970601001754403,
"step": 784
},
{
"completion_length": 169.125,
"epoch": 0.10598153352067442,
"grad_norm": 3.578125,
"kl": 0.0017139802366727963,
"learning_rate": 8.940184664793256e-07,
"loss": 0.0001,
"reward": 0.3862100951373577,
"reward_std": 0.4441477656364441,
"rewards/reward_func": 0.3862100951373577,
"step": 792
},
{
"completion_length": 155.3828125,
"epoch": 0.1070520540612873,
"grad_norm": 3.15625,
"kl": 0.0015526109855272807,
"learning_rate": 8.929479459387127e-07,
"loss": 0.0001,
"reward": 0.2984956353902817,
"reward_std": 0.586381059139967,
"rewards/reward_func": 0.2984956353902817,
"step": 800
},
{
"completion_length": 196.40625,
"epoch": 0.10812257460190017,
"grad_norm": 4.90625,
"kl": 0.0014231447203201242,
"learning_rate": 8.918774253980997e-07,
"loss": 0.0001,
"reward": 0.21418001921847463,
"reward_std": 0.6311414241790771,
"rewards/reward_func": 0.21418001921847463,
"step": 808
},
{
"completion_length": 138.2109375,
"epoch": 0.10919309514251305,
"grad_norm": 4.03125,
"kl": 0.0017638935969443992,
"learning_rate": 8.908069048574869e-07,
"loss": 0.0001,
"reward": 0.5397277176380157,
"reward_std": 0.5305888652801514,
"rewards/reward_func": 0.5397277176380157,
"step": 816
},
{
"completion_length": 178.0703125,
"epoch": 0.11026361568312593,
"grad_norm": 3.84375,
"kl": 0.0016187937144422904,
"learning_rate": 8.897363843168741e-07,
"loss": 0.0001,
"reward": 0.3325108243152499,
"reward_std": 0.5717135239392519,
"rewards/reward_func": 0.3325108243152499,
"step": 824
},
{
"completion_length": 148.953125,
"epoch": 0.11133413622373879,
"grad_norm": 5.0,
"kl": 0.0018482063169358298,
"learning_rate": 8.886658637762612e-07,
"loss": 0.0001,
"reward": 0.2947835847735405,
"reward_std": 0.4330580784007907,
"rewards/reward_func": 0.2947835847735405,
"step": 832
},
{
"completion_length": 164.9375,
"epoch": 0.11240465676435167,
"grad_norm": 4.09375,
"kl": 0.0016665154980728403,
"learning_rate": 8.875953432356483e-07,
"loss": 0.0001,
"reward": 0.4421437568962574,
"reward_std": 0.6194501928985119,
"rewards/reward_func": 0.4421437568962574,
"step": 840
},
{
"completion_length": 151.4140625,
"epoch": 0.11347517730496454,
"grad_norm": 3.0,
"kl": 0.0017541930938023143,
"learning_rate": 8.865248226950354e-07,
"loss": 0.0001,
"reward": 0.634210865944624,
"reward_std": 0.43934057652950287,
"rewards/reward_func": 0.634210865944624,
"step": 848
},
{
"completion_length": 163.0390625,
"epoch": 0.11454569784557742,
"grad_norm": 4.1875,
"kl": 0.001597623537236359,
"learning_rate": 8.854543021544225e-07,
"loss": 0.0001,
"reward": 0.49650172144174576,
"reward_std": 0.5100172646343708,
"rewards/reward_func": 0.49650172144174576,
"step": 856
},
{
"completion_length": 175.515625,
"epoch": 0.11561621838619028,
"grad_norm": 4.28125,
"kl": 0.0016097126208478585,
"learning_rate": 8.843837816138097e-07,
"loss": 0.0001,
"reward": 0.300532303750515,
"reward_std": 0.5050319191068411,
"rewards/reward_func": 0.300532303750515,
"step": 864
},
{
"completion_length": 166.609375,
"epoch": 0.11668673892680316,
"grad_norm": 3.375,
"kl": 0.001671285106567666,
"learning_rate": 8.833132610731968e-07,
"loss": 0.0001,
"reward": 0.3196424636989832,
"reward_std": 0.6211317032575607,
"rewards/reward_func": 0.3196424636989832,
"step": 872
},
{
"completion_length": 199.15625,
"epoch": 0.11775725946741603,
"grad_norm": 4.46875,
"kl": 0.0015437143010785803,
"learning_rate": 8.82242740532584e-07,
"loss": 0.0001,
"reward": -0.04952175496146083,
"reward_std": 0.6928350441157818,
"rewards/reward_func": -0.04952175496146083,
"step": 880
},
{
"completion_length": 195.078125,
"epoch": 0.1188277800080289,
"grad_norm": 3.625,
"kl": 0.0014302593117463402,
"learning_rate": 8.811722199919711e-07,
"loss": 0.0001,
"reward": 0.2765323193743825,
"reward_std": 0.5081478040665388,
"rewards/reward_func": 0.2765323193743825,
"step": 888
},
{
"completion_length": 162.1640625,
"epoch": 0.11989830054864177,
"grad_norm": 4.3125,
"kl": 0.002093081347993575,
"learning_rate": 8.801016994513581e-07,
"loss": 0.0001,
"reward": 0.43932087533175945,
"reward_std": 0.6151396594941616,
"rewards/reward_func": 0.43932087533175945,
"step": 896
},
{
"completion_length": 188.265625,
"epoch": 0.12096882108925465,
"grad_norm": 4.21875,
"kl": 0.0015162140916800126,
"learning_rate": 8.790311789107453e-07,
"loss": 0.0001,
"reward": 0.30038960836827755,
"reward_std": 0.6118085775524378,
"rewards/reward_func": 0.30038960836827755,
"step": 904
},
{
"completion_length": 185.625,
"epoch": 0.12203934162986753,
"grad_norm": 4.21875,
"kl": 0.0020139318803558126,
"learning_rate": 8.779606583701324e-07,
"loss": 0.0001,
"reward": 0.09830181300640106,
"reward_std": 0.4306083731353283,
"rewards/reward_func": 0.09830181300640106,
"step": 912
},
{
"completion_length": 184.1875,
"epoch": 0.1231098621704804,
"grad_norm": 3.15625,
"kl": 0.0016747360059525818,
"learning_rate": 8.768901378295196e-07,
"loss": 0.0001,
"reward": 0.14476243034005165,
"reward_std": 0.5790487378835678,
"rewards/reward_func": 0.14476243034005165,
"step": 920
},
{
"completion_length": 159.734375,
"epoch": 0.12418038271109327,
"grad_norm": 3.71875,
"kl": 0.0016227394880843349,
"learning_rate": 8.758196172889067e-07,
"loss": 0.0001,
"reward": 0.43369535729289055,
"reward_std": 0.6066659651696682,
"rewards/reward_func": 0.43369535729289055,
"step": 928
},
{
"completion_length": 197.9375,
"epoch": 0.12525090325170615,
"grad_norm": 4.5,
"kl": 0.0014731917763128877,
"learning_rate": 8.747490967482938e-07,
"loss": 0.0001,
"reward": 0.2775337900966406,
"reward_std": 0.4666815670207143,
"rewards/reward_func": 0.2775337900966406,
"step": 936
},
{
"completion_length": 190.015625,
"epoch": 0.126321423792319,
"grad_norm": 4.21875,
"kl": 0.0017878647340694442,
"learning_rate": 8.736785762076809e-07,
"loss": 0.0001,
"reward": 0.12123087048530579,
"reward_std": 0.5715042147785425,
"rewards/reward_func": 0.12123087048530579,
"step": 944
},
{
"completion_length": 167.1171875,
"epoch": 0.12739194433293188,
"grad_norm": 5.25,
"kl": 0.0019433694251347333,
"learning_rate": 8.726080556670681e-07,
"loss": 0.0001,
"reward": 0.45328211411833763,
"reward_std": 0.5355701018124819,
"rewards/reward_func": 0.45328211411833763,
"step": 952
},
{
"completion_length": 161.625,
"epoch": 0.12846246487354476,
"grad_norm": 5.0,
"kl": 0.002197007488575764,
"learning_rate": 8.715375351264552e-07,
"loss": 0.0001,
"reward": 0.3187681008130312,
"reward_std": 0.552251516841352,
"rewards/reward_func": 0.3187681008130312,
"step": 960
},
{
"completion_length": 210.890625,
"epoch": 0.12953298541415764,
"grad_norm": 5.46875,
"kl": 0.0015157314774114639,
"learning_rate": 8.704670145858424e-07,
"loss": 0.0001,
"reward": 0.28820460522547364,
"reward_std": 0.6035197824239731,
"rewards/reward_func": 0.28820460522547364,
"step": 968
},
{
"completion_length": 166.625,
"epoch": 0.1306035059547705,
"grad_norm": 3.765625,
"kl": 0.0017258078005397692,
"learning_rate": 8.693964940452294e-07,
"loss": 0.0001,
"reward": 0.5467304401099682,
"reward_std": 0.5410985443741083,
"rewards/reward_func": 0.5467304401099682,
"step": 976
},
{
"completion_length": 207.4765625,
"epoch": 0.13167402649538337,
"grad_norm": 2.9375,
"kl": 0.0016618163790553808,
"learning_rate": 8.683259735046166e-07,
"loss": 0.0001,
"reward": 0.24847363959997892,
"reward_std": 0.5600069649517536,
"rewards/reward_func": 0.24847363959997892,
"step": 984
},
{
"completion_length": 185.703125,
"epoch": 0.13274454703599625,
"grad_norm": 3.515625,
"kl": 0.0019170493469573557,
"learning_rate": 8.672554529640037e-07,
"loss": 0.0001,
"reward": 0.15675952192395926,
"reward_std": 0.4530050400644541,
"rewards/reward_func": 0.15675952192395926,
"step": 992
},
{
"completion_length": 185.6484375,
"epoch": 0.13381506757660913,
"grad_norm": 4.71875,
"kl": 0.0018418136023683473,
"learning_rate": 8.661849324233908e-07,
"loss": 0.0001,
"reward": 0.3646358111873269,
"reward_std": 0.5811815112829208,
"rewards/reward_func": 0.3646358111873269,
"step": 1000
},
{
"completion_length": 203.6171875,
"epoch": 0.134885588117222,
"grad_norm": 3.5625,
"kl": 0.0020316866575740278,
"learning_rate": 8.65114411882778e-07,
"loss": 0.0001,
"reward": 0.15310932788997889,
"reward_std": 0.5124245472252369,
"rewards/reward_func": 0.15310932788997889,
"step": 1008
},
{
"completion_length": 146.1484375,
"epoch": 0.13595610865783486,
"grad_norm": 3.65625,
"kl": 0.002132126915967092,
"learning_rate": 8.640438913421651e-07,
"loss": 0.0001,
"reward": 0.5445789489895105,
"reward_std": 0.5516379848122597,
"rewards/reward_func": 0.5445789489895105,
"step": 1016
},
{
"completion_length": 197.9375,
"epoch": 0.13702662919844774,
"grad_norm": 4.0625,
"kl": 0.001812848830013536,
"learning_rate": 8.629733708015521e-07,
"loss": 0.0001,
"reward": 0.07804312836378813,
"reward_std": 0.5468557141721249,
"rewards/reward_func": 0.07804312836378813,
"step": 1024
},
{
"completion_length": 160.8515625,
"epoch": 0.13809714973906062,
"grad_norm": 3.75,
"kl": 0.0020080953399883583,
"learning_rate": 8.619028502609393e-07,
"loss": 0.0001,
"reward": 0.3609808227047324,
"reward_std": 0.4902635822072625,
"rewards/reward_func": 0.3609808227047324,
"step": 1032
},
{
"completion_length": 166.921875,
"epoch": 0.1391676702796735,
"grad_norm": 3.953125,
"kl": 0.0021710527944378555,
"learning_rate": 8.608323297203265e-07,
"loss": 0.0001,
"reward": 0.04567475710064173,
"reward_std": 0.639631874859333,
"rewards/reward_func": 0.04567475710064173,
"step": 1040
},
{
"completion_length": 137.265625,
"epoch": 0.14023819082028635,
"grad_norm": 4.46875,
"kl": 0.0022375187691068277,
"learning_rate": 8.597618091797137e-07,
"loss": 0.0001,
"reward": 0.6706136465072632,
"reward_std": 0.470423087477684,
"rewards/reward_func": 0.6706136465072632,
"step": 1048
},
{
"completion_length": 174.9609375,
"epoch": 0.14130871136089923,
"grad_norm": 4.8125,
"kl": 0.002100168538163416,
"learning_rate": 8.586912886391006e-07,
"loss": 0.0001,
"reward": 0.4244903214275837,
"reward_std": 0.4562762314453721,
"rewards/reward_func": 0.4244903214275837,
"step": 1056
},
{
"completion_length": 147.9296875,
"epoch": 0.1423792319015121,
"grad_norm": 3.40625,
"kl": 0.0022357639973051846,
"learning_rate": 8.576207680984878e-07,
"loss": 0.0001,
"reward": 0.5205270126461983,
"reward_std": 0.483647458255291,
"rewards/reward_func": 0.5205270126461983,
"step": 1064
},
{
"completion_length": 212.625,
"epoch": 0.143449752442125,
"grad_norm": 3.46875,
"kl": 0.0017438856302760541,
"learning_rate": 8.56550247557875e-07,
"loss": 0.0001,
"reward": 0.04320483095943928,
"reward_std": 0.6948880217969418,
"rewards/reward_func": 0.04320483095943928,
"step": 1072
},
{
"completion_length": 181.5390625,
"epoch": 0.14452027298273787,
"grad_norm": 4.375,
"kl": 0.0018758865917334333,
"learning_rate": 8.554797270172622e-07,
"loss": 0.0001,
"reward": 0.30002590641379356,
"reward_std": 0.5271559292450547,
"rewards/reward_func": 0.30002590641379356,
"step": 1080
},
{
"completion_length": 176.609375,
"epoch": 0.14559079352335072,
"grad_norm": 3.59375,
"kl": 0.0016780206933617592,
"learning_rate": 8.544092064766492e-07,
"loss": 0.0001,
"reward": 0.29282525181770325,
"reward_std": 0.48009985871613026,
"rewards/reward_func": 0.29282525181770325,
"step": 1088
},
{
"completion_length": 218.75,
"epoch": 0.1466613140639636,
"grad_norm": 3.1875,
"kl": 0.002029647948802449,
"learning_rate": 8.533386859360363e-07,
"loss": 0.0001,
"reward": -0.1532103894278407,
"reward_std": 0.4770786985754967,
"rewards/reward_func": -0.1532103894278407,
"step": 1096
},
{
"completion_length": 184.4296875,
"epoch": 0.14773183460457648,
"grad_norm": 3.203125,
"kl": 0.0017034321062965319,
"learning_rate": 8.522681653954235e-07,
"loss": 0.0001,
"reward": 0.44974952936172485,
"reward_std": 0.5446614529937506,
"rewards/reward_func": 0.44974952936172485,
"step": 1104
},
{
"completion_length": 196.8203125,
"epoch": 0.14880235514518936,
"grad_norm": 5.0,
"kl": 0.001964334660442546,
"learning_rate": 8.511976448548106e-07,
"loss": 0.0001,
"reward": 0.056189559400081635,
"reward_std": 0.468637160025537,
"rewards/reward_func": 0.056189559400081635,
"step": 1112
},
{
"completion_length": 169.4921875,
"epoch": 0.1498728756858022,
"grad_norm": 4.15625,
"kl": 0.002173921908251941,
"learning_rate": 8.501271243141977e-07,
"loss": 0.0001,
"reward": 0.23756458796560764,
"reward_std": 0.6810929477214813,
"rewards/reward_func": 0.23756458796560764,
"step": 1120
},
{
"completion_length": 155.75,
"epoch": 0.1509433962264151,
"grad_norm": 4.65625,
"kl": 0.00248186280077789,
"learning_rate": 8.490566037735849e-07,
"loss": 0.0001,
"reward": 0.28897845139726996,
"reward_std": 0.5369884418323636,
"rewards/reward_func": 0.28897845139726996,
"step": 1128
},
{
"completion_length": 186.2109375,
"epoch": 0.15201391676702797,
"grad_norm": 3.21875,
"kl": 0.0022306487226160243,
"learning_rate": 8.479860832329721e-07,
"loss": 0.0001,
"reward": 0.10482135927304626,
"reward_std": 0.6790419593453407,
"rewards/reward_func": 0.10482135927304626,
"step": 1136
},
{
"completion_length": 192.546875,
"epoch": 0.15308443730764085,
"grad_norm": 3.71875,
"kl": 0.0019263384310761467,
"learning_rate": 8.469155626923591e-07,
"loss": 0.0001,
"reward": 0.14666470140218735,
"reward_std": 0.5422503855079412,
"rewards/reward_func": 0.14666470140218735,
"step": 1144
},
{
"completion_length": 184.40625,
"epoch": 0.15415495784825373,
"grad_norm": 2.734375,
"kl": 0.002109996523358859,
"learning_rate": 8.458450421517462e-07,
"loss": 0.0001,
"reward": 0.4111117944121361,
"reward_std": 0.4935786770656705,
"rewards/reward_func": 0.4111117944121361,
"step": 1152
},
{
"completion_length": 175.953125,
"epoch": 0.15522547838886658,
"grad_norm": 2.875,
"kl": 0.0024219048937084153,
"learning_rate": 8.447745216111334e-07,
"loss": 0.0001,
"reward": 0.26550869084894657,
"reward_std": 0.4649670384824276,
"rewards/reward_func": 0.26550869084894657,
"step": 1160
},
{
"completion_length": 161.7578125,
"epoch": 0.15629599892947946,
"grad_norm": 4.34375,
"kl": 0.0023637667181901634,
"learning_rate": 8.437040010705205e-07,
"loss": 0.0001,
"reward": 0.4199391510337591,
"reward_std": 0.5549349021166563,
"rewards/reward_func": 0.4199391510337591,
"step": 1168
},
{
"completion_length": 169.859375,
"epoch": 0.15736651947009234,
"grad_norm": 3.765625,
"kl": 0.002736452064709738,
"learning_rate": 8.426334805299077e-07,
"loss": 0.0001,
"reward": 0.09132032562047243,
"reward_std": 0.5800180211663246,
"rewards/reward_func": 0.09132032562047243,
"step": 1176
},
{
"completion_length": 165.875,
"epoch": 0.15843704001070522,
"grad_norm": 2.9375,
"kl": 0.0026032868336187676,
"learning_rate": 8.415629599892947e-07,
"loss": 0.0001,
"reward": 0.39741448499262333,
"reward_std": 0.553530789911747,
"rewards/reward_func": 0.39741448499262333,
"step": 1184
},
{
"completion_length": 152.6796875,
"epoch": 0.15950756055131807,
"grad_norm": 4.1875,
"kl": 0.0026048235449707136,
"learning_rate": 8.404924394486819e-07,
"loss": 0.0001,
"reward": 0.526178702712059,
"reward_std": 0.4392085522413254,
"rewards/reward_func": 0.526178702712059,
"step": 1192
},
{
"completion_length": 180.640625,
"epoch": 0.16057808109193095,
"grad_norm": 4.46875,
"kl": 0.0021871782082598656,
"learning_rate": 8.39421918908069e-07,
"loss": 0.0001,
"reward": 0.18006896087899804,
"reward_std": 0.6525123100727797,
"rewards/reward_func": 0.18006896087899804,
"step": 1200
},
{
"completion_length": 153.7734375,
"epoch": 0.16164860163254383,
"grad_norm": 2.75,
"kl": 0.0022942414943827316,
"learning_rate": 8.383513983674562e-07,
"loss": 0.0001,
"reward": 0.37028289400041103,
"reward_std": 0.49791209399700165,
"rewards/reward_func": 0.37028289400041103,
"step": 1208
},
{
"completion_length": 172.2109375,
"epoch": 0.1627191221731567,
"grad_norm": 5.03125,
"kl": 0.002093525734380819,
"learning_rate": 8.372808778268433e-07,
"loss": 0.0001,
"reward": 0.13176708482205868,
"reward_std": 0.6455358900129795,
"rewards/reward_func": 0.13176708482205868,
"step": 1216
},
{
"completion_length": 169.609375,
"epoch": 0.16378964271376958,
"grad_norm": 4.75,
"kl": 0.0022360333387041464,
"learning_rate": 8.362103572862303e-07,
"loss": 0.0001,
"reward": 0.3072196710854769,
"reward_std": 0.5846256157383323,
"rewards/reward_func": 0.3072196710854769,
"step": 1224
},
{
"completion_length": 166.1484375,
"epoch": 0.16486016325438244,
"grad_norm": 3.890625,
"kl": 0.002241482841782272,
"learning_rate": 8.351398367456175e-07,
"loss": 0.0001,
"reward": 0.3591133989393711,
"reward_std": 0.4736274667084217,
"rewards/reward_func": 0.3591133989393711,
"step": 1232
},
{
"completion_length": 199.203125,
"epoch": 0.16593068379499532,
"grad_norm": 3.328125,
"kl": 0.00244250099058263,
"learning_rate": 8.340693162050047e-07,
"loss": 0.0001,
"reward": 0.006516195833683014,
"reward_std": 0.6199562083929777,
"rewards/reward_func": 0.006516195833683014,
"step": 1240
},
{
"completion_length": 203.1328125,
"epoch": 0.1670012043356082,
"grad_norm": 4.65625,
"kl": 0.0022358261194312945,
"learning_rate": 8.329987956643918e-07,
"loss": 0.0001,
"reward": 0.10224719159305096,
"reward_std": 0.6809590011835098,
"rewards/reward_func": 0.10224719159305096,
"step": 1248
},
{
"completion_length": 148.3125,
"epoch": 0.16807172487622107,
"grad_norm": 4.375,
"kl": 0.0025668047892395407,
"learning_rate": 8.319282751237789e-07,
"loss": 0.0001,
"reward": 0.49531039223074913,
"reward_std": 0.4778098724782467,
"rewards/reward_func": 0.49531039223074913,
"step": 1256
},
{
"completion_length": 153.2578125,
"epoch": 0.16914224541683393,
"grad_norm": 3.34375,
"kl": 0.0024423423456028104,
"learning_rate": 8.30857754583166e-07,
"loss": 0.0001,
"reward": 0.35373237170279026,
"reward_std": 0.4996814336627722,
"rewards/reward_func": 0.35373237170279026,
"step": 1264
},
{
"completion_length": 171.234375,
"epoch": 0.1702127659574468,
"grad_norm": 3.1875,
"kl": 0.0021556682913796976,
"learning_rate": 8.297872340425532e-07,
"loss": 0.0001,
"reward": 0.28696669451892376,
"reward_std": 0.5421474725008011,
"rewards/reward_func": 0.28696669451892376,
"step": 1272
},
{
"completion_length": 188.515625,
"epoch": 0.17128328649805968,
"grad_norm": 3.703125,
"kl": 0.0021073912794236094,
"learning_rate": 8.287167135019402e-07,
"loss": 0.0001,
"reward": 0.21668443083763123,
"reward_std": 0.419855872169137,
"rewards/reward_func": 0.21668443083763123,
"step": 1280
},
{
"completion_length": 163.34375,
"epoch": 0.17235380703867256,
"grad_norm": 4.125,
"kl": 0.002413511203485541,
"learning_rate": 8.276461929613274e-07,
"loss": 0.0001,
"reward": 0.3908206336200237,
"reward_std": 0.5146115329116583,
"rewards/reward_func": 0.3908206336200237,
"step": 1288
},
{
"completion_length": 195.21875,
"epoch": 0.17342432757928541,
"grad_norm": 2.953125,
"kl": 0.002010537078604102,
"learning_rate": 8.265756724207146e-07,
"loss": 0.0001,
"reward": 0.2040023533627391,
"reward_std": 0.5783168002963066,
"rewards/reward_func": 0.2040023533627391,
"step": 1296
},
{
"completion_length": 147.5546875,
"epoch": 0.1744948481198983,
"grad_norm": 3.515625,
"kl": 0.0029850091959815472,
"learning_rate": 8.255051518801016e-07,
"loss": 0.0001,
"reward": 0.4321533404290676,
"reward_std": 0.3191776555031538,
"rewards/reward_func": 0.4321533404290676,
"step": 1304
},
{
"completion_length": 174.9921875,
"epoch": 0.17556536866051117,
"grad_norm": 3.84375,
"kl": 0.0024711176374694332,
"learning_rate": 8.244346313394887e-07,
"loss": 0.0001,
"reward": 0.31474856473505497,
"reward_std": 0.5751422699540854,
"rewards/reward_func": 0.31474856473505497,
"step": 1312
},
{
"completion_length": 180.5703125,
"epoch": 0.17663588920112405,
"grad_norm": 3.515625,
"kl": 0.002760413888609037,
"learning_rate": 8.233641107988759e-07,
"loss": 0.0001,
"reward": 0.2904138704761863,
"reward_std": 0.3093845183029771,
"rewards/reward_func": 0.2904138704761863,
"step": 1320
},
{
"completion_length": 171.0703125,
"epoch": 0.17770640974173693,
"grad_norm": 4.125,
"kl": 0.002526076335925609,
"learning_rate": 8.222935902582631e-07,
"loss": 0.0001,
"reward": 0.3407979141920805,
"reward_std": 0.6505857929587364,
"rewards/reward_func": 0.3407979141920805,
"step": 1328
},
{
"completion_length": 194.09375,
"epoch": 0.17877693028234978,
"grad_norm": 3.140625,
"kl": 0.002656547527294606,
"learning_rate": 8.212230697176503e-07,
"loss": 0.0001,
"reward": 0.16334644611924887,
"reward_std": 0.5975025221705437,
"rewards/reward_func": 0.16334644611924887,
"step": 1336
},
{
"completion_length": 169.984375,
"epoch": 0.17984745082296266,
"grad_norm": 6.1875,
"kl": 0.0022961402573855594,
"learning_rate": 8.201525491770372e-07,
"loss": 0.0001,
"reward": 0.11132130306214094,
"reward_std": 0.6224425416439772,
"rewards/reward_func": 0.11132130306214094,
"step": 1344
},
{
"completion_length": 174.171875,
"epoch": 0.18091797136357554,
"grad_norm": 3.6875,
"kl": 0.002571267934399657,
"learning_rate": 8.190820286364244e-07,
"loss": 0.0001,
"reward": 0.3750305436551571,
"reward_std": 0.6532670613378286,
"rewards/reward_func": 0.3750305436551571,
"step": 1352
},
{
"completion_length": 175.390625,
"epoch": 0.18198849190418842,
"grad_norm": 4.09375,
"kl": 0.0026113019848708063,
"learning_rate": 8.180115080958116e-07,
"loss": 0.0001,
"reward": 0.23261917755007744,
"reward_std": 0.5395109131932259,
"rewards/reward_func": 0.23261917755007744,
"step": 1360
},
{
"completion_length": 218.90625,
"epoch": 0.18305901244480127,
"grad_norm": 3.0625,
"kl": 0.002512662627850659,
"learning_rate": 8.169409875551986e-07,
"loss": 0.0001,
"reward": -0.04414751287549734,
"reward_std": 0.49756659008562565,
"rewards/reward_func": -0.04414751287549734,
"step": 1368
},
{
"completion_length": 203.4453125,
"epoch": 0.18412953298541415,
"grad_norm": 2.84375,
"kl": 0.0023657960555283353,
"learning_rate": 8.158704670145858e-07,
"loss": 0.0001,
"reward": 0.1807372528128326,
"reward_std": 0.6098357774317265,
"rewards/reward_func": 0.1807372528128326,
"step": 1376
},
{
"completion_length": 148.9609375,
"epoch": 0.18520005352602703,
"grad_norm": 4.03125,
"kl": 0.0027785369311459363,
"learning_rate": 8.14799946473973e-07,
"loss": 0.0001,
"reward": 0.587528869509697,
"reward_std": 0.4399991165846586,
"rewards/reward_func": 0.587528869509697,
"step": 1384
},
{
"completion_length": 166.234375,
"epoch": 0.1862705740666399,
"grad_norm": 4.4375,
"kl": 0.002863895075279288,
"learning_rate": 8.137294259333601e-07,
"loss": 0.0001,
"reward": 0.2870405614376068,
"reward_std": 0.5268293377012014,
"rewards/reward_func": 0.2870405614376068,
"step": 1392
},
{
"completion_length": 164.875,
"epoch": 0.1873410946072528,
"grad_norm": 4.3125,
"kl": 0.0027437864046078175,
"learning_rate": 8.126589053927471e-07,
"loss": 0.0001,
"reward": 0.27717010863125324,
"reward_std": 0.6858110204339027,
"rewards/reward_func": 0.27717010863125324,
"step": 1400
},
{
"completion_length": 160.875,
"epoch": 0.18841161514786564,
"grad_norm": 3.375,
"kl": 0.002744226367212832,
"learning_rate": 8.115883848521343e-07,
"loss": 0.0001,
"reward": 0.3531609745696187,
"reward_std": 0.41381734795868397,
"rewards/reward_func": 0.3531609745696187,
"step": 1408
},
{
"completion_length": 202.7734375,
"epoch": 0.18948213568847852,
"grad_norm": 2.859375,
"kl": 0.002227893375675194,
"learning_rate": 8.105178643115215e-07,
"loss": 0.0001,
"reward": 0.05168680660426617,
"reward_std": 0.5793404262512922,
"rewards/reward_func": 0.05168680660426617,
"step": 1416
},
{
"completion_length": 190.796875,
"epoch": 0.1905526562290914,
"grad_norm": 2.765625,
"kl": 0.002415237744571641,
"learning_rate": 8.094473437709086e-07,
"loss": 0.0001,
"reward": -0.012714797630906105,
"reward_std": 0.6679329574108124,
"rewards/reward_func": -0.012714797630906105,
"step": 1424
},
{
"completion_length": 160.0,
"epoch": 0.19162317676970428,
"grad_norm": 3.140625,
"kl": 0.0027612125559244305,
"learning_rate": 8.083768232302956e-07,
"loss": 0.0001,
"reward": 0.5069613344967365,
"reward_std": 0.5272765178233385,
"rewards/reward_func": 0.5069613344967365,
"step": 1432
},
{
"completion_length": 177.1328125,
"epoch": 0.19269369731031713,
"grad_norm": 3.9375,
"kl": 0.002587508424767293,
"learning_rate": 8.073063026896828e-07,
"loss": 0.0001,
"reward": 0.07287294790148735,
"reward_std": 0.3514406271278858,
"rewards/reward_func": 0.07287294790148735,
"step": 1440
},
{
"completion_length": 138.4609375,
"epoch": 0.19376421785093,
"grad_norm": 3.71875,
"kl": 0.002967173932120204,
"learning_rate": 8.0623578214907e-07,
"loss": 0.0001,
"reward": 0.40755754709243774,
"reward_std": 0.48442143853753805,
"rewards/reward_func": 0.40755754709243774,
"step": 1448
},
{
"completion_length": 160.5234375,
"epoch": 0.1948347383915429,
"grad_norm": 3.78125,
"kl": 0.0028059011965524405,
"learning_rate": 8.051652616084571e-07,
"loss": 0.0001,
"reward": 0.3703090399503708,
"reward_std": 0.4106726851314306,
"rewards/reward_func": 0.3703090399503708,
"step": 1456
},
{
"completion_length": 171.15625,
"epoch": 0.19590525893215577,
"grad_norm": 3.25,
"kl": 0.0026552542112767696,
"learning_rate": 8.040947410678442e-07,
"loss": 0.0001,
"reward": 0.3305620066821575,
"reward_std": 0.6117083020508289,
"rewards/reward_func": 0.3305620066821575,
"step": 1464
},
{
"completion_length": 155.4296875,
"epoch": 0.19697577947276865,
"grad_norm": 4.15625,
"kl": 0.003170755269820802,
"learning_rate": 8.030242205272313e-07,
"loss": 0.0001,
"reward": 0.6180750611238182,
"reward_std": 0.40046251006424427,
"rewards/reward_func": 0.6180750611238182,
"step": 1472
},
{
"completion_length": 180.125,
"epoch": 0.1980463000133815,
"grad_norm": 5.40625,
"kl": 0.002536381929530762,
"learning_rate": 8.019536999866184e-07,
"loss": 0.0001,
"reward": 0.2231542430818081,
"reward_std": 0.4958275035023689,
"rewards/reward_func": 0.2231542430818081,
"step": 1480
},
{
"completion_length": 180.4375,
"epoch": 0.19911682055399438,
"grad_norm": 4.875,
"kl": 0.0024219011975219473,
"learning_rate": 8.008831794460056e-07,
"loss": 0.0001,
"reward": 0.1133259404450655,
"reward_std": 0.48838030360639095,
"rewards/reward_func": 0.1133259404450655,
"step": 1488
},
{
"completion_length": 147.6171875,
"epoch": 0.20018734109460726,
"grad_norm": 5.125,
"kl": 0.0031812663073651493,
"learning_rate": 7.998126589053927e-07,
"loss": 0.0001,
"reward": 0.4617150817066431,
"reward_std": 0.32284008618444204,
"rewards/reward_func": 0.4617150817066431,
"step": 1496
},
{
"completion_length": 167.2109375,
"epoch": 0.20125786163522014,
"grad_norm": 4.09375,
"kl": 0.0028175316692795604,
"learning_rate": 7.987421383647799e-07,
"loss": 0.0001,
"reward": 0.3847576631233096,
"reward_std": 0.6411111112684011,
"rewards/reward_func": 0.3847576631233096,
"step": 1504
},
{
"completion_length": 177.4453125,
"epoch": 0.202328382175833,
"grad_norm": 2.796875,
"kl": 0.0026287745859008282,
"learning_rate": 7.976716178241669e-07,
"loss": 0.0001,
"reward": 0.5649865288287401,
"reward_std": 0.5727164149284363,
"rewards/reward_func": 0.5649865288287401,
"step": 1512
},
{
"completion_length": 172.1875,
"epoch": 0.20339890271644587,
"grad_norm": 3.8125,
"kl": 0.002526555268559605,
"learning_rate": 7.966010972835541e-07,
"loss": 0.0001,
"reward": 0.014871623367071152,
"reward_std": 0.7178319171071053,
"rewards/reward_func": 0.014871623367071152,
"step": 1520
},
{
"completion_length": 190.1328125,
"epoch": 0.20446942325705875,
"grad_norm": 3.484375,
"kl": 0.002442143566440791,
"learning_rate": 7.955305767429412e-07,
"loss": 0.0001,
"reward": -0.07442041672766209,
"reward_std": 0.4888562625274062,
"rewards/reward_func": -0.07442041672766209,
"step": 1528
},
{
"completion_length": 207.6328125,
"epoch": 0.20553994379767163,
"grad_norm": 3.359375,
"kl": 0.0033009210601449013,
"learning_rate": 7.944600562023284e-07,
"loss": 0.0001,
"reward": -0.009969270788133144,
"reward_std": 0.6862461306154728,
"rewards/reward_func": -0.009969270788133144,
"step": 1536
},
{
"completion_length": 162.59375,
"epoch": 0.20661046433828448,
"grad_norm": 4.65625,
"kl": 0.002739378687692806,
"learning_rate": 7.933895356617155e-07,
"loss": 0.0001,
"reward": 0.18637081049382687,
"reward_std": 0.5968187265098095,
"rewards/reward_func": 0.18637081049382687,
"step": 1544
},
{
"completion_length": 177.9453125,
"epoch": 0.20768098487889736,
"grad_norm": 3.53125,
"kl": 0.003128286494757049,
"learning_rate": 7.923190151211026e-07,
"loss": 0.0001,
"reward": 0.22664616536349058,
"reward_std": 0.6607938874512911,
"rewards/reward_func": 0.22664616536349058,
"step": 1552
},
{
"completion_length": 172.1328125,
"epoch": 0.20875150541951024,
"grad_norm": 3.953125,
"kl": 0.003004254394909367,
"learning_rate": 7.912484945804897e-07,
"loss": 0.0001,
"reward": 0.2532934434711933,
"reward_std": 0.45475378446280956,
"rewards/reward_func": 0.2532934434711933,
"step": 1560
},
{
"completion_length": 187.9609375,
"epoch": 0.20982202596012312,
"grad_norm": 3.890625,
"kl": 0.0027564516640268266,
"learning_rate": 7.901779740398768e-07,
"loss": 0.0001,
"reward": -0.02841023448854685,
"reward_std": 0.51457286067307,
"rewards/reward_func": -0.02841023448854685,
"step": 1568
},
{
"completion_length": 167.59375,
"epoch": 0.210892546500736,
"grad_norm": 2.515625,
"kl": 0.0030282980733318254,
"learning_rate": 7.89107453499264e-07,
"loss": 0.0001,
"reward": 0.4444689229130745,
"reward_std": 0.5939295422285795,
"rewards/reward_func": 0.4444689229130745,
"step": 1576
},
{
"completion_length": 170.5078125,
"epoch": 0.21196306704134885,
"grad_norm": 4.28125,
"kl": 0.0030873965588398278,
"learning_rate": 7.880369329586512e-07,
"loss": 0.0001,
"reward": 0.380419734865427,
"reward_std": 0.47338528744876385,
"rewards/reward_func": 0.380419734865427,
"step": 1584
},
{
"completion_length": 211.0234375,
"epoch": 0.21303358758196173,
"grad_norm": 2.75,
"kl": 0.002254050428746268,
"learning_rate": 7.869664124180381e-07,
"loss": 0.0001,
"reward": 0.2378121637739241,
"reward_std": 0.4477904764935374,
"rewards/reward_func": 0.2378121637739241,
"step": 1592
},
{
"completion_length": 172.953125,
"epoch": 0.2141041081225746,
"grad_norm": 3.296875,
"kl": 0.0025954252487281337,
"learning_rate": 7.858958918774253e-07,
"loss": 0.0001,
"reward": 0.2678052484989166,
"reward_std": 0.5658102091401815,
"rewards/reward_func": 0.2678052484989166,
"step": 1600
},
{
"completion_length": 186.3125,
"epoch": 0.21517462866318748,
"grad_norm": 3.59375,
"kl": 0.0029648117488250136,
"learning_rate": 7.848253713368125e-07,
"loss": 0.0001,
"reward": -0.019576035905629396,
"reward_std": 0.6440879367291927,
"rewards/reward_func": -0.019576035905629396,
"step": 1608
},
{
"completion_length": 172.1015625,
"epoch": 0.21624514920380034,
"grad_norm": 5.34375,
"kl": 0.0030590661335736513,
"learning_rate": 7.837548507961997e-07,
"loss": 0.0001,
"reward": 0.14971440564841032,
"reward_std": 0.7218026369810104,
"rewards/reward_func": 0.14971440564841032,
"step": 1616
},
{
"completion_length": 194.1796875,
"epoch": 0.21731566974441321,
"grad_norm": 3.859375,
"kl": 0.0026531801267992705,
"learning_rate": 7.826843302555867e-07,
"loss": 0.0001,
"reward": 0.10876535065472126,
"reward_std": 0.5477734114974737,
"rewards/reward_func": 0.10876535065472126,
"step": 1624
},
{
"completion_length": 171.3828125,
"epoch": 0.2183861902850261,
"grad_norm": 3.15625,
"kl": 0.0031873490661382675,
"learning_rate": 7.816138097149738e-07,
"loss": 0.0001,
"reward": 0.45664553716778755,
"reward_std": 0.41215432807803154,
"rewards/reward_func": 0.45664553716778755,
"step": 1632
},
{
"completion_length": 180.3515625,
"epoch": 0.21945671082563897,
"grad_norm": 2.953125,
"kl": 0.0028260272229090333,
"learning_rate": 7.80543289174361e-07,
"loss": 0.0001,
"reward": 0.12724297121167183,
"reward_std": 0.5571443336084485,
"rewards/reward_func": 0.12724297121167183,
"step": 1640
},
{
"completion_length": 171.859375,
"epoch": 0.22052723136625185,
"grad_norm": 4.125,
"kl": 0.003236800170270726,
"learning_rate": 7.794727686337482e-07,
"loss": 0.0001,
"reward": 0.3013367038220167,
"reward_std": 0.47213477827608585,
"rewards/reward_func": 0.3013367038220167,
"step": 1648
},
{
"completion_length": 170.015625,
"epoch": 0.2215977519068647,
"grad_norm": 4.125,
"kl": 0.0037978598556946963,
"learning_rate": 7.784022480931352e-07,
"loss": 0.0002,
"reward": 0.15545153710991144,
"reward_std": 0.4550722800195217,
"rewards/reward_func": 0.15545153710991144,
"step": 1656
},
{
"completion_length": 169.140625,
"epoch": 0.22266827244747758,
"grad_norm": 5.0,
"kl": 0.0032820345077198,
"learning_rate": 7.773317275525224e-07,
"loss": 0.0001,
"reward": 0.4377444460988045,
"reward_std": 0.3618372976779938,
"rewards/reward_func": 0.4377444460988045,
"step": 1664
},
{
"completion_length": 157.703125,
"epoch": 0.22373879298809046,
"grad_norm": 3.90625,
"kl": 0.0029510459426091984,
"learning_rate": 7.762612070119096e-07,
"loss": 0.0001,
"reward": 0.3989291125908494,
"reward_std": 0.6332539850845933,
"rewards/reward_func": 0.3989291125908494,
"step": 1672
},
{
"completion_length": 162.046875,
"epoch": 0.22480931352870334,
"grad_norm": 6.59375,
"kl": 0.0033083032321883366,
"learning_rate": 7.751906864712966e-07,
"loss": 0.0001,
"reward": 0.209829643368721,
"reward_std": 0.6311223246157169,
"rewards/reward_func": 0.209829643368721,
"step": 1680
},
{
"completion_length": 170.15625,
"epoch": 0.2258798340693162,
"grad_norm": 3.796875,
"kl": 0.0031797273695701733,
"learning_rate": 7.741201659306837e-07,
"loss": 0.0001,
"reward": 0.3279075580649078,
"reward_std": 0.5484324526041746,
"rewards/reward_func": 0.3279075580649078,
"step": 1688
},
{
"completion_length": 162.9921875,
"epoch": 0.22695035460992907,
"grad_norm": 4.03125,
"kl": 0.0034852683020289987,
"learning_rate": 7.730496453900709e-07,
"loss": 0.0001,
"reward": 0.4082569610327482,
"reward_std": 0.6405626218765974,
"rewards/reward_func": 0.4082569610327482,
"step": 1696
},
{
"completion_length": 181.515625,
"epoch": 0.22802087515054195,
"grad_norm": 4.40625,
"kl": 0.002649015310453251,
"learning_rate": 7.719791248494581e-07,
"loss": 0.0001,
"reward": 0.369276593439281,
"reward_std": 0.6541860643774271,
"rewards/reward_func": 0.369276593439281,
"step": 1704
},
{
"completion_length": 178.90625,
"epoch": 0.22909139569115483,
"grad_norm": 4.09375,
"kl": 0.003142255067359656,
"learning_rate": 7.709086043088452e-07,
"loss": 0.0001,
"reward": -0.047080494463443756,
"reward_std": 0.5225706771016121,
"rewards/reward_func": -0.047080494463443756,
"step": 1712
},
{
"completion_length": 186.4375,
"epoch": 0.2301619162317677,
"grad_norm": 3.53125,
"kl": 0.002581312000984326,
"learning_rate": 7.698380837682322e-07,
"loss": 0.0001,
"reward": 0.38728183694183826,
"reward_std": 0.5371669437736273,
"rewards/reward_func": 0.38728183694183826,
"step": 1720
},
{
"completion_length": 186.484375,
"epoch": 0.23123243677238056,
"grad_norm": 4.59375,
"kl": 0.0032444458920508623,
"learning_rate": 7.687675632276194e-07,
"loss": 0.0001,
"reward": 0.04241009894758463,
"reward_std": 0.571906641125679,
"rewards/reward_func": 0.04241009894758463,
"step": 1728
},
{
"completion_length": 162.625,
"epoch": 0.23230295731299344,
"grad_norm": 3.3125,
"kl": 0.003708757780259475,
"learning_rate": 7.676970426870065e-07,
"loss": 0.0001,
"reward": 0.44313428178429604,
"reward_std": 0.49149057269096375,
"rewards/reward_func": 0.44313428178429604,
"step": 1736
},
{
"completion_length": 152.2578125,
"epoch": 0.23337347785360632,
"grad_norm": 6.78125,
"kl": 0.004187669139355421,
"learning_rate": 7.666265221463937e-07,
"loss": 0.0002,
"reward": 0.28122030571103096,
"reward_std": 0.66860780864954,
"rewards/reward_func": 0.28122030571103096,
"step": 1744
},
{
"completion_length": 171.765625,
"epoch": 0.2344439983942192,
"grad_norm": 4.3125,
"kl": 0.0036364277184475213,
"learning_rate": 7.655560016057808e-07,
"loss": 0.0001,
"reward": 0.19042097311466932,
"reward_std": 0.6095598358660936,
"rewards/reward_func": 0.19042097311466932,
"step": 1752
},
{
"completion_length": 157.0,
"epoch": 0.23551451893483205,
"grad_norm": 5.0,
"kl": 0.003160024934913963,
"learning_rate": 7.644854810651679e-07,
"loss": 0.0001,
"reward": 0.4550087433308363,
"reward_std": 0.4861539136618376,
"rewards/reward_func": 0.4550087433308363,
"step": 1760
},
{
"completion_length": 166.6328125,
"epoch": 0.23658503947544493,
"grad_norm": 4.375,
"kl": 0.003758498263778165,
"learning_rate": 7.63414960524555e-07,
"loss": 0.0002,
"reward": 0.3313362691551447,
"reward_std": 0.6437762156128883,
"rewards/reward_func": 0.3313362691551447,
"step": 1768
},
{
"completion_length": 151.0859375,
"epoch": 0.2376555600160578,
"grad_norm": 4.5,
"kl": 0.0032230821962002665,
"learning_rate": 7.623444399839421e-07,
"loss": 0.0001,
"reward": 0.22410259768366814,
"reward_std": 0.5125870034098625,
"rewards/reward_func": 0.22410259768366814,
"step": 1776
},
{
"completion_length": 168.3984375,
"epoch": 0.2387260805566707,
"grad_norm": 8.0625,
"kl": 0.0032083308324217796,
"learning_rate": 7.612739194433293e-07,
"loss": 0.0001,
"reward": 0.33182543236762285,
"reward_std": 0.6010603215545416,
"rewards/reward_func": 0.33182543236762285,
"step": 1784
},
{
"completion_length": 207.2265625,
"epoch": 0.23979660109728354,
"grad_norm": 3.34375,
"kl": 0.002648265421157703,
"learning_rate": 7.602033989027165e-07,
"loss": 0.0001,
"reward": 0.03860955499112606,
"reward_std": 0.5622509643435478,
"rewards/reward_func": 0.03860955499112606,
"step": 1792
},
{
"completion_length": 176.15625,
"epoch": 0.24086712163789642,
"grad_norm": 4.8125,
"kl": 0.003711075463797897,
"learning_rate": 7.591328783621035e-07,
"loss": 0.0001,
"reward": 0.30269888415932655,
"reward_std": 0.5988016724586487,
"rewards/reward_func": 0.30269888415932655,
"step": 1800
},
{
"completion_length": 185.546875,
"epoch": 0.2419376421785093,
"grad_norm": 2.734375,
"kl": 0.0029471274465322495,
"learning_rate": 7.580623578214906e-07,
"loss": 0.0001,
"reward": 0.29960334673523903,
"reward_std": 0.5849093981087208,
"rewards/reward_func": 0.29960334673523903,
"step": 1808
},
{
"completion_length": 167.2109375,
"epoch": 0.24300816271912218,
"grad_norm": 4.96875,
"kl": 0.003220759332180023,
"learning_rate": 7.569918372808778e-07,
"loss": 0.0001,
"reward": 0.1449232380837202,
"reward_std": 0.6299447380006313,
"rewards/reward_func": 0.1449232380837202,
"step": 1816
},
{
"completion_length": 171.4375,
"epoch": 0.24407868325973506,
"grad_norm": 3.46875,
"kl": 0.003584496444091201,
"learning_rate": 7.559213167402649e-07,
"loss": 0.0001,
"reward": 0.1334919836372137,
"reward_std": 0.5379905849695206,
"rewards/reward_func": 0.1334919836372137,
"step": 1824
},
{
"completion_length": 176.0703125,
"epoch": 0.2451492038003479,
"grad_norm": 3.734375,
"kl": 0.0033595635613892227,
"learning_rate": 7.548507961996521e-07,
"loss": 0.0001,
"reward": 0.2927993945777416,
"reward_std": 0.48013297095894814,
"rewards/reward_func": 0.2927993945777416,
"step": 1832
},
{
"completion_length": 156.171875,
"epoch": 0.2462197243409608,
"grad_norm": 3.953125,
"kl": 0.0032605840533506125,
"learning_rate": 7.537802756590391e-07,
"loss": 0.0001,
"reward": 0.4554846081882715,
"reward_std": 0.46569772996008396,
"rewards/reward_func": 0.4554846081882715,
"step": 1840
},
{
"completion_length": 154.8515625,
"epoch": 0.24729024488157367,
"grad_norm": 5.09375,
"kl": 0.0038292294193524867,
"learning_rate": 7.527097551184263e-07,
"loss": 0.0002,
"reward": 0.2707878933288157,
"reward_std": 0.6204773802310228,
"rewards/reward_func": 0.2707878933288157,
"step": 1848
},
{
"completion_length": 173.046875,
"epoch": 0.24836076542218655,
"grad_norm": 3.671875,
"kl": 0.003133324411464855,
"learning_rate": 7.516392345778134e-07,
"loss": 0.0001,
"reward": 0.16592059656977654,
"reward_std": 0.5527506861835718,
"rewards/reward_func": 0.16592059656977654,
"step": 1856
},
{
"completion_length": 167.9375,
"epoch": 0.2494312859627994,
"grad_norm": 4.125,
"kl": 0.0033082127920351923,
"learning_rate": 7.505687140372006e-07,
"loss": 0.0001,
"reward": 0.20683408807963133,
"reward_std": 0.5755761060863733,
"rewards/reward_func": 0.20683408807963133,
"step": 1864
},
{
"completion_length": 184.7890625,
"epoch": 0.2505018065034123,
"grad_norm": 4.125,
"kl": 0.0028639852243941277,
"learning_rate": 7.494981934965877e-07,
"loss": 0.0001,
"reward": 0.22647499293088913,
"reward_std": 0.5401746807619929,
"rewards/reward_func": 0.22647499293088913,
"step": 1872
},
{
"completion_length": 177.9921875,
"epoch": 0.25157232704402516,
"grad_norm": 4.75,
"kl": 0.0032885581313166767,
"learning_rate": 7.484276729559747e-07,
"loss": 0.0001,
"reward": 0.19369017332792282,
"reward_std": 0.7017679810523987,
"rewards/reward_func": 0.19369017332792282,
"step": 1880
},
{
"completion_length": 165.171875,
"epoch": 0.252642847584638,
"grad_norm": 4.125,
"kl": 0.0038394963194150478,
"learning_rate": 7.473571524153619e-07,
"loss": 0.0002,
"reward": 0.4050522642210126,
"reward_std": 0.49607561621814966,
"rewards/reward_func": 0.4050522642210126,
"step": 1888
},
{
"completion_length": 212.78125,
"epoch": 0.2537133681252509,
"grad_norm": 4.625,
"kl": 0.003014246642123908,
"learning_rate": 7.462866318747491e-07,
"loss": 0.0001,
"reward": -0.03455093875527382,
"reward_std": 0.5755152553319931,
"rewards/reward_func": -0.03455093875527382,
"step": 1896
},
{
"completion_length": 154.8125,
"epoch": 0.25478388866586377,
"grad_norm": 5.5,
"kl": 0.0037700315297115594,
"learning_rate": 7.452161113341362e-07,
"loss": 0.0002,
"reward": 0.5173049904406071,
"reward_std": 0.5908821411430836,
"rewards/reward_func": 0.5173049904406071,
"step": 1904
},
{
"completion_length": 175.0703125,
"epoch": 0.2558544092064767,
"grad_norm": 3.734375,
"kl": 0.003177426100592129,
"learning_rate": 7.441455907935233e-07,
"loss": 0.0001,
"reward": 0.3084242893382907,
"reward_std": 0.5546926856040955,
"rewards/reward_func": 0.3084242893382907,
"step": 1912
},
{
"completion_length": 153.734375,
"epoch": 0.2569249297470895,
"grad_norm": 4.65625,
"kl": 0.0028014232811983675,
"learning_rate": 7.430750702529105e-07,
"loss": 0.0001,
"reward": 0.4781934395432472,
"reward_std": 0.3370926305651665,
"rewards/reward_func": 0.4781934395432472,
"step": 1920
},
{
"completion_length": 174.8203125,
"epoch": 0.2579954502877024,
"grad_norm": 3.703125,
"kl": 0.0027142605104018003,
"learning_rate": 7.420045497122976e-07,
"loss": 0.0001,
"reward": 0.34992816112935543,
"reward_std": 0.5419151671230793,
"rewards/reward_func": 0.34992816112935543,
"step": 1928
},
{
"completion_length": 164.3671875,
"epoch": 0.2590659708283153,
"grad_norm": 4.53125,
"kl": 0.003608020633691922,
"learning_rate": 7.409340291716846e-07,
"loss": 0.0001,
"reward": 0.23989262245595455,
"reward_std": 0.6161230951547623,
"rewards/reward_func": 0.23989262245595455,
"step": 1936
},
{
"completion_length": 176.71875,
"epoch": 0.26013649136892814,
"grad_norm": 3.46875,
"kl": 0.0028162887319922447,
"learning_rate": 7.398635086310718e-07,
"loss": 0.0001,
"reward": 0.2081197015941143,
"reward_std": 0.6199641041457653,
"rewards/reward_func": 0.2081197015941143,
"step": 1944
},
{
"completion_length": 188.7890625,
"epoch": 0.261207011909541,
"grad_norm": 3.9375,
"kl": 0.0030088693019934,
"learning_rate": 7.38792988090459e-07,
"loss": 0.0001,
"reward": 0.43774592503905296,
"reward_std": 0.5932074896991253,
"rewards/reward_func": 0.43774592503905296,
"step": 1952
},
{
"completion_length": 183.203125,
"epoch": 0.2622775324501539,
"grad_norm": 2.671875,
"kl": 0.0032375668233726174,
"learning_rate": 7.377224675498462e-07,
"loss": 0.0001,
"reward": 0.14370151609182358,
"reward_std": 0.6233563013374805,
"rewards/reward_func": 0.14370151609182358,
"step": 1960
},
{
"completion_length": 188.3984375,
"epoch": 0.26334805299076675,
"grad_norm": 3.640625,
"kl": 0.003420975699555129,
"learning_rate": 7.366519470092331e-07,
"loss": 0.0001,
"reward": 0.3819169942289591,
"reward_std": 0.4839291740208864,
"rewards/reward_func": 0.3819169942289591,
"step": 1968
},
{
"completion_length": 168.9140625,
"epoch": 0.26441857353137965,
"grad_norm": 3.84375,
"kl": 0.0034208787546958774,
"learning_rate": 7.355814264686203e-07,
"loss": 0.0001,
"reward": 0.17081641219556332,
"reward_std": 0.6360666044056416,
"rewards/reward_func": 0.17081641219556332,
"step": 1976
},
{
"completion_length": 147.734375,
"epoch": 0.2654890940719925,
"grad_norm": 3.890625,
"kl": 0.0038502227107528597,
"learning_rate": 7.345109059280075e-07,
"loss": 0.0002,
"reward": 0.5051426645368338,
"reward_std": 0.539917191490531,
"rewards/reward_func": 0.5051426645368338,
"step": 1984
},
{
"completion_length": 144.2421875,
"epoch": 0.26655961461260536,
"grad_norm": 4.40625,
"kl": 0.004192993917968124,
"learning_rate": 7.334403853873946e-07,
"loss": 0.0002,
"reward": 0.5341411675326526,
"reward_std": 0.4230798315256834,
"rewards/reward_func": 0.5341411675326526,
"step": 1992
},
{
"completion_length": 147.1953125,
"epoch": 0.26763013515321826,
"grad_norm": 4.65625,
"kl": 0.00358515654806979,
"learning_rate": 7.323698648467817e-07,
"loss": 0.0001,
"reward": 0.33829054702073336,
"reward_std": 0.48450249992311,
"rewards/reward_func": 0.33829054702073336,
"step": 2000
},
{
"completion_length": 166.828125,
"epoch": 0.2687006556938311,
"grad_norm": 2.21875,
"kl": 0.003072334686294198,
"learning_rate": 7.312993443061688e-07,
"loss": 0.0001,
"reward": 0.358464740216732,
"reward_std": 0.5359793957322836,
"rewards/reward_func": 0.358464740216732,
"step": 2008
},
{
"completion_length": 187.921875,
"epoch": 0.269771176234444,
"grad_norm": 2.984375,
"kl": 0.003413002035813406,
"learning_rate": 7.30228823765556e-07,
"loss": 0.0001,
"reward": 0.12693564407527447,
"reward_std": 0.5908421259373426,
"rewards/reward_func": 0.12693564407527447,
"step": 2016
},
{
"completion_length": 162.4453125,
"epoch": 0.2708416967750569,
"grad_norm": 3.859375,
"kl": 0.0031307056196965277,
"learning_rate": 7.291583032249431e-07,
"loss": 0.0001,
"reward": 0.38590772822499275,
"reward_std": 0.48073394782841206,
"rewards/reward_func": 0.38590772822499275,
"step": 2024
},
{
"completion_length": 192.0234375,
"epoch": 0.2719122173156697,
"grad_norm": 3.546875,
"kl": 0.0032598864345345646,
"learning_rate": 7.280877826843302e-07,
"loss": 0.0001,
"reward": 0.18051442131400108,
"reward_std": 0.6438321061432362,
"rewards/reward_func": 0.18051442131400108,
"step": 2032
},
{
"completion_length": 148.3984375,
"epoch": 0.27298273785628263,
"grad_norm": 4.09375,
"kl": 0.0036882674612570554,
"learning_rate": 7.270172621437174e-07,
"loss": 0.0001,
"reward": 0.4433805178850889,
"reward_std": 0.615590687841177,
"rewards/reward_func": 0.4433805178850889,
"step": 2040
},
{
"completion_length": 161.9375,
"epoch": 0.2740532583968955,
"grad_norm": 3.828125,
"kl": 0.00417991288122721,
"learning_rate": 7.259467416031044e-07,
"loss": 0.0002,
"reward": 0.21279390715062618,
"reward_std": 0.5614198036491871,
"rewards/reward_func": 0.21279390715062618,
"step": 2048
},
{
"completion_length": 157.0234375,
"epoch": 0.2751237789375084,
"grad_norm": 3.359375,
"kl": 0.0038566369330510497,
"learning_rate": 7.248762210624916e-07,
"loss": 0.0002,
"reward": 0.539018552750349,
"reward_std": 0.5531130824238062,
"rewards/reward_func": 0.539018552750349,
"step": 2056
},
{
"completion_length": 188.140625,
"epoch": 0.27619429947812124,
"grad_norm": 4.0,
"kl": 0.0034979561460204422,
"learning_rate": 7.238057005218787e-07,
"loss": 0.0001,
"reward": 0.11634453199803829,
"reward_std": 0.599401269108057,
"rewards/reward_func": 0.11634453199803829,
"step": 2064
},
{
"completion_length": 180.5625,
"epoch": 0.2772648200187341,
"grad_norm": 4.75,
"kl": 0.0037427434872370213,
"learning_rate": 7.227351799812659e-07,
"loss": 0.0001,
"reward": 0.2462961538694799,
"reward_std": 0.5912914611399174,
"rewards/reward_func": 0.2462961538694799,
"step": 2072
},
{
"completion_length": 169.1484375,
"epoch": 0.278335340559347,
"grad_norm": 3.859375,
"kl": 0.003938340552849695,
"learning_rate": 7.21664659440653e-07,
"loss": 0.0002,
"reward": 0.21049488708376884,
"reward_std": 0.513383561745286,
"rewards/reward_func": 0.21049488708376884,
"step": 2080
},
{
"completion_length": 163.03125,
"epoch": 0.27940586109995985,
"grad_norm": 3.90625,
"kl": 0.003593464905861765,
"learning_rate": 7.205941389000401e-07,
"loss": 0.0001,
"reward": 0.42887144535779953,
"reward_std": 0.5823842044919729,
"rewards/reward_func": 0.42887144535779953,
"step": 2088
},
{
"completion_length": 154.2421875,
"epoch": 0.2804763816405727,
"grad_norm": 5.71875,
"kl": 0.004313324898248538,
"learning_rate": 7.195236183594272e-07,
"loss": 0.0002,
"reward": 0.48943471536040306,
"reward_std": 0.5817722771316767,
"rewards/reward_func": 0.48943471536040306,
"step": 2096
},
{
"completion_length": 153.3984375,
"epoch": 0.2815469021811856,
"grad_norm": 4.875,
"kl": 0.003974846331402659,
"learning_rate": 7.184530978188144e-07,
"loss": 0.0002,
"reward": 0.4710603700950742,
"reward_std": 0.47509198915213346,
"rewards/reward_func": 0.4710603700950742,
"step": 2104
},
{
"completion_length": 152.15625,
"epoch": 0.28261742272179846,
"grad_norm": 3.390625,
"kl": 0.003992282261606306,
"learning_rate": 7.173825772782015e-07,
"loss": 0.0002,
"reward": 0.27247738372534513,
"reward_std": 0.5785027798265219,
"rewards/reward_func": 0.27247738372534513,
"step": 2112
},
{
"completion_length": 187.140625,
"epoch": 0.28368794326241137,
"grad_norm": 5.46875,
"kl": 0.003529240610077977,
"learning_rate": 7.163120567375887e-07,
"loss": 0.0001,
"reward": -0.031075291335582733,
"reward_std": 0.5039861313998699,
"rewards/reward_func": -0.031075291335582733,
"step": 2120
},
{
"completion_length": 179.78125,
"epoch": 0.2847584638030242,
"grad_norm": 5.0,
"kl": 0.0030008777684997767,
"learning_rate": 7.152415361969757e-07,
"loss": 0.0001,
"reward": 0.11691426858305931,
"reward_std": 0.5304726148024201,
"rewards/reward_func": 0.11691426858305931,
"step": 2128
},
{
"completion_length": 195.9765625,
"epoch": 0.28582898434363707,
"grad_norm": 3.4375,
"kl": 0.0033693104924168438,
"learning_rate": 7.141710156563628e-07,
"loss": 0.0001,
"reward": 0.1879887394607067,
"reward_std": 0.5351051315665245,
"rewards/reward_func": 0.1879887394607067,
"step": 2136
},
{
"completion_length": 184.78125,
"epoch": 0.28689950488425,
"grad_norm": 3.359375,
"kl": 0.003082483890466392,
"learning_rate": 7.1310049511575e-07,
"loss": 0.0001,
"reward": 0.16520871315151453,
"reward_std": 0.3897149385884404,
"rewards/reward_func": 0.16520871315151453,
"step": 2144
},
{
"completion_length": 161.625,
"epoch": 0.28797002542486283,
"grad_norm": 3.28125,
"kl": 0.0033815766510087997,
"learning_rate": 7.120299745751372e-07,
"loss": 0.0001,
"reward": 0.2989609017968178,
"reward_std": 0.6460195314139128,
"rewards/reward_func": 0.2989609017968178,
"step": 2152
},
{
"completion_length": 158.734375,
"epoch": 0.28904054596547574,
"grad_norm": 3.65625,
"kl": 0.003546297753928229,
"learning_rate": 7.109594540345243e-07,
"loss": 0.0001,
"reward": 0.18108150828629732,
"reward_std": 0.5971081778407097,
"rewards/reward_func": 0.18108150828629732,
"step": 2160
},
{
"completion_length": 162.09375,
"epoch": 0.2901110665060886,
"grad_norm": 4.1875,
"kl": 0.003662400442408398,
"learning_rate": 7.098889334939114e-07,
"loss": 0.0001,
"reward": 0.18057992309331894,
"reward_std": 0.5010180473327637,
"rewards/reward_func": 0.18057992309331894,
"step": 2168
},
{
"completion_length": 168.703125,
"epoch": 0.29118158704670144,
"grad_norm": 3.5625,
"kl": 0.0036488809855654836,
"learning_rate": 7.088184129532985e-07,
"loss": 0.0001,
"reward": 0.3672813940793276,
"reward_std": 0.5888884011656046,
"rewards/reward_func": 0.3672813940793276,
"step": 2176
},
{
"completion_length": 160.6015625,
"epoch": 0.29225210758731435,
"grad_norm": 4.03125,
"kl": 0.0035606406745500863,
"learning_rate": 7.077478924126857e-07,
"loss": 0.0001,
"reward": 0.34612463414669037,
"reward_std": 0.5678670313209295,
"rewards/reward_func": 0.34612463414669037,
"step": 2184
},
{
"completion_length": 186.1640625,
"epoch": 0.2933226281279272,
"grad_norm": 3.546875,
"kl": 0.0036609756061807275,
"learning_rate": 7.066773718720727e-07,
"loss": 0.0001,
"reward": 0.12424429133534431,
"reward_std": 0.5589212942868471,
"rewards/reward_func": 0.12424429133534431,
"step": 2192
},
{
"completion_length": 158.578125,
"epoch": 0.29439314866854005,
"grad_norm": 4.09375,
"kl": 0.003958662680815905,
"learning_rate": 7.056068513314599e-07,
"loss": 0.0002,
"reward": 0.2399831861257553,
"reward_std": 0.6095044985413551,
"rewards/reward_func": 0.2399831861257553,
"step": 2200
},
{
"completion_length": 150.6953125,
"epoch": 0.29546366920915296,
"grad_norm": 3.109375,
"kl": 0.004275305866030976,
"learning_rate": 7.045363307908471e-07,
"loss": 0.0002,
"reward": 0.42568245250731707,
"reward_std": 0.5035090297460556,
"rewards/reward_func": 0.42568245250731707,
"step": 2208
},
{
"completion_length": 198.25,
"epoch": 0.2965341897497658,
"grad_norm": 4.75,
"kl": 0.0037761297717224807,
"learning_rate": 7.034658102502341e-07,
"loss": 0.0002,
"reward": 0.20382929779589176,
"reward_std": 0.6004747971892357,
"rewards/reward_func": 0.20382929779589176,
"step": 2216
},
{
"completion_length": 166.953125,
"epoch": 0.2976047102903787,
"grad_norm": 5.59375,
"kl": 0.004744472214952111,
"learning_rate": 7.023952897096212e-07,
"loss": 0.0002,
"reward": 0.12384441681206226,
"reward_std": 0.5781398452818394,
"rewards/reward_func": 0.12384441681206226,
"step": 2224
},
{
"completion_length": 151.3125,
"epoch": 0.29867523083099157,
"grad_norm": 6.46875,
"kl": 0.004519026639172807,
"learning_rate": 7.013247691690084e-07,
"loss": 0.0002,
"reward": 0.36841049790382385,
"reward_std": 0.3033979944884777,
"rewards/reward_func": 0.36841049790382385,
"step": 2232
},
{
"completion_length": 166.9375,
"epoch": 0.2997457513716044,
"grad_norm": 4.03125,
"kl": 0.003987667616456747,
"learning_rate": 7.002542486283956e-07,
"loss": 0.0002,
"reward": -0.1171187162399292,
"reward_std": 0.42854547686874866,
"rewards/reward_func": -0.1171187162399292,
"step": 2240
},
{
"completion_length": 183.328125,
"epoch": 0.3008162719122173,
"grad_norm": 3.515625,
"kl": 0.0033717694896040484,
"learning_rate": 6.991837280877828e-07,
"loss": 0.0001,
"reward": 0.22414767649024725,
"reward_std": 0.6165672689676285,
"rewards/reward_func": 0.22414767649024725,
"step": 2248
},
{
"completion_length": 193.2421875,
"epoch": 0.3018867924528302,
"grad_norm": 3.140625,
"kl": 0.0031051966943778098,
"learning_rate": 6.981132075471697e-07,
"loss": 0.0001,
"reward": 0.3812776654958725,
"reward_std": 0.5649162493646145,
"rewards/reward_func": 0.3812776654958725,
"step": 2256
},
{
"completion_length": 183.9765625,
"epoch": 0.3029573129934431,
"grad_norm": 4.4375,
"kl": 0.003768587455851957,
"learning_rate": 6.970426870065569e-07,
"loss": 0.0002,
"reward": 0.12498392723500729,
"reward_std": 0.4842628054320812,
"rewards/reward_func": 0.12498392723500729,
"step": 2264
},
{
"completion_length": 200.5390625,
"epoch": 0.30402783353405594,
"grad_norm": 4.3125,
"kl": 0.0034181236114818603,
"learning_rate": 6.959721664659441e-07,
"loss": 0.0001,
"reward": -0.04165226221084595,
"reward_std": 0.5646636541932821,
"rewards/reward_func": -0.04165226221084595,
"step": 2272
},
{
"completion_length": 153.8125,
"epoch": 0.3050983540746688,
"grad_norm": 3.5625,
"kl": 0.0040518031746614724,
"learning_rate": 6.949016459253311e-07,
"loss": 0.0002,
"reward": 0.35562361404299736,
"reward_std": 0.44176073744893074,
"rewards/reward_func": 0.35562361404299736,
"step": 2280
},
{
"completion_length": 179.8046875,
"epoch": 0.3061688746152817,
"grad_norm": 4.53125,
"kl": 0.0034489443933125585,
"learning_rate": 6.938311253847183e-07,
"loss": 0.0001,
"reward": 0.30737813375890255,
"reward_std": 0.5192515105009079,
"rewards/reward_func": 0.30737813375890255,
"step": 2288
},
{
"completion_length": 175.1015625,
"epoch": 0.30723939515589455,
"grad_norm": 4.4375,
"kl": 0.003332895546918735,
"learning_rate": 6.927606048441054e-07,
"loss": 0.0001,
"reward": 0.16119882743805647,
"reward_std": 0.6122260540723801,
"rewards/reward_func": 0.16119882743805647,
"step": 2296
},
{
"completion_length": 166.7421875,
"epoch": 0.30830991569650745,
"grad_norm": 2.65625,
"kl": 0.0034852146345656365,
"learning_rate": 6.916900843034925e-07,
"loss": 0.0001,
"reward": 0.22000440582633018,
"reward_std": 0.5837470442056656,
"rewards/reward_func": 0.22000440582633018,
"step": 2304
},
{
"completion_length": 146.2421875,
"epoch": 0.3093804362371203,
"grad_norm": 3.59375,
"kl": 0.003727599134435877,
"learning_rate": 6.906195637628796e-07,
"loss": 0.0001,
"reward": 0.18992659822106361,
"reward_std": 0.5706657655537128,
"rewards/reward_func": 0.18992659822106361,
"step": 2312
},
{
"completion_length": 152.09375,
"epoch": 0.31045095677773316,
"grad_norm": 3.03125,
"kl": 0.004237443121382967,
"learning_rate": 6.895490432222668e-07,
"loss": 0.0002,
"reward": 0.5161734204739332,
"reward_std": 0.5621990244835615,
"rewards/reward_func": 0.5161734204739332,
"step": 2320
},
{
"completion_length": 139.578125,
"epoch": 0.31152147731834606,
"grad_norm": 3.546875,
"kl": 0.0043363839504309,
"learning_rate": 6.88478522681654e-07,
"loss": 0.0002,
"reward": 0.3602239452302456,
"reward_std": 0.6682011783123016,
"rewards/reward_func": 0.3602239452302456,
"step": 2328
},
{
"completion_length": 159.1171875,
"epoch": 0.3125919978589589,
"grad_norm": 3.296875,
"kl": 0.005018363182898611,
"learning_rate": 6.87408002141041e-07,
"loss": 0.0002,
"reward": 0.18990419153124094,
"reward_std": 0.38154869619756937,
"rewards/reward_func": 0.18990419153124094,
"step": 2336
},
{
"completion_length": 186.171875,
"epoch": 0.31366251839957177,
"grad_norm": 3.75,
"kl": 0.0034995676251128316,
"learning_rate": 6.863374816004281e-07,
"loss": 0.0001,
"reward": 0.28119928389787674,
"reward_std": 0.6371741183102131,
"rewards/reward_func": 0.28119928389787674,
"step": 2344
},
{
"completion_length": 144.5390625,
"epoch": 0.3147330389401847,
"grad_norm": 3.109375,
"kl": 0.003701402310980484,
"learning_rate": 6.852669610598153e-07,
"loss": 0.0001,
"reward": 0.2914201710373163,
"reward_std": 0.5679418547078967,
"rewards/reward_func": 0.2914201710373163,
"step": 2352
},
{
"completion_length": 158.453125,
"epoch": 0.3158035594807975,
"grad_norm": 4.59375,
"kl": 0.003569768596207723,
"learning_rate": 6.841964405192025e-07,
"loss": 0.0001,
"reward": 0.38121682219207287,
"reward_std": 0.5718358978629112,
"rewards/reward_func": 0.38121682219207287,
"step": 2360
},
{
"completion_length": 172.234375,
"epoch": 0.31687408002141043,
"grad_norm": 4.09375,
"kl": 0.003890137653797865,
"learning_rate": 6.831259199785896e-07,
"loss": 0.0002,
"reward": 0.19206082820892334,
"reward_std": 0.5519562661647797,
"rewards/reward_func": 0.19206082820892334,
"step": 2368
},
{
"completion_length": 136.1171875,
"epoch": 0.3179446005620233,
"grad_norm": 3.953125,
"kl": 0.004021885659312829,
"learning_rate": 6.820553994379766e-07,
"loss": 0.0002,
"reward": 0.43440112797543406,
"reward_std": 0.5649959053844213,
"rewards/reward_func": 0.43440112797543406,
"step": 2376
},
{
"completion_length": 189.59375,
"epoch": 0.31901512110263613,
"grad_norm": 7.4375,
"kl": 0.0037745212903246284,
"learning_rate": 6.809848788973638e-07,
"loss": 0.0002,
"reward": 0.08486939128488302,
"reward_std": 0.5615943241864443,
"rewards/reward_func": 0.08486939128488302,
"step": 2384
},
{
"completion_length": 145.40625,
"epoch": 0.32008564164324904,
"grad_norm": 6.15625,
"kl": 0.004177739087026566,
"learning_rate": 6.799143583567509e-07,
"loss": 0.0002,
"reward": 0.03109552478417754,
"reward_std": 0.6218379884958267,
"rewards/reward_func": 0.03109552478417754,
"step": 2392
},
{
"completion_length": 160.3203125,
"epoch": 0.3211561621838619,
"grad_norm": 4.71875,
"kl": 0.004120006924495101,
"learning_rate": 6.788438378161381e-07,
"loss": 0.0002,
"reward": 0.33427711576223373,
"reward_std": 0.5099399294704199,
"rewards/reward_func": 0.33427711576223373,
"step": 2400
},
{
"completion_length": 165.9765625,
"epoch": 0.3222266827244748,
"grad_norm": 3.859375,
"kl": 0.0034675312926992774,
"learning_rate": 6.777733172755252e-07,
"loss": 0.0001,
"reward": 0.4284206023439765,
"reward_std": 0.5410223100334406,
"rewards/reward_func": 0.4284206023439765,
"step": 2408
},
{
"completion_length": 203.5234375,
"epoch": 0.32329720326508765,
"grad_norm": 2.828125,
"kl": 0.003464344044914469,
"learning_rate": 6.767027967349124e-07,
"loss": 0.0001,
"reward": 0.32477592676877975,
"reward_std": 0.5011547729372978,
"rewards/reward_func": 0.32477592676877975,
"step": 2416
},
{
"completion_length": 150.3984375,
"epoch": 0.3243677238057005,
"grad_norm": 3.296875,
"kl": 0.003587738669011742,
"learning_rate": 6.756322761942994e-07,
"loss": 0.0001,
"reward": 0.44885979406535625,
"reward_std": 0.5460297726094723,
"rewards/reward_func": 0.44885979406535625,
"step": 2424
},
{
"completion_length": 169.2890625,
"epoch": 0.3254382443463134,
"grad_norm": 3.296875,
"kl": 0.003916321613360196,
"learning_rate": 6.745617556536866e-07,
"loss": 0.0002,
"reward": 0.12248068256303668,
"reward_std": 0.5732488930225372,
"rewards/reward_func": 0.12248068256303668,
"step": 2432
},
{
"completion_length": 199.9609375,
"epoch": 0.32650876488692626,
"grad_norm": 3.5625,
"kl": 0.0033171565155498683,
"learning_rate": 6.734912351130737e-07,
"loss": 0.0001,
"reward": 0.2281382903456688,
"reward_std": 0.5701967515051365,
"rewards/reward_func": 0.2281382903456688,
"step": 2440
},
{
"completion_length": 191.703125,
"epoch": 0.32757928542753917,
"grad_norm": 4.4375,
"kl": 0.0034852577664423734,
"learning_rate": 6.724207145724608e-07,
"loss": 0.0001,
"reward": 0.21391855087131262,
"reward_std": 0.6829859614372253,
"rewards/reward_func": 0.21391855087131262,
"step": 2448
},
{
"completion_length": 186.296875,
"epoch": 0.328649805968152,
"grad_norm": 3.765625,
"kl": 0.004041536885779351,
"learning_rate": 6.71350194031848e-07,
"loss": 0.0002,
"reward": 0.16749184112995863,
"reward_std": 0.5975307431071997,
"rewards/reward_func": 0.16749184112995863,
"step": 2456
},
{
"completion_length": 169.34375,
"epoch": 0.32972032650876487,
"grad_norm": 3.890625,
"kl": 0.003387822740478441,
"learning_rate": 6.702796734912351e-07,
"loss": 0.0001,
"reward": 0.4501562397927046,
"reward_std": 0.4912100899964571,
"rewards/reward_func": 0.4501562397927046,
"step": 2464
},
{
"completion_length": 147.0,
"epoch": 0.3307908470493778,
"grad_norm": 3.125,
"kl": 0.0038211781647987664,
"learning_rate": 6.692091529506222e-07,
"loss": 0.0002,
"reward": 0.10390966571867466,
"reward_std": 0.4674555938690901,
"rewards/reward_func": 0.10390966571867466,
"step": 2472
},
{
"completion_length": 165.9375,
"epoch": 0.33186136758999063,
"grad_norm": 5.96875,
"kl": 0.0037551842688117176,
"learning_rate": 6.681386324100093e-07,
"loss": 0.0002,
"reward": 0.3239047722890973,
"reward_std": 0.544673465192318,
"rewards/reward_func": 0.3239047722890973,
"step": 2480
},
{
"completion_length": 170.4375,
"epoch": 0.3329318881306035,
"grad_norm": 3.015625,
"kl": 0.004028416806249879,
"learning_rate": 6.670681118693965e-07,
"loss": 0.0002,
"reward": 0.16679776646196842,
"reward_std": 0.4815365634858608,
"rewards/reward_func": 0.16679776646196842,
"step": 2488
},
{
"completion_length": 162.2265625,
"epoch": 0.3340024086712164,
"grad_norm": 3.734375,
"kl": 0.003879312367644161,
"learning_rate": 6.659975913287837e-07,
"loss": 0.0002,
"reward": 0.4071632297709584,
"reward_std": 0.5334546230733395,
"rewards/reward_func": 0.4071632297709584,
"step": 2496
},
{
"completion_length": 154.6953125,
"epoch": 0.33507292921182924,
"grad_norm": 3.84375,
"kl": 0.0042274416191503406,
"learning_rate": 6.649270707881706e-07,
"loss": 0.0002,
"reward": 0.17819023504853249,
"reward_std": 0.49565806053578854,
"rewards/reward_func": 0.17819023504853249,
"step": 2504
},
{
"completion_length": 192.609375,
"epoch": 0.33614344975244215,
"grad_norm": 4.0625,
"kl": 0.0035822324571199715,
"learning_rate": 6.638565502475578e-07,
"loss": 0.0001,
"reward": -0.004483510740101337,
"reward_std": 0.443414025940001,
"rewards/reward_func": -0.004483510740101337,
"step": 2512
},
{
"completion_length": 171.921875,
"epoch": 0.337213970293055,
"grad_norm": 4.65625,
"kl": 0.004149035812588409,
"learning_rate": 6.62786029706945e-07,
"loss": 0.0002,
"reward": 0.08967352751642466,
"reward_std": 0.5806238334625959,
"rewards/reward_func": 0.08967352751642466,
"step": 2520
},
{
"completion_length": 153.5703125,
"epoch": 0.33828449083366785,
"grad_norm": 4.375,
"kl": 0.004091008595423773,
"learning_rate": 6.617155091663322e-07,
"loss": 0.0002,
"reward": 0.32766120694577694,
"reward_std": 0.5018663741648197,
"rewards/reward_func": 0.32766120694577694,
"step": 2528
},
{
"completion_length": 180.65625,
"epoch": 0.33935501137428076,
"grad_norm": 4.0625,
"kl": 0.003188255534041673,
"learning_rate": 6.606449886257192e-07,
"loss": 0.0001,
"reward": 0.09142577461898327,
"reward_std": 0.6621435023844242,
"rewards/reward_func": 0.09142577461898327,
"step": 2536
},
{
"completion_length": 175.28125,
"epoch": 0.3404255319148936,
"grad_norm": 4.46875,
"kl": 0.003918278380297124,
"learning_rate": 6.595744680851063e-07,
"loss": 0.0002,
"reward": 0.26254068687558174,
"reward_std": 0.4977311482653022,
"rewards/reward_func": 0.26254068687558174,
"step": 2544
},
{
"completion_length": 178.5859375,
"epoch": 0.3414960524555065,
"grad_norm": 2.78125,
"kl": 0.0037679201050195843,
"learning_rate": 6.585039475444935e-07,
"loss": 0.0002,
"reward": 0.2359130820259452,
"reward_std": 0.6390624288469553,
"rewards/reward_func": 0.2359130820259452,
"step": 2552
},
{
"completion_length": 185.7421875,
"epoch": 0.34256657299611937,
"grad_norm": 4.0625,
"kl": 0.003888906561769545,
"learning_rate": 6.574334270038807e-07,
"loss": 0.0002,
"reward": 0.06335067562758923,
"reward_std": 0.5590885141864419,
"rewards/reward_func": 0.06335067562758923,
"step": 2560
},
{
"completion_length": 156.1953125,
"epoch": 0.3436370935367322,
"grad_norm": 4.03125,
"kl": 0.004304436064558104,
"learning_rate": 6.563629064632677e-07,
"loss": 0.0002,
"reward": 0.20302090607583523,
"reward_std": 0.5702759772539139,
"rewards/reward_func": 0.20302090607583523,
"step": 2568
},
{
"completion_length": 132.640625,
"epoch": 0.3447076140773451,
"grad_norm": 4.34375,
"kl": 0.004362121399026364,
"learning_rate": 6.552923859226549e-07,
"loss": 0.0002,
"reward": 0.6651312373578548,
"reward_std": 0.3866056464612484,
"rewards/reward_func": 0.6651312373578548,
"step": 2576
},
{
"completion_length": 153.0078125,
"epoch": 0.345778134617958,
"grad_norm": 3.109375,
"kl": 0.0041726555791683495,
"learning_rate": 6.54221865382042e-07,
"loss": 0.0002,
"reward": 0.2306189425289631,
"reward_std": 0.4977853484451771,
"rewards/reward_func": 0.2306189425289631,
"step": 2584
},
{
"completion_length": 166.15625,
"epoch": 0.34684865515857083,
"grad_norm": 3.59375,
"kl": 0.0034308232716284692,
"learning_rate": 6.531513448414291e-07,
"loss": 0.0001,
"reward": 0.11097771301865578,
"reward_std": 0.6078328117728233,
"rewards/reward_func": 0.11097771301865578,
"step": 2592
},
{
"completion_length": 153.0,
"epoch": 0.34791917569918374,
"grad_norm": 4.15625,
"kl": 0.003664735675556585,
"learning_rate": 6.520808243008162e-07,
"loss": 0.0001,
"reward": 0.49361006263643503,
"reward_std": 0.5971282683312893,
"rewards/reward_func": 0.49361006263643503,
"step": 2600
},
{
"completion_length": 164.8828125,
"epoch": 0.3489896962397966,
"grad_norm": 5.0,
"kl": 0.00437125310418196,
"learning_rate": 6.510103037602034e-07,
"loss": 0.0002,
"reward": 0.1592898964881897,
"reward_std": 0.5312252482399344,
"rewards/reward_func": 0.1592898964881897,
"step": 2608
},
{
"completion_length": 165.421875,
"epoch": 0.3500602167804095,
"grad_norm": 4.34375,
"kl": 0.0037221178063191473,
"learning_rate": 6.499397832195906e-07,
"loss": 0.0001,
"reward": 0.41797424480319023,
"reward_std": 0.5129497703164816,
"rewards/reward_func": 0.41797424480319023,
"step": 2616
},
{
"completion_length": 182.96875,
"epoch": 0.35113073732102235,
"grad_norm": 3.65625,
"kl": 0.004204686090815812,
"learning_rate": 6.488692626789775e-07,
"loss": 0.0002,
"reward": 0.24847618490457535,
"reward_std": 0.5075008701533079,
"rewards/reward_func": 0.24847618490457535,
"step": 2624
},
{
"completion_length": 150.1875,
"epoch": 0.3522012578616352,
"grad_norm": 4.53125,
"kl": 0.0036759270005859435,
"learning_rate": 6.477987421383647e-07,
"loss": 0.0001,
"reward": 0.37406357005238533,
"reward_std": 0.43564846366643906,
"rewards/reward_func": 0.37406357005238533,
"step": 2632
},
{
"completion_length": 179.859375,
"epoch": 0.3532717784022481,
"grad_norm": 4.15625,
"kl": 0.003904950339347124,
"learning_rate": 6.467282215977519e-07,
"loss": 0.0002,
"reward": 0.30777904158458114,
"reward_std": 0.5255319569259882,
"rewards/reward_func": 0.30777904158458114,
"step": 2640
},
{
"completion_length": 153.1171875,
"epoch": 0.35434229894286096,
"grad_norm": 4.71875,
"kl": 0.004152281413553283,
"learning_rate": 6.45657701057139e-07,
"loss": 0.0002,
"reward": 0.25166825857013464,
"reward_std": 0.5060204975306988,
"rewards/reward_func": 0.25166825857013464,
"step": 2648
},
{
"completion_length": 179.9140625,
"epoch": 0.35541281948347386,
"grad_norm": 5.09375,
"kl": 0.0037966810341458768,
"learning_rate": 6.445871805165262e-07,
"loss": 0.0002,
"reward": 0.048941366374492645,
"reward_std": 0.5631339196115732,
"rewards/reward_func": 0.048941366374492645,
"step": 2656
},
{
"completion_length": 144.828125,
"epoch": 0.3564833400240867,
"grad_norm": 4.3125,
"kl": 0.004289341013645753,
"learning_rate": 6.435166599759133e-07,
"loss": 0.0002,
"reward": 0.2979842973873019,
"reward_std": 0.5271645337343216,
"rewards/reward_func": 0.2979842973873019,
"step": 2664
},
{
"completion_length": 171.3984375,
"epoch": 0.35755386056469957,
"grad_norm": 4.59375,
"kl": 0.0042981151200365275,
"learning_rate": 6.424461394353004e-07,
"loss": 0.0002,
"reward": 0.17236249335110188,
"reward_std": 0.6582519998773932,
"rewards/reward_func": 0.17236249335110188,
"step": 2672
},
{
"completion_length": 174.0390625,
"epoch": 0.3586243811053125,
"grad_norm": 3.828125,
"kl": 0.004372917755972594,
"learning_rate": 6.413756188946875e-07,
"loss": 0.0002,
"reward": 0.1233967412263155,
"reward_std": 0.5646042246371508,
"rewards/reward_func": 0.1233967412263155,
"step": 2680
},
{
"completion_length": 215.0859375,
"epoch": 0.3596949016459253,
"grad_norm": 3.265625,
"kl": 0.0034220777451992035,
"learning_rate": 6.403050983540746e-07,
"loss": 0.0001,
"reward": 0.022455230355262756,
"reward_std": 0.4265636382624507,
"rewards/reward_func": 0.022455230355262756,
"step": 2688
},
{
"completion_length": 165.6640625,
"epoch": 0.36076542218653823,
"grad_norm": 3.3125,
"kl": 0.0036657150485552847,
"learning_rate": 6.392345778134618e-07,
"loss": 0.0001,
"reward": 0.1397247351706028,
"reward_std": 0.6145136766135693,
"rewards/reward_func": 0.1397247351706028,
"step": 2696
},
{
"completion_length": 150.046875,
"epoch": 0.3618359427271511,
"grad_norm": 4.40625,
"kl": 0.0038523364637512714,
"learning_rate": 6.381640572728489e-07,
"loss": 0.0002,
"reward": 0.09757146798074245,
"reward_std": 0.5506066791713238,
"rewards/reward_func": 0.09757146798074245,
"step": 2704
},
{
"completion_length": 185.109375,
"epoch": 0.36290646326776393,
"grad_norm": 3.28125,
"kl": 0.0035323128395248204,
"learning_rate": 6.37093536732236e-07,
"loss": 0.0001,
"reward": -0.08146252483129501,
"reward_std": 0.4336923873052001,
"rewards/reward_func": -0.08146252483129501,
"step": 2712
},
{
"completion_length": 196.1015625,
"epoch": 0.36397698380837684,
"grad_norm": 2.65625,
"kl": 0.003823416627710685,
"learning_rate": 6.360230161916231e-07,
"loss": 0.0002,
"reward": 0.3048650873824954,
"reward_std": 0.6732109598815441,
"rewards/reward_func": 0.3048650873824954,
"step": 2720
},
{
"completion_length": 160.6640625,
"epoch": 0.3650475043489897,
"grad_norm": 4.15625,
"kl": 0.00394167794729583,
"learning_rate": 6.349524956510103e-07,
"loss": 0.0002,
"reward": 0.3657691851258278,
"reward_std": 0.6270986460149288,
"rewards/reward_func": 0.3657691851258278,
"step": 2728
},
{
"completion_length": 158.0625,
"epoch": 0.36611802488960254,
"grad_norm": 4.34375,
"kl": 0.004382914863526821,
"learning_rate": 6.338819751103974e-07,
"loss": 0.0002,
"reward": 0.23460809141397476,
"reward_std": 0.5130759598687291,
"rewards/reward_func": 0.23460809141397476,
"step": 2736
},
{
"completion_length": 185.953125,
"epoch": 0.36718854543021545,
"grad_norm": 5.65625,
"kl": 0.004101004218682647,
"learning_rate": 6.328114545697846e-07,
"loss": 0.0002,
"reward": 0.1987906889989972,
"reward_std": 0.6350626721978188,
"rewards/reward_func": 0.1987906889989972,
"step": 2744
},
{
"completion_length": 206.921875,
"epoch": 0.3682590659708283,
"grad_norm": 3.140625,
"kl": 0.0035723625624086708,
"learning_rate": 6.317409340291716e-07,
"loss": 0.0001,
"reward": 0.05194275360554457,
"reward_std": 0.5546863917261362,
"rewards/reward_func": 0.05194275360554457,
"step": 2752
},
{
"completion_length": 189.2578125,
"epoch": 0.3693295865114412,
"grad_norm": 5.0,
"kl": 0.004587263334542513,
"learning_rate": 6.306704134885587e-07,
"loss": 0.0002,
"reward": 0.1652057245373726,
"reward_std": 0.6709528639912605,
"rewards/reward_func": 0.1652057245373726,
"step": 2760
},
{
"completion_length": 178.6796875,
"epoch": 0.37040010705205406,
"grad_norm": 3.890625,
"kl": 0.004105961037566885,
"learning_rate": 6.295998929479459e-07,
"loss": 0.0002,
"reward": 0.2511095069348812,
"reward_std": 0.48893540259450674,
"rewards/reward_func": 0.2511095069348812,
"step": 2768
},
{
"completion_length": 202.7578125,
"epoch": 0.3714706275926669,
"grad_norm": 3.890625,
"kl": 0.003215010277926922,
"learning_rate": 6.285293724073331e-07,
"loss": 0.0001,
"reward": 0.1104801157489419,
"reward_std": 0.5324361100792885,
"rewards/reward_func": 0.1104801157489419,
"step": 2776
},
{
"completion_length": 216.71875,
"epoch": 0.3725411481332798,
"grad_norm": 3.546875,
"kl": 0.003063723910599947,
"learning_rate": 6.274588518667202e-07,
"loss": 0.0001,
"reward": 0.1753014111891389,
"reward_std": 0.5819915365427732,
"rewards/reward_func": 0.1753014111891389,
"step": 2784
},
{
"completion_length": 159.84375,
"epoch": 0.37361166867389267,
"grad_norm": 3.34375,
"kl": 0.004174454777967185,
"learning_rate": 6.263883313261072e-07,
"loss": 0.0002,
"reward": 0.45437810756266117,
"reward_std": 0.4158835466951132,
"rewards/reward_func": 0.45437810756266117,
"step": 2792
},
{
"completion_length": 208.875,
"epoch": 0.3746821892145056,
"grad_norm": 3.296875,
"kl": 0.0031874127162154764,
"learning_rate": 6.253178107854944e-07,
"loss": 0.0001,
"reward": 0.3352484591305256,
"reward_std": 0.5119593776762486,
"rewards/reward_func": 0.3352484591305256,
"step": 2800
},
{
"completion_length": 197.4765625,
"epoch": 0.37575270975511843,
"grad_norm": 3.28125,
"kl": 0.0036684646474896,
"learning_rate": 6.242472902448816e-07,
"loss": 0.0001,
"reward": 0.27760483510792255,
"reward_std": 0.6436055637896061,
"rewards/reward_func": 0.27760483510792255,
"step": 2808
},
{
"completion_length": 152.84375,
"epoch": 0.3768232302957313,
"grad_norm": 4.8125,
"kl": 0.004549535195110366,
"learning_rate": 6.231767697042686e-07,
"loss": 0.0002,
"reward": 0.46624478977173567,
"reward_std": 0.6175453588366508,
"rewards/reward_func": 0.46624478977173567,
"step": 2816
},
{
"completion_length": 144.796875,
"epoch": 0.3778937508363442,
"grad_norm": 3.421875,
"kl": 0.0038801982591394335,
"learning_rate": 6.221062491636558e-07,
"loss": 0.0002,
"reward": 0.3716530613601208,
"reward_std": 0.5027909129858017,
"rewards/reward_func": 0.3716530613601208,
"step": 2824
},
{
"completion_length": 189.3984375,
"epoch": 0.37896427137695704,
"grad_norm": 4.25,
"kl": 0.003531339403707534,
"learning_rate": 6.210357286230429e-07,
"loss": 0.0001,
"reward": 0.0588820856064558,
"reward_std": 0.5144321415573359,
"rewards/reward_func": 0.0588820856064558,
"step": 2832
},
{
"completion_length": 151.71875,
"epoch": 0.3800347919175699,
"grad_norm": 4.40625,
"kl": 0.004161316930549219,
"learning_rate": 6.199652080824301e-07,
"loss": 0.0002,
"reward": 0.21836877800524235,
"reward_std": 0.6778084672987461,
"rewards/reward_func": 0.21836877800524235,
"step": 2840
},
{
"completion_length": 156.9140625,
"epoch": 0.3811053124581828,
"grad_norm": 5.03125,
"kl": 0.0043344263976905495,
"learning_rate": 6.188946875418171e-07,
"loss": 0.0002,
"reward": 0.43052874132990837,
"reward_std": 0.5890010427683592,
"rewards/reward_func": 0.43052874132990837,
"step": 2848
},
{
"completion_length": 165.859375,
"epoch": 0.38217583299879565,
"grad_norm": 4.71875,
"kl": 0.0040927641675807536,
"learning_rate": 6.178241670012043e-07,
"loss": 0.0002,
"reward": 0.09390930086374283,
"reward_std": 0.4254406839609146,
"rewards/reward_func": 0.09390930086374283,
"step": 2856
},
{
"completion_length": 151.796875,
"epoch": 0.38324635353940856,
"grad_norm": 4.6875,
"kl": 0.004312922974349931,
"learning_rate": 6.167536464605915e-07,
"loss": 0.0002,
"reward": 0.16593856737017632,
"reward_std": 0.6011241041123867,
"rewards/reward_func": 0.16593856737017632,
"step": 2864
},
{
"completion_length": 147.5859375,
"epoch": 0.3843168740800214,
"grad_norm": 4.78125,
"kl": 0.004850049444939941,
"learning_rate": 6.156831259199785e-07,
"loss": 0.0002,
"reward": 0.3436956908553839,
"reward_std": 0.4854423590004444,
"rewards/reward_func": 0.3436956908553839,
"step": 2872
},
{
"completion_length": 144.6015625,
"epoch": 0.38538739462063426,
"grad_norm": 3.453125,
"kl": 0.00466370303183794,
"learning_rate": 6.146126053793656e-07,
"loss": 0.0002,
"reward": 0.3992779180407524,
"reward_std": 0.5042364671826363,
"rewards/reward_func": 0.3992779180407524,
"step": 2880
},
{
"completion_length": 144.2578125,
"epoch": 0.38645791516124717,
"grad_norm": 5.15625,
"kl": 0.004223753814585507,
"learning_rate": 6.135420848387528e-07,
"loss": 0.0002,
"reward": -0.015506982803344727,
"reward_std": 0.6913204118609428,
"rewards/reward_func": -0.015506982803344727,
"step": 2888
},
{
"completion_length": 201.4453125,
"epoch": 0.38752843570186,
"grad_norm": 3.84375,
"kl": 0.00345133469090797,
"learning_rate": 6.1247156429814e-07,
"loss": 0.0001,
"reward": -0.23047319240868092,
"reward_std": 0.5747088566422462,
"rewards/reward_func": -0.23047319240868092,
"step": 2896
},
{
"completion_length": 180.2890625,
"epoch": 0.3885989562424729,
"grad_norm": 3.125,
"kl": 0.004600081476382911,
"learning_rate": 6.114010437575271e-07,
"loss": 0.0002,
"reward": -0.17729684710502625,
"reward_std": 0.3335055038332939,
"rewards/reward_func": -0.17729684710502625,
"step": 2904
},
{
"completion_length": 153.8046875,
"epoch": 0.3896694767830858,
"grad_norm": 4.09375,
"kl": 0.003945650125388056,
"learning_rate": 6.103305232169142e-07,
"loss": 0.0002,
"reward": 0.2731490605510771,
"reward_std": 0.573139002546668,
"rewards/reward_func": 0.2731490605510771,
"step": 2912
},
{
"completion_length": 150.1171875,
"epoch": 0.39073999732369863,
"grad_norm": 3.90625,
"kl": 0.004451691260328516,
"learning_rate": 6.092600026763013e-07,
"loss": 0.0002,
"reward": 0.1808023676276207,
"reward_std": 0.5803926577791572,
"rewards/reward_func": 0.1808023676276207,
"step": 2920
},
{
"completion_length": 199.84375,
"epoch": 0.39181051786431154,
"grad_norm": 3.5625,
"kl": 0.0031813042878638953,
"learning_rate": 6.081894821356885e-07,
"loss": 0.0001,
"reward": 0.22766825137659907,
"reward_std": 0.6164026372134686,
"rewards/reward_func": 0.22766825137659907,
"step": 2928
},
{
"completion_length": 199.2734375,
"epoch": 0.3928810384049244,
"grad_norm": 3.28125,
"kl": 0.0038126638100948185,
"learning_rate": 6.071189615950756e-07,
"loss": 0.0002,
"reward": 0.16099986899644136,
"reward_std": 0.7263254784047604,
"rewards/reward_func": 0.16099986899644136,
"step": 2936
},
{
"completion_length": 157.765625,
"epoch": 0.3939515589455373,
"grad_norm": 3.5,
"kl": 0.004354376200353727,
"learning_rate": 6.060484410544627e-07,
"loss": 0.0002,
"reward": 0.31015807017683983,
"reward_std": 0.6586577072739601,
"rewards/reward_func": 0.31015807017683983,
"step": 2944
},
{
"completion_length": 155.671875,
"epoch": 0.39502207948615015,
"grad_norm": 3.609375,
"kl": 0.004368482739664614,
"learning_rate": 6.049779205138499e-07,
"loss": 0.0002,
"reward": 0.5017051734030247,
"reward_std": 0.42575474083423615,
"rewards/reward_func": 0.5017051734030247,
"step": 2952
},
{
"completion_length": 129.375,
"epoch": 0.396092600026763,
"grad_norm": 4.6875,
"kl": 0.004999362543458119,
"learning_rate": 6.039073999732369e-07,
"loss": 0.0002,
"reward": 0.5034809075295925,
"reward_std": 0.5035946983844042,
"rewards/reward_func": 0.5034809075295925,
"step": 2960
},
{
"completion_length": 171.203125,
"epoch": 0.3971631205673759,
"grad_norm": 3.09375,
"kl": 0.003695404506288469,
"learning_rate": 6.028368794326241e-07,
"loss": 0.0001,
"reward": 0.3904507216066122,
"reward_std": 0.5394172128289938,
"rewards/reward_func": 0.3904507216066122,
"step": 2968
},
{
"completion_length": 179.3515625,
"epoch": 0.39823364110798876,
"grad_norm": 2.796875,
"kl": 0.004422289348440245,
"learning_rate": 6.017663588920112e-07,
"loss": 0.0002,
"reward": 0.3067244812846184,
"reward_std": 0.48167256638407707,
"rewards/reward_func": 0.3067244812846184,
"step": 2976
},
{
"completion_length": 241.015625,
"epoch": 0.3993041616486016,
"grad_norm": 3.40625,
"kl": 0.0026670149818528444,
"learning_rate": 6.006958383513984e-07,
"loss": 0.0001,
"reward": -0.14665643870830536,
"reward_std": 0.43440048210322857,
"rewards/reward_func": -0.14665643870830536,
"step": 2984
},
{
"completion_length": 182.59375,
"epoch": 0.4003746821892145,
"grad_norm": 4.78125,
"kl": 0.004074128781212494,
"learning_rate": 5.996253178107855e-07,
"loss": 0.0002,
"reward": 0.2561218962073326,
"reward_std": 0.5822538835927844,
"rewards/reward_func": 0.2561218962073326,
"step": 2992
},
{
"completion_length": 193.3515625,
"epoch": 0.40144520272982737,
"grad_norm": 4.1875,
"kl": 0.00386466141208075,
"learning_rate": 5.985547972701726e-07,
"loss": 0.0002,
"reward": 0.21375904511660337,
"reward_std": 0.39464170206338167,
"rewards/reward_func": 0.21375904511660337,
"step": 3000
},
{
"completion_length": 159.328125,
"epoch": 0.4025157232704403,
"grad_norm": 3.734375,
"kl": 0.00333388164290227,
"learning_rate": 5.974842767295597e-07,
"loss": 0.0001,
"reward": 0.5523056299425662,
"reward_std": 0.47927504777908325,
"rewards/reward_func": 0.5523056299425662,
"step": 3008
},
{
"completion_length": 164.9921875,
"epoch": 0.4035862438110531,
"grad_norm": 3.625,
"kl": 0.004161383403697982,
"learning_rate": 5.964137561889468e-07,
"loss": 0.0002,
"reward": 0.17364376038312912,
"reward_std": 0.5346489679068327,
"rewards/reward_func": 0.17364376038312912,
"step": 3016
},
{
"completion_length": 159.7578125,
"epoch": 0.404656764351666,
"grad_norm": 4.0625,
"kl": 0.003841915662633255,
"learning_rate": 5.95343235648334e-07,
"loss": 0.0002,
"reward": 0.4289398565888405,
"reward_std": 0.47436373494565487,
"rewards/reward_func": 0.4289398565888405,
"step": 3024
},
{
"completion_length": 182.9609375,
"epoch": 0.4057272848922789,
"grad_norm": 2.453125,
"kl": 0.004264735238393769,
"learning_rate": 5.942727151077212e-07,
"loss": 0.0002,
"reward": 0.021798385307192802,
"reward_std": 0.5079176230356097,
"rewards/reward_func": 0.021798385307192802,
"step": 3032
},
{
"completion_length": 160.546875,
"epoch": 0.40679780543289173,
"grad_norm": 3.0,
"kl": 0.005132144928211346,
"learning_rate": 5.932021945671082e-07,
"loss": 0.0002,
"reward": 0.5438925623893738,
"reward_std": 0.42197058349847794,
"rewards/reward_func": 0.5438925623893738,
"step": 3040
},
{
"completion_length": 156.0859375,
"epoch": 0.40786832597350464,
"grad_norm": 4.1875,
"kl": 0.003920425719115883,
"learning_rate": 5.921316740264953e-07,
"loss": 0.0002,
"reward": 0.3435197048820555,
"reward_std": 0.584853507578373,
"rewards/reward_func": 0.3435197048820555,
"step": 3048
},
{
"completion_length": 148.8359375,
"epoch": 0.4089388465141175,
"grad_norm": 1.90625,
"kl": 0.004158479205216281,
"learning_rate": 5.910611534858825e-07,
"loss": 0.0002,
"reward": 0.3984090769663453,
"reward_std": 0.4647171348333359,
"rewards/reward_func": 0.3984090769663453,
"step": 3056
},
{
"completion_length": 175.2109375,
"epoch": 0.41000936705473034,
"grad_norm": 3.984375,
"kl": 0.003644221549620852,
"learning_rate": 5.899906329452697e-07,
"loss": 0.0001,
"reward": 0.16703728586435318,
"reward_std": 0.5931989103555679,
"rewards/reward_func": 0.16703728586435318,
"step": 3064
},
{
"completion_length": 173.90625,
"epoch": 0.41107988759534325,
"grad_norm": 3.21875,
"kl": 0.004099360230611637,
"learning_rate": 5.889201124046567e-07,
"loss": 0.0002,
"reward": -0.03924668487161398,
"reward_std": 0.6728265807032585,
"rewards/reward_func": -0.03924668487161398,
"step": 3072
},
{
"completion_length": 164.953125,
"epoch": 0.4121504081359561,
"grad_norm": 3.34375,
"kl": 0.004925543296849355,
"learning_rate": 5.878495918640438e-07,
"loss": 0.0002,
"reward": 0.3322628792375326,
"reward_std": 0.5970859546214342,
"rewards/reward_func": 0.3322628792375326,
"step": 3080
},
{
"completion_length": 162.0234375,
"epoch": 0.41322092867656895,
"grad_norm": 4.0625,
"kl": 0.004399422614369541,
"learning_rate": 5.86779071323431e-07,
"loss": 0.0002,
"reward": 0.36330926418304443,
"reward_std": 0.45976690761744976,
"rewards/reward_func": 0.36330926418304443,
"step": 3088
},
{
"completion_length": 138.796875,
"epoch": 0.41429144921718186,
"grad_norm": 3.140625,
"kl": 0.004546679730992764,
"learning_rate": 5.857085507828181e-07,
"loss": 0.0002,
"reward": 0.4072983153164387,
"reward_std": 0.600585313513875,
"rewards/reward_func": 0.4072983153164387,
"step": 3096
},
{
"completion_length": 214.953125,
"epoch": 0.4153619697577947,
"grad_norm": 2.9375,
"kl": 0.00350517057813704,
"learning_rate": 5.846380302422052e-07,
"loss": 0.0001,
"reward": 0.2474349234253168,
"reward_std": 0.498451117426157,
"rewards/reward_func": 0.2474349234253168,
"step": 3104
},
{
"completion_length": 139.453125,
"epoch": 0.4164324902984076,
"grad_norm": 4.1875,
"kl": 0.00461685229674913,
"learning_rate": 5.835675097015924e-07,
"loss": 0.0002,
"reward": 0.471061285585165,
"reward_std": 0.45820480585098267,
"rewards/reward_func": 0.471061285585165,
"step": 3112
},
{
"completion_length": 144.4453125,
"epoch": 0.41750301083902047,
"grad_norm": 4.6875,
"kl": 0.004540506488410756,
"learning_rate": 5.824969891609795e-07,
"loss": 0.0002,
"reward": 0.4106574021279812,
"reward_std": 0.47535229101777077,
"rewards/reward_func": 0.4106574021279812,
"step": 3120
},
{
"completion_length": 161.71875,
"epoch": 0.4185735313796333,
"grad_norm": 4.78125,
"kl": 0.004575909668346867,
"learning_rate": 5.814264686203665e-07,
"loss": 0.0002,
"reward": -0.04112925007939339,
"reward_std": 0.4119059517979622,
"rewards/reward_func": -0.04112925007939339,
"step": 3128
},
{
"completion_length": 196.5703125,
"epoch": 0.41964405192024623,
"grad_norm": 4.90625,
"kl": 0.003955840336857364,
"learning_rate": 5.803559480797537e-07,
"loss": 0.0002,
"reward": 0.024920357391238213,
"reward_std": 0.5616731429472566,
"rewards/reward_func": 0.024920357391238213,
"step": 3136
},
{
"completion_length": 204.4296875,
"epoch": 0.4207145724608591,
"grad_norm": 4.96875,
"kl": 0.0033879343245644122,
"learning_rate": 5.792854275391409e-07,
"loss": 0.0001,
"reward": 0.2112936358898878,
"reward_std": 0.5542439222335815,
"rewards/reward_func": 0.2112936358898878,
"step": 3144
},
{
"completion_length": 188.40625,
"epoch": 0.421785093001472,
"grad_norm": 4.40625,
"kl": 0.003975967440055683,
"learning_rate": 5.782149069985281e-07,
"loss": 0.0002,
"reward": -0.049715520814061165,
"reward_std": 0.6521423272788525,
"rewards/reward_func": -0.049715520814061165,
"step": 3152
},
{
"completion_length": 163.4375,
"epoch": 0.42285561354208484,
"grad_norm": 2.65625,
"kl": 0.004166945233009756,
"learning_rate": 5.771443864579151e-07,
"loss": 0.0002,
"reward": 0.41062634997069836,
"reward_std": 0.4943850450217724,
"rewards/reward_func": 0.41062634997069836,
"step": 3160
},
{
"completion_length": 130.859375,
"epoch": 0.4239261340826977,
"grad_norm": 3.859375,
"kl": 0.005400074122007936,
"learning_rate": 5.760738659173022e-07,
"loss": 0.0002,
"reward": 0.43160221725702286,
"reward_std": 0.5389326587319374,
"rewards/reward_func": 0.43160221725702286,
"step": 3168
},
{
"completion_length": 172.703125,
"epoch": 0.4249966546233106,
"grad_norm": 4.53125,
"kl": 0.005152460333192721,
"learning_rate": 5.750033453766894e-07,
"loss": 0.0002,
"reward": 0.06674006022512913,
"reward_std": 0.5032580755650997,
"rewards/reward_func": 0.06674006022512913,
"step": 3176
},
{
"completion_length": 154.6875,
"epoch": 0.42606717516392345,
"grad_norm": 4.125,
"kl": 0.004849692864809185,
"learning_rate": 5.739328248360766e-07,
"loss": 0.0002,
"reward": 0.33508316054940224,
"reward_std": 0.5568934958428144,
"rewards/reward_func": 0.33508316054940224,
"step": 3184
},
{
"completion_length": 149.921875,
"epoch": 0.42713769570453636,
"grad_norm": 3.5,
"kl": 0.004149941669311374,
"learning_rate": 5.728623042954636e-07,
"loss": 0.0002,
"reward": 0.560466131195426,
"reward_std": 0.4996361844241619,
"rewards/reward_func": 0.560466131195426,
"step": 3192
},
{
"completion_length": 162.3515625,
"epoch": 0.4282082162451492,
"grad_norm": 2.375,
"kl": 0.00443269161041826,
"learning_rate": 5.717917837548508e-07,
"loss": 0.0002,
"reward": 0.4073672443628311,
"reward_std": 0.4750672820955515,
"rewards/reward_func": 0.4073672443628311,
"step": 3200
},
{
"completion_length": 173.3203125,
"epoch": 0.42927873678576206,
"grad_norm": 4.125,
"kl": 0.0039921577263157815,
"learning_rate": 5.707212632142379e-07,
"loss": 0.0002,
"reward": -0.03342257114127278,
"reward_std": 0.6672232635319233,
"rewards/reward_func": -0.03342257114127278,
"step": 3208
},
{
"completion_length": 155.9375,
"epoch": 0.43034925732637497,
"grad_norm": 5.78125,
"kl": 0.004704885970568284,
"learning_rate": 5.69650742673625e-07,
"loss": 0.0002,
"reward": 0.3005738127976656,
"reward_std": 0.5849708952009678,
"rewards/reward_func": 0.3005738127976656,
"step": 3216
},
{
"completion_length": 185.84375,
"epoch": 0.4314197778669878,
"grad_norm": 2.890625,
"kl": 0.0036070215137442574,
"learning_rate": 5.685802221330121e-07,
"loss": 0.0001,
"reward": -0.018972497433423996,
"reward_std": 0.5354725271463394,
"rewards/reward_func": -0.018972497433423996,
"step": 3224
},
{
"completion_length": 178.546875,
"epoch": 0.43249029840760067,
"grad_norm": 2.625,
"kl": 0.004298602405469865,
"learning_rate": 5.675097015923993e-07,
"loss": 0.0002,
"reward": 0.3231694786809385,
"reward_std": 0.4985707551240921,
"rewards/reward_func": 0.3231694786809385,
"step": 3232
},
{
"completion_length": 162.3203125,
"epoch": 0.4335608189482136,
"grad_norm": 3.59375,
"kl": 0.0039607091166544706,
"learning_rate": 5.664391810517865e-07,
"loss": 0.0002,
"reward": 0.11156550701707602,
"reward_std": 0.7430830076336861,
"rewards/reward_func": 0.11156550701707602,
"step": 3240
},
{
"completion_length": 150.40625,
"epoch": 0.43463133948882643,
"grad_norm": 3.3125,
"kl": 0.0049680424854159355,
"learning_rate": 5.653686605111735e-07,
"loss": 0.0002,
"reward": 0.36818648502230644,
"reward_std": 0.4827171713113785,
"rewards/reward_func": 0.36818648502230644,
"step": 3248
},
{
"completion_length": 148.8046875,
"epoch": 0.43570186002943934,
"grad_norm": 5.40625,
"kl": 0.004517415567534044,
"learning_rate": 5.642981399705606e-07,
"loss": 0.0002,
"reward": 0.5136874578893185,
"reward_std": 0.4101978652179241,
"rewards/reward_func": 0.5136874578893185,
"step": 3256
},
{
"completion_length": 159.90625,
"epoch": 0.4367723805700522,
"grad_norm": 4.5,
"kl": 0.005192397540668026,
"learning_rate": 5.632276194299478e-07,
"loss": 0.0002,
"reward": 0.36011555418372154,
"reward_std": 0.590018224902451,
"rewards/reward_func": 0.36011555418372154,
"step": 3264
},
{
"completion_length": 165.8359375,
"epoch": 0.43784290111066504,
"grad_norm": 5.78125,
"kl": 0.004310069081839174,
"learning_rate": 5.621570988893349e-07,
"loss": 0.0002,
"reward": 0.44862041622400284,
"reward_std": 0.5028228275477886,
"rewards/reward_func": 0.44862041622400284,
"step": 3272
},
{
"completion_length": 163.8203125,
"epoch": 0.43891342165127795,
"grad_norm": 3.609375,
"kl": 0.004184526915196329,
"learning_rate": 5.610865783487221e-07,
"loss": 0.0002,
"reward": 0.3917035781778395,
"reward_std": 0.5541238645091653,
"rewards/reward_func": 0.3917035781778395,
"step": 3280
},
{
"completion_length": 186.40625,
"epoch": 0.4399839421918908,
"grad_norm": 4.125,
"kl": 0.003184476459864527,
"learning_rate": 5.600160578081091e-07,
"loss": 0.0001,
"reward": 0.12950839288532734,
"reward_std": 0.5569676849991083,
"rewards/reward_func": 0.12950839288532734,
"step": 3288
},
{
"completion_length": 139.9765625,
"epoch": 0.4410544627325037,
"grad_norm": 3.9375,
"kl": 0.004262359958374873,
"learning_rate": 5.589455372674963e-07,
"loss": 0.0002,
"reward": 0.28253707475960255,
"reward_std": 0.44188484735786915,
"rewards/reward_func": 0.28253707475960255,
"step": 3296
},
{
"completion_length": 174.6171875,
"epoch": 0.44212498327311656,
"grad_norm": 4.59375,
"kl": 0.004584858979796991,
"learning_rate": 5.578750167268834e-07,
"loss": 0.0002,
"reward": 0.10256939753890038,
"reward_std": 0.47010411880910397,
"rewards/reward_func": 0.10256939753890038,
"step": 3304
},
{
"completion_length": 151.453125,
"epoch": 0.4431955038137294,
"grad_norm": 3.3125,
"kl": 0.004451150889508426,
"learning_rate": 5.568044961862706e-07,
"loss": 0.0002,
"reward": 0.48164689540863037,
"reward_std": 0.4186716293916106,
"rewards/reward_func": 0.48164689540863037,
"step": 3312
},
{
"completion_length": 187.1796875,
"epoch": 0.4442660243543423,
"grad_norm": 5.3125,
"kl": 0.003924098331481218,
"learning_rate": 5.557339756456577e-07,
"loss": 0.0002,
"reward": 0.13027670048177242,
"reward_std": 0.5701944110915065,
"rewards/reward_func": 0.13027670048177242,
"step": 3320
},
{
"completion_length": 175.734375,
"epoch": 0.44533654489495517,
"grad_norm": 3.890625,
"kl": 0.004432059795362875,
"learning_rate": 5.546634551050447e-07,
"loss": 0.0002,
"reward": 0.15545489452779293,
"reward_std": 0.42713499814271927,
"rewards/reward_func": 0.15545489452779293,
"step": 3328
},
{
"completion_length": 159.7109375,
"epoch": 0.446407065435568,
"grad_norm": 3.953125,
"kl": 0.0041090622544288635,
"learning_rate": 5.535929345644319e-07,
"loss": 0.0002,
"reward": 0.5048990547657013,
"reward_std": 0.3645612169057131,
"rewards/reward_func": 0.5048990547657013,
"step": 3336
},
{
"completion_length": 170.46875,
"epoch": 0.4474775859761809,
"grad_norm": 3.109375,
"kl": 0.00416830470203422,
"learning_rate": 5.525224140238191e-07,
"loss": 0.0002,
"reward": 0.0841824202798307,
"reward_std": 0.5541789922863245,
"rewards/reward_func": 0.0841824202798307,
"step": 3344
},
{
"completion_length": 177.8984375,
"epoch": 0.4485481065167938,
"grad_norm": 2.90625,
"kl": 0.004115153366001323,
"learning_rate": 5.514518934832062e-07,
"loss": 0.0002,
"reward": 0.25281552597880363,
"reward_std": 0.5773179177194834,
"rewards/reward_func": 0.25281552597880363,
"step": 3352
},
{
"completion_length": 167.546875,
"epoch": 0.4496186270574067,
"grad_norm": 3.046875,
"kl": 0.004800075956154615,
"learning_rate": 5.503813729425933e-07,
"loss": 0.0002,
"reward": 0.15400892263278365,
"reward_std": 0.613510686904192,
"rewards/reward_func": 0.15400892263278365,
"step": 3360
},
{
"completion_length": 173.875,
"epoch": 0.45068914759801953,
"grad_norm": 5.125,
"kl": 0.004234513093251735,
"learning_rate": 5.493108524019804e-07,
"loss": 0.0002,
"reward": 0.14482227806001902,
"reward_std": 0.6577083393931389,
"rewards/reward_func": 0.14482227806001902,
"step": 3368
},
{
"completion_length": 196.0078125,
"epoch": 0.4517596681386324,
"grad_norm": 3.28125,
"kl": 0.003691094840178266,
"learning_rate": 5.482403318613676e-07,
"loss": 0.0001,
"reward": 0.20943116396665573,
"reward_std": 0.6141318120062351,
"rewards/reward_func": 0.20943116396665573,
"step": 3376
},
{
"completion_length": 189.9375,
"epoch": 0.4528301886792453,
"grad_norm": 3.625,
"kl": 0.004188001621514559,
"learning_rate": 5.471698113207546e-07,
"loss": 0.0002,
"reward": 0.12066240888088942,
"reward_std": 0.6333566196262836,
"rewards/reward_func": 0.12066240888088942,
"step": 3384
},
{
"completion_length": 224.203125,
"epoch": 0.45390070921985815,
"grad_norm": 3.46875,
"kl": 0.0038036782352719456,
"learning_rate": 5.460992907801418e-07,
"loss": 0.0002,
"reward": 0.026676064357161522,
"reward_std": 0.5244584791362286,
"rewards/reward_func": 0.026676064357161522,
"step": 3392
},
{
"completion_length": 135.8125,
"epoch": 0.45497122976047105,
"grad_norm": 3.78125,
"kl": 0.005447168223327026,
"learning_rate": 5.45028770239529e-07,
"loss": 0.0002,
"reward": 0.3792672948911786,
"reward_std": 0.5337657146155834,
"rewards/reward_func": 0.3792672948911786,
"step": 3400
},
{
"completion_length": 180.4765625,
"epoch": 0.4560417503010839,
"grad_norm": 3.84375,
"kl": 0.0039006134611554444,
"learning_rate": 5.439582496989162e-07,
"loss": 0.0002,
"reward": 0.2967074029147625,
"reward_std": 0.5028974749147892,
"rewards/reward_func": 0.2967074029147625,
"step": 3408
},
{
"completion_length": 173.46875,
"epoch": 0.45711227084169676,
"grad_norm": 3.59375,
"kl": 0.004641034756787121,
"learning_rate": 5.428877291583031e-07,
"loss": 0.0002,
"reward": 0.04013548418879509,
"reward_std": 0.647808875888586,
"rewards/reward_func": 0.04013548418879509,
"step": 3416
},
{
"completion_length": 182.6953125,
"epoch": 0.45818279138230966,
"grad_norm": 3.265625,
"kl": 0.003926090226741508,
"learning_rate": 5.418172086176903e-07,
"loss": 0.0002,
"reward": -0.02354210428893566,
"reward_std": 0.46280941739678383,
"rewards/reward_func": -0.02354210428893566,
"step": 3424
},
{
"completion_length": 171.9140625,
"epoch": 0.4592533119229225,
"grad_norm": 4.3125,
"kl": 0.004544450406683609,
"learning_rate": 5.407466880770775e-07,
"loss": 0.0002,
"reward": 0.2378298337571323,
"reward_std": 0.5396788232028484,
"rewards/reward_func": 0.2378298337571323,
"step": 3432
},
{
"completion_length": 168.5625,
"epoch": 0.4603238324635354,
"grad_norm": 4.125,
"kl": 0.003944898169720545,
"learning_rate": 5.396761675364647e-07,
"loss": 0.0002,
"reward": 0.3150383196771145,
"reward_std": 0.53007797524333,
"rewards/reward_func": 0.3150383196771145,
"step": 3440
},
{
"completion_length": 153.21875,
"epoch": 0.46139435300414827,
"grad_norm": 3.515625,
"kl": 0.004431029927218333,
"learning_rate": 5.386056469958517e-07,
"loss": 0.0002,
"reward": 0.10777561087161303,
"reward_std": 0.5590555854141712,
"rewards/reward_func": 0.10777561087161303,
"step": 3448
},
{
"completion_length": 156.1015625,
"epoch": 0.4624648735447611,
"grad_norm": 5.59375,
"kl": 0.004066320398123935,
"learning_rate": 5.375351264552388e-07,
"loss": 0.0002,
"reward": 0.49741687439382076,
"reward_std": 0.3438666444271803,
"rewards/reward_func": 0.49741687439382076,
"step": 3456
},
{
"completion_length": 180.890625,
"epoch": 0.46353539408537403,
"grad_norm": 3.5,
"kl": 0.004092392831807956,
"learning_rate": 5.36464605914626e-07,
"loss": 0.0002,
"reward": 0.261587081477046,
"reward_std": 0.4724911078810692,
"rewards/reward_func": 0.261587081477046,
"step": 3464
},
{
"completion_length": 188.34375,
"epoch": 0.4646059146259869,
"grad_norm": 4.5625,
"kl": 0.004102788210730068,
"learning_rate": 5.353940853740131e-07,
"loss": 0.0002,
"reward": 0.3170028403401375,
"reward_std": 0.5411418545991182,
"rewards/reward_func": 0.3170028403401375,
"step": 3472
},
{
"completion_length": 147.0078125,
"epoch": 0.46567643516659973,
"grad_norm": 3.90625,
"kl": 0.004658064717659727,
"learning_rate": 5.343235648334002e-07,
"loss": 0.0002,
"reward": 0.42856106348335743,
"reward_std": 0.45429209433496,
"rewards/reward_func": 0.42856106348335743,
"step": 3480
},
{
"completion_length": 185.578125,
"epoch": 0.46674695570721264,
"grad_norm": 3.515625,
"kl": 0.004181814001640305,
"learning_rate": 5.332530442927874e-07,
"loss": 0.0002,
"reward": 0.1980421096086502,
"reward_std": 0.46522266045212746,
"rewards/reward_func": 0.1980421096086502,
"step": 3488
},
{
"completion_length": 147.6015625,
"epoch": 0.4678174762478255,
"grad_norm": 2.875,
"kl": 0.005186378140933812,
"learning_rate": 5.321825237521745e-07,
"loss": 0.0002,
"reward": 0.33479253202676773,
"reward_std": 0.3981231078505516,
"rewards/reward_func": 0.33479253202676773,
"step": 3496
},
{
"completion_length": 196.953125,
"epoch": 0.4688879967884384,
"grad_norm": 2.109375,
"kl": 0.003901872376445681,
"learning_rate": 5.311120032115616e-07,
"loss": 0.0002,
"reward": -0.1805968815460801,
"reward_std": 0.5539918430149555,
"rewards/reward_func": -0.1805968815460801,
"step": 3504
},
{
"completion_length": 172.09375,
"epoch": 0.46995851732905125,
"grad_norm": 4.375,
"kl": 0.004344145359937102,
"learning_rate": 5.300414826709487e-07,
"loss": 0.0002,
"reward": 0.24764186749234796,
"reward_std": 0.5220493152737617,
"rewards/reward_func": 0.24764186749234796,
"step": 3512
},
{
"completion_length": 165.484375,
"epoch": 0.4710290378696641,
"grad_norm": 3.03125,
"kl": 0.004431087261764333,
"learning_rate": 5.289709621303359e-07,
"loss": 0.0002,
"reward": 0.207328287884593,
"reward_std": 0.621040590107441,
"rewards/reward_func": 0.207328287884593,
"step": 3520
},
{
"completion_length": 187.7734375,
"epoch": 0.472099558410277,
"grad_norm": 3.5,
"kl": 0.004218856105580926,
"learning_rate": 5.27900441589723e-07,
"loss": 0.0002,
"reward": 0.07554451934993267,
"reward_std": 0.6108374260365963,
"rewards/reward_func": 0.07554451934993267,
"step": 3528
},
{
"completion_length": 168.265625,
"epoch": 0.47317007895088986,
"grad_norm": 5.0625,
"kl": 0.004120910074561834,
"learning_rate": 5.2682992104911e-07,
"loss": 0.0002,
"reward": 0.03414946049451828,
"reward_std": 0.6455099135637283,
"rewards/reward_func": 0.03414946049451828,
"step": 3536
},
{
"completion_length": 187.5234375,
"epoch": 0.47424059949150277,
"grad_norm": 3.640625,
"kl": 0.003985106013715267,
"learning_rate": 5.257594005084972e-07,
"loss": 0.0002,
"reward": 0.2925253491848707,
"reward_std": 0.6335334703326225,
"rewards/reward_func": 0.2925253491848707,
"step": 3544
},
{
"completion_length": 163.46875,
"epoch": 0.4753111200321156,
"grad_norm": 4.125,
"kl": 0.00473158826935105,
"learning_rate": 5.246888799678844e-07,
"loss": 0.0002,
"reward": 0.3663984229788184,
"reward_std": 0.558024113997817,
"rewards/reward_func": 0.3663984229788184,
"step": 3552
},
{
"completion_length": 152.03125,
"epoch": 0.47638164057272847,
"grad_norm": 4.1875,
"kl": 0.004815980733837932,
"learning_rate": 5.236183594272715e-07,
"loss": 0.0002,
"reward": 0.13753212243318558,
"reward_std": 0.5678570009768009,
"rewards/reward_func": 0.13753212243318558,
"step": 3560
},
{
"completion_length": 178.7421875,
"epoch": 0.4774521611133414,
"grad_norm": 3.625,
"kl": 0.004101649799849838,
"learning_rate": 5.225478388866587e-07,
"loss": 0.0002,
"reward": 0.2735243234783411,
"reward_std": 0.6148385126143694,
"rewards/reward_func": 0.2735243234783411,
"step": 3568
},
{
"completion_length": 192.1484375,
"epoch": 0.47852268165395423,
"grad_norm": 4.84375,
"kl": 0.004463646182557568,
"learning_rate": 5.214773183460457e-07,
"loss": 0.0002,
"reward": 0.009237892925739288,
"reward_std": 0.4207034735009074,
"rewards/reward_func": 0.009237892925739288,
"step": 3576
},
{
"completion_length": 174.046875,
"epoch": 0.4795932021945671,
"grad_norm": 4.125,
"kl": 0.0036158739821985364,
"learning_rate": 5.204067978054328e-07,
"loss": 0.0001,
"reward": 0.26391329150646925,
"reward_std": 0.586926780641079,
"rewards/reward_func": 0.26391329150646925,
"step": 3584
},
{
"completion_length": 175.4375,
"epoch": 0.48066372273518,
"grad_norm": 3.578125,
"kl": 0.0043216931517235935,
"learning_rate": 5.1933627726482e-07,
"loss": 0.0002,
"reward": 0.28290559723973274,
"reward_std": 0.6564907301217318,
"rewards/reward_func": 0.28290559723973274,
"step": 3592
},
{
"completion_length": 178.8671875,
"epoch": 0.48173424327579284,
"grad_norm": 2.671875,
"kl": 0.004414036084199324,
"learning_rate": 5.182657567242071e-07,
"loss": 0.0002,
"reward": 0.3407918275333941,
"reward_std": 0.5300383027642965,
"rewards/reward_func": 0.3407918275333941,
"step": 3600
},
{
"completion_length": 168.0078125,
"epoch": 0.48280476381640575,
"grad_norm": 3.828125,
"kl": 0.004355661425506696,
"learning_rate": 5.171952361835943e-07,
"loss": 0.0002,
"reward": 0.15907337237149477,
"reward_std": 0.6283294912427664,
"rewards/reward_func": 0.15907337237149477,
"step": 3608
},
{
"completion_length": 169.671875,
"epoch": 0.4838752843570186,
"grad_norm": 4.40625,
"kl": 0.0041847134416457266,
"learning_rate": 5.161247156429813e-07,
"loss": 0.0002,
"reward": 0.394026106223464,
"reward_std": 0.5191534291952848,
"rewards/reward_func": 0.394026106223464,
"step": 3616
},
{
"completion_length": 187.9765625,
"epoch": 0.48494580489763145,
"grad_norm": 4.25,
"kl": 0.004289885691832751,
"learning_rate": 5.150541951023685e-07,
"loss": 0.0002,
"reward": 0.2409443873912096,
"reward_std": 0.714960128068924,
"rewards/reward_func": 0.2409443873912096,
"step": 3624
},
{
"completion_length": 167.8984375,
"epoch": 0.48601632543824436,
"grad_norm": 3.515625,
"kl": 0.004622265987563878,
"learning_rate": 5.139836745617556e-07,
"loss": 0.0002,
"reward": 0.3250633031129837,
"reward_std": 0.3942791158333421,
"rewards/reward_func": 0.3250633031129837,
"step": 3632
},
{
"completion_length": 181.3984375,
"epoch": 0.4870868459788572,
"grad_norm": 3.125,
"kl": 0.00506105026579462,
"learning_rate": 5.129131540211427e-07,
"loss": 0.0002,
"reward": 0.1745797097682953,
"reward_std": 0.5199177237227559,
"rewards/reward_func": 0.1745797097682953,
"step": 3640
},
{
"completion_length": 177.828125,
"epoch": 0.4881573665194701,
"grad_norm": 5.125,
"kl": 0.00410176973673515,
"learning_rate": 5.118426334805299e-07,
"loss": 0.0002,
"reward": 0.1287559773772955,
"reward_std": 0.5036085527390242,
"rewards/reward_func": 0.1287559773772955,
"step": 3648
},
{
"completion_length": 179.125,
"epoch": 0.48922788706008297,
"grad_norm": 4.0,
"kl": 0.004322856722865254,
"learning_rate": 5.107721129399171e-07,
"loss": 0.0002,
"reward": 0.07798391906544566,
"reward_std": 0.6537183858454227,
"rewards/reward_func": 0.07798391906544566,
"step": 3656
},
{
"completion_length": 154.4375,
"epoch": 0.4902984076006958,
"grad_norm": 3.03125,
"kl": 0.003943322895793244,
"learning_rate": 5.097015923993041e-07,
"loss": 0.0002,
"reward": 0.1709643267095089,
"reward_std": 0.5507702603936195,
"rewards/reward_func": 0.1709643267095089,
"step": 3664
},
{
"completion_length": 168.6875,
"epoch": 0.4913689281413087,
"grad_norm": 3.125,
"kl": 0.004549846984446049,
"learning_rate": 5.086310718586912e-07,
"loss": 0.0002,
"reward": 0.2190579893067479,
"reward_std": 0.5686514582484961,
"rewards/reward_func": 0.2190579893067479,
"step": 3672
},
{
"completion_length": 155.828125,
"epoch": 0.4924394486819216,
"grad_norm": 3.703125,
"kl": 0.00439119475777261,
"learning_rate": 5.075605513180784e-07,
"loss": 0.0002,
"reward": 0.45011513587087393,
"reward_std": 0.5558454534038901,
"rewards/reward_func": 0.45011513587087393,
"step": 3680
},
{
"completion_length": 158.1796875,
"epoch": 0.4935099692225345,
"grad_norm": 5.625,
"kl": 0.004681064456235617,
"learning_rate": 5.064900307774656e-07,
"loss": 0.0002,
"reward": 0.22979869320988655,
"reward_std": 0.3713596798479557,
"rewards/reward_func": 0.22979869320988655,
"step": 3688
},
{
"completion_length": 160.0234375,
"epoch": 0.49458048976314734,
"grad_norm": 4.375,
"kl": 0.00535585597390309,
"learning_rate": 5.054195102368527e-07,
"loss": 0.0002,
"reward": 0.051803894340991974,
"reward_std": 0.6633851379156113,
"rewards/reward_func": 0.051803894340991974,
"step": 3696
},
{
"completion_length": 175.9609375,
"epoch": 0.4956510103037602,
"grad_norm": 4.125,
"kl": 0.004008779738796875,
"learning_rate": 5.043489896962397e-07,
"loss": 0.0002,
"reward": 0.2548919077962637,
"reward_std": 0.5479347966611385,
"rewards/reward_func": 0.2548919077962637,
"step": 3704
},
{
"completion_length": 168.3671875,
"epoch": 0.4967215308443731,
"grad_norm": 3.78125,
"kl": 0.004640541330445558,
"learning_rate": 5.032784691556269e-07,
"loss": 0.0002,
"reward": 0.2829543873667717,
"reward_std": 0.40924315620213747,
"rewards/reward_func": 0.2829543873667717,
"step": 3712
},
{
"completion_length": 182.4921875,
"epoch": 0.49779205138498595,
"grad_norm": 3.015625,
"kl": 0.003496495133731514,
"learning_rate": 5.022079486150141e-07,
"loss": 0.0001,
"reward": 0.3831252008676529,
"reward_std": 0.4445470869541168,
"rewards/reward_func": 0.3831252008676529,
"step": 3720
},
{
"completion_length": 162.03125,
"epoch": 0.4988625719255988,
"grad_norm": 4.3125,
"kl": 0.004948699788656086,
"learning_rate": 5.011374280744011e-07,
"loss": 0.0002,
"reward": 0.32710376754403114,
"reward_std": 0.47466727904975414,
"rewards/reward_func": 0.32710376754403114,
"step": 3728
},
{
"completion_length": 176.5859375,
"epoch": 0.4999330924662117,
"grad_norm": 3.640625,
"kl": 0.004679859790485352,
"learning_rate": 5.000669075337883e-07,
"loss": 0.0002,
"reward": 0.13110784254968166,
"reward_std": 0.44122389145195484,
"rewards/reward_func": 0.13110784254968166,
"step": 3736
},
{
"completion_length": 167.71875,
"epoch": 0.5010036130068246,
"grad_norm": 4.78125,
"kl": 0.004513267427682877,
"learning_rate": 4.989963869931754e-07,
"loss": 0.0002,
"reward": 0.15786111541092396,
"reward_std": 0.606589537113905,
"rewards/reward_func": 0.15786111541092396,
"step": 3744
},
{
"completion_length": 150.421875,
"epoch": 0.5020741335474375,
"grad_norm": 5.3125,
"kl": 0.004389044945128262,
"learning_rate": 4.979258664525626e-07,
"loss": 0.0002,
"reward": 0.4018897293135524,
"reward_std": 0.44968966394662857,
"rewards/reward_func": 0.4018897293135524,
"step": 3752
},
{
"completion_length": 157.28125,
"epoch": 0.5031446540880503,
"grad_norm": 3.109375,
"kl": 0.004845765855861828,
"learning_rate": 4.968553459119496e-07,
"loss": 0.0002,
"reward": 0.5019057989120483,
"reward_std": 0.43940271995961666,
"rewards/reward_func": 0.5019057989120483,
"step": 3760
},
{
"completion_length": 183.84375,
"epoch": 0.5042151746286632,
"grad_norm": 2.890625,
"kl": 0.004980318364687264,
"learning_rate": 4.957848253713368e-07,
"loss": 0.0002,
"reward": 0.10100116580724716,
"reward_std": 0.5983940260484815,
"rewards/reward_func": 0.10100116580724716,
"step": 3768
},
{
"completion_length": 148.984375,
"epoch": 0.505285695169276,
"grad_norm": 2.859375,
"kl": 0.0051506354357115924,
"learning_rate": 4.947143048307239e-07,
"loss": 0.0002,
"reward": 0.2997464369982481,
"reward_std": 0.6431192979216576,
"rewards/reward_func": 0.2997464369982481,
"step": 3776
},
{
"completion_length": 148.5703125,
"epoch": 0.506356215709889,
"grad_norm": 3.890625,
"kl": 0.004320590727729723,
"learning_rate": 4.93643784290111e-07,
"loss": 0.0002,
"reward": 0.14957408607006073,
"reward_std": 0.5004684673622251,
"rewards/reward_func": 0.14957408607006073,
"step": 3784
},
{
"completion_length": 170.7734375,
"epoch": 0.5074267362505018,
"grad_norm": 4.96875,
"kl": 0.00426993565633893,
"learning_rate": 4.925732637494981e-07,
"loss": 0.0002,
"reward": 0.1513789612799883,
"reward_std": 0.6300474852323532,
"rewards/reward_func": 0.1513789612799883,
"step": 3792
},
{
"completion_length": 132.6796875,
"epoch": 0.5084972567911147,
"grad_norm": 3.65625,
"kl": 0.00518818135606125,
"learning_rate": 4.915027432088853e-07,
"loss": 0.0002,
"reward": 0.2980203665792942,
"reward_std": 0.39504921436309814,
"rewards/reward_func": 0.2980203665792942,
"step": 3800
},
{
"completion_length": 143.1875,
"epoch": 0.5095677773317275,
"grad_norm": 4.5625,
"kl": 0.004469432285986841,
"learning_rate": 4.904322226682725e-07,
"loss": 0.0002,
"reward": 0.4323331117630005,
"reward_std": 0.5411158930510283,
"rewards/reward_func": 0.4323331117630005,
"step": 3808
},
{
"completion_length": 204.7578125,
"epoch": 0.5106382978723404,
"grad_norm": 4.71875,
"kl": 0.003944508789572865,
"learning_rate": 4.893617021276595e-07,
"loss": 0.0002,
"reward": 0.06451552081853151,
"reward_std": 0.6014019660651684,
"rewards/reward_func": 0.06451552081853151,
"step": 3816
},
{
"completion_length": 171.0625,
"epoch": 0.5117088184129533,
"grad_norm": 6.53125,
"kl": 0.0044076822232455015,
"learning_rate": 4.882911815870467e-07,
"loss": 0.0002,
"reward": 0.26693916134536266,
"reward_std": 0.5402739644050598,
"rewards/reward_func": 0.26693916134536266,
"step": 3824
},
{
"completion_length": 160.0703125,
"epoch": 0.5127793389535662,
"grad_norm": 3.734375,
"kl": 0.004957833531079814,
"learning_rate": 4.872206610464339e-07,
"loss": 0.0002,
"reward": 0.2441606866195798,
"reward_std": 0.6625313609838486,
"rewards/reward_func": 0.2441606866195798,
"step": 3832
},
{
"completion_length": 155.8515625,
"epoch": 0.513849859494179,
"grad_norm": 3.640625,
"kl": 0.004840250330744311,
"learning_rate": 4.861501405058209e-07,
"loss": 0.0002,
"reward": 0.3202288933098316,
"reward_std": 0.6590756271034479,
"rewards/reward_func": 0.3202288933098316,
"step": 3840
},
{
"completion_length": 170.21875,
"epoch": 0.5149203800347919,
"grad_norm": 4.5625,
"kl": 0.005241601204033941,
"learning_rate": 4.850796199652081e-07,
"loss": 0.0002,
"reward": 0.11097644921392202,
"reward_std": 0.6563504040241241,
"rewards/reward_func": 0.11097644921392202,
"step": 3848
},
{
"completion_length": 172.3359375,
"epoch": 0.5159909005754048,
"grad_norm": 4.71875,
"kl": 0.0044063644600100815,
"learning_rate": 4.840090994245952e-07,
"loss": 0.0002,
"reward": 0.26450240099802613,
"reward_std": 0.6473797373473644,
"rewards/reward_func": 0.26450240099802613,
"step": 3856
},
{
"completion_length": 188.34375,
"epoch": 0.5170614211160177,
"grad_norm": 3.703125,
"kl": 0.004124164639506489,
"learning_rate": 4.829385788839824e-07,
"loss": 0.0002,
"reward": 0.09523116052150726,
"reward_std": 0.5340174566954374,
"rewards/reward_func": 0.09523116052150726,
"step": 3864
},
{
"completion_length": 157.3046875,
"epoch": 0.5181319416566306,
"grad_norm": 4.5,
"kl": 0.004781241004820913,
"learning_rate": 4.818680583433694e-07,
"loss": 0.0002,
"reward": 0.3139430582523346,
"reward_std": 0.5873579885810614,
"rewards/reward_func": 0.3139430582523346,
"step": 3872
},
{
"completion_length": 153.1015625,
"epoch": 0.5192024621972434,
"grad_norm": 4.28125,
"kl": 0.0045044064754620194,
"learning_rate": 4.807975378027566e-07,
"loss": 0.0002,
"reward": 0.24596689827740192,
"reward_std": 0.5791397895663977,
"rewards/reward_func": 0.24596689827740192,
"step": 3880
},
{
"completion_length": 166.8671875,
"epoch": 0.5202729827378563,
"grad_norm": 4.8125,
"kl": 0.004427089152159169,
"learning_rate": 4.797270172621437e-07,
"loss": 0.0002,
"reward": 0.3911690888926387,
"reward_std": 0.5238520000129938,
"rewards/reward_func": 0.3911690888926387,
"step": 3888
},
{
"completion_length": 182.296875,
"epoch": 0.5213435032784691,
"grad_norm": 3.6875,
"kl": 0.00470818518078886,
"learning_rate": 4.786564967215308e-07,
"loss": 0.0002,
"reward": -0.06911014439538121,
"reward_std": 0.6354586593806744,
"rewards/reward_func": -0.06911014439538121,
"step": 3896
},
{
"completion_length": 151.5859375,
"epoch": 0.522414023819082,
"grad_norm": 4.65625,
"kl": 0.004992738307919353,
"learning_rate": 4.775859761809179e-07,
"loss": 0.0002,
"reward": 0.441136134788394,
"reward_std": 0.5409799609333277,
"rewards/reward_func": 0.441136134788394,
"step": 3904
},
{
"completion_length": 158.1875,
"epoch": 0.5234845443596949,
"grad_norm": 3.921875,
"kl": 0.004533803061349317,
"learning_rate": 4.765154556403051e-07,
"loss": 0.0002,
"reward": 0.36645470559597015,
"reward_std": 0.5416577542200685,
"rewards/reward_func": 0.36645470559597015,
"step": 3912
},
{
"completion_length": 177.9140625,
"epoch": 0.5245550649003078,
"grad_norm": 2.78125,
"kl": 0.004515117674600333,
"learning_rate": 4.754449350996922e-07,
"loss": 0.0002,
"reward": 0.11683559231460094,
"reward_std": 0.5318781770765781,
"rewards/reward_func": 0.11683559231460094,
"step": 3920
},
{
"completion_length": 162.484375,
"epoch": 0.5256255854409206,
"grad_norm": 2.78125,
"kl": 0.00410384067799896,
"learning_rate": 4.7437441455907934e-07,
"loss": 0.0002,
"reward": 0.5109116761013865,
"reward_std": 0.389411685988307,
"rewards/reward_func": 0.5109116761013865,
"step": 3928
},
{
"completion_length": 179.484375,
"epoch": 0.5266961059815335,
"grad_norm": 4.5,
"kl": 0.004382628481835127,
"learning_rate": 4.7330389401846646e-07,
"loss": 0.0002,
"reward": 0.12338575161993504,
"reward_std": 0.49865792877972126,
"rewards/reward_func": 0.12338575161993504,
"step": 3936
},
{
"completion_length": 168.8203125,
"epoch": 0.5277666265221463,
"grad_norm": 3.78125,
"kl": 0.004615213518263772,
"learning_rate": 4.722333734778536e-07,
"loss": 0.0002,
"reward": 0.2909085564315319,
"reward_std": 0.44954105466604233,
"rewards/reward_func": 0.2909085564315319,
"step": 3944
},
{
"completion_length": 186.40625,
"epoch": 0.5288371470627593,
"grad_norm": 3.703125,
"kl": 0.003957096429076046,
"learning_rate": 4.7116285293724075e-07,
"loss": 0.0002,
"reward": 0.35753502883017063,
"reward_std": 0.5898796916007996,
"rewards/reward_func": 0.35753502883017063,
"step": 3952
},
{
"completion_length": 165.03125,
"epoch": 0.5299076676033722,
"grad_norm": 3.25,
"kl": 0.0045530806528404355,
"learning_rate": 4.700923323966278e-07,
"loss": 0.0002,
"reward": 0.2869006171822548,
"reward_std": 0.4535912126302719,
"rewards/reward_func": 0.2869006171822548,
"step": 3960
},
{
"completion_length": 148.8203125,
"epoch": 0.530978188143985,
"grad_norm": 4.25,
"kl": 0.00460378042771481,
"learning_rate": 4.69021811856015e-07,
"loss": 0.0002,
"reward": 0.48801288567483425,
"reward_std": 0.4225266771391034,
"rewards/reward_func": 0.48801288567483425,
"step": 3968
},
{
"completion_length": 174.203125,
"epoch": 0.5320487086845979,
"grad_norm": 2.65625,
"kl": 0.004049515846418217,
"learning_rate": 4.679512913154021e-07,
"loss": 0.0002,
"reward": 0.418088311329484,
"reward_std": 0.5685894265770912,
"rewards/reward_func": 0.418088311329484,
"step": 3976
},
{
"completion_length": 165.2578125,
"epoch": 0.5331192292252107,
"grad_norm": 3.25,
"kl": 0.00501069356687367,
"learning_rate": 4.668807707747892e-07,
"loss": 0.0002,
"reward": 0.31565719842910767,
"reward_std": 0.6409982740879059,
"rewards/reward_func": 0.31565719842910767,
"step": 3984
},
{
"completion_length": 162.015625,
"epoch": 0.5341897497658237,
"grad_norm": 3.671875,
"kl": 0.0046659239451400936,
"learning_rate": 4.6581025023417636e-07,
"loss": 0.0002,
"reward": -0.0461183600127697,
"reward_std": 0.7044645324349403,
"rewards/reward_func": -0.0461183600127697,
"step": 3992
},
{
"completion_length": 142.703125,
"epoch": 0.5352602703064365,
"grad_norm": 3.84375,
"kl": 0.004710770619567484,
"learning_rate": 4.6473972969356343e-07,
"loss": 0.0002,
"reward": 0.5219798712059855,
"reward_std": 0.4946548119187355,
"rewards/reward_func": 0.5219798712059855,
"step": 4000
},
{
"completion_length": 146.3203125,
"epoch": 0.5363307908470494,
"grad_norm": 3.21875,
"kl": 0.005039886600570753,
"learning_rate": 4.636692091529506e-07,
"loss": 0.0002,
"reward": 0.420873555354774,
"reward_std": 0.4259900487959385,
"rewards/reward_func": 0.420873555354774,
"step": 4008
},
{
"completion_length": 168.2109375,
"epoch": 0.5374013113876622,
"grad_norm": 4.96875,
"kl": 0.004895551188383251,
"learning_rate": 4.625986886123377e-07,
"loss": 0.0002,
"reward": 0.3381440285593271,
"reward_std": 0.5715998597443104,
"rewards/reward_func": 0.3381440285593271,
"step": 4016
},
{
"completion_length": 160.578125,
"epoch": 0.5384718319282751,
"grad_norm": 3.625,
"kl": 0.00470035380567424,
"learning_rate": 4.6152816807172485e-07,
"loss": 0.0002,
"reward": 0.3439123351126909,
"reward_std": 0.4550882736220956,
"rewards/reward_func": 0.3439123351126909,
"step": 4024
},
{
"completion_length": 159.9453125,
"epoch": 0.539542352468888,
"grad_norm": 4.375,
"kl": 0.00492598774144426,
"learning_rate": 4.6045764753111197e-07,
"loss": 0.0002,
"reward": 0.2067430024035275,
"reward_std": 0.5162056926637888,
"rewards/reward_func": 0.2067430024035275,
"step": 4032
},
{
"completion_length": 166.1015625,
"epoch": 0.5406128730095009,
"grad_norm": 3.0625,
"kl": 0.0042450258624739945,
"learning_rate": 4.593871269904991e-07,
"loss": 0.0002,
"reward": 0.3529038140550256,
"reward_std": 0.4770152699202299,
"rewards/reward_func": 0.3529038140550256,
"step": 4040
},
{
"completion_length": 178.7109375,
"epoch": 0.5416833935501137,
"grad_norm": 4.5625,
"kl": 0.005025087855756283,
"learning_rate": 4.583166064498862e-07,
"loss": 0.0002,
"reward": -0.081031309440732,
"reward_std": 0.4695176286622882,
"rewards/reward_func": -0.081031309440732,
"step": 4048
},
{
"completion_length": 165.65625,
"epoch": 0.5427539140907266,
"grad_norm": 4.4375,
"kl": 0.0055138085735961795,
"learning_rate": 4.572460859092734e-07,
"loss": 0.0002,
"reward": -0.007585156708955765,
"reward_std": 0.5119953658431768,
"rewards/reward_func": -0.007585156708955765,
"step": 4056
},
{
"completion_length": 156.34375,
"epoch": 0.5438244346313394,
"grad_norm": 3.796875,
"kl": 0.0043381388823036104,
"learning_rate": 4.5617556536866045e-07,
"loss": 0.0002,
"reward": 0.13799802958965302,
"reward_std": 0.6221343949437141,
"rewards/reward_func": 0.13799802958965302,
"step": 4064
},
{
"completion_length": 191.2734375,
"epoch": 0.5448949551719524,
"grad_norm": 4.71875,
"kl": 0.004081014514667913,
"learning_rate": 4.5510504482804763e-07,
"loss": 0.0002,
"reward": -0.10252122208476067,
"reward_std": 0.5134240631014109,
"rewards/reward_func": -0.10252122208476067,
"step": 4072
},
{
"completion_length": 149.390625,
"epoch": 0.5459654757125653,
"grad_norm": 3.640625,
"kl": 0.004793624917510897,
"learning_rate": 4.540345242874347e-07,
"loss": 0.0002,
"reward": 0.42647568974643946,
"reward_std": 0.6049776747822762,
"rewards/reward_func": 0.42647568974643946,
"step": 4080
},
{
"completion_length": 169.515625,
"epoch": 0.5470359962531781,
"grad_norm": 5.375,
"kl": 0.005105009535327554,
"learning_rate": 4.5296400374682187e-07,
"loss": 0.0002,
"reward": 0.14484626054763794,
"reward_std": 0.715711385011673,
"rewards/reward_func": 0.14484626054763794,
"step": 4088
},
{
"completion_length": 184.5859375,
"epoch": 0.548106516793791,
"grad_norm": 2.21875,
"kl": 0.00399865786312148,
"learning_rate": 4.51893483206209e-07,
"loss": 0.0002,
"reward": 0.27984373830258846,
"reward_std": 0.6154143176972866,
"rewards/reward_func": 0.27984373830258846,
"step": 4096
},
{
"completion_length": 148.9765625,
"epoch": 0.5491770373344038,
"grad_norm": 4.8125,
"kl": 0.005411504651419818,
"learning_rate": 4.508229626655961e-07,
"loss": 0.0002,
"reward": 0.3810861259698868,
"reward_std": 0.6340535804629326,
"rewards/reward_func": 0.3810861259698868,
"step": 4104
},
{
"completion_length": 181.171875,
"epoch": 0.5502475578750168,
"grad_norm": 3.734375,
"kl": 0.003742568180314265,
"learning_rate": 4.4975244212498324e-07,
"loss": 0.0001,
"reward": 0.314508281648159,
"reward_std": 0.5607537031173706,
"rewards/reward_func": 0.314508281648159,
"step": 4112
},
{
"completion_length": 131.109375,
"epoch": 0.5513180784156296,
"grad_norm": 6.5625,
"kl": 0.005468558054417372,
"learning_rate": 4.486819215843704e-07,
"loss": 0.0002,
"reward": 0.43094983510673046,
"reward_std": 0.39848934579640627,
"rewards/reward_func": 0.43094983510673046,
"step": 4120
},
{
"completion_length": 144.765625,
"epoch": 0.5523885989562425,
"grad_norm": 5.15625,
"kl": 0.005072243511676788,
"learning_rate": 4.476114010437575e-07,
"loss": 0.0002,
"reward": 0.16479766555130482,
"reward_std": 0.624469917267561,
"rewards/reward_func": 0.16479766555130482,
"step": 4128
},
{
"completion_length": 150.828125,
"epoch": 0.5534591194968553,
"grad_norm": 5.5625,
"kl": 0.004988896253053099,
"learning_rate": 4.4654088050314465e-07,
"loss": 0.0002,
"reward": 0.23024853132665157,
"reward_std": 0.5588976237922907,
"rewards/reward_func": 0.23024853132665157,
"step": 4136
},
{
"completion_length": 162.78125,
"epoch": 0.5545296400374682,
"grad_norm": 6.53125,
"kl": 0.004712989641120657,
"learning_rate": 4.454703599625317e-07,
"loss": 0.0002,
"reward": 0.27441484900191426,
"reward_std": 0.4914160780608654,
"rewards/reward_func": 0.27441484900191426,
"step": 4144
},
{
"completion_length": 203.6171875,
"epoch": 0.555600160578081,
"grad_norm": 3.6875,
"kl": 0.003505587810650468,
"learning_rate": 4.443998394219189e-07,
"loss": 0.0001,
"reward": 0.009393353015184402,
"reward_std": 0.6114509087055922,
"rewards/reward_func": 0.009393353015184402,
"step": 4152
},
{
"completion_length": 166.7578125,
"epoch": 0.556670681118694,
"grad_norm": 4.78125,
"kl": 0.004777590365847573,
"learning_rate": 4.43329318881306e-07,
"loss": 0.0002,
"reward": 0.11833875393494964,
"reward_std": 0.5748403836041689,
"rewards/reward_func": 0.11833875393494964,
"step": 4160
},
{
"completion_length": 150.4140625,
"epoch": 0.5577412016593069,
"grad_norm": 3.28125,
"kl": 0.0049895147094503045,
"learning_rate": 4.4225879834069314e-07,
"loss": 0.0002,
"reward": 0.2932877875864506,
"reward_std": 0.6367702716961503,
"rewards/reward_func": 0.2932877875864506,
"step": 4168
},
{
"completion_length": 156.7109375,
"epoch": 0.5588117221999197,
"grad_norm": 5.09375,
"kl": 0.005086433404358104,
"learning_rate": 4.4118827780008026e-07,
"loss": 0.0002,
"reward": 0.14813962019979954,
"reward_std": 0.5115363541990519,
"rewards/reward_func": 0.14813962019979954,
"step": 4176
},
{
"completion_length": 166.015625,
"epoch": 0.5598822427405326,
"grad_norm": 3.953125,
"kl": 0.004459643067093566,
"learning_rate": 4.401177572594674e-07,
"loss": 0.0002,
"reward": 0.17805076017975807,
"reward_std": 0.7240184545516968,
"rewards/reward_func": 0.17805076017975807,
"step": 4184
},
{
"completion_length": 149.1328125,
"epoch": 0.5609527632811454,
"grad_norm": 3.8125,
"kl": 0.004602790024364367,
"learning_rate": 4.390472367188545e-07,
"loss": 0.0002,
"reward": 0.46490050479769707,
"reward_std": 0.4432865995913744,
"rewards/reward_func": 0.46490050479769707,
"step": 4192
},
{
"completion_length": 183.953125,
"epoch": 0.5620232838217584,
"grad_norm": 3.625,
"kl": 0.00446239989832975,
"learning_rate": 4.379767161782417e-07,
"loss": 0.0002,
"reward": 0.19806094001978636,
"reward_std": 0.6545614078640938,
"rewards/reward_func": 0.19806094001978636,
"step": 4200
},
{
"completion_length": 195.46875,
"epoch": 0.5630938043623712,
"grad_norm": 3.5625,
"kl": 0.003972954727942124,
"learning_rate": 4.3690619563762875e-07,
"loss": 0.0002,
"reward": -0.12718784296885133,
"reward_std": 0.5749151539057493,
"rewards/reward_func": -0.12718784296885133,
"step": 4208
},
{
"completion_length": 137.4453125,
"epoch": 0.5641643249029841,
"grad_norm": 4.25,
"kl": 0.004893360834103078,
"learning_rate": 4.358356750970159e-07,
"loss": 0.0002,
"reward": 0.24862979911267757,
"reward_std": 0.6906272917985916,
"rewards/reward_func": 0.24862979911267757,
"step": 4216
},
{
"completion_length": 153.109375,
"epoch": 0.5652348454435969,
"grad_norm": 3.578125,
"kl": 0.0049685456906445324,
"learning_rate": 4.3476515455640304e-07,
"loss": 0.0002,
"reward": 0.41499729454517365,
"reward_std": 0.4691876629367471,
"rewards/reward_func": 0.41499729454517365,
"step": 4224
},
{
"completion_length": 149.8515625,
"epoch": 0.5663053659842098,
"grad_norm": 5.59375,
"kl": 0.004520065325777978,
"learning_rate": 4.3369463401579017e-07,
"loss": 0.0002,
"reward": 0.318800778593868,
"reward_std": 0.6351992357522249,
"rewards/reward_func": 0.318800778593868,
"step": 4232
},
{
"completion_length": 142.421875,
"epoch": 0.5673758865248227,
"grad_norm": 4.59375,
"kl": 0.005744964553741738,
"learning_rate": 4.326241134751773e-07,
"loss": 0.0002,
"reward": 0.4124446418136358,
"reward_std": 0.5395534262061119,
"rewards/reward_func": 0.4124446418136358,
"step": 4240
},
{
"completion_length": 163.6953125,
"epoch": 0.5684464070654356,
"grad_norm": 4.6875,
"kl": 0.004186704114545137,
"learning_rate": 4.315535929345644e-07,
"loss": 0.0002,
"reward": 0.35636366717517376,
"reward_std": 0.6417583487927914,
"rewards/reward_func": 0.35636366717517376,
"step": 4248
},
{
"completion_length": 185.578125,
"epoch": 0.5695169276060484,
"grad_norm": 3.84375,
"kl": 0.004251696169376373,
"learning_rate": 4.3048307239395153e-07,
"loss": 0.0002,
"reward": 0.30847467109560966,
"reward_std": 0.44796227291226387,
"rewards/reward_func": 0.30847467109560966,
"step": 4256
},
{
"completion_length": 211.21875,
"epoch": 0.5705874481466613,
"grad_norm": 2.3125,
"kl": 0.004341925901826471,
"learning_rate": 4.294125518533387e-07,
"loss": 0.0002,
"reward": 0.2743415031582117,
"reward_std": 0.45934509858489037,
"rewards/reward_func": 0.2743415031582117,
"step": 4264
},
{
"completion_length": 171.7265625,
"epoch": 0.5716579686872741,
"grad_norm": 1.9453125,
"kl": 0.004344686400145292,
"learning_rate": 4.2834203131272577e-07,
"loss": 0.0002,
"reward": 0.2994256131350994,
"reward_std": 0.6268932148814201,
"rewards/reward_func": 0.2994256131350994,
"step": 4272
},
{
"completion_length": 138.3984375,
"epoch": 0.5727284892278871,
"grad_norm": 3.171875,
"kl": 0.006193301291204989,
"learning_rate": 4.2727151077211295e-07,
"loss": 0.0002,
"reward": 0.3124155914410949,
"reward_std": 0.49435919895768166,
"rewards/reward_func": 0.3124155914410949,
"step": 4280
},
{
"completion_length": 175.421875,
"epoch": 0.5737990097685,
"grad_norm": 3.609375,
"kl": 0.004771079227793962,
"learning_rate": 4.262009902315e-07,
"loss": 0.0002,
"reward": 0.27722545340657234,
"reward_std": 0.5442187786102295,
"rewards/reward_func": 0.27722545340657234,
"step": 4288
},
{
"completion_length": 224.7265625,
"epoch": 0.5748695303091128,
"grad_norm": 3.703125,
"kl": 0.0033396084618289024,
"learning_rate": 4.251304696908872e-07,
"loss": 0.0001,
"reward": -0.16931618377566338,
"reward_std": 0.5313975028693676,
"rewards/reward_func": -0.16931618377566338,
"step": 4296
},
{
"completion_length": 186.859375,
"epoch": 0.5759400508497257,
"grad_norm": 4.75,
"kl": 0.0042559900030028075,
"learning_rate": 4.240599491502743e-07,
"loss": 0.0002,
"reward": 0.13033189252018929,
"reward_std": 0.3756987228989601,
"rewards/reward_func": 0.13033189252018929,
"step": 4304
},
{
"completion_length": 142.578125,
"epoch": 0.5770105713903385,
"grad_norm": 3.9375,
"kl": 0.005709152843337506,
"learning_rate": 4.2298942860966143e-07,
"loss": 0.0002,
"reward": 0.3865806292742491,
"reward_std": 0.6126521602272987,
"rewards/reward_func": 0.3865806292742491,
"step": 4312
},
{
"completion_length": 164.7890625,
"epoch": 0.5780810919309515,
"grad_norm": 3.125,
"kl": 0.0045434608473442495,
"learning_rate": 4.2191890806904856e-07,
"loss": 0.0002,
"reward": 0.3333674664609134,
"reward_std": 0.6179927475750446,
"rewards/reward_func": 0.3333674664609134,
"step": 4320
},
{
"completion_length": 118.8515625,
"epoch": 0.5791516124715643,
"grad_norm": 3.703125,
"kl": 0.004566928721033037,
"learning_rate": 4.208483875284357e-07,
"loss": 0.0002,
"reward": 0.6828272566199303,
"reward_std": 0.41035995725542307,
"rewards/reward_func": 0.6828272566199303,
"step": 4328
},
{
"completion_length": 141.640625,
"epoch": 0.5802221330121772,
"grad_norm": 4.125,
"kl": 0.004908986215014011,
"learning_rate": 4.197778669878228e-07,
"loss": 0.0002,
"reward": 0.4755503498017788,
"reward_std": 0.5121949464082718,
"rewards/reward_func": 0.4755503498017788,
"step": 4336
},
{
"completion_length": 143.0625,
"epoch": 0.58129265355279,
"grad_norm": 4.09375,
"kl": 0.004488215548917651,
"learning_rate": 4.1870734644720997e-07,
"loss": 0.0002,
"reward": 0.5884530800394714,
"reward_std": 0.44153958186507225,
"rewards/reward_func": 0.5884530800394714,
"step": 4344
},
{
"completion_length": 148.1796875,
"epoch": 0.5823631740934029,
"grad_norm": 3.0,
"kl": 0.0046905699709896,
"learning_rate": 4.1763682590659704e-07,
"loss": 0.0002,
"reward": 0.47682703845202923,
"reward_std": 0.4733074624091387,
"rewards/reward_func": 0.47682703845202923,
"step": 4352
},
{
"completion_length": 150.296875,
"epoch": 0.5834336946340158,
"grad_norm": 4.28125,
"kl": 0.004917474143439904,
"learning_rate": 4.165663053659842e-07,
"loss": 0.0002,
"reward": 0.2567645199596882,
"reward_std": 0.6055057626217604,
"rewards/reward_func": 0.2567645199596882,
"step": 4360
},
{
"completion_length": 205.234375,
"epoch": 0.5845042151746287,
"grad_norm": 4.09375,
"kl": 0.00432168306724634,
"learning_rate": 4.1549578482537134e-07,
"loss": 0.0002,
"reward": 0.04042044514790177,
"reward_std": 0.5906463749706745,
"rewards/reward_func": 0.04042044514790177,
"step": 4368
},
{
"completion_length": 147.5625,
"epoch": 0.5855747357152415,
"grad_norm": 3.796875,
"kl": 0.005373828811571002,
"learning_rate": 4.1442526428475846e-07,
"loss": 0.0002,
"reward": 0.3917626924812794,
"reward_std": 0.5444907881319523,
"rewards/reward_func": 0.3917626924812794,
"step": 4376
},
{
"completion_length": 203.15625,
"epoch": 0.5866452562558544,
"grad_norm": 2.296875,
"kl": 0.0036935079260729253,
"learning_rate": 4.133547437441456e-07,
"loss": 0.0001,
"reward": 0.09531690180301666,
"reward_std": 0.5034721679985523,
"rewards/reward_func": 0.09531690180301666,
"step": 4384
},
{
"completion_length": 148.1328125,
"epoch": 0.5877157767964672,
"grad_norm": 6.0,
"kl": 0.0050933739403262734,
"learning_rate": 4.122842232035327e-07,
"loss": 0.0002,
"reward": 0.11112022027373314,
"reward_std": 0.5623177271336317,
"rewards/reward_func": 0.11112022027373314,
"step": 4392
},
{
"completion_length": 185.1796875,
"epoch": 0.5887862973370801,
"grad_norm": 2.796875,
"kl": 0.003696839907206595,
"learning_rate": 4.112137026629198e-07,
"loss": 0.0001,
"reward": 0.37225864082574844,
"reward_std": 0.6047016642987728,
"rewards/reward_func": 0.37225864082574844,
"step": 4400
},
{
"completion_length": 156.125,
"epoch": 0.5898568178776931,
"grad_norm": 5.5,
"kl": 0.004960794060025364,
"learning_rate": 4.10143182122307e-07,
"loss": 0.0002,
"reward": 0.4381309971213341,
"reward_std": 0.3526679091155529,
"rewards/reward_func": 0.4381309971213341,
"step": 4408
},
{
"completion_length": 173.9453125,
"epoch": 0.5909273384183059,
"grad_norm": 5.40625,
"kl": 0.0046878808352630585,
"learning_rate": 4.0907266158169407e-07,
"loss": 0.0002,
"reward": 0.11471654986962676,
"reward_std": 0.7081250138580799,
"rewards/reward_func": 0.11471654986962676,
"step": 4416
},
{
"completion_length": 164.6875,
"epoch": 0.5919978589589188,
"grad_norm": 3.875,
"kl": 0.004589978780131787,
"learning_rate": 4.0800214104108124e-07,
"loss": 0.0002,
"reward": 0.242947518825531,
"reward_std": 0.511182009242475,
"rewards/reward_func": 0.242947518825531,
"step": 4424
},
{
"completion_length": 170.96875,
"epoch": 0.5930683794995316,
"grad_norm": 3.203125,
"kl": 0.004504337441176176,
"learning_rate": 4.069316205004683e-07,
"loss": 0.0002,
"reward": 0.13327412493526936,
"reward_std": 0.7021276205778122,
"rewards/reward_func": 0.13327412493526936,
"step": 4432
},
{
"completion_length": 166.96875,
"epoch": 0.5941389000401445,
"grad_norm": 3.65625,
"kl": 0.004282756824977696,
"learning_rate": 4.0586109995985543e-07,
"loss": 0.0002,
"reward": 0.2241785153746605,
"reward_std": 0.5987379960715771,
"rewards/reward_func": 0.2241785153746605,
"step": 4440
},
{
"completion_length": 174.84375,
"epoch": 0.5952094205807574,
"grad_norm": 3.3125,
"kl": 0.004621970321750268,
"learning_rate": 4.047905794192426e-07,
"loss": 0.0002,
"reward": 0.356457632035017,
"reward_std": 0.5185628831386566,
"rewards/reward_func": 0.356457632035017,
"step": 4448
},
{
"completion_length": 151.5859375,
"epoch": 0.5962799411213703,
"grad_norm": 5.46875,
"kl": 0.004531825426965952,
"learning_rate": 4.037200588786297e-07,
"loss": 0.0002,
"reward": 0.41319750994443893,
"reward_std": 0.47001610416918993,
"rewards/reward_func": 0.41319750994443893,
"step": 4456
},
{
"completion_length": 155.9375,
"epoch": 0.5973504616619831,
"grad_norm": 3.875,
"kl": 0.005843940074555576,
"learning_rate": 4.0264953833801685e-07,
"loss": 0.0002,
"reward": 0.16792790032923222,
"reward_std": 0.45045214518904686,
"rewards/reward_func": 0.16792790032923222,
"step": 4464
},
{
"completion_length": 204.1484375,
"epoch": 0.598420982202596,
"grad_norm": 3.734375,
"kl": 0.0037444017361849546,
"learning_rate": 4.0157901779740397e-07,
"loss": 0.0001,
"reward": 0.11755906883627176,
"reward_std": 0.5679098833352327,
"rewards/reward_func": 0.11755906883627176,
"step": 4472
},
{
"completion_length": 153.7734375,
"epoch": 0.5994915027432088,
"grad_norm": 5.78125,
"kl": 0.004829052748391405,
"learning_rate": 4.005084972567911e-07,
"loss": 0.0002,
"reward": 0.19808213412761688,
"reward_std": 0.3854016624391079,
"rewards/reward_func": 0.19808213412761688,
"step": 4480
},
{
"completion_length": 203.421875,
"epoch": 0.6005620232838218,
"grad_norm": 5.9375,
"kl": 0.003742009517736733,
"learning_rate": 3.994379767161782e-07,
"loss": 0.0001,
"reward": 0.232608491089195,
"reward_std": 0.5558800995349884,
"rewards/reward_func": 0.232608491089195,
"step": 4488
},
{
"completion_length": 154.265625,
"epoch": 0.6016325438244347,
"grad_norm": 3.96875,
"kl": 0.004953162686433643,
"learning_rate": 3.9836745617556534e-07,
"loss": 0.0002,
"reward": 0.4288094639778137,
"reward_std": 0.416667815297842,
"rewards/reward_func": 0.4288094639778137,
"step": 4496
},
{
"completion_length": 166.2890625,
"epoch": 0.6027030643650475,
"grad_norm": 4.40625,
"kl": 0.004299461928894743,
"learning_rate": 3.9729693563495246e-07,
"loss": 0.0002,
"reward": 0.12346869148313999,
"reward_std": 0.6307908529415727,
"rewards/reward_func": 0.12346869148313999,
"step": 4504
},
{
"completion_length": 157.078125,
"epoch": 0.6037735849056604,
"grad_norm": 3.796875,
"kl": 0.004897929902654141,
"learning_rate": 3.9622641509433963e-07,
"loss": 0.0002,
"reward": 0.3149998290464282,
"reward_std": 0.5927382819354534,
"rewards/reward_func": 0.3149998290464282,
"step": 4512
},
{
"completion_length": 168.9296875,
"epoch": 0.6048441054462732,
"grad_norm": 5.3125,
"kl": 0.004709256027126685,
"learning_rate": 3.951558945537267e-07,
"loss": 0.0002,
"reward": 0.23328473046422005,
"reward_std": 0.633372450247407,
"rewards/reward_func": 0.23328473046422005,
"step": 4520
},
{
"completion_length": 182.09375,
"epoch": 0.6059146259868862,
"grad_norm": 2.8125,
"kl": 0.0044458308548200876,
"learning_rate": 3.940853740131139e-07,
"loss": 0.0002,
"reward": 0.04303564690053463,
"reward_std": 0.46831536665558815,
"rewards/reward_func": 0.04303564690053463,
"step": 4528
},
{
"completion_length": 150.3671875,
"epoch": 0.606985146527499,
"grad_norm": 4.25,
"kl": 0.004751139786094427,
"learning_rate": 3.9301485347250094e-07,
"loss": 0.0002,
"reward": 0.6235604397952557,
"reward_std": 0.43624259904026985,
"rewards/reward_func": 0.6235604397952557,
"step": 4536
},
{
"completion_length": 168.140625,
"epoch": 0.6080556670681119,
"grad_norm": 4.15625,
"kl": 0.004754378751385957,
"learning_rate": 3.919443329318881e-07,
"loss": 0.0002,
"reward": 0.07649134658277035,
"reward_std": 0.6275423960760236,
"rewards/reward_func": 0.07649134658277035,
"step": 4544
},
{
"completion_length": 145.2109375,
"epoch": 0.6091261876087247,
"grad_norm": 4.15625,
"kl": 0.004508535988861695,
"learning_rate": 3.9087381239127524e-07,
"loss": 0.0002,
"reward": 0.09985450841486454,
"reward_std": 0.6514963954687119,
"rewards/reward_func": 0.09985450841486454,
"step": 4552
},
{
"completion_length": 152.9921875,
"epoch": 0.6101967081493376,
"grad_norm": 3.9375,
"kl": 0.004148939304286614,
"learning_rate": 3.8980329185066236e-07,
"loss": 0.0002,
"reward": 0.45872488245368004,
"reward_std": 0.43476785998791456,
"rewards/reward_func": 0.45872488245368004,
"step": 4560
},
{
"completion_length": 130.296875,
"epoch": 0.6112672286899505,
"grad_norm": 5.84375,
"kl": 0.005563508428167552,
"learning_rate": 3.887327713100495e-07,
"loss": 0.0002,
"reward": 0.6556578651070595,
"reward_std": 0.44750246591866016,
"rewards/reward_func": 0.6556578651070595,
"step": 4568
},
{
"completion_length": 197.5078125,
"epoch": 0.6123377492305634,
"grad_norm": 3.734375,
"kl": 0.003922436822904274,
"learning_rate": 3.876622507694366e-07,
"loss": 0.0002,
"reward": -0.01122634531930089,
"reward_std": 0.5639722682535648,
"rewards/reward_func": -0.01122634531930089,
"step": 4576
},
{
"completion_length": 152.25,
"epoch": 0.6134082697711762,
"grad_norm": 3.53125,
"kl": 0.0060851232265122235,
"learning_rate": 3.865917302288237e-07,
"loss": 0.0002,
"reward": 0.5405775438994169,
"reward_std": 0.5025825463235378,
"rewards/reward_func": 0.5405775438994169,
"step": 4584
},
{
"completion_length": 165.8671875,
"epoch": 0.6144787903117891,
"grad_norm": 4.5,
"kl": 0.0056036147580016404,
"learning_rate": 3.855212096882109e-07,
"loss": 0.0002,
"reward": 0.015333790332078934,
"reward_std": 0.49498444236814976,
"rewards/reward_func": 0.015333790332078934,
"step": 4592
},
{
"completion_length": 185.2421875,
"epoch": 0.6155493108524019,
"grad_norm": 2.953125,
"kl": 0.003879066207446158,
"learning_rate": 3.8445068914759797e-07,
"loss": 0.0002,
"reward": 0.19957604305818677,
"reward_std": 0.5595576763153076,
"rewards/reward_func": 0.19957604305818677,
"step": 4600
},
{
"completion_length": 186.2578125,
"epoch": 0.6166198313930149,
"grad_norm": 3.0,
"kl": 0.004655700788134709,
"learning_rate": 3.8338016860698514e-07,
"loss": 0.0002,
"reward": 0.25618747901171446,
"reward_std": 0.5953042004257441,
"rewards/reward_func": 0.25618747901171446,
"step": 4608
},
{
"completion_length": 144.6640625,
"epoch": 0.6176903519336278,
"grad_norm": 3.703125,
"kl": 0.004998624965082854,
"learning_rate": 3.8230964806637226e-07,
"loss": 0.0002,
"reward": 0.5813037822954357,
"reward_std": 0.4772760821506381,
"rewards/reward_func": 0.5813037822954357,
"step": 4616
},
{
"completion_length": 197.1015625,
"epoch": 0.6187608724742406,
"grad_norm": 3.71875,
"kl": 0.004184005607385188,
"learning_rate": 3.812391275257594e-07,
"loss": 0.0002,
"reward": 0.07982266321778297,
"reward_std": 0.5760688092559576,
"rewards/reward_func": 0.07982266321778297,
"step": 4624
},
{
"completion_length": 174.1796875,
"epoch": 0.6198313930148535,
"grad_norm": 3.34375,
"kl": 0.004289099859306589,
"learning_rate": 3.801686069851465e-07,
"loss": 0.0002,
"reward": 0.24364805966615677,
"reward_std": 0.5018207374960184,
"rewards/reward_func": 0.24364805966615677,
"step": 4632
},
{
"completion_length": 164.1640625,
"epoch": 0.6209019135554663,
"grad_norm": 3.375,
"kl": 0.004735152295324951,
"learning_rate": 3.7909808644453363e-07,
"loss": 0.0002,
"reward": 0.34338719584047794,
"reward_std": 0.5795671846717596,
"rewards/reward_func": 0.34338719584047794,
"step": 4640
},
{
"completion_length": 133.390625,
"epoch": 0.6219724340960792,
"grad_norm": 5.25,
"kl": 0.006147218053229153,
"learning_rate": 3.7802756590392075e-07,
"loss": 0.0002,
"reward": 0.38445473089814186,
"reward_std": 0.49531611800193787,
"rewards/reward_func": 0.38445473089814186,
"step": 4648
},
{
"completion_length": 153.3984375,
"epoch": 0.6230429546366921,
"grad_norm": 4.34375,
"kl": 0.004714462149422616,
"learning_rate": 3.769570453633079e-07,
"loss": 0.0002,
"reward": 0.30648303404450417,
"reward_std": 0.4680379256606102,
"rewards/reward_func": 0.30648303404450417,
"step": 4656
},
{
"completion_length": 166.171875,
"epoch": 0.624113475177305,
"grad_norm": 3.75,
"kl": 0.004526323958998546,
"learning_rate": 3.75886524822695e-07,
"loss": 0.0002,
"reward": 0.35779890790581703,
"reward_std": 0.6776364631950855,
"rewards/reward_func": 0.35779890790581703,
"step": 4664
},
{
"completion_length": 179.2890625,
"epoch": 0.6251839957179178,
"grad_norm": 3.46875,
"kl": 0.004177290364168584,
"learning_rate": 3.7481600428208217e-07,
"loss": 0.0002,
"reward": 0.30811624182388186,
"reward_std": 0.6211207360029221,
"rewards/reward_func": 0.30811624182388186,
"step": 4672
},
{
"completion_length": 182.0234375,
"epoch": 0.6262545162585307,
"grad_norm": 3.140625,
"kl": 0.004683909472078085,
"learning_rate": 3.7374548374146924e-07,
"loss": 0.0002,
"reward": 0.008799735456705093,
"reward_std": 0.4996862728148699,
"rewards/reward_func": 0.008799735456705093,
"step": 4680
},
{
"completion_length": 148.359375,
"epoch": 0.6273250367991435,
"grad_norm": 6.59375,
"kl": 0.0059346232446841896,
"learning_rate": 3.726749632008564e-07,
"loss": 0.0002,
"reward": 0.1749492734670639,
"reward_std": 0.4117685044184327,
"rewards/reward_func": 0.1749492734670639,
"step": 4688
},
{
"completion_length": 164.515625,
"epoch": 0.6283955573397565,
"grad_norm": 3.28125,
"kl": 0.00426359154516831,
"learning_rate": 3.7160444266024353e-07,
"loss": 0.0002,
"reward": 0.10608149319887161,
"reward_std": 0.7313233427703381,
"rewards/reward_func": 0.10608149319887161,
"step": 4696
},
{
"completion_length": 162.1640625,
"epoch": 0.6294660778803693,
"grad_norm": 3.859375,
"kl": 0.004711132904049009,
"learning_rate": 3.7053392211963065e-07,
"loss": 0.0002,
"reward": 0.13499032519757748,
"reward_std": 0.6217631548643112,
"rewards/reward_func": 0.13499032519757748,
"step": 4704
},
{
"completion_length": 160.1015625,
"epoch": 0.6305365984209822,
"grad_norm": 4.5,
"kl": 0.004943192150676623,
"learning_rate": 3.694634015790178e-07,
"loss": 0.0002,
"reward": 0.20859276875853539,
"reward_std": 0.43620782624930143,
"rewards/reward_func": 0.20859276875853539,
"step": 4712
},
{
"completion_length": 179.40625,
"epoch": 0.631607118961595,
"grad_norm": 6.46875,
"kl": 0.004646303132176399,
"learning_rate": 3.6839288103840495e-07,
"loss": 0.0002,
"reward": 0.20350963808596134,
"reward_std": 0.6433871760964394,
"rewards/reward_func": 0.20350963808596134,
"step": 4720
},
{
"completion_length": 176.1953125,
"epoch": 0.6326776395022079,
"grad_norm": 4.71875,
"kl": 0.00438886127085425,
"learning_rate": 3.67322360497792e-07,
"loss": 0.0002,
"reward": 0.36116465739905834,
"reward_std": 0.6595458313822746,
"rewards/reward_func": 0.36116465739905834,
"step": 4728
},
{
"completion_length": 132.4765625,
"epoch": 0.6337481600428209,
"grad_norm": 4.09375,
"kl": 0.005916833528317511,
"learning_rate": 3.662518399571792e-07,
"loss": 0.0002,
"reward": 0.5307797193527222,
"reward_std": 0.43096242286264896,
"rewards/reward_func": 0.5307797193527222,
"step": 4736
},
{
"completion_length": 163.1953125,
"epoch": 0.6348186805834337,
"grad_norm": 3.65625,
"kl": 0.0043211055162828416,
"learning_rate": 3.6518131941656626e-07,
"loss": 0.0002,
"reward": 0.3800085699185729,
"reward_std": 0.6451602801680565,
"rewards/reward_func": 0.3800085699185729,
"step": 4744
},
{
"completion_length": 157.9609375,
"epoch": 0.6358892011240466,
"grad_norm": 3.671875,
"kl": 0.004449906060472131,
"learning_rate": 3.6411079887595344e-07,
"loss": 0.0002,
"reward": 0.17367325257509947,
"reward_std": 0.5829105107113719,
"rewards/reward_func": 0.17367325257509947,
"step": 4752
},
{
"completion_length": 159.6953125,
"epoch": 0.6369597216646594,
"grad_norm": 4.09375,
"kl": 0.0045379805960692465,
"learning_rate": 3.6304027833534056e-07,
"loss": 0.0002,
"reward": 0.4868684080429375,
"reward_std": 0.4918051455169916,
"rewards/reward_func": 0.4868684080429375,
"step": 4760
},
{
"completion_length": 192.5234375,
"epoch": 0.6380302422052723,
"grad_norm": 2.984375,
"kl": 0.003665678290417418,
"learning_rate": 3.619697577947277e-07,
"loss": 0.0001,
"reward": -0.02173500368371606,
"reward_std": 0.5965735167264938,
"rewards/reward_func": -0.02173500368371606,
"step": 4768
},
{
"completion_length": 162.7109375,
"epoch": 0.6391007627458852,
"grad_norm": 4.0625,
"kl": 0.005193614459130913,
"learning_rate": 3.608992372541148e-07,
"loss": 0.0002,
"reward": 0.3424297422170639,
"reward_std": 0.519868329167366,
"rewards/reward_func": 0.3424297422170639,
"step": 4776
},
{
"completion_length": 167.8046875,
"epoch": 0.6401712832864981,
"grad_norm": 4.21875,
"kl": 0.004302638117223978,
"learning_rate": 3.598287167135019e-07,
"loss": 0.0002,
"reward": 0.14556433307006955,
"reward_std": 0.7161346226930618,
"rewards/reward_func": 0.14556433307006955,
"step": 4784
},
{
"completion_length": 186.265625,
"epoch": 0.6412418038271109,
"grad_norm": 4.4375,
"kl": 0.004954680422088131,
"learning_rate": 3.5875819617288904e-07,
"loss": 0.0002,
"reward": 0.2309811543673277,
"reward_std": 0.6467564664781094,
"rewards/reward_func": 0.2309811543673277,
"step": 4792
},
{
"completion_length": 168.7890625,
"epoch": 0.6423123243677238,
"grad_norm": 4.65625,
"kl": 0.0042629605159163475,
"learning_rate": 3.576876756322762e-07,
"loss": 0.0002,
"reward": 0.052162475883960724,
"reward_std": 0.5783581472933292,
"rewards/reward_func": 0.052162475883960724,
"step": 4800
},
{
"completion_length": 148.328125,
"epoch": 0.6433828449083366,
"grad_norm": 3.546875,
"kl": 0.005317616189131513,
"learning_rate": 3.566171550916633e-07,
"loss": 0.0002,
"reward": 0.40866485610604286,
"reward_std": 0.5010849069803953,
"rewards/reward_func": 0.40866485610604286,
"step": 4808
},
{
"completion_length": 166.921875,
"epoch": 0.6444533654489496,
"grad_norm": 4.5,
"kl": 0.004913818440400064,
"learning_rate": 3.5554663455105046e-07,
"loss": 0.0002,
"reward": 0.2594773005694151,
"reward_std": 0.6219961307942867,
"rewards/reward_func": 0.2594773005694151,
"step": 4816
},
{
"completion_length": 186.9921875,
"epoch": 0.6455238859895625,
"grad_norm": 4.9375,
"kl": 0.004318988474551588,
"learning_rate": 3.5447611401043753e-07,
"loss": 0.0002,
"reward": 0.2544400542974472,
"reward_std": 0.7044170759618282,
"rewards/reward_func": 0.2544400542974472,
"step": 4824
},
{
"completion_length": 181.1953125,
"epoch": 0.6465944065301753,
"grad_norm": 2.625,
"kl": 0.00468209947575815,
"learning_rate": 3.534055934698247e-07,
"loss": 0.0002,
"reward": 0.19006637297570705,
"reward_std": 0.4856133693829179,
"rewards/reward_func": 0.19006637297570705,
"step": 4832
},
{
"completion_length": 148.046875,
"epoch": 0.6476649270707882,
"grad_norm": 4.5625,
"kl": 0.00532404551631771,
"learning_rate": 3.5233507292921183e-07,
"loss": 0.0002,
"reward": 0.5334251541644335,
"reward_std": 0.469427278265357,
"rewards/reward_func": 0.5334251541644335,
"step": 4840
},
{
"completion_length": 156.171875,
"epoch": 0.648735447611401,
"grad_norm": 5.0625,
"kl": 0.005237195349764079,
"learning_rate": 3.5126455238859895e-07,
"loss": 0.0002,
"reward": 0.33213027007877827,
"reward_std": 0.43815805949270725,
"rewards/reward_func": 0.33213027007877827,
"step": 4848
},
{
"completion_length": 124.3359375,
"epoch": 0.649805968152014,
"grad_norm": 3.890625,
"kl": 0.00520428063464351,
"learning_rate": 3.5019403184798607e-07,
"loss": 0.0002,
"reward": 0.6764433234930038,
"reward_std": 0.30982979480177164,
"rewards/reward_func": 0.6764433234930038,
"step": 4856
},
{
"completion_length": 148.2421875,
"epoch": 0.6508764886926268,
"grad_norm": 3.9375,
"kl": 0.005578657321166247,
"learning_rate": 3.4912351130737324e-07,
"loss": 0.0002,
"reward": 0.10781971551477909,
"reward_std": 0.4713937286287546,
"rewards/reward_func": 0.10781971551477909,
"step": 4864
},
{
"completion_length": 171.9453125,
"epoch": 0.6519470092332397,
"grad_norm": 7.0625,
"kl": 0.004915560552035458,
"learning_rate": 3.480529907667603e-07,
"loss": 0.0002,
"reward": 0.25978787057101727,
"reward_std": 0.534579697996378,
"rewards/reward_func": 0.25978787057101727,
"step": 4872
},
{
"completion_length": 173.890625,
"epoch": 0.6530175297738525,
"grad_norm": 3.578125,
"kl": 0.004566041403450072,
"learning_rate": 3.469824702261475e-07,
"loss": 0.0002,
"reward": 0.01513567566871643,
"reward_std": 0.39855797588825226,
"rewards/reward_func": 0.01513567566871643,
"step": 4880
},
{
"completion_length": 155.9921875,
"epoch": 0.6540880503144654,
"grad_norm": 2.90625,
"kl": 0.005055926798377186,
"learning_rate": 3.4591194968553456e-07,
"loss": 0.0002,
"reward": 0.29762477427721024,
"reward_std": 0.4921109788119793,
"rewards/reward_func": 0.29762477427721024,
"step": 4888
},
{
"completion_length": 183.1875,
"epoch": 0.6551585708550783,
"grad_norm": 3.1875,
"kl": 0.0038829974364489317,
"learning_rate": 3.448414291449217e-07,
"loss": 0.0002,
"reward": 0.15376039780676365,
"reward_std": 0.6010817158967257,
"rewards/reward_func": 0.15376039780676365,
"step": 4896
},
{
"completion_length": 143.6640625,
"epoch": 0.6562290913956912,
"grad_norm": 4.25,
"kl": 0.005372069776058197,
"learning_rate": 3.4377090860430885e-07,
"loss": 0.0002,
"reward": 0.45785857178270817,
"reward_std": 0.5780720338225365,
"rewards/reward_func": 0.45785857178270817,
"step": 4904
},
{
"completion_length": 170.703125,
"epoch": 0.657299611936304,
"grad_norm": 3.5,
"kl": 0.004123226040974259,
"learning_rate": 3.427003880636959e-07,
"loss": 0.0002,
"reward": 0.3207322843372822,
"reward_std": 0.5435153748840094,
"rewards/reward_func": 0.3207322843372822,
"step": 4912
},
{
"completion_length": 170.8515625,
"epoch": 0.6583701324769169,
"grad_norm": 2.296875,
"kl": 0.004442213830770925,
"learning_rate": 3.416298675230831e-07,
"loss": 0.0002,
"reward": 0.2148810252547264,
"reward_std": 0.3938889876008034,
"rewards/reward_func": 0.2148810252547264,
"step": 4920
},
{
"completion_length": 172.0546875,
"epoch": 0.6594406530175297,
"grad_norm": 3.515625,
"kl": 0.004354791803052649,
"learning_rate": 3.4055934698247016e-07,
"loss": 0.0002,
"reward": 0.1703500747680664,
"reward_std": 0.6057061813771725,
"rewards/reward_func": 0.1703500747680664,
"step": 4928
},
{
"completion_length": 175.7421875,
"epoch": 0.6605111735581426,
"grad_norm": 3.875,
"kl": 0.003823021659627557,
"learning_rate": 3.3948882644185734e-07,
"loss": 0.0002,
"reward": 0.4399567134678364,
"reward_std": 0.2992268856614828,
"rewards/reward_func": 0.4399567134678364,
"step": 4936
},
{
"completion_length": 168.5390625,
"epoch": 0.6615816940987556,
"grad_norm": 4.90625,
"kl": 0.0046115216973703355,
"learning_rate": 3.3841830590124446e-07,
"loss": 0.0002,
"reward": 0.21755497064441442,
"reward_std": 0.6609915122389793,
"rewards/reward_func": 0.21755497064441442,
"step": 4944
},
{
"completion_length": 152.484375,
"epoch": 0.6626522146393684,
"grad_norm": 4.125,
"kl": 0.004345663794083521,
"learning_rate": 3.373477853606316e-07,
"loss": 0.0002,
"reward": 0.5310599412769079,
"reward_std": 0.5352654401212931,
"rewards/reward_func": 0.5310599412769079,
"step": 4952
},
{
"completion_length": 147.4140625,
"epoch": 0.6637227351799813,
"grad_norm": 4.28125,
"kl": 0.004903295426629484,
"learning_rate": 3.362772648200187e-07,
"loss": 0.0002,
"reward": 0.47527459636330605,
"reward_std": 0.4394548684358597,
"rewards/reward_func": 0.47527459636330605,
"step": 4960
},
{
"completion_length": 128.90625,
"epoch": 0.6647932557205941,
"grad_norm": 4.59375,
"kl": 0.006024273345246911,
"learning_rate": 3.352067442794059e-07,
"loss": 0.0002,
"reward": 0.2654110789299011,
"reward_std": 0.5651892945170403,
"rewards/reward_func": 0.2654110789299011,
"step": 4968
},
{
"completion_length": 148.0546875,
"epoch": 0.665863776261207,
"grad_norm": 3.53125,
"kl": 0.004379941092338413,
"learning_rate": 3.3413622373879295e-07,
"loss": 0.0002,
"reward": 0.5237122774124146,
"reward_std": 0.5490029491484165,
"rewards/reward_func": 0.5237122774124146,
"step": 4976
},
{
"completion_length": 191.8046875,
"epoch": 0.6669342968018199,
"grad_norm": 3.09375,
"kl": 0.004273373109754175,
"learning_rate": 3.330657031981801e-07,
"loss": 0.0002,
"reward": 0.47103837318718433,
"reward_std": 0.45740975998342037,
"rewards/reward_func": 0.47103837318718433,
"step": 4984
},
{
"completion_length": 173.75,
"epoch": 0.6680048173424328,
"grad_norm": 2.796875,
"kl": 0.004224992386298254,
"learning_rate": 3.319951826575672e-07,
"loss": 0.0002,
"reward": 0.33773920126259327,
"reward_std": 0.6048417650163174,
"rewards/reward_func": 0.33773920126259327,
"step": 4992
},
{
"completion_length": 150.1015625,
"epoch": 0.6690753378830456,
"grad_norm": 5.6875,
"kl": 0.004986172774806619,
"learning_rate": 3.3092466211695436e-07,
"loss": 0.0002,
"reward": 0.26249578036367893,
"reward_std": 0.49397554993629456,
"rewards/reward_func": 0.26249578036367893,
"step": 5000
},
{
"completion_length": 168.59375,
"epoch": 0.6701458584236585,
"grad_norm": 3.59375,
"kl": 0.00485606407164596,
"learning_rate": 3.298541415763415e-07,
"loss": 0.0002,
"reward": 0.1886943932622671,
"reward_std": 0.6403144299983978,
"rewards/reward_func": 0.1886943932622671,
"step": 5008
},
{
"completion_length": 169.3125,
"epoch": 0.6712163789642713,
"grad_norm": 4.3125,
"kl": 0.004554765066131949,
"learning_rate": 3.287836210357286e-07,
"loss": 0.0002,
"reward": 0.20580013655126095,
"reward_std": 0.5662092342972755,
"rewards/reward_func": 0.20580013655126095,
"step": 5016
},
{
"completion_length": 159.5390625,
"epoch": 0.6722868995048843,
"grad_norm": 5.46875,
"kl": 0.004217549023451284,
"learning_rate": 3.2771310049511573e-07,
"loss": 0.0002,
"reward": 0.4258319865912199,
"reward_std": 0.5163000021129847,
"rewards/reward_func": 0.4258319865912199,
"step": 5024
},
{
"completion_length": 171.4140625,
"epoch": 0.6733574200454971,
"grad_norm": 3.53125,
"kl": 0.004711526213213801,
"learning_rate": 3.2664257995450285e-07,
"loss": 0.0002,
"reward": 0.12669032951816916,
"reward_std": 0.6522959657013416,
"rewards/reward_func": 0.12669032951816916,
"step": 5032
},
{
"completion_length": 143.515625,
"epoch": 0.67442794058611,
"grad_norm": 3.6875,
"kl": 0.005328927683876827,
"learning_rate": 3.2557205941388997e-07,
"loss": 0.0002,
"reward": 0.25894895382225513,
"reward_std": 0.6157816741615534,
"rewards/reward_func": 0.25894895382225513,
"step": 5040
},
{
"completion_length": 169.09375,
"epoch": 0.6754984611267228,
"grad_norm": 4.15625,
"kl": 0.004591718839947134,
"learning_rate": 3.2450153887327715e-07,
"loss": 0.0002,
"reward": 0.1223931759595871,
"reward_std": 0.7310500293970108,
"rewards/reward_func": 0.1223931759595871,
"step": 5048
},
{
"completion_length": 158.8984375,
"epoch": 0.6765689816673357,
"grad_norm": 4.375,
"kl": 0.00482406112132594,
"learning_rate": 3.234310183326642e-07,
"loss": 0.0002,
"reward": 0.30884232465177774,
"reward_std": 0.5993989063426852,
"rewards/reward_func": 0.30884232465177774,
"step": 5056
},
{
"completion_length": 153.9921875,
"epoch": 0.6776395022079487,
"grad_norm": 6.84375,
"kl": 0.0044682007865048945,
"learning_rate": 3.223604977920514e-07,
"loss": 0.0002,
"reward": 0.23793572932481766,
"reward_std": 0.47554378490895033,
"rewards/reward_func": 0.23793572932481766,
"step": 5064
},
{
"completion_length": 171.5390625,
"epoch": 0.6787100227485615,
"grad_norm": 6.90625,
"kl": 0.0044736934069078416,
"learning_rate": 3.2128997725143846e-07,
"loss": 0.0002,
"reward": 0.37867590319365263,
"reward_std": 0.49610742926597595,
"rewards/reward_func": 0.37867590319365263,
"step": 5072
},
{
"completion_length": 148.2890625,
"epoch": 0.6797805432891744,
"grad_norm": 4.625,
"kl": 0.004754859022796154,
"learning_rate": 3.2021945671082563e-07,
"loss": 0.0002,
"reward": 0.517847141250968,
"reward_std": 0.5063638836145401,
"rewards/reward_func": 0.517847141250968,
"step": 5080
},
{
"completion_length": 156.71875,
"epoch": 0.6808510638297872,
"grad_norm": 5.59375,
"kl": 0.005674656480550766,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0002,
"reward": 0.34883139841258526,
"reward_std": 0.33303822576999664,
"rewards/reward_func": 0.34883139841258526,
"step": 5088
},
{
"completion_length": 178.890625,
"epoch": 0.6819215843704001,
"grad_norm": 3.40625,
"kl": 0.0046264427655842155,
"learning_rate": 3.180784156295999e-07,
"loss": 0.0002,
"reward": 0.47927757538855076,
"reward_std": 0.5571104716509581,
"rewards/reward_func": 0.47927757538855076,
"step": 5096
},
{
"completion_length": 144.3359375,
"epoch": 0.682992104911013,
"grad_norm": 3.890625,
"kl": 0.004523319890722632,
"learning_rate": 3.17007895088987e-07,
"loss": 0.0002,
"reward": 0.34390855580568314,
"reward_std": 0.5760727934539318,
"rewards/reward_func": 0.34390855580568314,
"step": 5104
},
{
"completion_length": 159.6484375,
"epoch": 0.6840626254516259,
"grad_norm": 4.65625,
"kl": 0.004729281121399254,
"learning_rate": 3.1593737454837417e-07,
"loss": 0.0002,
"reward": 0.38299885392189026,
"reward_std": 0.3037977972999215,
"rewards/reward_func": 0.38299885392189026,
"step": 5112
},
{
"completion_length": 150.53125,
"epoch": 0.6851331459922387,
"grad_norm": 2.96875,
"kl": 0.005811055249068886,
"learning_rate": 3.1486685400776124e-07,
"loss": 0.0002,
"reward": 0.4124348498880863,
"reward_std": 0.5133458133786917,
"rewards/reward_func": 0.4124348498880863,
"step": 5120
},
{
"completion_length": 146.6875,
"epoch": 0.6862036665328516,
"grad_norm": 5.15625,
"kl": 0.004858777509070933,
"learning_rate": 3.137963334671484e-07,
"loss": 0.0002,
"reward": 0.1230292096734047,
"reward_std": 0.4463986298069358,
"rewards/reward_func": 0.1230292096734047,
"step": 5128
},
{
"completion_length": 160.390625,
"epoch": 0.6872741870734644,
"grad_norm": 2.96875,
"kl": 0.004541641887044534,
"learning_rate": 3.127258129265355e-07,
"loss": 0.0002,
"reward": 0.05217524245381355,
"reward_std": 0.45026756450533867,
"rewards/reward_func": 0.05217524245381355,
"step": 5136
},
{
"completion_length": 144.0703125,
"epoch": 0.6883447076140774,
"grad_norm": 8.6875,
"kl": 0.005810694128740579,
"learning_rate": 3.1165529238592266e-07,
"loss": 0.0002,
"reward": 0.31892623007297516,
"reward_std": 0.4961309377104044,
"rewards/reward_func": 0.31892623007297516,
"step": 5144
},
{
"completion_length": 202.375,
"epoch": 0.6894152281546903,
"grad_norm": 3.125,
"kl": 0.004103525396203622,
"learning_rate": 3.105847718453098e-07,
"loss": 0.0002,
"reward": 0.35768837202340364,
"reward_std": 0.5502582993358374,
"rewards/reward_func": 0.35768837202340364,
"step": 5152
},
{
"completion_length": 173.1484375,
"epoch": 0.6904857486953031,
"grad_norm": 3.40625,
"kl": 0.004345653491327539,
"learning_rate": 3.095142513046969e-07,
"loss": 0.0002,
"reward": 0.30987947806715965,
"reward_std": 0.5077685210853815,
"rewards/reward_func": 0.30987947806715965,
"step": 5160
},
{
"completion_length": 176.890625,
"epoch": 0.691556269235916,
"grad_norm": 3.515625,
"kl": 0.0047625836450606585,
"learning_rate": 3.08443730764084e-07,
"loss": 0.0002,
"reward": 0.37025075126439333,
"reward_std": 0.47811376582831144,
"rewards/reward_func": 0.37025075126439333,
"step": 5168
},
{
"completion_length": 156.3125,
"epoch": 0.6926267897765288,
"grad_norm": 3.375,
"kl": 0.004461723641725257,
"learning_rate": 3.0737321022347114e-07,
"loss": 0.0002,
"reward": 0.4771025739610195,
"reward_std": 0.4133305884897709,
"rewards/reward_func": 0.4771025739610195,
"step": 5176
},
{
"completion_length": 175.3359375,
"epoch": 0.6936973103171417,
"grad_norm": 3.71875,
"kl": 0.004555776889901608,
"learning_rate": 3.0630268968285827e-07,
"loss": 0.0002,
"reward": 0.21876542083919048,
"reward_std": 0.5979834999889135,
"rewards/reward_func": 0.21876542083919048,
"step": 5184
},
{
"completion_length": 143.2734375,
"epoch": 0.6947678308577546,
"grad_norm": 4.1875,
"kl": 0.006049849558621645,
"learning_rate": 3.0523216914224544e-07,
"loss": 0.0002,
"reward": 0.3804114758968353,
"reward_std": 0.43962680641561747,
"rewards/reward_func": 0.3804114758968353,
"step": 5192
},
{
"completion_length": 176.09375,
"epoch": 0.6958383513983675,
"grad_norm": 4.1875,
"kl": 0.004219004331389442,
"learning_rate": 3.041616486016325e-07,
"loss": 0.0002,
"reward": 0.033823274075984955,
"reward_std": 0.5529468916356564,
"rewards/reward_func": 0.033823274075984955,
"step": 5200
},
{
"completion_length": 167.1328125,
"epoch": 0.6969088719389803,
"grad_norm": 3.453125,
"kl": 0.0044156058866064996,
"learning_rate": 3.030911280610197e-07,
"loss": 0.0002,
"reward": 0.35997615940868855,
"reward_std": 0.6205689832568169,
"rewards/reward_func": 0.35997615940868855,
"step": 5208
},
{
"completion_length": 134.3515625,
"epoch": 0.6979793924795932,
"grad_norm": 6.03125,
"kl": 0.005597625044174492,
"learning_rate": 3.020206075204068e-07,
"loss": 0.0002,
"reward": 0.5491457581520081,
"reward_std": 0.5092198746278882,
"rewards/reward_func": 0.5491457581520081,
"step": 5216
},
{
"completion_length": 161.3515625,
"epoch": 0.699049913020206,
"grad_norm": 2.984375,
"kl": 0.005356652429327369,
"learning_rate": 3.009500869797939e-07,
"loss": 0.0002,
"reward": 0.4664277071133256,
"reward_std": 0.5567853916436434,
"rewards/reward_func": 0.4664277071133256,
"step": 5224
},
{
"completion_length": 169.203125,
"epoch": 0.700120433560819,
"grad_norm": 4.1875,
"kl": 0.0043607138795778155,
"learning_rate": 2.9987956643918105e-07,
"loss": 0.0002,
"reward": 0.34521481581032276,
"reward_std": 0.6393520161509514,
"rewards/reward_func": 0.34521481581032276,
"step": 5232
},
{
"completion_length": 176.28125,
"epoch": 0.7011909541014318,
"grad_norm": 3.921875,
"kl": 0.004437842464540154,
"learning_rate": 2.9880904589856817e-07,
"loss": 0.0002,
"reward": -0.07886990532279015,
"reward_std": 0.6460573114454746,
"rewards/reward_func": -0.07886990532279015,
"step": 5240
},
{
"completion_length": 212.4375,
"epoch": 0.7022614746420447,
"grad_norm": 3.640625,
"kl": 0.004008949821582064,
"learning_rate": 2.977385253579553e-07,
"loss": 0.0002,
"reward": 0.012970509007573128,
"reward_std": 0.5811912510544062,
"rewards/reward_func": 0.012970509007573128,
"step": 5248
},
{
"completion_length": 182.1328125,
"epoch": 0.7033319951826575,
"grad_norm": 4.4375,
"kl": 0.004463888035388663,
"learning_rate": 2.9666800481734247e-07,
"loss": 0.0002,
"reward": 0.295044606551528,
"reward_std": 0.5268499422818422,
"rewards/reward_func": 0.295044606551528,
"step": 5256
},
{
"completion_length": 158.8515625,
"epoch": 0.7044025157232704,
"grad_norm": 3.796875,
"kl": 0.005019562435336411,
"learning_rate": 2.9559748427672953e-07,
"loss": 0.0002,
"reward": 0.2960619358345866,
"reward_std": 0.5362240988761187,
"rewards/reward_func": 0.2960619358345866,
"step": 5264
},
{
"completion_length": 158.328125,
"epoch": 0.7054730362638834,
"grad_norm": 4.40625,
"kl": 0.004566931165754795,
"learning_rate": 2.945269637361167e-07,
"loss": 0.0002,
"reward": 0.5046307481825352,
"reward_std": 0.45363772846758366,
"rewards/reward_func": 0.5046307481825352,
"step": 5272
},
{
"completion_length": 184.515625,
"epoch": 0.7065435568044962,
"grad_norm": 7.125,
"kl": 0.004289998818421736,
"learning_rate": 2.934564431955038e-07,
"loss": 0.0002,
"reward": 0.4870417043566704,
"reward_std": 0.4741673758253455,
"rewards/reward_func": 0.4870417043566704,
"step": 5280
},
{
"completion_length": 161.359375,
"epoch": 0.7076140773451091,
"grad_norm": 5.53125,
"kl": 0.0042855191277340055,
"learning_rate": 2.9238592265489095e-07,
"loss": 0.0002,
"reward": 0.37416786467656493,
"reward_std": 0.5148907378315926,
"rewards/reward_func": 0.37416786467656493,
"step": 5288
},
{
"completion_length": 159.1484375,
"epoch": 0.7086845978857219,
"grad_norm": 4.03125,
"kl": 0.005468921910505742,
"learning_rate": 2.9131540211427807e-07,
"loss": 0.0002,
"reward": 0.2032206254079938,
"reward_std": 0.5835869964212179,
"rewards/reward_func": 0.2032206254079938,
"step": 5296
},
{
"completion_length": 178.046875,
"epoch": 0.7097551184263348,
"grad_norm": 5.46875,
"kl": 0.004879669373622164,
"learning_rate": 2.9024488157366514e-07,
"loss": 0.0002,
"reward": 0.07407154329121113,
"reward_std": 0.5483472738415003,
"rewards/reward_func": 0.07407154329121113,
"step": 5304
},
{
"completion_length": 172.4296875,
"epoch": 0.7108256389669477,
"grad_norm": 3.078125,
"kl": 0.004936008132062852,
"learning_rate": 2.891743610330523e-07,
"loss": 0.0002,
"reward": 0.1570496652275324,
"reward_std": 0.6552108749747276,
"rewards/reward_func": 0.1570496652275324,
"step": 5312
},
{
"completion_length": 183.265625,
"epoch": 0.7118961595075606,
"grad_norm": 3.15625,
"kl": 0.004192218388197944,
"learning_rate": 2.881038404924394e-07,
"loss": 0.0002,
"reward": 0.2290868228301406,
"reward_std": 0.6626240387558937,
"rewards/reward_func": 0.2290868228301406,
"step": 5320
},
{
"completion_length": 129.171875,
"epoch": 0.7129666800481734,
"grad_norm": 3.859375,
"kl": 0.00642680426244624,
"learning_rate": 2.8703331995182656e-07,
"loss": 0.0003,
"reward": 0.395254772156477,
"reward_std": 0.4721956867724657,
"rewards/reward_func": 0.395254772156477,
"step": 5328
},
{
"completion_length": 157.6640625,
"epoch": 0.7140372005887863,
"grad_norm": 3.234375,
"kl": 0.004553045437205583,
"learning_rate": 2.859627994112137e-07,
"loss": 0.0002,
"reward": 0.44277836102992296,
"reward_std": 0.5288186706602573,
"rewards/reward_func": 0.44277836102992296,
"step": 5336
},
{
"completion_length": 174.7578125,
"epoch": 0.7151077211293991,
"grad_norm": 3.578125,
"kl": 0.004835324827581644,
"learning_rate": 2.848922788706008e-07,
"loss": 0.0002,
"reward": 0.2129112258553505,
"reward_std": 0.518157972022891,
"rewards/reward_func": 0.2129112258553505,
"step": 5344
},
{
"completion_length": 183.0625,
"epoch": 0.7161782416700121,
"grad_norm": 4.40625,
"kl": 0.004699339595390484,
"learning_rate": 2.838217583299879e-07,
"loss": 0.0002,
"reward": -0.12015869608148932,
"reward_std": 0.6651497483253479,
"rewards/reward_func": -0.12015869608148932,
"step": 5352
},
{
"completion_length": 167.125,
"epoch": 0.717248762210625,
"grad_norm": 3.8125,
"kl": 0.004674197465647012,
"learning_rate": 2.827512377893751e-07,
"loss": 0.0002,
"reward": 0.08481440320611,
"reward_std": 0.6426707338541746,
"rewards/reward_func": 0.08481440320611,
"step": 5360
},
{
"completion_length": 187.109375,
"epoch": 0.7183192827512378,
"grad_norm": 3.359375,
"kl": 0.004370440146885812,
"learning_rate": 2.8168071724876217e-07,
"loss": 0.0002,
"reward": 0.10261328518390656,
"reward_std": 0.33446657191962004,
"rewards/reward_func": 0.10261328518390656,
"step": 5368
},
{
"completion_length": 169.8828125,
"epoch": 0.7193898032918506,
"grad_norm": 4.46875,
"kl": 0.005003685160772875,
"learning_rate": 2.8061019670814934e-07,
"loss": 0.0002,
"reward": 0.35325085651129484,
"reward_std": 0.5368635784834623,
"rewards/reward_func": 0.35325085651129484,
"step": 5376
},
{
"completion_length": 127.6328125,
"epoch": 0.7204603238324635,
"grad_norm": 5.0,
"kl": 0.006041952816303819,
"learning_rate": 2.795396761675364e-07,
"loss": 0.0002,
"reward": 0.31289495434612036,
"reward_std": 0.5351240076124668,
"rewards/reward_func": 0.31289495434612036,
"step": 5384
},
{
"completion_length": 132.1484375,
"epoch": 0.7215308443730765,
"grad_norm": 3.890625,
"kl": 0.004856948013184592,
"learning_rate": 2.784691556269236e-07,
"loss": 0.0002,
"reward": 0.5194568559527397,
"reward_std": 0.4919391795992851,
"rewards/reward_func": 0.5194568559527397,
"step": 5392
},
{
"completion_length": 172.6796875,
"epoch": 0.7226013649136893,
"grad_norm": 3.71875,
"kl": 0.004867620766162872,
"learning_rate": 2.773986350863107e-07,
"loss": 0.0002,
"reward": 0.1840124912559986,
"reward_std": 0.6040789932012558,
"rewards/reward_func": 0.1840124912559986,
"step": 5400
},
{
"completion_length": 162.0078125,
"epoch": 0.7236718854543022,
"grad_norm": 5.4375,
"kl": 0.0043890359229408205,
"learning_rate": 2.7632811454569783e-07,
"loss": 0.0002,
"reward": 0.40340816229581833,
"reward_std": 0.46000672224909067,
"rewards/reward_func": 0.40340816229581833,
"step": 5408
},
{
"completion_length": 178.796875,
"epoch": 0.724742405994915,
"grad_norm": 3.671875,
"kl": 0.004453314671991393,
"learning_rate": 2.7525759400508495e-07,
"loss": 0.0002,
"reward": 0.1911243163049221,
"reward_std": 0.5930454572662711,
"rewards/reward_func": 0.1911243163049221,
"step": 5416
},
{
"completion_length": 166.296875,
"epoch": 0.7258129265355279,
"grad_norm": 3.890625,
"kl": 0.004950450966134667,
"learning_rate": 2.7418707346447207e-07,
"loss": 0.0002,
"reward": 0.2432717476040125,
"reward_std": 0.4679036773741245,
"rewards/reward_func": 0.2432717476040125,
"step": 5424
},
{
"completion_length": 204.453125,
"epoch": 0.7268834470761407,
"grad_norm": 3.6875,
"kl": 0.004144096135860309,
"learning_rate": 2.731165529238592e-07,
"loss": 0.0002,
"reward": -0.028832857497036457,
"reward_std": 0.5423443503677845,
"rewards/reward_func": -0.028832857497036457,
"step": 5432
},
{
"completion_length": 144.3671875,
"epoch": 0.7279539676167537,
"grad_norm": 5.46875,
"kl": 0.0068962293735239655,
"learning_rate": 2.7204603238324637e-07,
"loss": 0.0003,
"reward": 0.5656752809882164,
"reward_std": 0.37680432945489883,
"rewards/reward_func": 0.5656752809882164,
"step": 5440
},
{
"completion_length": 156.8203125,
"epoch": 0.7290244881573665,
"grad_norm": 3.21875,
"kl": 0.005207971204072237,
"learning_rate": 2.7097551184263344e-07,
"loss": 0.0002,
"reward": 0.46704378351569176,
"reward_std": 0.4321159301325679,
"rewards/reward_func": 0.46704378351569176,
"step": 5448
},
{
"completion_length": 170.1953125,
"epoch": 0.7300950086979794,
"grad_norm": 4.28125,
"kl": 0.004335955512942746,
"learning_rate": 2.699049913020206e-07,
"loss": 0.0002,
"reward": 0.15294395573437214,
"reward_std": 0.6653651669621468,
"rewards/reward_func": 0.15294395573437214,
"step": 5456
},
{
"completion_length": 149.6015625,
"epoch": 0.7311655292385922,
"grad_norm": 3.90625,
"kl": 0.005089007405331358,
"learning_rate": 2.6883447076140773e-07,
"loss": 0.0002,
"reward": 0.2426714487373829,
"reward_std": 0.48969776928424835,
"rewards/reward_func": 0.2426714487373829,
"step": 5464
},
{
"completion_length": 171.625,
"epoch": 0.7322360497792051,
"grad_norm": 3.03125,
"kl": 0.004394051560666412,
"learning_rate": 2.6776395022079485e-07,
"loss": 0.0002,
"reward": 0.10768201760947704,
"reward_std": 0.4550578175112605,
"rewards/reward_func": 0.10768201760947704,
"step": 5472
},
{
"completion_length": 159.9609375,
"epoch": 0.733306570319818,
"grad_norm": 3.59375,
"kl": 0.005018858646508306,
"learning_rate": 2.66693429680182e-07,
"loss": 0.0002,
"reward": 0.2529556443914771,
"reward_std": 0.6179038770496845,
"rewards/reward_func": 0.2529556443914771,
"step": 5480
},
{
"completion_length": 154.8125,
"epoch": 0.7343770908604309,
"grad_norm": 5.0,
"kl": 0.005219785525696352,
"learning_rate": 2.656229091395691e-07,
"loss": 0.0002,
"reward": 0.3117452962324023,
"reward_std": 0.5476666176691651,
"rewards/reward_func": 0.3117452962324023,
"step": 5488
},
{
"completion_length": 140.671875,
"epoch": 0.7354476114010438,
"grad_norm": 4.78125,
"kl": 0.006368768343236297,
"learning_rate": 2.645523885989562e-07,
"loss": 0.0003,
"reward": 0.5569799374789,
"reward_std": 0.4139596875756979,
"rewards/reward_func": 0.5569799374789,
"step": 5496
},
{
"completion_length": 208.4375,
"epoch": 0.7365181319416566,
"grad_norm": 3.171875,
"kl": 0.004221481096465141,
"learning_rate": 2.634818680583434e-07,
"loss": 0.0002,
"reward": 0.13646352104842663,
"reward_std": 0.674302838742733,
"rewards/reward_func": 0.13646352104842663,
"step": 5504
},
{
"completion_length": 177.7578125,
"epoch": 0.7375886524822695,
"grad_norm": 4.09375,
"kl": 0.004927775065880269,
"learning_rate": 2.6241134751773046e-07,
"loss": 0.0002,
"reward": 0.179019657894969,
"reward_std": 0.4836566299200058,
"rewards/reward_func": 0.179019657894969,
"step": 5512
},
{
"completion_length": 168.359375,
"epoch": 0.7386591730228824,
"grad_norm": 3.71875,
"kl": 0.004563187627354637,
"learning_rate": 2.6134082697711764e-07,
"loss": 0.0002,
"reward": 0.0944369975477457,
"reward_std": 0.6901743151247501,
"rewards/reward_func": 0.0944369975477457,
"step": 5520
},
{
"completion_length": 141.9765625,
"epoch": 0.7397296935634953,
"grad_norm": 4.5,
"kl": 0.005385736672906205,
"learning_rate": 2.602703064365047e-07,
"loss": 0.0002,
"reward": 0.3004543990828097,
"reward_std": 0.6421327739953995,
"rewards/reward_func": 0.3004543990828097,
"step": 5528
},
{
"completion_length": 161.1015625,
"epoch": 0.7408002141041081,
"grad_norm": 4.53125,
"kl": 0.005307289626216516,
"learning_rate": 2.591997858958919e-07,
"loss": 0.0002,
"reward": 0.4302559047937393,
"reward_std": 0.296412231400609,
"rewards/reward_func": 0.4302559047937393,
"step": 5536
},
{
"completion_length": 160.15625,
"epoch": 0.741870734644721,
"grad_norm": 5.21875,
"kl": 0.00511023830040358,
"learning_rate": 2.58129265355279e-07,
"loss": 0.0002,
"reward": 0.39244108088314533,
"reward_std": 0.5584999155253172,
"rewards/reward_func": 0.39244108088314533,
"step": 5544
},
{
"completion_length": 161.0546875,
"epoch": 0.7429412551853338,
"grad_norm": 3.46875,
"kl": 0.004729041800601408,
"learning_rate": 2.570587448146661e-07,
"loss": 0.0002,
"reward": 0.30113553907722235,
"reward_std": 0.6211994774639606,
"rewards/reward_func": 0.30113553907722235,
"step": 5552
},
{
"completion_length": 176.578125,
"epoch": 0.7440117757259468,
"grad_norm": 3.28125,
"kl": 0.004635761812096462,
"learning_rate": 2.5598822427405324e-07,
"loss": 0.0002,
"reward": 0.3027530014514923,
"reward_std": 0.34483792912214994,
"rewards/reward_func": 0.3027530014514923,
"step": 5560
},
{
"completion_length": 159.375,
"epoch": 0.7450822962665596,
"grad_norm": 5.53125,
"kl": 0.005176402977667749,
"learning_rate": 2.5491770373344036e-07,
"loss": 0.0002,
"reward": 0.17298301681876183,
"reward_std": 0.584480419754982,
"rewards/reward_func": 0.17298301681876183,
"step": 5568
},
{
"completion_length": 157.3671875,
"epoch": 0.7461528168071725,
"grad_norm": 6.5625,
"kl": 0.005517173325642943,
"learning_rate": 2.538471831928275e-07,
"loss": 0.0002,
"reward": 0.17840459011495113,
"reward_std": 0.6545839756727219,
"rewards/reward_func": 0.17840459011495113,
"step": 5576
},
{
"completion_length": 163.765625,
"epoch": 0.7472233373477853,
"grad_norm": 3.296875,
"kl": 0.005615679023321718,
"learning_rate": 2.5277666265221466e-07,
"loss": 0.0002,
"reward": 0.3282418688759208,
"reward_std": 0.4674977771937847,
"rewards/reward_func": 0.3282418688759208,
"step": 5584
},
{
"completion_length": 199.1171875,
"epoch": 0.7482938578883982,
"grad_norm": 4.96875,
"kl": 0.004132435307838023,
"learning_rate": 2.5170614211160173e-07,
"loss": 0.0002,
"reward": 0.041125981137156487,
"reward_std": 0.5962537340819836,
"rewards/reward_func": 0.041125981137156487,
"step": 5592
},
{
"completion_length": 195.3828125,
"epoch": 0.7493643784290112,
"grad_norm": 4.0625,
"kl": 0.003978644759627059,
"learning_rate": 2.506356215709889e-07,
"loss": 0.0002,
"reward": 0.13140291906893253,
"reward_std": 0.44777560979127884,
"rewards/reward_func": 0.13140291906893253,
"step": 5600
},
{
"completion_length": 173.609375,
"epoch": 0.750434898969624,
"grad_norm": 2.796875,
"kl": 0.004251972888596356,
"learning_rate": 2.49565101030376e-07,
"loss": 0.0002,
"reward": 0.20013932138681412,
"reward_std": 0.6238753385841846,
"rewards/reward_func": 0.20013932138681412,
"step": 5608
},
{
"completion_length": 165.1328125,
"epoch": 0.7515054195102369,
"grad_norm": 7.53125,
"kl": 0.004290038690669462,
"learning_rate": 2.4849458048976315e-07,
"loss": 0.0002,
"reward": 0.2280603777617216,
"reward_std": 0.4963626991957426,
"rewards/reward_func": 0.2280603777617216,
"step": 5616
},
{
"completion_length": 141.671875,
"epoch": 0.7525759400508497,
"grad_norm": 4.46875,
"kl": 0.00585965282516554,
"learning_rate": 2.4742405994915027e-07,
"loss": 0.0002,
"reward": 0.4678545705974102,
"reward_std": 0.43725813180208206,
"rewards/reward_func": 0.4678545705974102,
"step": 5624
},
{
"completion_length": 160.421875,
"epoch": 0.7536464605914626,
"grad_norm": 4.96875,
"kl": 0.005591863940935582,
"learning_rate": 2.463535394085374e-07,
"loss": 0.0002,
"reward": 0.24596700817346573,
"reward_std": 0.4220298836007714,
"rewards/reward_func": 0.24596700817346573,
"step": 5632
},
{
"completion_length": 156.6171875,
"epoch": 0.7547169811320755,
"grad_norm": 3.171875,
"kl": 0.004576119041303173,
"learning_rate": 2.452830188679245e-07,
"loss": 0.0002,
"reward": 0.3924466483294964,
"reward_std": 0.6098343282938004,
"rewards/reward_func": 0.3924466483294964,
"step": 5640
},
{
"completion_length": 176.890625,
"epoch": 0.7557875016726884,
"grad_norm": 3.4375,
"kl": 0.003426549636060372,
"learning_rate": 2.4421249832731163e-07,
"loss": 0.0001,
"reward": 0.31396659277379513,
"reward_std": 0.507732754573226,
"rewards/reward_func": 0.31396659277379513,
"step": 5648
},
{
"completion_length": 155.109375,
"epoch": 0.7568580222133012,
"grad_norm": 5.0625,
"kl": 0.004432518238900229,
"learning_rate": 2.4314197778669875e-07,
"loss": 0.0002,
"reward": 0.3897492587566376,
"reward_std": 0.472976541146636,
"rewards/reward_func": 0.3897492587566376,
"step": 5656
},
{
"completion_length": 178.7421875,
"epoch": 0.7579285427539141,
"grad_norm": 1.96875,
"kl": 0.004251753707649186,
"learning_rate": 2.4207145724608593e-07,
"loss": 0.0002,
"reward": 0.08406687900424004,
"reward_std": 0.4810841968283057,
"rewards/reward_func": 0.08406687900424004,
"step": 5664
},
{
"completion_length": 166.5859375,
"epoch": 0.7589990632945269,
"grad_norm": 4.875,
"kl": 0.005223593441769481,
"learning_rate": 2.4100093670547305e-07,
"loss": 0.0002,
"reward": 0.3817774336785078,
"reward_std": 0.6594663038849831,
"rewards/reward_func": 0.3817774336785078,
"step": 5672
},
{
"completion_length": 161.3671875,
"epoch": 0.7600695838351398,
"grad_norm": 5.625,
"kl": 0.0044474324968177825,
"learning_rate": 2.3993041616486017e-07,
"loss": 0.0002,
"reward": 0.23454780131578445,
"reward_std": 0.37179601565003395,
"rewards/reward_func": 0.23454780131578445,
"step": 5680
},
{
"completion_length": 179.796875,
"epoch": 0.7611401043757527,
"grad_norm": 3.609375,
"kl": 0.00496278639184311,
"learning_rate": 2.388598956242473e-07,
"loss": 0.0002,
"reward": 0.10904507525265217,
"reward_std": 0.5533247627317905,
"rewards/reward_func": 0.10904507525265217,
"step": 5688
},
{
"completion_length": 160.4296875,
"epoch": 0.7622106249163656,
"grad_norm": 4.53125,
"kl": 0.005624369368888438,
"learning_rate": 2.3778937508363441e-07,
"loss": 0.0002,
"reward": 0.32307033240795135,
"reward_std": 0.3578721797093749,
"rewards/reward_func": 0.32307033240795135,
"step": 5696
},
{
"completion_length": 181.2578125,
"epoch": 0.7632811454569784,
"grad_norm": 3.09375,
"kl": 0.004423889273311943,
"learning_rate": 2.3671885454302154e-07,
"loss": 0.0002,
"reward": 0.2661805059760809,
"reward_std": 0.433091813698411,
"rewards/reward_func": 0.2661805059760809,
"step": 5704
},
{
"completion_length": 145.40625,
"epoch": 0.7643516659975913,
"grad_norm": 3.53125,
"kl": 0.005560883553698659,
"learning_rate": 2.3564833400240866e-07,
"loss": 0.0002,
"reward": 0.38319743797183037,
"reward_std": 0.5694666914641857,
"rewards/reward_func": 0.38319743797183037,
"step": 5712
},
{
"completion_length": 185.6640625,
"epoch": 0.7654221865382042,
"grad_norm": 3.375,
"kl": 0.004699640907347202,
"learning_rate": 2.3457781346179578e-07,
"loss": 0.0002,
"reward": 0.11784735321998596,
"reward_std": 0.5145326796919107,
"rewards/reward_func": 0.11784735321998596,
"step": 5720
},
{
"completion_length": 202.7265625,
"epoch": 0.7664927070788171,
"grad_norm": 3.65625,
"kl": 0.0042415427742525935,
"learning_rate": 2.335072929211829e-07,
"loss": 0.0002,
"reward": -0.14664648659527302,
"reward_std": 0.6029860116541386,
"rewards/reward_func": -0.14664648659527302,
"step": 5728
},
{
"completion_length": 176.671875,
"epoch": 0.76756322761943,
"grad_norm": 3.671875,
"kl": 0.005350680381525308,
"learning_rate": 2.3243677238057005e-07,
"loss": 0.0002,
"reward": 0.28928207233548164,
"reward_std": 0.49851767159998417,
"rewards/reward_func": 0.28928207233548164,
"step": 5736
},
{
"completion_length": 168.3828125,
"epoch": 0.7686337481600428,
"grad_norm": 4.3125,
"kl": 0.005111474136356264,
"learning_rate": 2.3136625183995717e-07,
"loss": 0.0002,
"reward": 0.20785732567310333,
"reward_std": 0.4819117970764637,
"rewards/reward_func": 0.20785732567310333,
"step": 5744
},
{
"completion_length": 161.5625,
"epoch": 0.7697042687006557,
"grad_norm": 5.75,
"kl": 0.0046100525360088795,
"learning_rate": 2.302957312993443e-07,
"loss": 0.0002,
"reward": 0.3344459980726242,
"reward_std": 0.48296352848410606,
"rewards/reward_func": 0.3344459980726242,
"step": 5752
},
{
"completion_length": 159.8203125,
"epoch": 0.7707747892412685,
"grad_norm": 3.84375,
"kl": 0.005216164543526247,
"learning_rate": 2.2922521075873141e-07,
"loss": 0.0002,
"reward": 0.4968814216554165,
"reward_std": 0.5169591847807169,
"rewards/reward_func": 0.4968814216554165,
"step": 5760
},
{
"completion_length": 181.0,
"epoch": 0.7718453097818815,
"grad_norm": 3.859375,
"kl": 0.0039515624375781044,
"learning_rate": 2.2815469021811856e-07,
"loss": 0.0002,
"reward": 0.1462385654449463,
"reward_std": 0.5148510783910751,
"rewards/reward_func": 0.1462385654449463,
"step": 5768
},
{
"completion_length": 186.953125,
"epoch": 0.7729158303224943,
"grad_norm": 3.234375,
"kl": 0.00495643715839833,
"learning_rate": 2.2708416967750568e-07,
"loss": 0.0002,
"reward": -0.019661023281514645,
"reward_std": 0.4568687481805682,
"rewards/reward_func": -0.019661023281514645,
"step": 5776
},
{
"completion_length": 152.9453125,
"epoch": 0.7739863508631072,
"grad_norm": 4.125,
"kl": 0.005363121483242139,
"learning_rate": 2.260136491368928e-07,
"loss": 0.0002,
"reward": 0.3975646123290062,
"reward_std": 0.5788163132965565,
"rewards/reward_func": 0.3975646123290062,
"step": 5784
},
{
"completion_length": 150.8046875,
"epoch": 0.77505687140372,
"grad_norm": 3.25,
"kl": 0.0049289112794213,
"learning_rate": 2.2494312859627993e-07,
"loss": 0.0002,
"reward": 0.29290657490491867,
"reward_std": 0.6054155379533768,
"rewards/reward_func": 0.29290657490491867,
"step": 5792
},
{
"completion_length": 149.1640625,
"epoch": 0.7761273919443329,
"grad_norm": 2.546875,
"kl": 0.006072040821891278,
"learning_rate": 2.2387260805566705e-07,
"loss": 0.0002,
"reward": 0.234967946074903,
"reward_std": 0.5344967059791088,
"rewards/reward_func": 0.234967946074903,
"step": 5800
},
{
"completion_length": 158.625,
"epoch": 0.7771979124849459,
"grad_norm": 4.0625,
"kl": 0.004590392898535356,
"learning_rate": 2.228020875150542e-07,
"loss": 0.0002,
"reward": 0.419980987906456,
"reward_std": 0.4606306320056319,
"rewards/reward_func": 0.419980987906456,
"step": 5808
},
{
"completion_length": 138.7265625,
"epoch": 0.7782684330255587,
"grad_norm": 3.8125,
"kl": 0.004858676256844774,
"learning_rate": 2.2173156697444132e-07,
"loss": 0.0002,
"reward": 0.5591896008700132,
"reward_std": 0.5148359183222055,
"rewards/reward_func": 0.5591896008700132,
"step": 5816
},
{
"completion_length": 191.6328125,
"epoch": 0.7793389535661716,
"grad_norm": 7.0625,
"kl": 0.004010791366454214,
"learning_rate": 2.2066104643382844e-07,
"loss": 0.0002,
"reward": 0.07061274722218513,
"reward_std": 0.6621855795383453,
"rewards/reward_func": 0.07061274722218513,
"step": 5824
},
{
"completion_length": 151.5546875,
"epoch": 0.7804094741067844,
"grad_norm": 3.5625,
"kl": 0.004385879990877584,
"learning_rate": 2.1959052589321556e-07,
"loss": 0.0002,
"reward": 0.4282612316310406,
"reward_std": 0.5311172138899565,
"rewards/reward_func": 0.4282612316310406,
"step": 5832
},
{
"completion_length": 183.8671875,
"epoch": 0.7814799946473973,
"grad_norm": 4.0,
"kl": 0.004209680715575814,
"learning_rate": 2.185200053526027e-07,
"loss": 0.0002,
"reward": 0.1611488163471222,
"reward_std": 0.5946944504976273,
"rewards/reward_func": 0.1611488163471222,
"step": 5840
},
{
"completion_length": 137.359375,
"epoch": 0.7825505151880102,
"grad_norm": 4.3125,
"kl": 0.004825499141588807,
"learning_rate": 2.1744948481198983e-07,
"loss": 0.0002,
"reward": 0.5471408823505044,
"reward_std": 0.5473849456757307,
"rewards/reward_func": 0.5471408823505044,
"step": 5848
},
{
"completion_length": 159.34375,
"epoch": 0.7836210357286231,
"grad_norm": 3.4375,
"kl": 0.005440732988063246,
"learning_rate": 2.1637896427137695e-07,
"loss": 0.0002,
"reward": 0.4683985644951463,
"reward_std": 0.5685102045536041,
"rewards/reward_func": 0.4683985644951463,
"step": 5856
},
{
"completion_length": 161.59375,
"epoch": 0.7846915562692359,
"grad_norm": 4.5625,
"kl": 0.004569044103845954,
"learning_rate": 2.1530844373076407e-07,
"loss": 0.0002,
"reward": 0.0613291235640645,
"reward_std": 0.48243121802806854,
"rewards/reward_func": 0.0613291235640645,
"step": 5864
},
{
"completion_length": 170.6328125,
"epoch": 0.7857620768098488,
"grad_norm": 4.21875,
"kl": 0.004493650107178837,
"learning_rate": 2.1423792319015122e-07,
"loss": 0.0002,
"reward": 0.373017355799675,
"reward_std": 0.5189967537298799,
"rewards/reward_func": 0.373017355799675,
"step": 5872
},
{
"completion_length": 217.6640625,
"epoch": 0.7868325973504616,
"grad_norm": 3.5,
"kl": 0.003907823265763,
"learning_rate": 2.1316740264953834e-07,
"loss": 0.0002,
"reward": -0.019582286477088928,
"reward_std": 0.5519250631332397,
"rewards/reward_func": -0.019582286477088928,
"step": 5880
},
{
"completion_length": 177.4765625,
"epoch": 0.7879031178910746,
"grad_norm": 4.40625,
"kl": 0.004457623173948377,
"learning_rate": 2.1209688210892546e-07,
"loss": 0.0002,
"reward": 0.17010945454239845,
"reward_std": 0.5244961641728878,
"rewards/reward_func": 0.17010945454239845,
"step": 5888
},
{
"completion_length": 177.25,
"epoch": 0.7889736384316874,
"grad_norm": 4.9375,
"kl": 0.0046152446011547,
"learning_rate": 2.1102636156831259e-07,
"loss": 0.0002,
"reward": 0.20693709515035152,
"reward_std": 0.602562677115202,
"rewards/reward_func": 0.20693709515035152,
"step": 5896
},
{
"completion_length": 165.15625,
"epoch": 0.7900441589723003,
"grad_norm": 3.546875,
"kl": 0.004225551267154515,
"learning_rate": 2.099558410276997e-07,
"loss": 0.0002,
"reward": 0.3072157595306635,
"reward_std": 0.522355480119586,
"rewards/reward_func": 0.3072157595306635,
"step": 5904
},
{
"completion_length": 170.8125,
"epoch": 0.7911146795129131,
"grad_norm": 4.625,
"kl": 0.005012799199903384,
"learning_rate": 2.0888532048708686e-07,
"loss": 0.0002,
"reward": 0.320420335046947,
"reward_std": 0.43850363977253437,
"rewards/reward_func": 0.320420335046947,
"step": 5912
},
{
"completion_length": 158.8203125,
"epoch": 0.792185200053526,
"grad_norm": 5.78125,
"kl": 0.005518296180525795,
"learning_rate": 2.0781479994647398e-07,
"loss": 0.0002,
"reward": 0.12541838502511382,
"reward_std": 0.4963842146098614,
"rewards/reward_func": 0.12541838502511382,
"step": 5920
},
{
"completion_length": 165.421875,
"epoch": 0.7932557205941388,
"grad_norm": 3.5,
"kl": 0.004109891131520271,
"learning_rate": 2.067442794058611e-07,
"loss": 0.0002,
"reward": 0.48840315639972687,
"reward_std": 0.5170729719102383,
"rewards/reward_func": 0.48840315639972687,
"step": 5928
},
{
"completion_length": 206.5,
"epoch": 0.7943262411347518,
"grad_norm": 2.671875,
"kl": 0.004218890477204695,
"learning_rate": 2.0567375886524822e-07,
"loss": 0.0002,
"reward": 0.017320919781923294,
"reward_std": 0.5469899624586105,
"rewards/reward_func": 0.017320919781923294,
"step": 5936
},
{
"completion_length": 189.0390625,
"epoch": 0.7953967616753647,
"grad_norm": 3.5,
"kl": 0.004204195429338142,
"learning_rate": 2.0460323832463537e-07,
"loss": 0.0002,
"reward": 0.05206027068197727,
"reward_std": 0.5685999430716038,
"rewards/reward_func": 0.05206027068197727,
"step": 5944
},
{
"completion_length": 182.0,
"epoch": 0.7964672822159775,
"grad_norm": 3.140625,
"kl": 0.004214008251437917,
"learning_rate": 2.035327177840225e-07,
"loss": 0.0002,
"reward": 0.04203222133219242,
"reward_std": 0.5610231403261423,
"rewards/reward_func": 0.04203222133219242,
"step": 5952
},
{
"completion_length": 179.5078125,
"epoch": 0.7975378027565904,
"grad_norm": 3.84375,
"kl": 0.004752454871777445,
"learning_rate": 2.024621972434096e-07,
"loss": 0.0002,
"reward": -0.10599182732403278,
"reward_std": 0.6240234952419996,
"rewards/reward_func": -0.10599182732403278,
"step": 5960
},
{
"completion_length": 166.1875,
"epoch": 0.7986083232972032,
"grad_norm": 4.96875,
"kl": 0.004443921585334465,
"learning_rate": 2.0139167670279673e-07,
"loss": 0.0002,
"reward": 0.329925112426281,
"reward_std": 0.42279865965247154,
"rewards/reward_func": 0.329925112426281,
"step": 5968
},
{
"completion_length": 152.3515625,
"epoch": 0.7996788438378162,
"grad_norm": 3.21875,
"kl": 0.004863968148129061,
"learning_rate": 2.0032115616218383e-07,
"loss": 0.0002,
"reward": 0.2605556510388851,
"reward_std": 0.46567713283002377,
"rewards/reward_func": 0.2605556510388851,
"step": 5976
},
{
"completion_length": 171.7578125,
"epoch": 0.800749364378429,
"grad_norm": 4.09375,
"kl": 0.004715076414868236,
"learning_rate": 1.99250635621571e-07,
"loss": 0.0002,
"reward": -0.13541333191096783,
"reward_std": 0.6721258126199245,
"rewards/reward_func": -0.13541333191096783,
"step": 5984
},
{
"completion_length": 143.6640625,
"epoch": 0.8018198849190419,
"grad_norm": 4.5,
"kl": 0.0056079100468195975,
"learning_rate": 1.981801150809581e-07,
"loss": 0.0002,
"reward": 0.3414863357320428,
"reward_std": 0.47020469419658184,
"rewards/reward_func": 0.3414863357320428,
"step": 5992
},
{
"completion_length": 168.5078125,
"epoch": 0.8028904054596547,
"grad_norm": 3.46875,
"kl": 0.004616849677404389,
"learning_rate": 1.9710959454034522e-07,
"loss": 0.0002,
"reward": 0.24850520677864552,
"reward_std": 0.5514967441558838,
"rewards/reward_func": 0.24850520677864552,
"step": 6000
},
{
"completion_length": 168.734375,
"epoch": 0.8039609260002676,
"grad_norm": 2.578125,
"kl": 0.004905187961412594,
"learning_rate": 1.9603907399973234e-07,
"loss": 0.0002,
"reward": 0.29693731665611267,
"reward_std": 0.5282188858836889,
"rewards/reward_func": 0.29693731665611267,
"step": 6008
},
{
"completion_length": 172.6328125,
"epoch": 0.8050314465408805,
"grad_norm": 3.625,
"kl": 0.00480208353837952,
"learning_rate": 1.949685534591195e-07,
"loss": 0.0002,
"reward": 0.40039923787117004,
"reward_std": 0.5602267645299435,
"rewards/reward_func": 0.40039923787117004,
"step": 6016
},
{
"completion_length": 148.671875,
"epoch": 0.8061019670814934,
"grad_norm": 6.78125,
"kl": 0.005018723517423496,
"learning_rate": 1.938980329185066e-07,
"loss": 0.0002,
"reward": 0.3693223036825657,
"reward_std": 0.4133735718205571,
"rewards/reward_func": 0.3693223036825657,
"step": 6024
},
{
"completion_length": 168.359375,
"epoch": 0.8071724876221062,
"grad_norm": 5.03125,
"kl": 0.004699432494817302,
"learning_rate": 1.9282751237789373e-07,
"loss": 0.0002,
"reward": 0.29968111030757427,
"reward_std": 0.5234957840293646,
"rewards/reward_func": 0.29968111030757427,
"step": 6032
},
{
"completion_length": 172.1171875,
"epoch": 0.8082430081627191,
"grad_norm": 4.40625,
"kl": 0.0047337598516605794,
"learning_rate": 1.9175699183728085e-07,
"loss": 0.0002,
"reward": -0.13689319603145123,
"reward_std": 0.5464825332164764,
"rewards/reward_func": -0.13689319603145123,
"step": 6040
},
{
"completion_length": 187.53125,
"epoch": 0.809313528703332,
"grad_norm": 4.71875,
"kl": 0.0043419343419373035,
"learning_rate": 1.9068647129666797e-07,
"loss": 0.0002,
"reward": 0.12531755585223436,
"reward_std": 0.7370849475264549,
"rewards/reward_func": 0.12531755585223436,
"step": 6048
},
{
"completion_length": 173.359375,
"epoch": 0.8103840492439449,
"grad_norm": 3.09375,
"kl": 0.004423053003847599,
"learning_rate": 1.8961595075605512e-07,
"loss": 0.0002,
"reward": 0.45560589246451855,
"reward_std": 0.3819491732865572,
"rewards/reward_func": 0.45560589246451855,
"step": 6056
},
{
"completion_length": 173.4296875,
"epoch": 0.8114545697845578,
"grad_norm": 4.8125,
"kl": 0.004857113177422434,
"learning_rate": 1.8854543021544224e-07,
"loss": 0.0002,
"reward": 0.08866522740572691,
"reward_std": 0.5376447830349207,
"rewards/reward_func": 0.08866522740572691,
"step": 6064
},
{
"completion_length": 164.6015625,
"epoch": 0.8125250903251706,
"grad_norm": 3.328125,
"kl": 0.004501277348026633,
"learning_rate": 1.8747490967482937e-07,
"loss": 0.0002,
"reward": 0.2745439810678363,
"reward_std": 0.4785211766138673,
"rewards/reward_func": 0.2745439810678363,
"step": 6072
},
{
"completion_length": 160.1796875,
"epoch": 0.8135956108657835,
"grad_norm": 4.78125,
"kl": 0.004779946495546028,
"learning_rate": 1.864043891342165e-07,
"loss": 0.0002,
"reward": 0.22340465802699327,
"reward_std": 0.557499123737216,
"rewards/reward_func": 0.22340465802699327,
"step": 6080
},
{
"completion_length": 160.6171875,
"epoch": 0.8146661314063963,
"grad_norm": 6.59375,
"kl": 0.005315470625646412,
"learning_rate": 1.8533386859360364e-07,
"loss": 0.0002,
"reward": 0.12089579226449132,
"reward_std": 0.6152683198451996,
"rewards/reward_func": 0.12089579226449132,
"step": 6088
},
{
"completion_length": 180.1171875,
"epoch": 0.8157366519470093,
"grad_norm": 4.90625,
"kl": 0.0042492037755437195,
"learning_rate": 1.8426334805299076e-07,
"loss": 0.0002,
"reward": 0.18059484660625458,
"reward_std": 0.5923185907304287,
"rewards/reward_func": 0.18059484660625458,
"step": 6096
},
{
"completion_length": 149.1796875,
"epoch": 0.8168071724876221,
"grad_norm": 5.125,
"kl": 0.00574629902257584,
"learning_rate": 1.8319282751237788e-07,
"loss": 0.0002,
"reward": 0.2305867071263492,
"reward_std": 0.47459197975695133,
"rewards/reward_func": 0.2305867071263492,
"step": 6104
},
{
"completion_length": 160.0703125,
"epoch": 0.817877693028235,
"grad_norm": 4.65625,
"kl": 0.005720962421037257,
"learning_rate": 1.82122306971765e-07,
"loss": 0.0002,
"reward": 0.4231163961812854,
"reward_std": 0.49531901255249977,
"rewards/reward_func": 0.4231163961812854,
"step": 6112
},
{
"completion_length": 179.625,
"epoch": 0.8189482135688478,
"grad_norm": 3.609375,
"kl": 0.004402774036861956,
"learning_rate": 1.8105178643115212e-07,
"loss": 0.0002,
"reward": -0.09616492129862309,
"reward_std": 0.5352960834279656,
"rewards/reward_func": -0.09616492129862309,
"step": 6120
},
{
"completion_length": 197.9921875,
"epoch": 0.8200187341094607,
"grad_norm": 3.96875,
"kl": 0.004230510094203055,
"learning_rate": 1.7998126589053927e-07,
"loss": 0.0002,
"reward": 0.09585804212838411,
"reward_std": 0.6544227637350559,
"rewards/reward_func": 0.09585804212838411,
"step": 6128
},
{
"completion_length": 174.0390625,
"epoch": 0.8210892546500737,
"grad_norm": 3.8125,
"kl": 0.004287142743123695,
"learning_rate": 1.789107453499264e-07,
"loss": 0.0002,
"reward": 0.2640516827814281,
"reward_std": 0.5714995982125401,
"rewards/reward_func": 0.2640516827814281,
"step": 6136
},
{
"completion_length": 171.5234375,
"epoch": 0.8221597751906865,
"grad_norm": 2.703125,
"kl": 0.004413856513565406,
"learning_rate": 1.778402248093135e-07,
"loss": 0.0002,
"reward": 0.32004706375300884,
"reward_std": 0.6919787935912609,
"rewards/reward_func": 0.32004706375300884,
"step": 6144
},
{
"completion_length": 164.6640625,
"epoch": 0.8232302957312994,
"grad_norm": 3.40625,
"kl": 0.004383451188914478,
"learning_rate": 1.7676970426870063e-07,
"loss": 0.0002,
"reward": 0.3686336353421211,
"reward_std": 0.5524613773450255,
"rewards/reward_func": 0.3686336353421211,
"step": 6152
},
{
"completion_length": 196.8203125,
"epoch": 0.8243008162719122,
"grad_norm": 3.078125,
"kl": 0.0042349822469986975,
"learning_rate": 1.7569918372808778e-07,
"loss": 0.0002,
"reward": 0.04516376554965973,
"reward_std": 0.5202826540917158,
"rewards/reward_func": 0.04516376554965973,
"step": 6160
},
{
"completion_length": 150.421875,
"epoch": 0.8253713368125251,
"grad_norm": 2.890625,
"kl": 0.004983038583304733,
"learning_rate": 1.746286631874749e-07,
"loss": 0.0002,
"reward": 0.320843068882823,
"reward_std": 0.4479983486235142,
"rewards/reward_func": 0.320843068882823,
"step": 6168
},
{
"completion_length": 180.46875,
"epoch": 0.8264418573531379,
"grad_norm": 4.71875,
"kl": 0.004280634428141639,
"learning_rate": 1.7355814264686203e-07,
"loss": 0.0002,
"reward": 0.42457358445972204,
"reward_std": 0.6277044154703617,
"rewards/reward_func": 0.42457358445972204,
"step": 6176
},
{
"completion_length": 177.421875,
"epoch": 0.8275123778937509,
"grad_norm": 4.3125,
"kl": 0.003841431171167642,
"learning_rate": 1.7248762210624915e-07,
"loss": 0.0002,
"reward": 0.4124793987721205,
"reward_std": 0.5699762850999832,
"rewards/reward_func": 0.4124793987721205,
"step": 6184
},
{
"completion_length": 165.5625,
"epoch": 0.8285828984343637,
"grad_norm": 3.53125,
"kl": 0.0045668908569496125,
"learning_rate": 1.714171015656363e-07,
"loss": 0.0002,
"reward": 0.24390191398561,
"reward_std": 0.5946025252342224,
"rewards/reward_func": 0.24390191398561,
"step": 6192
},
{
"completion_length": 144.796875,
"epoch": 0.8296534189749766,
"grad_norm": 6.15625,
"kl": 0.005119076173286885,
"learning_rate": 1.7034658102502342e-07,
"loss": 0.0002,
"reward": 0.37631342001259327,
"reward_std": 0.54392384365201,
"rewards/reward_func": 0.37631342001259327,
"step": 6200
},
{
"completion_length": 162.703125,
"epoch": 0.8307239395155894,
"grad_norm": 3.65625,
"kl": 0.004819765774300322,
"learning_rate": 1.6927606048441054e-07,
"loss": 0.0002,
"reward": 0.3171768644824624,
"reward_std": 0.6571879032999277,
"rewards/reward_func": 0.3171768644824624,
"step": 6208
},
{
"completion_length": 176.15625,
"epoch": 0.8317944600562023,
"grad_norm": 4.4375,
"kl": 0.004749486513901502,
"learning_rate": 1.6820553994379766e-07,
"loss": 0.0002,
"reward": 0.32477567065507174,
"reward_std": 0.584898017346859,
"rewards/reward_func": 0.32477567065507174,
"step": 6216
},
{
"completion_length": 164.546875,
"epoch": 0.8328649805968152,
"grad_norm": 4.96875,
"kl": 0.005459955689730123,
"learning_rate": 1.6713501940318478e-07,
"loss": 0.0002,
"reward": 0.3247902784496546,
"reward_std": 0.6046720538288355,
"rewards/reward_func": 0.3247902784496546,
"step": 6224
},
{
"completion_length": 173.3828125,
"epoch": 0.8339355011374281,
"grad_norm": 3.671875,
"kl": 0.005025158607168123,
"learning_rate": 1.6606449886257193e-07,
"loss": 0.0002,
"reward": 0.438681710511446,
"reward_std": 0.41498881857842207,
"rewards/reward_func": 0.438681710511446,
"step": 6232
},
{
"completion_length": 149.4453125,
"epoch": 0.8350060216780409,
"grad_norm": 4.125,
"kl": 0.0048881605616770685,
"learning_rate": 1.6499397832195905e-07,
"loss": 0.0002,
"reward": 0.3972213324159384,
"reward_std": 0.522408589720726,
"rewards/reward_func": 0.3972213324159384,
"step": 6240
},
{
"completion_length": 142.03125,
"epoch": 0.8360765422186538,
"grad_norm": 5.9375,
"kl": 0.00615677481982857,
"learning_rate": 1.6392345778134617e-07,
"loss": 0.0002,
"reward": 0.5399059653282166,
"reward_std": 0.5134044801816344,
"rewards/reward_func": 0.5399059653282166,
"step": 6248
},
{
"completion_length": 139.890625,
"epoch": 0.8371470627592666,
"grad_norm": 4.40625,
"kl": 0.005498810496646911,
"learning_rate": 1.628529372407333e-07,
"loss": 0.0002,
"reward": 0.26559029519557953,
"reward_std": 0.7036202065646648,
"rewards/reward_func": 0.26559029519557953,
"step": 6256
},
{
"completion_length": 167.046875,
"epoch": 0.8382175832998796,
"grad_norm": 3.8125,
"kl": 0.005057969567133114,
"learning_rate": 1.6178241670012044e-07,
"loss": 0.0002,
"reward": 0.26883680559694767,
"reward_std": 0.6107715517282486,
"rewards/reward_func": 0.26883680559694767,
"step": 6264
},
{
"completion_length": 183.6015625,
"epoch": 0.8392881038404925,
"grad_norm": 3.78125,
"kl": 0.00433379874448292,
"learning_rate": 1.6071189615950756e-07,
"loss": 0.0002,
"reward": 0.08581209369003773,
"reward_std": 0.601530484855175,
"rewards/reward_func": 0.08581209369003773,
"step": 6272
},
{
"completion_length": 177.2265625,
"epoch": 0.8403586243811053,
"grad_norm": 4.71875,
"kl": 0.0044705503969453275,
"learning_rate": 1.5964137561889469e-07,
"loss": 0.0002,
"reward": 0.23431246215477586,
"reward_std": 0.5960433762520552,
"rewards/reward_func": 0.23431246215477586,
"step": 6280
},
{
"completion_length": 152.078125,
"epoch": 0.8414291449217182,
"grad_norm": 4.8125,
"kl": 0.005255370575468987,
"learning_rate": 1.585708550782818e-07,
"loss": 0.0002,
"reward": 0.37102524004876614,
"reward_std": 0.6371021419763565,
"rewards/reward_func": 0.37102524004876614,
"step": 6288
},
{
"completion_length": 217.2734375,
"epoch": 0.842499665462331,
"grad_norm": 4.375,
"kl": 0.003196624806150794,
"learning_rate": 1.5750033453766893e-07,
"loss": 0.0001,
"reward": -0.045689786318689585,
"reward_std": 0.4852413050830364,
"rewards/reward_func": -0.045689786318689585,
"step": 6296
},
{
"completion_length": 227.7265625,
"epoch": 0.843570186002944,
"grad_norm": 3.71875,
"kl": 0.004137254873057827,
"learning_rate": 1.5642981399705608e-07,
"loss": 0.0002,
"reward": 6.577186286449432e-05,
"reward_std": 0.4605599669739604,
"rewards/reward_func": 6.577186286449432e-05,
"step": 6304
},
{
"completion_length": 176.59375,
"epoch": 0.8446407065435568,
"grad_norm": 3.25,
"kl": 0.004376317374408245,
"learning_rate": 1.553592934564432e-07,
"loss": 0.0002,
"reward": 0.12455911561846733,
"reward_std": 0.6250845305621624,
"rewards/reward_func": 0.12455911561846733,
"step": 6312
},
{
"completion_length": 150.9609375,
"epoch": 0.8457112270841697,
"grad_norm": 5.0625,
"kl": 0.004825094016268849,
"learning_rate": 1.5428877291583032e-07,
"loss": 0.0002,
"reward": 0.37383434921503067,
"reward_std": 0.6138091459870338,
"rewards/reward_func": 0.37383434921503067,
"step": 6320
},
{
"completion_length": 179.15625,
"epoch": 0.8467817476247825,
"grad_norm": 3.984375,
"kl": 0.004287428979296237,
"learning_rate": 1.5321825237521744e-07,
"loss": 0.0002,
"reward": 0.3161802035756409,
"reward_std": 0.5376028679311275,
"rewards/reward_func": 0.3161802035756409,
"step": 6328
},
{
"completion_length": 145.8828125,
"epoch": 0.8478522681653954,
"grad_norm": 3.171875,
"kl": 0.0048953695222735405,
"learning_rate": 1.521477318346046e-07,
"loss": 0.0002,
"reward": 0.27396881859749556,
"reward_std": 0.6160639338195324,
"rewards/reward_func": 0.27396881859749556,
"step": 6336
},
{
"completion_length": 170.7578125,
"epoch": 0.8489227887060083,
"grad_norm": 3.171875,
"kl": 0.004435895767528564,
"learning_rate": 1.510772112939917e-07,
"loss": 0.0002,
"reward": 0.34336171485483646,
"reward_std": 0.6223765797913074,
"rewards/reward_func": 0.34336171485483646,
"step": 6344
},
{
"completion_length": 134.5,
"epoch": 0.8499933092466212,
"grad_norm": 4.4375,
"kl": 0.005276092153508216,
"learning_rate": 1.5000669075337883e-07,
"loss": 0.0002,
"reward": 0.4353441474959254,
"reward_std": 0.5333261359483004,
"rewards/reward_func": 0.4353441474959254,
"step": 6352
},
{
"completion_length": 137.9921875,
"epoch": 0.851063829787234,
"grad_norm": 5.21875,
"kl": 0.0056113199389074,
"learning_rate": 1.4893617021276595e-07,
"loss": 0.0002,
"reward": 0.11142583098262548,
"reward_std": 0.6482997722923756,
"rewards/reward_func": 0.11142583098262548,
"step": 6360
},
{
"completion_length": 212.765625,
"epoch": 0.8521343503278469,
"grad_norm": 4.625,
"kl": 0.004180938733043149,
"learning_rate": 1.4786564967215308e-07,
"loss": 0.0002,
"reward": -0.04978405591100454,
"reward_std": 0.6307705044746399,
"rewards/reward_func": -0.04978405591100454,
"step": 6368
},
{
"completion_length": 156.7265625,
"epoch": 0.8532048708684598,
"grad_norm": 2.640625,
"kl": 0.005362185067497194,
"learning_rate": 1.4679512913154022e-07,
"loss": 0.0002,
"reward": -0.013063086196780205,
"reward_std": 0.5464332979172468,
"rewards/reward_func": -0.013063086196780205,
"step": 6376
},
{
"completion_length": 173.03125,
"epoch": 0.8542753914090727,
"grad_norm": 3.921875,
"kl": 0.004632528842194006,
"learning_rate": 1.4572460859092734e-07,
"loss": 0.0002,
"reward": 0.34282067604362965,
"reward_std": 0.622871071100235,
"rewards/reward_func": 0.34282067604362965,
"step": 6384
},
{
"completion_length": 169.6796875,
"epoch": 0.8553459119496856,
"grad_norm": 4.0,
"kl": 0.004290186625439674,
"learning_rate": 1.4465408805031447e-07,
"loss": 0.0002,
"reward": 0.2828236762434244,
"reward_std": 0.4671051539480686,
"rewards/reward_func": 0.2828236762434244,
"step": 6392
},
{
"completion_length": 163.65625,
"epoch": 0.8564164324902984,
"grad_norm": 3.9375,
"kl": 0.005294958682497963,
"learning_rate": 1.435835675097016e-07,
"loss": 0.0002,
"reward": 0.46301793679594994,
"reward_std": 0.5075423391535878,
"rewards/reward_func": 0.46301793679594994,
"step": 6400
},
{
"completion_length": 157.46875,
"epoch": 0.8574869530309113,
"grad_norm": 3.703125,
"kl": 0.004041396110551432,
"learning_rate": 1.4251304696908874e-07,
"loss": 0.0002,
"reward": 0.48526691645383835,
"reward_std": 0.3719025030732155,
"rewards/reward_func": 0.48526691645383835,
"step": 6408
},
{
"completion_length": 146.40625,
"epoch": 0.8585574735715241,
"grad_norm": 5.5,
"kl": 0.005460718472022563,
"learning_rate": 1.4144252642847586e-07,
"loss": 0.0002,
"reward": 0.2532842471264303,
"reward_std": 0.44650126062333584,
"rewards/reward_func": 0.2532842471264303,
"step": 6416
},
{
"completion_length": 183.3203125,
"epoch": 0.859627994112137,
"grad_norm": 4.09375,
"kl": 0.004378183133667335,
"learning_rate": 1.4037200588786295e-07,
"loss": 0.0002,
"reward": 0.13037376385182142,
"reward_std": 0.5470245387405157,
"rewards/reward_func": 0.13037376385182142,
"step": 6424
},
{
"completion_length": 169.9921875,
"epoch": 0.8606985146527499,
"grad_norm": 3.890625,
"kl": 0.0042949684138875455,
"learning_rate": 1.3930148534725007e-07,
"loss": 0.0002,
"reward": 0.1133667528629303,
"reward_std": 0.5052687106654048,
"rewards/reward_func": 0.1133667528629303,
"step": 6432
},
{
"completion_length": 149.46875,
"epoch": 0.8617690351933628,
"grad_norm": 3.203125,
"kl": 0.0049638144264463335,
"learning_rate": 1.3823096480663722e-07,
"loss": 0.0002,
"reward": 0.3671250296756625,
"reward_std": 0.7020149789750576,
"rewards/reward_func": 0.3671250296756625,
"step": 6440
},
{
"completion_length": 177.7578125,
"epoch": 0.8628395557339756,
"grad_norm": 3.15625,
"kl": 0.004953681491315365,
"learning_rate": 1.3716044426602434e-07,
"loss": 0.0002,
"reward": -0.07036676816642284,
"reward_std": 0.5223548822104931,
"rewards/reward_func": -0.07036676816642284,
"step": 6448
},
{
"completion_length": 153.890625,
"epoch": 0.8639100762745885,
"grad_norm": 4.0625,
"kl": 0.004841944552026689,
"learning_rate": 1.3608992372541147e-07,
"loss": 0.0002,
"reward": 0.2930979495868087,
"reward_std": 0.6658169776201248,
"rewards/reward_func": 0.2930979495868087,
"step": 6456
},
{
"completion_length": 163.359375,
"epoch": 0.8649805968152013,
"grad_norm": 3.796875,
"kl": 0.004769285937072709,
"learning_rate": 1.3501940318479859e-07,
"loss": 0.0002,
"reward": 0.48341894522309303,
"reward_std": 0.4925485821440816,
"rewards/reward_func": 0.48341894522309303,
"step": 6464
},
{
"completion_length": 160.4765625,
"epoch": 0.8660511173558143,
"grad_norm": 5.5625,
"kl": 0.005216082121478394,
"learning_rate": 1.339488826441857e-07,
"loss": 0.0002,
"reward": 0.26688177324831486,
"reward_std": 0.5798533223569393,
"rewards/reward_func": 0.26688177324831486,
"step": 6472
},
{
"completion_length": 180.109375,
"epoch": 0.8671216378964272,
"grad_norm": 4.34375,
"kl": 0.005117598222568631,
"learning_rate": 1.3287836210357286e-07,
"loss": 0.0002,
"reward": 0.0437483387067914,
"reward_std": 0.5821977593004704,
"rewards/reward_func": 0.0437483387067914,
"step": 6480
},
{
"completion_length": 203.53125,
"epoch": 0.86819215843704,
"grad_norm": 3.65625,
"kl": 0.004146075778407976,
"learning_rate": 1.3180784156295998e-07,
"loss": 0.0002,
"reward": -0.04123528301715851,
"reward_std": 0.5794482082128525,
"rewards/reward_func": -0.04123528301715851,
"step": 6488
},
{
"completion_length": 174.515625,
"epoch": 0.8692626789776529,
"grad_norm": 3.046875,
"kl": 0.004450612410437316,
"learning_rate": 1.307373210223471e-07,
"loss": 0.0002,
"reward": 0.17205783817917109,
"reward_std": 0.5600821115076542,
"rewards/reward_func": 0.17205783817917109,
"step": 6496
},
{
"completion_length": 160.5078125,
"epoch": 0.8703331995182657,
"grad_norm": 4.40625,
"kl": 0.005854069750057533,
"learning_rate": 1.2966680048173422e-07,
"loss": 0.0002,
"reward": 0.5184466666541994,
"reward_std": 0.43986151926219463,
"rewards/reward_func": 0.5184466666541994,
"step": 6504
},
{
"completion_length": 164.9296875,
"epoch": 0.8714037200588787,
"grad_norm": 3.484375,
"kl": 0.005087268742499873,
"learning_rate": 1.2859627994112137e-07,
"loss": 0.0002,
"reward": 0.15493404306471348,
"reward_std": 0.5500355400145054,
"rewards/reward_func": 0.15493404306471348,
"step": 6512
},
{
"completion_length": 166.6953125,
"epoch": 0.8724742405994915,
"grad_norm": 2.40625,
"kl": 0.005023477482609451,
"learning_rate": 1.275257594005085e-07,
"loss": 0.0002,
"reward": 0.21355824172496796,
"reward_std": 0.695870652794838,
"rewards/reward_func": 0.21355824172496796,
"step": 6520
},
{
"completion_length": 179.40625,
"epoch": 0.8735447611401044,
"grad_norm": 3.671875,
"kl": 0.0044980833772569895,
"learning_rate": 1.264552388598956e-07,
"loss": 0.0002,
"reward": 0.31162807578220963,
"reward_std": 0.49335628002882004,
"rewards/reward_func": 0.31162807578220963,
"step": 6528
},
{
"completion_length": 170.890625,
"epoch": 0.8746152816807172,
"grad_norm": 4.34375,
"kl": 0.004584902344504371,
"learning_rate": 1.2538471831928273e-07,
"loss": 0.0002,
"reward": 0.39711445942521095,
"reward_std": 0.4620585907250643,
"rewards/reward_func": 0.39711445942521095,
"step": 6536
},
{
"completion_length": 196.34375,
"epoch": 0.8756858022213301,
"grad_norm": 6.03125,
"kl": 0.004008692951174453,
"learning_rate": 1.2431419777866988e-07,
"loss": 0.0002,
"reward": 0.14655437879264355,
"reward_std": 0.5025924574583769,
"rewards/reward_func": 0.14655437879264355,
"step": 6544
},
{
"completion_length": 157.1328125,
"epoch": 0.876756322761943,
"grad_norm": 4.5,
"kl": 0.005272853362839669,
"learning_rate": 1.23243677238057e-07,
"loss": 0.0002,
"reward": 0.16074330359697342,
"reward_std": 0.4522952139377594,
"rewards/reward_func": 0.16074330359697342,
"step": 6552
},
{
"completion_length": 170.0390625,
"epoch": 0.8778268433025559,
"grad_norm": 3.71875,
"kl": 0.0051819840737152845,
"learning_rate": 1.2217315669744412e-07,
"loss": 0.0002,
"reward": 0.3139108493924141,
"reward_std": 0.4983799997717142,
"rewards/reward_func": 0.3139108493924141,
"step": 6560
},
{
"completion_length": 160.1953125,
"epoch": 0.8788973638431687,
"grad_norm": 2.5,
"kl": 0.004580837674438953,
"learning_rate": 1.2110263615683125e-07,
"loss": 0.0002,
"reward": 0.35805173218250275,
"reward_std": 0.4121380029246211,
"rewards/reward_func": 0.35805173218250275,
"step": 6568
},
{
"completion_length": 166.125,
"epoch": 0.8799678843837816,
"grad_norm": 3.890625,
"kl": 0.005891179316677153,
"learning_rate": 1.200321156162184e-07,
"loss": 0.0002,
"reward": 0.3935977406799793,
"reward_std": 0.4564328156411648,
"rewards/reward_func": 0.3935977406799793,
"step": 6576
},
{
"completion_length": 161.828125,
"epoch": 0.8810384049243944,
"grad_norm": 4.03125,
"kl": 0.004937338293530047,
"learning_rate": 1.189615950756055e-07,
"loss": 0.0002,
"reward": 0.3541194014251232,
"reward_std": 0.7327413186430931,
"rewards/reward_func": 0.3541194014251232,
"step": 6584
},
{
"completion_length": 166.890625,
"epoch": 0.8821089254650074,
"grad_norm": 4.0,
"kl": 0.004368811612948775,
"learning_rate": 1.1789107453499264e-07,
"loss": 0.0002,
"reward": 0.43425997346639633,
"reward_std": 0.5963248610496521,
"rewards/reward_func": 0.43425997346639633,
"step": 6592
},
{
"completion_length": 139.5,
"epoch": 0.8831794460056203,
"grad_norm": 3.546875,
"kl": 0.006153674854431301,
"learning_rate": 1.1682055399437976e-07,
"loss": 0.0002,
"reward": 0.4587271837517619,
"reward_std": 0.5946958791464567,
"rewards/reward_func": 0.4587271837517619,
"step": 6600
},
{
"completion_length": 160.2890625,
"epoch": 0.8842499665462331,
"grad_norm": 3.265625,
"kl": 0.004449796746484935,
"learning_rate": 1.1575003345376688e-07,
"loss": 0.0002,
"reward": 0.3748700972646475,
"reward_std": 0.5290507553145289,
"rewards/reward_func": 0.3748700972646475,
"step": 6608
},
{
"completion_length": 175.03125,
"epoch": 0.885320487086846,
"grad_norm": 3.4375,
"kl": 0.0046757735253777355,
"learning_rate": 1.1467951291315402e-07,
"loss": 0.0002,
"reward": 0.36219143867492676,
"reward_std": 0.5348459035158157,
"rewards/reward_func": 0.36219143867492676,
"step": 6616
},
{
"completion_length": 159.0078125,
"epoch": 0.8863910076274588,
"grad_norm": 4.375,
"kl": 0.005130204517627135,
"learning_rate": 1.1360899237254114e-07,
"loss": 0.0002,
"reward": 0.40300269052386284,
"reward_std": 0.5223680902272463,
"rewards/reward_func": 0.40300269052386284,
"step": 6624
},
{
"completion_length": 168.6328125,
"epoch": 0.8874615281680718,
"grad_norm": 4.03125,
"kl": 0.00497715815436095,
"learning_rate": 1.1253847183192827e-07,
"loss": 0.0002,
"reward": 0.3870235029608011,
"reward_std": 0.6206906009465456,
"rewards/reward_func": 0.3870235029608011,
"step": 6632
},
{
"completion_length": 167.6328125,
"epoch": 0.8885320487086846,
"grad_norm": 2.75,
"kl": 0.004565039882436395,
"learning_rate": 1.1146795129131539e-07,
"loss": 0.0002,
"reward": 0.19453393667936325,
"reward_std": 0.43898776825517416,
"rewards/reward_func": 0.19453393667936325,
"step": 6640
},
{
"completion_length": 155.34375,
"epoch": 0.8896025692492975,
"grad_norm": 2.140625,
"kl": 0.004799488058779389,
"learning_rate": 1.1039743075070253e-07,
"loss": 0.0002,
"reward": 0.44809896126389503,
"reward_std": 0.476587675511837,
"rewards/reward_func": 0.44809896126389503,
"step": 6648
},
{
"completion_length": 182.6640625,
"epoch": 0.8906730897899103,
"grad_norm": 3.328125,
"kl": 0.0043890890083275735,
"learning_rate": 1.0932691021008965e-07,
"loss": 0.0002,
"reward": 0.289157398045063,
"reward_std": 0.5493966788053513,
"rewards/reward_func": 0.289157398045063,
"step": 6656
},
{
"completion_length": 150.7421875,
"epoch": 0.8917436103305232,
"grad_norm": 3.234375,
"kl": 0.004936346551403403,
"learning_rate": 1.0825638966947678e-07,
"loss": 0.0002,
"reward": 0.5019879713654518,
"reward_std": 0.550602201372385,
"rewards/reward_func": 0.5019879713654518,
"step": 6664
},
{
"completion_length": 141.7265625,
"epoch": 0.892814130871136,
"grad_norm": 5.34375,
"kl": 0.0059457606403157115,
"learning_rate": 1.071858691288639e-07,
"loss": 0.0002,
"reward": 0.4372959118336439,
"reward_std": 0.46707610227167606,
"rewards/reward_func": 0.4372959118336439,
"step": 6672
},
{
"completion_length": 170.8046875,
"epoch": 0.893884651411749,
"grad_norm": 3.703125,
"kl": 0.004617019789293408,
"learning_rate": 1.0611534858825104e-07,
"loss": 0.0002,
"reward": 0.06940314406529069,
"reward_std": 0.6122306901961565,
"rewards/reward_func": 0.06940314406529069,
"step": 6680
},
{
"completion_length": 177.7265625,
"epoch": 0.8949551719523618,
"grad_norm": 2.578125,
"kl": 0.004395580617710948,
"learning_rate": 1.0504482804763816e-07,
"loss": 0.0002,
"reward": 0.09883344545960426,
"reward_std": 0.5640975758433342,
"rewards/reward_func": 0.09883344545960426,
"step": 6688
},
{
"completion_length": 179.0859375,
"epoch": 0.8960256924929747,
"grad_norm": 4.40625,
"kl": 0.004834166058572009,
"learning_rate": 1.0397430750702528e-07,
"loss": 0.0002,
"reward": 0.2342253029346466,
"reward_std": 0.6865712143480778,
"rewards/reward_func": 0.2342253029346466,
"step": 6696
},
{
"completion_length": 161.90625,
"epoch": 0.8970962130335876,
"grad_norm": 5.90625,
"kl": 0.004615583224222064,
"learning_rate": 1.0290378696641242e-07,
"loss": 0.0002,
"reward": 0.18776031211018562,
"reward_std": 0.5335487443953753,
"rewards/reward_func": 0.18776031211018562,
"step": 6704
},
{
"completion_length": 191.4453125,
"epoch": 0.8981667335742004,
"grad_norm": 4.46875,
"kl": 0.004502045980188996,
"learning_rate": 1.0183326642579954e-07,
"loss": 0.0002,
"reward": 0.22535105049610138,
"reward_std": 0.4681578129529953,
"rewards/reward_func": 0.22535105049610138,
"step": 6712
},
{
"completion_length": 187.609375,
"epoch": 0.8992372541148134,
"grad_norm": 4.875,
"kl": 0.004247891949489713,
"learning_rate": 1.0076274588518667e-07,
"loss": 0.0002,
"reward": 0.1750158555805683,
"reward_std": 0.6813812926411629,
"rewards/reward_func": 0.1750158555805683,
"step": 6720
},
{
"completion_length": 162.8203125,
"epoch": 0.9003077746554262,
"grad_norm": 4.25,
"kl": 0.004933495947625488,
"learning_rate": 9.96922253445738e-08,
"loss": 0.0002,
"reward": 0.23774974327534437,
"reward_std": 0.48002783581614494,
"rewards/reward_func": 0.23774974327534437,
"step": 6728
},
{
"completion_length": 162.859375,
"epoch": 0.9013782951960391,
"grad_norm": 4.5625,
"kl": 0.005467957467772067,
"learning_rate": 9.862170480396093e-08,
"loss": 0.0002,
"reward": -0.0023063644766807556,
"reward_std": 0.564548920840025,
"rewards/reward_func": -0.0023063644766807556,
"step": 6736
},
{
"completion_length": 187.5078125,
"epoch": 0.9024488157366519,
"grad_norm": 5.84375,
"kl": 0.004037181934108958,
"learning_rate": 9.755118426334805e-08,
"loss": 0.0002,
"reward": 0.27846864983439445,
"reward_std": 0.5254440493881702,
"rewards/reward_func": 0.27846864983439445,
"step": 6744
},
{
"completion_length": 174.0,
"epoch": 0.9035193362772648,
"grad_norm": 3.046875,
"kl": 0.004285787290427834,
"learning_rate": 9.648066372273519e-08,
"loss": 0.0002,
"reward": 0.22628629952669144,
"reward_std": 0.49990267865359783,
"rewards/reward_func": 0.22628629952669144,
"step": 6752
},
{
"completion_length": 164.0,
"epoch": 0.9045898568178777,
"grad_norm": 4.15625,
"kl": 0.005695787549484521,
"learning_rate": 9.54101431821223e-08,
"loss": 0.0002,
"reward": 0.26332173496484756,
"reward_std": 0.6010408755391836,
"rewards/reward_func": 0.26332173496484756,
"step": 6760
},
{
"completion_length": 165.484375,
"epoch": 0.9056603773584906,
"grad_norm": 2.65625,
"kl": 0.0045514948724303395,
"learning_rate": 9.433962264150943e-08,
"loss": 0.0002,
"reward": 0.1936313882470131,
"reward_std": 0.6337927635759115,
"rewards/reward_func": 0.1936313882470131,
"step": 6768
},
{
"completion_length": 161.5078125,
"epoch": 0.9067308978991034,
"grad_norm": 4.5,
"kl": 0.0048291504790540785,
"learning_rate": 9.326910210089655e-08,
"loss": 0.0002,
"reward": 0.137207493185997,
"reward_std": 0.5269910991191864,
"rewards/reward_func": 0.137207493185997,
"step": 6776
},
{
"completion_length": 152.515625,
"epoch": 0.9078014184397163,
"grad_norm": 4.34375,
"kl": 0.0052593986911233515,
"learning_rate": 9.219858156028367e-08,
"loss": 0.0002,
"reward": 0.5747008826583624,
"reward_std": 0.4051123149693012,
"rewards/reward_func": 0.5747008826583624,
"step": 6784
},
{
"completion_length": 182.9296875,
"epoch": 0.9088719389803291,
"grad_norm": 6.6875,
"kl": 0.004641913692466915,
"learning_rate": 9.112806101967081e-08,
"loss": 0.0002,
"reward": 0.21289030835032463,
"reward_std": 0.48070234432816505,
"rewards/reward_func": 0.21289030835032463,
"step": 6792
},
{
"completion_length": 174.640625,
"epoch": 0.9099424595209421,
"grad_norm": 3.25,
"kl": 0.004628196998964995,
"learning_rate": 9.005754047905793e-08,
"loss": 0.0002,
"reward": 0.1559174619615078,
"reward_std": 0.6800275854766369,
"rewards/reward_func": 0.1559174619615078,
"step": 6800
},
{
"completion_length": 157.203125,
"epoch": 0.911012980061555,
"grad_norm": 3.40625,
"kl": 0.006008526281220838,
"learning_rate": 8.898701993844506e-08,
"loss": 0.0002,
"reward": 0.12123461440205574,
"reward_std": 0.5197535315528512,
"rewards/reward_func": 0.12123461440205574,
"step": 6808
},
{
"completion_length": 164.4609375,
"epoch": 0.9120835006021678,
"grad_norm": 3.734375,
"kl": 0.004909445357043296,
"learning_rate": 8.791649939783219e-08,
"loss": 0.0002,
"reward": 0.4351821830496192,
"reward_std": 0.5581427849829197,
"rewards/reward_func": 0.4351821830496192,
"step": 6816
},
{
"completion_length": 157.296875,
"epoch": 0.9131540211427807,
"grad_norm": 2.625,
"kl": 0.004735041700769216,
"learning_rate": 8.684597885721932e-08,
"loss": 0.0002,
"reward": 0.209370581433177,
"reward_std": 0.5503848614171147,
"rewards/reward_func": 0.209370581433177,
"step": 6824
},
{
"completion_length": 161.8046875,
"epoch": 0.9142245416833935,
"grad_norm": 4.78125,
"kl": 0.004196583904558793,
"learning_rate": 8.577545831660644e-08,
"loss": 0.0002,
"reward": 0.29275982081890106,
"reward_std": 0.4565849918872118,
"rewards/reward_func": 0.29275982081890106,
"step": 6832
},
{
"completion_length": 166.5625,
"epoch": 0.9152950622240065,
"grad_norm": 4.09375,
"kl": 0.004536589724011719,
"learning_rate": 8.470493777599358e-08,
"loss": 0.0002,
"reward": 0.19659875519573689,
"reward_std": 0.5807360988110304,
"rewards/reward_func": 0.19659875519573689,
"step": 6840
},
{
"completion_length": 172.1328125,
"epoch": 0.9163655827646193,
"grad_norm": 3.921875,
"kl": 0.004410766297951341,
"learning_rate": 8.36344172353807e-08,
"loss": 0.0002,
"reward": 0.03568706847727299,
"reward_std": 0.6016153171658516,
"rewards/reward_func": 0.03568706847727299,
"step": 6848
},
{
"completion_length": 178.8203125,
"epoch": 0.9174361033052322,
"grad_norm": 3.484375,
"kl": 0.00475726873264648,
"learning_rate": 8.256389669476782e-08,
"loss": 0.0002,
"reward": 0.035759665071964264,
"reward_std": 0.5136286579072475,
"rewards/reward_func": 0.035759665071964264,
"step": 6856
},
{
"completion_length": 158.8671875,
"epoch": 0.918506623845845,
"grad_norm": 4.1875,
"kl": 0.004829802084714174,
"learning_rate": 8.149337615415496e-08,
"loss": 0.0002,
"reward": 0.43006047047674656,
"reward_std": 0.5332435881718993,
"rewards/reward_func": 0.43006047047674656,
"step": 6864
},
{
"completion_length": 158.5078125,
"epoch": 0.9195771443864579,
"grad_norm": 2.75,
"kl": 0.004407216591062024,
"learning_rate": 8.042285561354208e-08,
"loss": 0.0002,
"reward": 0.42078845389187336,
"reward_std": 0.5134297851473093,
"rewards/reward_func": 0.42078845389187336,
"step": 6872
},
{
"completion_length": 135.34375,
"epoch": 0.9206476649270708,
"grad_norm": 6.0625,
"kl": 0.006246095523238182,
"learning_rate": 7.935233507292921e-08,
"loss": 0.0002,
"reward": 0.2924302965402603,
"reward_std": 0.5908289672806859,
"rewards/reward_func": 0.2924302965402603,
"step": 6880
},
{
"completion_length": 152.2734375,
"epoch": 0.9217181854676837,
"grad_norm": 4.65625,
"kl": 0.005424696602858603,
"learning_rate": 7.828181453231633e-08,
"loss": 0.0002,
"reward": 0.3303174478933215,
"reward_std": 0.4598999507725239,
"rewards/reward_func": 0.3303174478933215,
"step": 6888
},
{
"completion_length": 126.9375,
"epoch": 0.9227887060082965,
"grad_norm": 4.1875,
"kl": 0.005991748097585514,
"learning_rate": 7.721129399170347e-08,
"loss": 0.0002,
"reward": 0.5790990553796291,
"reward_std": 0.4186181202530861,
"rewards/reward_func": 0.5790990553796291,
"step": 6896
},
{
"completion_length": 165.5859375,
"epoch": 0.9238592265489094,
"grad_norm": 4.21875,
"kl": 0.004846252937568352,
"learning_rate": 7.614077345109059e-08,
"loss": 0.0002,
"reward": 0.21630746312439442,
"reward_std": 0.5224413331598043,
"rewards/reward_func": 0.21630746312439442,
"step": 6904
},
{
"completion_length": 150.71875,
"epoch": 0.9249297470895222,
"grad_norm": 4.4375,
"kl": 0.004703165264800191,
"learning_rate": 7.507025291047772e-08,
"loss": 0.0002,
"reward": 0.49177973717451096,
"reward_std": 0.46361699141561985,
"rewards/reward_func": 0.49177973717451096,
"step": 6912
},
{
"completion_length": 182.15625,
"epoch": 0.9260002676301351,
"grad_norm": 4.75,
"kl": 0.004313376499339938,
"learning_rate": 7.399973236986485e-08,
"loss": 0.0002,
"reward": 0.12689837673678994,
"reward_std": 0.6647001150995493,
"rewards/reward_func": 0.12689837673678994,
"step": 6920
},
{
"completion_length": 155.71875,
"epoch": 0.9270707881707481,
"grad_norm": 3.203125,
"kl": 0.004443499754415825,
"learning_rate": 7.292921182925198e-08,
"loss": 0.0002,
"reward": 0.29615641478449106,
"reward_std": 0.5568899232894182,
"rewards/reward_func": 0.29615641478449106,
"step": 6928
},
{
"completion_length": 155.15625,
"epoch": 0.9281413087113609,
"grad_norm": 3.796875,
"kl": 0.0046441941522061825,
"learning_rate": 7.18586912886391e-08,
"loss": 0.0002,
"reward": 0.28125396044924855,
"reward_std": 0.4789434429258108,
"rewards/reward_func": 0.28125396044924855,
"step": 6936
},
{
"completion_length": 172.03125,
"epoch": 0.9292118292519738,
"grad_norm": 4.84375,
"kl": 0.0055302626569755375,
"learning_rate": 7.078817074802622e-08,
"loss": 0.0002,
"reward": 0.3774759713560343,
"reward_std": 0.5277713388204575,
"rewards/reward_func": 0.3774759713560343,
"step": 6944
},
{
"completion_length": 165.890625,
"epoch": 0.9302823497925866,
"grad_norm": 3.015625,
"kl": 0.004302407876821235,
"learning_rate": 6.971765020741336e-08,
"loss": 0.0002,
"reward": 0.35936339199543,
"reward_std": 0.5036177840083838,
"rewards/reward_func": 0.35936339199543,
"step": 6952
},
{
"completion_length": 163.0234375,
"epoch": 0.9313528703331995,
"grad_norm": 3.78125,
"kl": 0.0055159886833280325,
"learning_rate": 6.864712966680048e-08,
"loss": 0.0002,
"reward": 0.5663758469745517,
"reward_std": 0.4252478200942278,
"rewards/reward_func": 0.5663758469745517,
"step": 6960
},
{
"completion_length": 151.8984375,
"epoch": 0.9324233908738124,
"grad_norm": 4.625,
"kl": 0.005510843213414773,
"learning_rate": 6.757660912618761e-08,
"loss": 0.0002,
"reward": 0.3682685000821948,
"reward_std": 0.5349069032818079,
"rewards/reward_func": 0.3682685000821948,
"step": 6968
},
{
"completion_length": 150.671875,
"epoch": 0.9334939114144253,
"grad_norm": 4.5,
"kl": 0.005221706640440971,
"learning_rate": 6.650608858557472e-08,
"loss": 0.0002,
"reward": 0.48502959311008453,
"reward_std": 0.39637486822903156,
"rewards/reward_func": 0.48502959311008453,
"step": 6976
},
{
"completion_length": 170.3359375,
"epoch": 0.9345644319550381,
"grad_norm": 5.53125,
"kl": 0.004806717770406976,
"learning_rate": 6.543556804496186e-08,
"loss": 0.0002,
"reward": 0.20024515688419342,
"reward_std": 0.3813412329182029,
"rewards/reward_func": 0.20024515688419342,
"step": 6984
},
{
"completion_length": 165.296875,
"epoch": 0.935634952495651,
"grad_norm": 4.21875,
"kl": 0.0052095072460360825,
"learning_rate": 6.436504750434898e-08,
"loss": 0.0002,
"reward": 0.17343932949006557,
"reward_std": 0.542176740244031,
"rewards/reward_func": 0.17343932949006557,
"step": 6992
},
{
"completion_length": 169.6953125,
"epoch": 0.9367054730362638,
"grad_norm": 3.6875,
"kl": 0.00422157411230728,
"learning_rate": 6.329452696373611e-08,
"loss": 0.0002,
"reward": 0.06634609401226044,
"reward_std": 0.6044113449752331,
"rewards/reward_func": 0.06634609401226044,
"step": 7000
},
{
"completion_length": 146.5234375,
"epoch": 0.9377759935768768,
"grad_norm": 5.0625,
"kl": 0.005492849391885102,
"learning_rate": 6.222400642312324e-08,
"loss": 0.0002,
"reward": 0.23497827351093292,
"reward_std": 0.42981395684182644,
"rewards/reward_func": 0.23497827351093292,
"step": 7008
},
{
"completion_length": 170.328125,
"epoch": 0.9388465141174896,
"grad_norm": 4.65625,
"kl": 0.004708502208814025,
"learning_rate": 6.115348588251037e-08,
"loss": 0.0002,
"reward": -0.04050692915916443,
"reward_std": 0.6017125463113189,
"rewards/reward_func": -0.04050692915916443,
"step": 7016
},
{
"completion_length": 166.53125,
"epoch": 0.9399170346581025,
"grad_norm": 3.78125,
"kl": 0.0050715115503408015,
"learning_rate": 6.008296534189749e-08,
"loss": 0.0002,
"reward": 0.3130027763545513,
"reward_std": 0.47869889438152313,
"rewards/reward_func": 0.3130027763545513,
"step": 7024
},
{
"completion_length": 194.546875,
"epoch": 0.9409875551987154,
"grad_norm": 3.0625,
"kl": 0.003658687841380015,
"learning_rate": 5.901244480128462e-08,
"loss": 0.0001,
"reward": 0.33295007050037384,
"reward_std": 0.4185595214366913,
"rewards/reward_func": 0.33295007050037384,
"step": 7032
},
{
"completion_length": 180.828125,
"epoch": 0.9420580757393282,
"grad_norm": 2.59375,
"kl": 0.005314617330441251,
"learning_rate": 5.794192426067175e-08,
"loss": 0.0002,
"reward": 0.2711464911699295,
"reward_std": 0.529150040820241,
"rewards/reward_func": 0.2711464911699295,
"step": 7040
},
{
"completion_length": 166.5390625,
"epoch": 0.9431285962799412,
"grad_norm": 4.3125,
"kl": 0.005095012194942683,
"learning_rate": 5.6871403720058877e-08,
"loss": 0.0002,
"reward": 0.38804778829216957,
"reward_std": 0.6022228971123695,
"rewards/reward_func": 0.38804778829216957,
"step": 7048
},
{
"completion_length": 166.375,
"epoch": 0.944199116820554,
"grad_norm": 3.921875,
"kl": 0.005058724695118144,
"learning_rate": 5.5800883179446e-08,
"loss": 0.0002,
"reward": 0.201092598028481,
"reward_std": 0.5487896800041199,
"rewards/reward_func": 0.201092598028481,
"step": 7056
},
{
"completion_length": 175.8203125,
"epoch": 0.9452696373611669,
"grad_norm": 5.59375,
"kl": 0.0046728674788028,
"learning_rate": 5.4730362638833126e-08,
"loss": 0.0002,
"reward": 0.21519318595528603,
"reward_std": 0.6764262039214373,
"rewards/reward_func": 0.21519318595528603,
"step": 7064
},
{
"completion_length": 170.0625,
"epoch": 0.9463401579017797,
"grad_norm": 3.65625,
"kl": 0.00383082203916274,
"learning_rate": 5.3659842098220254e-08,
"loss": 0.0002,
"reward": 0.2285008803009987,
"reward_std": 0.47355389036238194,
"rewards/reward_func": 0.2285008803009987,
"step": 7072
},
{
"completion_length": 160.890625,
"epoch": 0.9474106784423926,
"grad_norm": 3.5,
"kl": 0.004751811851747334,
"learning_rate": 5.258932155760738e-08,
"loss": 0.0002,
"reward": 0.4270824361592531,
"reward_std": 0.41622067615389824,
"rewards/reward_func": 0.4270824361592531,
"step": 7080
},
{
"completion_length": 157.25,
"epoch": 0.9484811989830055,
"grad_norm": 4.875,
"kl": 0.005698841763660312,
"learning_rate": 5.151880101699451e-08,
"loss": 0.0002,
"reward": 0.30552778858691454,
"reward_std": 0.522463321685791,
"rewards/reward_func": 0.30552778858691454,
"step": 7088
},
{
"completion_length": 156.6796875,
"epoch": 0.9495517195236184,
"grad_norm": 3.890625,
"kl": 0.005308831227011979,
"learning_rate": 5.044828047638164e-08,
"loss": 0.0002,
"reward": 0.2509817620739341,
"reward_std": 0.5911254324018955,
"rewards/reward_func": 0.2509817620739341,
"step": 7096
},
{
"completion_length": 172.4609375,
"epoch": 0.9506222400642312,
"grad_norm": 3.40625,
"kl": 0.004089270456461236,
"learning_rate": 4.937775993576877e-08,
"loss": 0.0002,
"reward": 0.1337134689092636,
"reward_std": 0.34575022105127573,
"rewards/reward_func": 0.1337134689092636,
"step": 7104
},
{
"completion_length": 165.0625,
"epoch": 0.9516927606048441,
"grad_norm": 3.609375,
"kl": 0.004878541512880474,
"learning_rate": 4.8307239395155895e-08,
"loss": 0.0002,
"reward": 0.28754607075825334,
"reward_std": 0.49302330799400806,
"rewards/reward_func": 0.28754607075825334,
"step": 7112
},
{
"completion_length": 190.7109375,
"epoch": 0.9527632811454569,
"grad_norm": 3.265625,
"kl": 0.003856517461827025,
"learning_rate": 4.7236718854543023e-08,
"loss": 0.0002,
"reward": 0.05854572542011738,
"reward_std": 0.4840726386755705,
"rewards/reward_func": 0.05854572542011738,
"step": 7120
},
{
"completion_length": 178.7421875,
"epoch": 0.9538338016860699,
"grad_norm": 5.09375,
"kl": 0.005262946098810062,
"learning_rate": 4.616619831393015e-08,
"loss": 0.0002,
"reward": 0.1380448378622532,
"reward_std": 0.44803581573069096,
"rewards/reward_func": 0.1380448378622532,
"step": 7128
},
{
"completion_length": 162.75,
"epoch": 0.9549043222266828,
"grad_norm": 4.1875,
"kl": 0.0052962955087423325,
"learning_rate": 4.509567777331728e-08,
"loss": 0.0002,
"reward": 0.4494497813284397,
"reward_std": 0.4952176222577691,
"rewards/reward_func": 0.4494497813284397,
"step": 7136
},
{
"completion_length": 148.8671875,
"epoch": 0.9559748427672956,
"grad_norm": 3.703125,
"kl": 0.004964547406416386,
"learning_rate": 4.4025157232704395e-08,
"loss": 0.0002,
"reward": 0.4687324403785169,
"reward_std": 0.40094813890755177,
"rewards/reward_func": 0.4687324403785169,
"step": 7144
},
{
"completion_length": 197.28125,
"epoch": 0.9570453633079085,
"grad_norm": 4.09375,
"kl": 0.0037281967815943062,
"learning_rate": 4.295463669209152e-08,
"loss": 0.0001,
"reward": 0.02254125289618969,
"reward_std": 0.5664320774376392,
"rewards/reward_func": 0.02254125289618969,
"step": 7152
},
{
"completion_length": 148.703125,
"epoch": 0.9581158838485213,
"grad_norm": 3.796875,
"kl": 0.004984479019185528,
"learning_rate": 4.188411615147865e-08,
"loss": 0.0002,
"reward": 0.325860820710659,
"reward_std": 0.41767950914800167,
"rewards/reward_func": 0.325860820710659,
"step": 7160
},
{
"completion_length": 153.265625,
"epoch": 0.9591864043891342,
"grad_norm": 2.9375,
"kl": 0.005220895051024854,
"learning_rate": 4.081359561086578e-08,
"loss": 0.0002,
"reward": 0.4472418650984764,
"reward_std": 0.4130860110744834,
"rewards/reward_func": 0.4472418650984764,
"step": 7168
},
{
"completion_length": 169.0625,
"epoch": 0.9602569249297471,
"grad_norm": 4.5625,
"kl": 0.004276268708053976,
"learning_rate": 3.974307507025291e-08,
"loss": 0.0002,
"reward": 0.15905702486634254,
"reward_std": 0.4423768687993288,
"rewards/reward_func": 0.15905702486634254,
"step": 7176
},
{
"completion_length": 179.3125,
"epoch": 0.96132744547036,
"grad_norm": 3.234375,
"kl": 0.0035234860552009195,
"learning_rate": 3.8672554529640036e-08,
"loss": 0.0001,
"reward": 0.3482946362346411,
"reward_std": 0.6113561438396573,
"rewards/reward_func": 0.3482946362346411,
"step": 7184
},
{
"completion_length": 172.6796875,
"epoch": 0.9623979660109728,
"grad_norm": 3.890625,
"kl": 0.005020510870963335,
"learning_rate": 3.7602033989027164e-08,
"loss": 0.0002,
"reward": 0.12693638168275356,
"reward_std": 0.5951482262462378,
"rewards/reward_func": 0.12693638168275356,
"step": 7192
},
{
"completion_length": 161.015625,
"epoch": 0.9634684865515857,
"grad_norm": 2.765625,
"kl": 0.004839012573938817,
"learning_rate": 3.653151344841429e-08,
"loss": 0.0002,
"reward": 0.22009205259382725,
"reward_std": 0.607865285128355,
"rewards/reward_func": 0.22009205259382725,
"step": 7200
},
{
"completion_length": 158.515625,
"epoch": 0.9645390070921985,
"grad_norm": 4.71875,
"kl": 0.004412859241710976,
"learning_rate": 3.546099290780142e-08,
"loss": 0.0002,
"reward": 0.17766493232920766,
"reward_std": 0.6588699370622635,
"rewards/reward_func": 0.17766493232920766,
"step": 7208
},
{
"completion_length": 168.8828125,
"epoch": 0.9656095276328115,
"grad_norm": 3.859375,
"kl": 0.00440784459351562,
"learning_rate": 3.439047236718855e-08,
"loss": 0.0002,
"reward": 0.31805921625345945,
"reward_std": 0.5728737730532885,
"rewards/reward_func": 0.31805921625345945,
"step": 7216
},
{
"completion_length": 169.84375,
"epoch": 0.9666800481734243,
"grad_norm": 4.96875,
"kl": 0.004156895098276436,
"learning_rate": 3.331995182657567e-08,
"loss": 0.0002,
"reward": 0.13793382793664932,
"reward_std": 0.6552281193435192,
"rewards/reward_func": 0.13793382793664932,
"step": 7224
},
{
"completion_length": 140.65625,
"epoch": 0.9677505687140372,
"grad_norm": 4.21875,
"kl": 0.00553873396711424,
"learning_rate": 3.22494312859628e-08,
"loss": 0.0002,
"reward": 0.35460880724713206,
"reward_std": 0.3868136703968048,
"rewards/reward_func": 0.35460880724713206,
"step": 7232
},
{
"completion_length": 198.5546875,
"epoch": 0.96882108925465,
"grad_norm": 3.4375,
"kl": 0.004783908079843968,
"learning_rate": 3.1178910745349926e-08,
"loss": 0.0002,
"reward": 0.1244197292253375,
"reward_std": 0.5501943584531546,
"rewards/reward_func": 0.1244197292253375,
"step": 7240
},
{
"completion_length": 187.1875,
"epoch": 0.9698916097952629,
"grad_norm": 3.703125,
"kl": 0.004152452602284029,
"learning_rate": 3.0108390204737054e-08,
"loss": 0.0002,
"reward": 0.17286342615261674,
"reward_std": 0.461435928940773,
"rewards/reward_func": 0.17286342615261674,
"step": 7248
},
{
"completion_length": 122.7421875,
"epoch": 0.9709621303358759,
"grad_norm": 5.25,
"kl": 0.0060864063561894,
"learning_rate": 2.903786966412418e-08,
"loss": 0.0002,
"reward": 0.5303192976862192,
"reward_std": 0.4443682935088873,
"rewards/reward_func": 0.5303192976862192,
"step": 7256
},
{
"completion_length": 174.3984375,
"epoch": 0.9720326508764887,
"grad_norm": 3.546875,
"kl": 0.004882953886408359,
"learning_rate": 2.7967349123511307e-08,
"loss": 0.0002,
"reward": 0.40086287446320057,
"reward_std": 0.41974346339702606,
"rewards/reward_func": 0.40086287446320057,
"step": 7264
},
{
"completion_length": 191.515625,
"epoch": 0.9731031714171016,
"grad_norm": 2.28125,
"kl": 0.004744857433252037,
"learning_rate": 2.6896828582898435e-08,
"loss": 0.0002,
"reward": -0.035455760546028614,
"reward_std": 0.5775532089173794,
"rewards/reward_func": -0.035455760546028614,
"step": 7272
},
{
"completion_length": 181.6484375,
"epoch": 0.9741736919577144,
"grad_norm": 4.21875,
"kl": 0.004760511888889596,
"learning_rate": 2.5826308042285557e-08,
"loss": 0.0002,
"reward": 0.09731801599264145,
"reward_std": 0.5751747917383909,
"rewards/reward_func": 0.09731801599264145,
"step": 7280
},
{
"completion_length": 195.15625,
"epoch": 0.9752442124983273,
"grad_norm": 3.296875,
"kl": 0.004167939972830936,
"learning_rate": 2.4755787501672685e-08,
"loss": 0.0002,
"reward": -0.007298767566680908,
"reward_std": 0.47776357643306255,
"rewards/reward_func": -0.007298767566680908,
"step": 7288
},
{
"completion_length": 143.59375,
"epoch": 0.9763147330389402,
"grad_norm": 4.25,
"kl": 0.005093816755106673,
"learning_rate": 2.3685266961059813e-08,
"loss": 0.0002,
"reward": 0.3966046618297696,
"reward_std": 0.4395467219874263,
"rewards/reward_func": 0.3966046618297696,
"step": 7296
},
{
"completion_length": 196.1015625,
"epoch": 0.9773852535795531,
"grad_norm": 4.96875,
"kl": 0.003976713371230289,
"learning_rate": 2.261474642044694e-08,
"loss": 0.0002,
"reward": 0.05790833756327629,
"reward_std": 0.45180133171379566,
"rewards/reward_func": 0.05790833756327629,
"step": 7304
},
{
"completion_length": 158.6171875,
"epoch": 0.9784557741201659,
"grad_norm": 4.125,
"kl": 0.004875800863374025,
"learning_rate": 2.154422587983407e-08,
"loss": 0.0002,
"reward": 0.3239698866382241,
"reward_std": 0.6013830993324518,
"rewards/reward_func": 0.3239698866382241,
"step": 7312
},
{
"completion_length": 137.140625,
"epoch": 0.9795262946607788,
"grad_norm": 2.796875,
"kl": 0.0066487987351138145,
"learning_rate": 2.0473705339221198e-08,
"loss": 0.0003,
"reward": 0.42458152025938034,
"reward_std": 0.4684657920151949,
"rewards/reward_func": 0.42458152025938034,
"step": 7320
},
{
"completion_length": 184.9609375,
"epoch": 0.9805968152013916,
"grad_norm": 4.0625,
"kl": 0.003812081238720566,
"learning_rate": 1.9403184798608323e-08,
"loss": 0.0002,
"reward": 0.2897532992064953,
"reward_std": 0.6526387594640255,
"rewards/reward_func": 0.2897532992064953,
"step": 7328
},
{
"completion_length": 170.515625,
"epoch": 0.9816673357420046,
"grad_norm": 2.625,
"kl": 0.004741923592519015,
"learning_rate": 1.8332664257995448e-08,
"loss": 0.0002,
"reward": 0.1217675432562828,
"reward_std": 0.5977188646793365,
"rewards/reward_func": 0.1217675432562828,
"step": 7336
},
{
"completion_length": 215.9609375,
"epoch": 0.9827378562826175,
"grad_norm": 2.90625,
"kl": 0.0035596474481280893,
"learning_rate": 1.7262143717382576e-08,
"loss": 0.0001,
"reward": 0.14845915883779526,
"reward_std": 0.5360017623752356,
"rewards/reward_func": 0.14845915883779526,
"step": 7344
},
{
"completion_length": 219.90625,
"epoch": 0.9838083768232303,
"grad_norm": 4.5,
"kl": 0.00427751979441382,
"learning_rate": 1.6191623176769704e-08,
"loss": 0.0002,
"reward": -0.1681067142635584,
"reward_std": 0.5721786804497242,
"rewards/reward_func": -0.1681067142635584,
"step": 7352
},
{
"completion_length": 175.6953125,
"epoch": 0.9848788973638432,
"grad_norm": 3.890625,
"kl": 0.0041801958286669105,
"learning_rate": 1.5121102636156832e-08,
"loss": 0.0002,
"reward": 0.06939083803445101,
"reward_std": 0.6783208139240742,
"rewards/reward_func": 0.06939083803445101,
"step": 7360
},
{
"completion_length": 144.1875,
"epoch": 0.985949417904456,
"grad_norm": 3.828125,
"kl": 0.004889452655334026,
"learning_rate": 1.4050582095543959e-08,
"loss": 0.0002,
"reward": 0.4226034879684448,
"reward_std": 0.5269232532009482,
"rewards/reward_func": 0.4226034879684448,
"step": 7368
},
{
"completion_length": 163.59375,
"epoch": 0.987019938445069,
"grad_norm": 4.375,
"kl": 0.004188012710073963,
"learning_rate": 1.2980061554931083e-08,
"loss": 0.0002,
"reward": 0.39765281416475773,
"reward_std": 0.6691529527306557,
"rewards/reward_func": 0.39765281416475773,
"step": 7376
},
{
"completion_length": 171.4765625,
"epoch": 0.9880904589856818,
"grad_norm": 4.34375,
"kl": 0.004243878676788881,
"learning_rate": 1.1909541014318212e-08,
"loss": 0.0002,
"reward": 0.02129072230309248,
"reward_std": 0.5695307403802872,
"rewards/reward_func": 0.02129072230309248,
"step": 7384
},
{
"completion_length": 174.0625,
"epoch": 0.9891609795262947,
"grad_norm": 6.34375,
"kl": 0.004543175353319384,
"learning_rate": 1.0839020473705338e-08,
"loss": 0.0002,
"reward": 0.19244904909282923,
"reward_std": 0.6081365495920181,
"rewards/reward_func": 0.19244904909282923,
"step": 7392
},
{
"completion_length": 168.640625,
"epoch": 0.9902315000669075,
"grad_norm": 4.09375,
"kl": 0.004472158325370401,
"learning_rate": 9.768499933092466e-09,
"loss": 0.0002,
"reward": 0.23385661654174328,
"reward_std": 0.5880191251635551,
"rewards/reward_func": 0.23385661654174328,
"step": 7400
},
{
"completion_length": 167.7265625,
"epoch": 0.9913020206075204,
"grad_norm": 3.703125,
"kl": 0.004636053316062316,
"learning_rate": 8.697979392479593e-09,
"loss": 0.0002,
"reward": 0.1343773351982236,
"reward_std": 0.4601830244064331,
"rewards/reward_func": 0.1343773351982236,
"step": 7408
},
{
"completion_length": 181.546875,
"epoch": 0.9923725411481332,
"grad_norm": 3.515625,
"kl": 0.004200820723781362,
"learning_rate": 7.627458851866721e-09,
"loss": 0.0002,
"reward": 0.4208897929638624,
"reward_std": 0.5252015050500631,
"rewards/reward_func": 0.4208897929638624,
"step": 7416
},
{
"completion_length": 188.3671875,
"epoch": 0.9934430616887462,
"grad_norm": 3.90625,
"kl": 0.00420898012816906,
"learning_rate": 6.5569383112538474e-09,
"loss": 0.0002,
"reward": 0.01703132875263691,
"reward_std": 0.659230999648571,
"rewards/reward_func": 0.01703132875263691,
"step": 7424
},
{
"completion_length": 180.5546875,
"epoch": 0.994513582229359,
"grad_norm": 3.203125,
"kl": 0.004538079345365986,
"learning_rate": 5.486417770640974e-09,
"loss": 0.0002,
"reward": 0.2158992402255535,
"reward_std": 0.6045026630163193,
"rewards/reward_func": 0.2158992402255535,
"step": 7432
},
{
"completion_length": 162.9453125,
"epoch": 0.9955841027699719,
"grad_norm": 3.734375,
"kl": 0.004193991771899164,
"learning_rate": 4.4158972300281005e-09,
"loss": 0.0002,
"reward": 0.1715514180250466,
"reward_std": 0.6823503784835339,
"rewards/reward_func": 0.1715514180250466,
"step": 7440
},
{
"completion_length": 173.65625,
"epoch": 0.9966546233105847,
"grad_norm": 6.21875,
"kl": 0.004881884902715683,
"learning_rate": 3.345376689415228e-09,
"loss": 0.0002,
"reward": 0.003267081454396248,
"reward_std": 0.6868100538849831,
"rewards/reward_func": 0.003267081454396248,
"step": 7448
},
{
"completion_length": 141.2421875,
"epoch": 0.9977251438511976,
"grad_norm": 4.53125,
"kl": 0.005532538751140237,
"learning_rate": 2.2748561488023547e-09,
"loss": 0.0002,
"reward": 0.541567288339138,
"reward_std": 0.31684359908103943,
"rewards/reward_func": 0.541567288339138,
"step": 7456
},
{
"completion_length": 157.5546875,
"epoch": 0.9987956643918106,
"grad_norm": 3.953125,
"kl": 0.004744258592836559,
"learning_rate": 1.2043356081894823e-09,
"loss": 0.0002,
"reward": 0.255828570574522,
"reward_std": 0.6103123240172863,
"rewards/reward_func": 0.255828570574522,
"step": 7464
},
{
"completion_length": 151.453125,
"epoch": 0.9998661849324234,
"grad_norm": 3.484375,
"kl": 0.004390858637634665,
"learning_rate": 1.338150675766091e-10,
"loss": 0.0002,
"reward": 0.26983874663710594,
"reward_std": 0.5870513431727886,
"rewards/reward_func": 0.26983874663710594,
"step": 7472
}
],
"logging_steps": 8,
"max_steps": 7473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1868,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}