{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998655733297486, "eval_steps": 500, "global_step": 3719, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 459.0625, "epoch": 0.0013442667025137787, "grad_norm": 33.51481246948242, "kl": 0.001410222053527832, "learning_rate": 2.688172043010753e-07, "loss": 0.0001, "reward": 0.1325918197631836, "reward_std": 0.02248889727052301, "rewards/reward_func_1": 0.1325918197631836, "step": 5 }, { "completion_length": 479.13125, "epoch": 0.0026885334050275574, "grad_norm": 41.71512985229492, "kl": 0.005267477035522461, "learning_rate": 5.376344086021506e-07, "loss": 0.0002, "reward": 0.13215713500976561, "reward_std": 0.024013734073378146, "rewards/reward_func_1": 0.13215713500976561, "step": 10 }, { "completion_length": 454.19375, "epoch": 0.0040328001075413365, "grad_norm": 38.0341911315918, "kl": 0.15977706909179687, "learning_rate": 8.064516129032258e-07, "loss": 0.0064, "reward": 0.1277914047241211, "reward_std": 0.023130150814540684, "rewards/reward_func_1": 0.1277914047241211, "step": 15 }, { "completion_length": 525.4125, "epoch": 0.005377066810055115, "grad_norm": 22.35483169555664, "kl": 0.31226425170898436, "learning_rate": 1.0752688172043011e-06, "loss": 0.0125, "reward": 0.13453254699707032, "reward_std": 0.02571835172129795, "rewards/reward_func_1": 0.13453254699707032, "step": 20 }, { "completion_length": 537.725, "epoch": 0.006721333512568894, "grad_norm": 26.161535263061523, "kl": 0.36166534423828123, "learning_rate": 1.3440860215053765e-06, "loss": 0.0145, "reward": 0.14296913146972656, "reward_std": 0.022099756821990012, "rewards/reward_func_1": 0.14296913146972656, "step": 25 }, { "completion_length": 471.96875, "epoch": 0.008065600215082673, "grad_norm": 108.09612274169922, "kl": 0.27325439453125, "learning_rate": 1.6129032258064516e-06, "loss": 0.0109, "reward": 0.12444114685058594, "reward_std": 0.025380318914540113, "rewards/reward_func_1": 0.12444114685058594, "step": 30 }, { "completion_length": 566.39375, "epoch": 0.009409866917596451, "grad_norm": 28.055280685424805, "kl": 0.4273193359375, "learning_rate": 1.881720430107527e-06, "loss": 0.0171, "reward": 0.13999091386795043, "reward_std": 0.023209166852757333, "rewards/reward_func_1": 0.13999091386795043, "step": 35 }, { "completion_length": 558.03125, "epoch": 0.01075413362011023, "grad_norm": 92.63247680664062, "kl": 0.7135009765625, "learning_rate": 2.1505376344086023e-06, "loss": 0.0285, "reward": 0.13420333862304687, "reward_std": 0.021424611564725637, "rewards/reward_func_1": 0.13420333862304687, "step": 40 }, { "completion_length": 524.88125, "epoch": 0.012098400322624008, "grad_norm": 13.822972297668457, "kl": 13.015373229980469, "learning_rate": 2.4193548387096776e-06, "loss": 0.5195, "reward": 0.137255859375, "reward_std": 0.02337467367760837, "rewards/reward_func_1": 0.137255859375, "step": 45 }, { "completion_length": 554.8125, "epoch": 0.013442667025137788, "grad_norm": 43.86341857910156, "kl": 0.65223388671875, "learning_rate": 2.688172043010753e-06, "loss": 0.0261, "reward": 0.13823509216308594, "reward_std": 0.02139872215921059, "rewards/reward_func_1": 0.13823509216308594, "step": 50 }, { "completion_length": 453.91875, "epoch": 0.014786933727651566, "grad_norm": 17.72090721130371, "kl": 0.695703125, "learning_rate": 2.9569892473118283e-06, "loss": 0.0278, "reward": 0.13162574768066407, "reward_std": 0.0243722494575195, "rewards/reward_func_1": 0.13162574768066407, "step": 55 }, { "completion_length": 477.7375, "epoch": 0.016131200430165346, "grad_norm": 51.72431182861328, "kl": 1.161859130859375, "learning_rate": 3.225806451612903e-06, "loss": 0.0465, "reward": 0.13519821166992188, "reward_std": 0.019137346441857515, "rewards/reward_func_1": 0.13519821166992188, "step": 60 }, { "completion_length": 491.5375, "epoch": 0.017475467132679123, "grad_norm": 23.050928115844727, "kl": 1.3964111328125, "learning_rate": 3.494623655913979e-06, "loss": 0.0559, "reward": 0.12957611083984374, "reward_std": 0.021138915204210205, "rewards/reward_func_1": 0.12957611083984374, "step": 65 }, { "completion_length": 510.4125, "epoch": 0.018819733835192903, "grad_norm": 26.82096290588379, "kl": 2.96982421875, "learning_rate": 3.763440860215054e-06, "loss": 0.1189, "reward": 0.13300743103027343, "reward_std": 0.024069122620858252, "rewards/reward_func_1": 0.13300743103027343, "step": 70 }, { "completion_length": 442.7375, "epoch": 0.020164000537706683, "grad_norm": 26.9343318939209, "kl": 3.32802734375, "learning_rate": 4.032258064516129e-06, "loss": 0.1332, "reward": 0.12296409010887147, "reward_std": 0.025652985728811473, "rewards/reward_func_1": 0.12296409010887147, "step": 75 }, { "completion_length": 585.75625, "epoch": 0.02150826724022046, "grad_norm": 6.579223155975342, "kl": 69.7091796875, "learning_rate": 4.3010752688172045e-06, "loss": 2.7885, "reward": 0.1406890869140625, "reward_std": 0.021016028558369725, "rewards/reward_func_1": 0.1406890869140625, "step": 80 }, { "completion_length": 549.41875, "epoch": 0.02285253394273424, "grad_norm": 3.3090600967407227, "kl": 1.4935546875, "learning_rate": 4.56989247311828e-06, "loss": 0.0597, "reward": 0.13293228149414063, "reward_std": 0.0250552476150915, "rewards/reward_func_1": 0.13293228149414063, "step": 85 }, { "completion_length": 534.4625, "epoch": 0.024196800645248016, "grad_norm": 12.640344619750977, "kl": 1.730859375, "learning_rate": 4.838709677419355e-06, "loss": 0.0692, "reward": 0.12914085388183594, "reward_std": 0.021931628661695866, "rewards/reward_func_1": 0.12914085388183594, "step": 90 }, { "completion_length": 964.30625, "epoch": 0.025541067347761796, "grad_norm": 3.1176600456237793, "kl": 0.46341552734375, "learning_rate": 5.1075268817204305e-06, "loss": 0.0185, "reward": 0.12530202865600587, "reward_std": 0.030988389148842544, "rewards/reward_func_1": 0.12530202865600587, "step": 95 }, { "completion_length": 946.646875, "epoch": 0.026885334050275576, "grad_norm": 2.275771379470825, "kl": 1.3025634765625, "learning_rate": 5.376344086021506e-06, "loss": 0.0521, "reward": 0.11467647552490234, "reward_std": 0.028262564330361784, "rewards/reward_func_1": 0.11467647552490234, "step": 100 }, { "completion_length": 745.26875, "epoch": 0.028229600752789352, "grad_norm": 12.095799446105957, "kl": 2.6501708984375, "learning_rate": 5.645161290322582e-06, "loss": 0.106, "reward": 0.11658521220088006, "reward_std": 0.02962974151596427, "rewards/reward_func_1": 0.11658521220088006, "step": 105 }, { "completion_length": 602.88125, "epoch": 0.029573867455303132, "grad_norm": 4.594326496124268, "kl": 305.4122314453125, "learning_rate": 5.9139784946236566e-06, "loss": 12.2179, "reward": 0.12178945541381836, "reward_std": 0.022939921566285194, "rewards/reward_func_1": 0.12178945541381836, "step": 110 }, { "completion_length": 649.784375, "epoch": 0.030918134157816912, "grad_norm": 26.5841121673584, "kl": 1.881787109375, "learning_rate": 6.182795698924732e-06, "loss": 0.0753, "reward": 0.12521166801452638, "reward_std": 0.02316317391814664, "rewards/reward_func_1": 0.12521166801452638, "step": 115 }, { "completion_length": 720.3375, "epoch": 0.03226240086033069, "grad_norm": 4.994908809661865, "kl": 2.0610107421875, "learning_rate": 6.451612903225806e-06, "loss": 0.0825, "reward": 0.1228231817483902, "reward_std": 0.02567218211479485, "rewards/reward_func_1": 0.1228231817483902, "step": 120 }, { "completion_length": 675.49375, "epoch": 0.033606667562844465, "grad_norm": 10.518940925598145, "kl": 2.297900390625, "learning_rate": 6.720430107526882e-06, "loss": 0.0921, "reward": 0.10162264108657837, "reward_std": 0.026170244067907335, "rewards/reward_func_1": 0.10162264108657837, "step": 125 }, { "completion_length": 662.875, "epoch": 0.034950934265358245, "grad_norm": 6.318077564239502, "kl": 2.387255859375, "learning_rate": 6.989247311827958e-06, "loss": 0.0955, "reward": 0.11114879846572875, "reward_std": 0.031876870489213616, "rewards/reward_func_1": 0.11114879846572875, "step": 130 }, { "completion_length": 707.71875, "epoch": 0.036295200967872025, "grad_norm": 19.05810546875, "kl": 3.068359375, "learning_rate": 7.258064516129033e-06, "loss": 0.1228, "reward": 0.09947696328163147, "reward_std": 0.030518771056085824, "rewards/reward_func_1": 0.09947696328163147, "step": 135 }, { "completion_length": 807.725, "epoch": 0.037639467670385805, "grad_norm": 5.729363441467285, "kl": 2.4376953125, "learning_rate": 7.526881720430108e-06, "loss": 0.0975, "reward": 0.09508908390998841, "reward_std": 0.035996314510703085, "rewards/reward_func_1": 0.09508908390998841, "step": 140 }, { "completion_length": 921.71875, "epoch": 0.038983734372899585, "grad_norm": 279.34271240234375, "kl": 6.6451171875, "learning_rate": 7.795698924731183e-06, "loss": 0.2654, "reward": 0.06034855842590332, "reward_std": 0.03402297935681418, "rewards/reward_func_1": 0.06034855842590332, "step": 145 }, { "completion_length": 876.575, "epoch": 0.040328001075413365, "grad_norm": 198.41583251953125, "kl": 6.54765625, "learning_rate": 8.064516129032258e-06, "loss": 0.2619, "reward": 0.07277845814824105, "reward_std": 0.03974553793668747, "rewards/reward_func_1": 0.07277845814824105, "step": 150 }, { "completion_length": 780.615625, "epoch": 0.04167226777792714, "grad_norm": 16.293689727783203, "kl": 5.5056640625, "learning_rate": 8.333333333333334e-06, "loss": 0.2202, "reward": 0.0797007441520691, "reward_std": 0.03856456303037703, "rewards/reward_func_1": 0.0797007441520691, "step": 155 }, { "completion_length": 955.875, "epoch": 0.04301653448044092, "grad_norm": 5.261388778686523, "kl": 4.3888671875, "learning_rate": 8.602150537634409e-06, "loss": 0.1755, "reward": 0.06296098232269287, "reward_std": 0.032317174156196414, "rewards/reward_func_1": 0.06296098232269287, "step": 160 }, { "completion_length": 905.43125, "epoch": 0.0443608011829547, "grad_norm": 2.4820497035980225, "kl": 4.9009765625, "learning_rate": 8.870967741935484e-06, "loss": 0.196, "reward": 0.07761964425444604, "reward_std": 0.031909586675465104, "rewards/reward_func_1": 0.07761964425444604, "step": 165 }, { "completion_length": 755.90625, "epoch": 0.04570506788546848, "grad_norm": 3.4995851516723633, "kl": 4.2828125, "learning_rate": 9.13978494623656e-06, "loss": 0.1714, "reward": 0.0845573864877224, "reward_std": 0.028807568131014705, "rewards/reward_func_1": 0.0845573864877224, "step": 170 }, { "completion_length": 660.746875, "epoch": 0.04704933458798226, "grad_norm": 19.133451461791992, "kl": 4.21357421875, "learning_rate": 9.408602150537635e-06, "loss": 0.1686, "reward": 0.09643235206604003, "reward_std": 0.02881598025560379, "rewards/reward_func_1": 0.09643235206604003, "step": 175 }, { "completion_length": 651.1375, "epoch": 0.04839360129049603, "grad_norm": 5.1367645263671875, "kl": 5.13486328125, "learning_rate": 9.67741935483871e-06, "loss": 0.2055, "reward": 0.09348840713500976, "reward_std": 0.03065957601647824, "rewards/reward_func_1": 0.09348840713500976, "step": 180 }, { "completion_length": 799.3375, "epoch": 0.04973786799300981, "grad_norm": 9.540658950805664, "kl": 3.68408203125, "learning_rate": 9.946236559139786e-06, "loss": 0.1475, "reward": 0.08806414604187011, "reward_std": 0.03414921889780089, "rewards/reward_func_1": 0.08806414604187011, "step": 185 }, { "completion_length": 752.3375, "epoch": 0.05108213469552359, "grad_norm": 7.665460586547852, "kl": 8.005517578125, "learning_rate": 1.0215053763440861e-05, "loss": 0.3196, "reward": 0.07655536755919456, "reward_std": 0.03511982869822532, "rewards/reward_func_1": 0.07655536755919456, "step": 190 }, { "completion_length": 771.375, "epoch": 0.05242640139803737, "grad_norm": 10.91799259185791, "kl": 5.8822265625, "learning_rate": 1.0483870967741936e-05, "loss": 0.2353, "reward": 0.07151660919189454, "reward_std": 0.03899585076142102, "rewards/reward_func_1": 0.07151660919189454, "step": 195 }, { "completion_length": 898.54375, "epoch": 0.05377066810055115, "grad_norm": 8.995491027832031, "kl": 26.2333984375, "learning_rate": 1.0752688172043012e-05, "loss": 1.0536, "reward": 0.05515105128288269, "reward_std": 0.037433248152956365, "rewards/reward_func_1": 0.05515105128288269, "step": 200 }, { "completion_length": 801.6125, "epoch": 0.05511493480306493, "grad_norm": 2.727968454360962, "kl": 3.705029296875, "learning_rate": 1.1021505376344085e-05, "loss": 0.1481, "reward": 0.07217190265655518, "reward_std": 0.03416005950421095, "rewards/reward_func_1": 0.07217190265655518, "step": 205 }, { "completion_length": 759.43125, "epoch": 0.056459201505578704, "grad_norm": 2.115070343017578, "kl": 4.862158203125, "learning_rate": 1.1290322580645164e-05, "loss": 0.1946, "reward": 0.09042127132415771, "reward_std": 0.032155740447342394, "rewards/reward_func_1": 0.09042127132415771, "step": 210 }, { "completion_length": 932.6875, "epoch": 0.057803468208092484, "grad_norm": 2.2677536010742188, "kl": 2.760693359375, "learning_rate": 1.1559139784946238e-05, "loss": 0.1104, "reward": 0.062465869216248394, "reward_std": 0.033533206372521815, "rewards/reward_func_1": 0.062465869216248394, "step": 215 }, { "completion_length": 995.0375, "epoch": 0.059147734910606264, "grad_norm": 28.45672607421875, "kl": 2.51044921875, "learning_rate": 1.1827956989247313e-05, "loss": 0.1004, "reward": 0.04683060795068741, "reward_std": 0.03918457605177537, "rewards/reward_func_1": 0.04683060795068741, "step": 220 }, { "completion_length": 1012.8125, "epoch": 0.060492001613120044, "grad_norm": 657.45166015625, "kl": 8.178271484375, "learning_rate": 1.2096774193548388e-05, "loss": 0.328, "reward": 0.029974862933158875, "reward_std": 0.029841514525469393, "rewards/reward_func_1": 0.029974862933158875, "step": 225 }, { "completion_length": 902.15, "epoch": 0.061836268315633824, "grad_norm": 63.430824279785156, "kl": 6.02529296875, "learning_rate": 1.2365591397849464e-05, "loss": 0.2418, "reward": 0.03732140064239502, "reward_std": 0.030012273252941667, "rewards/reward_func_1": 0.03732140064239502, "step": 230 }, { "completion_length": 820.70625, "epoch": 0.0631805350181476, "grad_norm": 18.125106811523438, "kl": 2.8375, "learning_rate": 1.2634408602150539e-05, "loss": 0.1134, "reward": 0.0467583104968071, "reward_std": 0.026065533305518328, "rewards/reward_func_1": 0.0467583104968071, "step": 235 }, { "completion_length": 781.80625, "epoch": 0.06452480172066138, "grad_norm": 16.778675079345703, "kl": 4.069482421875, "learning_rate": 1.2903225806451613e-05, "loss": 0.1629, "reward": 0.039750583469867706, "reward_std": 0.02966789968777448, "rewards/reward_func_1": 0.039750583469867706, "step": 240 }, { "completion_length": 733.74375, "epoch": 0.06586906842317516, "grad_norm": 17.191570281982422, "kl": 3.77578125, "learning_rate": 1.3172043010752688e-05, "loss": 0.151, "reward": 0.031531840562820435, "reward_std": 0.03298547498416156, "rewards/reward_func_1": 0.031531840562820435, "step": 245 }, { "completion_length": 724.8125, "epoch": 0.06721333512568893, "grad_norm": 7.622962951660156, "kl": 4.6927734375, "learning_rate": 1.3440860215053763e-05, "loss": 0.1877, "reward": 0.028579163551330566, "reward_std": 0.03135324278846383, "rewards/reward_func_1": 0.028579163551330566, "step": 250 }, { "completion_length": 833.86875, "epoch": 0.06855760182820271, "grad_norm": 7.43701171875, "kl": 3.25869140625, "learning_rate": 1.3709677419354839e-05, "loss": 0.1303, "reward": 0.022608640044927596, "reward_std": 0.020842469058698042, "rewards/reward_func_1": 0.022608640044927596, "step": 255 }, { "completion_length": 954.625, "epoch": 0.06990186853071649, "grad_norm": 8.609543800354004, "kl": 1.60927734375, "learning_rate": 1.3978494623655916e-05, "loss": 0.0644, "reward": 0.006280577182769776, "reward_std": 0.01644945718580857, "rewards/reward_func_1": 0.006280577182769776, "step": 260 }, { "completion_length": 924.771875, "epoch": 0.07124613523323027, "grad_norm": 25.172861099243164, "kl": 3.16962890625, "learning_rate": 1.4247311827956991e-05, "loss": 0.1268, "reward": 0.008034330606460572, "reward_std": 0.014402125729247928, "rewards/reward_func_1": 0.008034330606460572, "step": 265 }, { "completion_length": 609.03125, "epoch": 0.07259040193574405, "grad_norm": 20.167495727539062, "kl": 6.338671875, "learning_rate": 1.4516129032258066e-05, "loss": 0.2535, "reward": 0.0008509188890457153, "reward_std": 0.015080565505195409, "rewards/reward_func_1": 0.0008509188890457153, "step": 270 }, { "completion_length": 669.51875, "epoch": 0.07393466863825783, "grad_norm": 4.5325541496276855, "kl": 8.6146484375, "learning_rate": 1.4784946236559142e-05, "loss": 0.3447, "reward": 0.0004961371421813964, "reward_std": 0.018004348664544523, "rewards/reward_func_1": 0.0004961371421813964, "step": 275 }, { "completion_length": 282.79375, "epoch": 0.07527893534077161, "grad_norm": 7.294330596923828, "kl": 11.61015625, "learning_rate": 1.5053763440860215e-05, "loss": 0.4646, "reward": 0.016247385740280153, "reward_std": 0.020933675090782346, "rewards/reward_func_1": 0.016247385740280153, "step": 280 }, { "completion_length": 618.675, "epoch": 0.07662320204328539, "grad_norm": 6.872623443603516, "kl": 25.2796875, "learning_rate": 1.5322580645161292e-05, "loss": 1.0111, "reward": 0.0221073180437088, "reward_std": 0.01539291434455663, "rewards/reward_func_1": 0.0221073180437088, "step": 285 }, { "completion_length": 71.5875, "epoch": 0.07796746874579917, "grad_norm": 4.054460048675537, "kl": 12.71484375, "learning_rate": 1.5591397849462366e-05, "loss": 0.5082, "reward": 0.029245705343782902, "reward_std": 0.01699454879271798, "rewards/reward_func_1": 0.029245705343782902, "step": 290 }, { "completion_length": 148.65, "epoch": 0.07931173544831295, "grad_norm": 5.3947062492370605, "kl": 10.1796875, "learning_rate": 1.586021505376344e-05, "loss": 0.4074, "reward": 0.03420259654521942, "reward_std": 0.021775428601540626, "rewards/reward_func_1": 0.03420259654521942, "step": 295 }, { "completion_length": 151.83125, "epoch": 0.08065600215082673, "grad_norm": 4.995931148529053, "kl": 8.5484375, "learning_rate": 1.6129032258064517e-05, "loss": 0.3418, "reward": 0.0415335863828659, "reward_std": 0.015774094988591968, "rewards/reward_func_1": 0.0415335863828659, "step": 300 }, { "completion_length": 249.5125, "epoch": 0.0820002688533405, "grad_norm": 6.766551971435547, "kl": 7.60859375, "learning_rate": 1.6397849462365594e-05, "loss": 0.3044, "reward": 0.04043524265289307, "reward_std": 0.01937261049170047, "rewards/reward_func_1": 0.04043524265289307, "step": 305 }, { "completion_length": 132.06875, "epoch": 0.08334453555585428, "grad_norm": 7.255300045013428, "kl": 33.04140625, "learning_rate": 1.6666666666666667e-05, "loss": 1.3153, "reward": 0.04444933533668518, "reward_std": 0.015411415329435841, "rewards/reward_func_1": 0.04444933533668518, "step": 310 }, { "completion_length": 427.43125, "epoch": 0.08468880225836806, "grad_norm": 4.3001933097839355, "kl": 7.823046875, "learning_rate": 1.6935483870967744e-05, "loss": 0.3131, "reward": 0.036715186480432746, "reward_std": 0.025042318180203436, "rewards/reward_func_1": 0.036715186480432746, "step": 315 }, { "completion_length": 617.3875, "epoch": 0.08603306896088184, "grad_norm": 5.300938129425049, "kl": 9.6765625, "learning_rate": 1.7204301075268818e-05, "loss": 0.3872, "reward": 0.02903536558151245, "reward_std": 0.03241579991299659, "rewards/reward_func_1": 0.02903536558151245, "step": 320 }, { "completion_length": 480.16875, "epoch": 0.08737733566339562, "grad_norm": 3.9806482791900635, "kl": 9.332421875, "learning_rate": 1.7473118279569895e-05, "loss": 0.3733, "reward": 0.03327850103378296, "reward_std": 0.02665868471376598, "rewards/reward_func_1": 0.03327850103378296, "step": 325 }, { "completion_length": 624.35625, "epoch": 0.0887216023659094, "grad_norm": 1.2687135934829712, "kl": 8.853515625, "learning_rate": 1.774193548387097e-05, "loss": 0.3541, "reward": 0.014437276124954223, "reward_std": 0.028702187002636492, "rewards/reward_func_1": 0.014437276124954223, "step": 330 }, { "completion_length": 482.596875, "epoch": 0.09006586906842318, "grad_norm": 4.883694648742676, "kl": 10.290625, "learning_rate": 1.8010752688172042e-05, "loss": 0.4117, "reward": -0.00023592226207256318, "reward_std": 0.02089865313610062, "rewards/reward_func_1": -0.00023592226207256318, "step": 335 }, { "completion_length": 551.590625, "epoch": 0.09141013577093696, "grad_norm": 31.997989654541016, "kl": 27.48359375, "learning_rate": 1.827956989247312e-05, "loss": 1.0984, "reward": 0.017952871322631837, "reward_std": 0.02710586852626875, "rewards/reward_func_1": 0.017952871322631837, "step": 340 }, { "completion_length": 90.98125, "epoch": 0.09275440247345074, "grad_norm": 3.1550872325897217, "kl": 6.864453125, "learning_rate": 1.8548387096774193e-05, "loss": 0.2747, "reward": 0.0417973518371582, "reward_std": 0.013006633780605625, "rewards/reward_func_1": 0.0417973518371582, "step": 345 }, { "completion_length": 93.0875, "epoch": 0.09409866917596452, "grad_norm": 7.118286609649658, "kl": 5.258984375, "learning_rate": 1.881720430107527e-05, "loss": 0.2103, "reward": 0.04214920997619629, "reward_std": 0.0113553161296295, "rewards/reward_func_1": 0.04214920997619629, "step": 350 }, { "completion_length": 23.6375, "epoch": 0.0954429358784783, "grad_norm": 0.8202024698257446, "kl": 5.026171875, "learning_rate": 1.9086021505376347e-05, "loss": 0.2011, "reward": 0.04604549407958984, "reward_std": 0.007001646169919695, "rewards/reward_func_1": 0.04604549407958984, "step": 355 }, { "completion_length": 420.46875, "epoch": 0.09678720258099206, "grad_norm": 3.111360788345337, "kl": 5.22421875, "learning_rate": 1.935483870967742e-05, "loss": 0.209, "reward": 0.018204644322395325, "reward_std": 0.012417513456603047, "rewards/reward_func_1": 0.018204644322395325, "step": 360 }, { "completion_length": 1005.70625, "epoch": 0.09813146928350584, "grad_norm": 0.18079593777656555, "kl": 2.351806640625, "learning_rate": 1.9623655913978498e-05, "loss": 0.0941, "reward": 0.032704389095306395, "reward_std": 0.01922294880496338, "rewards/reward_func_1": 0.032704389095306395, "step": 365 }, { "completion_length": 1019.66875, "epoch": 0.09947573598601962, "grad_norm": 0.2214096635580063, "kl": 0.1852294921875, "learning_rate": 1.989247311827957e-05, "loss": 0.0074, "reward": 0.061572599411010745, "reward_std": 0.01994357380317524, "rewards/reward_func_1": 0.061572599411010745, "step": 370 }, { "completion_length": 995.678125, "epoch": 0.1008200026885334, "grad_norm": 0.5282752513885498, "kl": 0.2708984375, "learning_rate": 1.9999960353893115e-05, "loss": 0.0108, "reward": 0.055290712416172026, "reward_std": 0.016450203530257567, "rewards/reward_func_1": 0.055290712416172026, "step": 375 }, { "completion_length": 1009.3375, "epoch": 0.10216426939104718, "grad_norm": 0.770745575428009, "kl": 0.82568359375, "learning_rate": 1.9999718073267252e-05, "loss": 0.0331, "reward": 0.05625443458557129, "reward_std": 0.01596820540144108, "rewards/reward_func_1": 0.05625443458557129, "step": 380 }, { "completion_length": 1016.053125, "epoch": 0.10350853609356096, "grad_norm": 0.473776251077652, "kl": 0.31748046875, "learning_rate": 1.9999255542960368e-05, "loss": 0.0127, "reward": 0.01015063002705574, "reward_std": 0.010674006148474292, "rewards/reward_func_1": 0.01015063002705574, "step": 385 }, { "completion_length": 1001.11875, "epoch": 0.10485280279607474, "grad_norm": 0.7848945260047913, "kl": 0.3754150390625, "learning_rate": 1.999857277315996e-05, "loss": 0.015, "reward": 0.0617163360118866, "reward_std": 0.021173181070480496, "rewards/reward_func_1": 0.0617163360118866, "step": 390 }, { "completion_length": 1021.315625, "epoch": 0.10619706949858852, "grad_norm": 16.4527645111084, "kl": 1.40760498046875, "learning_rate": 1.9997669778904446e-05, "loss": 0.0563, "reward": 0.04422735869884491, "reward_std": 0.022931134537793697, "rewards/reward_func_1": 0.04422735869884491, "step": 395 }, { "completion_length": 1017.70625, "epoch": 0.1075413362011023, "grad_norm": 0.4751810133457184, "kl": 1744830464.8029785, "learning_rate": 1.9996546580082792e-05, "loss": 69673728.0, "reward": 0.06360023021697998, "reward_std": 0.025275059579871594, "rewards/reward_func_1": 0.06360023021697998, "step": 400 }, { "completion_length": 1012.39375, "epoch": 0.10888560290361608, "grad_norm": 0.8353201746940613, "kl": 1.180908203125, "learning_rate": 1.9995203201434124e-05, "loss": 0.0472, "reward": 0.054718819260597226, "reward_std": 0.02895347127923742, "rewards/reward_func_1": 0.054718819260597226, "step": 405 }, { "completion_length": 883.959375, "epoch": 0.11022986960612986, "grad_norm": 5.007371425628662, "kl": 0.9612548828125, "learning_rate": 1.9993639672547146e-05, "loss": 0.0384, "reward": 0.06528196483850479, "reward_std": 0.028689574322197587, "rewards/reward_func_1": 0.06528196483850479, "step": 410 }, { "completion_length": 446.45625, "epoch": 0.11157413630864363, "grad_norm": 1.8732314109802246, "kl": 2.2693359375, "learning_rate": 1.9991856027859504e-05, "loss": 0.0908, "reward": 0.052714601159095764, "reward_std": 0.030338111356832086, "rewards/reward_func_1": 0.052714601159095764, "step": 415 }, { "completion_length": 662.909375, "epoch": 0.11291840301115741, "grad_norm": 6.701618194580078, "kl": 1.86259765625, "learning_rate": 1.9989852306657015e-05, "loss": 0.0745, "reward": 0.07098124027252198, "reward_std": 0.029971924761775882, "rewards/reward_func_1": 0.07098124027252198, "step": 420 }, { "completion_length": 946.375, "epoch": 0.11426266971367119, "grad_norm": 0.3844849169254303, "kl": 0.8337646484375, "learning_rate": 1.998762855307283e-05, "loss": 0.0333, "reward": 0.08022915720939636, "reward_std": 0.018059900577645747, "rewards/reward_func_1": 0.08022915720939636, "step": 425 }, { "completion_length": 1011.1875, "epoch": 0.11560693641618497, "grad_norm": 0.3679395020008087, "kl": 0.24794921875, "learning_rate": 1.998518481608643e-05, "loss": 0.0099, "reward": 0.08296351432800293, "reward_std": 0.022155718586873263, "rewards/reward_func_1": 0.08296351432800293, "step": 430 }, { "completion_length": 1024.0, "epoch": 0.11695120311869875, "grad_norm": 0.5217347145080566, "kl": 0.3544921875, "learning_rate": 1.998252114952255e-05, "loss": 0.0142, "reward": 0.047972720861434934, "reward_std": 0.01913239884888753, "rewards/reward_func_1": 0.047972720861434934, "step": 435 }, { "completion_length": 738.446875, "epoch": 0.11829546982121253, "grad_norm": 7.95990514755249, "kl": 1.81640625, "learning_rate": 1.9979637612050028e-05, "loss": 0.0727, "reward": 0.02692788541316986, "reward_std": 0.02701200459850952, "rewards/reward_func_1": 0.02692788541316986, "step": 440 }, { "completion_length": 434.340625, "epoch": 0.11963973652372631, "grad_norm": 11.07094955444336, "kl": 5.8614990234375, "learning_rate": 1.9976534267180464e-05, "loss": 0.2344, "reward": 0.03783460408449173, "reward_std": 0.018221309431828557, "rewards/reward_func_1": 0.03783460408449173, "step": 445 }, { "completion_length": 38.14375, "epoch": 0.12098400322624009, "grad_norm": 4.362886905670166, "kl": 17.2671875, "learning_rate": 1.997321118326687e-05, "loss": 0.6907, "reward": 0.041283273696899415, "reward_std": 0.011085864211781881, "rewards/reward_func_1": 0.041283273696899415, "step": 450 }, { "completion_length": 71.421875, "epoch": 0.12232826992875387, "grad_norm": 16.95098876953125, "kl": 14.43046875, "learning_rate": 1.996966843350212e-05, "loss": 0.5772, "reward": 0.029994052648544312, "reward_std": 0.014221129479119554, "rewards/reward_func_1": 0.029994052648544312, "step": 455 }, { "completion_length": 162.315625, "epoch": 0.12367253663126765, "grad_norm": 5.00581693649292, "kl": 13.87890625, "learning_rate": 1.996590609591736e-05, "loss": 0.5553, "reward": 0.03443393409252167, "reward_std": 0.015160623186966404, "rewards/reward_func_1": 0.03443393409252167, "step": 460 }, { "completion_length": 387.49375, "epoch": 0.12501680333378143, "grad_norm": 34.0945930480957, "kl": 25.624267578125, "learning_rate": 1.99619242533803e-05, "loss": 1.0272, "reward": 0.020095158740878104, "reward_std": 0.014898770145373419, "rewards/reward_func_1": 0.020095158740878104, "step": 465 }, { "completion_length": 313.875, "epoch": 0.1263610700362952, "grad_norm": 16.63882827758789, "kl": 11.6126220703125, "learning_rate": 1.9957722993593365e-05, "loss": 0.4642, "reward": 0.03102530874311924, "reward_std": 0.015879479701106904, "rewards/reward_func_1": 0.03102530874311924, "step": 470 }, { "completion_length": 43.409375, "epoch": 0.127705336738809, "grad_norm": 27.554349899291992, "kl": 16.75, "learning_rate": 1.9953302409091773e-05, "loss": 0.6699, "reward": 0.033424198627471924, "reward_std": 0.013823152912664227, "rewards/reward_func_1": 0.033424198627471924, "step": 475 }, { "completion_length": 38.784375, "epoch": 0.12904960344132277, "grad_norm": 0.890618085861206, "kl": 16.7375, "learning_rate": 1.9948662597241505e-05, "loss": 0.6692, "reward": 0.029546657204627992, "reward_std": 0.014900979267258663, "rewards/reward_func_1": 0.029546657204627992, "step": 480 }, { "completion_length": 2.1125, "epoch": 0.13039387014383655, "grad_norm": 5.359586238861084, "kl": 18.334375, "learning_rate": 1.9943803660237146e-05, "loss": 0.733, "reward": 0.042370176315307616, "reward_std": 0.011466928146546707, "rewards/reward_func_1": 0.042370176315307616, "step": 485 }, { "completion_length": 7.484375, "epoch": 0.13173813684635033, "grad_norm": 133.7327423095703, "kl": 17.6125, "learning_rate": 1.9938725705099652e-05, "loss": 0.7044, "reward": 0.042084154486656186, "reward_std": 0.014303012995515018, "rewards/reward_func_1": 0.042084154486656186, "step": 490 }, { "completion_length": 2.0125, "epoch": 0.13308240354886408, "grad_norm": 5.098598480224609, "kl": 17.9546875, "learning_rate": 1.9933428843673968e-05, "loss": 0.7184, "reward": 0.047114628553390506, "reward_std": 0.01080106117296964, "rewards/reward_func_1": 0.047114628553390506, "step": 495 }, { "completion_length": 2.065625, "epoch": 0.13442667025137786, "grad_norm": 3.663975715637207, "kl": 17.8578125, "learning_rate": 1.9927913192626597e-05, "loss": 0.714, "reward": 0.04776406288146973, "reward_std": 0.012929379957495258, "rewards/reward_func_1": 0.04776406288146973, "step": 500 }, { "completion_length": 1.9375, "epoch": 0.13577093695389164, "grad_norm": 30.02657699584961, "kl": 668487.05625, "learning_rate": 1.9922178873442998e-05, "loss": 26829.6063, "reward": 0.054135143756866455, "reward_std": 0.009887221396638779, "rewards/reward_func_1": 0.054135143756866455, "step": 505 }, { "completion_length": 24.953125, "epoch": 0.13711520365640542, "grad_norm": 6.359250068664551, "kl": 19.0421875, "learning_rate": 1.9916226012424925e-05, "loss": 0.7612, "reward": 0.05320845246315002, "reward_std": 0.010508252962245024, "rewards/reward_func_1": 0.05320845246315002, "step": 510 }, { "completion_length": 257.546875, "epoch": 0.1384594703589192, "grad_norm": 3.6431725025177, "kl": 14.698828125, "learning_rate": 1.991005474068765e-05, "loss": 0.5884, "reward": 0.03921504020690918, "reward_std": 0.014476554578868673, "rewards/reward_func_1": 0.03921504020690918, "step": 515 }, { "completion_length": 110.59375, "epoch": 0.13980373706143298, "grad_norm": 4.781929016113281, "kl": 16.16640625, "learning_rate": 1.9903665194157077e-05, "loss": 0.6467, "reward": 0.043272508680820464, "reward_std": 0.014059747860301286, "rewards/reward_func_1": 0.043272508680820464, "step": 520 }, { "completion_length": 1.365625, "epoch": 0.14114800376394676, "grad_norm": 12.174643516540527, "kl": 18.8140625, "learning_rate": 1.989705751356672e-05, "loss": 0.7527, "reward": 0.040436971187591556, "reward_std": 0.009100792693789116, "rewards/reward_func_1": 0.040436971187591556, "step": 525 }, { "completion_length": 3.353125, "epoch": 0.14249227046646054, "grad_norm": 1.5478957891464233, "kl": 15.30625, "learning_rate": 1.9890231844454643e-05, "loss": 0.6123, "reward": 0.035584007203578946, "reward_std": 0.013873677587253042, "rewards/reward_func_1": 0.035584007203578946, "step": 530 }, { "completion_length": 1.078125, "epoch": 0.14383653716897432, "grad_norm": 0.8689735531806946, "kl": 20.396875, "learning_rate": 1.9883188337160225e-05, "loss": 0.8161, "reward": 0.043527424335479736, "reward_std": 0.008268717869577813, "rewards/reward_func_1": 0.043527424335479736, "step": 535 }, { "completion_length": 1.775, "epoch": 0.1451808038714881, "grad_norm": 2.0533359050750732, "kl": 18.6109375, "learning_rate": 1.9875927146820867e-05, "loss": 0.7448, "reward": 0.0420529842376709, "reward_std": 0.008834328277953319, "rewards/reward_func_1": 0.0420529842376709, "step": 540 }, { "completion_length": 2.91875, "epoch": 0.14652507057400188, "grad_norm": 7.708526611328125, "kl": 17.74375, "learning_rate": 1.9868448433368567e-05, "loss": 0.7098, "reward": 0.03905548453330994, "reward_std": 0.011393586202757433, "rewards/reward_func_1": 0.03905548453330994, "step": 545 }, { "completion_length": 26.203125, "epoch": 0.14786933727651566, "grad_norm": 5.5177435874938965, "kl": 13.23671875, "learning_rate": 1.9860752361526384e-05, "loss": 0.5295, "reward": 0.030779826641082763, "reward_std": 0.017280431411927567, "rewards/reward_func_1": 0.030779826641082763, "step": 550 }, { "completion_length": 16.4625, "epoch": 0.14921360397902944, "grad_norm": 2.438591718673706, "kl": 16.7859375, "learning_rate": 1.985283910080484e-05, "loss": 0.6714, "reward": 0.03871009349822998, "reward_std": 0.012890951918961946, "rewards/reward_func_1": 0.03871009349822998, "step": 555 }, { "completion_length": 29.490625, "epoch": 0.15055787068154322, "grad_norm": 11.178017616271973, "kl": 16.7796875, "learning_rate": 1.9844708825498163e-05, "loss": 0.6712, "reward": 0.03712189197540283, "reward_std": 0.014122735538694541, "rewards/reward_func_1": 0.03712189197540283, "step": 560 }, { "completion_length": 8.56875, "epoch": 0.151902137384057, "grad_norm": 1.3991609811782837, "kl": 6695.2, "learning_rate": 1.983636171468046e-05, "loss": 269.0283, "reward": 0.045965385437011716, "reward_std": 0.011782036734803113, "rewards/reward_func_1": 0.045965385437011716, "step": 565 }, { "completion_length": 48.490625, "epoch": 0.15324640408657078, "grad_norm": 8.38524055480957, "kl": 16.6640625, "learning_rate": 1.9827797952201756e-05, "loss": 0.6669, "reward": 0.04424548149108887, "reward_std": 0.014852115589019377, "rewards/reward_func_1": 0.04424548149108887, "step": 570 }, { "completion_length": 93.028125, "epoch": 0.15459067078908456, "grad_norm": 3.190880298614502, "kl": 14.90859375, "learning_rate": 1.9819017726683966e-05, "loss": 0.5958, "reward": 0.030410957336425782, "reward_std": 0.021107864176156, "rewards/reward_func_1": 0.030410957336425782, "step": 575 }, { "completion_length": 18.278125, "epoch": 0.15593493749159834, "grad_norm": 304378225360896.0, "kl": 36779813791349.3, "learning_rate": 1.9810021231516733e-05, "loss": 1472844005376.0, "reward": 0.046595031023025514, "reward_std": 0.01781562084943289, "rewards/reward_func_1": 0.046595031023025514, "step": 580 }, { "completion_length": 8.640625, "epoch": 0.15727920419411212, "grad_norm": 1.4716880321502686, "kl": 1590480094.4328125, "learning_rate": 1.9800808664853162e-05, "loss": 63543705.6, "reward": 0.047375273704528806, "reward_std": 0.015040177796618082, "rewards/reward_func_1": 0.047375273704528806, "step": 585 }, { "completion_length": 5.3625, "epoch": 0.1586234708966259, "grad_norm": 0.6876189708709717, "kl": 18.7609375, "learning_rate": 1.979138022960546e-05, "loss": 0.7509, "reward": 0.04904801845550537, "reward_std": 0.013263128971448167, "rewards/reward_func_1": 0.04904801845550537, "step": 590 }, { "completion_length": 7.621875, "epoch": 0.15996773759913968, "grad_norm": 1.0694186687469482, "kl": 18.6953125, "learning_rate": 1.9781736133440462e-05, "loss": 0.748, "reward": 0.050295126438140866, "reward_std": 0.011883826142002363, "rewards/reward_func_1": 0.050295126438140866, "step": 595 }, { "completion_length": 11.071875, "epoch": 0.16131200430165346, "grad_norm": 0.40570515394210815, "kl": 197926011378090.44, "learning_rate": 1.9771876588775072e-05, "loss": 7919798059008.0, "reward": 0.048408856987953185, "reward_std": 0.013870497528841952, "rewards/reward_func_1": 0.048408856987953185, "step": 600 }, { "completion_length": 3.678125, "epoch": 0.1626562710041672, "grad_norm": 29.10760498046875, "kl": 1.1033819087057724e+16, "learning_rate": 1.976180181277157e-05, "loss": 440097890002534.4, "reward": 0.05020642280578613, "reward_std": 0.010890257774008205, "rewards/reward_func_1": 0.05020642280578613, "step": 605 }, { "completion_length": 3.86875, "epoch": 0.164000537706681, "grad_norm": 5.49629020690918, "kl": 18.6328125, "learning_rate": 1.975151202733283e-05, "loss": 0.7452, "reward": 0.04792967140674591, "reward_std": 0.013347143970895559, "rewards/reward_func_1": 0.04792967140674591, "step": 610 }, { "completion_length": 3.88125, "epoch": 0.16534480440919477, "grad_norm": 0.6328467726707458, "kl": 18.6640625, "learning_rate": 1.974100745909744e-05, "loss": 0.7466, "reward": 0.048745088279247284, "reward_std": 0.01318539776839316, "rewards/reward_func_1": 0.048745088279247284, "step": 615 }, { "completion_length": 6.23125, "epoch": 0.16668907111170855, "grad_norm": 10.044380187988281, "kl": 19.2171875, "learning_rate": 1.9730288339434698e-05, "loss": 0.7687, "reward": 0.05019671618938446, "reward_std": 0.011686628483585083, "rewards/reward_func_1": 0.05019671618938446, "step": 620 }, { "completion_length": 12.4625, "epoch": 0.16803333781422233, "grad_norm": 0.5477828979492188, "kl": 18.334375, "learning_rate": 1.9719354904439535e-05, "loss": 0.733, "reward": 0.04945822358131409, "reward_std": 0.01383852595463395, "rewards/reward_func_1": 0.04945822358131409, "step": 625 }, { "completion_length": 9.596875, "epoch": 0.1693776045167361, "grad_norm": 0.485173761844635, "kl": 19.209375, "learning_rate": 1.9708207394927294e-05, "loss": 0.7682, "reward": 0.05124917030334473, "reward_std": 0.010859370271646185, "rewards/reward_func_1": 0.05124917030334473, "step": 630 }, { "completion_length": 17.403125, "epoch": 0.1707218712192499, "grad_norm": 1.2319393157958984, "kl": 18.13125, "learning_rate": 1.969684605642844e-05, "loss": 0.7251, "reward": 0.046324634552001955, "reward_std": 0.01381837234366685, "rewards/reward_func_1": 0.046324634552001955, "step": 635 }, { "completion_length": 10.29375, "epoch": 0.17206613792176367, "grad_norm": 3.877319574356079, "kl": 3407890.8765625, "learning_rate": 1.9685271139183143e-05, "loss": 136448.95, "reward": 0.051538944244384766, "reward_std": 0.010041882294171956, "rewards/reward_func_1": 0.051538944244384766, "step": 640 }, { "completion_length": 4.340625, "epoch": 0.17341040462427745, "grad_norm": 9511026688.0, "kl": 79088875.28125, "learning_rate": 1.9673482898135774e-05, "loss": 3171008.6, "reward": 0.05210127830505371, "reward_std": 0.010918277798919008, "rewards/reward_func_1": 0.05210127830505371, "step": 645 }, { "completion_length": 3.2375, "epoch": 0.17475467132679123, "grad_norm": 1.1192519664764404, "kl": 17511264.096875, "learning_rate": 1.9661481592929293e-05, "loss": 700102.15, "reward": 0.059194572269916534, "reward_std": 0.010659490662510507, "rewards/reward_func_1": 0.059194572269916534, "step": 650 }, { "completion_length": 441.04375, "epoch": 0.176098938029305, "grad_norm": 3818.766845703125, "kl": 449005.0505859375, "learning_rate": 1.9649267487899507e-05, "loss": 18001.5281, "reward": 0.0049600392580032345, "reward_std": 0.014874692249577492, "rewards/reward_func_1": 0.0049600392580032345, "step": 655 }, { "completion_length": 24.63125, "epoch": 0.1774432047318188, "grad_norm": 1.1994178295135498, "kl": 7.169140625, "learning_rate": 1.9636840852069284e-05, "loss": 0.2868, "reward": 0.02127237692475319, "reward_std": 0.01846984715666622, "rewards/reward_func_1": 0.02127237692475319, "step": 660 }, { "completion_length": 4.38125, "epoch": 0.17878747143433257, "grad_norm": 271670.03125, "kl": 6157.2625, "learning_rate": 1.962420195914259e-05, "loss": 245.975, "reward": 0.04686172604560852, "reward_std": 0.013880293245892971, "rewards/reward_func_1": 0.04686172604560852, "step": 665 }, { "completion_length": 7.53125, "epoch": 0.18013173813684635, "grad_norm": 1.2155910730361938, "kl": 11889.5125, "learning_rate": 1.961135108749849e-05, "loss": 477.0842, "reward": 0.04393459558486938, "reward_std": 0.01759743633447215, "rewards/reward_func_1": 0.04393459558486938, "step": 670 }, { "completion_length": 12.85625, "epoch": 0.18147600483936013, "grad_norm": 1.1338227987289429, "kl": 10.88984375, "learning_rate": 1.9598288520185e-05, "loss": 0.4355, "reward": 0.029730018973350526, "reward_std": 0.018472507712431252, "rewards/reward_func_1": 0.029730018973350526, "step": 675 }, { "completion_length": 10.8125, "epoch": 0.1828202715418739, "grad_norm": 1.7272660732269287, "kl": 12.0734375, "learning_rate": 1.958501454491286e-05, "loss": 0.4834, "reward": 0.039747095108032225, "reward_std": 0.020039613964036106, "rewards/reward_func_1": 0.039747095108032225, "step": 680 }, { "completion_length": 9.821875, "epoch": 0.1841645382443877, "grad_norm": 0.9799548983573914, "kl": 15.834375, "learning_rate": 1.95715294540492e-05, "loss": 0.6336, "reward": 0.04382616728544235, "reward_std": 0.017328777379589155, "rewards/reward_func_1": 0.04382616728544235, "step": 685 }, { "completion_length": 6.875, "epoch": 0.18550880494690147, "grad_norm": 0.0692245364189148, "kl": 15.3328125, "learning_rate": 1.9557833544611083e-05, "loss": 0.6131, "reward": 0.05723133087158203, "reward_std": 0.012962383369449526, "rewards/reward_func_1": 0.05723133087158203, "step": 690 }, { "completion_length": 111.0, "epoch": 0.18685307164941525, "grad_norm": 29.649320602416992, "kl": 15.096875, "learning_rate": 1.9543927118258988e-05, "loss": 0.6041, "reward": 0.059267282485961914, "reward_std": 0.016366570102400148, "rewards/reward_func_1": 0.059267282485961914, "step": 695 }, { "completion_length": 33.9375, "epoch": 0.18819733835192903, "grad_norm": 0.007695461623370647, "kl": 322.8, "learning_rate": 1.9529810481290143e-05, "loss": 12.915, "reward": 0.056771063804626466, "reward_std": 0.012514285945508163, "rewards/reward_func_1": 0.056771063804626466, "step": 700 }, { "completion_length": 2.0, "epoch": 0.1895416050544428, "grad_norm": 0.023763682693243027, "kl": 18.1828125, "learning_rate": 1.9515483944631793e-05, "loss": 0.7269, "reward": 0.06125969886779785, "reward_std": 0.009932457827380859, "rewards/reward_func_1": 0.06125969886779785, "step": 705 }, { "completion_length": 5.2, "epoch": 0.1908858717569566, "grad_norm": 0.3108590841293335, "kl": 18.0453125, "learning_rate": 1.9500947823834345e-05, "loss": 0.7218, "reward": 0.0602872371673584, "reward_std": 0.009525550016041962, "rewards/reward_func_1": 0.0602872371673584, "step": 710 }, { "completion_length": 11.98125, "epoch": 0.19223013845947035, "grad_norm": 0.009819800965487957, "kl": 18.3109375, "learning_rate": 1.9486202439064433e-05, "loss": 0.732, "reward": 0.05926952362060547, "reward_std": 0.010095558775356039, "rewards/reward_func_1": 0.05926952362060547, "step": 715 }, { "completion_length": 4.425, "epoch": 0.19357440516198413, "grad_norm": 0.007831516675651073, "kl": 17.8375, "learning_rate": 1.9471248115097827e-05, "loss": 0.7131, "reward": 0.06079845428466797, "reward_std": 0.010249754647520603, "rewards/reward_func_1": 0.06079845428466797, "step": 720 }, { "completion_length": 2.0, "epoch": 0.1949186718644979, "grad_norm": 0.006839285604655743, "kl": 18.021875, "learning_rate": 1.9456085181312333e-05, "loss": 0.7214, "reward": 0.06195640563964844, "reward_std": 0.012506642258085777, "rewards/reward_func_1": 0.06195640563964844, "step": 725 }, { "completion_length": 2.00625, "epoch": 0.19626293856701169, "grad_norm": 0.0004078986239619553, "kl": 17.8234375, "learning_rate": 1.9440713971680494e-05, "loss": 0.7135, "reward": 0.05450363159179687, "reward_std": 0.010389497011783533, "rewards/reward_func_1": 0.05450363159179687, "step": 730 }, { "completion_length": 2.0, "epoch": 0.19760720526952547, "grad_norm": 2.693261922104284e-05, "kl": 18.0421875, "learning_rate": 1.9425134824762263e-05, "loss": 0.722, "reward": 0.06317386627197266, "reward_std": 0.01099952881995705, "rewards/reward_func_1": 0.06317386627197266, "step": 735 }, { "completion_length": 2.0, "epoch": 0.19895147197203925, "grad_norm": 0.0003110544930677861, "kl": 18.16875, "learning_rate": 1.9409348083697516e-05, "loss": 0.7272, "reward": 0.061242103576660156, "reward_std": 0.011685801808926043, "rewards/reward_func_1": 0.061242103576660156, "step": 740 }, { "completion_length": 2.0, "epoch": 0.20029573867455303, "grad_norm": 2.4723798560444266e-05, "kl": 17.915625, "learning_rate": 1.9393354096198535e-05, "loss": 0.7161, "reward": 0.054812145233154294, "reward_std": 0.01056510213547881, "rewards/reward_func_1": 0.054812145233154294, "step": 745 }, { "completion_length": 2.0, "epoch": 0.2016400053770668, "grad_norm": 0.00010621309047564864, "kl": 17.85625, "learning_rate": 1.937715321454232e-05, "loss": 0.7141, "reward": 0.05950497388839722, "reward_std": 0.011216246478579706, "rewards/reward_func_1": 0.05950497388839722, "step": 750 }, { "completion_length": 2.0, "epoch": 0.20298427207958059, "grad_norm": 7.163731061154976e-05, "kl": 17.6765625, "learning_rate": 1.9360745795562813e-05, "loss": 0.7074, "reward": 0.06266632080078124, "reward_std": 0.011260019605106208, "rewards/reward_func_1": 0.06266632080078124, "step": 755 }, { "completion_length": 2.0, "epoch": 0.20432853878209437, "grad_norm": 0.00017179730639327317, "kl": 17.6109375, "learning_rate": 1.9344132200643102e-05, "loss": 0.7048, "reward": 0.0631840705871582, "reward_std": 0.01339399583703198, "rewards/reward_func_1": 0.0631840705871582, "step": 760 }, { "completion_length": 2.0, "epoch": 0.20567280548460815, "grad_norm": 0.00033472245559096336, "kl": 17.9296875, "learning_rate": 1.9327312795707392e-05, "loss": 0.7169, "reward": 0.06261520385742188, "reward_std": 0.011459613528859335, "rewards/reward_func_1": 0.06261520385742188, "step": 765 }, { "completion_length": 2.0, "epoch": 0.20701707218712193, "grad_norm": 0.02129560336470604, "kl": 17.7078125, "learning_rate": 1.931028795121299e-05, "loss": 0.7074, "reward": 0.060090065002441406, "reward_std": 0.010715857451577904, "rewards/reward_func_1": 0.060090065002441406, "step": 770 }, { "completion_length": 2.0, "epoch": 0.2083613388896357, "grad_norm": 0.00020586424216162413, "kl": 17.7390625, "learning_rate": 1.9293058042142117e-05, "loss": 0.7097, "reward": 0.05955848693847656, "reward_std": 0.010681279400523635, "rewards/reward_func_1": 0.05955848693847656, "step": 775 }, { "completion_length": 2.0, "epoch": 0.20970560559214949, "grad_norm": 0.00016600097296759486, "kl": 17.5921875, "learning_rate": 1.9275623447993678e-05, "loss": 0.7034, "reward": 0.06024360656738281, "reward_std": 0.010565872873849002, "rewards/reward_func_1": 0.06024360656738281, "step": 780 }, { "completion_length": 2.0, "epoch": 0.21104987229466327, "grad_norm": 0.0002437598304823041, "kl": 17.684375, "learning_rate": 1.9257984552774874e-05, "loss": 0.7073, "reward": 0.06276130676269531, "reward_std": 0.013013198171756812, "rewards/reward_func_1": 0.06276130676269531, "step": 785 }, { "completion_length": 2.0, "epoch": 0.21239413899717705, "grad_norm": 0.00045892002526670694, "kl": 17.6234375, "learning_rate": 1.9240141744992763e-05, "loss": 0.7051, "reward": 0.06035938262939453, "reward_std": 0.013352590511203744, "rewards/reward_func_1": 0.06035938262939453, "step": 790 }, { "completion_length": 2.0, "epoch": 0.21373840569969083, "grad_norm": 0.00022575826733373106, "kl": 17.875, "learning_rate": 1.9222095417645695e-05, "loss": 0.7155, "reward": 0.058776569366455075, "reward_std": 0.011941832641605287, "rewards/reward_func_1": 0.058776569366455075, "step": 795 }, { "completion_length": 2.0, "epoch": 0.2150826724022046, "grad_norm": 0.0002709394320845604, "kl": 17.8890625, "learning_rate": 1.920384596821467e-05, "loss": 0.7157, "reward": 0.05806446075439453, "reward_std": 0.00929926319167862, "rewards/reward_func_1": 0.05806446075439453, "step": 800 }, { "completion_length": 2.0, "epoch": 0.21642693910471839, "grad_norm": 0.0005522365099750459, "kl": 18.215625, "learning_rate": 1.9185393798654547e-05, "loss": 0.7285, "reward": 0.060375118255615236, "reward_std": 0.012819936085725204, "rewards/reward_func_1": 0.060375118255615236, "step": 805 }, { "completion_length": 2.0, "epoch": 0.21777120580723217, "grad_norm": 0.00012125197827117518, "kl": 18.078125, "learning_rate": 1.9166739315385244e-05, "loss": 0.7234, "reward": 0.06392664909362793, "reward_std": 0.009824539528926835, "rewards/reward_func_1": 0.06392664909362793, "step": 810 }, { "completion_length": 2.0, "epoch": 0.21911547250974595, "grad_norm": 0.0004721728328149766, "kl": 17.84375, "learning_rate": 1.9147882929282734e-05, "loss": 0.7138, "reward": 0.061508560180664064, "reward_std": 0.011408517364179716, "rewards/reward_func_1": 0.061508560180664064, "step": 815 }, { "completion_length": 2.0, "epoch": 0.22045973921225973, "grad_norm": 0.0005403547547757626, "kl": 17.646875, "learning_rate": 1.9128825055670035e-05, "loss": 0.7059, "reward": 0.059009552001953125, "reward_std": 0.009685787269700086, "rewards/reward_func_1": 0.059009552001953125, "step": 820 }, { "completion_length": 2.0, "epoch": 0.22180400591477348, "grad_norm": 0.0022164226975291967, "kl": 17.9734375, "learning_rate": 1.9109566114308036e-05, "loss": 0.7187, "reward": 0.05564627647399902, "reward_std": 0.010326084749249276, "rewards/reward_func_1": 0.05564627647399902, "step": 825 }, { "completion_length": 2.35, "epoch": 0.22314827261728726, "grad_norm": 0.0012156780576333404, "kl": 17.66875, "learning_rate": 1.9090106529386263e-05, "loss": 0.7067, "reward": 0.0656036376953125, "reward_std": 0.015077763356384822, "rewards/reward_func_1": 0.0656036376953125, "step": 830 }, { "completion_length": 2.0, "epoch": 0.22449253931980104, "grad_norm": 0.0010059759952127934, "kl": 18.1734375, "learning_rate": 1.907044672951354e-05, "loss": 0.7272, "reward": 0.057573127746582034, "reward_std": 0.010585914782132022, "rewards/reward_func_1": 0.057573127746582034, "step": 835 }, { "completion_length": 2.0, "epoch": 0.22583680602231482, "grad_norm": 0.0004464346857275814, "kl": 17.9625, "learning_rate": 1.9050587147708544e-05, "loss": 0.7182, "reward": 0.06241474151611328, "reward_std": 0.009492194746417226, "rewards/reward_func_1": 0.06241474151611328, "step": 840 }, { "completion_length": 2.0, "epoch": 0.2271810727248286, "grad_norm": 0.0005409275181591511, "kl": 18.2171875, "learning_rate": 1.9030528221390255e-05, "loss": 0.7287, "reward": 0.06225318908691406, "reward_std": 0.011348171227291459, "rewards/reward_func_1": 0.06225318908691406, "step": 845 }, { "completion_length": 6.090625, "epoch": 0.22852533942734238, "grad_norm": 13.99404525756836, "kl": 17.190625, "learning_rate": 1.9010270392368343e-05, "loss": 0.6867, "reward": 0.0607336699962616, "reward_std": 0.014851068891584874, "rewards/reward_func_1": 0.0607336699962616, "step": 850 }, { "completion_length": 3.1, "epoch": 0.22986960612985616, "grad_norm": 0.0005020995158702135, "kl": 17.60625, "learning_rate": 1.898981410683343e-05, "loss": 0.7042, "reward": 0.06041567623615265, "reward_std": 0.012269638044381281, "rewards/reward_func_1": 0.06041567623615265, "step": 855 }, { "completion_length": 2.0, "epoch": 0.23121387283236994, "grad_norm": 0.0013556176563724875, "kl": 17.64375, "learning_rate": 1.8969159815347253e-05, "loss": 0.7065, "reward": 0.06363449096679688, "reward_std": 0.010352238497580402, "rewards/reward_func_1": 0.06363449096679688, "step": 860 }, { "completion_length": 2.0, "epoch": 0.23255813953488372, "grad_norm": 0.0008380432846024632, "kl": 17.7609375, "learning_rate": 1.8948307972832744e-05, "loss": 0.7101, "reward": 0.06133832931518555, "reward_std": 0.012271754596440587, "rewards/reward_func_1": 0.06133832931518555, "step": 865 }, { "completion_length": 2.0, "epoch": 0.2339024062373975, "grad_norm": 0.0016801492311060429, "kl": 17.803125, "learning_rate": 1.8927259038564023e-05, "loss": 0.7121, "reward": 0.06001472473144531, "reward_std": 0.011952074009786883, "rewards/reward_func_1": 0.06001472473144531, "step": 870 }, { "completion_length": 2.459375, "epoch": 0.23524667293991128, "grad_norm": 0.07210814952850342, "kl": 31147.065625, "learning_rate": 1.8906013476156265e-05, "loss": 1248.7868, "reward": 0.05841388702392578, "reward_std": 0.011889992751093814, "rewards/reward_func_1": 0.05841388702392578, "step": 875 }, { "completion_length": 2.028125, "epoch": 0.23659093964242506, "grad_norm": 0.04944615811109543, "kl": 17.9125, "learning_rate": 1.8884571753555495e-05, "loss": 0.7165, "reward": 0.059661483764648436, "reward_std": 0.01035475345343002, "rewards/reward_func_1": 0.059661483764648436, "step": 880 }, { "completion_length": 4.909375, "epoch": 0.23793520634493884, "grad_norm": 0.5363011360168457, "kl": 17.625, "learning_rate": 1.8862934343028288e-05, "loss": 0.7049, "reward": 0.06338434219360352, "reward_std": 0.012126463351160055, "rewards/reward_func_1": 0.06338434219360352, "step": 885 }, { "completion_length": 62.634375, "epoch": 0.23927947304745262, "grad_norm": 0.1920609325170517, "kl": 16.6390625, "learning_rate": 1.884110172115135e-05, "loss": 0.6654, "reward": 0.05351438522338867, "reward_std": 0.02005903590179514, "rewards/reward_func_1": 0.05351438522338867, "step": 890 }, { "completion_length": 2.0, "epoch": 0.2406237397499664, "grad_norm": 0.0013905576197430491, "kl": 18.1953125, "learning_rate": 1.8819074368801045e-05, "loss": 0.7282, "reward": 0.06563434600830079, "reward_std": 0.009951398673729272, "rewards/reward_func_1": 0.06563434600830079, "step": 895 }, { "completion_length": 2.0, "epoch": 0.24196800645248018, "grad_norm": 0.0008331938879564404, "kl": 18.003125, "learning_rate": 1.8796852771142778e-05, "loss": 0.7201, "reward": 0.061870574951171875, "reward_std": 0.012352473140344955, "rewards/reward_func_1": 0.061870574951171875, "step": 900 }, { "completion_length": 2.0, "epoch": 0.24331227315499396, "grad_norm": 0.00020705144561361521, "kl": 18.053125, "learning_rate": 1.8774437417620334e-05, "loss": 0.7223, "reward": 0.06816902160644531, "reward_std": 0.012587691200315021, "rewards/reward_func_1": 0.06816902160644531, "step": 905 }, { "completion_length": 2.0, "epoch": 0.24465653985750774, "grad_norm": 0.0003319174575153738, "kl": 17.8921875, "learning_rate": 1.8751828801945074e-05, "loss": 0.7151, "reward": 0.058438873291015624, "reward_std": 0.012193899090743799, "rewards/reward_func_1": 0.058438873291015624, "step": 910 }, { "completion_length": 2.0, "epoch": 0.24600080656002152, "grad_norm": 0.000264251691987738, "kl": 18.0390625, "learning_rate": 1.872902742208508e-05, "loss": 0.7217, "reward": 0.060257339477539064, "reward_std": 0.010663219789967116, "rewards/reward_func_1": 0.060257339477539064, "step": 915 }, { "completion_length": 2.0, "epoch": 0.2473450732625353, "grad_norm": 0.0001942398666869849, "kl": 17.9234375, "learning_rate": 1.8706033780254168e-05, "loss": 0.7168, "reward": 0.05674247741699219, "reward_std": 0.009646143747158931, "rewards/reward_func_1": 0.05674247741699219, "step": 920 }, { "completion_length": 2.0, "epoch": 0.24868933996504908, "grad_norm": 0.000582867010962218, "kl": 17.5609375, "learning_rate": 1.8682848382900852e-05, "loss": 0.7027, "reward": 0.06358718872070312, "reward_std": 0.01264539449075528, "rewards/reward_func_1": 0.06358718872070312, "step": 925 }, { "completion_length": 2.0, "epoch": 0.25003360666756286, "grad_norm": 0.0004526897973846644, "kl": 17.9296875, "learning_rate": 1.865947174069716e-05, "loss": 0.7172, "reward": 0.059136390686035156, "reward_std": 0.010592962511873338, "rewards/reward_func_1": 0.059136390686035156, "step": 930 }, { "completion_length": 2.0, "epoch": 0.2513778733700766, "grad_norm": 0.0003386743483133614, "kl": 17.75625, "learning_rate": 1.8635904368527406e-05, "loss": 0.7107, "reward": 0.06310138702392579, "reward_std": 0.011821250266802964, "rewards/reward_func_1": 0.06310138702392579, "step": 935 }, { "completion_length": 2.0, "epoch": 0.2527221400725904, "grad_norm": 0.00037925346987321973, "kl": 17.703125, "learning_rate": 1.861214678547685e-05, "loss": 0.7079, "reward": 0.06231670379638672, "reward_std": 0.010538342622749042, "rewards/reward_func_1": 0.06231670379638672, "step": 940 }, { "completion_length": 2.0, "epoch": 0.25406640677510417, "grad_norm": 0.0006748105515725911, "kl": 17.96875, "learning_rate": 1.858819951482026e-05, "loss": 0.7188, "reward": 0.05954875946044922, "reward_std": 0.011791958093454014, "rewards/reward_func_1": 0.05954875946044922, "step": 945 }, { "completion_length": 2.0, "epoch": 0.255410673477618, "grad_norm": 0.000709658779669553, "kl": 17.6859375, "learning_rate": 1.856406308401036e-05, "loss": 0.7072, "reward": 0.0561366081237793, "reward_std": 0.009839185555756557, "rewards/reward_func_1": 0.0561366081237793, "step": 950 }, { "completion_length": 2.0, "epoch": 0.25675494018013173, "grad_norm": 0.0004975300398655236, "kl": 17.975, "learning_rate": 1.853973802466627e-05, "loss": 0.7186, "reward": 0.0569252610206604, "reward_std": 0.009655545311397873, "rewards/reward_func_1": 0.0569252610206604, "step": 955 }, { "completion_length": 2.0, "epoch": 0.25809920688264554, "grad_norm": 0.0006206005346029997, "kl": 17.8875, "learning_rate": 1.8515224872561745e-05, "loss": 0.7151, "reward": 0.06045455932617187, "reward_std": 0.011623913834773703, "rewards/reward_func_1": 0.06045455932617187, "step": 960 }, { "completion_length": 2.0, "epoch": 0.2594434735851593, "grad_norm": 0.0007395711145363748, "kl": 18.040625, "learning_rate": 1.8490524167613405e-05, "loss": 0.7214, "reward": 0.057852745056152344, "reward_std": 0.011105244704231155, "rewards/reward_func_1": 0.057852745056152344, "step": 965 }, { "completion_length": 2.0, "epoch": 0.2607877402876731, "grad_norm": 0.0008309457916766405, "kl": 17.6515625, "learning_rate": 1.8465636453868825e-05, "loss": 0.7064, "reward": 0.06783523559570312, "reward_std": 0.009911755218854523, "rewards/reward_func_1": 0.06783523559570312, "step": 970 }, { "completion_length": 2.0, "epoch": 0.26213200699018685, "grad_norm": 0.000725765130482614, "kl": 17.36875, "learning_rate": 1.8440562279494557e-05, "loss": 0.695, "reward": 0.05620386600494385, "reward_std": 0.009591523706330918, "rewards/reward_func_1": 0.05620386600494385, "step": 975 }, { "completion_length": 2.0, "epoch": 0.26347627369270066, "grad_norm": 0.0008614324615336955, "kl": 17.940625, "learning_rate": 1.8415302196764068e-05, "loss": 0.7172, "reward": 0.06371011734008789, "reward_std": 0.012515782276750542, "rewards/reward_func_1": 0.06371011734008789, "step": 980 }, { "completion_length": 2.0, "epoch": 0.2648205403952144, "grad_norm": 0.0006585062947124243, "kl": 18.028125, "learning_rate": 1.8389856762045556e-05, "loss": 0.7213, "reward": 0.05774202346801758, "reward_std": 0.01049440445349319, "rewards/reward_func_1": 0.05774202346801758, "step": 985 }, { "completion_length": 2.0, "epoch": 0.26616480709772816, "grad_norm": 0.0015991459367796779, "kl": 17.9125, "learning_rate": 1.836422653578971e-05, "loss": 0.716, "reward": 0.061675214767456056, "reward_std": 0.010034650065063034, "rewards/reward_func_1": 0.061675214767456056, "step": 990 }, { "completion_length": 2.0, "epoch": 0.26750907380024197, "grad_norm": 0.0015729337465018034, "kl": 17.6875, "learning_rate": 1.8338412082517357e-05, "loss": 0.7081, "reward": 0.057560133934021, "reward_std": 0.010157572498792433, "rewards/reward_func_1": 0.057560133934021, "step": 995 }, { "completion_length": 2.0, "epoch": 0.2688533405027557, "grad_norm": 0.0010110485600307584, "kl": 18.175, "learning_rate": 1.8312413970807043e-05, "loss": 0.7263, "reward": 0.058531570434570315, "reward_std": 0.009644822326663416, "rewards/reward_func_1": 0.058531570434570315, "step": 1000 }, { "completion_length": 2.0, "epoch": 0.27019760720526953, "grad_norm": 0.0011084715370088816, "kl": 17.73125, "learning_rate": 1.8286232773282492e-05, "loss": 0.7093, "reward": 0.05668430328369141, "reward_std": 0.009714638943114551, "rewards/reward_func_1": 0.05668430328369141, "step": 1005 }, { "completion_length": 2.0, "epoch": 0.2715418739077833, "grad_norm": 0.0011657410068437457, "kl": 17.453125, "learning_rate": 1.8259869066600005e-05, "loss": 0.6981, "reward": 0.060795021057128903, "reward_std": 0.008335485706629698, "rewards/reward_func_1": 0.060795021057128903, "step": 1010 }, { "completion_length": 2.0, "epoch": 0.2728861406102971, "grad_norm": 0.0010427763918414712, "kl": 18.075, "learning_rate": 1.8233323431435744e-05, "loss": 0.723, "reward": 0.06016595363616943, "reward_std": 0.011402350757271052, "rewards/reward_func_1": 0.06016595363616943, "step": 1015 }, { "completion_length": 2.0, "epoch": 0.27423040731281084, "grad_norm": 0.00200888910330832, "kl": 17.9109375, "learning_rate": 1.820659645247296e-05, "loss": 0.717, "reward": 0.056774234771728514, "reward_std": 0.011336608513374813, "rewards/reward_func_1": 0.056774234771728514, "step": 1020 }, { "completion_length": 2.0, "epoch": 0.27557467401532465, "grad_norm": 0.0016117419581860304, "kl": 17.765625, "learning_rate": 1.8179688718389116e-05, "loss": 0.7105, "reward": 0.06235724687576294, "reward_std": 0.010060734899889212, "rewards/reward_func_1": 0.06235724687576294, "step": 1025 }, { "completion_length": 2.45, "epoch": 0.2769189407178384, "grad_norm": 0.0025889407843351364, "kl": 17.771875, "learning_rate": 1.8152600821842902e-05, "loss": 0.711, "reward": 0.058268165588378905, "reward_std": 0.01004049998264236, "rewards/reward_func_1": 0.058268165588378905, "step": 1030 }, { "completion_length": 2.0, "epoch": 0.2782632074203522, "grad_norm": 0.0021764549892395735, "kl": 17.765625, "learning_rate": 1.8125333359461194e-05, "loss": 0.7108, "reward": 0.05997686386108399, "reward_std": 0.010472600482171402, "rewards/reward_func_1": 0.05997686386108399, "step": 1035 }, { "completion_length": 2.0, "epoch": 0.27960747412286596, "grad_norm": 0.0027070462238043547, "kl": 18.00625, "learning_rate": 1.8097886931825916e-05, "loss": 0.72, "reward": 0.057472729682922365, "reward_std": 0.010958646313520148, "rewards/reward_func_1": 0.057472729682922365, "step": 1040 }, { "completion_length": 2.00625, "epoch": 0.28095174082537977, "grad_norm": 0.02638576552271843, "kl": 17.8140625, "learning_rate": 1.8070262143460803e-05, "loss": 0.7121, "reward": 0.06007643789052963, "reward_std": 0.01233230889774859, "rewards/reward_func_1": 0.06007643789052963, "step": 1045 }, { "completion_length": 2.4625, "epoch": 0.2822960075278935, "grad_norm": 0.38510629534721375, "kl": 17.2859375, "learning_rate": 1.8042459602818092e-05, "loss": 0.6911, "reward": 0.060521507263183595, "reward_std": 0.011037798667530296, "rewards/reward_func_1": 0.060521507263183595, "step": 1050 }, { "completion_length": 4.6375, "epoch": 0.28364027423040733, "grad_norm": 1.0965192317962646, "kl": 15.5390625, "learning_rate": 1.8014479922265117e-05, "loss": 0.6215, "reward": 0.05944366455078125, "reward_std": 0.01202023433870636, "rewards/reward_func_1": 0.05944366455078125, "step": 1055 }, { "completion_length": 68.578125, "epoch": 0.2849845409329211, "grad_norm": 136523.703125, "kl": 128.46640625, "learning_rate": 1.7986323718070826e-05, "loss": 5.144, "reward": 0.056297135353088376, "reward_std": 0.014432728511746973, "rewards/reward_func_1": 0.056297135353088376, "step": 1060 }, { "completion_length": 151.353125, "epoch": 0.2863288076354349, "grad_norm": 1.3205143213272095, "kl": 6.2421875, "learning_rate": 1.79579916103922e-05, "loss": 0.2498, "reward": 0.022794413566589355, "reward_std": 0.025071121371001936, "rewards/reward_func_1": 0.022794413566589355, "step": 1065 }, { "completion_length": 60.24375, "epoch": 0.28767307433794864, "grad_norm": 0.668519139289856, "kl": 13.2421875, "learning_rate": 1.79294842232606e-05, "loss": 0.5298, "reward": 0.02910344898700714, "reward_std": 0.021979624161031098, "rewards/reward_func_1": 0.02910344898700714, "step": 1070 }, { "completion_length": 69.1875, "epoch": 0.28901734104046245, "grad_norm": 0.38422349095344543, "kl": 15.446875, "learning_rate": 1.7900802184568024e-05, "loss": 0.6174, "reward": 0.032975000143051145, "reward_std": 0.019280125828663584, "rewards/reward_func_1": 0.032975000143051145, "step": 1075 }, { "completion_length": 1.93125, "epoch": 0.2903616077429762, "grad_norm": 0.4854346513748169, "kl": 18.4921875, "learning_rate": 1.7871946126053265e-05, "loss": 0.7396, "reward": 0.0445002555847168, "reward_std": 0.009190794143796666, "rewards/reward_func_1": 0.0445002555847168, "step": 1080 }, { "completion_length": 13.75625, "epoch": 0.29170587444549, "grad_norm": 0.23374588787555695, "kl": 16.453125, "learning_rate": 1.784291668328801e-05, "loss": 0.658, "reward": 0.04094771146774292, "reward_std": 0.010110658951089136, "rewards/reward_func_1": 0.04094771146774292, "step": 1085 }, { "completion_length": 55.621875, "epoch": 0.29305014114800376, "grad_norm": 0.29550668597221375, "kl": 14.3140625, "learning_rate": 1.781371449566284e-05, "loss": 0.5726, "reward": 0.039327383041381836, "reward_std": 0.014201272143691313, "rewards/reward_func_1": 0.039327383041381836, "step": 1090 }, { "completion_length": 2.878125, "epoch": 0.29439440785051757, "grad_norm": 0.15203788876533508, "kl": 16.3359375, "learning_rate": 1.7784340206373135e-05, "loss": 0.6532, "reward": 0.04413075447082519, "reward_std": 0.007815522653254447, "rewards/reward_func_1": 0.04413075447082519, "step": 1095 }, { "completion_length": 2.740625, "epoch": 0.2957386745530313, "grad_norm": 0.1516532450914383, "kl": 17.3421875, "learning_rate": 1.7754794462404924e-05, "loss": 0.6937, "reward": 0.045378980413079265, "reward_std": 0.008754154351481701, "rewards/reward_func_1": 0.045378980413079265, "step": 1100 }, { "completion_length": 2.86875, "epoch": 0.2970829412555451, "grad_norm": 0.3617139160633087, "kl": 17.16875, "learning_rate": 1.772507791452062e-05, "loss": 0.687, "reward": 0.03869695663452148, "reward_std": 0.008698664297844516, "rewards/reward_func_1": 0.03869695663452148, "step": 1105 }, { "completion_length": 6.446875, "epoch": 0.2984272079580589, "grad_norm": 0.12706607580184937, "kl": 16.409375, "learning_rate": 1.7695191217244694e-05, "loss": 0.6564, "reward": 0.04749107360839844, "reward_std": 0.008462011188385077, "rewards/reward_func_1": 0.04749107360839844, "step": 1110 }, { "completion_length": 72.753125, "epoch": 0.29977147466057263, "grad_norm": 0.321855753660202, "kl": 13.3921875, "learning_rate": 1.766513502884926e-05, "loss": 0.5358, "reward": 0.04237784147262573, "reward_std": 0.0151958847156493, "rewards/reward_func_1": 0.04237784147262573, "step": 1115 }, { "completion_length": 131.153125, "epoch": 0.30111574136308644, "grad_norm": 0.1894012689590454, "kl": 12.8296875, "learning_rate": 1.7634910011339576e-05, "loss": 0.5134, "reward": 0.03622118234634399, "reward_std": 0.016812963741540444, "rewards/reward_func_1": 0.03622118234634399, "step": 1120 }, { "completion_length": 225.79375, "epoch": 0.3024600080656002, "grad_norm": 0.6886034607887268, "kl": 119.9203125, "learning_rate": 1.7604516830439447e-05, "loss": 4.8239, "reward": 0.028684809803962708, "reward_std": 0.020281461635022424, "rewards/reward_func_1": 0.028684809803962708, "step": 1125 }, { "completion_length": 733.759375, "epoch": 0.303804274768114, "grad_norm": 0.5576857328414917, "kl": 7.11328125, "learning_rate": 1.7573956155576596e-05, "loss": 0.2844, "reward": 0.007758472859859466, "reward_std": 0.014932763832621276, "rewards/reward_func_1": 0.007758472859859466, "step": 1130 }, { "completion_length": 251.840625, "epoch": 0.30514854147062775, "grad_norm": 1.2845573425292969, "kl": 421385.557421875, "learning_rate": 1.7543228659867887e-05, "loss": 16848.1047, "reward": 0.007775214128196239, "reward_std": 0.017382631546934136, "rewards/reward_func_1": 0.007775214128196239, "step": 1135 }, { "completion_length": 2.09375, "epoch": 0.30649280817314156, "grad_norm": 1.5478886365890503, "kl": 18.3546875, "learning_rate": 1.7512335020104507e-05, "loss": 0.7346, "reward": 0.042022180557250974, "reward_std": 0.011799084773520008, "rewards/reward_func_1": 0.042022180557250974, "step": 1140 }, { "completion_length": 2.44375, "epoch": 0.3078370748756553, "grad_norm": 0.4710218012332916, "kl": 18.0078125, "learning_rate": 1.7481275916737077e-05, "loss": 0.7209, "reward": 0.043611574172973636, "reward_std": 0.016114455633214675, "rewards/reward_func_1": 0.043611574172973636, "step": 1145 }, { "completion_length": 3.021875, "epoch": 0.3091813415781691, "grad_norm": 1.128142237663269, "kl": 17.0234375, "learning_rate": 1.7450052033860643e-05, "loss": 0.681, "reward": 0.04341961294412613, "reward_std": 0.01588066411204636, "rewards/reward_func_1": 0.04341961294412613, "step": 1150 }, { "completion_length": 2.2375, "epoch": 0.3105256082806829, "grad_norm": 1.5875157117843628, "kl": 19.2234375, "learning_rate": 1.7418664059199615e-05, "loss": 0.7687, "reward": 0.05236520916223526, "reward_std": 0.013312915465940022, "rewards/reward_func_1": 0.05236520916223526, "step": 1155 }, { "completion_length": 2.0, "epoch": 0.3118698749831967, "grad_norm": 0.0017928759334608912, "kl": 20.63125, "learning_rate": 1.738711268409263e-05, "loss": 0.8251, "reward": 0.05562934875488281, "reward_std": 0.00945893834286835, "rewards/reward_func_1": 0.05562934875488281, "step": 1160 }, { "completion_length": 2.0, "epoch": 0.31321414168571043, "grad_norm": 0.003861919976770878, "kl": 21.05625, "learning_rate": 1.73553986034773e-05, "loss": 0.8425, "reward": 0.05521247386932373, "reward_std": 0.0097390030954557, "rewards/reward_func_1": 0.05521247386932373, "step": 1165 }, { "completion_length": 2.0, "epoch": 0.31455840838822424, "grad_norm": 0.003637129906564951, "kl": 20.484375, "learning_rate": 1.7323522515874945e-05, "loss": 0.8202, "reward": 0.05392255783081055, "reward_std": 0.009917261235386832, "rewards/reward_func_1": 0.05392255783081055, "step": 1170 }, { "completion_length": 2.0, "epoch": 0.315902675090738, "grad_norm": 0.006247695069760084, "kl": 20.565625, "learning_rate": 1.7291485123375164e-05, "loss": 0.8229, "reward": 0.056228256225585936, "reward_std": 0.009558047083555721, "rewards/reward_func_1": 0.056228256225585936, "step": 1175 }, { "completion_length": 1.99375, "epoch": 0.3172469417932518, "grad_norm": 0.004366494249552488, "kl": 20.55, "learning_rate": 1.72592871316204e-05, "loss": 0.8223, "reward": 0.062088823318481444, "reward_std": 0.009840463204818661, "rewards/reward_func_1": 0.062088823318481444, "step": 1180 }, { "completion_length": 2.0, "epoch": 0.31859120849576555, "grad_norm": 0.002056930446997285, "kl": 20.60625, "learning_rate": 1.722692924979039e-05, "loss": 0.8245, "reward": 0.05960988998413086, "reward_std": 0.008964826199371601, "rewards/reward_func_1": 0.05960988998413086, "step": 1185 }, { "completion_length": 2.0, "epoch": 0.31993547519827936, "grad_norm": 0.015865160152316093, "kl": 21.034375, "learning_rate": 1.719441219058654e-05, "loss": 0.8416, "reward": 0.05759906768798828, "reward_std": 0.009656935631937813, "rewards/reward_func_1": 0.05759906768798828, "step": 1190 }, { "completion_length": 2.003125, "epoch": 0.3212797419007931, "grad_norm": 0.0372232086956501, "kl": 20.875, "learning_rate": 1.7161736670216233e-05, "loss": 0.8354, "reward": 0.05712289810180664, "reward_std": 0.007381719051045366, "rewards/reward_func_1": 0.05712289810180664, "step": 1195 }, { "completion_length": 2.003125, "epoch": 0.3226240086033069, "grad_norm": 0.8008459210395813, "kl": 20.2625, "learning_rate": 1.7128903408377053e-05, "loss": 0.8105, "reward": 0.05510530471801758, "reward_std": 0.011877764340533758, "rewards/reward_func_1": 0.05510530471801758, "step": 1200 }, { "completion_length": 2.13125, "epoch": 0.3239682753058207, "grad_norm": 0.0008978499681688845, "kl": 20.628125, "learning_rate": 1.7095913128240936e-05, "loss": 0.8251, "reward": 0.05747789740562439, "reward_std": 0.011763529140444007, "rewards/reward_func_1": 0.05747789740562439, "step": 1205 }, { "completion_length": 2.0, "epoch": 0.3253125420083344, "grad_norm": 0.0013102364027872682, "kl": 20.58125, "learning_rate": 1.7062766556438233e-05, "loss": 0.8234, "reward": 0.05572299957275391, "reward_std": 0.008765837910323171, "rewards/reward_func_1": 0.05572299957275391, "step": 1210 }, { "completion_length": 2.0, "epoch": 0.32665680871084823, "grad_norm": 0.0024437177926301956, "kl": 20.46875, "learning_rate": 1.7029464423041713e-05, "loss": 0.8187, "reward": 0.05510997772216797, "reward_std": 0.008241662751242985, "rewards/reward_func_1": 0.05510997772216797, "step": 1215 }, { "completion_length": 2.0, "epoch": 0.328001075413362, "grad_norm": 0.0028229840099811554, "kl": 20.66875, "learning_rate": 1.6996007461550483e-05, "loss": 0.8269, "reward": 0.053923177719116214, "reward_std": 0.010693888347304892, "rewards/reward_func_1": 0.053923177719116214, "step": 1220 }, { "completion_length": 2.0, "epoch": 0.3293453421158758, "grad_norm": 0.0020496586803346872, "kl": 20.678125, "learning_rate": 1.6962396408873826e-05, "loss": 0.8276, "reward": 0.056081295013427734, "reward_std": 0.00998476523818681, "rewards/reward_func_1": 0.056081295013427734, "step": 1225 }, { "completion_length": 2.0, "epoch": 0.33068960881838955, "grad_norm": 0.002698215888813138, "kl": 20.396875, "learning_rate": 1.6928632005314983e-05, "loss": 0.8162, "reward": 0.05358821749687195, "reward_std": 0.010563099296268775, "rewards/reward_func_1": 0.05358821749687195, "step": 1230 }, { "completion_length": 2.0, "epoch": 0.33203387552090335, "grad_norm": 0.004789648577570915, "kl": 20.478125, "learning_rate": 1.689471499455482e-05, "loss": 0.8191, "reward": 0.05402927398681641, "reward_std": 0.009172404053242645, "rewards/reward_func_1": 0.05402927398681641, "step": 1235 }, { "completion_length": 2.0, "epoch": 0.3333781422234171, "grad_norm": 0.010540174320340157, "kl": 20.75625, "learning_rate": 1.6860646123635482e-05, "loss": 0.8302, "reward": 0.05363121032714844, "reward_std": 0.010847341820772271, "rewards/reward_func_1": 0.05363121032714844, "step": 1240 }, { "completion_length": 2.01875, "epoch": 0.3347224089259309, "grad_norm": 0.8453480005264282, "kl": 20.55625, "learning_rate": 1.6826426142943925e-05, "loss": 0.8223, "reward": 0.05549154281616211, "reward_std": 0.010959918500157073, "rewards/reward_func_1": 0.05549154281616211, "step": 1245 }, { "completion_length": 2.65625, "epoch": 0.33606667562844467, "grad_norm": 0.7238678932189941, "kl": 16.04375, "learning_rate": 1.679205580619538e-05, "loss": 0.6421, "reward": 0.060180139541625974, "reward_std": 0.012135649514675606, "rewards/reward_func_1": 0.060180139541625974, "step": 1250 }, { "completion_length": 2.0, "epoch": 0.3374109423309585, "grad_norm": 0.06061722710728645, "kl": 15.15, "learning_rate": 1.6757535870416755e-05, "loss": 0.6056, "reward": 0.06041898727416992, "reward_std": 0.010352135712309973, "rewards/reward_func_1": 0.06041898727416992, "step": 1255 }, { "completion_length": 2.0, "epoch": 0.3387552090334722, "grad_norm": 0.6215311884880066, "kl": 14.9609375, "learning_rate": 1.6722867095929976e-05, "loss": 0.5983, "reward": 0.05601742267608643, "reward_std": 0.010467067039280664, "rewards/reward_func_1": 0.05601742267608643, "step": 1260 }, { "completion_length": 2.0, "epoch": 0.34009947573598603, "grad_norm": 0.005102403461933136, "kl": 13.65625, "learning_rate": 1.6688050246335216e-05, "loss": 0.5462, "reward": 0.05668201446533203, "reward_std": 0.011318438543821686, "rewards/reward_func_1": 0.05668201446533203, "step": 1265 }, { "completion_length": 2.0, "epoch": 0.3414437424384998, "grad_norm": 0.05287908762693405, "kl": 13.490625, "learning_rate": 1.6653086088494106e-05, "loss": 0.5396, "reward": 0.05806665420532227, "reward_std": 0.011723793356213718, "rewards/reward_func_1": 0.05806665420532227, "step": 1270 }, { "completion_length": 2.0, "epoch": 0.3427880091410136, "grad_norm": 0.00010848957026610151, "kl": 13.9109375, "learning_rate": 1.6617975392512812e-05, "loss": 0.5563, "reward": 0.06332006454467773, "reward_std": 0.012813021524925717, "rewards/reward_func_1": 0.06332006454467773, "step": 1275 }, { "completion_length": 1.9875, "epoch": 0.34413227584352735, "grad_norm": 0.0001873042929219082, "kl": 13.890625, "learning_rate": 1.6582718931725094e-05, "loss": 0.5556, "reward": 0.05860910415649414, "reward_std": 0.012335632972826716, "rewards/reward_func_1": 0.05860910415649414, "step": 1280 }, { "completion_length": 2.0, "epoch": 0.34547654254604115, "grad_norm": 6.566791398654459e-06, "kl": 13.903125, "learning_rate": 1.6547317482675277e-05, "loss": 0.5563, "reward": 0.05752272605895996, "reward_std": 0.010450741471140645, "rewards/reward_func_1": 0.05752272605895996, "step": 1285 }, { "completion_length": 2.0, "epoch": 0.3468208092485549, "grad_norm": 1.5863972748775268e-06, "kl": 13.7296875, "learning_rate": 1.651177182510112e-05, "loss": 0.5491, "reward": 0.054108810424804685, "reward_std": 0.010263701246731215, "rewards/reward_func_1": 0.054108810424804685, "step": 1290 }, { "completion_length": 2.0, "epoch": 0.3481650759510687, "grad_norm": 3.900764113495825e-06, "kl": 13.80625, "learning_rate": 1.6476082741916677e-05, "loss": 0.5522, "reward": 0.06382217407226562, "reward_std": 0.01203576557818451, "rewards/reward_func_1": 0.06382217407226562, "step": 1295 }, { "completion_length": 2.0, "epoch": 0.34950934265358247, "grad_norm": 2.0937131921527907e-06, "kl": 13.7046875, "learning_rate": 1.644025101919503e-05, "loss": 0.5484, "reward": 0.05921125411987305, "reward_std": 0.011942052780796075, "rewards/reward_func_1": 0.05921125411987305, "step": 1300 }, { "completion_length": 2.0, "epoch": 0.3508536093560963, "grad_norm": 1.797810909920372e-06, "kl": 13.8578125, "learning_rate": 1.6404277446150968e-05, "loss": 0.5542, "reward": 0.061875534057617185, "reward_std": 0.010323166584566935, "rewards/reward_func_1": 0.061875534057617185, "step": 1305 }, { "completion_length": 2.0, "epoch": 0.35219787605861, "grad_norm": 4.8423517000628635e-06, "kl": 13.696875, "learning_rate": 1.6368162815123637e-05, "loss": 0.5476, "reward": 0.05844669342041016, "reward_std": 0.01120743685751222, "rewards/reward_func_1": 0.05844669342041016, "step": 1310 }, { "completion_length": 2.0, "epoch": 0.35354214276112383, "grad_norm": 1.9106237232335843e-05, "kl": 13.6734375, "learning_rate": 1.633190792155906e-05, "loss": 0.5468, "reward": 0.05963554382324219, "reward_std": 0.011439791695011081, "rewards/reward_func_1": 0.05963554382324219, "step": 1315 }, { "completion_length": 2.0, "epoch": 0.3548864094636376, "grad_norm": 3.11785652229446e-06, "kl": 13.5453125, "learning_rate": 1.629551356399262e-05, "loss": 0.5419, "reward": 0.06005144119262695, "reward_std": 0.00977201181158307, "rewards/reward_func_1": 0.06005144119262695, "step": 1320 }, { "completion_length": 2.0, "epoch": 0.35623067616615134, "grad_norm": 3.1484196370001882e-06, "kl": 13.89375, "learning_rate": 1.625898054403148e-05, "loss": 0.5557, "reward": 0.06197786331176758, "reward_std": 0.010603644404909573, "rewards/reward_func_1": 0.06197786331176758, "step": 1325 }, { "completion_length": 2.0, "epoch": 0.35757494286866515, "grad_norm": 2.623250566102797e-06, "kl": 14.0703125, "learning_rate": 1.6222309666336933e-05, "loss": 0.5626, "reward": 0.06794366836547852, "reward_std": 0.01082740986457793, "rewards/reward_func_1": 0.06794366836547852, "step": 1330 }, { "completion_length": 2.0, "epoch": 0.3589192095711789, "grad_norm": 1.4281185940490104e-06, "kl": 13.784375, "learning_rate": 1.6185501738606654e-05, "loss": 0.5515, "reward": 0.05785388946533203, "reward_std": 0.009857135304082476, "rewards/reward_func_1": 0.05785388946533203, "step": 1335 }, { "completion_length": 2.0, "epoch": 0.3602634762736927, "grad_norm": 1.23358740893309e-05, "kl": 13.928125, "learning_rate": 1.614855757155693e-05, "loss": 0.5574, "reward": 0.061734676361083984, "reward_std": 0.012116704345680773, "rewards/reward_func_1": 0.061734676361083984, "step": 1340 }, { "completion_length": 2.0, "epoch": 0.36160774297620646, "grad_norm": 3.1937454423314193e-06, "kl": 13.4515625, "learning_rate": 1.6111477978904813e-05, "loss": 0.5378, "reward": 0.05473334789276123, "reward_std": 0.010365012554575514, "rewards/reward_func_1": 0.05473334789276123, "step": 1345 }, { "completion_length": 2.0, "epoch": 0.36295200967872027, "grad_norm": 2.7216408398089698e-06, "kl": 13.628125, "learning_rate": 1.6074263777350167e-05, "loss": 0.5452, "reward": 0.0586578369140625, "reward_std": 0.010139925488329028, "rewards/reward_func_1": 0.0586578369140625, "step": 1350 }, { "completion_length": 2.0, "epoch": 0.364296276381234, "grad_norm": 1.4683068911836017e-06, "kl": 14.2203125, "learning_rate": 1.6036915786557705e-05, "loss": 0.569, "reward": 0.057494735717773436, "reward_std": 0.00982674182887422, "rewards/reward_func_1": 0.057494735717773436, "step": 1355 }, { "completion_length": 2.0, "epoch": 0.3656405430837478, "grad_norm": 2.2047534002922475e-06, "kl": 13.690625, "learning_rate": 1.5999434829138923e-05, "loss": 0.5477, "reward": 0.058840179443359376, "reward_std": 0.009708692382264416, "rewards/reward_func_1": 0.058840179443359376, "step": 1360 }, { "completion_length": 2.0, "epoch": 0.3669848097862616, "grad_norm": 8.66782011144096e-06, "kl": 13.884375, "learning_rate": 1.5961821730633986e-05, "loss": 0.5552, "reward": 0.06289253234863282, "reward_std": 0.013519753767468501, "rewards/reward_func_1": 0.06289253234863282, "step": 1365 }, { "completion_length": 2.0, "epoch": 0.3683290764887754, "grad_norm": 2.6345257992943516e-06, "kl": 13.71875, "learning_rate": 1.5924077319493546e-05, "loss": 0.5486, "reward": 0.05802221298217773, "reward_std": 0.010079689413032611, "rewards/reward_func_1": 0.05802221298217773, "step": 1370 }, { "completion_length": 2.0, "epoch": 0.36967334319128914, "grad_norm": 1.2998112651985139e-05, "kl": 13.8421875, "learning_rate": 1.5886202427060493e-05, "loss": 0.5539, "reward": 0.06998028755187988, "reward_std": 0.011192185156687628, "rewards/reward_func_1": 0.06998028755187988, "step": 1375 }, { "completion_length": 2.0, "epoch": 0.37101760989380295, "grad_norm": 5.262471859168727e-06, "kl": 13.78125, "learning_rate": 1.5848197887551643e-05, "loss": 0.5507, "reward": 0.05722208023071289, "reward_std": 0.010396837347070687, "rewards/reward_func_1": 0.05722208023071289, "step": 1380 }, { "completion_length": 2.0, "epoch": 0.3723618765963167, "grad_norm": 3.908398866769858e-06, "kl": 13.89375, "learning_rate": 1.5810064538039368e-05, "loss": 0.556, "reward": 0.059538209438323976, "reward_std": 0.010347902441571933, "rewards/reward_func_1": 0.059538209438323976, "step": 1385 }, { "completion_length": 2.0, "epoch": 0.3737061432988305, "grad_norm": 8.753636393521447e-06, "kl": 14.0859375, "learning_rate": 1.577180321843315e-05, "loss": 0.5638, "reward": 0.06107792854309082, "reward_std": 0.010959995364828501, "rewards/reward_func_1": 0.06107792854309082, "step": 1390 }, { "completion_length": 2.0, "epoch": 0.37505041000134426, "grad_norm": 1.4902374232406146e-06, "kl": 13.696875, "learning_rate": 1.5733414771461094e-05, "loss": 0.5476, "reward": 0.06554374694824219, "reward_std": 0.012540119105688063, "rewards/reward_func_1": 0.06554374694824219, "step": 1395 }, { "completion_length": 2.0, "epoch": 0.37639467670385807, "grad_norm": 3.1805816433916334e-06, "kl": 13.6890625, "learning_rate": 1.569490004265136e-05, "loss": 0.5474, "reward": 0.06188135147094727, "reward_std": 0.008616514109598938, "rewards/reward_func_1": 0.06188135147094727, "step": 1400 }, { "completion_length": 2.0, "epoch": 0.3777389434063718, "grad_norm": 1.402103134751087e-05, "kl": 13.7421875, "learning_rate": 1.5656259880313528e-05, "loss": 0.5496, "reward": 0.06100940704345703, "reward_std": 0.010816287656780332, "rewards/reward_func_1": 0.06100940704345703, "step": 1405 }, { "completion_length": 2.0, "epoch": 0.3790832101088856, "grad_norm": 2.776348765110015e-06, "kl": 13.56875, "learning_rate": 1.5617495135519946e-05, "loss": 0.5429, "reward": 0.05631539821624756, "reward_std": 0.012434210258652456, "rewards/reward_func_1": 0.05631539821624756, "step": 1410 }, { "completion_length": 2.0, "epoch": 0.3804274768113994, "grad_norm": 2.385505240454222e-06, "kl": 14.0, "learning_rate": 1.557860666208695e-05, "loss": 0.56, "reward": 0.05535392761230469, "reward_std": 0.01153669813356828, "rewards/reward_func_1": 0.05535392761230469, "step": 1415 }, { "completion_length": 2.0, "epoch": 0.3817717435139132, "grad_norm": 6.945092536625452e-06, "kl": 13.7421875, "learning_rate": 1.553959531655607e-05, "loss": 0.5495, "reward": 0.061875534057617185, "reward_std": 0.011464458813861711, "rewards/reward_func_1": 0.061875534057617185, "step": 1420 }, { "completion_length": 2.0, "epoch": 0.38311601021642694, "grad_norm": 1.5258659004757646e-05, "kl": 13.609375, "learning_rate": 1.5500461958175174e-05, "loss": 0.5442, "reward": 0.05548095703125, "reward_std": 0.0085141017458227, "rewards/reward_func_1": 0.05548095703125, "step": 1425 }, { "completion_length": 2.0, "epoch": 0.3844602769189407, "grad_norm": 3.450870644883253e-05, "kl": 13.884375, "learning_rate": 1.546120744887954e-05, "loss": 0.5551, "reward": 0.05991678237915039, "reward_std": 0.011126938453890034, "rewards/reward_func_1": 0.05991678237915039, "step": 1430 }, { "completion_length": 2.0, "epoch": 0.3858045436214545, "grad_norm": 4.157363036938477e-06, "kl": 13.8671875, "learning_rate": 1.5421832653272845e-05, "loss": 0.5547, "reward": 0.05983428955078125, "reward_std": 0.009211386787137598, "rewards/reward_func_1": 0.05983428955078125, "step": 1435 }, { "completion_length": 2.0, "epoch": 0.38714881032396825, "grad_norm": 3.8689913708367385e-06, "kl": 13.85625, "learning_rate": 1.5382338438608165e-05, "loss": 0.5545, "reward": 0.06216297149658203, "reward_std": 0.009720365148677957, "rewards/reward_func_1": 0.06216297149658203, "step": 1440 }, { "completion_length": 2.0, "epoch": 0.38849307702648206, "grad_norm": 3.0080229862505803e-06, "kl": 13.74375, "learning_rate": 1.5342725674768844e-05, "loss": 0.5499, "reward": 0.06219477653503418, "reward_std": 0.010860501191928051, "rewards/reward_func_1": 0.06219477653503418, "step": 1445 }, { "completion_length": 2.0, "epoch": 0.3898373437289958, "grad_norm": 2.156152959287283e-06, "kl": 13.659375, "learning_rate": 1.5302995234249335e-05, "loss": 0.5464, "reward": 0.05769138336181641, "reward_std": 0.011336278253293131, "rewards/reward_func_1": 0.05769138336181641, "step": 1450 }, { "completion_length": 2.0, "epoch": 0.3911816104315096, "grad_norm": 3.143540379824117e-05, "kl": 13.565625, "learning_rate": 1.5263147992135998e-05, "loss": 0.5427, "reward": 0.057453060150146486, "reward_std": 0.011146099481265992, "rewards/reward_func_1": 0.057453060150146486, "step": 1455 }, { "completion_length": 2.0, "epoch": 0.39252587713402337, "grad_norm": 2.1904502318648156e-06, "kl": 13.6375, "learning_rate": 1.5223184826087811e-05, "loss": 0.5455, "reward": 0.060272598266601564, "reward_std": 0.012035325131728314, "rewards/reward_func_1": 0.060272598266601564, "step": 1460 }, { "completion_length": 2.0, "epoch": 0.3938701438365372, "grad_norm": 3.4340512229391607e-06, "kl": 13.9953125, "learning_rate": 1.5183106616317048e-05, "loss": 0.5596, "reward": 0.06144716739654541, "reward_std": 0.013058099864429096, "rewards/reward_func_1": 0.06144716739654541, "step": 1465 }, { "completion_length": 2.0, "epoch": 0.39521441053905093, "grad_norm": 2.2754304609406972e-06, "kl": 14.1265625, "learning_rate": 1.5142914245569885e-05, "loss": 0.5651, "reward": 0.057547581195831296, "reward_std": 0.009754268628603313, "rewards/reward_func_1": 0.057547581195831296, "step": 1470 }, { "completion_length": 2.0, "epoch": 0.39655867724156474, "grad_norm": 2.4745954760874156e-06, "kl": 13.73125, "learning_rate": 1.5102608599106966e-05, "loss": 0.5491, "reward": 0.061440467834472656, "reward_std": 0.010538783113224781, "rewards/reward_func_1": 0.061440467834472656, "step": 1475 }, { "completion_length": 2.0, "epoch": 0.3979029439440785, "grad_norm": 6.128909262770321e-06, "kl": 13.5640625, "learning_rate": 1.5062190564683893e-05, "loss": 0.5427, "reward": 0.057086181640625, "reward_std": 0.01106494044579449, "rewards/reward_func_1": 0.057086181640625, "step": 1480 }, { "completion_length": 2.0, "epoch": 0.3992472106465923, "grad_norm": 1.8235305105918087e-06, "kl": 13.7984375, "learning_rate": 1.5021661032531692e-05, "loss": 0.552, "reward": 0.058293724060058595, "reward_std": 0.010370958992280067, "rewards/reward_func_1": 0.058293724060058595, "step": 1485 }, { "completion_length": 2.0, "epoch": 0.40059147734910605, "grad_norm": 2.1967098291497678e-06, "kl": 13.8296875, "learning_rate": 1.4981020895337175e-05, "loss": 0.5532, "reward": 0.05586849227547645, "reward_std": 0.011479038602556103, "rewards/reward_func_1": 0.05586849227547645, "step": 1490 }, { "completion_length": 2.0, "epoch": 0.40193574405161986, "grad_norm": 1.2756601108776522e-06, "kl": 13.4796875, "learning_rate": 1.4940271048223307e-05, "loss": 0.5394, "reward": 0.05812692642211914, "reward_std": 0.012107894703513011, "rewards/reward_func_1": 0.05812692642211914, "step": 1495 }, { "completion_length": 2.0, "epoch": 0.4032800107541336, "grad_norm": 1.9987196537840646e-06, "kl": 13.553125, "learning_rate": 1.4899412388729472e-05, "loss": 0.5421, "reward": 0.051792049407958986, "reward_std": 0.01174548725830391, "rewards/reward_func_1": 0.051792049407958986, "step": 1500 }, { "completion_length": 2.0, "epoch": 0.4046242774566474, "grad_norm": 1.8275987940796767e-06, "kl": 13.9421875, "learning_rate": 1.4858445816791718e-05, "loss": 0.5575, "reward": 0.05959300994873047, "reward_std": 0.009343751921551301, "rewards/reward_func_1": 0.05959300994873047, "step": 1505 }, { "completion_length": 2.0, "epoch": 0.40596854415916117, "grad_norm": 5.036072707298445e-06, "kl": 13.7765625, "learning_rate": 1.4817372234722918e-05, "loss": 0.551, "reward": 0.06010627746582031, "reward_std": 0.01253571416818886, "rewards/reward_func_1": 0.06010627746582031, "step": 1510 }, { "completion_length": 2.0, "epoch": 0.407312810861675, "grad_norm": 2.4326691345777363e-06, "kl": 14.025, "learning_rate": 1.4776192547192915e-05, "loss": 0.5612, "reward": 0.06703472137451172, "reward_std": 0.010542747608269565, "rewards/reward_func_1": 0.06703472137451172, "step": 1515 }, { "completion_length": 2.0, "epoch": 0.40865707756418873, "grad_norm": 2.35791821978637e-06, "kl": 13.6375, "learning_rate": 1.4734907661208587e-05, "loss": 0.5454, "reward": 0.05951080322265625, "reward_std": 0.009829605106278904, "rewards/reward_func_1": 0.05951080322265625, "step": 1520 }, { "completion_length": 2.0, "epoch": 0.41000134426670254, "grad_norm": 4.144026206631679e-06, "kl": 13.9828125, "learning_rate": 1.469351848609386e-05, "loss": 0.5595, "reward": 0.05874214172363281, "reward_std": 0.008459481771024003, "rewards/reward_func_1": 0.05874214172363281, "step": 1525 }, { "completion_length": 2.0, "epoch": 0.4113456109692163, "grad_norm": 4.286873263481539e-06, "kl": 13.88125, "learning_rate": 1.4652025933469705e-05, "loss": 0.5551, "reward": 0.06733989715576172, "reward_std": 0.01017450345098041, "rewards/reward_func_1": 0.06733989715576172, "step": 1530 }, { "completion_length": 2.0, "epoch": 0.4126898776717301, "grad_norm": 6.012166977598099e-06, "kl": 13.8875, "learning_rate": 1.461043091723403e-05, "loss": 0.5554, "reward": 0.06552686691284179, "reward_std": 0.010358074885152746, "rewards/reward_func_1": 0.06552686691284179, "step": 1535 }, { "completion_length": 2.0, "epoch": 0.41403414437424385, "grad_norm": 2.3474919998989208e-06, "kl": 13.9390625, "learning_rate": 1.4568734353541572e-05, "loss": 0.5574, "reward": 0.058895301818847653, "reward_std": 0.012217024579877033, "rewards/reward_func_1": 0.058895301818847653, "step": 1540 }, { "completion_length": 2.0, "epoch": 0.4153784110767576, "grad_norm": 3.684157127281651e-05, "kl": 13.8734375, "learning_rate": 1.4526937160783707e-05, "loss": 0.555, "reward": 0.05571174621582031, "reward_std": 0.009185398211957362, "rewards/reward_func_1": 0.05571174621582031, "step": 1545 }, { "completion_length": 2.0, "epoch": 0.4167226777792714, "grad_norm": 1.638373532841797e-06, "kl": 14.0515625, "learning_rate": 1.4485040259568228e-05, "loss": 0.5622, "reward": 0.06209487915039062, "reward_std": 0.010831043922371464, "rewards/reward_func_1": 0.06209487915039062, "step": 1550 }, { "completion_length": 2.0, "epoch": 0.41806694448178516, "grad_norm": 1.416817667632131e-06, "kl": 13.9046875, "learning_rate": 1.4443044572699058e-05, "loss": 0.556, "reward": 0.06337127685546876, "reward_std": 0.00729374265865772, "rewards/reward_func_1": 0.06337127685546876, "step": 1555 }, { "completion_length": 2.0, "epoch": 0.41941121118429897, "grad_norm": 4.485429144551745e-06, "kl": 13.6140625, "learning_rate": 1.440095102515595e-05, "loss": 0.5445, "reward": 0.060857629776000975, "reward_std": 0.011825820308149559, "rewards/reward_func_1": 0.060857629776000975, "step": 1560 }, { "completion_length": 2.0, "epoch": 0.4207554778868127, "grad_norm": 2.9214272672106745e-06, "kl": 14.034375, "learning_rate": 1.4358760544074074e-05, "loss": 0.5612, "reward": 0.06148242950439453, "reward_std": 0.012079593736416427, "rewards/reward_func_1": 0.06148242950439453, "step": 1565 }, { "completion_length": 2.0, "epoch": 0.42209974458932653, "grad_norm": 2.7933485853282036e-06, "kl": 13.725, "learning_rate": 1.4316474058723635e-05, "loss": 0.549, "reward": 0.06508445739746094, "reward_std": 0.009462902668019524, "rewards/reward_func_1": 0.06508445739746094, "step": 1570 }, { "completion_length": 2.0, "epoch": 0.4234440112918403, "grad_norm": 1.132559646066511e-05, "kl": 13.465625, "learning_rate": 1.4274092500489376e-05, "loss": 0.5386, "reward": 0.06119532585144043, "reward_std": 0.010296352157456567, "rewards/reward_func_1": 0.06119532585144043, "step": 1575 }, { "completion_length": 2.0, "epoch": 0.4247882779943541, "grad_norm": 3.3802455163822742e-06, "kl": 13.753125, "learning_rate": 1.423161680285009e-05, "loss": 0.55, "reward": 0.05784816741943359, "reward_std": 0.011395523198734736, "rewards/reward_func_1": 0.05784816741943359, "step": 1580 }, { "completion_length": 2.0, "epoch": 0.42613254469686784, "grad_norm": 3.657454954009154e-06, "kl": 13.796875, "learning_rate": 1.4189047901358033e-05, "loss": 0.5516, "reward": 0.0637430191040039, "reward_std": 0.012986108286713715, "rewards/reward_func_1": 0.0637430191040039, "step": 1585 }, { "completion_length": 2.0, "epoch": 0.42747681139938165, "grad_norm": 2.5834062853391515e-06, "kl": 13.596875, "learning_rate": 1.4146386733618338e-05, "loss": 0.5439, "reward": 0.059173583984375, "reward_std": 0.011515995301306248, "rewards/reward_func_1": 0.059173583984375, "step": 1590 }, { "completion_length": 2.0, "epoch": 0.4288210781018954, "grad_norm": 4.3796212594315875e-06, "kl": 13.8921875, "learning_rate": 1.4103634239268355e-05, "loss": 0.5556, "reward": 0.06446866989135742, "reward_std": 0.009868808073224499, "rewards/reward_func_1": 0.06446866989135742, "step": 1595 }, { "completion_length": 2.0, "epoch": 0.4301653448044092, "grad_norm": 8.73978751769755e-06, "kl": 14.025, "learning_rate": 1.4060791359956956e-05, "loss": 0.5611, "reward": 0.0633920669555664, "reward_std": 0.01284162982410635, "rewards/reward_func_1": 0.0633920669555664, "step": 1600 }, { "completion_length": 2.0, "epoch": 0.43150961150692296, "grad_norm": 3.975842446379829e-06, "kl": 13.9109375, "learning_rate": 1.401785903932379e-05, "loss": 0.5564, "reward": 0.06275310516357421, "reward_std": 0.011346189048344968, "rewards/reward_func_1": 0.06275310516357421, "step": 1605 }, { "completion_length": 2.0, "epoch": 0.43285387820943677, "grad_norm": 2.2908070604898967e-06, "kl": 13.628125, "learning_rate": 1.3974838222978517e-05, "loss": 0.5454, "reward": 0.06408562660217285, "reward_std": 0.011092856073810253, "rewards/reward_func_1": 0.06408562660217285, "step": 1610 }, { "completion_length": 2.0, "epoch": 0.4341981449119505, "grad_norm": 4.72618330604746e-06, "kl": 14.3875, "learning_rate": 1.3931729858479954e-05, "loss": 0.5759, "reward": 0.06461887359619141, "reward_std": 0.009859557841264178, "rewards/reward_func_1": 0.06461887359619141, "step": 1615 }, { "completion_length": 2.0, "epoch": 0.43554241161446433, "grad_norm": 3.2984760309773264e-06, "kl": 14.025, "learning_rate": 1.3888534895315222e-05, "loss": 0.561, "reward": 0.06250219345092774, "reward_std": 0.01138484149123542, "rewards/reward_func_1": 0.06250219345092774, "step": 1620 }, { "completion_length": 2.0, "epoch": 0.4368866783169781, "grad_norm": 1.7083860939237638e-06, "kl": 13.5921875, "learning_rate": 1.384525428487883e-05, "loss": 0.5439, "reward": 0.057589149475097655, "reward_std": 0.012263275221630465, "rewards/reward_func_1": 0.057589149475097655, "step": 1625 }, { "completion_length": 2.0, "epoch": 0.4382309450194919, "grad_norm": 1.8176706362282857e-05, "kl": 13.703125, "learning_rate": 1.380188898045172e-05, "loss": 0.5484, "reward": 0.05926389694213867, "reward_std": 0.011890075955307111, "rewards/reward_func_1": 0.05926389694213867, "step": 1630 }, { "completion_length": 2.0, "epoch": 0.43957521172200564, "grad_norm": 1.6534449969185516e-06, "kl": 13.7234375, "learning_rate": 1.3758439937180269e-05, "loss": 0.5489, "reward": 0.06111717224121094, "reward_std": 0.009865944929333636, "rewards/reward_func_1": 0.06111717224121094, "step": 1635 }, { "completion_length": 2.0, "epoch": 0.44091947842451945, "grad_norm": 5.1437118600006215e-06, "kl": 13.85625, "learning_rate": 1.371490811205524e-05, "loss": 0.554, "reward": 0.06157407760620117, "reward_std": 0.012079483611159958, "rewards/reward_func_1": 0.06157407760620117, "step": 1640 }, { "completion_length": 2.0, "epoch": 0.4422637451270332, "grad_norm": 5.265788786346093e-06, "kl": 13.75, "learning_rate": 1.3671294463890734e-05, "loss": 0.5499, "reward": 0.057445335388183597, "reward_std": 0.01464254588354379, "rewards/reward_func_1": 0.057445335388183597, "step": 1645 }, { "completion_length": 2.0, "epoch": 0.44360801182954696, "grad_norm": 1.6486019376316108e-05, "kl": 14.0984375, "learning_rate": 1.3627599953303036e-05, "loss": 0.5636, "reward": 0.062333667278289796, "reward_std": 0.00997068356446107, "rewards/reward_func_1": 0.062333667278289796, "step": 1650 }, { "completion_length": 2.0, "epoch": 0.44495227853206076, "grad_norm": 1.7718589333526324e-06, "kl": 13.64375, "learning_rate": 1.3583825542689486e-05, "loss": 0.5456, "reward": 0.05308668613433838, "reward_std": 0.01278767061594408, "rewards/reward_func_1": 0.05308668613433838, "step": 1655 }, { "completion_length": 2.0, "epoch": 0.4462965452345745, "grad_norm": 5.0791340981959365e-06, "kl": 14.1375, "learning_rate": 1.353997219620726e-05, "loss": 0.5657, "reward": 0.06480164527893066, "reward_std": 0.010595990939327749, "rewards/reward_func_1": 0.06480164527893066, "step": 1660 }, { "completion_length": 2.0, "epoch": 0.4476408119370883, "grad_norm": 1.975413169930107e-06, "kl": 13.9046875, "learning_rate": 1.3496040879752146e-05, "loss": 0.5562, "reward": 0.058099555969238284, "reward_std": 0.012097873717721086, "rewards/reward_func_1": 0.058099555969238284, "step": 1665 }, { "completion_length": 2.0, "epoch": 0.4489850786396021, "grad_norm": 3.103926246694755e-06, "kl": 14.0609375, "learning_rate": 1.3452032560937271e-05, "loss": 0.5626, "reward": 0.06475410461425782, "reward_std": 0.01139398144750885, "rewards/reward_func_1": 0.06475410461425782, "step": 1670 }, { "completion_length": 2.0, "epoch": 0.4503293453421159, "grad_norm": 1.6186576203836012e-06, "kl": 14.034375, "learning_rate": 1.3407948209071779e-05, "loss": 0.5614, "reward": 0.06743978261947632, "reward_std": 0.013266765065782237, "rewards/reward_func_1": 0.06743978261947632, "step": 1675 }, { "completion_length": 2.0, "epoch": 0.45167361204462964, "grad_norm": 1.8007198377745226e-05, "kl": 14.275, "learning_rate": 1.3363788795139487e-05, "loss": 0.571, "reward": 0.06518707275390626, "reward_std": 0.011862215257133357, "rewards/reward_func_1": 0.06518707275390626, "step": 1680 }, { "completion_length": 2.0, "epoch": 0.45301787874714344, "grad_norm": 5.033749403082766e-06, "kl": 13.915625, "learning_rate": 1.3319555291777501e-05, "loss": 0.5568, "reward": 0.06435184478759766, "reward_std": 0.011379225243217661, "rewards/reward_func_1": 0.06435184478759766, "step": 1685 }, { "completion_length": 2.0, "epoch": 0.4543621454496572, "grad_norm": 6.873391612316482e-06, "kl": 13.596875, "learning_rate": 1.3275248673254788e-05, "loss": 0.544, "reward": 0.06184234619140625, "reward_std": 0.010739423423365224, "rewards/reward_func_1": 0.06184234619140625, "step": 1690 }, { "completion_length": 2.0, "epoch": 0.455706412152171, "grad_norm": 4.609265033650445e-06, "kl": 13.6984375, "learning_rate": 1.3230869915450722e-05, "loss": 0.5481, "reward": 0.05451488494873047, "reward_std": 0.010388137760855898, "rewards/reward_func_1": 0.05451488494873047, "step": 1695 }, { "completion_length": 2.0, "epoch": 0.45705067885468476, "grad_norm": 2.1986843421473168e-06, "kl": 13.5859375, "learning_rate": 1.3186419995833582e-05, "loss": 0.5436, "reward": 0.05490055084228516, "reward_std": 0.011986211253679357, "rewards/reward_func_1": 0.05490055084228516, "step": 1700 }, { "completion_length": 2.0, "epoch": 0.45839494555719856, "grad_norm": 3.9593110159330536e-06, "kl": 13.828125, "learning_rate": 1.3141899893439032e-05, "loss": 0.5533, "reward": 0.061890792846679685, "reward_std": 0.01013331833673874, "rewards/reward_func_1": 0.061890792846679685, "step": 1705 }, { "completion_length": 2.0, "epoch": 0.4597392122597123, "grad_norm": 2.6946911475533852e-06, "kl": 14.0953125, "learning_rate": 1.3097310588848555e-05, "loss": 0.5641, "reward": 0.06313896179199219, "reward_std": 0.013185867536230944, "rewards/reward_func_1": 0.06313896179199219, "step": 1710 }, { "completion_length": 2.0, "epoch": 0.4610834789622261, "grad_norm": 3.4700431115197716e-06, "kl": 13.9703125, "learning_rate": 1.3052653064167848e-05, "loss": 0.5591, "reward": 0.057857322692871097, "reward_std": 0.01133649832190713, "rewards/reward_func_1": 0.057857322692871097, "step": 1715 }, { "completion_length": 2.0, "epoch": 0.4624277456647399, "grad_norm": 1.3458391549647786e-05, "kl": 13.5859375, "learning_rate": 1.3007928303005201e-05, "loss": 0.5436, "reward": 0.05681304931640625, "reward_std": 0.010392763031995855, "rewards/reward_func_1": 0.05681304931640625, "step": 1720 }, { "completion_length": 2.0, "epoch": 0.4637720123672537, "grad_norm": 4.882398570771329e-06, "kl": 13.8015625, "learning_rate": 1.2963137290449823e-05, "loss": 0.552, "reward": 0.062281131744384766, "reward_std": 0.012929836504918057, "rewards/reward_func_1": 0.062281131744384766, "step": 1725 }, { "completion_length": 2.0, "epoch": 0.46511627906976744, "grad_norm": 6.2865415202395525e-06, "kl": 13.8671875, "learning_rate": 1.291828101305015e-05, "loss": 0.5546, "reward": 0.061757802963256836, "reward_std": 0.0114550436315767, "rewards/reward_func_1": 0.061757802963256836, "step": 1730 }, { "completion_length": 2.0, "epoch": 0.46646054577228124, "grad_norm": 2.4083929019980133e-06, "kl": 14.290625, "learning_rate": 1.2873360458792114e-05, "loss": 0.5719, "reward": 0.06473960876464843, "reward_std": 0.012107564476900734, "rewards/reward_func_1": 0.06473960876464843, "step": 1735 }, { "completion_length": 2.0, "epoch": 0.467804812474795, "grad_norm": 2.2838785298517905e-05, "kl": 13.7671875, "learning_rate": 1.2828376617077385e-05, "loss": 0.5504, "reward": 0.059980010986328124, "reward_std": 0.00752433567395201, "rewards/reward_func_1": 0.059980010986328124, "step": 1740 }, { "completion_length": 2.0, "epoch": 0.4691490791773088, "grad_norm": 2.683865886865533e-06, "kl": 13.9546875, "learning_rate": 1.2783330478701572e-05, "loss": 0.558, "reward": 0.05912628173828125, "reward_std": 0.010197188393794932, "rewards/reward_func_1": 0.05912628173828125, "step": 1745 }, { "completion_length": 2.0, "epoch": 0.47049334587982256, "grad_norm": 2.190610530306003e-06, "kl": 14.090625, "learning_rate": 1.2738223035832412e-05, "loss": 0.5638, "reward": 0.06425952911376953, "reward_std": 0.008773326113077929, "rewards/reward_func_1": 0.06425952911376953, "step": 1750 }, { "completion_length": 2.0, "epoch": 0.47183761258233636, "grad_norm": 3.1278939331969013e-06, "kl": 14.303125, "learning_rate": 1.2693055281987903e-05, "loss": 0.5719, "reward": 0.06397314071655273, "reward_std": 0.01130247107357718, "rewards/reward_func_1": 0.06397314071655273, "step": 1755 }, { "completion_length": 2.0, "epoch": 0.4731818792848501, "grad_norm": 1.2678321581915952e-05, "kl": 13.93125, "learning_rate": 1.264782821201443e-05, "loss": 0.5571, "reward": 0.057790946960449216, "reward_std": 0.010541425982955844, "rewards/reward_func_1": 0.057790946960449216, "step": 1760 }, { "completion_length": 2.0, "epoch": 0.47452614598736387, "grad_norm": 2.6648983748600585e-06, "kl": 13.5921875, "learning_rate": 1.2602542822064852e-05, "loss": 0.5438, "reward": 0.06369266510009766, "reward_std": 0.010999528719548835, "rewards/reward_func_1": 0.06369266510009766, "step": 1765 }, { "completion_length": 2.0, "epoch": 0.4758704126898777, "grad_norm": 9.32496550376527e-06, "kl": 14.321875, "learning_rate": 1.2557200109576557e-05, "loss": 0.5729, "reward": 0.061204147338867185, "reward_std": 0.013532968414074276, "rewards/reward_func_1": 0.061204147338867185, "step": 1770 }, { "completion_length": 2.0, "epoch": 0.4772146793923914, "grad_norm": 1.453070012757962e-06, "kl": 13.70625, "learning_rate": 1.2511801073249499e-05, "loss": 0.5482, "reward": 0.060839509963989256, "reward_std": 0.01080455974151846, "rewards/reward_func_1": 0.060839509963989256, "step": 1775 }, { "completion_length": 2.0, "epoch": 0.47855894609490524, "grad_norm": 2.337250180062256e-06, "kl": 13.696875, "learning_rate": 1.2466346713024194e-05, "loss": 0.5479, "reward": 0.05852031707763672, "reward_std": 0.009971881102683256, "rewards/reward_func_1": 0.05852031707763672, "step": 1780 }, { "completion_length": 2.0, "epoch": 0.479903212797419, "grad_norm": 2.7332425815984607e-06, "kl": 13.7046875, "learning_rate": 1.2420838030059704e-05, "loss": 0.5481, "reward": 0.059021949768066406, "reward_std": 0.01221724480674311, "rewards/reward_func_1": 0.059021949768066406, "step": 1785 }, { "completion_length": 2.0, "epoch": 0.4812474794999328, "grad_norm": 2.8585989184648497e-06, "kl": 13.725, "learning_rate": 1.2375276026711576e-05, "loss": 0.5493, "reward": 0.0618377685546875, "reward_std": 0.011973217026388738, "rewards/reward_func_1": 0.0618377685546875, "step": 1790 }, { "completion_length": 2.0, "epoch": 0.48259174620244655, "grad_norm": 2.026320362347178e-05, "kl": 14.0484375, "learning_rate": 1.232966170650977e-05, "loss": 0.5619, "reward": 0.062344479560852054, "reward_std": 0.009559368583722971, "rewards/reward_func_1": 0.062344479560852054, "step": 1795 }, { "completion_length": 2.0, "epoch": 0.48393601290496036, "grad_norm": 2.701003268157365e-06, "kl": 13.63125, "learning_rate": 1.2283996074136566e-05, "loss": 0.5452, "reward": 0.06439933776855469, "reward_std": 0.011508286974640214, "rewards/reward_func_1": 0.06439933776855469, "step": 1800 }, { "completion_length": 2.0, "epoch": 0.4852802796074741, "grad_norm": 3.2230218494078144e-06, "kl": 13.9390625, "learning_rate": 1.2238280135404411e-05, "loss": 0.5573, "reward": 0.06914815902709961, "reward_std": 0.010676544258603825, "rewards/reward_func_1": 0.06914815902709961, "step": 1805 }, { "completion_length": 2.0, "epoch": 0.4866245463099879, "grad_norm": 4.649204583984101e-06, "kl": 13.778125, "learning_rate": 1.2192514897233789e-05, "loss": 0.5511, "reward": 0.0602226972579956, "reward_std": 0.011667439006851054, "rewards/reward_func_1": 0.0602226972579956, "step": 1810 }, { "completion_length": 2.0, "epoch": 0.48796881301250167, "grad_norm": 1.511799382569734e-06, "kl": 13.9296875, "learning_rate": 1.2146701367631027e-05, "loss": 0.5574, "reward": 0.06371002197265625, "reward_std": 0.010727530277472396, "rewards/reward_func_1": 0.06371002197265625, "step": 1815 }, { "completion_length": 2.0, "epoch": 0.4893130797150155, "grad_norm": 4.017825631308369e-06, "kl": 13.875, "learning_rate": 1.2100840555666101e-05, "loss": 0.5552, "reward": 0.06100995540618896, "reward_std": 0.01184462348173838, "rewards/reward_func_1": 0.06100995540618896, "step": 1820 }, { "completion_length": 2.0, "epoch": 0.4906573464175292, "grad_norm": 2.1121263671375345e-06, "kl": 13.44375, "learning_rate": 1.205493347145041e-05, "loss": 0.5377, "reward": 0.056847544759511946, "reward_std": 0.013147425842907979, "rewards/reward_func_1": 0.056847544759511946, "step": 1825 }, { "completion_length": 2.0, "epoch": 0.49200161312004304, "grad_norm": 3.4769893773045624e-06, "kl": 13.3515625, "learning_rate": 1.2008981126114523e-05, "loss": 0.5341, "reward": 0.0553741455078125, "reward_std": 0.010613445060516823, "rewards/reward_func_1": 0.0553741455078125, "step": 1830 }, { "completion_length": 2.0, "epoch": 0.4933458798225568, "grad_norm": 1.7378441043547355e-05, "kl": 13.7125, "learning_rate": 1.1962984531785922e-05, "loss": 0.5482, "reward": 0.05479507446289063, "reward_std": 0.009559368582631577, "rewards/reward_func_1": 0.05479507446289063, "step": 1835 }, { "completion_length": 2.0, "epoch": 0.4946901465250706, "grad_norm": 7.995906344149262e-06, "kl": 13.79375, "learning_rate": 1.1916944701566688e-05, "loss": 0.5518, "reward": 0.062036323547363284, "reward_std": 0.010829502148044411, "rewards/reward_func_1": 0.062036323547363284, "step": 1840 }, { "completion_length": 2.0, "epoch": 0.49603441322758435, "grad_norm": 5.335733931133291e-06, "kl": 14.0375, "learning_rate": 1.1870862649511201e-05, "loss": 0.5616, "reward": 0.0632176399230957, "reward_std": 0.011605303342366823, "rewards/reward_func_1": 0.0632176399230957, "step": 1845 }, { "completion_length": 2.0, "epoch": 0.49737867993009816, "grad_norm": 3.3654257549642352e-06, "kl": 13.9046875, "learning_rate": 1.1824739390603801e-05, "loss": 0.5563, "reward": 0.06469783782958985, "reward_std": 0.008109517797129229, "rewards/reward_func_1": 0.06469783782958985, "step": 1850 }, { "completion_length": 2.0, "epoch": 0.4987229466326119, "grad_norm": 3.5574796584114665e-06, "kl": 13.8203125, "learning_rate": 1.1778575940736439e-05, "loss": 0.5526, "reward": 0.05752217769622803, "reward_std": 0.011466909085720544, "rewards/reward_func_1": 0.05752217769622803, "step": 1855 }, { "completion_length": 2.0, "epoch": 0.5000672133351257, "grad_norm": 4.107642325834604e-06, "kl": 13.8015625, "learning_rate": 1.1732373316686292e-05, "loss": 0.5522, "reward": 0.06678012609481812, "reward_std": 0.011566609619330847, "rewards/reward_func_1": 0.06678012609481812, "step": 1860 }, { "completion_length": 2.0, "epoch": 0.5014114800376395, "grad_norm": 8.976133358373772e-06, "kl": 13.6671875, "learning_rate": 1.1686132536093367e-05, "loss": 0.5469, "reward": 0.05934562683105469, "reward_std": 0.012758818920701742, "rewards/reward_func_1": 0.05934562683105469, "step": 1865 }, { "completion_length": 2.0, "epoch": 0.5027557467401532, "grad_norm": 6.34239995633834e-06, "kl": 13.81875, "learning_rate": 1.1639854617438098e-05, "loss": 0.5528, "reward": 0.05746040344238281, "reward_std": 0.010571158733364427, "rewards/reward_func_1": 0.05746040344238281, "step": 1870 }, { "completion_length": 2.0, "epoch": 0.504100013442667, "grad_norm": 5.687683824362466e-06, "kl": 13.778125, "learning_rate": 1.1593540580018904e-05, "loss": 0.5512, "reward": 0.05971870422363281, "reward_std": 0.010947331442730501, "rewards/reward_func_1": 0.05971870422363281, "step": 1875 }, { "completion_length": 2.0, "epoch": 0.5054442801451808, "grad_norm": 4.094200448889751e-06, "kl": 13.8390625, "learning_rate": 1.1547191443929738e-05, "loss": 0.5535, "reward": 0.059337806701660153, "reward_std": 0.010353339680295903, "rewards/reward_func_1": 0.059337806701660153, "step": 1880 }, { "completion_length": 2.0, "epoch": 0.5067885468476946, "grad_norm": 3.398595481485245e-06, "kl": 13.9453125, "learning_rate": 1.1500808230037628e-05, "loss": 0.5578, "reward": 0.05960531234741211, "reward_std": 0.0105580543531687, "rewards/reward_func_1": 0.05960531234741211, "step": 1885 }, { "completion_length": 2.0, "epoch": 0.5081328135502083, "grad_norm": 6.3564093579771e-06, "kl": 13.6453125, "learning_rate": 1.145439195996018e-05, "loss": 0.5457, "reward": 0.05985393524169922, "reward_std": 0.011576782021438702, "rewards/reward_func_1": 0.05985393524169922, "step": 1890 }, { "completion_length": 2.0, "epoch": 0.5094770802527221, "grad_norm": 4.291194272809662e-06, "kl": 13.728125, "learning_rate": 1.1407943656043088e-05, "loss": 0.5492, "reward": 0.062256813049316406, "reward_std": 0.01353142662846949, "rewards/reward_func_1": 0.062256813049316406, "step": 1895 }, { "completion_length": 2.0, "epoch": 0.510821346955236, "grad_norm": 1.4876853811074398e-06, "kl": 13.75625, "learning_rate": 1.1361464341337604e-05, "loss": 0.5501, "reward": 0.05925731658935547, "reward_std": 0.009176148010010366, "rewards/reward_func_1": 0.05925731658935547, "step": 1900 }, { "completion_length": 2.0, "epoch": 0.5121656136577497, "grad_norm": 5.476561454997864e-06, "kl": 14.0125, "learning_rate": 1.1314955039578017e-05, "loss": 0.5605, "reward": 0.06184120178222656, "reward_std": 0.010909009404713288, "rewards/reward_func_1": 0.06184120178222656, "step": 1905 }, { "completion_length": 2.0, "epoch": 0.5135098803602635, "grad_norm": 6.360010047501419e-06, "kl": 13.7359375, "learning_rate": 1.126841677515909e-05, "loss": 0.5494, "reward": 0.06381258964538575, "reward_std": 0.009671746863750741, "rewards/reward_func_1": 0.06381258964538575, "step": 1910 }, { "completion_length": 2.0, "epoch": 0.5148541470627772, "grad_norm": 3.01595628116047e-06, "kl": 13.8765625, "learning_rate": 1.1221850573113515e-05, "loss": 0.5552, "reward": 0.062322235107421874, "reward_std": 0.011111631775202113, "rewards/reward_func_1": 0.062322235107421874, "step": 1915 }, { "completion_length": 2.0, "epoch": 0.5161984137652911, "grad_norm": 4.450800588529091e-06, "kl": 13.7265625, "learning_rate": 1.117525745908932e-05, "loss": 0.5491, "reward": 0.06123924255371094, "reward_std": 0.011204353353969054, "rewards/reward_func_1": 0.06123924255371094, "step": 1920 }, { "completion_length": 2.0, "epoch": 0.5175426804678048, "grad_norm": 6.122299964772537e-06, "kl": 13.78125, "learning_rate": 1.1128638459327288e-05, "loss": 0.5513, "reward": 0.06520743370056152, "reward_std": 0.010965666610718471, "rewards/reward_func_1": 0.06520743370056152, "step": 1925 }, { "completion_length": 2.0, "epoch": 0.5188869471703186, "grad_norm": 2.6760865239339182e-06, "kl": 13.5875, "learning_rate": 1.1081994600638353e-05, "loss": 0.5434, "reward": 0.056897735595703124, "reward_std": 0.009793705747870262, "rewards/reward_func_1": 0.056897735595703124, "step": 1930 }, { "completion_length": 2.0, "epoch": 0.5202312138728323, "grad_norm": 3.4807439988071565e-06, "kl": 13.7859375, "learning_rate": 1.1035326910380973e-05, "loss": 0.5516, "reward": 0.05627828128635883, "reward_std": 0.011038010333140846, "rewards/reward_func_1": 0.05627828128635883, "step": 1935 }, { "completion_length": 2.0, "epoch": 0.5215754805753462, "grad_norm": 3.09952747556963e-06, "kl": 13.984375, "learning_rate": 1.0988636416438521e-05, "loss": 0.5592, "reward": 0.057964515686035153, "reward_std": 0.010637010936625302, "rewards/reward_func_1": 0.057964515686035153, "step": 1940 }, { "completion_length": 2.0, "epoch": 0.52291974727786, "grad_norm": 2.1990246750647202e-05, "kl": 13.753125, "learning_rate": 1.094192414719663e-05, "loss": 0.5502, "reward": 0.061472320556640626, "reward_std": 0.010977944992919219, "rewards/reward_func_1": 0.061472320556640626, "step": 1945 }, { "completion_length": 2.0, "epoch": 0.5242640139803737, "grad_norm": 9.643837984185666e-06, "kl": 13.534375, "learning_rate": 1.0895191131520541e-05, "loss": 0.5414, "reward": 0.05615215301513672, "reward_std": 0.010869586077751592, "rewards/reward_func_1": 0.05615215301513672, "step": 1950 }, { "completion_length": 2.0, "epoch": 0.5256082806828875, "grad_norm": 2.5311317131127e-06, "kl": 14.0046875, "learning_rate": 1.0848438398732462e-05, "loss": 0.5601, "reward": 0.06194038391113281, "reward_std": 0.012956816235237057, "rewards/reward_func_1": 0.06194038391113281, "step": 1955 }, { "completion_length": 2.0, "epoch": 0.5269525473854013, "grad_norm": 5.817757482873276e-06, "kl": 13.809375, "learning_rate": 1.0801666978588865e-05, "loss": 0.5522, "reward": 0.06431331634521484, "reward_std": 0.011656289092206862, "rewards/reward_func_1": 0.06431331634521484, "step": 1960 }, { "completion_length": 2.0, "epoch": 0.5282968140879151, "grad_norm": 2.050035391221172e-06, "kl": 13.7453125, "learning_rate": 1.0754877901257831e-05, "loss": 0.5499, "reward": 0.059846115112304685, "reward_std": 0.01306429406904499, "rewards/reward_func_1": 0.059846115112304685, "step": 1965 }, { "completion_length": 2.0, "epoch": 0.5296410807904288, "grad_norm": 1.0048594958789181e-05, "kl": 13.7875, "learning_rate": 1.0708072197296356e-05, "loss": 0.5518, "reward": 0.06069736480712891, "reward_std": 0.011032124502526131, "rewards/reward_func_1": 0.06069736480712891, "step": 1970 }, { "completion_length": 2.0, "epoch": 0.5309853474929426, "grad_norm": 2.601335108920466e-05, "kl": 13.584375, "learning_rate": 1.0661250897627634e-05, "loss": 0.5436, "reward": 0.053227472305297854, "reward_std": 0.010092408429773058, "rewards/reward_func_1": 0.053227472305297854, "step": 1975 }, { "completion_length": 2.0, "epoch": 0.5323296141954563, "grad_norm": 1.8432465367368422e-06, "kl": 13.54375, "learning_rate": 1.061441503351837e-05, "loss": 0.5418, "reward": 0.05695056915283203, "reward_std": 0.009967696487728972, "rewards/reward_func_1": 0.05695056915283203, "step": 1980 }, { "completion_length": 2.0, "epoch": 0.5336738808979702, "grad_norm": 7.412290869979188e-06, "kl": 13.8828125, "learning_rate": 1.056756563655607e-05, "loss": 0.5549, "reward": 0.0633920669555664, "reward_std": 0.01274736642735661, "rewards/reward_func_1": 0.0633920669555664, "step": 1985 }, { "completion_length": 2.0, "epoch": 0.5350181476004839, "grad_norm": 4.0991371861309744e-06, "kl": 13.6109375, "learning_rate": 1.052070373862629e-05, "loss": 0.5444, "reward": 0.05980701446533203, "reward_std": 0.01131843865441624, "rewards/reward_func_1": 0.05980701446533203, "step": 1990 }, { "completion_length": 2.0, "epoch": 0.5363624143029977, "grad_norm": 1.9139324649586342e-06, "kl": 13.7296875, "learning_rate": 1.047383037188994e-05, "loss": 0.5491, "reward": 0.060098457336425784, "reward_std": 0.011104363739286782, "rewards/reward_func_1": 0.060098457336425784, "step": 1995 }, { "completion_length": 2.0, "epoch": 0.5377066810055114, "grad_norm": 2.1989344531903043e-05, "kl": 13.8578125, "learning_rate": 1.0426946568760534e-05, "loss": 0.5541, "reward": 0.06157665252685547, "reward_std": 0.010787656143656931, "rewards/reward_func_1": 0.06157665252685547, "step": 2000 }, { "completion_length": 2.0, "epoch": 0.5390509477080253, "grad_norm": 5.320031505107181e-06, "kl": 13.80625, "learning_rate": 1.0380053361881454e-05, "loss": 0.5523, "reward": 0.06187152862548828, "reward_std": 0.010924646601051791, "rewards/reward_func_1": 0.06187152862548828, "step": 2005 }, { "completion_length": 2.0, "epoch": 0.5403952144105391, "grad_norm": 2.400984612904722e-06, "kl": 13.659375, "learning_rate": 1.0333151784103204e-05, "loss": 0.5463, "reward": 0.06603701114654541, "reward_std": 0.012158522802928928, "rewards/reward_func_1": 0.06603701114654541, "step": 2010 }, { "completion_length": 2.0, "epoch": 0.5417394811130528, "grad_norm": 3.1185063562588766e-06, "kl": 13.85, "learning_rate": 1.0286242868460658e-05, "loss": 0.5541, "reward": 0.06205949783325195, "reward_std": 0.011392109425651142, "rewards/reward_func_1": 0.06205949783325195, "step": 2015 }, { "completion_length": 2.0, "epoch": 0.5430837478155666, "grad_norm": 1.7467871202825336e-06, "kl": 14.109375, "learning_rate": 1.0239327648150324e-05, "loss": 0.5644, "reward": 0.05861544609069824, "reward_std": 0.012278527018861497, "rewards/reward_func_1": 0.05861544609069824, "step": 2020 }, { "completion_length": 2.0, "epoch": 0.5444280145180804, "grad_norm": 2.397207026660908e-06, "kl": 13.9265625, "learning_rate": 1.0192407156507555e-05, "loss": 0.557, "reward": 0.06040668487548828, "reward_std": 0.010451347306661774, "rewards/reward_func_1": 0.06040668487548828, "step": 2025 }, { "completion_length": 2.0, "epoch": 0.5457722812205942, "grad_norm": 2.443831590426271e-06, "kl": 13.7421875, "learning_rate": 1.0145482426983829e-05, "loss": 0.5496, "reward": 0.05943064689636231, "reward_std": 0.010262710415554465, "rewards/reward_func_1": 0.05943064689636231, "step": 2030 }, { "completion_length": 2.0, "epoch": 0.5471165479231079, "grad_norm": 1.7376810319547076e-06, "kl": 14.0484375, "learning_rate": 1.0098554493123946e-05, "loss": 0.5619, "reward": 0.06599822044372558, "reward_std": 0.010502938941499451, "rewards/reward_func_1": 0.06599822044372558, "step": 2035 }, { "completion_length": 2.0, "epoch": 0.5484608146256217, "grad_norm": 1.9027496591661475e-06, "kl": 13.709375, "learning_rate": 1.0051624388543303e-05, "loss": 0.5482, "reward": 0.06099987030029297, "reward_std": 0.011544406516259187, "rewards/reward_func_1": 0.06099987030029297, "step": 2040 }, { "completion_length": 2.0, "epoch": 0.5498050813281355, "grad_norm": 3.951816779590445e-06, "kl": 13.634375, "learning_rate": 1.0004693146905086e-05, "loss": 0.5452, "reward": 0.06201457977294922, "reward_std": 0.00889511961795506, "rewards/reward_func_1": 0.06201457977294922, "step": 2045 }, { "completion_length": 2.0, "epoch": 0.5511493480306493, "grad_norm": 4.0156542127078865e-06, "kl": 13.6765625, "learning_rate": 9.957761801897546e-06, "loss": 0.547, "reward": 0.05980491638183594, "reward_std": 0.011892608562629903, "rewards/reward_func_1": 0.05980491638183594, "step": 2050 }, { "completion_length": 2.0, "epoch": 0.552493614733163, "grad_norm": 1.6064385590652819e-06, "kl": 13.7890625, "learning_rate": 9.910831387211203e-06, "loss": 0.552, "reward": 0.05945572853088379, "reward_std": 0.009774930006824434, "rewards/reward_func_1": 0.05945572853088379, "step": 2055 }, { "completion_length": 2.0, "epoch": 0.5538378814356768, "grad_norm": 5.076154593552928e-06, "kl": 13.6640625, "learning_rate": 9.863902936516079e-06, "loss": 0.5466, "reward": 0.0619448184967041, "reward_std": 0.011996782931964845, "rewards/reward_func_1": 0.0619448184967041, "step": 2060 }, { "completion_length": 2.0, "epoch": 0.5551821481381907, "grad_norm": 1.1968987564614508e-05, "kl": 13.6625, "learning_rate": 9.81697748343895e-06, "loss": 0.5466, "reward": 0.06266765594482422, "reward_std": 0.011780065088532864, "rewards/reward_func_1": 0.06266765594482422, "step": 2065 }, { "completion_length": 2.0, "epoch": 0.5565264148407044, "grad_norm": 2.355890319449827e-05, "kl": 13.9328125, "learning_rate": 9.77005606154056e-06, "loss": 0.5576, "reward": 0.05969257354736328, "reward_std": 0.009937083004115266, "rewards/reward_func_1": 0.05969257354736328, "step": 2070 }, { "completion_length": 2.0, "epoch": 0.5578706815432182, "grad_norm": 8.669927410664968e-06, "kl": 13.525, "learning_rate": 9.723139704292866e-06, "loss": 0.5408, "reward": 0.06073760986328125, "reward_std": 0.01386090821470134, "rewards/reward_func_1": 0.06073760986328125, "step": 2075 }, { "completion_length": 2.0, "epoch": 0.5592149482457319, "grad_norm": 3.4235183647979284e-06, "kl": 13.79375, "learning_rate": 9.676229445056269e-06, "loss": 0.552, "reward": 0.06341695785522461, "reward_std": 0.010479317836870904, "rewards/reward_func_1": 0.06341695785522461, "step": 2080 }, { "completion_length": 2.0, "epoch": 0.5605592149482457, "grad_norm": 2.277251951454673e-06, "kl": 14.06875, "learning_rate": 9.629326317056872e-06, "loss": 0.5628, "reward": 0.06306524276733398, "reward_std": 0.011487474158639089, "rewards/reward_func_1": 0.06306524276733398, "step": 2085 }, { "completion_length": 2.0, "epoch": 0.5619034816507595, "grad_norm": 4.600749434757745e-06, "kl": 13.9359375, "learning_rate": 9.582431353363687e-06, "loss": 0.5572, "reward": 0.0586451530456543, "reward_std": 0.009657706473444706, "rewards/reward_func_1": 0.0586451530456543, "step": 2090 }, { "completion_length": 2.0, "epoch": 0.5632477483532733, "grad_norm": 7.010680747043807e-06, "kl": 13.775, "learning_rate": 9.535545586865922e-06, "loss": 0.5508, "reward": 0.06332626342773437, "reward_std": 0.01151423337869346, "rewards/reward_func_1": 0.06332626342773437, "step": 2095 }, { "completion_length": 2.0, "epoch": 0.564592015055787, "grad_norm": 3.5268849387648515e-06, "kl": 13.725, "learning_rate": 9.488670050250195e-06, "loss": 0.5491, "reward": 0.05642566680908203, "reward_std": 0.01242845638480503, "rewards/reward_func_1": 0.05642566680908203, "step": 2100 }, { "completion_length": 2.0, "epoch": 0.5659362817583008, "grad_norm": 2.5853701117739547e-06, "kl": 13.7984375, "learning_rate": 9.441805775977822e-06, "loss": 0.5522, "reward": 0.05613641738891602, "reward_std": 0.011232214039591782, "rewards/reward_func_1": 0.05613641738891602, "step": 2105 }, { "completion_length": 2.0, "epoch": 0.5672805484608147, "grad_norm": 4.693101436714642e-05, "kl": 13.8296875, "learning_rate": 9.394953796262037e-06, "loss": 0.5533, "reward": 0.06460676193237305, "reward_std": 0.009554302979813656, "rewards/reward_func_1": 0.06460676193237305, "step": 2110 }, { "completion_length": 2.0, "epoch": 0.5686248151633284, "grad_norm": 3.470984211162431e-06, "kl": 13.940625, "learning_rate": 9.348115143045305e-06, "loss": 0.5579, "reward": 0.05569601058959961, "reward_std": 0.00965253066533478, "rewards/reward_func_1": 0.05569601058959961, "step": 2115 }, { "completion_length": 2.0, "epoch": 0.5699690818658422, "grad_norm": 5.797121502837399e-06, "kl": 13.55625, "learning_rate": 9.301290847976545e-06, "loss": 0.5421, "reward": 0.06446545943617821, "reward_std": 0.014136035600677133, "rewards/reward_func_1": 0.06446545943617821, "step": 2120 }, { "completion_length": 2.0, "epoch": 0.5713133485683559, "grad_norm": 2.2634183096670313e-06, "kl": 13.9234375, "learning_rate": 9.254481942388444e-06, "loss": 0.5566, "reward": 0.05872611999511719, "reward_std": 0.010083984247467015, "rewards/reward_func_1": 0.05872611999511719, "step": 2125 }, { "completion_length": 2.0, "epoch": 0.5726576152708698, "grad_norm": 2.7756636882259045e-06, "kl": 13.84375, "learning_rate": 9.207689457274716e-06, "loss": 0.5536, "reward": 0.06077961921691895, "reward_std": 0.01000992787303403, "rewards/reward_func_1": 0.06077961921691895, "step": 2130 }, { "completion_length": 2.0, "epoch": 0.5740018819733835, "grad_norm": 3.0030598736630054e-06, "kl": 14.05625, "learning_rate": 9.160914423267416e-06, "loss": 0.5621, "reward": 0.06290161609649658, "reward_std": 0.010859317294671201, "rewards/reward_func_1": 0.06290161609649658, "step": 2135 }, { "completion_length": 2.0, "epoch": 0.5753461486758973, "grad_norm": 8.018588232516777e-06, "kl": 14.0125, "learning_rate": 9.114157870614213e-06, "loss": 0.5605, "reward": 0.06700577735900878, "reward_std": 0.013860853042569943, "rewards/reward_func_1": 0.06700577735900878, "step": 2140 }, { "completion_length": 2.0, "epoch": 0.576690415378411, "grad_norm": 3.3272003747697454e-06, "kl": 14.1421875, "learning_rate": 9.067420829155731e-06, "loss": 0.5659, "reward": 0.06595449447631836, "reward_std": 0.010813094197510508, "rewards/reward_func_1": 0.06595449447631836, "step": 2145 }, { "completion_length": 2.0, "epoch": 0.5780346820809249, "grad_norm": 1.5704621546319686e-05, "kl": 13.5234375, "learning_rate": 9.020704328302829e-06, "loss": 0.5408, "reward": 0.06404781341552734, "reward_std": 0.011609598056566028, "rewards/reward_func_1": 0.06404781341552734, "step": 2150 }, { "completion_length": 2.0, "epoch": 0.5793789487834387, "grad_norm": 2.1239829948171973e-05, "kl": 13.6328125, "learning_rate": 8.974009397013965e-06, "loss": 0.5455, "reward": 0.058431386947631836, "reward_std": 0.010231435889363639, "rewards/reward_func_1": 0.058431386947631836, "step": 2155 }, { "completion_length": 2.0, "epoch": 0.5807232154859524, "grad_norm": 3.322323755128309e-05, "kl": 13.49375, "learning_rate": 8.927337063772504e-06, "loss": 0.5398, "reward": 0.059176063537597655, "reward_std": 0.010483282200584653, "rewards/reward_func_1": 0.059176063537597655, "step": 2160 }, { "completion_length": 2.0, "epoch": 0.5820674821884662, "grad_norm": 1.4426668712985702e-06, "kl": 13.8109375, "learning_rate": 8.88068835656408e-06, "loss": 0.5523, "reward": 0.06375694274902344, "reward_std": 0.011770154316764092, "rewards/reward_func_1": 0.06375694274902344, "step": 2165 }, { "completion_length": 2.0, "epoch": 0.58341174889098, "grad_norm": 2.1057694539194927e-06, "kl": 13.95625, "learning_rate": 8.834064302853944e-06, "loss": 0.5581, "reward": 0.06186666488647461, "reward_std": 0.011866289676254383, "rewards/reward_func_1": 0.06186666488647461, "step": 2170 }, { "completion_length": 2.0, "epoch": 0.5847560155934938, "grad_norm": 2.079190153381205e-06, "kl": 13.7578125, "learning_rate": 8.787465929564352e-06, "loss": 0.5504, "reward": 0.05595951080322266, "reward_std": 0.011447059749116306, "rewards/reward_func_1": 0.05595951080322266, "step": 2175 }, { "completion_length": 2.0, "epoch": 0.5861002822960075, "grad_norm": 3.0134608550724806e-06, "kl": 13.93125, "learning_rate": 8.740894263051913e-06, "loss": 0.557, "reward": 0.06060028076171875, "reward_std": 0.009739526234625373, "rewards/reward_func_1": 0.06060028076171875, "step": 2180 }, { "completion_length": 2.0, "epoch": 0.5874445489985213, "grad_norm": 3.109391400357708e-05, "kl": 13.9, "learning_rate": 8.694350329085028e-06, "loss": 0.5558, "reward": 0.0626680850982666, "reward_std": 0.010819756354612764, "rewards/reward_func_1": 0.0626680850982666, "step": 2185 }, { "completion_length": 2.0, "epoch": 0.5887888157010351, "grad_norm": 1.3205424693296663e-05, "kl": 13.6703125, "learning_rate": 8.647835152821252e-06, "loss": 0.5469, "reward": 0.05972356796264648, "reward_std": 0.010678636631928385, "rewards/reward_func_1": 0.05972356796264648, "step": 2190 }, { "completion_length": 2.0, "epoch": 0.5901330824035489, "grad_norm": 4.43181943410309e-06, "kl": 13.965625, "learning_rate": 8.601349758784744e-06, "loss": 0.5587, "reward": 0.06157550811767578, "reward_std": 0.010384173551574349, "rewards/reward_func_1": 0.06157550811767578, "step": 2195 }, { "completion_length": 2.0, "epoch": 0.5914773491060626, "grad_norm": 2.103875203829375e-06, "kl": 14.0875, "learning_rate": 8.55489517084369e-06, "loss": 0.5634, "reward": 0.06180839538574219, "reward_std": 0.01010028199889348, "rewards/reward_func_1": 0.06180839538574219, "step": 2200 }, { "completion_length": 2.0, "epoch": 0.5928216158085764, "grad_norm": 2.1625994122587144e-05, "kl": 14.025, "learning_rate": 8.508472412187759e-06, "loss": 0.5611, "reward": 0.06270132064819336, "reward_std": 0.012054376184096327, "rewards/reward_func_1": 0.06270132064819336, "step": 2205 }, { "completion_length": 2.0, "epoch": 0.5941658825110901, "grad_norm": 6.9565994635922834e-06, "kl": 13.8640625, "learning_rate": 8.462082505305547e-06, "loss": 0.5548, "reward": 0.06188421249389649, "reward_std": 0.011324935717857443, "rewards/reward_func_1": 0.06188421249389649, "step": 2210 }, { "completion_length": 2.0, "epoch": 0.595510149213604, "grad_norm": 2.927747118519619e-06, "kl": 13.953125, "learning_rate": 8.415726471962092e-06, "loss": 0.558, "reward": 0.060194778442382815, "reward_std": 0.009112278009342844, "rewards/reward_func_1": 0.060194778442382815, "step": 2215 }, { "completion_length": 2.0, "epoch": 0.5968544159161178, "grad_norm": 1.6029367770897807e-06, "kl": 13.9375, "learning_rate": 8.369405333176322e-06, "loss": 0.5573, "reward": 0.0625925064086914, "reward_std": 0.01229454953354434, "rewards/reward_func_1": 0.0625925064086914, "step": 2220 }, { "completion_length": 2.0, "epoch": 0.5981986826186315, "grad_norm": 3.083619731114595e-06, "kl": 14.1265625, "learning_rate": 8.323120109198616e-06, "loss": 0.5648, "reward": 0.06270506381988525, "reward_std": 0.011008751340705203, "rewards/reward_func_1": 0.06270506381988525, "step": 2225 }, { "completion_length": 2.0, "epoch": 0.5995429493211453, "grad_norm": 5.694411811418831e-06, "kl": 13.75, "learning_rate": 8.276871819488287e-06, "loss": 0.5501, "reward": 0.06018905639648438, "reward_std": 0.009886647743405775, "rewards/reward_func_1": 0.06018905639648438, "step": 2230 }, { "completion_length": 2.0, "epoch": 0.6008872160236591, "grad_norm": 2.441943252051715e-06, "kl": 13.8171875, "learning_rate": 8.230661482691168e-06, "loss": 0.5526, "reward": 0.0654977798461914, "reward_std": 0.010158205546758836, "rewards/reward_func_1": 0.0654977798461914, "step": 2235 }, { "completion_length": 2.0, "epoch": 0.6022314827261729, "grad_norm": 2.561245310062077e-06, "kl": 13.81875, "learning_rate": 8.18449011661714e-06, "loss": 0.5528, "reward": 0.059857940673828124, "reward_std": 0.009442199986369814, "rewards/reward_func_1": 0.059857940673828124, "step": 2240 }, { "completion_length": 2.0, "epoch": 0.6035757494286866, "grad_norm": 9.140064321400132e-06, "kl": 13.8984375, "learning_rate": 8.138358738217743e-06, "loss": 0.5559, "reward": 0.062485790252685545, "reward_std": 0.009677528292741044, "rewards/reward_func_1": 0.062485790252685545, "step": 2245 }, { "completion_length": 2.0, "epoch": 0.6049200161312004, "grad_norm": 1.274393162020715e-05, "kl": 13.6140625, "learning_rate": 8.09226836356376e-06, "loss": 0.5446, "reward": 0.06158370971679687, "reward_std": 0.012039840093348176, "rewards/reward_func_1": 0.06158370971679687, "step": 2250 }, { "completion_length": 2.0, "epoch": 0.6062642828337143, "grad_norm": 4.513978183240397e-06, "kl": 14.028125, "learning_rate": 8.046220007822845e-06, "loss": 0.5613, "reward": 0.05757331848144531, "reward_std": 0.011246860059327447, "rewards/reward_func_1": 0.05757331848144531, "step": 2255 }, { "completion_length": 2.0, "epoch": 0.607608549536228, "grad_norm": 4.966521828464465e-06, "kl": 13.853125, "learning_rate": 8.000214685237154e-06, "loss": 0.554, "reward": 0.059112969785928726, "reward_std": 0.013033414728124627, "rewards/reward_func_1": 0.059112969785928726, "step": 2260 }, { "completion_length": 2.0, "epoch": 0.6089528162387418, "grad_norm": 3.217521907572518e-06, "kl": 13.6359375, "learning_rate": 7.954253409101019e-06, "loss": 0.5456, "reward": 0.061025047302246095, "reward_std": 0.012668960404334939, "rewards/reward_func_1": 0.061025047302246095, "step": 2265 }, { "completion_length": 2.0, "epoch": 0.6102970829412555, "grad_norm": 3.596692977225757e-06, "kl": 13.675, "learning_rate": 7.908337191738625e-06, "loss": 0.5469, "reward": 0.05897402763366699, "reward_std": 0.010948267369531094, "rewards/reward_func_1": 0.05897402763366699, "step": 2270 }, { "completion_length": 2.0, "epoch": 0.6116413496437694, "grad_norm": 1.489972146373475e-05, "kl": 13.6046875, "learning_rate": 7.862467044481696e-06, "loss": 0.5443, "reward": 0.06776981353759766, "reward_std": 0.009472813666070579, "rewards/reward_func_1": 0.06776981353759766, "step": 2275 }, { "completion_length": 2.0, "epoch": 0.6129856163462831, "grad_norm": 4.7771932258910965e-06, "kl": 13.8671875, "learning_rate": 7.81664397764726e-06, "loss": 0.5547, "reward": 0.05934486389160156, "reward_std": 0.010944688416202553, "rewards/reward_func_1": 0.05934486389160156, "step": 2280 }, { "completion_length": 2.0, "epoch": 0.6143298830487969, "grad_norm": 1.628923541829863e-06, "kl": 13.64375, "learning_rate": 7.770869000515344e-06, "loss": 0.5459, "reward": 0.059722518920898436, "reward_std": 0.009479641152574913, "rewards/reward_func_1": 0.059722518920898436, "step": 2285 }, { "completion_length": 2.0, "epoch": 0.6156741497513106, "grad_norm": 3.469725243121502e-06, "kl": 13.8546875, "learning_rate": 7.725143121306793e-06, "loss": 0.5542, "reward": 0.05222053527832031, "reward_std": 0.011934454514994286, "rewards/reward_func_1": 0.05222053527832031, "step": 2290 }, { "completion_length": 2.0, "epoch": 0.6170184164538245, "grad_norm": 4.984348834113916e-06, "kl": 13.95625, "learning_rate": 7.679467347161025e-06, "loss": 0.5581, "reward": 0.060247611999511716, "reward_std": 0.01037932816798275, "rewards/reward_func_1": 0.060247611999511716, "step": 2295 }, { "completion_length": 2.0, "epoch": 0.6183626831563382, "grad_norm": 1.597562345523329e-06, "kl": 13.9984375, "learning_rate": 7.633842684113876e-06, "loss": 0.5599, "reward": 0.05987234115600586, "reward_std": 0.009311927141970955, "rewards/reward_func_1": 0.05987234115600586, "step": 2300 }, { "completion_length": 2.0, "epoch": 0.619706949858852, "grad_norm": 2.6105003598786425e-06, "kl": 13.990625, "learning_rate": 7.588270137075421e-06, "loss": 0.5599, "reward": 0.057819366455078125, "reward_std": 0.012239268912526313, "rewards/reward_func_1": 0.057819366455078125, "step": 2305 }, { "completion_length": 2.0, "epoch": 0.6210512165613657, "grad_norm": 1.7647248569119256e-06, "kl": 13.7203125, "learning_rate": 7.542750709807861e-06, "loss": 0.5489, "reward": 0.05905556678771973, "reward_std": 0.011430266295792534, "rewards/reward_func_1": 0.05905556678771973, "step": 2310 }, { "completion_length": 2.0, "epoch": 0.6223954832638795, "grad_norm": 2.6496518330532126e-06, "kl": 13.6484375, "learning_rate": 7.497285404903387e-06, "loss": 0.5465, "reward": 0.055854058265686034, "reward_std": 0.010062372921674978, "rewards/reward_func_1": 0.055854058265686034, "step": 2315 }, { "completion_length": 2.0, "epoch": 0.6237397499663934, "grad_norm": 1.3658197531185579e-05, "kl": 13.98125, "learning_rate": 7.451875223762129e-06, "loss": 0.5593, "reward": 0.06093788146972656, "reward_std": 0.012115493134479039, "rewards/reward_func_1": 0.06093788146972656, "step": 2320 }, { "completion_length": 2.0, "epoch": 0.6250840166689071, "grad_norm": 2.737979684752645e-06, "kl": 13.5125, "learning_rate": 7.4065211665700685e-06, "loss": 0.5404, "reward": 0.052369880676269534, "reward_std": 0.013848574734220164, "rewards/reward_func_1": 0.052369880676269534, "step": 2325 }, { "completion_length": 2.0, "epoch": 0.6264282833714209, "grad_norm": 4.7086259655770846e-06, "kl": 13.759375, "learning_rate": 7.36122423227704e-06, "loss": 0.5506, "reward": 0.06091470718383789, "reward_std": 0.011245868943660753, "rewards/reward_func_1": 0.06091470718383789, "step": 2330 }, { "completion_length": 2.0, "epoch": 0.6277725500739346, "grad_norm": 6.927496997377602e-06, "kl": 13.696875, "learning_rate": 7.315985418574693e-06, "loss": 0.5479, "reward": 0.05918540954589844, "reward_std": 0.012543642877426464, "rewards/reward_func_1": 0.05918540954589844, "step": 2335 }, { "completion_length": 2.0, "epoch": 0.6291168167764485, "grad_norm": 3.3043399980670074e-06, "kl": 13.603125, "learning_rate": 7.270805721874559e-06, "loss": 0.544, "reward": 0.058438873291015624, "reward_std": 0.010428001565014711, "rewards/reward_func_1": 0.058438873291015624, "step": 2340 }, { "completion_length": 2.0, "epoch": 0.6304610834789622, "grad_norm": 1.634690306673292e-05, "kl": 13.9828125, "learning_rate": 7.225686137286065e-06, "loss": 0.5591, "reward": 0.06279127690941096, "reward_std": 0.0103473931827466, "rewards/reward_func_1": 0.06279127690941096, "step": 2345 }, { "completion_length": 2.0, "epoch": 0.631805350181476, "grad_norm": 3.8762250369472895e-06, "kl": 14.221875, "learning_rate": 7.180627658594643e-06, "loss": 0.5689, "reward": 0.06403388977050781, "reward_std": 0.01183446466930036, "rewards/reward_func_1": 0.06403388977050781, "step": 2350 }, { "completion_length": 2.0, "epoch": 0.6331496168839897, "grad_norm": 2.116005816787947e-06, "kl": 14.1421875, "learning_rate": 7.135631278239823e-06, "loss": 0.5657, "reward": 0.059031105041503905, "reward_std": 0.010066585054846654, "rewards/reward_func_1": 0.059031105041503905, "step": 2355 }, { "completion_length": 2.0, "epoch": 0.6344938835865036, "grad_norm": 2.5220099360012682e-06, "kl": 13.640625, "learning_rate": 7.090697987293398e-06, "loss": 0.5456, "reward": 0.059407520294189456, "reward_std": 0.010223947776830755, "rewards/reward_func_1": 0.059407520294189456, "step": 2360 }, { "completion_length": 2.0, "epoch": 0.6358381502890174, "grad_norm": 1.0680985269573284e-06, "kl": 13.6078125, "learning_rate": 7.045828775437558e-06, "loss": 0.5443, "reward": 0.06002349853515625, "reward_std": 0.011791737930616364, "rewards/reward_func_1": 0.06002349853515625, "step": 2365 }, { "completion_length": 2.0, "epoch": 0.6371824169915311, "grad_norm": 4.526314114627894e-06, "kl": 13.4546875, "learning_rate": 7.001024630943134e-06, "loss": 0.5382, "reward": 0.05956945419311523, "reward_std": 0.012025964839267544, "rewards/reward_func_1": 0.05956945419311523, "step": 2370 }, { "completion_length": 2.0, "epoch": 0.6385266836940449, "grad_norm": 4.205965524306521e-06, "kl": 14.1140625, "learning_rate": 6.956286540647794e-06, "loss": 0.5649, "reward": 0.060262870788574216, "reward_std": 0.010221635182824684, "rewards/reward_func_1": 0.060262870788574216, "step": 2375 }, { "completion_length": 2.0, "epoch": 0.6398709503965587, "grad_norm": 2.007863940889365e-06, "kl": 13.9953125, "learning_rate": 6.9116154899343356e-06, "loss": 0.5597, "reward": 0.06056399345397949, "reward_std": 0.013314929121406749, "rewards/reward_func_1": 0.06056399345397949, "step": 2380 }, { "completion_length": 2.0, "epoch": 0.6412152170990725, "grad_norm": 3.056561354242149e-06, "kl": 13.8359375, "learning_rate": 6.867012462708963e-06, "loss": 0.5534, "reward": 0.059704828262329104, "reward_std": 0.011303682426660088, "rewards/reward_func_1": 0.059704828262329104, "step": 2385 }, { "completion_length": 2.0, "epoch": 0.6425594838015862, "grad_norm": 2.523838702472858e-06, "kl": 13.78125, "learning_rate": 6.8224784413796244e-06, "loss": 0.5513, "reward": 0.057023143768310545, "reward_std": 0.012784256822487804, "rewards/reward_func_1": 0.057023143768310545, "step": 2390 }, { "completion_length": 2.0, "epoch": 0.6439037505041, "grad_norm": 2.2542815258930204e-06, "kl": 13.7625, "learning_rate": 6.77801440683437e-06, "loss": 0.5508, "reward": 0.057397651672363284, "reward_std": 0.011636027062195353, "rewards/reward_func_1": 0.057397651672363284, "step": 2395 }, { "completion_length": 2.0, "epoch": 0.6452480172066138, "grad_norm": 3.0053756745473947e-06, "kl": 13.6421875, "learning_rate": 6.733621338419763e-06, "loss": 0.5457, "reward": 0.05742425918579101, "reward_std": 0.010394414755865, "rewards/reward_func_1": 0.05742425918579101, "step": 2400 }, { "completion_length": 2.0, "epoch": 0.6465922839091276, "grad_norm": 9.635583410272375e-06, "kl": 13.7828125, "learning_rate": 6.689300213919271e-06, "loss": 0.5511, "reward": 0.061480712890625, "reward_std": 0.010102924931379676, "rewards/reward_func_1": 0.061480712890625, "step": 2405 }, { "completion_length": 2.0, "epoch": 0.6479365506116413, "grad_norm": 1.539800905447919e-06, "kl": 13.753125, "learning_rate": 6.645052009531782e-06, "loss": 0.5501, "reward": 0.06248035430908203, "reward_std": 0.010455972234194633, "rewards/reward_func_1": 0.06248035430908203, "step": 2410 }, { "completion_length": 2.0, "epoch": 0.6492808173141551, "grad_norm": 2.588592906249687e-06, "kl": 13.6578125, "learning_rate": 6.600877699850052e-06, "loss": 0.5464, "reward": 0.05666141510009766, "reward_std": 0.015636276185978203, "rewards/reward_func_1": 0.05666141510009766, "step": 2415 }, { "completion_length": 2.0, "epoch": 0.6506250840166689, "grad_norm": 3.2917205317062326e-06, "kl": 14.1875, "learning_rate": 6.556778257839283e-06, "loss": 0.5674, "reward": 0.061602020263671876, "reward_std": 0.009157647862593876, "rewards/reward_func_1": 0.061602020263671876, "step": 2420 }, { "completion_length": 2.0, "epoch": 0.6519693507191827, "grad_norm": 2.6396586690680124e-05, "kl": 13.7578125, "learning_rate": 6.5127546548156535e-06, "loss": 0.5502, "reward": 0.06312904357910157, "reward_std": 0.011368433445022674, "rewards/reward_func_1": 0.06312904357910157, "step": 2425 }, { "completion_length": 2.0, "epoch": 0.6533136174216965, "grad_norm": 5.049357241659891e-06, "kl": 13.6421875, "learning_rate": 6.46880786042496e-06, "loss": 0.5455, "reward": 0.05724415183067322, "reward_std": 0.012990292893664445, "rewards/reward_func_1": 0.05724415183067322, "step": 2430 }, { "completion_length": 2.0, "epoch": 0.6546578841242102, "grad_norm": 2.6590103061607806e-06, "kl": 13.8875, "learning_rate": 6.424938842621231e-06, "loss": 0.5555, "reward": 0.0595550537109375, "reward_std": 0.011499256859679008, "rewards/reward_func_1": 0.0595550537109375, "step": 2435 }, { "completion_length": 2.0, "epoch": 0.656002150826724, "grad_norm": 6.467951152444584e-06, "kl": 13.7046875, "learning_rate": 6.38114856764543e-06, "loss": 0.5482, "reward": 0.0562408447265625, "reward_std": 0.011464899309066823, "rewards/reward_func_1": 0.0562408447265625, "step": 2440 }, { "completion_length": 2.0, "epoch": 0.6573464175292378, "grad_norm": 2.2811452708992874e-06, "kl": 14.015625, "learning_rate": 6.337438000004155e-06, "loss": 0.5606, "reward": 0.061225509643554686, "reward_std": 0.009458938350144308, "rewards/reward_func_1": 0.061225509643554686, "step": 2445 }, { "completion_length": 2.0, "epoch": 0.6586906842317516, "grad_norm": 8.717958735360298e-06, "kl": 13.875, "learning_rate": 6.293808102448409e-06, "loss": 0.5548, "reward": 0.056508952379226686, "reward_std": 0.01089983493402542, "rewards/reward_func_1": 0.056508952379226686, "step": 2450 }, { "completion_length": 2.0, "epoch": 0.6600349509342653, "grad_norm": 1.0740451216406655e-05, "kl": 13.8125, "learning_rate": 6.250259835952383e-06, "loss": 0.5524, "reward": 0.06640968322753907, "reward_std": 0.013152831193292514, "rewards/reward_func_1": 0.06640968322753907, "step": 2455 }, { "completion_length": 2.0, "epoch": 0.6613792176367791, "grad_norm": 3.2728050882724347e-06, "kl": 13.753125, "learning_rate": 6.206794159692304e-06, "loss": 0.5502, "reward": 0.05744953155517578, "reward_std": 0.01012560978961119, "rewards/reward_func_1": 0.05744953155517578, "step": 2460 }, { "completion_length": 2.0, "epoch": 0.662723484339293, "grad_norm": 1.4861791896692012e-05, "kl": 13.9265625, "learning_rate": 6.16341203102529e-06, "loss": 0.5569, "reward": 0.05549445152282715, "reward_std": 0.009857410499535035, "rewards/reward_func_1": 0.05549445152282715, "step": 2465 }, { "completion_length": 2.0, "epoch": 0.6640677510418067, "grad_norm": 5.46200089956983e-06, "kl": 13.8640625, "learning_rate": 6.120114405468285e-06, "loss": 0.5546, "reward": 0.05894393920898437, "reward_std": 0.009983553958227276, "rewards/reward_func_1": 0.05894393920898437, "step": 2470 }, { "completion_length": 2.0, "epoch": 0.6654120177443205, "grad_norm": 1.2916130799567327e-05, "kl": 13.81875, "learning_rate": 6.076902236676994e-06, "loss": 0.553, "reward": 0.055209779739379884, "reward_std": 0.01005386611832364, "rewards/reward_func_1": 0.055209779739379884, "step": 2475 }, { "completion_length": 2.0, "epoch": 0.6667562844468342, "grad_norm": 3.6688261388917454e-06, "kl": 13.7546875, "learning_rate": 6.033776476424888e-06, "loss": 0.5503, "reward": 0.06743335723876953, "reward_std": 0.01147701254230924, "rewards/reward_func_1": 0.06743335723876953, "step": 2480 }, { "completion_length": 2.0, "epoch": 0.6681005511493481, "grad_norm": 5.666680408467073e-06, "kl": 13.990625, "learning_rate": 5.990738074582243e-06, "loss": 0.5592, "reward": 0.06467456817626953, "reward_std": 0.013033680556691251, "rewards/reward_func_1": 0.06467456817626953, "step": 2485 }, { "completion_length": 2.0, "epoch": 0.6694448178518618, "grad_norm": 1.422238983650459e-05, "kl": 13.8578125, "learning_rate": 5.947787979095213e-06, "loss": 0.5543, "reward": 0.06046428680419922, "reward_std": 0.013719953599502333, "rewards/reward_func_1": 0.06046428680419922, "step": 2490 }, { "completion_length": 2.0, "epoch": 0.6707890845543756, "grad_norm": 5.443932877824409e-06, "kl": 13.81875, "learning_rate": 5.9049271359649466e-06, "loss": 0.5526, "reward": 0.056779670715332034, "reward_std": 0.009680721638142131, "rewards/reward_func_1": 0.056779670715332034, "step": 2495 }, { "completion_length": 2.0, "epoch": 0.6721333512568893, "grad_norm": 6.311719971563434e-06, "kl": 13.9265625, "learning_rate": 5.862156489226768e-06, "loss": 0.5572, "reward": 0.056317138671875, "reward_std": 0.012263055084622465, "rewards/reward_func_1": 0.056317138671875, "step": 2500 }, { "completion_length": 2.0, "epoch": 0.6734776179594032, "grad_norm": 3.1811771350476192e-06, "kl": 13.7046875, "learning_rate": 5.819476980929357e-06, "loss": 0.548, "reward": 0.05898451805114746, "reward_std": 0.011924323247512802, "rewards/reward_func_1": 0.05898451805114746, "step": 2505 }, { "completion_length": 2.0, "epoch": 0.674821884661917, "grad_norm": 2.1738133000326343e-06, "kl": 13.8359375, "learning_rate": 5.776889551114036e-06, "loss": 0.5537, "reward": 0.05574178695678711, "reward_std": 0.009539656856213696, "rewards/reward_func_1": 0.05574178695678711, "step": 2510 }, { "completion_length": 2.0, "epoch": 0.6761661513644307, "grad_norm": 3.079718953813426e-05, "kl": 13.7, "learning_rate": 5.734395137794022e-06, "loss": 0.5483, "reward": 0.058077239990234376, "reward_std": 0.00982013454704429, "rewards/reward_func_1": 0.058077239990234376, "step": 2515 }, { "completion_length": 2.0, "epoch": 0.6775104180669445, "grad_norm": 2.630328253871994e-06, "kl": 13.78125, "learning_rate": 5.691994676933808e-06, "loss": 0.5511, "reward": 0.05584440231323242, "reward_std": 0.009929925179494602, "rewards/reward_func_1": 0.05584440231323242, "step": 2520 }, { "completion_length": 2.0, "epoch": 0.6788546847694582, "grad_norm": 4.1391981540073175e-06, "kl": 13.6796875, "learning_rate": 5.6496891024285215e-06, "loss": 0.5475, "reward": 0.058974266052246094, "reward_std": 0.010749774679425173, "rewards/reward_func_1": 0.058974266052246094, "step": 2525 }, { "completion_length": 2.0, "epoch": 0.6801989514719721, "grad_norm": 2.052042191280634e-06, "kl": 13.6703125, "learning_rate": 5.607479346083355e-06, "loss": 0.5469, "reward": 0.05872535705566406, "reward_std": 0.011635806861886522, "rewards/reward_func_1": 0.05872535705566406, "step": 2530 }, { "completion_length": 2.0, "epoch": 0.6815432181744858, "grad_norm": 2.211010541941505e-05, "kl": 14.2609375, "learning_rate": 5.565366337593066e-06, "loss": 0.5708, "reward": 0.06311745643615722, "reward_std": 0.01183991582802264, "rewards/reward_func_1": 0.06311745643615722, "step": 2535 }, { "completion_length": 2.0, "epoch": 0.6828874848769996, "grad_norm": 4.809753590961918e-06, "kl": 14.0015625, "learning_rate": 5.523351004521462e-06, "loss": 0.5603, "reward": 0.05524139404296875, "reward_std": 0.010296737632233998, "rewards/reward_func_1": 0.05524139404296875, "step": 2540 }, { "completion_length": 2.0, "epoch": 0.6842317515795133, "grad_norm": 8.715678632142954e-06, "kl": 14.078125, "learning_rate": 5.481434272281013e-06, "loss": 0.5629, "reward": 0.06164817810058594, "reward_std": 0.013340477158635622, "rewards/reward_func_1": 0.06164817810058594, "step": 2545 }, { "completion_length": 2.0, "epoch": 0.6855760182820272, "grad_norm": 2.4492214834026527e-06, "kl": 13.778125, "learning_rate": 5.439617064112431e-06, "loss": 0.5511, "reward": 0.05745353698730469, "reward_std": 0.013168468393268995, "rewards/reward_func_1": 0.05745353698730469, "step": 2550 }, { "completion_length": 2.0, "epoch": 0.6869202849845409, "grad_norm": 2.812889078995795e-06, "kl": 13.9015625, "learning_rate": 5.3979003010643675e-06, "loss": 0.5562, "reward": 0.057623672485351565, "reward_std": 0.01229256743681617, "rewards/reward_func_1": 0.057623672485351565, "step": 2555 }, { "completion_length": 2.0, "epoch": 0.6882645516870547, "grad_norm": 3.987305262853624e-06, "kl": 13.965625, "learning_rate": 5.356284901973091e-06, "loss": 0.5588, "reward": 0.059996414184570315, "reward_std": 0.010288478545771796, "rewards/reward_func_1": 0.059996414184570315, "step": 2560 }, { "completion_length": 2.0, "epoch": 0.6896088183895684, "grad_norm": 1.2458726814656984e-05, "kl": 14.121875, "learning_rate": 5.314771783442292e-06, "loss": 0.5647, "reward": 0.05899205207824707, "reward_std": 0.010236831862857797, "rewards/reward_func_1": 0.05899205207824707, "step": 2565 }, { "completion_length": 2.0, "epoch": 0.6909530850920823, "grad_norm": 2.1745931917394046e-06, "kl": 13.653125, "learning_rate": 5.273361859822852e-06, "loss": 0.5463, "reward": 0.06059694290161133, "reward_std": 0.01137404957335093, "rewards/reward_func_1": 0.06059694290161133, "step": 2570 }, { "completion_length": 2.0, "epoch": 0.6922973517945961, "grad_norm": 5.8191294556309e-06, "kl": 14.08125, "learning_rate": 5.232056043192737e-06, "loss": 0.5633, "reward": 0.0685009479522705, "reward_std": 0.012357043109659571, "rewards/reward_func_1": 0.0685009479522705, "step": 2575 }, { "completion_length": 2.0, "epoch": 0.6936416184971098, "grad_norm": 3.856658622680698e-06, "kl": 13.8921875, "learning_rate": 5.190855243336883e-06, "loss": 0.5559, "reward": 0.06555595397949218, "reward_std": 0.011982467219604586, "rewards/reward_func_1": 0.06555595397949218, "step": 2580 }, { "completion_length": 2.0, "epoch": 0.6949858851996236, "grad_norm": 1.3858561032975558e-05, "kl": 14.0171875, "learning_rate": 5.1497603677271855e-06, "loss": 0.5606, "reward": 0.06087760925292969, "reward_std": 0.011154358516796492, "rewards/reward_func_1": 0.06087760925292969, "step": 2585 }, { "completion_length": 2.0, "epoch": 0.6963301519021374, "grad_norm": 4.5027968553768005e-06, "kl": 13.7328125, "learning_rate": 5.108772321502479e-06, "loss": 0.5494, "reward": 0.05637903213500976, "reward_std": 0.012003830538014881, "rewards/reward_func_1": 0.05637903213500976, "step": 2590 }, { "completion_length": 2.0, "epoch": 0.6976744186046512, "grad_norm": 2.7505389880388975e-05, "kl": 13.94375, "learning_rate": 5.0678920074486316e-06, "loss": 0.5578, "reward": 0.06141033172607422, "reward_std": 0.010152479278622195, "rewards/reward_func_1": 0.06141033172607422, "step": 2595 }, { "completion_length": 2.0, "epoch": 0.6990186853071649, "grad_norm": 2.104391796819982e-06, "kl": 14.35625, "learning_rate": 5.0271203259786395e-06, "loss": 0.5744, "reward": 0.06711845397949219, "reward_std": 0.011301479887515597, "rewards/reward_func_1": 0.06711845397949219, "step": 2600 }, { "completion_length": 2.0, "epoch": 0.7003629520096787, "grad_norm": 1.6334478232238325e-06, "kl": 13.753125, "learning_rate": 4.986458175112807e-06, "loss": 0.5501, "reward": 0.05772566795349121, "reward_std": 0.011294707475099131, "rewards/reward_func_1": 0.05772566795349121, "step": 2605 }, { "completion_length": 2.0, "epoch": 0.7017072187121925, "grad_norm": 4.255656222085236e-06, "kl": 13.778125, "learning_rate": 4.945906450458955e-06, "loss": 0.5511, "reward": 0.058788979053497316, "reward_std": 0.009819350033649244, "rewards/reward_func_1": 0.058788979053497316, "step": 2610 }, { "completion_length": 2.0, "epoch": 0.7030514854147063, "grad_norm": 3.1343079172074795e-06, "kl": 13.5140625, "learning_rate": 4.90546604519271e-06, "loss": 0.5407, "reward": 0.05972156524658203, "reward_std": 0.011394862360612023, "rewards/reward_func_1": 0.05972156524658203, "step": 2615 }, { "completion_length": 2.0, "epoch": 0.70439575211722, "grad_norm": 4.186888418189483e-06, "kl": 13.715625, "learning_rate": 4.865137850037817e-06, "loss": 0.5488, "reward": 0.057996368408203124, "reward_std": 0.011965288411010988, "rewards/reward_func_1": 0.057996368408203124, "step": 2620 }, { "completion_length": 2.0, "epoch": 0.7057400188197338, "grad_norm": 2.494949512765743e-06, "kl": 13.9375, "learning_rate": 4.824922753246534e-06, "loss": 0.5575, "reward": 0.05783071517944336, "reward_std": 0.011933683512324933, "rewards/reward_func_1": 0.05783071517944336, "step": 2625 }, { "completion_length": 2.0, "epoch": 0.7070842855222477, "grad_norm": 4.90788443130441e-06, "kl": 14.0078125, "learning_rate": 4.784821640580051e-06, "loss": 0.5603, "reward": 0.060264754295349124, "reward_std": 0.011658078715845477, "rewards/reward_func_1": 0.060264754295349124, "step": 2630 }, { "completion_length": 2.0, "epoch": 0.7084285522247614, "grad_norm": 2.91156266030157e-06, "kl": 13.8421875, "learning_rate": 4.744835395289002e-06, "loss": 0.5537, "reward": 0.05923728942871094, "reward_std": 0.012938315909923403, "rewards/reward_func_1": 0.05923728942871094, "step": 2635 }, { "completion_length": 2.0, "epoch": 0.7097728189272752, "grad_norm": 3.442679826548556e-06, "kl": 13.8109375, "learning_rate": 4.704964898093991e-06, "loss": 0.5527, "reward": 0.06276912689208984, "reward_std": 0.011696373121230863, "rewards/reward_func_1": 0.06276912689208984, "step": 2640 }, { "completion_length": 2.0, "epoch": 0.7111170856297889, "grad_norm": 4.503699074120959e-06, "kl": 13.875, "learning_rate": 4.665211027166209e-06, "loss": 0.5547, "reward": 0.059120559692382814, "reward_std": 0.011374600145063595, "rewards/reward_func_1": 0.059120559692382814, "step": 2645 }, { "completion_length": 2.0, "epoch": 0.7124613523323027, "grad_norm": 9.90547505352879e-06, "kl": 13.6984375, "learning_rate": 4.625574658108073e-06, "loss": 0.5478, "reward": 0.057414674758911134, "reward_std": 0.010342052261330536, "rewards/reward_func_1": 0.057414674758911134, "step": 2650 }, { "completion_length": 2.0, "epoch": 0.7138056190348165, "grad_norm": 2.0887078790110536e-05, "kl": 13.509375, "learning_rate": 4.586056663933969e-06, "loss": 0.5406, "reward": 0.05762338638305664, "reward_std": 0.014504123894221265, "rewards/reward_func_1": 0.05762338638305664, "step": 2655 }, { "completion_length": 2.0, "epoch": 0.7151498857373303, "grad_norm": 3.3340979825879913e-06, "kl": 13.7078125, "learning_rate": 4.546657915050988e-06, "loss": 0.5483, "reward": 0.06230294108390808, "reward_std": 0.009969272715534317, "rewards/reward_func_1": 0.06230294108390808, "step": 2660 }, { "completion_length": 2.0, "epoch": 0.716494152439844, "grad_norm": 3.2463603929500096e-06, "kl": 13.7390625, "learning_rate": 4.507379279239791e-06, "loss": 0.5496, "reward": 0.05877430438995361, "reward_std": 0.011900619864900364, "rewards/reward_func_1": 0.05877430438995361, "step": 2665 }, { "completion_length": 2.0, "epoch": 0.7178384191423578, "grad_norm": 3.1261215553968213e-06, "kl": 13.99375, "learning_rate": 4.468221621635462e-06, "loss": 0.5597, "reward": 0.05568780899047852, "reward_std": 0.008807793819141808, "rewards/reward_func_1": 0.05568780899047852, "step": 2670 }, { "completion_length": 2.0, "epoch": 0.7191826858448717, "grad_norm": 2.6415564207127318e-06, "kl": 13.9015625, "learning_rate": 4.42918580470848e-06, "loss": 0.5562, "reward": 0.060968208312988284, "reward_std": 0.009969678838388063, "rewards/reward_func_1": 0.060968208312988284, "step": 2675 }, { "completion_length": 2.0, "epoch": 0.7205269525473854, "grad_norm": 2.9144048312446102e-06, "kl": 14.0859375, "learning_rate": 4.39027268824571e-06, "loss": 0.5633, "reward": 0.06536164283752441, "reward_std": 0.011233370206900873, "rewards/reward_func_1": 0.06536164283752441, "step": 2680 }, { "completion_length": 2.0, "epoch": 0.7218712192498992, "grad_norm": 3.608389533837908e-06, "kl": 14.0390625, "learning_rate": 4.351483129331458e-06, "loss": 0.5612, "reward": 0.06538281440734864, "reward_std": 0.013752934670628747, "rewards/reward_func_1": 0.06538281440734864, "step": 2685 }, { "completion_length": 2.0, "epoch": 0.7232154859524129, "grad_norm": 6.397221568477107e-06, "kl": 13.865625, "learning_rate": 4.312817982328612e-06, "loss": 0.5546, "reward": 0.06181436069309711, "reward_std": 0.011076993081223918, "rewards/reward_func_1": 0.06181436069309711, "step": 2690 }, { "completion_length": 2.0, "epoch": 0.7245597526549268, "grad_norm": 9.608173968445044e-06, "kl": 13.825, "learning_rate": 4.2742780988598145e-06, "loss": 0.5534, "reward": 0.06040000915527344, "reward_std": 0.012127826601499692, "rewards/reward_func_1": 0.06040000915527344, "step": 2695 }, { "completion_length": 2.0, "epoch": 0.7259040193574405, "grad_norm": 7.110948445188114e-06, "kl": 14.2703125, "learning_rate": 4.235864327788692e-06, "loss": 0.5708, "reward": 0.06447288990020753, "reward_std": 0.00979026438217261, "rewards/reward_func_1": 0.06447288990020753, "step": 2700 }, { "completion_length": 2.0, "epoch": 0.7272482860599543, "grad_norm": 4.501914645516081e-06, "kl": 13.8, "learning_rate": 4.197577515201191e-06, "loss": 0.5523, "reward": 0.0603661984205246, "reward_std": 0.011186352090589935, "rewards/reward_func_1": 0.0603661984205246, "step": 2705 }, { "completion_length": 2.0, "epoch": 0.728592552762468, "grad_norm": 7.746289156784769e-06, "kl": 13.478125, "learning_rate": 4.159418504386904e-06, "loss": 0.5393, "reward": 0.057269958406686784, "reward_std": 0.01241556809945905, "rewards/reward_func_1": 0.057269958406686784, "step": 2710 }, { "completion_length": 2.0, "epoch": 0.7299368194649819, "grad_norm": 6.022199613653356e-06, "kl": 13.8953125, "learning_rate": 4.1213881358205275e-06, "loss": 0.5558, "reward": 0.0635772705078125, "reward_std": 0.009903606217267224, "rewards/reward_func_1": 0.0635772705078125, "step": 2715 }, { "completion_length": 2.0, "epoch": 0.7312810861674957, "grad_norm": 4.5010624489805195e-06, "kl": 13.7203125, "learning_rate": 4.083487247143326e-06, "loss": 0.5486, "reward": 0.06045243740081787, "reward_std": 0.011131673593808955, "rewards/reward_func_1": 0.06045243740081787, "step": 2720 }, { "completion_length": 2.0, "epoch": 0.7326253528700094, "grad_norm": 4.66905157736619e-06, "kl": 13.7625, "learning_rate": 4.045716673144706e-06, "loss": 0.5505, "reward": 0.061006355285644534, "reward_std": 0.011753415851853789, "rewards/reward_func_1": 0.061006355285644534, "step": 2725 }, { "completion_length": 2.0, "epoch": 0.7339696195725232, "grad_norm": 3.085681328229839e-06, "kl": 13.7109375, "learning_rate": 4.008077245743801e-06, "loss": 0.5486, "reward": 0.06153240203857422, "reward_std": 0.011887542959448183, "rewards/reward_func_1": 0.06153240203857422, "step": 2730 }, { "completion_length": 2.0, "epoch": 0.735313886275037, "grad_norm": 2.592519194877241e-06, "kl": 14.075, "learning_rate": 3.970569793971178e-06, "loss": 0.5628, "reward": 0.06015148162841797, "reward_std": 0.011894370408845134, "rewards/reward_func_1": 0.06015148162841797, "step": 2735 }, { "completion_length": 2.0, "epoch": 0.7366581529775508, "grad_norm": 1.798785774553835e-06, "kl": 13.7875, "learning_rate": 3.933195143950551e-06, "loss": 0.5514, "reward": 0.06206645965576172, "reward_std": 0.014155591612507124, "rewards/reward_func_1": 0.06206645965576172, "step": 2740 }, { "completion_length": 2.0, "epoch": 0.7380024196800645, "grad_norm": 3.439082775003044e-06, "kl": 13.7390625, "learning_rate": 3.89595411888061e-06, "loss": 0.5497, "reward": 0.06377677917480469, "reward_std": 0.010366774378053379, "rewards/reward_func_1": 0.06377677917480469, "step": 2745 }, { "completion_length": 2.0, "epoch": 0.7393466863825783, "grad_norm": 2.5652859676483786e-06, "kl": 13.728125, "learning_rate": 3.85884753901686e-06, "loss": 0.5493, "reward": 0.06238212585449219, "reward_std": 0.009483605425339192, "rewards/reward_func_1": 0.06238212585449219, "step": 2750 }, { "completion_length": 2.0, "epoch": 0.740690953085092, "grad_norm": 3.275632843724452e-05, "kl": 13.8734375, "learning_rate": 3.82187622165359e-06, "loss": 0.5549, "reward": 0.05930185317993164, "reward_std": 0.010815516777802259, "rewards/reward_func_1": 0.05930185317993164, "step": 2755 }, { "completion_length": 2.0, "epoch": 0.7420352197876059, "grad_norm": 5.840865014761221e-06, "kl": 13.6109375, "learning_rate": 3.7850409811058343e-06, "loss": 0.5445, "reward": 0.05905466079711914, "reward_std": 0.008731590279785451, "rewards/reward_func_1": 0.05905466079711914, "step": 2760 }, { "completion_length": 2.0, "epoch": 0.7433794864901196, "grad_norm": 1.5586972949677147e-05, "kl": 13.8890625, "learning_rate": 3.7483426286914705e-06, "loss": 0.5557, "reward": 0.0615997314453125, "reward_std": 0.012081135442713276, "rewards/reward_func_1": 0.0615997314453125, "step": 2765 }, { "completion_length": 2.0, "epoch": 0.7447237531926334, "grad_norm": 6.052292064850917e-06, "kl": 13.45, "learning_rate": 3.7117819727133254e-06, "loss": 0.5381, "reward": 0.06048717498779297, "reward_std": 0.008764956895902287, "rewards/reward_func_1": 0.06048717498779297, "step": 2770 }, { "completion_length": 2.0, "epoch": 0.7460680198951471, "grad_norm": 3.043762262677774e-06, "kl": 13.8125, "learning_rate": 3.6753598184413873e-06, "loss": 0.5528, "reward": 0.06238512992858887, "reward_std": 0.012483571946359007, "rewards/reward_func_1": 0.06238512992858887, "step": 2775 }, { "completion_length": 2.0, "epoch": 0.747412286597661, "grad_norm": 8.626040653325617e-06, "kl": 13.8703125, "learning_rate": 3.6390769680950544e-06, "loss": 0.5549, "reward": 0.06061878204345703, "reward_std": 0.012912878150018514, "rewards/reward_func_1": 0.06061878204345703, "step": 2780 }, { "completion_length": 2.0, "epoch": 0.7487565533001748, "grad_norm": 8.276882908830885e-06, "kl": 13.959375, "learning_rate": 3.6029342208254826e-06, "loss": 0.5585, "reward": 0.06053438186645508, "reward_std": 0.012466668507113355, "rewards/reward_func_1": 0.06053438186645508, "step": 2785 }, { "completion_length": 2.0, "epoch": 0.7501008200026885, "grad_norm": 8.260290996986441e-06, "kl": 13.8265625, "learning_rate": 3.5669323726979655e-06, "loss": 0.5533, "reward": 0.0611328125, "reward_std": 0.011878072742911172, "rewards/reward_func_1": 0.0611328125, "step": 2790 }, { "completion_length": 2.0, "epoch": 0.7514450867052023, "grad_norm": 4.150938366365153e-06, "kl": 13.6796875, "learning_rate": 3.531072216674418e-06, "loss": 0.5471, "reward": 0.06307134628295899, "reward_std": 0.01603546408514376, "rewards/reward_func_1": 0.06307134628295899, "step": 2795 }, { "completion_length": 2.0, "epoch": 0.7527893534077161, "grad_norm": 2.825467163347639e-06, "kl": 13.9296875, "learning_rate": 3.4953545425959047e-06, "loss": 0.557, "reward": 0.0554865837097168, "reward_std": 0.012560931847292522, "rewards/reward_func_1": 0.0554865837097168, "step": 2800 }, { "completion_length": 2.0, "epoch": 0.7541336201102299, "grad_norm": 9.65241724770749e-06, "kl": 14.05, "learning_rate": 3.4597801371652296e-06, "loss": 0.5621, "reward": 0.061970877647399905, "reward_std": 0.010358845694281627, "rewards/reward_func_1": 0.061970877647399905, "step": 2805 }, { "completion_length": 2.0, "epoch": 0.7554778868127436, "grad_norm": 1.7722894654070842e-06, "kl": 13.821875, "learning_rate": 3.424349783929636e-06, "loss": 0.5529, "reward": 0.061666107177734374, "reward_std": 0.011269324702152517, "rewards/reward_func_1": 0.061666107177734374, "step": 2810 }, { "completion_length": 2.0, "epoch": 0.7568221535152574, "grad_norm": 1.5782270565978251e-06, "kl": 13.9140625, "learning_rate": 3.3890642632635153e-06, "loss": 0.5564, "reward": 0.06216366291046142, "reward_std": 0.010467645124299452, "rewards/reward_func_1": 0.06216366291046142, "step": 2815 }, { "completion_length": 2.0, "epoch": 0.7581664202177713, "grad_norm": 9.261582817998715e-06, "kl": 13.984375, "learning_rate": 3.353924352351253e-06, "loss": 0.5595, "reward": 0.05453653335571289, "reward_std": 0.010780278017773526, "rewards/reward_func_1": 0.05453653335571289, "step": 2820 }, { "completion_length": 2.0, "epoch": 0.759510686920285, "grad_norm": 5.424847131507704e-06, "kl": 13.9875, "learning_rate": 3.3189308251700825e-06, "loss": 0.5595, "reward": 0.057614707946777345, "reward_std": 0.011758481396100251, "rewards/reward_func_1": 0.057614707946777345, "step": 2825 }, { "completion_length": 2.0, "epoch": 0.7608549536227988, "grad_norm": 1.4945719158276916e-05, "kl": 13.875, "learning_rate": 3.2840844524730577e-06, "loss": 0.555, "reward": 0.05519509315490723, "reward_std": 0.009796513656328897, "rewards/reward_func_1": 0.05519509315490723, "step": 2830 }, { "completion_length": 2.0, "epoch": 0.7621992203253125, "grad_norm": 2.619063934616861e-06, "kl": 14.10625, "learning_rate": 3.2493860017720567e-06, "loss": 0.5644, "reward": 0.055352401733398435, "reward_std": 0.010450466238398803, "rewards/reward_func_1": 0.055352401733398435, "step": 2835 }, { "completion_length": 2.0, "epoch": 0.7635434870278264, "grad_norm": 1.0150999514735304e-05, "kl": 13.84375, "learning_rate": 3.214836237320904e-06, "loss": 0.5538, "reward": 0.058098793029785156, "reward_std": 0.011656674755067797, "rewards/reward_func_1": 0.058098793029785156, "step": 2840 }, { "completion_length": 2.0, "epoch": 0.7648877537303401, "grad_norm": 4.336788151704241e-06, "kl": 13.6640625, "learning_rate": 3.1804359200985056e-06, "loss": 0.5466, "reward": 0.05649633407592773, "reward_std": 0.011210850576026133, "rewards/reward_func_1": 0.05649633407592773, "step": 2845 }, { "completion_length": 2.0, "epoch": 0.7662320204328539, "grad_norm": 7.680199814785738e-06, "kl": 13.525, "learning_rate": 3.14618580779212e-06, "loss": 0.5411, "reward": 0.05644134283065796, "reward_std": 0.01211523166639381, "rewards/reward_func_1": 0.05644134283065796, "step": 2850 }, { "completion_length": 2.0, "epoch": 0.7675762871353676, "grad_norm": 3.693724920594832e-06, "kl": 13.6765625, "learning_rate": 3.1120866547806394e-06, "loss": 0.547, "reward": 0.055395317077636716, "reward_std": 0.012751551120891236, "rewards/reward_func_1": 0.055395317077636716, "step": 2855 }, { "completion_length": 2.0, "epoch": 0.7689205538378814, "grad_norm": 6.3646293710917234e-06, "kl": 14.1234375, "learning_rate": 3.0781392121179986e-06, "loss": 0.5649, "reward": 0.05985813140869141, "reward_std": 0.010235290192213142, "rewards/reward_func_1": 0.05985813140869141, "step": 2860 }, { "completion_length": 2.0, "epoch": 0.7702648205403952, "grad_norm": 1.4028549230715726e-05, "kl": 14.3015625, "learning_rate": 3.0443442275166226e-06, "loss": 0.5718, "reward": 0.05446624755859375, "reward_std": 0.009798991409479641, "rewards/reward_func_1": 0.05446624755859375, "step": 2865 }, { "completion_length": 2.0, "epoch": 0.771609087242909, "grad_norm": 3.3037556477211183e-06, "kl": 13.9109375, "learning_rate": 3.0107024453309486e-06, "loss": 0.5564, "reward": 0.054990959167480466, "reward_std": 0.009734460682375356, "rewards/reward_func_1": 0.054990959167480466, "step": 2870 }, { "completion_length": 2.0, "epoch": 0.7729533539454227, "grad_norm": 3.8873076846357435e-06, "kl": 13.7890625, "learning_rate": 2.9772146065410477e-06, "loss": 0.5516, "reward": 0.057455134391784665, "reward_std": 0.010452806322427932, "rewards/reward_func_1": 0.057455134391784665, "step": 2875 }, { "completion_length": 2.0, "epoch": 0.7742976206479365, "grad_norm": 8.293524842883926e-06, "kl": 14.35, "learning_rate": 2.943881448736301e-06, "loss": 0.5742, "reward": 0.062497615814208984, "reward_std": 0.00955452322596102, "rewards/reward_func_1": 0.062497615814208984, "step": 2880 }, { "completion_length": 2.0, "epoch": 0.7756418873504504, "grad_norm": 2.3304564820136875e-06, "kl": 13.9890625, "learning_rate": 2.910703706099137e-06, "loss": 0.5594, "reward": 0.06021251678466797, "reward_std": 0.012565446839289507, "rewards/reward_func_1": 0.06021251678466797, "step": 2885 }, { "completion_length": 2.0, "epoch": 0.7769861540529641, "grad_norm": 2.393172962911194e-06, "kl": 13.796875, "learning_rate": 2.8776821093888883e-06, "loss": 0.552, "reward": 0.06193351745605469, "reward_std": 0.010560587099462282, "rewards/reward_func_1": 0.06193351745605469, "step": 2890 }, { "completion_length": 2.0, "epoch": 0.7783304207554779, "grad_norm": 8.713544048077893e-06, "kl": 14.021875, "learning_rate": 2.8448173859256665e-06, "loss": 0.5609, "reward": 0.060492420196533205, "reward_std": 0.01069757735276653, "rewards/reward_func_1": 0.060492420196533205, "step": 2895 }, { "completion_length": 2.0, "epoch": 0.7796746874579916, "grad_norm": 3.608857878134586e-05, "kl": 13.4609375, "learning_rate": 2.8121102595743732e-06, "loss": 0.5384, "reward": 0.05852642059326172, "reward_std": 0.010727309926369343, "rewards/reward_func_1": 0.05852642059326172, "step": 2900 }, { "completion_length": 2.0, "epoch": 0.7810189541605055, "grad_norm": 2.3314752979786135e-05, "kl": 13.8453125, "learning_rate": 2.779561450728725e-06, "loss": 0.5537, "reward": 0.06407814025878907, "reward_std": 0.010947771910286975, "rewards/reward_func_1": 0.06407814025878907, "step": 2905 }, { "completion_length": 2.0, "epoch": 0.7823632208630192, "grad_norm": 3.858524905808736e-06, "kl": 13.8734375, "learning_rate": 2.7471716762954183e-06, "loss": 0.5551, "reward": 0.05899543762207031, "reward_std": 0.012197423033649102, "rewards/reward_func_1": 0.05899543762207031, "step": 2910 }, { "completion_length": 2.0, "epoch": 0.783707487565533, "grad_norm": 9.215535101247951e-06, "kl": 13.7046875, "learning_rate": 2.7149416496783055e-06, "loss": 0.5481, "reward": 0.06349143981933594, "reward_std": 0.010476895228566718, "rewards/reward_func_1": 0.06349143981933594, "step": 2915 }, { "completion_length": 2.0, "epoch": 0.7850517542680467, "grad_norm": 1.8621502704263548e-06, "kl": 13.9234375, "learning_rate": 2.6828720807627173e-06, "loss": 0.5572, "reward": 0.05804300308227539, "reward_std": 0.010599679932784056, "rewards/reward_func_1": 0.05804300308227539, "step": 2920 }, { "completion_length": 2.0, "epoch": 0.7863960209705606, "grad_norm": 2.1705293420382077e-06, "kl": 13.7421875, "learning_rate": 2.6509636758997914e-06, "loss": 0.5496, "reward": 0.06185646057128906, "reward_std": 0.012096992944861995, "rewards/reward_func_1": 0.06185646057128906, "step": 2925 }, { "completion_length": 2.0, "epoch": 0.7877402876730744, "grad_norm": 3.4049091937049525e-06, "kl": 13.90625, "learning_rate": 2.619217137890949e-06, "loss": 0.5562, "reward": 0.06344146728515625, "reward_std": 0.010752197249166784, "rewards/reward_func_1": 0.06344146728515625, "step": 2930 }, { "completion_length": 2.0, "epoch": 0.7890845543755881, "grad_norm": 5.779114417236997e-06, "kl": 13.809375, "learning_rate": 2.587633165972384e-06, "loss": 0.5523, "reward": 0.061241436004638675, "reward_std": 0.011485271743731573, "rewards/reward_func_1": 0.061241436004638675, "step": 2935 }, { "completion_length": 2.0, "epoch": 0.7904288210781019, "grad_norm": 1.907208570628427e-05, "kl": 13.6140625, "learning_rate": 2.556212455799688e-06, "loss": 0.5447, "reward": 0.05872478485107422, "reward_std": 0.01105458896199707, "rewards/reward_func_1": 0.05872478485107422, "step": 2940 }, { "completion_length": 2.0, "epoch": 0.7917730877806157, "grad_norm": 1.2913329555885866e-05, "kl": 13.5984375, "learning_rate": 2.5249556994325063e-06, "loss": 0.5443, "reward": 0.05844389796257019, "reward_std": 0.011415049015340628, "rewards/reward_func_1": 0.05844389796257019, "step": 2945 }, { "completion_length": 2.0, "epoch": 0.7931173544831295, "grad_norm": 7.899559022916947e-06, "kl": 13.740625, "learning_rate": 2.4938635853193127e-06, "loss": 0.5495, "reward": 0.060787391662597653, "reward_std": 0.011730345609248616, "rewards/reward_func_1": 0.060787391662597653, "step": 2950 }, { "completion_length": 2.0, "epoch": 0.7944616211856432, "grad_norm": 2.142528501281049e-06, "kl": 14.0171875, "learning_rate": 2.462936798282236e-06, "loss": 0.5608, "reward": 0.05785312652587891, "reward_std": 0.012231780852016528, "rewards/reward_func_1": 0.05785312652587891, "step": 2955 }, { "completion_length": 2.0, "epoch": 0.795805887888157, "grad_norm": 2.0210850379953627e-06, "kl": 13.615625, "learning_rate": 2.4321760195019807e-06, "loss": 0.5444, "reward": 0.058881378173828124, "reward_std": 0.009882683275645832, "rewards/reward_func_1": 0.058881378173828124, "step": 2960 }, { "completion_length": 2.0, "epoch": 0.7971501545906707, "grad_norm": 4.331094714871142e-06, "kl": 13.8859375, "learning_rate": 2.401581926502814e-06, "loss": 0.5553, "reward": 0.06248741149902344, "reward_std": 0.01132394474479952, "rewards/reward_func_1": 0.06248741149902344, "step": 2965 }, { "completion_length": 2.0, "epoch": 0.7984944212931846, "grad_norm": 2.8427843972167466e-06, "kl": 14.0859375, "learning_rate": 2.371155193137662e-06, "loss": 0.5634, "reward": 0.06317214965820313, "reward_std": 0.011330111461211346, "rewards/reward_func_1": 0.06317214965820313, "step": 2970 }, { "completion_length": 2.0, "epoch": 0.7998386879956983, "grad_norm": 3.5108828342345078e-06, "kl": 13.83125, "learning_rate": 2.3408964895732433e-06, "loss": 0.5533, "reward": 0.05587625503540039, "reward_std": 0.009793595474184258, "rewards/reward_func_1": 0.05587625503540039, "step": 2975 }, { "completion_length": 2.0, "epoch": 0.8011829546982121, "grad_norm": 6.495894467661856e-06, "kl": 13.9984375, "learning_rate": 2.310806482275336e-06, "loss": 0.5598, "reward": 0.05911798477172851, "reward_std": 0.010464892169329687, "rewards/reward_func_1": 0.05911798477172851, "step": 2980 }, { "completion_length": 2.0, "epoch": 0.8025272214007259, "grad_norm": 1.9022724018213921e-06, "kl": 14.0625, "learning_rate": 2.2808858339940696e-06, "loss": 0.5627, "reward": 0.06507339477539062, "reward_std": 0.010237712813250255, "rewards/reward_func_1": 0.06507339477539062, "step": 2985 }, { "completion_length": 2.0, "epoch": 0.8038714881032397, "grad_norm": 8.21357753011398e-06, "kl": 14.1390625, "learning_rate": 2.251135203749353e-06, "loss": 0.5655, "reward": 0.054758310317993164, "reward_std": 0.010143174163385994, "rewards/reward_func_1": 0.054758310317993164, "step": 2990 }, { "completion_length": 2.0, "epoch": 0.8052157548057535, "grad_norm": 2.0820609734073514e-06, "kl": 13.9046875, "learning_rate": 2.221555246816335e-06, "loss": 0.5563, "reward": 0.05632228851318359, "reward_std": 0.007249694373967941, "rewards/reward_func_1": 0.05632228851318359, "step": 2995 }, { "completion_length": 2.0, "epoch": 0.8065600215082672, "grad_norm": 3.5432799450063612e-06, "kl": 13.8859375, "learning_rate": 2.1921466147109995e-06, "loss": 0.555, "reward": 0.06329879760742188, "reward_std": 0.011629640086903236, "rewards/reward_func_1": 0.06329879760742188, "step": 3000 }, { "completion_length": 2.0, "epoch": 0.807904288210781, "grad_norm": 5.990676982037257e-06, "kl": 13.85, "learning_rate": 2.162909955175786e-06, "loss": 0.5541, "reward": 0.059543299674987796, "reward_std": 0.011717213732481468, "rewards/reward_func_1": 0.059543299674987796, "step": 3005 }, { "completion_length": 2.0, "epoch": 0.8092485549132948, "grad_norm": 3.0730845992366085e-06, "kl": 14.1765625, "learning_rate": 2.1338459121653467e-06, "loss": 0.5671, "reward": 0.0583465576171875, "reward_std": 0.010720923148619476, "rewards/reward_func_1": 0.0583465576171875, "step": 3010 }, { "completion_length": 2.0, "epoch": 0.8105928216158086, "grad_norm": 4.65749917566427e-06, "kl": 13.7890625, "learning_rate": 2.1049551258323466e-06, "loss": 0.5514, "reward": 0.05720829963684082, "reward_std": 0.01134266530716559, "rewards/reward_func_1": 0.05720829963684082, "step": 3015 }, { "completion_length": 2.0, "epoch": 0.8119370883183223, "grad_norm": 1.0969602044497151e-05, "kl": 13.865625, "learning_rate": 2.076238232513377e-06, "loss": 0.5545, "reward": 0.05492105484008789, "reward_std": 0.009651319341355703, "rewards/reward_func_1": 0.05492105484008789, "step": 3020 }, { "completion_length": 2.0, "epoch": 0.8132813550208361, "grad_norm": 2.167436605304829e-06, "kl": 14.128125, "learning_rate": 2.0476958647149235e-06, "loss": 0.5653, "reward": 0.062408828735351564, "reward_std": 0.011062738049804465, "rewards/reward_func_1": 0.062408828735351564, "step": 3025 }, { "completion_length": 2.0, "epoch": 0.81462562172335, "grad_norm": 6.903650046297116e-06, "kl": 13.81875, "learning_rate": 2.019328651099458e-06, "loss": 0.5526, "reward": 0.06055660247802734, "reward_std": 0.01089865797512175, "rewards/reward_func_1": 0.06055660247802734, "step": 3030 }, { "completion_length": 2.0, "epoch": 0.8159698884258637, "grad_norm": 9.66745847108541e-06, "kl": 13.9421875, "learning_rate": 1.9911372164715617e-06, "loss": 0.558, "reward": 0.060862159729003905, "reward_std": 0.010128252705180784, "rewards/reward_func_1": 0.060862159729003905, "step": 3035 }, { "completion_length": 2.0, "epoch": 0.8173141551283775, "grad_norm": 2.5966969587898348e-06, "kl": 13.865625, "learning_rate": 1.963122181764194e-06, "loss": 0.5547, "reward": 0.05717315673828125, "reward_std": 0.010695815431245138, "rewards/reward_func_1": 0.05717315673828125, "step": 3040 }, { "completion_length": 2.0, "epoch": 0.8186584218308912, "grad_norm": 1.2404716471792199e-05, "kl": 13.609375, "learning_rate": 1.935284164024995e-06, "loss": 0.5443, "reward": 0.05703325271606445, "reward_std": 0.00922305959957157, "rewards/reward_func_1": 0.05703325271606445, "step": 3045 }, { "completion_length": 2.0, "epoch": 0.8200026885334051, "grad_norm": 3.450983740549418e-06, "kl": 13.8828125, "learning_rate": 1.9076237764027096e-06, "loss": 0.5555, "reward": 0.060849010944366455, "reward_std": 0.010863185320158664, "rewards/reward_func_1": 0.060849010944366455, "step": 3050 }, { "completion_length": 2.0, "epoch": 0.8213469552359188, "grad_norm": 2.398985998297576e-05, "kl": 14.0265625, "learning_rate": 1.8801416281336593e-06, "loss": 0.5611, "reward": 0.05967788696289063, "reward_std": 0.011869262975233141, "rewards/reward_func_1": 0.05967788696289063, "step": 3055 }, { "completion_length": 2.0, "epoch": 0.8226912219384326, "grad_norm": 2.969979050249094e-06, "kl": 13.9078125, "learning_rate": 1.8528383245283565e-06, "loss": 0.5565, "reward": 0.05824851989746094, "reward_std": 0.008635234561734251, "rewards/reward_func_1": 0.05824851989746094, "step": 3060 }, { "completion_length": 2.0, "epoch": 0.8240354886409463, "grad_norm": 4.690632522397209e-06, "kl": 13.828125, "learning_rate": 1.8257144669581405e-06, "loss": 0.5533, "reward": 0.06130073070526123, "reward_std": 0.011460411777079571, "rewards/reward_func_1": 0.06130073070526123, "step": 3065 }, { "completion_length": 2.0, "epoch": 0.8253797553434602, "grad_norm": 2.5304858354502358e-05, "kl": 13.7609375, "learning_rate": 1.7987706528419547e-06, "loss": 0.5505, "reward": 0.058181381225585936, "reward_std": 0.010406417903141119, "rewards/reward_func_1": 0.058181381225585936, "step": 3070 }, { "completion_length": 2.0, "epoch": 0.826724022045974, "grad_norm": 3.5739954000746366e-06, "kl": 13.9765625, "learning_rate": 1.7720074756331796e-06, "loss": 0.5591, "reward": 0.06419677734375, "reward_std": 0.010990058263996617, "rewards/reward_func_1": 0.06419677734375, "step": 3075 }, { "completion_length": 2.0, "epoch": 0.8280682887484877, "grad_norm": 2.814767640302307e-06, "kl": 13.90625, "learning_rate": 1.745425524806552e-06, "loss": 0.5562, "reward": 0.06376209259033203, "reward_std": 0.009371502423891797, "rewards/reward_func_1": 0.06376209259033203, "step": 3080 }, { "completion_length": 2.0, "epoch": 0.8294125554510015, "grad_norm": 4.085266937181586e-06, "kl": 14.1734375, "learning_rate": 1.7190253858452032e-06, "loss": 0.5674, "reward": 0.07096824645996094, "reward_std": 0.011460935025388608, "rewards/reward_func_1": 0.07096824645996094, "step": 3085 }, { "completion_length": 2.0, "epoch": 0.8307568221535152, "grad_norm": 2.656307515280787e-06, "kl": 13.6828125, "learning_rate": 1.6928076402277404e-06, "loss": 0.5474, "reward": 0.060984134674072266, "reward_std": 0.011912760638369945, "rewards/reward_func_1": 0.060984134674072266, "step": 3090 }, { "completion_length": 2.0, "epoch": 0.8321010888560291, "grad_norm": 7.214829565782566e-06, "kl": 13.8453125, "learning_rate": 1.666772865415458e-06, "loss": 0.5536, "reward": 0.061499595642089844, "reward_std": 0.010504865943221375, "rewards/reward_func_1": 0.061499595642089844, "step": 3095 }, { "completion_length": 2.0, "epoch": 0.8334453555585428, "grad_norm": 2.9270854611240793e-06, "kl": 13.5703125, "learning_rate": 1.640921634839605e-06, "loss": 0.5428, "reward": 0.05861034393310547, "reward_std": 0.012625352442410077, "rewards/reward_func_1": 0.05861034393310547, "step": 3100 }, { "completion_length": 2.0, "epoch": 0.8347896222610566, "grad_norm": 3.179326768076862e-06, "kl": 14.1953125, "learning_rate": 1.6152545178887657e-06, "loss": 0.568, "reward": 0.055633163452148436, "reward_std": 0.012100516646751203, "rewards/reward_func_1": 0.055633163452148436, "step": 3105 }, { "completion_length": 2.0, "epoch": 0.8361338889635703, "grad_norm": 4.799480848305393e-06, "kl": 13.778125, "learning_rate": 1.5897720798963079e-06, "loss": 0.5512, "reward": 0.06492023468017578, "reward_std": 0.010558164384565315, "rewards/reward_func_1": 0.06492023468017578, "step": 3110 }, { "completion_length": 2.0, "epoch": 0.8374781556660842, "grad_norm": 3.970403668063227e-06, "kl": 13.9, "learning_rate": 1.5644748821279409e-06, "loss": 0.5563, "reward": 0.05761244297027588, "reward_std": 0.01134748296753969, "rewards/reward_func_1": 0.05761244297027588, "step": 3115 }, { "completion_length": 2.0, "epoch": 0.8388224223685979, "grad_norm": 1.801868620532332e-06, "kl": 14.128125, "learning_rate": 1.5393634817693437e-06, "loss": 0.5652, "reward": 0.06230869293212891, "reward_std": 0.012545625171333086, "rewards/reward_func_1": 0.06230869293212891, "step": 3120 }, { "completion_length": 2.0, "epoch": 0.8401666890711117, "grad_norm": 2.9992136205692077e-06, "kl": 13.9578125, "learning_rate": 1.514438431913907e-06, "loss": 0.5582, "reward": 0.06606597900390625, "reward_std": 0.010844698862638325, "rewards/reward_func_1": 0.06606597900390625, "step": 3125 }, { "completion_length": 2.0, "epoch": 0.8415109557736254, "grad_norm": 5.0372464102110825e-06, "kl": 14.0875, "learning_rate": 1.4897002815505314e-06, "loss": 0.5638, "reward": 0.06385841369628906, "reward_std": 0.011049523478141055, "rewards/reward_func_1": 0.06385841369628906, "step": 3130 }, { "completion_length": 2.0, "epoch": 0.8428552224761393, "grad_norm": 7.81911876401864e-06, "kl": 13.8375, "learning_rate": 1.4651495755515522e-06, "loss": 0.5535, "reward": 0.06193408966064453, "reward_std": 0.010152038796877604, "rewards/reward_func_1": 0.06193408966064453, "step": 3135 }, { "completion_length": 2.0, "epoch": 0.8441994891786531, "grad_norm": 4.062687366968021e-05, "kl": 14.0359375, "learning_rate": 1.4407868546607319e-06, "loss": 0.5615, "reward": 0.06457939147949218, "reward_std": 0.012157779483823105, "rewards/reward_func_1": 0.06457939147949218, "step": 3140 }, { "completion_length": 2.0, "epoch": 0.8455437558811668, "grad_norm": 2.0320292151154717e-06, "kl": 13.8375, "learning_rate": 1.4166126554813508e-06, "loss": 0.5534, "reward": 0.055645179748535153, "reward_std": 0.009349037745414535, "rewards/reward_func_1": 0.055645179748535153, "step": 3145 }, { "completion_length": 2.0, "epoch": 0.8468880225836806, "grad_norm": 3.012095476151444e-06, "kl": 13.8390625, "learning_rate": 1.3926275104643816e-06, "loss": 0.5534, "reward": 0.06417160034179688, "reward_std": 0.011896352579060476, "rewards/reward_func_1": 0.06417160034179688, "step": 3150 }, { "completion_length": 2.0, "epoch": 0.8482322892861944, "grad_norm": 8.71294741955353e-06, "kl": 13.71875, "learning_rate": 1.3688319478967772e-06, "loss": 0.5486, "reward": 0.057692861557006835, "reward_std": 0.008569657542102505, "rewards/reward_func_1": 0.057692861557006835, "step": 3155 }, { "completion_length": 2.0, "epoch": 0.8495765559887082, "grad_norm": 2.56103658102802e-06, "kl": 13.7125, "learning_rate": 1.345226491889815e-06, "loss": 0.5482, "reward": 0.05779485702514649, "reward_std": 0.011118789602187462, "rewards/reward_func_1": 0.05779485702514649, "step": 3160 }, { "completion_length": 2.0, "epoch": 0.8509208226912219, "grad_norm": 2.6276434255123604e-06, "kl": 13.7640625, "learning_rate": 1.3218116623675737e-06, "loss": 0.5509, "reward": 0.055435562133789064, "reward_std": 0.010689648687912268, "rewards/reward_func_1": 0.055435562133789064, "step": 3165 }, { "completion_length": 2.0, "epoch": 0.8522650893937357, "grad_norm": 3.175247911713086e-06, "kl": 13.7875, "learning_rate": 1.298587975055462e-06, "loss": 0.5517, "reward": 0.058620452880859375, "reward_std": 0.012298734129581134, "rewards/reward_func_1": 0.058620452880859375, "step": 3170 }, { "completion_length": 2.0, "epoch": 0.8536093560962495, "grad_norm": 6.266296622925438e-06, "kl": 13.953125, "learning_rate": 1.2755559414688766e-06, "loss": 0.5581, "reward": 0.05827016830444336, "reward_std": 0.011117468139855192, "rewards/reward_func_1": 0.05827016830444336, "step": 3175 }, { "completion_length": 2.0, "epoch": 0.8549536227987633, "grad_norm": 1.6521969882887788e-05, "kl": 13.865625, "learning_rate": 1.2527160689019202e-06, "loss": 0.5546, "reward": 0.05954210758209229, "reward_std": 0.009821428551731515, "rewards/reward_func_1": 0.05954210758209229, "step": 3180 }, { "completion_length": 2.0, "epoch": 0.856297889501277, "grad_norm": 3.3688741041260073e-06, "kl": 13.9859375, "learning_rate": 1.2300688604162458e-06, "loss": 0.5597, "reward": 0.05932321548461914, "reward_std": 0.01184624767920468, "rewards/reward_func_1": 0.05932321548461914, "step": 3185 }, { "completion_length": 2.0, "epoch": 0.8576421562037908, "grad_norm": 2.135618524334859e-05, "kl": 14.153125, "learning_rate": 1.207614814829956e-06, "loss": 0.5663, "reward": 0.060421180725097653, "reward_std": 0.010734908378799446, "rewards/reward_func_1": 0.060421180725097653, "step": 3190 }, { "completion_length": 2.0, "epoch": 0.8589864229063046, "grad_norm": 3.3278847695328295e-06, "kl": 13.675, "learning_rate": 1.1853544267066353e-06, "loss": 0.547, "reward": 0.05666627883911133, "reward_std": 0.009691183110044221, "rewards/reward_func_1": 0.05666627883911133, "step": 3195 }, { "completion_length": 2.0, "epoch": 0.8603306896088184, "grad_norm": 2.5311237550340593e-05, "kl": 13.7171875, "learning_rate": 1.1632881863444412e-06, "loss": 0.5485, "reward": 0.0567962646484375, "reward_std": 0.010639874120533932, "rewards/reward_func_1": 0.0567962646484375, "step": 3200 }, { "completion_length": 2.0, "epoch": 0.8616749563113322, "grad_norm": 3.2120556170411874e-06, "kl": 13.6140625, "learning_rate": 1.141416579765321e-06, "loss": 0.5443, "reward": 0.05926952362060547, "reward_std": 0.011298176337732002, "rewards/reward_func_1": 0.05926952362060547, "step": 3205 }, { "completion_length": 2.0, "epoch": 0.8630192230138459, "grad_norm": 1.2166214219178073e-05, "kl": 13.803125, "learning_rate": 1.1197400887042876e-06, "loss": 0.552, "reward": 0.05761222839355469, "reward_std": 0.01062401667368249, "rewards/reward_func_1": 0.05761222839355469, "step": 3210 }, { "completion_length": 2.0, "epoch": 0.8643634897163597, "grad_norm": 8.964575499703642e-06, "kl": 13.70625, "learning_rate": 1.0982591905988304e-06, "loss": 0.5486, "reward": 0.05393571853637695, "reward_std": 0.01401166350406129, "rewards/reward_func_1": 0.05393571853637695, "step": 3215 }, { "completion_length": 2.0, "epoch": 0.8657077564188735, "grad_norm": 3.3451262879680144e-06, "kl": 13.8015625, "learning_rate": 1.076974358578381e-06, "loss": 0.5522, "reward": 0.05769004821777344, "reward_std": 0.010371179192588897, "rewards/reward_func_1": 0.05769004821777344, "step": 3220 }, { "completion_length": 2.0, "epoch": 0.8670520231213873, "grad_norm": 5.8671166698331945e-06, "kl": 13.8546875, "learning_rate": 1.0558860614539013e-06, "loss": 0.554, "reward": 0.06081085205078125, "reward_std": 0.01084117493883241, "rewards/reward_func_1": 0.06081085205078125, "step": 3225 }, { "completion_length": 2.0, "epoch": 0.868396289823901, "grad_norm": 9.06308378034737e-06, "kl": 13.728125, "learning_rate": 1.034994763707562e-06, "loss": 0.5495, "reward": 0.058400535583496095, "reward_std": 0.010280990242608822, "rewards/reward_func_1": 0.058400535583496095, "step": 3230 }, { "completion_length": 2.0, "epoch": 0.8697405565264148, "grad_norm": 2.764769988061744e-06, "kl": 13.7484375, "learning_rate": 1.014300925482501e-06, "loss": 0.5501, "reward": 0.06383857727050782, "reward_std": 0.011695932724978774, "rewards/reward_func_1": 0.06383857727050782, "step": 3235 }, { "completion_length": 2.0, "epoch": 0.8710848232289287, "grad_norm": 4.055384579260135e-06, "kl": 14.325, "learning_rate": 9.93805002572692e-07, "loss": 0.5734, "reward": 0.06781425476074218, "reward_std": 0.010846901237528073, "rewards/reward_func_1": 0.06781425476074218, "step": 3240 }, { "completion_length": 2.0, "epoch": 0.8724290899314424, "grad_norm": 3.197312707925448e-06, "kl": 13.928125, "learning_rate": 9.735074464129156e-07, "loss": 0.5572, "reward": 0.05835247039794922, "reward_std": 0.011373498971806839, "rewards/reward_func_1": 0.05835247039794922, "step": 3245 }, { "completion_length": 2.0, "epoch": 0.8737733566339562, "grad_norm": 2.8454533094190992e-06, "kl": 13.928125, "learning_rate": 9.534087040687978e-07, "loss": 0.5575, "reward": 0.058566713333129884, "reward_std": 0.012191201363748405, "rewards/reward_func_1": 0.058566713333129884, "step": 3250 }, { "completion_length": 2.0, "epoch": 0.8751176233364699, "grad_norm": 6.40214238956105e-06, "kl": 13.8328125, "learning_rate": 9.335092182269823e-07, "loss": 0.5531, "reward": 0.05842547416687012, "reward_std": 0.013222923100693151, "rewards/reward_func_1": 0.05842547416687012, "step": 3255 }, { "completion_length": 2.0, "epoch": 0.8764618900389838, "grad_norm": 3.3787041502364445e-06, "kl": 13.7484375, "learning_rate": 9.138094271853626e-07, "loss": 0.5499, "reward": 0.057961654663085935, "reward_std": 0.010742506683163811, "rewards/reward_func_1": 0.057961654663085935, "step": 3260 }, { "completion_length": 2.0, "epoch": 0.8778061567414975, "grad_norm": 3.2537961942580296e-06, "kl": 13.9765625, "learning_rate": 8.943097648434451e-07, "loss": 0.5591, "reward": 0.055088233947753903, "reward_std": 0.011486923421034589, "rewards/reward_func_1": 0.055088233947753903, "step": 3265 }, { "completion_length": 2.0, "epoch": 0.8791504234440113, "grad_norm": 4.171860564383678e-05, "kl": 13.75, "learning_rate": 8.750106606927756e-07, "loss": 0.5501, "reward": 0.06052291393280029, "reward_std": 0.010141026746714488, "rewards/reward_func_1": 0.06052291393280029, "step": 3270 }, { "completion_length": 2.0, "epoch": 0.880494690146525, "grad_norm": 5.266811058390886e-06, "kl": 13.8078125, "learning_rate": 8.559125398074941e-07, "loss": 0.5522, "reward": 0.06266212463378906, "reward_std": 0.011405433918116614, "rewards/reward_func_1": 0.06266212463378906, "step": 3275 }, { "completion_length": 2.0, "epoch": 0.8818389568490389, "grad_norm": 1.9777207853621803e-06, "kl": 13.625, "learning_rate": 8.370158228349611e-07, "loss": 0.5449, "reward": 0.059973645210266116, "reward_std": 0.01014815697853919, "rewards/reward_func_1": 0.059973645210266116, "step": 3280 }, { "completion_length": 2.0, "epoch": 0.8831832235515527, "grad_norm": 3.1561939977109432e-06, "kl": 14.0640625, "learning_rate": 8.18320925986501e-07, "loss": 0.5626, "reward": 0.061003684997558594, "reward_std": 0.011469524375570472, "rewards/reward_func_1": 0.061003684997558594, "step": 3285 }, { "completion_length": 2.0, "epoch": 0.8845274902540664, "grad_norm": 3.982539965363685e-06, "kl": 13.9203125, "learning_rate": 7.998282610282282e-07, "loss": 0.5569, "reward": 0.05995340347290039, "reward_std": 0.009807140480552335, "rewards/reward_func_1": 0.05995340347290039, "step": 3290 }, { "completion_length": 2.0, "epoch": 0.8858717569565802, "grad_norm": 5.228612280916423e-06, "kl": 13.903125, "learning_rate": 7.815382352719836e-07, "loss": 0.5559, "reward": 0.06256370544433594, "reward_std": 0.010237712755042594, "rewards/reward_func_1": 0.06256370544433594, "step": 3295 }, { "completion_length": 2.0, "epoch": 0.8872160236590939, "grad_norm": 3.3338096727675293e-06, "kl": 13.9515625, "learning_rate": 7.63451251566355e-07, "loss": 0.5578, "reward": 0.059042739868164065, "reward_std": 0.011339801916619763, "rewards/reward_func_1": 0.059042739868164065, "step": 3300 }, { "completion_length": 2.0, "epoch": 0.8885602903616078, "grad_norm": 1.969355253095273e-06, "kl": 13.7625, "learning_rate": 7.455677082878144e-07, "loss": 0.5507, "reward": 0.05534934997558594, "reward_std": 0.011414243758190423, "rewards/reward_func_1": 0.05534934997558594, "step": 3305 }, { "completion_length": 2.0, "epoch": 0.8899045570641215, "grad_norm": 2.7435919491836103e-06, "kl": 13.825, "learning_rate": 7.278879993319399e-07, "loss": 0.5528, "reward": 0.0598332405090332, "reward_std": 0.012269772328363616, "rewards/reward_func_1": 0.0598332405090332, "step": 3310 }, { "completion_length": 2.0, "epoch": 0.8912488237666353, "grad_norm": 1.6360121435354813e-06, "kl": 13.8203125, "learning_rate": 7.104125141047314e-07, "loss": 0.5529, "reward": 0.06058578491210938, "reward_std": 0.0112239549322112, "rewards/reward_func_1": 0.06058578491210938, "step": 3315 }, { "completion_length": 2.0, "epoch": 0.892593090469149, "grad_norm": 5.12127780893934e-06, "kl": 14.0453125, "learning_rate": 6.931416375140465e-07, "loss": 0.5618, "reward": 0.05910205841064453, "reward_std": 0.010322065434593242, "rewards/reward_func_1": 0.05910205841064453, "step": 3320 }, { "completion_length": 2.0, "epoch": 0.8939373571716629, "grad_norm": 1.6029716789489612e-05, "kl": 13.6296875, "learning_rate": 6.760757499611193e-07, "loss": 0.5452, "reward": 0.05997223854064941, "reward_std": 0.010195591623778455, "rewards/reward_func_1": 0.05997223854064941, "step": 3325 }, { "completion_length": 2.0, "epoch": 0.8952816238741766, "grad_norm": 9.6236435638275e-06, "kl": 13.7984375, "learning_rate": 6.592152273321706e-07, "loss": 0.5521, "reward": 0.051597309112548825, "reward_std": 0.011377132889901987, "rewards/reward_func_1": 0.051597309112548825, "step": 3330 }, { "completion_length": 2.0, "epoch": 0.8966258905766904, "grad_norm": 3.594194595279987e-06, "kl": 13.5984375, "learning_rate": 6.425604409901454e-07, "loss": 0.5443, "reward": 0.06611251831054688, "reward_std": 0.009431628473248566, "rewards/reward_func_1": 0.06611251831054688, "step": 3335 }, { "completion_length": 2.0, "epoch": 0.8979701572792042, "grad_norm": 5.3717056289315224e-05, "kl": 14.0, "learning_rate": 6.261117577665254e-07, "loss": 0.5599, "reward": 0.05563621520996094, "reward_std": 0.011836006561861723, "rewards/reward_func_1": 0.05563621520996094, "step": 3340 }, { "completion_length": 2.0, "epoch": 0.899314423981718, "grad_norm": 2.7849052912642946e-06, "kl": 14.0578125, "learning_rate": 6.098695399532451e-07, "loss": 0.5625, "reward": 0.06023540496826172, "reward_std": 0.012114391791692469, "rewards/reward_func_1": 0.06023540496826172, "step": 3345 }, { "completion_length": 2.0, "epoch": 0.9006586906842318, "grad_norm": 2.9332081794564147e-06, "kl": 14.121875, "learning_rate": 5.938341452947227e-07, "loss": 0.5648, "reward": 0.059270381927490234, "reward_std": 0.010218441683537093, "rewards/reward_func_1": 0.059270381927490234, "step": 3350 }, { "completion_length": 2.0, "epoch": 0.9020029573867455, "grad_norm": 2.1531218408199493e-06, "kl": 13.9484375, "learning_rate": 5.780059269799676e-07, "loss": 0.5583, "reward": 0.06291056871414184, "reward_std": 0.013561049330746755, "rewards/reward_func_1": 0.06291056871414184, "step": 3355 }, { "completion_length": 2.0, "epoch": 0.9033472240892593, "grad_norm": 1.5980087482603267e-05, "kl": 13.7625, "learning_rate": 5.623852336348156e-07, "loss": 0.5505, "reward": 0.06374626159667969, "reward_std": 0.011463577805261593, "rewards/reward_func_1": 0.06374626159667969, "step": 3360 }, { "completion_length": 2.0, "epoch": 0.9046914907917731, "grad_norm": 3.6081160033063497e-06, "kl": 13.6828125, "learning_rate": 5.469724093142359e-07, "loss": 0.5474, "reward": 0.058330869674682616, "reward_std": 0.010767394045251422, "rewards/reward_func_1": 0.058330869674682616, "step": 3365 }, { "completion_length": 2.0, "epoch": 0.9060357574942869, "grad_norm": 3.7672652979381382e-06, "kl": 14.1046875, "learning_rate": 5.317677934947652e-07, "loss": 0.5643, "reward": 0.06058921813964844, "reward_std": 0.01233749669700046, "rewards/reward_func_1": 0.06058921813964844, "step": 3370 }, { "completion_length": 2.0, "epoch": 0.9073800241968006, "grad_norm": 6.598625532205915e-06, "kl": 13.7140625, "learning_rate": 5.167717210670232e-07, "loss": 0.5486, "reward": 0.06335010528564453, "reward_std": 0.01017670587534667, "rewards/reward_func_1": 0.06335010528564453, "step": 3375 }, { "completion_length": 2.0, "epoch": 0.9087242908993144, "grad_norm": 6.818109341111267e-06, "kl": 14.1703125, "learning_rate": 5.019845223283393e-07, "loss": 0.5669, "reward": 0.06114330291748047, "reward_std": 0.012632400382426568, "rewards/reward_func_1": 0.06114330291748047, "step": 3380 }, { "completion_length": 2.0, "epoch": 0.9100685576018283, "grad_norm": 4.953264578944072e-06, "kl": 13.8640625, "learning_rate": 4.874065229754743e-07, "loss": 0.5543, "reward": 0.05895808935165405, "reward_std": 0.009361990720572066, "rewards/reward_func_1": 0.05895808935165405, "step": 3385 }, { "completion_length": 2.0, "epoch": 0.911412824304342, "grad_norm": 3.3497417462058365e-06, "kl": 13.8671875, "learning_rate": 4.730380440974536e-07, "loss": 0.5548, "reward": 0.05743751525878906, "reward_std": 0.011488685388758312, "rewards/reward_func_1": 0.05743751525878906, "step": 3390 }, { "completion_length": 2.0, "epoch": 0.9127570910068558, "grad_norm": 1.907120349642355e-05, "kl": 13.7640625, "learning_rate": 4.588794021684861e-07, "loss": 0.5505, "reward": 0.05307474136352539, "reward_std": 0.011330662002728786, "rewards/reward_func_1": 0.05307474136352539, "step": 3395 }, { "completion_length": 2.0, "epoch": 0.9141013577093695, "grad_norm": 1.0828506674442906e-05, "kl": 13.784375, "learning_rate": 4.4493090904100366e-07, "loss": 0.5516, "reward": 0.0565185546875, "reward_std": 0.009959327296382981, "rewards/reward_func_1": 0.0565185546875, "step": 3400 }, { "completion_length": 2.0, "epoch": 0.9154456244118833, "grad_norm": 6.197794391482603e-06, "kl": 13.771875, "learning_rate": 4.3119287193878035e-07, "loss": 0.5511, "reward": 0.059843674302101135, "reward_std": 0.013095743974554352, "rewards/reward_func_1": 0.059843674302101135, "step": 3405 }, { "completion_length": 2.0, "epoch": 0.9167898911143971, "grad_norm": 1.7243120282728341e-06, "kl": 13.775, "learning_rate": 4.176655934501783e-07, "loss": 0.5513, "reward": 0.060840415954589847, "reward_std": 0.009967916857567616, "rewards/reward_func_1": 0.060840415954589847, "step": 3410 }, { "completion_length": 2.0, "epoch": 0.9181341578169109, "grad_norm": 3.116844027317711e-06, "kl": 13.9265625, "learning_rate": 4.04349371521473e-07, "loss": 0.5572, "reward": 0.06282119750976563, "reward_std": 0.010909009316674202, "rewards/reward_func_1": 0.06282119750976563, "step": 3415 }, { "completion_length": 2.0, "epoch": 0.9194784245194246, "grad_norm": 3.4594206681504147e-06, "kl": 13.4734375, "learning_rate": 3.912444994503006e-07, "loss": 0.539, "reward": 0.055774879455566403, "reward_std": 0.012623810911463806, "rewards/reward_func_1": 0.055774879455566403, "step": 3420 }, { "completion_length": 2.0, "epoch": 0.9208226912219384, "grad_norm": 4.940301550959703e-06, "kl": 13.74375, "learning_rate": 3.783512658791821e-07, "loss": 0.55, "reward": 0.05670597553253174, "reward_std": 0.01125905594062715, "rewards/reward_func_1": 0.05670597553253174, "step": 3425 }, { "completion_length": 2.0, "epoch": 0.9221669579244522, "grad_norm": 5.229050202615326e-06, "kl": 13.6734375, "learning_rate": 3.6566995478918733e-07, "loss": 0.547, "reward": 0.059182238578796384, "reward_std": 0.0114708733453881, "rewards/reward_func_1": 0.059182238578796384, "step": 3430 }, { "completion_length": 2.0, "epoch": 0.923511224626966, "grad_norm": 4.601683485816466e-06, "kl": 13.7171875, "learning_rate": 3.5320084549365864e-07, "loss": 0.5489, "reward": 0.059515857696533205, "reward_std": 0.01207662059168797, "rewards/reward_func_1": 0.059515857696533205, "step": 3435 }, { "completion_length": 2.0, "epoch": 0.9248554913294798, "grad_norm": 7.413936600642046e-06, "kl": 14.0421875, "learning_rate": 3.409442126320761e-07, "loss": 0.5622, "reward": 0.0631723403930664, "reward_std": 0.011003713330137544, "rewards/reward_func_1": 0.0631723403930664, "step": 3440 }, { "completion_length": 2.0, "epoch": 0.9261997580319935, "grad_norm": 7.578887107229093e-06, "kl": 14.0140625, "learning_rate": 3.289003261639978e-07, "loss": 0.5607, "reward": 0.06122303009033203, "reward_std": 0.008386581853119423, "rewards/reward_func_1": 0.06122303009033203, "step": 3445 }, { "completion_length": 2.0, "epoch": 0.9275440247345074, "grad_norm": 2.735734824454994e-06, "kl": 13.5484375, "learning_rate": 3.170694513631178e-07, "loss": 0.5421, "reward": 0.05534172058105469, "reward_std": 0.01105921419657534, "rewards/reward_func_1": 0.05534172058105469, "step": 3450 }, { "completion_length": 2.0, "epoch": 0.9288882914370211, "grad_norm": 6.3263983065553475e-06, "kl": 13.9046875, "learning_rate": 3.054518488114211e-07, "loss": 0.5563, "reward": 0.054802989959716795, "reward_std": 0.011252586312184575, "rewards/reward_func_1": 0.054802989959716795, "step": 3455 }, { "completion_length": 2.0, "epoch": 0.9302325581395349, "grad_norm": 1.8264050595462322e-05, "kl": 13.9765625, "learning_rate": 2.9404777439345e-07, "loss": 0.5592, "reward": 0.06266040802001953, "reward_std": 0.011020671827282058, "rewards/reward_func_1": 0.06266040802001953, "step": 3460 }, { "completion_length": 2.0, "epoch": 0.9315768248420486, "grad_norm": 4.062319021613803e-06, "kl": 13.3328125, "learning_rate": 2.828574792906602e-07, "loss": 0.5334, "reward": 0.05159635543823242, "reward_std": 0.012718734997179126, "rewards/reward_func_1": 0.05159635543823242, "step": 3465 }, { "completion_length": 2.0, "epoch": 0.9329210915445625, "grad_norm": 3.980569545092294e-06, "kl": 14.04375, "learning_rate": 2.718812099758927e-07, "loss": 0.5621, "reward": 0.059010887145996095, "reward_std": 0.011960663207719335, "rewards/reward_func_1": 0.059010887145996095, "step": 3470 }, { "completion_length": 2.0, "epoch": 0.9342653582470762, "grad_norm": 7.070525043673115e-06, "kl": 13.7671875, "learning_rate": 2.61119208207945e-07, "loss": 0.5507, "reward": 0.06236776113510132, "reward_std": 0.012535562692573877, "rewards/reward_func_1": 0.06236776113510132, "step": 3475 }, { "completion_length": 2.0, "epoch": 0.93560962494959, "grad_norm": 1.0806640602822881e-05, "kl": 14.009375, "learning_rate": 2.5057171102624623e-07, "loss": 0.5605, "reward": 0.06075210571289062, "reward_std": 0.010957903016242198, "rewards/reward_func_1": 0.06075210571289062, "step": 3480 }, { "completion_length": 2.0, "epoch": 0.9369538916521037, "grad_norm": 7.847236702218652e-06, "kl": 13.5546875, "learning_rate": 2.4023895074563266e-07, "loss": 0.5421, "reward": 0.05129318237304688, "reward_std": 0.011580966626206645, "rewards/reward_func_1": 0.05129318237304688, "step": 3485 }, { "completion_length": 2.0, "epoch": 0.9382981583546176, "grad_norm": 1.2028275705233682e-05, "kl": 13.9296875, "learning_rate": 2.3012115495123944e-07, "loss": 0.5573, "reward": 0.060091400146484376, "reward_std": 0.012295650782471057, "rewards/reward_func_1": 0.060091400146484376, "step": 3490 }, { "completion_length": 2.0, "epoch": 0.9396424250571314, "grad_norm": 4.723935944639379e-06, "kl": 13.775, "learning_rate": 2.2021854649347696e-07, "loss": 0.551, "reward": 0.06290969848632813, "reward_std": 0.01066894597352075, "rewards/reward_func_1": 0.06290969848632813, "step": 3495 }, { "completion_length": 2.0, "epoch": 0.9409866917596451, "grad_norm": 2.703375912460615e-06, "kl": 14.0421875, "learning_rate": 2.105313434831302e-07, "loss": 0.5619, "reward": 0.061441230773925784, "reward_std": 0.01024222782725701, "rewards/reward_func_1": 0.061441230773925784, "step": 3500 }, { "completion_length": 2.0, "epoch": 0.9423309584621589, "grad_norm": 5.110204710945254e-06, "kl": 14.0078125, "learning_rate": 2.0105975928655154e-07, "loss": 0.5603, "reward": 0.05853328704833984, "reward_std": 0.011592859703523573, "rewards/reward_func_1": 0.05853328704833984, "step": 3505 }, { "completion_length": 2.0, "epoch": 0.9436752251646727, "grad_norm": 2.7313865302858176e-06, "kl": 13.821875, "learning_rate": 1.9180400252096332e-07, "loss": 0.5529, "reward": 0.05967512130737305, "reward_std": 0.010000402305377066, "rewards/reward_func_1": 0.05967512130737305, "step": 3510 }, { "completion_length": 2.0, "epoch": 0.9450194918671865, "grad_norm": 5.0758940233208705e-06, "kl": 14.1890625, "learning_rate": 1.8276427704985944e-07, "loss": 0.5674, "reward": 0.06556577682495117, "reward_std": 0.01243010827965918, "rewards/reward_func_1": 0.06556577682495117, "step": 3515 }, { "completion_length": 2.0, "epoch": 0.9463637585697002, "grad_norm": 4.3392617953941226e-05, "kl": 13.828125, "learning_rate": 1.7394078197851883e-07, "loss": 0.5531, "reward": 0.06168599128723144, "reward_std": 0.011275436536379857, "rewards/reward_func_1": 0.06168599128723144, "step": 3520 }, { "completion_length": 2.0, "epoch": 0.947708025272214, "grad_norm": 3.3501562484161695e-06, "kl": 13.7171875, "learning_rate": 1.6533371164961675e-07, "loss": 0.5485, "reward": 0.05614547729492188, "reward_std": 0.012279793498601066, "rewards/reward_func_1": 0.05614547729492188, "step": 3525 }, { "completion_length": 2.0, "epoch": 0.9490522919747277, "grad_norm": 1.6229182620008942e-06, "kl": 13.9203125, "learning_rate": 1.569432556389494e-07, "loss": 0.5568, "reward": 0.059717750549316405, "reward_std": 0.010331755940569565, "rewards/reward_func_1": 0.059717750549316405, "step": 3530 }, { "completion_length": 2.0, "epoch": 0.9503965586772416, "grad_norm": 4.219915808789665e-06, "kl": 14.1, "learning_rate": 1.4876959875125163e-07, "loss": 0.5642, "reward": 0.06116485595703125, "reward_std": 0.010313475892326096, "rewards/reward_func_1": 0.06116485595703125, "step": 3535 }, { "completion_length": 2.0, "epoch": 0.9517408253797554, "grad_norm": 2.8452825517888414e-06, "kl": 13.8171875, "learning_rate": 1.4081292101613241e-07, "loss": 0.5527, "reward": 0.05729732513427734, "reward_std": 0.010209962361841463, "rewards/reward_func_1": 0.05729732513427734, "step": 3540 }, { "completion_length": 2.0, "epoch": 0.9530850920822691, "grad_norm": 7.0745595621701796e-06, "kl": 13.7015625, "learning_rate": 1.3307339768410365e-07, "loss": 0.5482, "reward": 0.05454435348510742, "reward_std": 0.011191799660446122, "rewards/reward_func_1": 0.05454435348510742, "step": 3545 }, { "completion_length": 2.0, "epoch": 0.9544293587847829, "grad_norm": 5.309683274390409e-06, "kl": 14.0046875, "learning_rate": 1.2555119922272762e-07, "loss": 0.56, "reward": 0.060787296295166014, "reward_std": 0.011707495429436676, "rewards/reward_func_1": 0.060787296295166014, "step": 3550 }, { "completion_length": 2.0, "epoch": 0.9557736254872967, "grad_norm": 2.6409459223941667e-06, "kl": 13.8515625, "learning_rate": 1.182464913128556e-07, "loss": 0.5541, "reward": 0.05850715637207031, "reward_std": 0.011938859339716145, "rewards/reward_func_1": 0.05850715637207031, "step": 3555 }, { "completion_length": 2.0, "epoch": 0.9571178921898105, "grad_norm": 2.5191229724441655e-05, "kl": 13.9328125, "learning_rate": 1.1115943484498292e-07, "loss": 0.5573, "reward": 0.060850906372070315, "reward_std": 0.008408385679285858, "rewards/reward_func_1": 0.060850906372070315, "step": 3560 }, { "completion_length": 2.0, "epoch": 0.9584621588923242, "grad_norm": 2.427203526167432e-06, "kl": 13.684375, "learning_rate": 1.0429018591570195e-07, "loss": 0.5472, "reward": 0.062432861328125, "reward_std": 0.011829619569471105, "rewards/reward_func_1": 0.062432861328125, "step": 3565 }, { "completion_length": 2.0, "epoch": 0.959806425594838, "grad_norm": 6.474336259998381e-06, "kl": 13.7390625, "learning_rate": 9.7638895824268e-08, "loss": 0.5494, "reward": 0.06272506713867188, "reward_std": 0.010429763507272583, "rewards/reward_func_1": 0.06272506713867188, "step": 3570 }, { "completion_length": 2.0, "epoch": 0.9611506922973518, "grad_norm": 3.807508164754836e-06, "kl": 13.89375, "learning_rate": 9.120571106926212e-08, "loss": 0.5563, "reward": 0.05861101150512695, "reward_std": 0.012294219210161828, "rewards/reward_func_1": 0.05861101150512695, "step": 3575 }, { "completion_length": 2.0, "epoch": 0.9624949589998656, "grad_norm": 2.37896620092215e-06, "kl": 14.1109375, "learning_rate": 8.499077334536921e-08, "loss": 0.5646, "reward": 0.0572235107421875, "reward_std": 0.01089623533844133, "rewards/reward_func_1": 0.0572235107421875, "step": 3580 }, { "completion_length": 2.0, "epoch": 0.9638392257023793, "grad_norm": 3.802741503022844e-06, "kl": 13.753125, "learning_rate": 7.899421954025266e-08, "loss": 0.5501, "reward": 0.0643655776977539, "reward_std": 0.010973099654074758, "rewards/reward_func_1": 0.0643655776977539, "step": 3585 }, { "completion_length": 2.0, "epoch": 0.9651834924048931, "grad_norm": 5.4865031415829435e-06, "kl": 13.7015625, "learning_rate": 7.321618173154466e-08, "loss": 0.5481, "reward": 0.05659542083740234, "reward_std": 0.011230782363782055, "rewards/reward_func_1": 0.05659542083740234, "step": 3590 }, { "completion_length": 2.0, "epoch": 0.966527759107407, "grad_norm": 1.9297044673294295e-06, "kl": 14.0859375, "learning_rate": 6.765678718392843e-08, "loss": 0.5633, "reward": 0.0672616958618164, "reward_std": 0.013510283493087628, "rewards/reward_func_1": 0.0672616958618164, "step": 3595 }, { "completion_length": 2.0, "epoch": 0.9678720258099207, "grad_norm": 3.233315283068805e-06, "kl": 13.9375, "learning_rate": 6.231615834634497e-08, "loss": 0.5572, "reward": 0.06126976013183594, "reward_std": 0.010508609988391981, "rewards/reward_func_1": 0.06126976013183594, "step": 3600 }, { "completion_length": 2.0, "epoch": 0.9692162925124345, "grad_norm": 3.4369802506262204e-06, "kl": 13.734375, "learning_rate": 5.719441284929073e-08, "loss": 0.5495, "reward": 0.06183929443359375, "reward_std": 0.009256316086975858, "rewards/reward_func_1": 0.06183929443359375, "step": 3605 }, { "completion_length": 2.0, "epoch": 0.9705605592149482, "grad_norm": 2.0105403564230073e-06, "kl": 13.815625, "learning_rate": 5.229166350222747e-08, "loss": 0.5525, "reward": 0.06485710144042969, "reward_std": 0.010634588305765646, "rewards/reward_func_1": 0.06485710144042969, "step": 3610 }, { "completion_length": 2.0, "epoch": 0.9719048259174621, "grad_norm": 1.3547062735597137e-05, "kl": 13.8625, "learning_rate": 4.760801829109763e-08, "loss": 0.5546, "reward": 0.0634115219116211, "reward_std": 0.010813204231817508, "rewards/reward_func_1": 0.0634115219116211, "step": 3615 }, { "completion_length": 2.0, "epoch": 0.9732490926199758, "grad_norm": 1.8916140334113152e-06, "kl": 13.721875, "learning_rate": 4.3143580375945016e-08, "loss": 0.549, "reward": 0.05869293212890625, "reward_std": 0.010853508513537235, "rewards/reward_func_1": 0.05869293212890625, "step": 3620 }, { "completion_length": 2.0, "epoch": 0.9745933593224896, "grad_norm": 4.1851594687614124e-06, "kl": 13.4625, "learning_rate": 3.889844808864451e-08, "loss": 0.5387, "reward": 0.059955787658691403, "reward_std": 0.010277136050717672, "rewards/reward_func_1": 0.059955787658691403, "step": 3625 }, { "completion_length": 2.0, "epoch": 0.9759376260250033, "grad_norm": 7.070749688864453e-06, "kl": 13.9234375, "learning_rate": 3.487271493073596e-08, "loss": 0.5572, "reward": 0.0633173942565918, "reward_std": 0.008260493339184905, "rewards/reward_func_1": 0.0633173942565918, "step": 3630 }, { "completion_length": 2.0, "epoch": 0.9772818927275171, "grad_norm": 2.236297405033838e-06, "kl": 13.834375, "learning_rate": 3.106646957136472e-08, "loss": 0.5532, "reward": 0.06124534606933594, "reward_std": 0.011660033430234761, "rewards/reward_func_1": 0.06124534606933594, "step": 3635 }, { "completion_length": 2.0, "epoch": 0.978626159430031, "grad_norm": 2.2983462258707732e-05, "kl": 13.9046875, "learning_rate": 2.7479795845324342e-08, "loss": 0.5563, "reward": 0.05750617980957031, "reward_std": 0.01097970688406349, "rewards/reward_func_1": 0.05750617980957031, "step": 3640 }, { "completion_length": 2.0, "epoch": 0.9799704261325447, "grad_norm": 3.4399413380015176e-06, "kl": 13.965625, "learning_rate": 2.411277275121915e-08, "loss": 0.5586, "reward": 0.062087726593017575, "reward_std": 0.016095589974429458, "rewards/reward_func_1": 0.062087726593017575, "step": 3645 }, { "completion_length": 2.0, "epoch": 0.9813146928350585, "grad_norm": 5.6823751037882175e-06, "kl": 13.8625, "learning_rate": 2.096547444971453e-08, "loss": 0.5547, "reward": 0.06416492462158203, "reward_std": 0.012935893231770024, "rewards/reward_func_1": 0.06416492462158203, "step": 3650 }, { "completion_length": 2.0, "epoch": 0.9826589595375722, "grad_norm": 3.1565050448989496e-06, "kl": 13.8734375, "learning_rate": 1.8037970261909343e-08, "loss": 0.5549, "reward": 0.060312080383300784, "reward_std": 0.01092442618610221, "rewards/reward_func_1": 0.060312080383300784, "step": 3655 }, { "completion_length": 2.0, "epoch": 0.9840032262400861, "grad_norm": 3.1137699352257187e-06, "kl": 13.7796875, "learning_rate": 1.533032466780826e-08, "loss": 0.5516, "reward": 0.06079998016357422, "reward_std": 0.011588454757293221, "rewards/reward_func_1": 0.06079998016357422, "step": 3660 }, { "completion_length": 2.0, "epoch": 0.9853474929425998, "grad_norm": 1.9204537238692865e-05, "kl": 13.83125, "learning_rate": 1.2842597304901783e-08, "loss": 0.5532, "reward": 0.05525150299072266, "reward_std": 0.008975948392617283, "rewards/reward_func_1": 0.05525150299072266, "step": 3665 }, { "completion_length": 2.0, "epoch": 0.9866917596451136, "grad_norm": 3.472564230833086e-06, "kl": 13.6734375, "learning_rate": 1.057484296684841e-08, "loss": 0.5469, "reward": 0.06324386596679688, "reward_std": 0.013345763047982472, "rewards/reward_func_1": 0.06324386596679688, "step": 3670 }, { "completion_length": 2.0, "epoch": 0.9880360263476273, "grad_norm": 2.0865973056061193e-06, "kl": 13.6015625, "learning_rate": 8.527111602273375e-09, "loss": 0.5439, "reward": 0.057076644897460935, "reward_std": 0.01272732454162906, "rewards/reward_func_1": 0.057076644897460935, "step": 3675 }, { "completion_length": 2.0, "epoch": 0.9893802930501412, "grad_norm": 5.3152875807427336e-06, "kl": 13.7734375, "learning_rate": 6.699448313668422e-09, "loss": 0.5508, "reward": 0.06054000854492188, "reward_std": 0.010864520556788193, "rewards/reward_func_1": 0.06054000854492188, "step": 3680 }, { "completion_length": 2.0, "epoch": 0.9907245597526549, "grad_norm": 1.824535843297781e-06, "kl": 14.0265625, "learning_rate": 5.0918933563914866e-09, "loss": 0.561, "reward": 0.05926017761230469, "reward_std": 0.011259193480873364, "rewards/reward_func_1": 0.05926017761230469, "step": 3685 }, { "completion_length": 2.0, "epoch": 0.9920688264551687, "grad_norm": 3.396847887415788e-06, "kl": 13.934375, "learning_rate": 3.7044821377896225e-09, "loss": 0.5574, "reward": 0.058501815795898436, "reward_std": 0.010197188252641353, "rewards/reward_func_1": 0.058501815795898436, "step": 3690 }, { "completion_length": 2.0, "epoch": 0.9934130931576824, "grad_norm": 2.0717141069326317e-06, "kl": 13.6671875, "learning_rate": 2.537245216410744e-09, "loss": 0.5469, "reward": 0.06255474090576171, "reward_std": 0.010947992081491975, "rewards/reward_func_1": 0.06255474090576171, "step": 3695 }, { "completion_length": 2.0, "epoch": 0.9947573598601963, "grad_norm": 4.1668949961604085e-06, "kl": 13.8796875, "learning_rate": 1.590208301335272e-09, "loss": 0.5552, "reward": 0.06215476989746094, "reward_std": 0.011439351307490141, "rewards/reward_func_1": 0.06215476989746094, "step": 3700 }, { "completion_length": 2.0, "epoch": 0.9961016265627101, "grad_norm": 4.461783646547701e-06, "kl": 14.01875, "learning_rate": 8.633922516110283e-10, "loss": 0.5609, "reward": 0.05896968841552734, "reward_std": 0.01078853727231035, "rewards/reward_func_1": 0.05896968841552734, "step": 3705 }, { "completion_length": 2.0, "epoch": 0.9974458932652238, "grad_norm": 3.0964040433900664e-06, "kl": 13.6203125, "learning_rate": 3.568130757880539e-10, "loss": 0.545, "reward": 0.056448173522949216, "reward_std": 0.009608041982573923, "rewards/reward_func_1": 0.056448173522949216, "step": 3710 }, { "completion_length": 2.0, "epoch": 0.9987901599677376, "grad_norm": 6.054201548977289e-06, "kl": 13.7796875, "learning_rate": 7.048193157221939e-11, "loss": 0.5512, "reward": 0.06598529815673829, "reward_std": 0.011622372209239984, "rewards/reward_func_1": 0.06598529815673829, "step": 3715 }, { "completion_length": 2.0, "epoch": 0.9998655733297486, "kl": 13.673828125, "reward": 0.06168794631958008, "reward_std": 0.012013631283480208, "rewards/reward_func_1": 0.06168794631958008, "step": 3719, "total_flos": 0.0, "train_loss": 604316576171.3622, "train_runtime": 47153.6794, "train_samples_per_second": 1.262, "train_steps_per_second": 0.079 } ], "logging_steps": 5, "max_steps": 3719, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }