diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8963 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998655733297486, + "eval_steps": 500, + "global_step": 3719, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 459.0625, + "epoch": 0.0013442667025137787, + "grad_norm": 33.51481246948242, + "kl": 0.001410222053527832, + "learning_rate": 2.688172043010753e-07, + "loss": 0.0001, + "reward": 0.1325918197631836, + "reward_std": 0.02248889727052301, + "rewards/reward_func_1": 0.1325918197631836, + "step": 5 + }, + { + "completion_length": 479.13125, + "epoch": 0.0026885334050275574, + "grad_norm": 41.71512985229492, + "kl": 0.005267477035522461, + "learning_rate": 5.376344086021506e-07, + "loss": 0.0002, + "reward": 0.13215713500976561, + "reward_std": 0.024013734073378146, + "rewards/reward_func_1": 0.13215713500976561, + "step": 10 + }, + { + "completion_length": 454.19375, + "epoch": 0.0040328001075413365, + "grad_norm": 38.0341911315918, + "kl": 0.15977706909179687, + "learning_rate": 8.064516129032258e-07, + "loss": 0.0064, + "reward": 0.1277914047241211, + "reward_std": 0.023130150814540684, + "rewards/reward_func_1": 0.1277914047241211, + "step": 15 + }, + { + "completion_length": 525.4125, + "epoch": 0.005377066810055115, + "grad_norm": 22.35483169555664, + "kl": 0.31226425170898436, + "learning_rate": 1.0752688172043011e-06, + "loss": 0.0125, + "reward": 0.13453254699707032, + "reward_std": 0.02571835172129795, + "rewards/reward_func_1": 0.13453254699707032, + "step": 20 + }, + { + "completion_length": 537.725, + "epoch": 0.006721333512568894, + "grad_norm": 26.161535263061523, + "kl": 0.36166534423828123, + "learning_rate": 1.3440860215053765e-06, + "loss": 0.0145, + "reward": 0.14296913146972656, + "reward_std": 0.022099756821990012, + "rewards/reward_func_1": 0.14296913146972656, + "step": 25 + }, + { + "completion_length": 471.96875, + "epoch": 0.008065600215082673, + "grad_norm": 108.09612274169922, + "kl": 0.27325439453125, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.0109, + "reward": 0.12444114685058594, + "reward_std": 0.025380318914540113, + "rewards/reward_func_1": 0.12444114685058594, + "step": 30 + }, + { + "completion_length": 566.39375, + "epoch": 0.009409866917596451, + "grad_norm": 28.055280685424805, + "kl": 0.4273193359375, + "learning_rate": 1.881720430107527e-06, + "loss": 0.0171, + "reward": 0.13999091386795043, + "reward_std": 0.023209166852757333, + "rewards/reward_func_1": 0.13999091386795043, + "step": 35 + }, + { + "completion_length": 558.03125, + "epoch": 0.01075413362011023, + "grad_norm": 92.63247680664062, + "kl": 0.7135009765625, + "learning_rate": 2.1505376344086023e-06, + "loss": 0.0285, + "reward": 0.13420333862304687, + "reward_std": 0.021424611564725637, + "rewards/reward_func_1": 0.13420333862304687, + "step": 40 + }, + { + "completion_length": 524.88125, + "epoch": 0.012098400322624008, + "grad_norm": 13.822972297668457, + "kl": 13.015373229980469, + "learning_rate": 2.4193548387096776e-06, + "loss": 0.5195, + "reward": 0.137255859375, + "reward_std": 0.02337467367760837, + "rewards/reward_func_1": 0.137255859375, + "step": 45 + }, + { + "completion_length": 554.8125, + "epoch": 0.013442667025137788, + "grad_norm": 43.86341857910156, + "kl": 0.65223388671875, + "learning_rate": 2.688172043010753e-06, + "loss": 0.0261, + "reward": 0.13823509216308594, + "reward_std": 0.02139872215921059, + "rewards/reward_func_1": 0.13823509216308594, + "step": 50 + }, + { + "completion_length": 453.91875, + "epoch": 0.014786933727651566, + "grad_norm": 17.72090721130371, + "kl": 0.695703125, + "learning_rate": 2.9569892473118283e-06, + "loss": 0.0278, + "reward": 0.13162574768066407, + "reward_std": 0.0243722494575195, + "rewards/reward_func_1": 0.13162574768066407, + "step": 55 + }, + { + "completion_length": 477.7375, + "epoch": 0.016131200430165346, + "grad_norm": 51.72431182861328, + "kl": 1.161859130859375, + "learning_rate": 3.225806451612903e-06, + "loss": 0.0465, + "reward": 0.13519821166992188, + "reward_std": 0.019137346441857515, + "rewards/reward_func_1": 0.13519821166992188, + "step": 60 + }, + { + "completion_length": 491.5375, + "epoch": 0.017475467132679123, + "grad_norm": 23.050928115844727, + "kl": 1.3964111328125, + "learning_rate": 3.494623655913979e-06, + "loss": 0.0559, + "reward": 0.12957611083984374, + "reward_std": 0.021138915204210205, + "rewards/reward_func_1": 0.12957611083984374, + "step": 65 + }, + { + "completion_length": 510.4125, + "epoch": 0.018819733835192903, + "grad_norm": 26.82096290588379, + "kl": 2.96982421875, + "learning_rate": 3.763440860215054e-06, + "loss": 0.1189, + "reward": 0.13300743103027343, + "reward_std": 0.024069122620858252, + "rewards/reward_func_1": 0.13300743103027343, + "step": 70 + }, + { + "completion_length": 442.7375, + "epoch": 0.020164000537706683, + "grad_norm": 26.9343318939209, + "kl": 3.32802734375, + "learning_rate": 4.032258064516129e-06, + "loss": 0.1332, + "reward": 0.12296409010887147, + "reward_std": 0.025652985728811473, + "rewards/reward_func_1": 0.12296409010887147, + "step": 75 + }, + { + "completion_length": 585.75625, + "epoch": 0.02150826724022046, + "grad_norm": 6.579223155975342, + "kl": 69.7091796875, + "learning_rate": 4.3010752688172045e-06, + "loss": 2.7885, + "reward": 0.1406890869140625, + "reward_std": 0.021016028558369725, + "rewards/reward_func_1": 0.1406890869140625, + "step": 80 + }, + { + "completion_length": 549.41875, + "epoch": 0.02285253394273424, + "grad_norm": 3.3090600967407227, + "kl": 1.4935546875, + "learning_rate": 4.56989247311828e-06, + "loss": 0.0597, + "reward": 0.13293228149414063, + "reward_std": 0.0250552476150915, + "rewards/reward_func_1": 0.13293228149414063, + "step": 85 + }, + { + "completion_length": 534.4625, + "epoch": 0.024196800645248016, + "grad_norm": 12.640344619750977, + "kl": 1.730859375, + "learning_rate": 4.838709677419355e-06, + "loss": 0.0692, + "reward": 0.12914085388183594, + "reward_std": 0.021931628661695866, + "rewards/reward_func_1": 0.12914085388183594, + "step": 90 + }, + { + "completion_length": 964.30625, + "epoch": 0.025541067347761796, + "grad_norm": 3.1176600456237793, + "kl": 0.46341552734375, + "learning_rate": 5.1075268817204305e-06, + "loss": 0.0185, + "reward": 0.12530202865600587, + "reward_std": 0.030988389148842544, + "rewards/reward_func_1": 0.12530202865600587, + "step": 95 + }, + { + "completion_length": 946.646875, + "epoch": 0.026885334050275576, + "grad_norm": 2.275771379470825, + "kl": 1.3025634765625, + "learning_rate": 5.376344086021506e-06, + "loss": 0.0521, + "reward": 0.11467647552490234, + "reward_std": 0.028262564330361784, + "rewards/reward_func_1": 0.11467647552490234, + "step": 100 + }, + { + "completion_length": 745.26875, + "epoch": 0.028229600752789352, + "grad_norm": 12.095799446105957, + "kl": 2.6501708984375, + "learning_rate": 5.645161290322582e-06, + "loss": 0.106, + "reward": 0.11658521220088006, + "reward_std": 0.02962974151596427, + "rewards/reward_func_1": 0.11658521220088006, + "step": 105 + }, + { + "completion_length": 602.88125, + "epoch": 0.029573867455303132, + "grad_norm": 4.594326496124268, + "kl": 305.4122314453125, + "learning_rate": 5.9139784946236566e-06, + "loss": 12.2179, + "reward": 0.12178945541381836, + "reward_std": 0.022939921566285194, + "rewards/reward_func_1": 0.12178945541381836, + "step": 110 + }, + { + "completion_length": 649.784375, + "epoch": 0.030918134157816912, + "grad_norm": 26.5841121673584, + "kl": 1.881787109375, + "learning_rate": 6.182795698924732e-06, + "loss": 0.0753, + "reward": 0.12521166801452638, + "reward_std": 0.02316317391814664, + "rewards/reward_func_1": 0.12521166801452638, + "step": 115 + }, + { + "completion_length": 720.3375, + "epoch": 0.03226240086033069, + "grad_norm": 4.994908809661865, + "kl": 2.0610107421875, + "learning_rate": 6.451612903225806e-06, + "loss": 0.0825, + "reward": 0.1228231817483902, + "reward_std": 0.02567218211479485, + "rewards/reward_func_1": 0.1228231817483902, + "step": 120 + }, + { + "completion_length": 675.49375, + "epoch": 0.033606667562844465, + "grad_norm": 10.518940925598145, + "kl": 2.297900390625, + "learning_rate": 6.720430107526882e-06, + "loss": 0.0921, + "reward": 0.10162264108657837, + "reward_std": 0.026170244067907335, + "rewards/reward_func_1": 0.10162264108657837, + "step": 125 + }, + { + "completion_length": 662.875, + "epoch": 0.034950934265358245, + "grad_norm": 6.318077564239502, + "kl": 2.387255859375, + "learning_rate": 6.989247311827958e-06, + "loss": 0.0955, + "reward": 0.11114879846572875, + "reward_std": 0.031876870489213616, + "rewards/reward_func_1": 0.11114879846572875, + "step": 130 + }, + { + "completion_length": 707.71875, + "epoch": 0.036295200967872025, + "grad_norm": 19.05810546875, + "kl": 3.068359375, + "learning_rate": 7.258064516129033e-06, + "loss": 0.1228, + "reward": 0.09947696328163147, + "reward_std": 0.030518771056085824, + "rewards/reward_func_1": 0.09947696328163147, + "step": 135 + }, + { + "completion_length": 807.725, + "epoch": 0.037639467670385805, + "grad_norm": 5.729363441467285, + "kl": 2.4376953125, + "learning_rate": 7.526881720430108e-06, + "loss": 0.0975, + "reward": 0.09508908390998841, + "reward_std": 0.035996314510703085, + "rewards/reward_func_1": 0.09508908390998841, + "step": 140 + }, + { + "completion_length": 921.71875, + "epoch": 0.038983734372899585, + "grad_norm": 279.34271240234375, + "kl": 6.6451171875, + "learning_rate": 7.795698924731183e-06, + "loss": 0.2654, + "reward": 0.06034855842590332, + "reward_std": 0.03402297935681418, + "rewards/reward_func_1": 0.06034855842590332, + "step": 145 + }, + { + "completion_length": 876.575, + "epoch": 0.040328001075413365, + "grad_norm": 198.41583251953125, + "kl": 6.54765625, + "learning_rate": 8.064516129032258e-06, + "loss": 0.2619, + "reward": 0.07277845814824105, + "reward_std": 0.03974553793668747, + "rewards/reward_func_1": 0.07277845814824105, + "step": 150 + }, + { + "completion_length": 780.615625, + "epoch": 0.04167226777792714, + "grad_norm": 16.293689727783203, + "kl": 5.5056640625, + "learning_rate": 8.333333333333334e-06, + "loss": 0.2202, + "reward": 0.0797007441520691, + "reward_std": 0.03856456303037703, + "rewards/reward_func_1": 0.0797007441520691, + "step": 155 + }, + { + "completion_length": 955.875, + "epoch": 0.04301653448044092, + "grad_norm": 5.261388778686523, + "kl": 4.3888671875, + "learning_rate": 8.602150537634409e-06, + "loss": 0.1755, + "reward": 0.06296098232269287, + "reward_std": 0.032317174156196414, + "rewards/reward_func_1": 0.06296098232269287, + "step": 160 + }, + { + "completion_length": 905.43125, + "epoch": 0.0443608011829547, + "grad_norm": 2.4820497035980225, + "kl": 4.9009765625, + "learning_rate": 8.870967741935484e-06, + "loss": 0.196, + "reward": 0.07761964425444604, + "reward_std": 0.031909586675465104, + "rewards/reward_func_1": 0.07761964425444604, + "step": 165 + }, + { + "completion_length": 755.90625, + "epoch": 0.04570506788546848, + "grad_norm": 3.4995851516723633, + "kl": 4.2828125, + "learning_rate": 9.13978494623656e-06, + "loss": 0.1714, + "reward": 0.0845573864877224, + "reward_std": 0.028807568131014705, + "rewards/reward_func_1": 0.0845573864877224, + "step": 170 + }, + { + "completion_length": 660.746875, + "epoch": 0.04704933458798226, + "grad_norm": 19.133451461791992, + "kl": 4.21357421875, + "learning_rate": 9.408602150537635e-06, + "loss": 0.1686, + "reward": 0.09643235206604003, + "reward_std": 0.02881598025560379, + "rewards/reward_func_1": 0.09643235206604003, + "step": 175 + }, + { + "completion_length": 651.1375, + "epoch": 0.04839360129049603, + "grad_norm": 5.1367645263671875, + "kl": 5.13486328125, + "learning_rate": 9.67741935483871e-06, + "loss": 0.2055, + "reward": 0.09348840713500976, + "reward_std": 0.03065957601647824, + "rewards/reward_func_1": 0.09348840713500976, + "step": 180 + }, + { + "completion_length": 799.3375, + "epoch": 0.04973786799300981, + "grad_norm": 9.540658950805664, + "kl": 3.68408203125, + "learning_rate": 9.946236559139786e-06, + "loss": 0.1475, + "reward": 0.08806414604187011, + "reward_std": 0.03414921889780089, + "rewards/reward_func_1": 0.08806414604187011, + "step": 185 + }, + { + "completion_length": 752.3375, + "epoch": 0.05108213469552359, + "grad_norm": 7.665460586547852, + "kl": 8.005517578125, + "learning_rate": 1.0215053763440861e-05, + "loss": 0.3196, + "reward": 0.07655536755919456, + "reward_std": 0.03511982869822532, + "rewards/reward_func_1": 0.07655536755919456, + "step": 190 + }, + { + "completion_length": 771.375, + "epoch": 0.05242640139803737, + "grad_norm": 10.91799259185791, + "kl": 5.8822265625, + "learning_rate": 1.0483870967741936e-05, + "loss": 0.2353, + "reward": 0.07151660919189454, + "reward_std": 0.03899585076142102, + "rewards/reward_func_1": 0.07151660919189454, + "step": 195 + }, + { + "completion_length": 898.54375, + "epoch": 0.05377066810055115, + "grad_norm": 8.995491027832031, + "kl": 26.2333984375, + "learning_rate": 1.0752688172043012e-05, + "loss": 1.0536, + "reward": 0.05515105128288269, + "reward_std": 0.037433248152956365, + "rewards/reward_func_1": 0.05515105128288269, + "step": 200 + }, + { + "completion_length": 801.6125, + "epoch": 0.05511493480306493, + "grad_norm": 2.727968454360962, + "kl": 3.705029296875, + "learning_rate": 1.1021505376344085e-05, + "loss": 0.1481, + "reward": 0.07217190265655518, + "reward_std": 0.03416005950421095, + "rewards/reward_func_1": 0.07217190265655518, + "step": 205 + }, + { + "completion_length": 759.43125, + "epoch": 0.056459201505578704, + "grad_norm": 2.115070343017578, + "kl": 4.862158203125, + "learning_rate": 1.1290322580645164e-05, + "loss": 0.1946, + "reward": 0.09042127132415771, + "reward_std": 0.032155740447342394, + "rewards/reward_func_1": 0.09042127132415771, + "step": 210 + }, + { + "completion_length": 932.6875, + "epoch": 0.057803468208092484, + "grad_norm": 2.2677536010742188, + "kl": 2.760693359375, + "learning_rate": 1.1559139784946238e-05, + "loss": 0.1104, + "reward": 0.062465869216248394, + "reward_std": 0.033533206372521815, + "rewards/reward_func_1": 0.062465869216248394, + "step": 215 + }, + { + "completion_length": 995.0375, + "epoch": 0.059147734910606264, + "grad_norm": 28.45672607421875, + "kl": 2.51044921875, + "learning_rate": 1.1827956989247313e-05, + "loss": 0.1004, + "reward": 0.04683060795068741, + "reward_std": 0.03918457605177537, + "rewards/reward_func_1": 0.04683060795068741, + "step": 220 + }, + { + "completion_length": 1012.8125, + "epoch": 0.060492001613120044, + "grad_norm": 657.45166015625, + "kl": 8.178271484375, + "learning_rate": 1.2096774193548388e-05, + "loss": 0.328, + "reward": 0.029974862933158875, + "reward_std": 0.029841514525469393, + "rewards/reward_func_1": 0.029974862933158875, + "step": 225 + }, + { + "completion_length": 902.15, + "epoch": 0.061836268315633824, + "grad_norm": 63.430824279785156, + "kl": 6.02529296875, + "learning_rate": 1.2365591397849464e-05, + "loss": 0.2418, + "reward": 0.03732140064239502, + "reward_std": 0.030012273252941667, + "rewards/reward_func_1": 0.03732140064239502, + "step": 230 + }, + { + "completion_length": 820.70625, + "epoch": 0.0631805350181476, + "grad_norm": 18.125106811523438, + "kl": 2.8375, + "learning_rate": 1.2634408602150539e-05, + "loss": 0.1134, + "reward": 0.0467583104968071, + "reward_std": 0.026065533305518328, + "rewards/reward_func_1": 0.0467583104968071, + "step": 235 + }, + { + "completion_length": 781.80625, + "epoch": 0.06452480172066138, + "grad_norm": 16.778675079345703, + "kl": 4.069482421875, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.1629, + "reward": 0.039750583469867706, + "reward_std": 0.02966789968777448, + "rewards/reward_func_1": 0.039750583469867706, + "step": 240 + }, + { + "completion_length": 733.74375, + "epoch": 0.06586906842317516, + "grad_norm": 17.191570281982422, + "kl": 3.77578125, + "learning_rate": 1.3172043010752688e-05, + "loss": 0.151, + "reward": 0.031531840562820435, + "reward_std": 0.03298547498416156, + "rewards/reward_func_1": 0.031531840562820435, + "step": 245 + }, + { + "completion_length": 724.8125, + "epoch": 0.06721333512568893, + "grad_norm": 7.622962951660156, + "kl": 4.6927734375, + "learning_rate": 1.3440860215053763e-05, + "loss": 0.1877, + "reward": 0.028579163551330566, + "reward_std": 0.03135324278846383, + "rewards/reward_func_1": 0.028579163551330566, + "step": 250 + }, + { + "completion_length": 833.86875, + "epoch": 0.06855760182820271, + "grad_norm": 7.43701171875, + "kl": 3.25869140625, + "learning_rate": 1.3709677419354839e-05, + "loss": 0.1303, + "reward": 0.022608640044927596, + "reward_std": 0.020842469058698042, + "rewards/reward_func_1": 0.022608640044927596, + "step": 255 + }, + { + "completion_length": 954.625, + "epoch": 0.06990186853071649, + "grad_norm": 8.609543800354004, + "kl": 1.60927734375, + "learning_rate": 1.3978494623655916e-05, + "loss": 0.0644, + "reward": 0.006280577182769776, + "reward_std": 0.01644945718580857, + "rewards/reward_func_1": 0.006280577182769776, + "step": 260 + }, + { + "completion_length": 924.771875, + "epoch": 0.07124613523323027, + "grad_norm": 25.172861099243164, + "kl": 3.16962890625, + "learning_rate": 1.4247311827956991e-05, + "loss": 0.1268, + "reward": 0.008034330606460572, + "reward_std": 0.014402125729247928, + "rewards/reward_func_1": 0.008034330606460572, + "step": 265 + }, + { + "completion_length": 609.03125, + "epoch": 0.07259040193574405, + "grad_norm": 20.167495727539062, + "kl": 6.338671875, + "learning_rate": 1.4516129032258066e-05, + "loss": 0.2535, + "reward": 0.0008509188890457153, + "reward_std": 0.015080565505195409, + "rewards/reward_func_1": 0.0008509188890457153, + "step": 270 + }, + { + "completion_length": 669.51875, + "epoch": 0.07393466863825783, + "grad_norm": 4.5325541496276855, + "kl": 8.6146484375, + "learning_rate": 1.4784946236559142e-05, + "loss": 0.3447, + "reward": 0.0004961371421813964, + "reward_std": 0.018004348664544523, + "rewards/reward_func_1": 0.0004961371421813964, + "step": 275 + }, + { + "completion_length": 282.79375, + "epoch": 0.07527893534077161, + "grad_norm": 7.294330596923828, + "kl": 11.61015625, + "learning_rate": 1.5053763440860215e-05, + "loss": 0.4646, + "reward": 0.016247385740280153, + "reward_std": 0.020933675090782346, + "rewards/reward_func_1": 0.016247385740280153, + "step": 280 + }, + { + "completion_length": 618.675, + "epoch": 0.07662320204328539, + "grad_norm": 6.872623443603516, + "kl": 25.2796875, + "learning_rate": 1.5322580645161292e-05, + "loss": 1.0111, + "reward": 0.0221073180437088, + "reward_std": 0.01539291434455663, + "rewards/reward_func_1": 0.0221073180437088, + "step": 285 + }, + { + "completion_length": 71.5875, + "epoch": 0.07796746874579917, + "grad_norm": 4.054460048675537, + "kl": 12.71484375, + "learning_rate": 1.5591397849462366e-05, + "loss": 0.5082, + "reward": 0.029245705343782902, + "reward_std": 0.01699454879271798, + "rewards/reward_func_1": 0.029245705343782902, + "step": 290 + }, + { + "completion_length": 148.65, + "epoch": 0.07931173544831295, + "grad_norm": 5.3947062492370605, + "kl": 10.1796875, + "learning_rate": 1.586021505376344e-05, + "loss": 0.4074, + "reward": 0.03420259654521942, + "reward_std": 0.021775428601540626, + "rewards/reward_func_1": 0.03420259654521942, + "step": 295 + }, + { + "completion_length": 151.83125, + "epoch": 0.08065600215082673, + "grad_norm": 4.995931148529053, + "kl": 8.5484375, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.3418, + "reward": 0.0415335863828659, + "reward_std": 0.015774094988591968, + "rewards/reward_func_1": 0.0415335863828659, + "step": 300 + }, + { + "completion_length": 249.5125, + "epoch": 0.0820002688533405, + "grad_norm": 6.766551971435547, + "kl": 7.60859375, + "learning_rate": 1.6397849462365594e-05, + "loss": 0.3044, + "reward": 0.04043524265289307, + "reward_std": 0.01937261049170047, + "rewards/reward_func_1": 0.04043524265289307, + "step": 305 + }, + { + "completion_length": 132.06875, + "epoch": 0.08334453555585428, + "grad_norm": 7.255300045013428, + "kl": 33.04140625, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.3153, + "reward": 0.04444933533668518, + "reward_std": 0.015411415329435841, + "rewards/reward_func_1": 0.04444933533668518, + "step": 310 + }, + { + "completion_length": 427.43125, + "epoch": 0.08468880225836806, + "grad_norm": 4.3001933097839355, + "kl": 7.823046875, + "learning_rate": 1.6935483870967744e-05, + "loss": 0.3131, + "reward": 0.036715186480432746, + "reward_std": 0.025042318180203436, + "rewards/reward_func_1": 0.036715186480432746, + "step": 315 + }, + { + "completion_length": 617.3875, + "epoch": 0.08603306896088184, + "grad_norm": 5.300938129425049, + "kl": 9.6765625, + "learning_rate": 1.7204301075268818e-05, + "loss": 0.3872, + "reward": 0.02903536558151245, + "reward_std": 0.03241579991299659, + "rewards/reward_func_1": 0.02903536558151245, + "step": 320 + }, + { + "completion_length": 480.16875, + "epoch": 0.08737733566339562, + "grad_norm": 3.9806482791900635, + "kl": 9.332421875, + "learning_rate": 1.7473118279569895e-05, + "loss": 0.3733, + "reward": 0.03327850103378296, + "reward_std": 0.02665868471376598, + "rewards/reward_func_1": 0.03327850103378296, + "step": 325 + }, + { + "completion_length": 624.35625, + "epoch": 0.0887216023659094, + "grad_norm": 1.2687135934829712, + "kl": 8.853515625, + "learning_rate": 1.774193548387097e-05, + "loss": 0.3541, + "reward": 0.014437276124954223, + "reward_std": 0.028702187002636492, + "rewards/reward_func_1": 0.014437276124954223, + "step": 330 + }, + { + "completion_length": 482.596875, + "epoch": 0.09006586906842318, + "grad_norm": 4.883694648742676, + "kl": 10.290625, + "learning_rate": 1.8010752688172042e-05, + "loss": 0.4117, + "reward": -0.00023592226207256318, + "reward_std": 0.02089865313610062, + "rewards/reward_func_1": -0.00023592226207256318, + "step": 335 + }, + { + "completion_length": 551.590625, + "epoch": 0.09141013577093696, + "grad_norm": 31.997989654541016, + "kl": 27.48359375, + "learning_rate": 1.827956989247312e-05, + "loss": 1.0984, + "reward": 0.017952871322631837, + "reward_std": 0.02710586852626875, + "rewards/reward_func_1": 0.017952871322631837, + "step": 340 + }, + { + "completion_length": 90.98125, + "epoch": 0.09275440247345074, + "grad_norm": 3.1550872325897217, + "kl": 6.864453125, + "learning_rate": 1.8548387096774193e-05, + "loss": 0.2747, + "reward": 0.0417973518371582, + "reward_std": 0.013006633780605625, + "rewards/reward_func_1": 0.0417973518371582, + "step": 345 + }, + { + "completion_length": 93.0875, + "epoch": 0.09409866917596452, + "grad_norm": 7.118286609649658, + "kl": 5.258984375, + "learning_rate": 1.881720430107527e-05, + "loss": 0.2103, + "reward": 0.04214920997619629, + "reward_std": 0.0113553161296295, + "rewards/reward_func_1": 0.04214920997619629, + "step": 350 + }, + { + "completion_length": 23.6375, + "epoch": 0.0954429358784783, + "grad_norm": 0.8202024698257446, + "kl": 5.026171875, + "learning_rate": 1.9086021505376347e-05, + "loss": 0.2011, + "reward": 0.04604549407958984, + "reward_std": 0.007001646169919695, + "rewards/reward_func_1": 0.04604549407958984, + "step": 355 + }, + { + "completion_length": 420.46875, + "epoch": 0.09678720258099206, + "grad_norm": 3.111360788345337, + "kl": 5.22421875, + "learning_rate": 1.935483870967742e-05, + "loss": 0.209, + "reward": 0.018204644322395325, + "reward_std": 0.012417513456603047, + "rewards/reward_func_1": 0.018204644322395325, + "step": 360 + }, + { + "completion_length": 1005.70625, + "epoch": 0.09813146928350584, + "grad_norm": 0.18079593777656555, + "kl": 2.351806640625, + "learning_rate": 1.9623655913978498e-05, + "loss": 0.0941, + "reward": 0.032704389095306395, + "reward_std": 0.01922294880496338, + "rewards/reward_func_1": 0.032704389095306395, + "step": 365 + }, + { + "completion_length": 1019.66875, + "epoch": 0.09947573598601962, + "grad_norm": 0.2214096635580063, + "kl": 0.1852294921875, + "learning_rate": 1.989247311827957e-05, + "loss": 0.0074, + "reward": 0.061572599411010745, + "reward_std": 0.01994357380317524, + "rewards/reward_func_1": 0.061572599411010745, + "step": 370 + }, + { + "completion_length": 995.678125, + "epoch": 0.1008200026885334, + "grad_norm": 0.5282752513885498, + "kl": 0.2708984375, + "learning_rate": 1.9999960353893115e-05, + "loss": 0.0108, + "reward": 0.055290712416172026, + "reward_std": 0.016450203530257567, + "rewards/reward_func_1": 0.055290712416172026, + "step": 375 + }, + { + "completion_length": 1009.3375, + "epoch": 0.10216426939104718, + "grad_norm": 0.770745575428009, + "kl": 0.82568359375, + "learning_rate": 1.9999718073267252e-05, + "loss": 0.0331, + "reward": 0.05625443458557129, + "reward_std": 0.01596820540144108, + "rewards/reward_func_1": 0.05625443458557129, + "step": 380 + }, + { + "completion_length": 1016.053125, + "epoch": 0.10350853609356096, + "grad_norm": 0.473776251077652, + "kl": 0.31748046875, + "learning_rate": 1.9999255542960368e-05, + "loss": 0.0127, + "reward": 0.01015063002705574, + "reward_std": 0.010674006148474292, + "rewards/reward_func_1": 0.01015063002705574, + "step": 385 + }, + { + "completion_length": 1001.11875, + "epoch": 0.10485280279607474, + "grad_norm": 0.7848945260047913, + "kl": 0.3754150390625, + "learning_rate": 1.999857277315996e-05, + "loss": 0.015, + "reward": 0.0617163360118866, + "reward_std": 0.021173181070480496, + "rewards/reward_func_1": 0.0617163360118866, + "step": 390 + }, + { + "completion_length": 1021.315625, + "epoch": 0.10619706949858852, + "grad_norm": 16.4527645111084, + "kl": 1.40760498046875, + "learning_rate": 1.9997669778904446e-05, + "loss": 0.0563, + "reward": 0.04422735869884491, + "reward_std": 0.022931134537793697, + "rewards/reward_func_1": 0.04422735869884491, + "step": 395 + }, + { + "completion_length": 1017.70625, + "epoch": 0.1075413362011023, + "grad_norm": 0.4751810133457184, + "kl": 1744830464.8029785, + "learning_rate": 1.9996546580082792e-05, + "loss": 69673728.0, + "reward": 0.06360023021697998, + "reward_std": 0.025275059579871594, + "rewards/reward_func_1": 0.06360023021697998, + "step": 400 + }, + { + "completion_length": 1012.39375, + "epoch": 0.10888560290361608, + "grad_norm": 0.8353201746940613, + "kl": 1.180908203125, + "learning_rate": 1.9995203201434124e-05, + "loss": 0.0472, + "reward": 0.054718819260597226, + "reward_std": 0.02895347127923742, + "rewards/reward_func_1": 0.054718819260597226, + "step": 405 + }, + { + "completion_length": 883.959375, + "epoch": 0.11022986960612986, + "grad_norm": 5.007371425628662, + "kl": 0.9612548828125, + "learning_rate": 1.9993639672547146e-05, + "loss": 0.0384, + "reward": 0.06528196483850479, + "reward_std": 0.028689574322197587, + "rewards/reward_func_1": 0.06528196483850479, + "step": 410 + }, + { + "completion_length": 446.45625, + "epoch": 0.11157413630864363, + "grad_norm": 1.8732314109802246, + "kl": 2.2693359375, + "learning_rate": 1.9991856027859504e-05, + "loss": 0.0908, + "reward": 0.052714601159095764, + "reward_std": 0.030338111356832086, + "rewards/reward_func_1": 0.052714601159095764, + "step": 415 + }, + { + "completion_length": 662.909375, + "epoch": 0.11291840301115741, + "grad_norm": 6.701618194580078, + "kl": 1.86259765625, + "learning_rate": 1.9989852306657015e-05, + "loss": 0.0745, + "reward": 0.07098124027252198, + "reward_std": 0.029971924761775882, + "rewards/reward_func_1": 0.07098124027252198, + "step": 420 + }, + { + "completion_length": 946.375, + "epoch": 0.11426266971367119, + "grad_norm": 0.3844849169254303, + "kl": 0.8337646484375, + "learning_rate": 1.998762855307283e-05, + "loss": 0.0333, + "reward": 0.08022915720939636, + "reward_std": 0.018059900577645747, + "rewards/reward_func_1": 0.08022915720939636, + "step": 425 + }, + { + "completion_length": 1011.1875, + "epoch": 0.11560693641618497, + "grad_norm": 0.3679395020008087, + "kl": 0.24794921875, + "learning_rate": 1.998518481608643e-05, + "loss": 0.0099, + "reward": 0.08296351432800293, + "reward_std": 0.022155718586873263, + "rewards/reward_func_1": 0.08296351432800293, + "step": 430 + }, + { + "completion_length": 1024.0, + "epoch": 0.11695120311869875, + "grad_norm": 0.5217347145080566, + "kl": 0.3544921875, + "learning_rate": 1.998252114952255e-05, + "loss": 0.0142, + "reward": 0.047972720861434934, + "reward_std": 0.01913239884888753, + "rewards/reward_func_1": 0.047972720861434934, + "step": 435 + }, + { + "completion_length": 738.446875, + "epoch": 0.11829546982121253, + "grad_norm": 7.95990514755249, + "kl": 1.81640625, + "learning_rate": 1.9979637612050028e-05, + "loss": 0.0727, + "reward": 0.02692788541316986, + "reward_std": 0.02701200459850952, + "rewards/reward_func_1": 0.02692788541316986, + "step": 440 + }, + { + "completion_length": 434.340625, + "epoch": 0.11963973652372631, + "grad_norm": 11.07094955444336, + "kl": 5.8614990234375, + "learning_rate": 1.9976534267180464e-05, + "loss": 0.2344, + "reward": 0.03783460408449173, + "reward_std": 0.018221309431828557, + "rewards/reward_func_1": 0.03783460408449173, + "step": 445 + }, + { + "completion_length": 38.14375, + "epoch": 0.12098400322624009, + "grad_norm": 4.362886905670166, + "kl": 17.2671875, + "learning_rate": 1.997321118326687e-05, + "loss": 0.6907, + "reward": 0.041283273696899415, + "reward_std": 0.011085864211781881, + "rewards/reward_func_1": 0.041283273696899415, + "step": 450 + }, + { + "completion_length": 71.421875, + "epoch": 0.12232826992875387, + "grad_norm": 16.95098876953125, + "kl": 14.43046875, + "learning_rate": 1.996966843350212e-05, + "loss": 0.5772, + "reward": 0.029994052648544312, + "reward_std": 0.014221129479119554, + "rewards/reward_func_1": 0.029994052648544312, + "step": 455 + }, + { + "completion_length": 162.315625, + "epoch": 0.12367253663126765, + "grad_norm": 5.00581693649292, + "kl": 13.87890625, + "learning_rate": 1.996590609591736e-05, + "loss": 0.5553, + "reward": 0.03443393409252167, + "reward_std": 0.015160623186966404, + "rewards/reward_func_1": 0.03443393409252167, + "step": 460 + }, + { + "completion_length": 387.49375, + "epoch": 0.12501680333378143, + "grad_norm": 34.0945930480957, + "kl": 25.624267578125, + "learning_rate": 1.99619242533803e-05, + "loss": 1.0272, + "reward": 0.020095158740878104, + "reward_std": 0.014898770145373419, + "rewards/reward_func_1": 0.020095158740878104, + "step": 465 + }, + { + "completion_length": 313.875, + "epoch": 0.1263610700362952, + "grad_norm": 16.63882827758789, + "kl": 11.6126220703125, + "learning_rate": 1.9957722993593365e-05, + "loss": 0.4642, + "reward": 0.03102530874311924, + "reward_std": 0.015879479701106904, + "rewards/reward_func_1": 0.03102530874311924, + "step": 470 + }, + { + "completion_length": 43.409375, + "epoch": 0.127705336738809, + "grad_norm": 27.554349899291992, + "kl": 16.75, + "learning_rate": 1.9953302409091773e-05, + "loss": 0.6699, + "reward": 0.033424198627471924, + "reward_std": 0.013823152912664227, + "rewards/reward_func_1": 0.033424198627471924, + "step": 475 + }, + { + "completion_length": 38.784375, + "epoch": 0.12904960344132277, + "grad_norm": 0.890618085861206, + "kl": 16.7375, + "learning_rate": 1.9948662597241505e-05, + "loss": 0.6692, + "reward": 0.029546657204627992, + "reward_std": 0.014900979267258663, + "rewards/reward_func_1": 0.029546657204627992, + "step": 480 + }, + { + "completion_length": 2.1125, + "epoch": 0.13039387014383655, + "grad_norm": 5.359586238861084, + "kl": 18.334375, + "learning_rate": 1.9943803660237146e-05, + "loss": 0.733, + "reward": 0.042370176315307616, + "reward_std": 0.011466928146546707, + "rewards/reward_func_1": 0.042370176315307616, + "step": 485 + }, + { + "completion_length": 7.484375, + "epoch": 0.13173813684635033, + "grad_norm": 133.7327423095703, + "kl": 17.6125, + "learning_rate": 1.9938725705099652e-05, + "loss": 0.7044, + "reward": 0.042084154486656186, + "reward_std": 0.014303012995515018, + "rewards/reward_func_1": 0.042084154486656186, + "step": 490 + }, + { + "completion_length": 2.0125, + "epoch": 0.13308240354886408, + "grad_norm": 5.098598480224609, + "kl": 17.9546875, + "learning_rate": 1.9933428843673968e-05, + "loss": 0.7184, + "reward": 0.047114628553390506, + "reward_std": 0.01080106117296964, + "rewards/reward_func_1": 0.047114628553390506, + "step": 495 + }, + { + "completion_length": 2.065625, + "epoch": 0.13442667025137786, + "grad_norm": 3.663975715637207, + "kl": 17.8578125, + "learning_rate": 1.9927913192626597e-05, + "loss": 0.714, + "reward": 0.04776406288146973, + "reward_std": 0.012929379957495258, + "rewards/reward_func_1": 0.04776406288146973, + "step": 500 + }, + { + "completion_length": 1.9375, + "epoch": 0.13577093695389164, + "grad_norm": 30.02657699584961, + "kl": 668487.05625, + "learning_rate": 1.9922178873442998e-05, + "loss": 26829.6063, + "reward": 0.054135143756866455, + "reward_std": 0.009887221396638779, + "rewards/reward_func_1": 0.054135143756866455, + "step": 505 + }, + { + "completion_length": 24.953125, + "epoch": 0.13711520365640542, + "grad_norm": 6.359250068664551, + "kl": 19.0421875, + "learning_rate": 1.9916226012424925e-05, + "loss": 0.7612, + "reward": 0.05320845246315002, + "reward_std": 0.010508252962245024, + "rewards/reward_func_1": 0.05320845246315002, + "step": 510 + }, + { + "completion_length": 257.546875, + "epoch": 0.1384594703589192, + "grad_norm": 3.6431725025177, + "kl": 14.698828125, + "learning_rate": 1.991005474068765e-05, + "loss": 0.5884, + "reward": 0.03921504020690918, + "reward_std": 0.014476554578868673, + "rewards/reward_func_1": 0.03921504020690918, + "step": 515 + }, + { + "completion_length": 110.59375, + "epoch": 0.13980373706143298, + "grad_norm": 4.781929016113281, + "kl": 16.16640625, + "learning_rate": 1.9903665194157077e-05, + "loss": 0.6467, + "reward": 0.043272508680820464, + "reward_std": 0.014059747860301286, + "rewards/reward_func_1": 0.043272508680820464, + "step": 520 + }, + { + "completion_length": 1.365625, + "epoch": 0.14114800376394676, + "grad_norm": 12.174643516540527, + "kl": 18.8140625, + "learning_rate": 1.989705751356672e-05, + "loss": 0.7527, + "reward": 0.040436971187591556, + "reward_std": 0.009100792693789116, + "rewards/reward_func_1": 0.040436971187591556, + "step": 525 + }, + { + "completion_length": 3.353125, + "epoch": 0.14249227046646054, + "grad_norm": 1.5478957891464233, + "kl": 15.30625, + "learning_rate": 1.9890231844454643e-05, + "loss": 0.6123, + "reward": 0.035584007203578946, + "reward_std": 0.013873677587253042, + "rewards/reward_func_1": 0.035584007203578946, + "step": 530 + }, + { + "completion_length": 1.078125, + "epoch": 0.14383653716897432, + "grad_norm": 0.8689735531806946, + "kl": 20.396875, + "learning_rate": 1.9883188337160225e-05, + "loss": 0.8161, + "reward": 0.043527424335479736, + "reward_std": 0.008268717869577813, + "rewards/reward_func_1": 0.043527424335479736, + "step": 535 + }, + { + "completion_length": 1.775, + "epoch": 0.1451808038714881, + "grad_norm": 2.0533359050750732, + "kl": 18.6109375, + "learning_rate": 1.9875927146820867e-05, + "loss": 0.7448, + "reward": 0.0420529842376709, + "reward_std": 0.008834328277953319, + "rewards/reward_func_1": 0.0420529842376709, + "step": 540 + }, + { + "completion_length": 2.91875, + "epoch": 0.14652507057400188, + "grad_norm": 7.708526611328125, + "kl": 17.74375, + "learning_rate": 1.9868448433368567e-05, + "loss": 0.7098, + "reward": 0.03905548453330994, + "reward_std": 0.011393586202757433, + "rewards/reward_func_1": 0.03905548453330994, + "step": 545 + }, + { + "completion_length": 26.203125, + "epoch": 0.14786933727651566, + "grad_norm": 5.5177435874938965, + "kl": 13.23671875, + "learning_rate": 1.9860752361526384e-05, + "loss": 0.5295, + "reward": 0.030779826641082763, + "reward_std": 0.017280431411927567, + "rewards/reward_func_1": 0.030779826641082763, + "step": 550 + }, + { + "completion_length": 16.4625, + "epoch": 0.14921360397902944, + "grad_norm": 2.438591718673706, + "kl": 16.7859375, + "learning_rate": 1.985283910080484e-05, + "loss": 0.6714, + "reward": 0.03871009349822998, + "reward_std": 0.012890951918961946, + "rewards/reward_func_1": 0.03871009349822998, + "step": 555 + }, + { + "completion_length": 29.490625, + "epoch": 0.15055787068154322, + "grad_norm": 11.178017616271973, + "kl": 16.7796875, + "learning_rate": 1.9844708825498163e-05, + "loss": 0.6712, + "reward": 0.03712189197540283, + "reward_std": 0.014122735538694541, + "rewards/reward_func_1": 0.03712189197540283, + "step": 560 + }, + { + "completion_length": 8.56875, + "epoch": 0.151902137384057, + "grad_norm": 1.3991609811782837, + "kl": 6695.2, + "learning_rate": 1.983636171468046e-05, + "loss": 269.0283, + "reward": 0.045965385437011716, + "reward_std": 0.011782036734803113, + "rewards/reward_func_1": 0.045965385437011716, + "step": 565 + }, + { + "completion_length": 48.490625, + "epoch": 0.15324640408657078, + "grad_norm": 8.38524055480957, + "kl": 16.6640625, + "learning_rate": 1.9827797952201756e-05, + "loss": 0.6669, + "reward": 0.04424548149108887, + "reward_std": 0.014852115589019377, + "rewards/reward_func_1": 0.04424548149108887, + "step": 570 + }, + { + "completion_length": 93.028125, + "epoch": 0.15459067078908456, + "grad_norm": 3.190880298614502, + "kl": 14.90859375, + "learning_rate": 1.9819017726683966e-05, + "loss": 0.5958, + "reward": 0.030410957336425782, + "reward_std": 0.021107864176156, + "rewards/reward_func_1": 0.030410957336425782, + "step": 575 + }, + { + "completion_length": 18.278125, + "epoch": 0.15593493749159834, + "grad_norm": 304378225360896.0, + "kl": 36779813791349.3, + "learning_rate": 1.9810021231516733e-05, + "loss": 1472844005376.0, + "reward": 0.046595031023025514, + "reward_std": 0.01781562084943289, + "rewards/reward_func_1": 0.046595031023025514, + "step": 580 + }, + { + "completion_length": 8.640625, + "epoch": 0.15727920419411212, + "grad_norm": 1.4716880321502686, + "kl": 1590480094.4328125, + "learning_rate": 1.9800808664853162e-05, + "loss": 63543705.6, + "reward": 0.047375273704528806, + "reward_std": 0.015040177796618082, + "rewards/reward_func_1": 0.047375273704528806, + "step": 585 + }, + { + "completion_length": 5.3625, + "epoch": 0.1586234708966259, + "grad_norm": 0.6876189708709717, + "kl": 18.7609375, + "learning_rate": 1.979138022960546e-05, + "loss": 0.7509, + "reward": 0.04904801845550537, + "reward_std": 0.013263128971448167, + "rewards/reward_func_1": 0.04904801845550537, + "step": 590 + }, + { + "completion_length": 7.621875, + "epoch": 0.15996773759913968, + "grad_norm": 1.0694186687469482, + "kl": 18.6953125, + "learning_rate": 1.9781736133440462e-05, + "loss": 0.748, + "reward": 0.050295126438140866, + "reward_std": 0.011883826142002363, + "rewards/reward_func_1": 0.050295126438140866, + "step": 595 + }, + { + "completion_length": 11.071875, + "epoch": 0.16131200430165346, + "grad_norm": 0.40570515394210815, + "kl": 197926011378090.44, + "learning_rate": 1.9771876588775072e-05, + "loss": 7919798059008.0, + "reward": 0.048408856987953185, + "reward_std": 0.013870497528841952, + "rewards/reward_func_1": 0.048408856987953185, + "step": 600 + }, + { + "completion_length": 3.678125, + "epoch": 0.1626562710041672, + "grad_norm": 29.10760498046875, + "kl": 1.1033819087057724e+16, + "learning_rate": 1.976180181277157e-05, + "loss": 440097890002534.4, + "reward": 0.05020642280578613, + "reward_std": 0.010890257774008205, + "rewards/reward_func_1": 0.05020642280578613, + "step": 605 + }, + { + "completion_length": 3.86875, + "epoch": 0.164000537706681, + "grad_norm": 5.49629020690918, + "kl": 18.6328125, + "learning_rate": 1.975151202733283e-05, + "loss": 0.7452, + "reward": 0.04792967140674591, + "reward_std": 0.013347143970895559, + "rewards/reward_func_1": 0.04792967140674591, + "step": 610 + }, + { + "completion_length": 3.88125, + "epoch": 0.16534480440919477, + "grad_norm": 0.6328467726707458, + "kl": 18.6640625, + "learning_rate": 1.974100745909744e-05, + "loss": 0.7466, + "reward": 0.048745088279247284, + "reward_std": 0.01318539776839316, + "rewards/reward_func_1": 0.048745088279247284, + "step": 615 + }, + { + "completion_length": 6.23125, + "epoch": 0.16668907111170855, + "grad_norm": 10.044380187988281, + "kl": 19.2171875, + "learning_rate": 1.9730288339434698e-05, + "loss": 0.7687, + "reward": 0.05019671618938446, + "reward_std": 0.011686628483585083, + "rewards/reward_func_1": 0.05019671618938446, + "step": 620 + }, + { + "completion_length": 12.4625, + "epoch": 0.16803333781422233, + "grad_norm": 0.5477828979492188, + "kl": 18.334375, + "learning_rate": 1.9719354904439535e-05, + "loss": 0.733, + "reward": 0.04945822358131409, + "reward_std": 0.01383852595463395, + "rewards/reward_func_1": 0.04945822358131409, + "step": 625 + }, + { + "completion_length": 9.596875, + "epoch": 0.1693776045167361, + "grad_norm": 0.485173761844635, + "kl": 19.209375, + "learning_rate": 1.9708207394927294e-05, + "loss": 0.7682, + "reward": 0.05124917030334473, + "reward_std": 0.010859370271646185, + "rewards/reward_func_1": 0.05124917030334473, + "step": 630 + }, + { + "completion_length": 17.403125, + "epoch": 0.1707218712192499, + "grad_norm": 1.2319393157958984, + "kl": 18.13125, + "learning_rate": 1.969684605642844e-05, + "loss": 0.7251, + "reward": 0.046324634552001955, + "reward_std": 0.01381837234366685, + "rewards/reward_func_1": 0.046324634552001955, + "step": 635 + }, + { + "completion_length": 10.29375, + "epoch": 0.17206613792176367, + "grad_norm": 3.877319574356079, + "kl": 3407890.8765625, + "learning_rate": 1.9685271139183143e-05, + "loss": 136448.95, + "reward": 0.051538944244384766, + "reward_std": 0.010041882294171956, + "rewards/reward_func_1": 0.051538944244384766, + "step": 640 + }, + { + "completion_length": 4.340625, + "epoch": 0.17341040462427745, + "grad_norm": 9511026688.0, + "kl": 79088875.28125, + "learning_rate": 1.9673482898135774e-05, + "loss": 3171008.6, + "reward": 0.05210127830505371, + "reward_std": 0.010918277798919008, + "rewards/reward_func_1": 0.05210127830505371, + "step": 645 + }, + { + "completion_length": 3.2375, + "epoch": 0.17475467132679123, + "grad_norm": 1.1192519664764404, + "kl": 17511264.096875, + "learning_rate": 1.9661481592929293e-05, + "loss": 700102.15, + "reward": 0.059194572269916534, + "reward_std": 0.010659490662510507, + "rewards/reward_func_1": 0.059194572269916534, + "step": 650 + }, + { + "completion_length": 441.04375, + "epoch": 0.176098938029305, + "grad_norm": 3818.766845703125, + "kl": 449005.0505859375, + "learning_rate": 1.9649267487899507e-05, + "loss": 18001.5281, + "reward": 0.0049600392580032345, + "reward_std": 0.014874692249577492, + "rewards/reward_func_1": 0.0049600392580032345, + "step": 655 + }, + { + "completion_length": 24.63125, + "epoch": 0.1774432047318188, + "grad_norm": 1.1994178295135498, + "kl": 7.169140625, + "learning_rate": 1.9636840852069284e-05, + "loss": 0.2868, + "reward": 0.02127237692475319, + "reward_std": 0.01846984715666622, + "rewards/reward_func_1": 0.02127237692475319, + "step": 660 + }, + { + "completion_length": 4.38125, + "epoch": 0.17878747143433257, + "grad_norm": 271670.03125, + "kl": 6157.2625, + "learning_rate": 1.962420195914259e-05, + "loss": 245.975, + "reward": 0.04686172604560852, + "reward_std": 0.013880293245892971, + "rewards/reward_func_1": 0.04686172604560852, + "step": 665 + }, + { + "completion_length": 7.53125, + "epoch": 0.18013173813684635, + "grad_norm": 1.2155910730361938, + "kl": 11889.5125, + "learning_rate": 1.961135108749849e-05, + "loss": 477.0842, + "reward": 0.04393459558486938, + "reward_std": 0.01759743633447215, + "rewards/reward_func_1": 0.04393459558486938, + "step": 670 + }, + { + "completion_length": 12.85625, + "epoch": 0.18147600483936013, + "grad_norm": 1.1338227987289429, + "kl": 10.88984375, + "learning_rate": 1.9598288520185e-05, + "loss": 0.4355, + "reward": 0.029730018973350526, + "reward_std": 0.018472507712431252, + "rewards/reward_func_1": 0.029730018973350526, + "step": 675 + }, + { + "completion_length": 10.8125, + "epoch": 0.1828202715418739, + "grad_norm": 1.7272660732269287, + "kl": 12.0734375, + "learning_rate": 1.958501454491286e-05, + "loss": 0.4834, + "reward": 0.039747095108032225, + "reward_std": 0.020039613964036106, + "rewards/reward_func_1": 0.039747095108032225, + "step": 680 + }, + { + "completion_length": 9.821875, + "epoch": 0.1841645382443877, + "grad_norm": 0.9799548983573914, + "kl": 15.834375, + "learning_rate": 1.95715294540492e-05, + "loss": 0.6336, + "reward": 0.04382616728544235, + "reward_std": 0.017328777379589155, + "rewards/reward_func_1": 0.04382616728544235, + "step": 685 + }, + { + "completion_length": 6.875, + "epoch": 0.18550880494690147, + "grad_norm": 0.0692245364189148, + "kl": 15.3328125, + "learning_rate": 1.9557833544611083e-05, + "loss": 0.6131, + "reward": 0.05723133087158203, + "reward_std": 0.012962383369449526, + "rewards/reward_func_1": 0.05723133087158203, + "step": 690 + }, + { + "completion_length": 111.0, + "epoch": 0.18685307164941525, + "grad_norm": 29.649320602416992, + "kl": 15.096875, + "learning_rate": 1.9543927118258988e-05, + "loss": 0.6041, + "reward": 0.059267282485961914, + "reward_std": 0.016366570102400148, + "rewards/reward_func_1": 0.059267282485961914, + "step": 695 + }, + { + "completion_length": 33.9375, + "epoch": 0.18819733835192903, + "grad_norm": 0.007695461623370647, + "kl": 322.8, + "learning_rate": 1.9529810481290143e-05, + "loss": 12.915, + "reward": 0.056771063804626466, + "reward_std": 0.012514285945508163, + "rewards/reward_func_1": 0.056771063804626466, + "step": 700 + }, + { + "completion_length": 2.0, + "epoch": 0.1895416050544428, + "grad_norm": 0.023763682693243027, + "kl": 18.1828125, + "learning_rate": 1.9515483944631793e-05, + "loss": 0.7269, + "reward": 0.06125969886779785, + "reward_std": 0.009932457827380859, + "rewards/reward_func_1": 0.06125969886779785, + "step": 705 + }, + { + "completion_length": 5.2, + "epoch": 0.1908858717569566, + "grad_norm": 0.3108590841293335, + "kl": 18.0453125, + "learning_rate": 1.9500947823834345e-05, + "loss": 0.7218, + "reward": 0.0602872371673584, + "reward_std": 0.009525550016041962, + "rewards/reward_func_1": 0.0602872371673584, + "step": 710 + }, + { + "completion_length": 11.98125, + "epoch": 0.19223013845947035, + "grad_norm": 0.009819800965487957, + "kl": 18.3109375, + "learning_rate": 1.9486202439064433e-05, + "loss": 0.732, + "reward": 0.05926952362060547, + "reward_std": 0.010095558775356039, + "rewards/reward_func_1": 0.05926952362060547, + "step": 715 + }, + { + "completion_length": 4.425, + "epoch": 0.19357440516198413, + "grad_norm": 0.007831516675651073, + "kl": 17.8375, + "learning_rate": 1.9471248115097827e-05, + "loss": 0.7131, + "reward": 0.06079845428466797, + "reward_std": 0.010249754647520603, + "rewards/reward_func_1": 0.06079845428466797, + "step": 720 + }, + { + "completion_length": 2.0, + "epoch": 0.1949186718644979, + "grad_norm": 0.006839285604655743, + "kl": 18.021875, + "learning_rate": 1.9456085181312333e-05, + "loss": 0.7214, + "reward": 0.06195640563964844, + "reward_std": 0.012506642258085777, + "rewards/reward_func_1": 0.06195640563964844, + "step": 725 + }, + { + "completion_length": 2.00625, + "epoch": 0.19626293856701169, + "grad_norm": 0.0004078986239619553, + "kl": 17.8234375, + "learning_rate": 1.9440713971680494e-05, + "loss": 0.7135, + "reward": 0.05450363159179687, + "reward_std": 0.010389497011783533, + "rewards/reward_func_1": 0.05450363159179687, + "step": 730 + }, + { + "completion_length": 2.0, + "epoch": 0.19760720526952547, + "grad_norm": 2.693261922104284e-05, + "kl": 18.0421875, + "learning_rate": 1.9425134824762263e-05, + "loss": 0.722, + "reward": 0.06317386627197266, + "reward_std": 0.01099952881995705, + "rewards/reward_func_1": 0.06317386627197266, + "step": 735 + }, + { + "completion_length": 2.0, + "epoch": 0.19895147197203925, + "grad_norm": 0.0003110544930677861, + "kl": 18.16875, + "learning_rate": 1.9409348083697516e-05, + "loss": 0.7272, + "reward": 0.061242103576660156, + "reward_std": 0.011685801808926043, + "rewards/reward_func_1": 0.061242103576660156, + "step": 740 + }, + { + "completion_length": 2.0, + "epoch": 0.20029573867455303, + "grad_norm": 2.4723798560444266e-05, + "kl": 17.915625, + "learning_rate": 1.9393354096198535e-05, + "loss": 0.7161, + "reward": 0.054812145233154294, + "reward_std": 0.01056510213547881, + "rewards/reward_func_1": 0.054812145233154294, + "step": 745 + }, + { + "completion_length": 2.0, + "epoch": 0.2016400053770668, + "grad_norm": 0.00010621309047564864, + "kl": 17.85625, + "learning_rate": 1.937715321454232e-05, + "loss": 0.7141, + "reward": 0.05950497388839722, + "reward_std": 0.011216246478579706, + "rewards/reward_func_1": 0.05950497388839722, + "step": 750 + }, + { + "completion_length": 2.0, + "epoch": 0.20298427207958059, + "grad_norm": 7.163731061154976e-05, + "kl": 17.6765625, + "learning_rate": 1.9360745795562813e-05, + "loss": 0.7074, + "reward": 0.06266632080078124, + "reward_std": 0.011260019605106208, + "rewards/reward_func_1": 0.06266632080078124, + "step": 755 + }, + { + "completion_length": 2.0, + "epoch": 0.20432853878209437, + "grad_norm": 0.00017179730639327317, + "kl": 17.6109375, + "learning_rate": 1.9344132200643102e-05, + "loss": 0.7048, + "reward": 0.0631840705871582, + "reward_std": 0.01339399583703198, + "rewards/reward_func_1": 0.0631840705871582, + "step": 760 + }, + { + "completion_length": 2.0, + "epoch": 0.20567280548460815, + "grad_norm": 0.00033472245559096336, + "kl": 17.9296875, + "learning_rate": 1.9327312795707392e-05, + "loss": 0.7169, + "reward": 0.06261520385742188, + "reward_std": 0.011459613528859335, + "rewards/reward_func_1": 0.06261520385742188, + "step": 765 + }, + { + "completion_length": 2.0, + "epoch": 0.20701707218712193, + "grad_norm": 0.02129560336470604, + "kl": 17.7078125, + "learning_rate": 1.931028795121299e-05, + "loss": 0.7074, + "reward": 0.060090065002441406, + "reward_std": 0.010715857451577904, + "rewards/reward_func_1": 0.060090065002441406, + "step": 770 + }, + { + "completion_length": 2.0, + "epoch": 0.2083613388896357, + "grad_norm": 0.00020586424216162413, + "kl": 17.7390625, + "learning_rate": 1.9293058042142117e-05, + "loss": 0.7097, + "reward": 0.05955848693847656, + "reward_std": 0.010681279400523635, + "rewards/reward_func_1": 0.05955848693847656, + "step": 775 + }, + { + "completion_length": 2.0, + "epoch": 0.20970560559214949, + "grad_norm": 0.00016600097296759486, + "kl": 17.5921875, + "learning_rate": 1.9275623447993678e-05, + "loss": 0.7034, + "reward": 0.06024360656738281, + "reward_std": 0.010565872873849002, + "rewards/reward_func_1": 0.06024360656738281, + "step": 780 + }, + { + "completion_length": 2.0, + "epoch": 0.21104987229466327, + "grad_norm": 0.0002437598304823041, + "kl": 17.684375, + "learning_rate": 1.9257984552774874e-05, + "loss": 0.7073, + "reward": 0.06276130676269531, + "reward_std": 0.013013198171756812, + "rewards/reward_func_1": 0.06276130676269531, + "step": 785 + }, + { + "completion_length": 2.0, + "epoch": 0.21239413899717705, + "grad_norm": 0.00045892002526670694, + "kl": 17.6234375, + "learning_rate": 1.9240141744992763e-05, + "loss": 0.7051, + "reward": 0.06035938262939453, + "reward_std": 0.013352590511203744, + "rewards/reward_func_1": 0.06035938262939453, + "step": 790 + }, + { + "completion_length": 2.0, + "epoch": 0.21373840569969083, + "grad_norm": 0.00022575826733373106, + "kl": 17.875, + "learning_rate": 1.9222095417645695e-05, + "loss": 0.7155, + "reward": 0.058776569366455075, + "reward_std": 0.011941832641605287, + "rewards/reward_func_1": 0.058776569366455075, + "step": 795 + }, + { + "completion_length": 2.0, + "epoch": 0.2150826724022046, + "grad_norm": 0.0002709394320845604, + "kl": 17.8890625, + "learning_rate": 1.920384596821467e-05, + "loss": 0.7157, + "reward": 0.05806446075439453, + "reward_std": 0.00929926319167862, + "rewards/reward_func_1": 0.05806446075439453, + "step": 800 + }, + { + "completion_length": 2.0, + "epoch": 0.21642693910471839, + "grad_norm": 0.0005522365099750459, + "kl": 18.215625, + "learning_rate": 1.9185393798654547e-05, + "loss": 0.7285, + "reward": 0.060375118255615236, + "reward_std": 0.012819936085725204, + "rewards/reward_func_1": 0.060375118255615236, + "step": 805 + }, + { + "completion_length": 2.0, + "epoch": 0.21777120580723217, + "grad_norm": 0.00012125197827117518, + "kl": 18.078125, + "learning_rate": 1.9166739315385244e-05, + "loss": 0.7234, + "reward": 0.06392664909362793, + "reward_std": 0.009824539528926835, + "rewards/reward_func_1": 0.06392664909362793, + "step": 810 + }, + { + "completion_length": 2.0, + "epoch": 0.21911547250974595, + "grad_norm": 0.0004721728328149766, + "kl": 17.84375, + "learning_rate": 1.9147882929282734e-05, + "loss": 0.7138, + "reward": 0.061508560180664064, + "reward_std": 0.011408517364179716, + "rewards/reward_func_1": 0.061508560180664064, + "step": 815 + }, + { + "completion_length": 2.0, + "epoch": 0.22045973921225973, + "grad_norm": 0.0005403547547757626, + "kl": 17.646875, + "learning_rate": 1.9128825055670035e-05, + "loss": 0.7059, + "reward": 0.059009552001953125, + "reward_std": 0.009685787269700086, + "rewards/reward_func_1": 0.059009552001953125, + "step": 820 + }, + { + "completion_length": 2.0, + "epoch": 0.22180400591477348, + "grad_norm": 0.0022164226975291967, + "kl": 17.9734375, + "learning_rate": 1.9109566114308036e-05, + "loss": 0.7187, + "reward": 0.05564627647399902, + "reward_std": 0.010326084749249276, + "rewards/reward_func_1": 0.05564627647399902, + "step": 825 + }, + { + "completion_length": 2.35, + "epoch": 0.22314827261728726, + "grad_norm": 0.0012156780576333404, + "kl": 17.66875, + "learning_rate": 1.9090106529386263e-05, + "loss": 0.7067, + "reward": 0.0656036376953125, + "reward_std": 0.015077763356384822, + "rewards/reward_func_1": 0.0656036376953125, + "step": 830 + }, + { + "completion_length": 2.0, + "epoch": 0.22449253931980104, + "grad_norm": 0.0010059759952127934, + "kl": 18.1734375, + "learning_rate": 1.907044672951354e-05, + "loss": 0.7272, + "reward": 0.057573127746582034, + "reward_std": 0.010585914782132022, + "rewards/reward_func_1": 0.057573127746582034, + "step": 835 + }, + { + "completion_length": 2.0, + "epoch": 0.22583680602231482, + "grad_norm": 0.0004464346857275814, + "kl": 17.9625, + "learning_rate": 1.9050587147708544e-05, + "loss": 0.7182, + "reward": 0.06241474151611328, + "reward_std": 0.009492194746417226, + "rewards/reward_func_1": 0.06241474151611328, + "step": 840 + }, + { + "completion_length": 2.0, + "epoch": 0.2271810727248286, + "grad_norm": 0.0005409275181591511, + "kl": 18.2171875, + "learning_rate": 1.9030528221390255e-05, + "loss": 0.7287, + "reward": 0.06225318908691406, + "reward_std": 0.011348171227291459, + "rewards/reward_func_1": 0.06225318908691406, + "step": 845 + }, + { + "completion_length": 6.090625, + "epoch": 0.22852533942734238, + "grad_norm": 13.99404525756836, + "kl": 17.190625, + "learning_rate": 1.9010270392368343e-05, + "loss": 0.6867, + "reward": 0.0607336699962616, + "reward_std": 0.014851068891584874, + "rewards/reward_func_1": 0.0607336699962616, + "step": 850 + }, + { + "completion_length": 3.1, + "epoch": 0.22986960612985616, + "grad_norm": 0.0005020995158702135, + "kl": 17.60625, + "learning_rate": 1.898981410683343e-05, + "loss": 0.7042, + "reward": 0.06041567623615265, + "reward_std": 0.012269638044381281, + "rewards/reward_func_1": 0.06041567623615265, + "step": 855 + }, + { + "completion_length": 2.0, + "epoch": 0.23121387283236994, + "grad_norm": 0.0013556176563724875, + "kl": 17.64375, + "learning_rate": 1.8969159815347253e-05, + "loss": 0.7065, + "reward": 0.06363449096679688, + "reward_std": 0.010352238497580402, + "rewards/reward_func_1": 0.06363449096679688, + "step": 860 + }, + { + "completion_length": 2.0, + "epoch": 0.23255813953488372, + "grad_norm": 0.0008380432846024632, + "kl": 17.7609375, + "learning_rate": 1.8948307972832744e-05, + "loss": 0.7101, + "reward": 0.06133832931518555, + "reward_std": 0.012271754596440587, + "rewards/reward_func_1": 0.06133832931518555, + "step": 865 + }, + { + "completion_length": 2.0, + "epoch": 0.2339024062373975, + "grad_norm": 0.0016801492311060429, + "kl": 17.803125, + "learning_rate": 1.8927259038564023e-05, + "loss": 0.7121, + "reward": 0.06001472473144531, + "reward_std": 0.011952074009786883, + "rewards/reward_func_1": 0.06001472473144531, + "step": 870 + }, + { + "completion_length": 2.459375, + "epoch": 0.23524667293991128, + "grad_norm": 0.07210814952850342, + "kl": 31147.065625, + "learning_rate": 1.8906013476156265e-05, + "loss": 1248.7868, + "reward": 0.05841388702392578, + "reward_std": 0.011889992751093814, + "rewards/reward_func_1": 0.05841388702392578, + "step": 875 + }, + { + "completion_length": 2.028125, + "epoch": 0.23659093964242506, + "grad_norm": 0.04944615811109543, + "kl": 17.9125, + "learning_rate": 1.8884571753555495e-05, + "loss": 0.7165, + "reward": 0.059661483764648436, + "reward_std": 0.01035475345343002, + "rewards/reward_func_1": 0.059661483764648436, + "step": 880 + }, + { + "completion_length": 4.909375, + "epoch": 0.23793520634493884, + "grad_norm": 0.5363011360168457, + "kl": 17.625, + "learning_rate": 1.8862934343028288e-05, + "loss": 0.7049, + "reward": 0.06338434219360352, + "reward_std": 0.012126463351160055, + "rewards/reward_func_1": 0.06338434219360352, + "step": 885 + }, + { + "completion_length": 62.634375, + "epoch": 0.23927947304745262, + "grad_norm": 0.1920609325170517, + "kl": 16.6390625, + "learning_rate": 1.884110172115135e-05, + "loss": 0.6654, + "reward": 0.05351438522338867, + "reward_std": 0.02005903590179514, + "rewards/reward_func_1": 0.05351438522338867, + "step": 890 + }, + { + "completion_length": 2.0, + "epoch": 0.2406237397499664, + "grad_norm": 0.0013905576197430491, + "kl": 18.1953125, + "learning_rate": 1.8819074368801045e-05, + "loss": 0.7282, + "reward": 0.06563434600830079, + "reward_std": 0.009951398673729272, + "rewards/reward_func_1": 0.06563434600830079, + "step": 895 + }, + { + "completion_length": 2.0, + "epoch": 0.24196800645248018, + "grad_norm": 0.0008331938879564404, + "kl": 18.003125, + "learning_rate": 1.8796852771142778e-05, + "loss": 0.7201, + "reward": 0.061870574951171875, + "reward_std": 0.012352473140344955, + "rewards/reward_func_1": 0.061870574951171875, + "step": 900 + }, + { + "completion_length": 2.0, + "epoch": 0.24331227315499396, + "grad_norm": 0.00020705144561361521, + "kl": 18.053125, + "learning_rate": 1.8774437417620334e-05, + "loss": 0.7223, + "reward": 0.06816902160644531, + "reward_std": 0.012587691200315021, + "rewards/reward_func_1": 0.06816902160644531, + "step": 905 + }, + { + "completion_length": 2.0, + "epoch": 0.24465653985750774, + "grad_norm": 0.0003319174575153738, + "kl": 17.8921875, + "learning_rate": 1.8751828801945074e-05, + "loss": 0.7151, + "reward": 0.058438873291015624, + "reward_std": 0.012193899090743799, + "rewards/reward_func_1": 0.058438873291015624, + "step": 910 + }, + { + "completion_length": 2.0, + "epoch": 0.24600080656002152, + "grad_norm": 0.000264251691987738, + "kl": 18.0390625, + "learning_rate": 1.872902742208508e-05, + "loss": 0.7217, + "reward": 0.060257339477539064, + "reward_std": 0.010663219789967116, + "rewards/reward_func_1": 0.060257339477539064, + "step": 915 + }, + { + "completion_length": 2.0, + "epoch": 0.2473450732625353, + "grad_norm": 0.0001942398666869849, + "kl": 17.9234375, + "learning_rate": 1.8706033780254168e-05, + "loss": 0.7168, + "reward": 0.05674247741699219, + "reward_std": 0.009646143747158931, + "rewards/reward_func_1": 0.05674247741699219, + "step": 920 + }, + { + "completion_length": 2.0, + "epoch": 0.24868933996504908, + "grad_norm": 0.000582867010962218, + "kl": 17.5609375, + "learning_rate": 1.8682848382900852e-05, + "loss": 0.7027, + "reward": 0.06358718872070312, + "reward_std": 0.01264539449075528, + "rewards/reward_func_1": 0.06358718872070312, + "step": 925 + }, + { + "completion_length": 2.0, + "epoch": 0.25003360666756286, + "grad_norm": 0.0004526897973846644, + "kl": 17.9296875, + "learning_rate": 1.865947174069716e-05, + "loss": 0.7172, + "reward": 0.059136390686035156, + "reward_std": 0.010592962511873338, + "rewards/reward_func_1": 0.059136390686035156, + "step": 930 + }, + { + "completion_length": 2.0, + "epoch": 0.2513778733700766, + "grad_norm": 0.0003386743483133614, + "kl": 17.75625, + "learning_rate": 1.8635904368527406e-05, + "loss": 0.7107, + "reward": 0.06310138702392579, + "reward_std": 0.011821250266802964, + "rewards/reward_func_1": 0.06310138702392579, + "step": 935 + }, + { + "completion_length": 2.0, + "epoch": 0.2527221400725904, + "grad_norm": 0.00037925346987321973, + "kl": 17.703125, + "learning_rate": 1.861214678547685e-05, + "loss": 0.7079, + "reward": 0.06231670379638672, + "reward_std": 0.010538342622749042, + "rewards/reward_func_1": 0.06231670379638672, + "step": 940 + }, + { + "completion_length": 2.0, + "epoch": 0.25406640677510417, + "grad_norm": 0.0006748105515725911, + "kl": 17.96875, + "learning_rate": 1.858819951482026e-05, + "loss": 0.7188, + "reward": 0.05954875946044922, + "reward_std": 0.011791958093454014, + "rewards/reward_func_1": 0.05954875946044922, + "step": 945 + }, + { + "completion_length": 2.0, + "epoch": 0.255410673477618, + "grad_norm": 0.000709658779669553, + "kl": 17.6859375, + "learning_rate": 1.856406308401036e-05, + "loss": 0.7072, + "reward": 0.0561366081237793, + "reward_std": 0.009839185555756557, + "rewards/reward_func_1": 0.0561366081237793, + "step": 950 + }, + { + "completion_length": 2.0, + "epoch": 0.25675494018013173, + "grad_norm": 0.0004975300398655236, + "kl": 17.975, + "learning_rate": 1.853973802466627e-05, + "loss": 0.7186, + "reward": 0.0569252610206604, + "reward_std": 0.009655545311397873, + "rewards/reward_func_1": 0.0569252610206604, + "step": 955 + }, + { + "completion_length": 2.0, + "epoch": 0.25809920688264554, + "grad_norm": 0.0006206005346029997, + "kl": 17.8875, + "learning_rate": 1.8515224872561745e-05, + "loss": 0.7151, + "reward": 0.06045455932617187, + "reward_std": 0.011623913834773703, + "rewards/reward_func_1": 0.06045455932617187, + "step": 960 + }, + { + "completion_length": 2.0, + "epoch": 0.2594434735851593, + "grad_norm": 0.0007395711145363748, + "kl": 18.040625, + "learning_rate": 1.8490524167613405e-05, + "loss": 0.7214, + "reward": 0.057852745056152344, + "reward_std": 0.011105244704231155, + "rewards/reward_func_1": 0.057852745056152344, + "step": 965 + }, + { + "completion_length": 2.0, + "epoch": 0.2607877402876731, + "grad_norm": 0.0008309457916766405, + "kl": 17.6515625, + "learning_rate": 1.8465636453868825e-05, + "loss": 0.7064, + "reward": 0.06783523559570312, + "reward_std": 0.009911755218854523, + "rewards/reward_func_1": 0.06783523559570312, + "step": 970 + }, + { + "completion_length": 2.0, + "epoch": 0.26213200699018685, + "grad_norm": 0.000725765130482614, + "kl": 17.36875, + "learning_rate": 1.8440562279494557e-05, + "loss": 0.695, + "reward": 0.05620386600494385, + "reward_std": 0.009591523706330918, + "rewards/reward_func_1": 0.05620386600494385, + "step": 975 + }, + { + "completion_length": 2.0, + "epoch": 0.26347627369270066, + "grad_norm": 0.0008614324615336955, + "kl": 17.940625, + "learning_rate": 1.8415302196764068e-05, + "loss": 0.7172, + "reward": 0.06371011734008789, + "reward_std": 0.012515782276750542, + "rewards/reward_func_1": 0.06371011734008789, + "step": 980 + }, + { + "completion_length": 2.0, + "epoch": 0.2648205403952144, + "grad_norm": 0.0006585062947124243, + "kl": 18.028125, + "learning_rate": 1.8389856762045556e-05, + "loss": 0.7213, + "reward": 0.05774202346801758, + "reward_std": 0.01049440445349319, + "rewards/reward_func_1": 0.05774202346801758, + "step": 985 + }, + { + "completion_length": 2.0, + "epoch": 0.26616480709772816, + "grad_norm": 0.0015991459367796779, + "kl": 17.9125, + "learning_rate": 1.836422653578971e-05, + "loss": 0.716, + "reward": 0.061675214767456056, + "reward_std": 0.010034650065063034, + "rewards/reward_func_1": 0.061675214767456056, + "step": 990 + }, + { + "completion_length": 2.0, + "epoch": 0.26750907380024197, + "grad_norm": 0.0015729337465018034, + "kl": 17.6875, + "learning_rate": 1.8338412082517357e-05, + "loss": 0.7081, + "reward": 0.057560133934021, + "reward_std": 0.010157572498792433, + "rewards/reward_func_1": 0.057560133934021, + "step": 995 + }, + { + "completion_length": 2.0, + "epoch": 0.2688533405027557, + "grad_norm": 0.0010110485600307584, + "kl": 18.175, + "learning_rate": 1.8312413970807043e-05, + "loss": 0.7263, + "reward": 0.058531570434570315, + "reward_std": 0.009644822326663416, + "rewards/reward_func_1": 0.058531570434570315, + "step": 1000 + }, + { + "completion_length": 2.0, + "epoch": 0.27019760720526953, + "grad_norm": 0.0011084715370088816, + "kl": 17.73125, + "learning_rate": 1.8286232773282492e-05, + "loss": 0.7093, + "reward": 0.05668430328369141, + "reward_std": 0.009714638943114551, + "rewards/reward_func_1": 0.05668430328369141, + "step": 1005 + }, + { + "completion_length": 2.0, + "epoch": 0.2715418739077833, + "grad_norm": 0.0011657410068437457, + "kl": 17.453125, + "learning_rate": 1.8259869066600005e-05, + "loss": 0.6981, + "reward": 0.060795021057128903, + "reward_std": 0.008335485706629698, + "rewards/reward_func_1": 0.060795021057128903, + "step": 1010 + }, + { + "completion_length": 2.0, + "epoch": 0.2728861406102971, + "grad_norm": 0.0010427763918414712, + "kl": 18.075, + "learning_rate": 1.8233323431435744e-05, + "loss": 0.723, + "reward": 0.06016595363616943, + "reward_std": 0.011402350757271052, + "rewards/reward_func_1": 0.06016595363616943, + "step": 1015 + }, + { + "completion_length": 2.0, + "epoch": 0.27423040731281084, + "grad_norm": 0.00200888910330832, + "kl": 17.9109375, + "learning_rate": 1.820659645247296e-05, + "loss": 0.717, + "reward": 0.056774234771728514, + "reward_std": 0.011336608513374813, + "rewards/reward_func_1": 0.056774234771728514, + "step": 1020 + }, + { + "completion_length": 2.0, + "epoch": 0.27557467401532465, + "grad_norm": 0.0016117419581860304, + "kl": 17.765625, + "learning_rate": 1.8179688718389116e-05, + "loss": 0.7105, + "reward": 0.06235724687576294, + "reward_std": 0.010060734899889212, + "rewards/reward_func_1": 0.06235724687576294, + "step": 1025 + }, + { + "completion_length": 2.45, + "epoch": 0.2769189407178384, + "grad_norm": 0.0025889407843351364, + "kl": 17.771875, + "learning_rate": 1.8152600821842902e-05, + "loss": 0.711, + "reward": 0.058268165588378905, + "reward_std": 0.01004049998264236, + "rewards/reward_func_1": 0.058268165588378905, + "step": 1030 + }, + { + "completion_length": 2.0, + "epoch": 0.2782632074203522, + "grad_norm": 0.0021764549892395735, + "kl": 17.765625, + "learning_rate": 1.8125333359461194e-05, + "loss": 0.7108, + "reward": 0.05997686386108399, + "reward_std": 0.010472600482171402, + "rewards/reward_func_1": 0.05997686386108399, + "step": 1035 + }, + { + "completion_length": 2.0, + "epoch": 0.27960747412286596, + "grad_norm": 0.0027070462238043547, + "kl": 18.00625, + "learning_rate": 1.8097886931825916e-05, + "loss": 0.72, + "reward": 0.057472729682922365, + "reward_std": 0.010958646313520148, + "rewards/reward_func_1": 0.057472729682922365, + "step": 1040 + }, + { + "completion_length": 2.00625, + "epoch": 0.28095174082537977, + "grad_norm": 0.02638576552271843, + "kl": 17.8140625, + "learning_rate": 1.8070262143460803e-05, + "loss": 0.7121, + "reward": 0.06007643789052963, + "reward_std": 0.01233230889774859, + "rewards/reward_func_1": 0.06007643789052963, + "step": 1045 + }, + { + "completion_length": 2.4625, + "epoch": 0.2822960075278935, + "grad_norm": 0.38510629534721375, + "kl": 17.2859375, + "learning_rate": 1.8042459602818092e-05, + "loss": 0.6911, + "reward": 0.060521507263183595, + "reward_std": 0.011037798667530296, + "rewards/reward_func_1": 0.060521507263183595, + "step": 1050 + }, + { + "completion_length": 4.6375, + "epoch": 0.28364027423040733, + "grad_norm": 1.0965192317962646, + "kl": 15.5390625, + "learning_rate": 1.8014479922265117e-05, + "loss": 0.6215, + "reward": 0.05944366455078125, + "reward_std": 0.01202023433870636, + "rewards/reward_func_1": 0.05944366455078125, + "step": 1055 + }, + { + "completion_length": 68.578125, + "epoch": 0.2849845409329211, + "grad_norm": 136523.703125, + "kl": 128.46640625, + "learning_rate": 1.7986323718070826e-05, + "loss": 5.144, + "reward": 0.056297135353088376, + "reward_std": 0.014432728511746973, + "rewards/reward_func_1": 0.056297135353088376, + "step": 1060 + }, + { + "completion_length": 151.353125, + "epoch": 0.2863288076354349, + "grad_norm": 1.3205143213272095, + "kl": 6.2421875, + "learning_rate": 1.79579916103922e-05, + "loss": 0.2498, + "reward": 0.022794413566589355, + "reward_std": 0.025071121371001936, + "rewards/reward_func_1": 0.022794413566589355, + "step": 1065 + }, + { + "completion_length": 60.24375, + "epoch": 0.28767307433794864, + "grad_norm": 0.668519139289856, + "kl": 13.2421875, + "learning_rate": 1.79294842232606e-05, + "loss": 0.5298, + "reward": 0.02910344898700714, + "reward_std": 0.021979624161031098, + "rewards/reward_func_1": 0.02910344898700714, + "step": 1070 + }, + { + "completion_length": 69.1875, + "epoch": 0.28901734104046245, + "grad_norm": 0.38422349095344543, + "kl": 15.446875, + "learning_rate": 1.7900802184568024e-05, + "loss": 0.6174, + "reward": 0.032975000143051145, + "reward_std": 0.019280125828663584, + "rewards/reward_func_1": 0.032975000143051145, + "step": 1075 + }, + { + "completion_length": 1.93125, + "epoch": 0.2903616077429762, + "grad_norm": 0.4854346513748169, + "kl": 18.4921875, + "learning_rate": 1.7871946126053265e-05, + "loss": 0.7396, + "reward": 0.0445002555847168, + "reward_std": 0.009190794143796666, + "rewards/reward_func_1": 0.0445002555847168, + "step": 1080 + }, + { + "completion_length": 13.75625, + "epoch": 0.29170587444549, + "grad_norm": 0.23374588787555695, + "kl": 16.453125, + "learning_rate": 1.784291668328801e-05, + "loss": 0.658, + "reward": 0.04094771146774292, + "reward_std": 0.010110658951089136, + "rewards/reward_func_1": 0.04094771146774292, + "step": 1085 + }, + { + "completion_length": 55.621875, + "epoch": 0.29305014114800376, + "grad_norm": 0.29550668597221375, + "kl": 14.3140625, + "learning_rate": 1.781371449566284e-05, + "loss": 0.5726, + "reward": 0.039327383041381836, + "reward_std": 0.014201272143691313, + "rewards/reward_func_1": 0.039327383041381836, + "step": 1090 + }, + { + "completion_length": 2.878125, + "epoch": 0.29439440785051757, + "grad_norm": 0.15203788876533508, + "kl": 16.3359375, + "learning_rate": 1.7784340206373135e-05, + "loss": 0.6532, + "reward": 0.04413075447082519, + "reward_std": 0.007815522653254447, + "rewards/reward_func_1": 0.04413075447082519, + "step": 1095 + }, + { + "completion_length": 2.740625, + "epoch": 0.2957386745530313, + "grad_norm": 0.1516532450914383, + "kl": 17.3421875, + "learning_rate": 1.7754794462404924e-05, + "loss": 0.6937, + "reward": 0.045378980413079265, + "reward_std": 0.008754154351481701, + "rewards/reward_func_1": 0.045378980413079265, + "step": 1100 + }, + { + "completion_length": 2.86875, + "epoch": 0.2970829412555451, + "grad_norm": 0.3617139160633087, + "kl": 17.16875, + "learning_rate": 1.772507791452062e-05, + "loss": 0.687, + "reward": 0.03869695663452148, + "reward_std": 0.008698664297844516, + "rewards/reward_func_1": 0.03869695663452148, + "step": 1105 + }, + { + "completion_length": 6.446875, + "epoch": 0.2984272079580589, + "grad_norm": 0.12706607580184937, + "kl": 16.409375, + "learning_rate": 1.7695191217244694e-05, + "loss": 0.6564, + "reward": 0.04749107360839844, + "reward_std": 0.008462011188385077, + "rewards/reward_func_1": 0.04749107360839844, + "step": 1110 + }, + { + "completion_length": 72.753125, + "epoch": 0.29977147466057263, + "grad_norm": 0.321855753660202, + "kl": 13.3921875, + "learning_rate": 1.766513502884926e-05, + "loss": 0.5358, + "reward": 0.04237784147262573, + "reward_std": 0.0151958847156493, + "rewards/reward_func_1": 0.04237784147262573, + "step": 1115 + }, + { + "completion_length": 131.153125, + "epoch": 0.30111574136308644, + "grad_norm": 0.1894012689590454, + "kl": 12.8296875, + "learning_rate": 1.7634910011339576e-05, + "loss": 0.5134, + "reward": 0.03622118234634399, + "reward_std": 0.016812963741540444, + "rewards/reward_func_1": 0.03622118234634399, + "step": 1120 + }, + { + "completion_length": 225.79375, + "epoch": 0.3024600080656002, + "grad_norm": 0.6886034607887268, + "kl": 119.9203125, + "learning_rate": 1.7604516830439447e-05, + "loss": 4.8239, + "reward": 0.028684809803962708, + "reward_std": 0.020281461635022424, + "rewards/reward_func_1": 0.028684809803962708, + "step": 1125 + }, + { + "completion_length": 733.759375, + "epoch": 0.303804274768114, + "grad_norm": 0.5576857328414917, + "kl": 7.11328125, + "learning_rate": 1.7573956155576596e-05, + "loss": 0.2844, + "reward": 0.007758472859859466, + "reward_std": 0.014932763832621276, + "rewards/reward_func_1": 0.007758472859859466, + "step": 1130 + }, + { + "completion_length": 251.840625, + "epoch": 0.30514854147062775, + "grad_norm": 1.2845573425292969, + "kl": 421385.557421875, + "learning_rate": 1.7543228659867887e-05, + "loss": 16848.1047, + "reward": 0.007775214128196239, + "reward_std": 0.017382631546934136, + "rewards/reward_func_1": 0.007775214128196239, + "step": 1135 + }, + { + "completion_length": 2.09375, + "epoch": 0.30649280817314156, + "grad_norm": 1.5478886365890503, + "kl": 18.3546875, + "learning_rate": 1.7512335020104507e-05, + "loss": 0.7346, + "reward": 0.042022180557250974, + "reward_std": 0.011799084773520008, + "rewards/reward_func_1": 0.042022180557250974, + "step": 1140 + }, + { + "completion_length": 2.44375, + "epoch": 0.3078370748756553, + "grad_norm": 0.4710218012332916, + "kl": 18.0078125, + "learning_rate": 1.7481275916737077e-05, + "loss": 0.7209, + "reward": 0.043611574172973636, + "reward_std": 0.016114455633214675, + "rewards/reward_func_1": 0.043611574172973636, + "step": 1145 + }, + { + "completion_length": 3.021875, + "epoch": 0.3091813415781691, + "grad_norm": 1.128142237663269, + "kl": 17.0234375, + "learning_rate": 1.7450052033860643e-05, + "loss": 0.681, + "reward": 0.04341961294412613, + "reward_std": 0.01588066411204636, + "rewards/reward_func_1": 0.04341961294412613, + "step": 1150 + }, + { + "completion_length": 2.2375, + "epoch": 0.3105256082806829, + "grad_norm": 1.5875157117843628, + "kl": 19.2234375, + "learning_rate": 1.7418664059199615e-05, + "loss": 0.7687, + "reward": 0.05236520916223526, + "reward_std": 0.013312915465940022, + "rewards/reward_func_1": 0.05236520916223526, + "step": 1155 + }, + { + "completion_length": 2.0, + "epoch": 0.3118698749831967, + "grad_norm": 0.0017928759334608912, + "kl": 20.63125, + "learning_rate": 1.738711268409263e-05, + "loss": 0.8251, + "reward": 0.05562934875488281, + "reward_std": 0.00945893834286835, + "rewards/reward_func_1": 0.05562934875488281, + "step": 1160 + }, + { + "completion_length": 2.0, + "epoch": 0.31321414168571043, + "grad_norm": 0.003861919976770878, + "kl": 21.05625, + "learning_rate": 1.73553986034773e-05, + "loss": 0.8425, + "reward": 0.05521247386932373, + "reward_std": 0.0097390030954557, + "rewards/reward_func_1": 0.05521247386932373, + "step": 1165 + }, + { + "completion_length": 2.0, + "epoch": 0.31455840838822424, + "grad_norm": 0.003637129906564951, + "kl": 20.484375, + "learning_rate": 1.7323522515874945e-05, + "loss": 0.8202, + "reward": 0.05392255783081055, + "reward_std": 0.009917261235386832, + "rewards/reward_func_1": 0.05392255783081055, + "step": 1170 + }, + { + "completion_length": 2.0, + "epoch": 0.315902675090738, + "grad_norm": 0.006247695069760084, + "kl": 20.565625, + "learning_rate": 1.7291485123375164e-05, + "loss": 0.8229, + "reward": 0.056228256225585936, + "reward_std": 0.009558047083555721, + "rewards/reward_func_1": 0.056228256225585936, + "step": 1175 + }, + { + "completion_length": 1.99375, + "epoch": 0.3172469417932518, + "grad_norm": 0.004366494249552488, + "kl": 20.55, + "learning_rate": 1.72592871316204e-05, + "loss": 0.8223, + "reward": 0.062088823318481444, + "reward_std": 0.009840463204818661, + "rewards/reward_func_1": 0.062088823318481444, + "step": 1180 + }, + { + "completion_length": 2.0, + "epoch": 0.31859120849576555, + "grad_norm": 0.002056930446997285, + "kl": 20.60625, + "learning_rate": 1.722692924979039e-05, + "loss": 0.8245, + "reward": 0.05960988998413086, + "reward_std": 0.008964826199371601, + "rewards/reward_func_1": 0.05960988998413086, + "step": 1185 + }, + { + "completion_length": 2.0, + "epoch": 0.31993547519827936, + "grad_norm": 0.015865160152316093, + "kl": 21.034375, + "learning_rate": 1.719441219058654e-05, + "loss": 0.8416, + "reward": 0.05759906768798828, + "reward_std": 0.009656935631937813, + "rewards/reward_func_1": 0.05759906768798828, + "step": 1190 + }, + { + "completion_length": 2.003125, + "epoch": 0.3212797419007931, + "grad_norm": 0.0372232086956501, + "kl": 20.875, + "learning_rate": 1.7161736670216233e-05, + "loss": 0.8354, + "reward": 0.05712289810180664, + "reward_std": 0.007381719051045366, + "rewards/reward_func_1": 0.05712289810180664, + "step": 1195 + }, + { + "completion_length": 2.003125, + "epoch": 0.3226240086033069, + "grad_norm": 0.8008459210395813, + "kl": 20.2625, + "learning_rate": 1.7128903408377053e-05, + "loss": 0.8105, + "reward": 0.05510530471801758, + "reward_std": 0.011877764340533758, + "rewards/reward_func_1": 0.05510530471801758, + "step": 1200 + }, + { + "completion_length": 2.13125, + "epoch": 0.3239682753058207, + "grad_norm": 0.0008978499681688845, + "kl": 20.628125, + "learning_rate": 1.7095913128240936e-05, + "loss": 0.8251, + "reward": 0.05747789740562439, + "reward_std": 0.011763529140444007, + "rewards/reward_func_1": 0.05747789740562439, + "step": 1205 + }, + { + "completion_length": 2.0, + "epoch": 0.3253125420083344, + "grad_norm": 0.0013102364027872682, + "kl": 20.58125, + "learning_rate": 1.7062766556438233e-05, + "loss": 0.8234, + "reward": 0.05572299957275391, + "reward_std": 0.008765837910323171, + "rewards/reward_func_1": 0.05572299957275391, + "step": 1210 + }, + { + "completion_length": 2.0, + "epoch": 0.32665680871084823, + "grad_norm": 0.0024437177926301956, + "kl": 20.46875, + "learning_rate": 1.7029464423041713e-05, + "loss": 0.8187, + "reward": 0.05510997772216797, + "reward_std": 0.008241662751242985, + "rewards/reward_func_1": 0.05510997772216797, + "step": 1215 + }, + { + "completion_length": 2.0, + "epoch": 0.328001075413362, + "grad_norm": 0.0028229840099811554, + "kl": 20.66875, + "learning_rate": 1.6996007461550483e-05, + "loss": 0.8269, + "reward": 0.053923177719116214, + "reward_std": 0.010693888347304892, + "rewards/reward_func_1": 0.053923177719116214, + "step": 1220 + }, + { + "completion_length": 2.0, + "epoch": 0.3293453421158758, + "grad_norm": 0.0020496586803346872, + "kl": 20.678125, + "learning_rate": 1.6962396408873826e-05, + "loss": 0.8276, + "reward": 0.056081295013427734, + "reward_std": 0.00998476523818681, + "rewards/reward_func_1": 0.056081295013427734, + "step": 1225 + }, + { + "completion_length": 2.0, + "epoch": 0.33068960881838955, + "grad_norm": 0.002698215888813138, + "kl": 20.396875, + "learning_rate": 1.6928632005314983e-05, + "loss": 0.8162, + "reward": 0.05358821749687195, + "reward_std": 0.010563099296268775, + "rewards/reward_func_1": 0.05358821749687195, + "step": 1230 + }, + { + "completion_length": 2.0, + "epoch": 0.33203387552090335, + "grad_norm": 0.004789648577570915, + "kl": 20.478125, + "learning_rate": 1.689471499455482e-05, + "loss": 0.8191, + "reward": 0.05402927398681641, + "reward_std": 0.009172404053242645, + "rewards/reward_func_1": 0.05402927398681641, + "step": 1235 + }, + { + "completion_length": 2.0, + "epoch": 0.3333781422234171, + "grad_norm": 0.010540174320340157, + "kl": 20.75625, + "learning_rate": 1.6860646123635482e-05, + "loss": 0.8302, + "reward": 0.05363121032714844, + "reward_std": 0.010847341820772271, + "rewards/reward_func_1": 0.05363121032714844, + "step": 1240 + }, + { + "completion_length": 2.01875, + "epoch": 0.3347224089259309, + "grad_norm": 0.8453480005264282, + "kl": 20.55625, + "learning_rate": 1.6826426142943925e-05, + "loss": 0.8223, + "reward": 0.05549154281616211, + "reward_std": 0.010959918500157073, + "rewards/reward_func_1": 0.05549154281616211, + "step": 1245 + }, + { + "completion_length": 2.65625, + "epoch": 0.33606667562844467, + "grad_norm": 0.7238678932189941, + "kl": 16.04375, + "learning_rate": 1.679205580619538e-05, + "loss": 0.6421, + "reward": 0.060180139541625974, + "reward_std": 0.012135649514675606, + "rewards/reward_func_1": 0.060180139541625974, + "step": 1250 + }, + { + "completion_length": 2.0, + "epoch": 0.3374109423309585, + "grad_norm": 0.06061722710728645, + "kl": 15.15, + "learning_rate": 1.6757535870416755e-05, + "loss": 0.6056, + "reward": 0.06041898727416992, + "reward_std": 0.010352135712309973, + "rewards/reward_func_1": 0.06041898727416992, + "step": 1255 + }, + { + "completion_length": 2.0, + "epoch": 0.3387552090334722, + "grad_norm": 0.6215311884880066, + "kl": 14.9609375, + "learning_rate": 1.6722867095929976e-05, + "loss": 0.5983, + "reward": 0.05601742267608643, + "reward_std": 0.010467067039280664, + "rewards/reward_func_1": 0.05601742267608643, + "step": 1260 + }, + { + "completion_length": 2.0, + "epoch": 0.34009947573598603, + "grad_norm": 0.005102403461933136, + "kl": 13.65625, + "learning_rate": 1.6688050246335216e-05, + "loss": 0.5462, + "reward": 0.05668201446533203, + "reward_std": 0.011318438543821686, + "rewards/reward_func_1": 0.05668201446533203, + "step": 1265 + }, + { + "completion_length": 2.0, + "epoch": 0.3414437424384998, + "grad_norm": 0.05287908762693405, + "kl": 13.490625, + "learning_rate": 1.6653086088494106e-05, + "loss": 0.5396, + "reward": 0.05806665420532227, + "reward_std": 0.011723793356213718, + "rewards/reward_func_1": 0.05806665420532227, + "step": 1270 + }, + { + "completion_length": 2.0, + "epoch": 0.3427880091410136, + "grad_norm": 0.00010848957026610151, + "kl": 13.9109375, + "learning_rate": 1.6617975392512812e-05, + "loss": 0.5563, + "reward": 0.06332006454467773, + "reward_std": 0.012813021524925717, + "rewards/reward_func_1": 0.06332006454467773, + "step": 1275 + }, + { + "completion_length": 1.9875, + "epoch": 0.34413227584352735, + "grad_norm": 0.0001873042929219082, + "kl": 13.890625, + "learning_rate": 1.6582718931725094e-05, + "loss": 0.5556, + "reward": 0.05860910415649414, + "reward_std": 0.012335632972826716, + "rewards/reward_func_1": 0.05860910415649414, + "step": 1280 + }, + { + "completion_length": 2.0, + "epoch": 0.34547654254604115, + "grad_norm": 6.566791398654459e-06, + "kl": 13.903125, + "learning_rate": 1.6547317482675277e-05, + "loss": 0.5563, + "reward": 0.05752272605895996, + "reward_std": 0.010450741471140645, + "rewards/reward_func_1": 0.05752272605895996, + "step": 1285 + }, + { + "completion_length": 2.0, + "epoch": 0.3468208092485549, + "grad_norm": 1.5863972748775268e-06, + "kl": 13.7296875, + "learning_rate": 1.651177182510112e-05, + "loss": 0.5491, + "reward": 0.054108810424804685, + "reward_std": 0.010263701246731215, + "rewards/reward_func_1": 0.054108810424804685, + "step": 1290 + }, + { + "completion_length": 2.0, + "epoch": 0.3481650759510687, + "grad_norm": 3.900764113495825e-06, + "kl": 13.80625, + "learning_rate": 1.6476082741916677e-05, + "loss": 0.5522, + "reward": 0.06382217407226562, + "reward_std": 0.01203576557818451, + "rewards/reward_func_1": 0.06382217407226562, + "step": 1295 + }, + { + "completion_length": 2.0, + "epoch": 0.34950934265358247, + "grad_norm": 2.0937131921527907e-06, + "kl": 13.7046875, + "learning_rate": 1.644025101919503e-05, + "loss": 0.5484, + "reward": 0.05921125411987305, + "reward_std": 0.011942052780796075, + "rewards/reward_func_1": 0.05921125411987305, + "step": 1300 + }, + { + "completion_length": 2.0, + "epoch": 0.3508536093560963, + "grad_norm": 1.797810909920372e-06, + "kl": 13.8578125, + "learning_rate": 1.6404277446150968e-05, + "loss": 0.5542, + "reward": 0.061875534057617185, + "reward_std": 0.010323166584566935, + "rewards/reward_func_1": 0.061875534057617185, + "step": 1305 + }, + { + "completion_length": 2.0, + "epoch": 0.35219787605861, + "grad_norm": 4.8423517000628635e-06, + "kl": 13.696875, + "learning_rate": 1.6368162815123637e-05, + "loss": 0.5476, + "reward": 0.05844669342041016, + "reward_std": 0.01120743685751222, + "rewards/reward_func_1": 0.05844669342041016, + "step": 1310 + }, + { + "completion_length": 2.0, + "epoch": 0.35354214276112383, + "grad_norm": 1.9106237232335843e-05, + "kl": 13.6734375, + "learning_rate": 1.633190792155906e-05, + "loss": 0.5468, + "reward": 0.05963554382324219, + "reward_std": 0.011439791695011081, + "rewards/reward_func_1": 0.05963554382324219, + "step": 1315 + }, + { + "completion_length": 2.0, + "epoch": 0.3548864094636376, + "grad_norm": 3.11785652229446e-06, + "kl": 13.5453125, + "learning_rate": 1.629551356399262e-05, + "loss": 0.5419, + "reward": 0.06005144119262695, + "reward_std": 0.00977201181158307, + "rewards/reward_func_1": 0.06005144119262695, + "step": 1320 + }, + { + "completion_length": 2.0, + "epoch": 0.35623067616615134, + "grad_norm": 3.1484196370001882e-06, + "kl": 13.89375, + "learning_rate": 1.625898054403148e-05, + "loss": 0.5557, + "reward": 0.06197786331176758, + "reward_std": 0.010603644404909573, + "rewards/reward_func_1": 0.06197786331176758, + "step": 1325 + }, + { + "completion_length": 2.0, + "epoch": 0.35757494286866515, + "grad_norm": 2.623250566102797e-06, + "kl": 14.0703125, + "learning_rate": 1.6222309666336933e-05, + "loss": 0.5626, + "reward": 0.06794366836547852, + "reward_std": 0.01082740986457793, + "rewards/reward_func_1": 0.06794366836547852, + "step": 1330 + }, + { + "completion_length": 2.0, + "epoch": 0.3589192095711789, + "grad_norm": 1.4281185940490104e-06, + "kl": 13.784375, + "learning_rate": 1.6185501738606654e-05, + "loss": 0.5515, + "reward": 0.05785388946533203, + "reward_std": 0.009857135304082476, + "rewards/reward_func_1": 0.05785388946533203, + "step": 1335 + }, + { + "completion_length": 2.0, + "epoch": 0.3602634762736927, + "grad_norm": 1.23358740893309e-05, + "kl": 13.928125, + "learning_rate": 1.614855757155693e-05, + "loss": 0.5574, + "reward": 0.061734676361083984, + "reward_std": 0.012116704345680773, + "rewards/reward_func_1": 0.061734676361083984, + "step": 1340 + }, + { + "completion_length": 2.0, + "epoch": 0.36160774297620646, + "grad_norm": 3.1937454423314193e-06, + "kl": 13.4515625, + "learning_rate": 1.6111477978904813e-05, + "loss": 0.5378, + "reward": 0.05473334789276123, + "reward_std": 0.010365012554575514, + "rewards/reward_func_1": 0.05473334789276123, + "step": 1345 + }, + { + "completion_length": 2.0, + "epoch": 0.36295200967872027, + "grad_norm": 2.7216408398089698e-06, + "kl": 13.628125, + "learning_rate": 1.6074263777350167e-05, + "loss": 0.5452, + "reward": 0.0586578369140625, + "reward_std": 0.010139925488329028, + "rewards/reward_func_1": 0.0586578369140625, + "step": 1350 + }, + { + "completion_length": 2.0, + "epoch": 0.364296276381234, + "grad_norm": 1.4683068911836017e-06, + "kl": 14.2203125, + "learning_rate": 1.6036915786557705e-05, + "loss": 0.569, + "reward": 0.057494735717773436, + "reward_std": 0.00982674182887422, + "rewards/reward_func_1": 0.057494735717773436, + "step": 1355 + }, + { + "completion_length": 2.0, + "epoch": 0.3656405430837478, + "grad_norm": 2.2047534002922475e-06, + "kl": 13.690625, + "learning_rate": 1.5999434829138923e-05, + "loss": 0.5477, + "reward": 0.058840179443359376, + "reward_std": 0.009708692382264416, + "rewards/reward_func_1": 0.058840179443359376, + "step": 1360 + }, + { + "completion_length": 2.0, + "epoch": 0.3669848097862616, + "grad_norm": 8.66782011144096e-06, + "kl": 13.884375, + "learning_rate": 1.5961821730633986e-05, + "loss": 0.5552, + "reward": 0.06289253234863282, + "reward_std": 0.013519753767468501, + "rewards/reward_func_1": 0.06289253234863282, + "step": 1365 + }, + { + "completion_length": 2.0, + "epoch": 0.3683290764887754, + "grad_norm": 2.6345257992943516e-06, + "kl": 13.71875, + "learning_rate": 1.5924077319493546e-05, + "loss": 0.5486, + "reward": 0.05802221298217773, + "reward_std": 0.010079689413032611, + "rewards/reward_func_1": 0.05802221298217773, + "step": 1370 + }, + { + "completion_length": 2.0, + "epoch": 0.36967334319128914, + "grad_norm": 1.2998112651985139e-05, + "kl": 13.8421875, + "learning_rate": 1.5886202427060493e-05, + "loss": 0.5539, + "reward": 0.06998028755187988, + "reward_std": 0.011192185156687628, + "rewards/reward_func_1": 0.06998028755187988, + "step": 1375 + }, + { + "completion_length": 2.0, + "epoch": 0.37101760989380295, + "grad_norm": 5.262471859168727e-06, + "kl": 13.78125, + "learning_rate": 1.5848197887551643e-05, + "loss": 0.5507, + "reward": 0.05722208023071289, + "reward_std": 0.010396837347070687, + "rewards/reward_func_1": 0.05722208023071289, + "step": 1380 + }, + { + "completion_length": 2.0, + "epoch": 0.3723618765963167, + "grad_norm": 3.908398866769858e-06, + "kl": 13.89375, + "learning_rate": 1.5810064538039368e-05, + "loss": 0.556, + "reward": 0.059538209438323976, + "reward_std": 0.010347902441571933, + "rewards/reward_func_1": 0.059538209438323976, + "step": 1385 + }, + { + "completion_length": 2.0, + "epoch": 0.3737061432988305, + "grad_norm": 8.753636393521447e-06, + "kl": 14.0859375, + "learning_rate": 1.577180321843315e-05, + "loss": 0.5638, + "reward": 0.06107792854309082, + "reward_std": 0.010959995364828501, + "rewards/reward_func_1": 0.06107792854309082, + "step": 1390 + }, + { + "completion_length": 2.0, + "epoch": 0.37505041000134426, + "grad_norm": 1.4902374232406146e-06, + "kl": 13.696875, + "learning_rate": 1.5733414771461094e-05, + "loss": 0.5476, + "reward": 0.06554374694824219, + "reward_std": 0.012540119105688063, + "rewards/reward_func_1": 0.06554374694824219, + "step": 1395 + }, + { + "completion_length": 2.0, + "epoch": 0.37639467670385807, + "grad_norm": 3.1805816433916334e-06, + "kl": 13.6890625, + "learning_rate": 1.569490004265136e-05, + "loss": 0.5474, + "reward": 0.06188135147094727, + "reward_std": 0.008616514109598938, + "rewards/reward_func_1": 0.06188135147094727, + "step": 1400 + }, + { + "completion_length": 2.0, + "epoch": 0.3777389434063718, + "grad_norm": 1.402103134751087e-05, + "kl": 13.7421875, + "learning_rate": 1.5656259880313528e-05, + "loss": 0.5496, + "reward": 0.06100940704345703, + "reward_std": 0.010816287656780332, + "rewards/reward_func_1": 0.06100940704345703, + "step": 1405 + }, + { + "completion_length": 2.0, + "epoch": 0.3790832101088856, + "grad_norm": 2.776348765110015e-06, + "kl": 13.56875, + "learning_rate": 1.5617495135519946e-05, + "loss": 0.5429, + "reward": 0.05631539821624756, + "reward_std": 0.012434210258652456, + "rewards/reward_func_1": 0.05631539821624756, + "step": 1410 + }, + { + "completion_length": 2.0, + "epoch": 0.3804274768113994, + "grad_norm": 2.385505240454222e-06, + "kl": 14.0, + "learning_rate": 1.557860666208695e-05, + "loss": 0.56, + "reward": 0.05535392761230469, + "reward_std": 0.01153669813356828, + "rewards/reward_func_1": 0.05535392761230469, + "step": 1415 + }, + { + "completion_length": 2.0, + "epoch": 0.3817717435139132, + "grad_norm": 6.945092536625452e-06, + "kl": 13.7421875, + "learning_rate": 1.553959531655607e-05, + "loss": 0.5495, + "reward": 0.061875534057617185, + "reward_std": 0.011464458813861711, + "rewards/reward_func_1": 0.061875534057617185, + "step": 1420 + }, + { + "completion_length": 2.0, + "epoch": 0.38311601021642694, + "grad_norm": 1.5258659004757646e-05, + "kl": 13.609375, + "learning_rate": 1.5500461958175174e-05, + "loss": 0.5442, + "reward": 0.05548095703125, + "reward_std": 0.0085141017458227, + "rewards/reward_func_1": 0.05548095703125, + "step": 1425 + }, + { + "completion_length": 2.0, + "epoch": 0.3844602769189407, + "grad_norm": 3.450870644883253e-05, + "kl": 13.884375, + "learning_rate": 1.546120744887954e-05, + "loss": 0.5551, + "reward": 0.05991678237915039, + "reward_std": 0.011126938453890034, + "rewards/reward_func_1": 0.05991678237915039, + "step": 1430 + }, + { + "completion_length": 2.0, + "epoch": 0.3858045436214545, + "grad_norm": 4.157363036938477e-06, + "kl": 13.8671875, + "learning_rate": 1.5421832653272845e-05, + "loss": 0.5547, + "reward": 0.05983428955078125, + "reward_std": 0.009211386787137598, + "rewards/reward_func_1": 0.05983428955078125, + "step": 1435 + }, + { + "completion_length": 2.0, + "epoch": 0.38714881032396825, + "grad_norm": 3.8689913708367385e-06, + "kl": 13.85625, + "learning_rate": 1.5382338438608165e-05, + "loss": 0.5545, + "reward": 0.06216297149658203, + "reward_std": 0.009720365148677957, + "rewards/reward_func_1": 0.06216297149658203, + "step": 1440 + }, + { + "completion_length": 2.0, + "epoch": 0.38849307702648206, + "grad_norm": 3.0080229862505803e-06, + "kl": 13.74375, + "learning_rate": 1.5342725674768844e-05, + "loss": 0.5499, + "reward": 0.06219477653503418, + "reward_std": 0.010860501191928051, + "rewards/reward_func_1": 0.06219477653503418, + "step": 1445 + }, + { + "completion_length": 2.0, + "epoch": 0.3898373437289958, + "grad_norm": 2.156152959287283e-06, + "kl": 13.659375, + "learning_rate": 1.5302995234249335e-05, + "loss": 0.5464, + "reward": 0.05769138336181641, + "reward_std": 0.011336278253293131, + "rewards/reward_func_1": 0.05769138336181641, + "step": 1450 + }, + { + "completion_length": 2.0, + "epoch": 0.3911816104315096, + "grad_norm": 3.143540379824117e-05, + "kl": 13.565625, + "learning_rate": 1.5263147992135998e-05, + "loss": 0.5427, + "reward": 0.057453060150146486, + "reward_std": 0.011146099481265992, + "rewards/reward_func_1": 0.057453060150146486, + "step": 1455 + }, + { + "completion_length": 2.0, + "epoch": 0.39252587713402337, + "grad_norm": 2.1904502318648156e-06, + "kl": 13.6375, + "learning_rate": 1.5223184826087811e-05, + "loss": 0.5455, + "reward": 0.060272598266601564, + "reward_std": 0.012035325131728314, + "rewards/reward_func_1": 0.060272598266601564, + "step": 1460 + }, + { + "completion_length": 2.0, + "epoch": 0.3938701438365372, + "grad_norm": 3.4340512229391607e-06, + "kl": 13.9953125, + "learning_rate": 1.5183106616317048e-05, + "loss": 0.5596, + "reward": 0.06144716739654541, + "reward_std": 0.013058099864429096, + "rewards/reward_func_1": 0.06144716739654541, + "step": 1465 + }, + { + "completion_length": 2.0, + "epoch": 0.39521441053905093, + "grad_norm": 2.2754304609406972e-06, + "kl": 14.1265625, + "learning_rate": 1.5142914245569885e-05, + "loss": 0.5651, + "reward": 0.057547581195831296, + "reward_std": 0.009754268628603313, + "rewards/reward_func_1": 0.057547581195831296, + "step": 1470 + }, + { + "completion_length": 2.0, + "epoch": 0.39655867724156474, + "grad_norm": 2.4745954760874156e-06, + "kl": 13.73125, + "learning_rate": 1.5102608599106966e-05, + "loss": 0.5491, + "reward": 0.061440467834472656, + "reward_std": 0.010538783113224781, + "rewards/reward_func_1": 0.061440467834472656, + "step": 1475 + }, + { + "completion_length": 2.0, + "epoch": 0.3979029439440785, + "grad_norm": 6.128909262770321e-06, + "kl": 13.5640625, + "learning_rate": 1.5062190564683893e-05, + "loss": 0.5427, + "reward": 0.057086181640625, + "reward_std": 0.01106494044579449, + "rewards/reward_func_1": 0.057086181640625, + "step": 1480 + }, + { + "completion_length": 2.0, + "epoch": 0.3992472106465923, + "grad_norm": 1.8235305105918087e-06, + "kl": 13.7984375, + "learning_rate": 1.5021661032531692e-05, + "loss": 0.552, + "reward": 0.058293724060058595, + "reward_std": 0.010370958992280067, + "rewards/reward_func_1": 0.058293724060058595, + "step": 1485 + }, + { + "completion_length": 2.0, + "epoch": 0.40059147734910605, + "grad_norm": 2.1967098291497678e-06, + "kl": 13.8296875, + "learning_rate": 1.4981020895337175e-05, + "loss": 0.5532, + "reward": 0.05586849227547645, + "reward_std": 0.011479038602556103, + "rewards/reward_func_1": 0.05586849227547645, + "step": 1490 + }, + { + "completion_length": 2.0, + "epoch": 0.40193574405161986, + "grad_norm": 1.2756601108776522e-06, + "kl": 13.4796875, + "learning_rate": 1.4940271048223307e-05, + "loss": 0.5394, + "reward": 0.05812692642211914, + "reward_std": 0.012107894703513011, + "rewards/reward_func_1": 0.05812692642211914, + "step": 1495 + }, + { + "completion_length": 2.0, + "epoch": 0.4032800107541336, + "grad_norm": 1.9987196537840646e-06, + "kl": 13.553125, + "learning_rate": 1.4899412388729472e-05, + "loss": 0.5421, + "reward": 0.051792049407958986, + "reward_std": 0.01174548725830391, + "rewards/reward_func_1": 0.051792049407958986, + "step": 1500 + }, + { + "completion_length": 2.0, + "epoch": 0.4046242774566474, + "grad_norm": 1.8275987940796767e-06, + "kl": 13.9421875, + "learning_rate": 1.4858445816791718e-05, + "loss": 0.5575, + "reward": 0.05959300994873047, + "reward_std": 0.009343751921551301, + "rewards/reward_func_1": 0.05959300994873047, + "step": 1505 + }, + { + "completion_length": 2.0, + "epoch": 0.40596854415916117, + "grad_norm": 5.036072707298445e-06, + "kl": 13.7765625, + "learning_rate": 1.4817372234722918e-05, + "loss": 0.551, + "reward": 0.06010627746582031, + "reward_std": 0.01253571416818886, + "rewards/reward_func_1": 0.06010627746582031, + "step": 1510 + }, + { + "completion_length": 2.0, + "epoch": 0.407312810861675, + "grad_norm": 2.4326691345777363e-06, + "kl": 14.025, + "learning_rate": 1.4776192547192915e-05, + "loss": 0.5612, + "reward": 0.06703472137451172, + "reward_std": 0.010542747608269565, + "rewards/reward_func_1": 0.06703472137451172, + "step": 1515 + }, + { + "completion_length": 2.0, + "epoch": 0.40865707756418873, + "grad_norm": 2.35791821978637e-06, + "kl": 13.6375, + "learning_rate": 1.4734907661208587e-05, + "loss": 0.5454, + "reward": 0.05951080322265625, + "reward_std": 0.009829605106278904, + "rewards/reward_func_1": 0.05951080322265625, + "step": 1520 + }, + { + "completion_length": 2.0, + "epoch": 0.41000134426670254, + "grad_norm": 4.144026206631679e-06, + "kl": 13.9828125, + "learning_rate": 1.469351848609386e-05, + "loss": 0.5595, + "reward": 0.05874214172363281, + "reward_std": 0.008459481771024003, + "rewards/reward_func_1": 0.05874214172363281, + "step": 1525 + }, + { + "completion_length": 2.0, + "epoch": 0.4113456109692163, + "grad_norm": 4.286873263481539e-06, + "kl": 13.88125, + "learning_rate": 1.4652025933469705e-05, + "loss": 0.5551, + "reward": 0.06733989715576172, + "reward_std": 0.01017450345098041, + "rewards/reward_func_1": 0.06733989715576172, + "step": 1530 + }, + { + "completion_length": 2.0, + "epoch": 0.4126898776717301, + "grad_norm": 6.012166977598099e-06, + "kl": 13.8875, + "learning_rate": 1.461043091723403e-05, + "loss": 0.5554, + "reward": 0.06552686691284179, + "reward_std": 0.010358074885152746, + "rewards/reward_func_1": 0.06552686691284179, + "step": 1535 + }, + { + "completion_length": 2.0, + "epoch": 0.41403414437424385, + "grad_norm": 2.3474919998989208e-06, + "kl": 13.9390625, + "learning_rate": 1.4568734353541572e-05, + "loss": 0.5574, + "reward": 0.058895301818847653, + "reward_std": 0.012217024579877033, + "rewards/reward_func_1": 0.058895301818847653, + "step": 1540 + }, + { + "completion_length": 2.0, + "epoch": 0.4153784110767576, + "grad_norm": 3.684157127281651e-05, + "kl": 13.8734375, + "learning_rate": 1.4526937160783707e-05, + "loss": 0.555, + "reward": 0.05571174621582031, + "reward_std": 0.009185398211957362, + "rewards/reward_func_1": 0.05571174621582031, + "step": 1545 + }, + { + "completion_length": 2.0, + "epoch": 0.4167226777792714, + "grad_norm": 1.638373532841797e-06, + "kl": 14.0515625, + "learning_rate": 1.4485040259568228e-05, + "loss": 0.5622, + "reward": 0.06209487915039062, + "reward_std": 0.010831043922371464, + "rewards/reward_func_1": 0.06209487915039062, + "step": 1550 + }, + { + "completion_length": 2.0, + "epoch": 0.41806694448178516, + "grad_norm": 1.416817667632131e-06, + "kl": 13.9046875, + "learning_rate": 1.4443044572699058e-05, + "loss": 0.556, + "reward": 0.06337127685546876, + "reward_std": 0.00729374265865772, + "rewards/reward_func_1": 0.06337127685546876, + "step": 1555 + }, + { + "completion_length": 2.0, + "epoch": 0.41941121118429897, + "grad_norm": 4.485429144551745e-06, + "kl": 13.6140625, + "learning_rate": 1.440095102515595e-05, + "loss": 0.5445, + "reward": 0.060857629776000975, + "reward_std": 0.011825820308149559, + "rewards/reward_func_1": 0.060857629776000975, + "step": 1560 + }, + { + "completion_length": 2.0, + "epoch": 0.4207554778868127, + "grad_norm": 2.9214272672106745e-06, + "kl": 14.034375, + "learning_rate": 1.4358760544074074e-05, + "loss": 0.5612, + "reward": 0.06148242950439453, + "reward_std": 0.012079593736416427, + "rewards/reward_func_1": 0.06148242950439453, + "step": 1565 + }, + { + "completion_length": 2.0, + "epoch": 0.42209974458932653, + "grad_norm": 2.7933485853282036e-06, + "kl": 13.725, + "learning_rate": 1.4316474058723635e-05, + "loss": 0.549, + "reward": 0.06508445739746094, + "reward_std": 0.009462902668019524, + "rewards/reward_func_1": 0.06508445739746094, + "step": 1570 + }, + { + "completion_length": 2.0, + "epoch": 0.4234440112918403, + "grad_norm": 1.132559646066511e-05, + "kl": 13.465625, + "learning_rate": 1.4274092500489376e-05, + "loss": 0.5386, + "reward": 0.06119532585144043, + "reward_std": 0.010296352157456567, + "rewards/reward_func_1": 0.06119532585144043, + "step": 1575 + }, + { + "completion_length": 2.0, + "epoch": 0.4247882779943541, + "grad_norm": 3.3802455163822742e-06, + "kl": 13.753125, + "learning_rate": 1.423161680285009e-05, + "loss": 0.55, + "reward": 0.05784816741943359, + "reward_std": 0.011395523198734736, + "rewards/reward_func_1": 0.05784816741943359, + "step": 1580 + }, + { + "completion_length": 2.0, + "epoch": 0.42613254469686784, + "grad_norm": 3.657454954009154e-06, + "kl": 13.796875, + "learning_rate": 1.4189047901358033e-05, + "loss": 0.5516, + "reward": 0.0637430191040039, + "reward_std": 0.012986108286713715, + "rewards/reward_func_1": 0.0637430191040039, + "step": 1585 + }, + { + "completion_length": 2.0, + "epoch": 0.42747681139938165, + "grad_norm": 2.5834062853391515e-06, + "kl": 13.596875, + "learning_rate": 1.4146386733618338e-05, + "loss": 0.5439, + "reward": 0.059173583984375, + "reward_std": 0.011515995301306248, + "rewards/reward_func_1": 0.059173583984375, + "step": 1590 + }, + { + "completion_length": 2.0, + "epoch": 0.4288210781018954, + "grad_norm": 4.3796212594315875e-06, + "kl": 13.8921875, + "learning_rate": 1.4103634239268355e-05, + "loss": 0.5556, + "reward": 0.06446866989135742, + "reward_std": 0.009868808073224499, + "rewards/reward_func_1": 0.06446866989135742, + "step": 1595 + }, + { + "completion_length": 2.0, + "epoch": 0.4301653448044092, + "grad_norm": 8.73978751769755e-06, + "kl": 14.025, + "learning_rate": 1.4060791359956956e-05, + "loss": 0.5611, + "reward": 0.0633920669555664, + "reward_std": 0.01284162982410635, + "rewards/reward_func_1": 0.0633920669555664, + "step": 1600 + }, + { + "completion_length": 2.0, + "epoch": 0.43150961150692296, + "grad_norm": 3.975842446379829e-06, + "kl": 13.9109375, + "learning_rate": 1.401785903932379e-05, + "loss": 0.5564, + "reward": 0.06275310516357421, + "reward_std": 0.011346189048344968, + "rewards/reward_func_1": 0.06275310516357421, + "step": 1605 + }, + { + "completion_length": 2.0, + "epoch": 0.43285387820943677, + "grad_norm": 2.2908070604898967e-06, + "kl": 13.628125, + "learning_rate": 1.3974838222978517e-05, + "loss": 0.5454, + "reward": 0.06408562660217285, + "reward_std": 0.011092856073810253, + "rewards/reward_func_1": 0.06408562660217285, + "step": 1610 + }, + { + "completion_length": 2.0, + "epoch": 0.4341981449119505, + "grad_norm": 4.72618330604746e-06, + "kl": 14.3875, + "learning_rate": 1.3931729858479954e-05, + "loss": 0.5759, + "reward": 0.06461887359619141, + "reward_std": 0.009859557841264178, + "rewards/reward_func_1": 0.06461887359619141, + "step": 1615 + }, + { + "completion_length": 2.0, + "epoch": 0.43554241161446433, + "grad_norm": 3.2984760309773264e-06, + "kl": 14.025, + "learning_rate": 1.3888534895315222e-05, + "loss": 0.561, + "reward": 0.06250219345092774, + "reward_std": 0.01138484149123542, + "rewards/reward_func_1": 0.06250219345092774, + "step": 1620 + }, + { + "completion_length": 2.0, + "epoch": 0.4368866783169781, + "grad_norm": 1.7083860939237638e-06, + "kl": 13.5921875, + "learning_rate": 1.384525428487883e-05, + "loss": 0.5439, + "reward": 0.057589149475097655, + "reward_std": 0.012263275221630465, + "rewards/reward_func_1": 0.057589149475097655, + "step": 1625 + }, + { + "completion_length": 2.0, + "epoch": 0.4382309450194919, + "grad_norm": 1.8176706362282857e-05, + "kl": 13.703125, + "learning_rate": 1.380188898045172e-05, + "loss": 0.5484, + "reward": 0.05926389694213867, + "reward_std": 0.011890075955307111, + "rewards/reward_func_1": 0.05926389694213867, + "step": 1630 + }, + { + "completion_length": 2.0, + "epoch": 0.43957521172200564, + "grad_norm": 1.6534449969185516e-06, + "kl": 13.7234375, + "learning_rate": 1.3758439937180269e-05, + "loss": 0.5489, + "reward": 0.06111717224121094, + "reward_std": 0.009865944929333636, + "rewards/reward_func_1": 0.06111717224121094, + "step": 1635 + }, + { + "completion_length": 2.0, + "epoch": 0.44091947842451945, + "grad_norm": 5.1437118600006215e-06, + "kl": 13.85625, + "learning_rate": 1.371490811205524e-05, + "loss": 0.554, + "reward": 0.06157407760620117, + "reward_std": 0.012079483611159958, + "rewards/reward_func_1": 0.06157407760620117, + "step": 1640 + }, + { + "completion_length": 2.0, + "epoch": 0.4422637451270332, + "grad_norm": 5.265788786346093e-06, + "kl": 13.75, + "learning_rate": 1.3671294463890734e-05, + "loss": 0.5499, + "reward": 0.057445335388183597, + "reward_std": 0.01464254588354379, + "rewards/reward_func_1": 0.057445335388183597, + "step": 1645 + }, + { + "completion_length": 2.0, + "epoch": 0.44360801182954696, + "grad_norm": 1.6486019376316108e-05, + "kl": 14.0984375, + "learning_rate": 1.3627599953303036e-05, + "loss": 0.5636, + "reward": 0.062333667278289796, + "reward_std": 0.00997068356446107, + "rewards/reward_func_1": 0.062333667278289796, + "step": 1650 + }, + { + "completion_length": 2.0, + "epoch": 0.44495227853206076, + "grad_norm": 1.7718589333526324e-06, + "kl": 13.64375, + "learning_rate": 1.3583825542689486e-05, + "loss": 0.5456, + "reward": 0.05308668613433838, + "reward_std": 0.01278767061594408, + "rewards/reward_func_1": 0.05308668613433838, + "step": 1655 + }, + { + "completion_length": 2.0, + "epoch": 0.4462965452345745, + "grad_norm": 5.0791340981959365e-06, + "kl": 14.1375, + "learning_rate": 1.353997219620726e-05, + "loss": 0.5657, + "reward": 0.06480164527893066, + "reward_std": 0.010595990939327749, + "rewards/reward_func_1": 0.06480164527893066, + "step": 1660 + }, + { + "completion_length": 2.0, + "epoch": 0.4476408119370883, + "grad_norm": 1.975413169930107e-06, + "kl": 13.9046875, + "learning_rate": 1.3496040879752146e-05, + "loss": 0.5562, + "reward": 0.058099555969238284, + "reward_std": 0.012097873717721086, + "rewards/reward_func_1": 0.058099555969238284, + "step": 1665 + }, + { + "completion_length": 2.0, + "epoch": 0.4489850786396021, + "grad_norm": 3.103926246694755e-06, + "kl": 14.0609375, + "learning_rate": 1.3452032560937271e-05, + "loss": 0.5626, + "reward": 0.06475410461425782, + "reward_std": 0.01139398144750885, + "rewards/reward_func_1": 0.06475410461425782, + "step": 1670 + }, + { + "completion_length": 2.0, + "epoch": 0.4503293453421159, + "grad_norm": 1.6186576203836012e-06, + "kl": 14.034375, + "learning_rate": 1.3407948209071779e-05, + "loss": 0.5614, + "reward": 0.06743978261947632, + "reward_std": 0.013266765065782237, + "rewards/reward_func_1": 0.06743978261947632, + "step": 1675 + }, + { + "completion_length": 2.0, + "epoch": 0.45167361204462964, + "grad_norm": 1.8007198377745226e-05, + "kl": 14.275, + "learning_rate": 1.3363788795139487e-05, + "loss": 0.571, + "reward": 0.06518707275390626, + "reward_std": 0.011862215257133357, + "rewards/reward_func_1": 0.06518707275390626, + "step": 1680 + }, + { + "completion_length": 2.0, + "epoch": 0.45301787874714344, + "grad_norm": 5.033749403082766e-06, + "kl": 13.915625, + "learning_rate": 1.3319555291777501e-05, + "loss": 0.5568, + "reward": 0.06435184478759766, + "reward_std": 0.011379225243217661, + "rewards/reward_func_1": 0.06435184478759766, + "step": 1685 + }, + { + "completion_length": 2.0, + "epoch": 0.4543621454496572, + "grad_norm": 6.873391612316482e-06, + "kl": 13.596875, + "learning_rate": 1.3275248673254788e-05, + "loss": 0.544, + "reward": 0.06184234619140625, + "reward_std": 0.010739423423365224, + "rewards/reward_func_1": 0.06184234619140625, + "step": 1690 + }, + { + "completion_length": 2.0, + "epoch": 0.455706412152171, + "grad_norm": 4.609265033650445e-06, + "kl": 13.6984375, + "learning_rate": 1.3230869915450722e-05, + "loss": 0.5481, + "reward": 0.05451488494873047, + "reward_std": 0.010388137760855898, + "rewards/reward_func_1": 0.05451488494873047, + "step": 1695 + }, + { + "completion_length": 2.0, + "epoch": 0.45705067885468476, + "grad_norm": 2.1986843421473168e-06, + "kl": 13.5859375, + "learning_rate": 1.3186419995833582e-05, + "loss": 0.5436, + "reward": 0.05490055084228516, + "reward_std": 0.011986211253679357, + "rewards/reward_func_1": 0.05490055084228516, + "step": 1700 + }, + { + "completion_length": 2.0, + "epoch": 0.45839494555719856, + "grad_norm": 3.9593110159330536e-06, + "kl": 13.828125, + "learning_rate": 1.3141899893439032e-05, + "loss": 0.5533, + "reward": 0.061890792846679685, + "reward_std": 0.01013331833673874, + "rewards/reward_func_1": 0.061890792846679685, + "step": 1705 + }, + { + "completion_length": 2.0, + "epoch": 0.4597392122597123, + "grad_norm": 2.6946911475533852e-06, + "kl": 14.0953125, + "learning_rate": 1.3097310588848555e-05, + "loss": 0.5641, + "reward": 0.06313896179199219, + "reward_std": 0.013185867536230944, + "rewards/reward_func_1": 0.06313896179199219, + "step": 1710 + }, + { + "completion_length": 2.0, + "epoch": 0.4610834789622261, + "grad_norm": 3.4700431115197716e-06, + "kl": 13.9703125, + "learning_rate": 1.3052653064167848e-05, + "loss": 0.5591, + "reward": 0.057857322692871097, + "reward_std": 0.01133649832190713, + "rewards/reward_func_1": 0.057857322692871097, + "step": 1715 + }, + { + "completion_length": 2.0, + "epoch": 0.4624277456647399, + "grad_norm": 1.3458391549647786e-05, + "kl": 13.5859375, + "learning_rate": 1.3007928303005201e-05, + "loss": 0.5436, + "reward": 0.05681304931640625, + "reward_std": 0.010392763031995855, + "rewards/reward_func_1": 0.05681304931640625, + "step": 1720 + }, + { + "completion_length": 2.0, + "epoch": 0.4637720123672537, + "grad_norm": 4.882398570771329e-06, + "kl": 13.8015625, + "learning_rate": 1.2963137290449823e-05, + "loss": 0.552, + "reward": 0.062281131744384766, + "reward_std": 0.012929836504918057, + "rewards/reward_func_1": 0.062281131744384766, + "step": 1725 + }, + { + "completion_length": 2.0, + "epoch": 0.46511627906976744, + "grad_norm": 6.2865415202395525e-06, + "kl": 13.8671875, + "learning_rate": 1.291828101305015e-05, + "loss": 0.5546, + "reward": 0.061757802963256836, + "reward_std": 0.0114550436315767, + "rewards/reward_func_1": 0.061757802963256836, + "step": 1730 + }, + { + "completion_length": 2.0, + "epoch": 0.46646054577228124, + "grad_norm": 2.4083929019980133e-06, + "kl": 14.290625, + "learning_rate": 1.2873360458792114e-05, + "loss": 0.5719, + "reward": 0.06473960876464843, + "reward_std": 0.012107564476900734, + "rewards/reward_func_1": 0.06473960876464843, + "step": 1735 + }, + { + "completion_length": 2.0, + "epoch": 0.467804812474795, + "grad_norm": 2.2838785298517905e-05, + "kl": 13.7671875, + "learning_rate": 1.2828376617077385e-05, + "loss": 0.5504, + "reward": 0.059980010986328124, + "reward_std": 0.00752433567395201, + "rewards/reward_func_1": 0.059980010986328124, + "step": 1740 + }, + { + "completion_length": 2.0, + "epoch": 0.4691490791773088, + "grad_norm": 2.683865886865533e-06, + "kl": 13.9546875, + "learning_rate": 1.2783330478701572e-05, + "loss": 0.558, + "reward": 0.05912628173828125, + "reward_std": 0.010197188393794932, + "rewards/reward_func_1": 0.05912628173828125, + "step": 1745 + }, + { + "completion_length": 2.0, + "epoch": 0.47049334587982256, + "grad_norm": 2.190610530306003e-06, + "kl": 14.090625, + "learning_rate": 1.2738223035832412e-05, + "loss": 0.5638, + "reward": 0.06425952911376953, + "reward_std": 0.008773326113077929, + "rewards/reward_func_1": 0.06425952911376953, + "step": 1750 + }, + { + "completion_length": 2.0, + "epoch": 0.47183761258233636, + "grad_norm": 3.1278939331969013e-06, + "kl": 14.303125, + "learning_rate": 1.2693055281987903e-05, + "loss": 0.5719, + "reward": 0.06397314071655273, + "reward_std": 0.01130247107357718, + "rewards/reward_func_1": 0.06397314071655273, + "step": 1755 + }, + { + "completion_length": 2.0, + "epoch": 0.4731818792848501, + "grad_norm": 1.2678321581915952e-05, + "kl": 13.93125, + "learning_rate": 1.264782821201443e-05, + "loss": 0.5571, + "reward": 0.057790946960449216, + "reward_std": 0.010541425982955844, + "rewards/reward_func_1": 0.057790946960449216, + "step": 1760 + }, + { + "completion_length": 2.0, + "epoch": 0.47452614598736387, + "grad_norm": 2.6648983748600585e-06, + "kl": 13.5921875, + "learning_rate": 1.2602542822064852e-05, + "loss": 0.5438, + "reward": 0.06369266510009766, + "reward_std": 0.010999528719548835, + "rewards/reward_func_1": 0.06369266510009766, + "step": 1765 + }, + { + "completion_length": 2.0, + "epoch": 0.4758704126898777, + "grad_norm": 9.32496550376527e-06, + "kl": 14.321875, + "learning_rate": 1.2557200109576557e-05, + "loss": 0.5729, + "reward": 0.061204147338867185, + "reward_std": 0.013532968414074276, + "rewards/reward_func_1": 0.061204147338867185, + "step": 1770 + }, + { + "completion_length": 2.0, + "epoch": 0.4772146793923914, + "grad_norm": 1.453070012757962e-06, + "kl": 13.70625, + "learning_rate": 1.2511801073249499e-05, + "loss": 0.5482, + "reward": 0.060839509963989256, + "reward_std": 0.01080455974151846, + "rewards/reward_func_1": 0.060839509963989256, + "step": 1775 + }, + { + "completion_length": 2.0, + "epoch": 0.47855894609490524, + "grad_norm": 2.337250180062256e-06, + "kl": 13.696875, + "learning_rate": 1.2466346713024194e-05, + "loss": 0.5479, + "reward": 0.05852031707763672, + "reward_std": 0.009971881102683256, + "rewards/reward_func_1": 0.05852031707763672, + "step": 1780 + }, + { + "completion_length": 2.0, + "epoch": 0.479903212797419, + "grad_norm": 2.7332425815984607e-06, + "kl": 13.7046875, + "learning_rate": 1.2420838030059704e-05, + "loss": 0.5481, + "reward": 0.059021949768066406, + "reward_std": 0.01221724480674311, + "rewards/reward_func_1": 0.059021949768066406, + "step": 1785 + }, + { + "completion_length": 2.0, + "epoch": 0.4812474794999328, + "grad_norm": 2.8585989184648497e-06, + "kl": 13.725, + "learning_rate": 1.2375276026711576e-05, + "loss": 0.5493, + "reward": 0.0618377685546875, + "reward_std": 0.011973217026388738, + "rewards/reward_func_1": 0.0618377685546875, + "step": 1790 + }, + { + "completion_length": 2.0, + "epoch": 0.48259174620244655, + "grad_norm": 2.026320362347178e-05, + "kl": 14.0484375, + "learning_rate": 1.232966170650977e-05, + "loss": 0.5619, + "reward": 0.062344479560852054, + "reward_std": 0.009559368583722971, + "rewards/reward_func_1": 0.062344479560852054, + "step": 1795 + }, + { + "completion_length": 2.0, + "epoch": 0.48393601290496036, + "grad_norm": 2.701003268157365e-06, + "kl": 13.63125, + "learning_rate": 1.2283996074136566e-05, + "loss": 0.5452, + "reward": 0.06439933776855469, + "reward_std": 0.011508286974640214, + "rewards/reward_func_1": 0.06439933776855469, + "step": 1800 + }, + { + "completion_length": 2.0, + "epoch": 0.4852802796074741, + "grad_norm": 3.2230218494078144e-06, + "kl": 13.9390625, + "learning_rate": 1.2238280135404411e-05, + "loss": 0.5573, + "reward": 0.06914815902709961, + "reward_std": 0.010676544258603825, + "rewards/reward_func_1": 0.06914815902709961, + "step": 1805 + }, + { + "completion_length": 2.0, + "epoch": 0.4866245463099879, + "grad_norm": 4.649204583984101e-06, + "kl": 13.778125, + "learning_rate": 1.2192514897233789e-05, + "loss": 0.5511, + "reward": 0.0602226972579956, + "reward_std": 0.011667439006851054, + "rewards/reward_func_1": 0.0602226972579956, + "step": 1810 + }, + { + "completion_length": 2.0, + "epoch": 0.48796881301250167, + "grad_norm": 1.511799382569734e-06, + "kl": 13.9296875, + "learning_rate": 1.2146701367631027e-05, + "loss": 0.5574, + "reward": 0.06371002197265625, + "reward_std": 0.010727530277472396, + "rewards/reward_func_1": 0.06371002197265625, + "step": 1815 + }, + { + "completion_length": 2.0, + "epoch": 0.4893130797150155, + "grad_norm": 4.017825631308369e-06, + "kl": 13.875, + "learning_rate": 1.2100840555666101e-05, + "loss": 0.5552, + "reward": 0.06100995540618896, + "reward_std": 0.01184462348173838, + "rewards/reward_func_1": 0.06100995540618896, + "step": 1820 + }, + { + "completion_length": 2.0, + "epoch": 0.4906573464175292, + "grad_norm": 2.1121263671375345e-06, + "kl": 13.44375, + "learning_rate": 1.205493347145041e-05, + "loss": 0.5377, + "reward": 0.056847544759511946, + "reward_std": 0.013147425842907979, + "rewards/reward_func_1": 0.056847544759511946, + "step": 1825 + }, + { + "completion_length": 2.0, + "epoch": 0.49200161312004304, + "grad_norm": 3.4769893773045624e-06, + "kl": 13.3515625, + "learning_rate": 1.2008981126114523e-05, + "loss": 0.5341, + "reward": 0.0553741455078125, + "reward_std": 0.010613445060516823, + "rewards/reward_func_1": 0.0553741455078125, + "step": 1830 + }, + { + "completion_length": 2.0, + "epoch": 0.4933458798225568, + "grad_norm": 1.7378441043547355e-05, + "kl": 13.7125, + "learning_rate": 1.1962984531785922e-05, + "loss": 0.5482, + "reward": 0.05479507446289063, + "reward_std": 0.009559368582631577, + "rewards/reward_func_1": 0.05479507446289063, + "step": 1835 + }, + { + "completion_length": 2.0, + "epoch": 0.4946901465250706, + "grad_norm": 7.995906344149262e-06, + "kl": 13.79375, + "learning_rate": 1.1916944701566688e-05, + "loss": 0.5518, + "reward": 0.062036323547363284, + "reward_std": 0.010829502148044411, + "rewards/reward_func_1": 0.062036323547363284, + "step": 1840 + }, + { + "completion_length": 2.0, + "epoch": 0.49603441322758435, + "grad_norm": 5.335733931133291e-06, + "kl": 14.0375, + "learning_rate": 1.1870862649511201e-05, + "loss": 0.5616, + "reward": 0.0632176399230957, + "reward_std": 0.011605303342366823, + "rewards/reward_func_1": 0.0632176399230957, + "step": 1845 + }, + { + "completion_length": 2.0, + "epoch": 0.49737867993009816, + "grad_norm": 3.3654257549642352e-06, + "kl": 13.9046875, + "learning_rate": 1.1824739390603801e-05, + "loss": 0.5563, + "reward": 0.06469783782958985, + "reward_std": 0.008109517797129229, + "rewards/reward_func_1": 0.06469783782958985, + "step": 1850 + }, + { + "completion_length": 2.0, + "epoch": 0.4987229466326119, + "grad_norm": 3.5574796584114665e-06, + "kl": 13.8203125, + "learning_rate": 1.1778575940736439e-05, + "loss": 0.5526, + "reward": 0.05752217769622803, + "reward_std": 0.011466909085720544, + "rewards/reward_func_1": 0.05752217769622803, + "step": 1855 + }, + { + "completion_length": 2.0, + "epoch": 0.5000672133351257, + "grad_norm": 4.107642325834604e-06, + "kl": 13.8015625, + "learning_rate": 1.1732373316686292e-05, + "loss": 0.5522, + "reward": 0.06678012609481812, + "reward_std": 0.011566609619330847, + "rewards/reward_func_1": 0.06678012609481812, + "step": 1860 + }, + { + "completion_length": 2.0, + "epoch": 0.5014114800376395, + "grad_norm": 8.976133358373772e-06, + "kl": 13.6671875, + "learning_rate": 1.1686132536093367e-05, + "loss": 0.5469, + "reward": 0.05934562683105469, + "reward_std": 0.012758818920701742, + "rewards/reward_func_1": 0.05934562683105469, + "step": 1865 + }, + { + "completion_length": 2.0, + "epoch": 0.5027557467401532, + "grad_norm": 6.34239995633834e-06, + "kl": 13.81875, + "learning_rate": 1.1639854617438098e-05, + "loss": 0.5528, + "reward": 0.05746040344238281, + "reward_std": 0.010571158733364427, + "rewards/reward_func_1": 0.05746040344238281, + "step": 1870 + }, + { + "completion_length": 2.0, + "epoch": 0.504100013442667, + "grad_norm": 5.687683824362466e-06, + "kl": 13.778125, + "learning_rate": 1.1593540580018904e-05, + "loss": 0.5512, + "reward": 0.05971870422363281, + "reward_std": 0.010947331442730501, + "rewards/reward_func_1": 0.05971870422363281, + "step": 1875 + }, + { + "completion_length": 2.0, + "epoch": 0.5054442801451808, + "grad_norm": 4.094200448889751e-06, + "kl": 13.8390625, + "learning_rate": 1.1547191443929738e-05, + "loss": 0.5535, + "reward": 0.059337806701660153, + "reward_std": 0.010353339680295903, + "rewards/reward_func_1": 0.059337806701660153, + "step": 1880 + }, + { + "completion_length": 2.0, + "epoch": 0.5067885468476946, + "grad_norm": 3.398595481485245e-06, + "kl": 13.9453125, + "learning_rate": 1.1500808230037628e-05, + "loss": 0.5578, + "reward": 0.05960531234741211, + "reward_std": 0.0105580543531687, + "rewards/reward_func_1": 0.05960531234741211, + "step": 1885 + }, + { + "completion_length": 2.0, + "epoch": 0.5081328135502083, + "grad_norm": 6.3564093579771e-06, + "kl": 13.6453125, + "learning_rate": 1.145439195996018e-05, + "loss": 0.5457, + "reward": 0.05985393524169922, + "reward_std": 0.011576782021438702, + "rewards/reward_func_1": 0.05985393524169922, + "step": 1890 + }, + { + "completion_length": 2.0, + "epoch": 0.5094770802527221, + "grad_norm": 4.291194272809662e-06, + "kl": 13.728125, + "learning_rate": 1.1407943656043088e-05, + "loss": 0.5492, + "reward": 0.062256813049316406, + "reward_std": 0.01353142662846949, + "rewards/reward_func_1": 0.062256813049316406, + "step": 1895 + }, + { + "completion_length": 2.0, + "epoch": 0.510821346955236, + "grad_norm": 1.4876853811074398e-06, + "kl": 13.75625, + "learning_rate": 1.1361464341337604e-05, + "loss": 0.5501, + "reward": 0.05925731658935547, + "reward_std": 0.009176148010010366, + "rewards/reward_func_1": 0.05925731658935547, + "step": 1900 + }, + { + "completion_length": 2.0, + "epoch": 0.5121656136577497, + "grad_norm": 5.476561454997864e-06, + "kl": 14.0125, + "learning_rate": 1.1314955039578017e-05, + "loss": 0.5605, + "reward": 0.06184120178222656, + "reward_std": 0.010909009404713288, + "rewards/reward_func_1": 0.06184120178222656, + "step": 1905 + }, + { + "completion_length": 2.0, + "epoch": 0.5135098803602635, + "grad_norm": 6.360010047501419e-06, + "kl": 13.7359375, + "learning_rate": 1.126841677515909e-05, + "loss": 0.5494, + "reward": 0.06381258964538575, + "reward_std": 0.009671746863750741, + "rewards/reward_func_1": 0.06381258964538575, + "step": 1910 + }, + { + "completion_length": 2.0, + "epoch": 0.5148541470627772, + "grad_norm": 3.01595628116047e-06, + "kl": 13.8765625, + "learning_rate": 1.1221850573113515e-05, + "loss": 0.5552, + "reward": 0.062322235107421874, + "reward_std": 0.011111631775202113, + "rewards/reward_func_1": 0.062322235107421874, + "step": 1915 + }, + { + "completion_length": 2.0, + "epoch": 0.5161984137652911, + "grad_norm": 4.450800588529091e-06, + "kl": 13.7265625, + "learning_rate": 1.117525745908932e-05, + "loss": 0.5491, + "reward": 0.06123924255371094, + "reward_std": 0.011204353353969054, + "rewards/reward_func_1": 0.06123924255371094, + "step": 1920 + }, + { + "completion_length": 2.0, + "epoch": 0.5175426804678048, + "grad_norm": 6.122299964772537e-06, + "kl": 13.78125, + "learning_rate": 1.1128638459327288e-05, + "loss": 0.5513, + "reward": 0.06520743370056152, + "reward_std": 0.010965666610718471, + "rewards/reward_func_1": 0.06520743370056152, + "step": 1925 + }, + { + "completion_length": 2.0, + "epoch": 0.5188869471703186, + "grad_norm": 2.6760865239339182e-06, + "kl": 13.5875, + "learning_rate": 1.1081994600638353e-05, + "loss": 0.5434, + "reward": 0.056897735595703124, + "reward_std": 0.009793705747870262, + "rewards/reward_func_1": 0.056897735595703124, + "step": 1930 + }, + { + "completion_length": 2.0, + "epoch": 0.5202312138728323, + "grad_norm": 3.4807439988071565e-06, + "kl": 13.7859375, + "learning_rate": 1.1035326910380973e-05, + "loss": 0.5516, + "reward": 0.05627828128635883, + "reward_std": 0.011038010333140846, + "rewards/reward_func_1": 0.05627828128635883, + "step": 1935 + }, + { + "completion_length": 2.0, + "epoch": 0.5215754805753462, + "grad_norm": 3.09952747556963e-06, + "kl": 13.984375, + "learning_rate": 1.0988636416438521e-05, + "loss": 0.5592, + "reward": 0.057964515686035153, + "reward_std": 0.010637010936625302, + "rewards/reward_func_1": 0.057964515686035153, + "step": 1940 + }, + { + "completion_length": 2.0, + "epoch": 0.52291974727786, + "grad_norm": 2.1990246750647202e-05, + "kl": 13.753125, + "learning_rate": 1.094192414719663e-05, + "loss": 0.5502, + "reward": 0.061472320556640626, + "reward_std": 0.010977944992919219, + "rewards/reward_func_1": 0.061472320556640626, + "step": 1945 + }, + { + "completion_length": 2.0, + "epoch": 0.5242640139803737, + "grad_norm": 9.643837984185666e-06, + "kl": 13.534375, + "learning_rate": 1.0895191131520541e-05, + "loss": 0.5414, + "reward": 0.05615215301513672, + "reward_std": 0.010869586077751592, + "rewards/reward_func_1": 0.05615215301513672, + "step": 1950 + }, + { + "completion_length": 2.0, + "epoch": 0.5256082806828875, + "grad_norm": 2.5311317131127e-06, + "kl": 14.0046875, + "learning_rate": 1.0848438398732462e-05, + "loss": 0.5601, + "reward": 0.06194038391113281, + "reward_std": 0.012956816235237057, + "rewards/reward_func_1": 0.06194038391113281, + "step": 1955 + }, + { + "completion_length": 2.0, + "epoch": 0.5269525473854013, + "grad_norm": 5.817757482873276e-06, + "kl": 13.809375, + "learning_rate": 1.0801666978588865e-05, + "loss": 0.5522, + "reward": 0.06431331634521484, + "reward_std": 0.011656289092206862, + "rewards/reward_func_1": 0.06431331634521484, + "step": 1960 + }, + { + "completion_length": 2.0, + "epoch": 0.5282968140879151, + "grad_norm": 2.050035391221172e-06, + "kl": 13.7453125, + "learning_rate": 1.0754877901257831e-05, + "loss": 0.5499, + "reward": 0.059846115112304685, + "reward_std": 0.01306429406904499, + "rewards/reward_func_1": 0.059846115112304685, + "step": 1965 + }, + { + "completion_length": 2.0, + "epoch": 0.5296410807904288, + "grad_norm": 1.0048594958789181e-05, + "kl": 13.7875, + "learning_rate": 1.0708072197296356e-05, + "loss": 0.5518, + "reward": 0.06069736480712891, + "reward_std": 0.011032124502526131, + "rewards/reward_func_1": 0.06069736480712891, + "step": 1970 + }, + { + "completion_length": 2.0, + "epoch": 0.5309853474929426, + "grad_norm": 2.601335108920466e-05, + "kl": 13.584375, + "learning_rate": 1.0661250897627634e-05, + "loss": 0.5436, + "reward": 0.053227472305297854, + "reward_std": 0.010092408429773058, + "rewards/reward_func_1": 0.053227472305297854, + "step": 1975 + }, + { + "completion_length": 2.0, + "epoch": 0.5323296141954563, + "grad_norm": 1.8432465367368422e-06, + "kl": 13.54375, + "learning_rate": 1.061441503351837e-05, + "loss": 0.5418, + "reward": 0.05695056915283203, + "reward_std": 0.009967696487728972, + "rewards/reward_func_1": 0.05695056915283203, + "step": 1980 + }, + { + "completion_length": 2.0, + "epoch": 0.5336738808979702, + "grad_norm": 7.412290869979188e-06, + "kl": 13.8828125, + "learning_rate": 1.056756563655607e-05, + "loss": 0.5549, + "reward": 0.0633920669555664, + "reward_std": 0.01274736642735661, + "rewards/reward_func_1": 0.0633920669555664, + "step": 1985 + }, + { + "completion_length": 2.0, + "epoch": 0.5350181476004839, + "grad_norm": 4.0991371861309744e-06, + "kl": 13.6109375, + "learning_rate": 1.052070373862629e-05, + "loss": 0.5444, + "reward": 0.05980701446533203, + "reward_std": 0.01131843865441624, + "rewards/reward_func_1": 0.05980701446533203, + "step": 1990 + }, + { + "completion_length": 2.0, + "epoch": 0.5363624143029977, + "grad_norm": 1.9139324649586342e-06, + "kl": 13.7296875, + "learning_rate": 1.047383037188994e-05, + "loss": 0.5491, + "reward": 0.060098457336425784, + "reward_std": 0.011104363739286782, + "rewards/reward_func_1": 0.060098457336425784, + "step": 1995 + }, + { + "completion_length": 2.0, + "epoch": 0.5377066810055114, + "grad_norm": 2.1989344531903043e-05, + "kl": 13.8578125, + "learning_rate": 1.0426946568760534e-05, + "loss": 0.5541, + "reward": 0.06157665252685547, + "reward_std": 0.010787656143656931, + "rewards/reward_func_1": 0.06157665252685547, + "step": 2000 + }, + { + "completion_length": 2.0, + "epoch": 0.5390509477080253, + "grad_norm": 5.320031505107181e-06, + "kl": 13.80625, + "learning_rate": 1.0380053361881454e-05, + "loss": 0.5523, + "reward": 0.06187152862548828, + "reward_std": 0.010924646601051791, + "rewards/reward_func_1": 0.06187152862548828, + "step": 2005 + }, + { + "completion_length": 2.0, + "epoch": 0.5403952144105391, + "grad_norm": 2.400984612904722e-06, + "kl": 13.659375, + "learning_rate": 1.0333151784103204e-05, + "loss": 0.5463, + "reward": 0.06603701114654541, + "reward_std": 0.012158522802928928, + "rewards/reward_func_1": 0.06603701114654541, + "step": 2010 + }, + { + "completion_length": 2.0, + "epoch": 0.5417394811130528, + "grad_norm": 3.1185063562588766e-06, + "kl": 13.85, + "learning_rate": 1.0286242868460658e-05, + "loss": 0.5541, + "reward": 0.06205949783325195, + "reward_std": 0.011392109425651142, + "rewards/reward_func_1": 0.06205949783325195, + "step": 2015 + }, + { + "completion_length": 2.0, + "epoch": 0.5430837478155666, + "grad_norm": 1.7467871202825336e-06, + "kl": 14.109375, + "learning_rate": 1.0239327648150324e-05, + "loss": 0.5644, + "reward": 0.05861544609069824, + "reward_std": 0.012278527018861497, + "rewards/reward_func_1": 0.05861544609069824, + "step": 2020 + }, + { + "completion_length": 2.0, + "epoch": 0.5444280145180804, + "grad_norm": 2.397207026660908e-06, + "kl": 13.9265625, + "learning_rate": 1.0192407156507555e-05, + "loss": 0.557, + "reward": 0.06040668487548828, + "reward_std": 0.010451347306661774, + "rewards/reward_func_1": 0.06040668487548828, + "step": 2025 + }, + { + "completion_length": 2.0, + "epoch": 0.5457722812205942, + "grad_norm": 2.443831590426271e-06, + "kl": 13.7421875, + "learning_rate": 1.0145482426983829e-05, + "loss": 0.5496, + "reward": 0.05943064689636231, + "reward_std": 0.010262710415554465, + "rewards/reward_func_1": 0.05943064689636231, + "step": 2030 + }, + { + "completion_length": 2.0, + "epoch": 0.5471165479231079, + "grad_norm": 1.7376810319547076e-06, + "kl": 14.0484375, + "learning_rate": 1.0098554493123946e-05, + "loss": 0.5619, + "reward": 0.06599822044372558, + "reward_std": 0.010502938941499451, + "rewards/reward_func_1": 0.06599822044372558, + "step": 2035 + }, + { + "completion_length": 2.0, + "epoch": 0.5484608146256217, + "grad_norm": 1.9027496591661475e-06, + "kl": 13.709375, + "learning_rate": 1.0051624388543303e-05, + "loss": 0.5482, + "reward": 0.06099987030029297, + "reward_std": 0.011544406516259187, + "rewards/reward_func_1": 0.06099987030029297, + "step": 2040 + }, + { + "completion_length": 2.0, + "epoch": 0.5498050813281355, + "grad_norm": 3.951816779590445e-06, + "kl": 13.634375, + "learning_rate": 1.0004693146905086e-05, + "loss": 0.5452, + "reward": 0.06201457977294922, + "reward_std": 0.00889511961795506, + "rewards/reward_func_1": 0.06201457977294922, + "step": 2045 + }, + { + "completion_length": 2.0, + "epoch": 0.5511493480306493, + "grad_norm": 4.0156542127078865e-06, + "kl": 13.6765625, + "learning_rate": 9.957761801897546e-06, + "loss": 0.547, + "reward": 0.05980491638183594, + "reward_std": 0.011892608562629903, + "rewards/reward_func_1": 0.05980491638183594, + "step": 2050 + }, + { + "completion_length": 2.0, + "epoch": 0.552493614733163, + "grad_norm": 1.6064385590652819e-06, + "kl": 13.7890625, + "learning_rate": 9.910831387211203e-06, + "loss": 0.552, + "reward": 0.05945572853088379, + "reward_std": 0.009774930006824434, + "rewards/reward_func_1": 0.05945572853088379, + "step": 2055 + }, + { + "completion_length": 2.0, + "epoch": 0.5538378814356768, + "grad_norm": 5.076154593552928e-06, + "kl": 13.6640625, + "learning_rate": 9.863902936516079e-06, + "loss": 0.5466, + "reward": 0.0619448184967041, + "reward_std": 0.011996782931964845, + "rewards/reward_func_1": 0.0619448184967041, + "step": 2060 + }, + { + "completion_length": 2.0, + "epoch": 0.5551821481381907, + "grad_norm": 1.1968987564614508e-05, + "kl": 13.6625, + "learning_rate": 9.81697748343895e-06, + "loss": 0.5466, + "reward": 0.06266765594482422, + "reward_std": 0.011780065088532864, + "rewards/reward_func_1": 0.06266765594482422, + "step": 2065 + }, + { + "completion_length": 2.0, + "epoch": 0.5565264148407044, + "grad_norm": 2.355890319449827e-05, + "kl": 13.9328125, + "learning_rate": 9.77005606154056e-06, + "loss": 0.5576, + "reward": 0.05969257354736328, + "reward_std": 0.009937083004115266, + "rewards/reward_func_1": 0.05969257354736328, + "step": 2070 + }, + { + "completion_length": 2.0, + "epoch": 0.5578706815432182, + "grad_norm": 8.669927410664968e-06, + "kl": 13.525, + "learning_rate": 9.723139704292866e-06, + "loss": 0.5408, + "reward": 0.06073760986328125, + "reward_std": 0.01386090821470134, + "rewards/reward_func_1": 0.06073760986328125, + "step": 2075 + }, + { + "completion_length": 2.0, + "epoch": 0.5592149482457319, + "grad_norm": 3.4235183647979284e-06, + "kl": 13.79375, + "learning_rate": 9.676229445056269e-06, + "loss": 0.552, + "reward": 0.06341695785522461, + "reward_std": 0.010479317836870904, + "rewards/reward_func_1": 0.06341695785522461, + "step": 2080 + }, + { + "completion_length": 2.0, + "epoch": 0.5605592149482457, + "grad_norm": 2.277251951454673e-06, + "kl": 14.06875, + "learning_rate": 9.629326317056872e-06, + "loss": 0.5628, + "reward": 0.06306524276733398, + "reward_std": 0.011487474158639089, + "rewards/reward_func_1": 0.06306524276733398, + "step": 2085 + }, + { + "completion_length": 2.0, + "epoch": 0.5619034816507595, + "grad_norm": 4.600749434757745e-06, + "kl": 13.9359375, + "learning_rate": 9.582431353363687e-06, + "loss": 0.5572, + "reward": 0.0586451530456543, + "reward_std": 0.009657706473444706, + "rewards/reward_func_1": 0.0586451530456543, + "step": 2090 + }, + { + "completion_length": 2.0, + "epoch": 0.5632477483532733, + "grad_norm": 7.010680747043807e-06, + "kl": 13.775, + "learning_rate": 9.535545586865922e-06, + "loss": 0.5508, + "reward": 0.06332626342773437, + "reward_std": 0.01151423337869346, + "rewards/reward_func_1": 0.06332626342773437, + "step": 2095 + }, + { + "completion_length": 2.0, + "epoch": 0.564592015055787, + "grad_norm": 3.5268849387648515e-06, + "kl": 13.725, + "learning_rate": 9.488670050250195e-06, + "loss": 0.5491, + "reward": 0.05642566680908203, + "reward_std": 0.01242845638480503, + "rewards/reward_func_1": 0.05642566680908203, + "step": 2100 + }, + { + "completion_length": 2.0, + "epoch": 0.5659362817583008, + "grad_norm": 2.5853701117739547e-06, + "kl": 13.7984375, + "learning_rate": 9.441805775977822e-06, + "loss": 0.5522, + "reward": 0.05613641738891602, + "reward_std": 0.011232214039591782, + "rewards/reward_func_1": 0.05613641738891602, + "step": 2105 + }, + { + "completion_length": 2.0, + "epoch": 0.5672805484608147, + "grad_norm": 4.693101436714642e-05, + "kl": 13.8296875, + "learning_rate": 9.394953796262037e-06, + "loss": 0.5533, + "reward": 0.06460676193237305, + "reward_std": 0.009554302979813656, + "rewards/reward_func_1": 0.06460676193237305, + "step": 2110 + }, + { + "completion_length": 2.0, + "epoch": 0.5686248151633284, + "grad_norm": 3.470984211162431e-06, + "kl": 13.940625, + "learning_rate": 9.348115143045305e-06, + "loss": 0.5579, + "reward": 0.05569601058959961, + "reward_std": 0.00965253066533478, + "rewards/reward_func_1": 0.05569601058959961, + "step": 2115 + }, + { + "completion_length": 2.0, + "epoch": 0.5699690818658422, + "grad_norm": 5.797121502837399e-06, + "kl": 13.55625, + "learning_rate": 9.301290847976545e-06, + "loss": 0.5421, + "reward": 0.06446545943617821, + "reward_std": 0.014136035600677133, + "rewards/reward_func_1": 0.06446545943617821, + "step": 2120 + }, + { + "completion_length": 2.0, + "epoch": 0.5713133485683559, + "grad_norm": 2.2634183096670313e-06, + "kl": 13.9234375, + "learning_rate": 9.254481942388444e-06, + "loss": 0.5566, + "reward": 0.05872611999511719, + "reward_std": 0.010083984247467015, + "rewards/reward_func_1": 0.05872611999511719, + "step": 2125 + }, + { + "completion_length": 2.0, + "epoch": 0.5726576152708698, + "grad_norm": 2.7756636882259045e-06, + "kl": 13.84375, + "learning_rate": 9.207689457274716e-06, + "loss": 0.5536, + "reward": 0.06077961921691895, + "reward_std": 0.01000992787303403, + "rewards/reward_func_1": 0.06077961921691895, + "step": 2130 + }, + { + "completion_length": 2.0, + "epoch": 0.5740018819733835, + "grad_norm": 3.0030598736630054e-06, + "kl": 14.05625, + "learning_rate": 9.160914423267416e-06, + "loss": 0.5621, + "reward": 0.06290161609649658, + "reward_std": 0.010859317294671201, + "rewards/reward_func_1": 0.06290161609649658, + "step": 2135 + }, + { + "completion_length": 2.0, + "epoch": 0.5753461486758973, + "grad_norm": 8.018588232516777e-06, + "kl": 14.0125, + "learning_rate": 9.114157870614213e-06, + "loss": 0.5605, + "reward": 0.06700577735900878, + "reward_std": 0.013860853042569943, + "rewards/reward_func_1": 0.06700577735900878, + "step": 2140 + }, + { + "completion_length": 2.0, + "epoch": 0.576690415378411, + "grad_norm": 3.3272003747697454e-06, + "kl": 14.1421875, + "learning_rate": 9.067420829155731e-06, + "loss": 0.5659, + "reward": 0.06595449447631836, + "reward_std": 0.010813094197510508, + "rewards/reward_func_1": 0.06595449447631836, + "step": 2145 + }, + { + "completion_length": 2.0, + "epoch": 0.5780346820809249, + "grad_norm": 1.5704621546319686e-05, + "kl": 13.5234375, + "learning_rate": 9.020704328302829e-06, + "loss": 0.5408, + "reward": 0.06404781341552734, + "reward_std": 0.011609598056566028, + "rewards/reward_func_1": 0.06404781341552734, + "step": 2150 + }, + { + "completion_length": 2.0, + "epoch": 0.5793789487834387, + "grad_norm": 2.1239829948171973e-05, + "kl": 13.6328125, + "learning_rate": 8.974009397013965e-06, + "loss": 0.5455, + "reward": 0.058431386947631836, + "reward_std": 0.010231435889363639, + "rewards/reward_func_1": 0.058431386947631836, + "step": 2155 + }, + { + "completion_length": 2.0, + "epoch": 0.5807232154859524, + "grad_norm": 3.322323755128309e-05, + "kl": 13.49375, + "learning_rate": 8.927337063772504e-06, + "loss": 0.5398, + "reward": 0.059176063537597655, + "reward_std": 0.010483282200584653, + "rewards/reward_func_1": 0.059176063537597655, + "step": 2160 + }, + { + "completion_length": 2.0, + "epoch": 0.5820674821884662, + "grad_norm": 1.4426668712985702e-06, + "kl": 13.8109375, + "learning_rate": 8.88068835656408e-06, + "loss": 0.5523, + "reward": 0.06375694274902344, + "reward_std": 0.011770154316764092, + "rewards/reward_func_1": 0.06375694274902344, + "step": 2165 + }, + { + "completion_length": 2.0, + "epoch": 0.58341174889098, + "grad_norm": 2.1057694539194927e-06, + "kl": 13.95625, + "learning_rate": 8.834064302853944e-06, + "loss": 0.5581, + "reward": 0.06186666488647461, + "reward_std": 0.011866289676254383, + "rewards/reward_func_1": 0.06186666488647461, + "step": 2170 + }, + { + "completion_length": 2.0, + "epoch": 0.5847560155934938, + "grad_norm": 2.079190153381205e-06, + "kl": 13.7578125, + "learning_rate": 8.787465929564352e-06, + "loss": 0.5504, + "reward": 0.05595951080322266, + "reward_std": 0.011447059749116306, + "rewards/reward_func_1": 0.05595951080322266, + "step": 2175 + }, + { + "completion_length": 2.0, + "epoch": 0.5861002822960075, + "grad_norm": 3.0134608550724806e-06, + "kl": 13.93125, + "learning_rate": 8.740894263051913e-06, + "loss": 0.557, + "reward": 0.06060028076171875, + "reward_std": 0.009739526234625373, + "rewards/reward_func_1": 0.06060028076171875, + "step": 2180 + }, + { + "completion_length": 2.0, + "epoch": 0.5874445489985213, + "grad_norm": 3.109391400357708e-05, + "kl": 13.9, + "learning_rate": 8.694350329085028e-06, + "loss": 0.5558, + "reward": 0.0626680850982666, + "reward_std": 0.010819756354612764, + "rewards/reward_func_1": 0.0626680850982666, + "step": 2185 + }, + { + "completion_length": 2.0, + "epoch": 0.5887888157010351, + "grad_norm": 1.3205424693296663e-05, + "kl": 13.6703125, + "learning_rate": 8.647835152821252e-06, + "loss": 0.5469, + "reward": 0.05972356796264648, + "reward_std": 0.010678636631928385, + "rewards/reward_func_1": 0.05972356796264648, + "step": 2190 + }, + { + "completion_length": 2.0, + "epoch": 0.5901330824035489, + "grad_norm": 4.43181943410309e-06, + "kl": 13.965625, + "learning_rate": 8.601349758784744e-06, + "loss": 0.5587, + "reward": 0.06157550811767578, + "reward_std": 0.010384173551574349, + "rewards/reward_func_1": 0.06157550811767578, + "step": 2195 + }, + { + "completion_length": 2.0, + "epoch": 0.5914773491060626, + "grad_norm": 2.103875203829375e-06, + "kl": 14.0875, + "learning_rate": 8.55489517084369e-06, + "loss": 0.5634, + "reward": 0.06180839538574219, + "reward_std": 0.01010028199889348, + "rewards/reward_func_1": 0.06180839538574219, + "step": 2200 + }, + { + "completion_length": 2.0, + "epoch": 0.5928216158085764, + "grad_norm": 2.1625994122587144e-05, + "kl": 14.025, + "learning_rate": 8.508472412187759e-06, + "loss": 0.5611, + "reward": 0.06270132064819336, + "reward_std": 0.012054376184096327, + "rewards/reward_func_1": 0.06270132064819336, + "step": 2205 + }, + { + "completion_length": 2.0, + "epoch": 0.5941658825110901, + "grad_norm": 6.9565994635922834e-06, + "kl": 13.8640625, + "learning_rate": 8.462082505305547e-06, + "loss": 0.5548, + "reward": 0.06188421249389649, + "reward_std": 0.011324935717857443, + "rewards/reward_func_1": 0.06188421249389649, + "step": 2210 + }, + { + "completion_length": 2.0, + "epoch": 0.595510149213604, + "grad_norm": 2.927747118519619e-06, + "kl": 13.953125, + "learning_rate": 8.415726471962092e-06, + "loss": 0.558, + "reward": 0.060194778442382815, + "reward_std": 0.009112278009342844, + "rewards/reward_func_1": 0.060194778442382815, + "step": 2215 + }, + { + "completion_length": 2.0, + "epoch": 0.5968544159161178, + "grad_norm": 1.6029367770897807e-06, + "kl": 13.9375, + "learning_rate": 8.369405333176322e-06, + "loss": 0.5573, + "reward": 0.0625925064086914, + "reward_std": 0.01229454953354434, + "rewards/reward_func_1": 0.0625925064086914, + "step": 2220 + }, + { + "completion_length": 2.0, + "epoch": 0.5981986826186315, + "grad_norm": 3.083619731114595e-06, + "kl": 14.1265625, + "learning_rate": 8.323120109198616e-06, + "loss": 0.5648, + "reward": 0.06270506381988525, + "reward_std": 0.011008751340705203, + "rewards/reward_func_1": 0.06270506381988525, + "step": 2225 + }, + { + "completion_length": 2.0, + "epoch": 0.5995429493211453, + "grad_norm": 5.694411811418831e-06, + "kl": 13.75, + "learning_rate": 8.276871819488287e-06, + "loss": 0.5501, + "reward": 0.06018905639648438, + "reward_std": 0.009886647743405775, + "rewards/reward_func_1": 0.06018905639648438, + "step": 2230 + }, + { + "completion_length": 2.0, + "epoch": 0.6008872160236591, + "grad_norm": 2.441943252051715e-06, + "kl": 13.8171875, + "learning_rate": 8.230661482691168e-06, + "loss": 0.5526, + "reward": 0.0654977798461914, + "reward_std": 0.010158205546758836, + "rewards/reward_func_1": 0.0654977798461914, + "step": 2235 + }, + { + "completion_length": 2.0, + "epoch": 0.6022314827261729, + "grad_norm": 2.561245310062077e-06, + "kl": 13.81875, + "learning_rate": 8.18449011661714e-06, + "loss": 0.5528, + "reward": 0.059857940673828124, + "reward_std": 0.009442199986369814, + "rewards/reward_func_1": 0.059857940673828124, + "step": 2240 + }, + { + "completion_length": 2.0, + "epoch": 0.6035757494286866, + "grad_norm": 9.140064321400132e-06, + "kl": 13.8984375, + "learning_rate": 8.138358738217743e-06, + "loss": 0.5559, + "reward": 0.062485790252685545, + "reward_std": 0.009677528292741044, + "rewards/reward_func_1": 0.062485790252685545, + "step": 2245 + }, + { + "completion_length": 2.0, + "epoch": 0.6049200161312004, + "grad_norm": 1.274393162020715e-05, + "kl": 13.6140625, + "learning_rate": 8.09226836356376e-06, + "loss": 0.5446, + "reward": 0.06158370971679687, + "reward_std": 0.012039840093348176, + "rewards/reward_func_1": 0.06158370971679687, + "step": 2250 + }, + { + "completion_length": 2.0, + "epoch": 0.6062642828337143, + "grad_norm": 4.513978183240397e-06, + "kl": 14.028125, + "learning_rate": 8.046220007822845e-06, + "loss": 0.5613, + "reward": 0.05757331848144531, + "reward_std": 0.011246860059327447, + "rewards/reward_func_1": 0.05757331848144531, + "step": 2255 + }, + { + "completion_length": 2.0, + "epoch": 0.607608549536228, + "grad_norm": 4.966521828464465e-06, + "kl": 13.853125, + "learning_rate": 8.000214685237154e-06, + "loss": 0.554, + "reward": 0.059112969785928726, + "reward_std": 0.013033414728124627, + "rewards/reward_func_1": 0.059112969785928726, + "step": 2260 + }, + { + "completion_length": 2.0, + "epoch": 0.6089528162387418, + "grad_norm": 3.217521907572518e-06, + "kl": 13.6359375, + "learning_rate": 7.954253409101019e-06, + "loss": 0.5456, + "reward": 0.061025047302246095, + "reward_std": 0.012668960404334939, + "rewards/reward_func_1": 0.061025047302246095, + "step": 2265 + }, + { + "completion_length": 2.0, + "epoch": 0.6102970829412555, + "grad_norm": 3.596692977225757e-06, + "kl": 13.675, + "learning_rate": 7.908337191738625e-06, + "loss": 0.5469, + "reward": 0.05897402763366699, + "reward_std": 0.010948267369531094, + "rewards/reward_func_1": 0.05897402763366699, + "step": 2270 + }, + { + "completion_length": 2.0, + "epoch": 0.6116413496437694, + "grad_norm": 1.489972146373475e-05, + "kl": 13.6046875, + "learning_rate": 7.862467044481696e-06, + "loss": 0.5443, + "reward": 0.06776981353759766, + "reward_std": 0.009472813666070579, + "rewards/reward_func_1": 0.06776981353759766, + "step": 2275 + }, + { + "completion_length": 2.0, + "epoch": 0.6129856163462831, + "grad_norm": 4.7771932258910965e-06, + "kl": 13.8671875, + "learning_rate": 7.81664397764726e-06, + "loss": 0.5547, + "reward": 0.05934486389160156, + "reward_std": 0.010944688416202553, + "rewards/reward_func_1": 0.05934486389160156, + "step": 2280 + }, + { + "completion_length": 2.0, + "epoch": 0.6143298830487969, + "grad_norm": 1.628923541829863e-06, + "kl": 13.64375, + "learning_rate": 7.770869000515344e-06, + "loss": 0.5459, + "reward": 0.059722518920898436, + "reward_std": 0.009479641152574913, + "rewards/reward_func_1": 0.059722518920898436, + "step": 2285 + }, + { + "completion_length": 2.0, + "epoch": 0.6156741497513106, + "grad_norm": 3.469725243121502e-06, + "kl": 13.8546875, + "learning_rate": 7.725143121306793e-06, + "loss": 0.5542, + "reward": 0.05222053527832031, + "reward_std": 0.011934454514994286, + "rewards/reward_func_1": 0.05222053527832031, + "step": 2290 + }, + { + "completion_length": 2.0, + "epoch": 0.6170184164538245, + "grad_norm": 4.984348834113916e-06, + "kl": 13.95625, + "learning_rate": 7.679467347161025e-06, + "loss": 0.5581, + "reward": 0.060247611999511716, + "reward_std": 0.01037932816798275, + "rewards/reward_func_1": 0.060247611999511716, + "step": 2295 + }, + { + "completion_length": 2.0, + "epoch": 0.6183626831563382, + "grad_norm": 1.597562345523329e-06, + "kl": 13.9984375, + "learning_rate": 7.633842684113876e-06, + "loss": 0.5599, + "reward": 0.05987234115600586, + "reward_std": 0.009311927141970955, + "rewards/reward_func_1": 0.05987234115600586, + "step": 2300 + }, + { + "completion_length": 2.0, + "epoch": 0.619706949858852, + "grad_norm": 2.6105003598786425e-06, + "kl": 13.990625, + "learning_rate": 7.588270137075421e-06, + "loss": 0.5599, + "reward": 0.057819366455078125, + "reward_std": 0.012239268912526313, + "rewards/reward_func_1": 0.057819366455078125, + "step": 2305 + }, + { + "completion_length": 2.0, + "epoch": 0.6210512165613657, + "grad_norm": 1.7647248569119256e-06, + "kl": 13.7203125, + "learning_rate": 7.542750709807861e-06, + "loss": 0.5489, + "reward": 0.05905556678771973, + "reward_std": 0.011430266295792534, + "rewards/reward_func_1": 0.05905556678771973, + "step": 2310 + }, + { + "completion_length": 2.0, + "epoch": 0.6223954832638795, + "grad_norm": 2.6496518330532126e-06, + "kl": 13.6484375, + "learning_rate": 7.497285404903387e-06, + "loss": 0.5465, + "reward": 0.055854058265686034, + "reward_std": 0.010062372921674978, + "rewards/reward_func_1": 0.055854058265686034, + "step": 2315 + }, + { + "completion_length": 2.0, + "epoch": 0.6237397499663934, + "grad_norm": 1.3658197531185579e-05, + "kl": 13.98125, + "learning_rate": 7.451875223762129e-06, + "loss": 0.5593, + "reward": 0.06093788146972656, + "reward_std": 0.012115493134479039, + "rewards/reward_func_1": 0.06093788146972656, + "step": 2320 + }, + { + "completion_length": 2.0, + "epoch": 0.6250840166689071, + "grad_norm": 2.737979684752645e-06, + "kl": 13.5125, + "learning_rate": 7.4065211665700685e-06, + "loss": 0.5404, + "reward": 0.052369880676269534, + "reward_std": 0.013848574734220164, + "rewards/reward_func_1": 0.052369880676269534, + "step": 2325 + }, + { + "completion_length": 2.0, + "epoch": 0.6264282833714209, + "grad_norm": 4.7086259655770846e-06, + "kl": 13.759375, + "learning_rate": 7.36122423227704e-06, + "loss": 0.5506, + "reward": 0.06091470718383789, + "reward_std": 0.011245868943660753, + "rewards/reward_func_1": 0.06091470718383789, + "step": 2330 + }, + { + "completion_length": 2.0, + "epoch": 0.6277725500739346, + "grad_norm": 6.927496997377602e-06, + "kl": 13.696875, + "learning_rate": 7.315985418574693e-06, + "loss": 0.5479, + "reward": 0.05918540954589844, + "reward_std": 0.012543642877426464, + "rewards/reward_func_1": 0.05918540954589844, + "step": 2335 + }, + { + "completion_length": 2.0, + "epoch": 0.6291168167764485, + "grad_norm": 3.3043399980670074e-06, + "kl": 13.603125, + "learning_rate": 7.270805721874559e-06, + "loss": 0.544, + "reward": 0.058438873291015624, + "reward_std": 0.010428001565014711, + "rewards/reward_func_1": 0.058438873291015624, + "step": 2340 + }, + { + "completion_length": 2.0, + "epoch": 0.6304610834789622, + "grad_norm": 1.634690306673292e-05, + "kl": 13.9828125, + "learning_rate": 7.225686137286065e-06, + "loss": 0.5591, + "reward": 0.06279127690941096, + "reward_std": 0.0103473931827466, + "rewards/reward_func_1": 0.06279127690941096, + "step": 2345 + }, + { + "completion_length": 2.0, + "epoch": 0.631805350181476, + "grad_norm": 3.8762250369472895e-06, + "kl": 14.221875, + "learning_rate": 7.180627658594643e-06, + "loss": 0.5689, + "reward": 0.06403388977050781, + "reward_std": 0.01183446466930036, + "rewards/reward_func_1": 0.06403388977050781, + "step": 2350 + }, + { + "completion_length": 2.0, + "epoch": 0.6331496168839897, + "grad_norm": 2.116005816787947e-06, + "kl": 14.1421875, + "learning_rate": 7.135631278239823e-06, + "loss": 0.5657, + "reward": 0.059031105041503905, + "reward_std": 0.010066585054846654, + "rewards/reward_func_1": 0.059031105041503905, + "step": 2355 + }, + { + "completion_length": 2.0, + "epoch": 0.6344938835865036, + "grad_norm": 2.5220099360012682e-06, + "kl": 13.640625, + "learning_rate": 7.090697987293398e-06, + "loss": 0.5456, + "reward": 0.059407520294189456, + "reward_std": 0.010223947776830755, + "rewards/reward_func_1": 0.059407520294189456, + "step": 2360 + }, + { + "completion_length": 2.0, + "epoch": 0.6358381502890174, + "grad_norm": 1.0680985269573284e-06, + "kl": 13.6078125, + "learning_rate": 7.045828775437558e-06, + "loss": 0.5443, + "reward": 0.06002349853515625, + "reward_std": 0.011791737930616364, + "rewards/reward_func_1": 0.06002349853515625, + "step": 2365 + }, + { + "completion_length": 2.0, + "epoch": 0.6371824169915311, + "grad_norm": 4.526314114627894e-06, + "kl": 13.4546875, + "learning_rate": 7.001024630943134e-06, + "loss": 0.5382, + "reward": 0.05956945419311523, + "reward_std": 0.012025964839267544, + "rewards/reward_func_1": 0.05956945419311523, + "step": 2370 + }, + { + "completion_length": 2.0, + "epoch": 0.6385266836940449, + "grad_norm": 4.205965524306521e-06, + "kl": 14.1140625, + "learning_rate": 6.956286540647794e-06, + "loss": 0.5649, + "reward": 0.060262870788574216, + "reward_std": 0.010221635182824684, + "rewards/reward_func_1": 0.060262870788574216, + "step": 2375 + }, + { + "completion_length": 2.0, + "epoch": 0.6398709503965587, + "grad_norm": 2.007863940889365e-06, + "kl": 13.9953125, + "learning_rate": 6.9116154899343356e-06, + "loss": 0.5597, + "reward": 0.06056399345397949, + "reward_std": 0.013314929121406749, + "rewards/reward_func_1": 0.06056399345397949, + "step": 2380 + }, + { + "completion_length": 2.0, + "epoch": 0.6412152170990725, + "grad_norm": 3.056561354242149e-06, + "kl": 13.8359375, + "learning_rate": 6.867012462708963e-06, + "loss": 0.5534, + "reward": 0.059704828262329104, + "reward_std": 0.011303682426660088, + "rewards/reward_func_1": 0.059704828262329104, + "step": 2385 + }, + { + "completion_length": 2.0, + "epoch": 0.6425594838015862, + "grad_norm": 2.523838702472858e-06, + "kl": 13.78125, + "learning_rate": 6.8224784413796244e-06, + "loss": 0.5513, + "reward": 0.057023143768310545, + "reward_std": 0.012784256822487804, + "rewards/reward_func_1": 0.057023143768310545, + "step": 2390 + }, + { + "completion_length": 2.0, + "epoch": 0.6439037505041, + "grad_norm": 2.2542815258930204e-06, + "kl": 13.7625, + "learning_rate": 6.77801440683437e-06, + "loss": 0.5508, + "reward": 0.057397651672363284, + "reward_std": 0.011636027062195353, + "rewards/reward_func_1": 0.057397651672363284, + "step": 2395 + }, + { + "completion_length": 2.0, + "epoch": 0.6452480172066138, + "grad_norm": 3.0053756745473947e-06, + "kl": 13.6421875, + "learning_rate": 6.733621338419763e-06, + "loss": 0.5457, + "reward": 0.05742425918579101, + "reward_std": 0.010394414755865, + "rewards/reward_func_1": 0.05742425918579101, + "step": 2400 + }, + { + "completion_length": 2.0, + "epoch": 0.6465922839091276, + "grad_norm": 9.635583410272375e-06, + "kl": 13.7828125, + "learning_rate": 6.689300213919271e-06, + "loss": 0.5511, + "reward": 0.061480712890625, + "reward_std": 0.010102924931379676, + "rewards/reward_func_1": 0.061480712890625, + "step": 2405 + }, + { + "completion_length": 2.0, + "epoch": 0.6479365506116413, + "grad_norm": 1.539800905447919e-06, + "kl": 13.753125, + "learning_rate": 6.645052009531782e-06, + "loss": 0.5501, + "reward": 0.06248035430908203, + "reward_std": 0.010455972234194633, + "rewards/reward_func_1": 0.06248035430908203, + "step": 2410 + }, + { + "completion_length": 2.0, + "epoch": 0.6492808173141551, + "grad_norm": 2.588592906249687e-06, + "kl": 13.6578125, + "learning_rate": 6.600877699850052e-06, + "loss": 0.5464, + "reward": 0.05666141510009766, + "reward_std": 0.015636276185978203, + "rewards/reward_func_1": 0.05666141510009766, + "step": 2415 + }, + { + "completion_length": 2.0, + "epoch": 0.6506250840166689, + "grad_norm": 3.2917205317062326e-06, + "kl": 14.1875, + "learning_rate": 6.556778257839283e-06, + "loss": 0.5674, + "reward": 0.061602020263671876, + "reward_std": 0.009157647862593876, + "rewards/reward_func_1": 0.061602020263671876, + "step": 2420 + }, + { + "completion_length": 2.0, + "epoch": 0.6519693507191827, + "grad_norm": 2.6396586690680124e-05, + "kl": 13.7578125, + "learning_rate": 6.5127546548156535e-06, + "loss": 0.5502, + "reward": 0.06312904357910157, + "reward_std": 0.011368433445022674, + "rewards/reward_func_1": 0.06312904357910157, + "step": 2425 + }, + { + "completion_length": 2.0, + "epoch": 0.6533136174216965, + "grad_norm": 5.049357241659891e-06, + "kl": 13.6421875, + "learning_rate": 6.46880786042496e-06, + "loss": 0.5455, + "reward": 0.05724415183067322, + "reward_std": 0.012990292893664445, + "rewards/reward_func_1": 0.05724415183067322, + "step": 2430 + }, + { + "completion_length": 2.0, + "epoch": 0.6546578841242102, + "grad_norm": 2.6590103061607806e-06, + "kl": 13.8875, + "learning_rate": 6.424938842621231e-06, + "loss": 0.5555, + "reward": 0.0595550537109375, + "reward_std": 0.011499256859679008, + "rewards/reward_func_1": 0.0595550537109375, + "step": 2435 + }, + { + "completion_length": 2.0, + "epoch": 0.656002150826724, + "grad_norm": 6.467951152444584e-06, + "kl": 13.7046875, + "learning_rate": 6.38114856764543e-06, + "loss": 0.5482, + "reward": 0.0562408447265625, + "reward_std": 0.011464899309066823, + "rewards/reward_func_1": 0.0562408447265625, + "step": 2440 + }, + { + "completion_length": 2.0, + "epoch": 0.6573464175292378, + "grad_norm": 2.2811452708992874e-06, + "kl": 14.015625, + "learning_rate": 6.337438000004155e-06, + "loss": 0.5606, + "reward": 0.061225509643554686, + "reward_std": 0.009458938350144308, + "rewards/reward_func_1": 0.061225509643554686, + "step": 2445 + }, + { + "completion_length": 2.0, + "epoch": 0.6586906842317516, + "grad_norm": 8.717958735360298e-06, + "kl": 13.875, + "learning_rate": 6.293808102448409e-06, + "loss": 0.5548, + "reward": 0.056508952379226686, + "reward_std": 0.01089983493402542, + "rewards/reward_func_1": 0.056508952379226686, + "step": 2450 + }, + { + "completion_length": 2.0, + "epoch": 0.6600349509342653, + "grad_norm": 1.0740451216406655e-05, + "kl": 13.8125, + "learning_rate": 6.250259835952383e-06, + "loss": 0.5524, + "reward": 0.06640968322753907, + "reward_std": 0.013152831193292514, + "rewards/reward_func_1": 0.06640968322753907, + "step": 2455 + }, + { + "completion_length": 2.0, + "epoch": 0.6613792176367791, + "grad_norm": 3.2728050882724347e-06, + "kl": 13.753125, + "learning_rate": 6.206794159692304e-06, + "loss": 0.5502, + "reward": 0.05744953155517578, + "reward_std": 0.01012560978961119, + "rewards/reward_func_1": 0.05744953155517578, + "step": 2460 + }, + { + "completion_length": 2.0, + "epoch": 0.662723484339293, + "grad_norm": 1.4861791896692012e-05, + "kl": 13.9265625, + "learning_rate": 6.16341203102529e-06, + "loss": 0.5569, + "reward": 0.05549445152282715, + "reward_std": 0.009857410499535035, + "rewards/reward_func_1": 0.05549445152282715, + "step": 2465 + }, + { + "completion_length": 2.0, + "epoch": 0.6640677510418067, + "grad_norm": 5.46200089956983e-06, + "kl": 13.8640625, + "learning_rate": 6.120114405468285e-06, + "loss": 0.5546, + "reward": 0.05894393920898437, + "reward_std": 0.009983553958227276, + "rewards/reward_func_1": 0.05894393920898437, + "step": 2470 + }, + { + "completion_length": 2.0, + "epoch": 0.6654120177443205, + "grad_norm": 1.2916130799567327e-05, + "kl": 13.81875, + "learning_rate": 6.076902236676994e-06, + "loss": 0.553, + "reward": 0.055209779739379884, + "reward_std": 0.01005386611832364, + "rewards/reward_func_1": 0.055209779739379884, + "step": 2475 + }, + { + "completion_length": 2.0, + "epoch": 0.6667562844468342, + "grad_norm": 3.6688261388917454e-06, + "kl": 13.7546875, + "learning_rate": 6.033776476424888e-06, + "loss": 0.5503, + "reward": 0.06743335723876953, + "reward_std": 0.01147701254230924, + "rewards/reward_func_1": 0.06743335723876953, + "step": 2480 + }, + { + "completion_length": 2.0, + "epoch": 0.6681005511493481, + "grad_norm": 5.666680408467073e-06, + "kl": 13.990625, + "learning_rate": 5.990738074582243e-06, + "loss": 0.5592, + "reward": 0.06467456817626953, + "reward_std": 0.013033680556691251, + "rewards/reward_func_1": 0.06467456817626953, + "step": 2485 + }, + { + "completion_length": 2.0, + "epoch": 0.6694448178518618, + "grad_norm": 1.422238983650459e-05, + "kl": 13.8578125, + "learning_rate": 5.947787979095213e-06, + "loss": 0.5543, + "reward": 0.06046428680419922, + "reward_std": 0.013719953599502333, + "rewards/reward_func_1": 0.06046428680419922, + "step": 2490 + }, + { + "completion_length": 2.0, + "epoch": 0.6707890845543756, + "grad_norm": 5.443932877824409e-06, + "kl": 13.81875, + "learning_rate": 5.9049271359649466e-06, + "loss": 0.5526, + "reward": 0.056779670715332034, + "reward_std": 0.009680721638142131, + "rewards/reward_func_1": 0.056779670715332034, + "step": 2495 + }, + { + "completion_length": 2.0, + "epoch": 0.6721333512568893, + "grad_norm": 6.311719971563434e-06, + "kl": 13.9265625, + "learning_rate": 5.862156489226768e-06, + "loss": 0.5572, + "reward": 0.056317138671875, + "reward_std": 0.012263055084622465, + "rewards/reward_func_1": 0.056317138671875, + "step": 2500 + }, + { + "completion_length": 2.0, + "epoch": 0.6734776179594032, + "grad_norm": 3.1811771350476192e-06, + "kl": 13.7046875, + "learning_rate": 5.819476980929357e-06, + "loss": 0.548, + "reward": 0.05898451805114746, + "reward_std": 0.011924323247512802, + "rewards/reward_func_1": 0.05898451805114746, + "step": 2505 + }, + { + "completion_length": 2.0, + "epoch": 0.674821884661917, + "grad_norm": 2.1738133000326343e-06, + "kl": 13.8359375, + "learning_rate": 5.776889551114036e-06, + "loss": 0.5537, + "reward": 0.05574178695678711, + "reward_std": 0.009539656856213696, + "rewards/reward_func_1": 0.05574178695678711, + "step": 2510 + }, + { + "completion_length": 2.0, + "epoch": 0.6761661513644307, + "grad_norm": 3.079718953813426e-05, + "kl": 13.7, + "learning_rate": 5.734395137794022e-06, + "loss": 0.5483, + "reward": 0.058077239990234376, + "reward_std": 0.00982013454704429, + "rewards/reward_func_1": 0.058077239990234376, + "step": 2515 + }, + { + "completion_length": 2.0, + "epoch": 0.6775104180669445, + "grad_norm": 2.630328253871994e-06, + "kl": 13.78125, + "learning_rate": 5.691994676933808e-06, + "loss": 0.5511, + "reward": 0.05584440231323242, + "reward_std": 0.009929925179494602, + "rewards/reward_func_1": 0.05584440231323242, + "step": 2520 + }, + { + "completion_length": 2.0, + "epoch": 0.6788546847694582, + "grad_norm": 4.1391981540073175e-06, + "kl": 13.6796875, + "learning_rate": 5.6496891024285215e-06, + "loss": 0.5475, + "reward": 0.058974266052246094, + "reward_std": 0.010749774679425173, + "rewards/reward_func_1": 0.058974266052246094, + "step": 2525 + }, + { + "completion_length": 2.0, + "epoch": 0.6801989514719721, + "grad_norm": 2.052042191280634e-06, + "kl": 13.6703125, + "learning_rate": 5.607479346083355e-06, + "loss": 0.5469, + "reward": 0.05872535705566406, + "reward_std": 0.011635806861886522, + "rewards/reward_func_1": 0.05872535705566406, + "step": 2530 + }, + { + "completion_length": 2.0, + "epoch": 0.6815432181744858, + "grad_norm": 2.211010541941505e-05, + "kl": 14.2609375, + "learning_rate": 5.565366337593066e-06, + "loss": 0.5708, + "reward": 0.06311745643615722, + "reward_std": 0.01183991582802264, + "rewards/reward_func_1": 0.06311745643615722, + "step": 2535 + }, + { + "completion_length": 2.0, + "epoch": 0.6828874848769996, + "grad_norm": 4.809753590961918e-06, + "kl": 14.0015625, + "learning_rate": 5.523351004521462e-06, + "loss": 0.5603, + "reward": 0.05524139404296875, + "reward_std": 0.010296737632233998, + "rewards/reward_func_1": 0.05524139404296875, + "step": 2540 + }, + { + "completion_length": 2.0, + "epoch": 0.6842317515795133, + "grad_norm": 8.715678632142954e-06, + "kl": 14.078125, + "learning_rate": 5.481434272281013e-06, + "loss": 0.5629, + "reward": 0.06164817810058594, + "reward_std": 0.013340477158635622, + "rewards/reward_func_1": 0.06164817810058594, + "step": 2545 + }, + { + "completion_length": 2.0, + "epoch": 0.6855760182820272, + "grad_norm": 2.4492214834026527e-06, + "kl": 13.778125, + "learning_rate": 5.439617064112431e-06, + "loss": 0.5511, + "reward": 0.05745353698730469, + "reward_std": 0.013168468393268995, + "rewards/reward_func_1": 0.05745353698730469, + "step": 2550 + }, + { + "completion_length": 2.0, + "epoch": 0.6869202849845409, + "grad_norm": 2.812889078995795e-06, + "kl": 13.9015625, + "learning_rate": 5.3979003010643675e-06, + "loss": 0.5562, + "reward": 0.057623672485351565, + "reward_std": 0.01229256743681617, + "rewards/reward_func_1": 0.057623672485351565, + "step": 2555 + }, + { + "completion_length": 2.0, + "epoch": 0.6882645516870547, + "grad_norm": 3.987305262853624e-06, + "kl": 13.965625, + "learning_rate": 5.356284901973091e-06, + "loss": 0.5588, + "reward": 0.059996414184570315, + "reward_std": 0.010288478545771796, + "rewards/reward_func_1": 0.059996414184570315, + "step": 2560 + }, + { + "completion_length": 2.0, + "epoch": 0.6896088183895684, + "grad_norm": 1.2458726814656984e-05, + "kl": 14.121875, + "learning_rate": 5.314771783442292e-06, + "loss": 0.5647, + "reward": 0.05899205207824707, + "reward_std": 0.010236831862857797, + "rewards/reward_func_1": 0.05899205207824707, + "step": 2565 + }, + { + "completion_length": 2.0, + "epoch": 0.6909530850920823, + "grad_norm": 2.1745931917394046e-06, + "kl": 13.653125, + "learning_rate": 5.273361859822852e-06, + "loss": 0.5463, + "reward": 0.06059694290161133, + "reward_std": 0.01137404957335093, + "rewards/reward_func_1": 0.06059694290161133, + "step": 2570 + }, + { + "completion_length": 2.0, + "epoch": 0.6922973517945961, + "grad_norm": 5.8191294556309e-06, + "kl": 14.08125, + "learning_rate": 5.232056043192737e-06, + "loss": 0.5633, + "reward": 0.0685009479522705, + "reward_std": 0.012357043109659571, + "rewards/reward_func_1": 0.0685009479522705, + "step": 2575 + }, + { + "completion_length": 2.0, + "epoch": 0.6936416184971098, + "grad_norm": 3.856658622680698e-06, + "kl": 13.8921875, + "learning_rate": 5.190855243336883e-06, + "loss": 0.5559, + "reward": 0.06555595397949218, + "reward_std": 0.011982467219604586, + "rewards/reward_func_1": 0.06555595397949218, + "step": 2580 + }, + { + "completion_length": 2.0, + "epoch": 0.6949858851996236, + "grad_norm": 1.3858561032975558e-05, + "kl": 14.0171875, + "learning_rate": 5.1497603677271855e-06, + "loss": 0.5606, + "reward": 0.06087760925292969, + "reward_std": 0.011154358516796492, + "rewards/reward_func_1": 0.06087760925292969, + "step": 2585 + }, + { + "completion_length": 2.0, + "epoch": 0.6963301519021374, + "grad_norm": 4.5027968553768005e-06, + "kl": 13.7328125, + "learning_rate": 5.108772321502479e-06, + "loss": 0.5494, + "reward": 0.05637903213500976, + "reward_std": 0.012003830538014881, + "rewards/reward_func_1": 0.05637903213500976, + "step": 2590 + }, + { + "completion_length": 2.0, + "epoch": 0.6976744186046512, + "grad_norm": 2.7505389880388975e-05, + "kl": 13.94375, + "learning_rate": 5.0678920074486316e-06, + "loss": 0.5578, + "reward": 0.06141033172607422, + "reward_std": 0.010152479278622195, + "rewards/reward_func_1": 0.06141033172607422, + "step": 2595 + }, + { + "completion_length": 2.0, + "epoch": 0.6990186853071649, + "grad_norm": 2.104391796819982e-06, + "kl": 14.35625, + "learning_rate": 5.0271203259786395e-06, + "loss": 0.5744, + "reward": 0.06711845397949219, + "reward_std": 0.011301479887515597, + "rewards/reward_func_1": 0.06711845397949219, + "step": 2600 + }, + { + "completion_length": 2.0, + "epoch": 0.7003629520096787, + "grad_norm": 1.6334478232238325e-06, + "kl": 13.753125, + "learning_rate": 4.986458175112807e-06, + "loss": 0.5501, + "reward": 0.05772566795349121, + "reward_std": 0.011294707475099131, + "rewards/reward_func_1": 0.05772566795349121, + "step": 2605 + }, + { + "completion_length": 2.0, + "epoch": 0.7017072187121925, + "grad_norm": 4.255656222085236e-06, + "kl": 13.778125, + "learning_rate": 4.945906450458955e-06, + "loss": 0.5511, + "reward": 0.058788979053497316, + "reward_std": 0.009819350033649244, + "rewards/reward_func_1": 0.058788979053497316, + "step": 2610 + }, + { + "completion_length": 2.0, + "epoch": 0.7030514854147063, + "grad_norm": 3.1343079172074795e-06, + "kl": 13.5140625, + "learning_rate": 4.90546604519271e-06, + "loss": 0.5407, + "reward": 0.05972156524658203, + "reward_std": 0.011394862360612023, + "rewards/reward_func_1": 0.05972156524658203, + "step": 2615 + }, + { + "completion_length": 2.0, + "epoch": 0.70439575211722, + "grad_norm": 4.186888418189483e-06, + "kl": 13.715625, + "learning_rate": 4.865137850037817e-06, + "loss": 0.5488, + "reward": 0.057996368408203124, + "reward_std": 0.011965288411010988, + "rewards/reward_func_1": 0.057996368408203124, + "step": 2620 + }, + { + "completion_length": 2.0, + "epoch": 0.7057400188197338, + "grad_norm": 2.494949512765743e-06, + "kl": 13.9375, + "learning_rate": 4.824922753246534e-06, + "loss": 0.5575, + "reward": 0.05783071517944336, + "reward_std": 0.011933683512324933, + "rewards/reward_func_1": 0.05783071517944336, + "step": 2625 + }, + { + "completion_length": 2.0, + "epoch": 0.7070842855222477, + "grad_norm": 4.90788443130441e-06, + "kl": 14.0078125, + "learning_rate": 4.784821640580051e-06, + "loss": 0.5603, + "reward": 0.060264754295349124, + "reward_std": 0.011658078715845477, + "rewards/reward_func_1": 0.060264754295349124, + "step": 2630 + }, + { + "completion_length": 2.0, + "epoch": 0.7084285522247614, + "grad_norm": 2.91156266030157e-06, + "kl": 13.8421875, + "learning_rate": 4.744835395289002e-06, + "loss": 0.5537, + "reward": 0.05923728942871094, + "reward_std": 0.012938315909923403, + "rewards/reward_func_1": 0.05923728942871094, + "step": 2635 + }, + { + "completion_length": 2.0, + "epoch": 0.7097728189272752, + "grad_norm": 3.442679826548556e-06, + "kl": 13.8109375, + "learning_rate": 4.704964898093991e-06, + "loss": 0.5527, + "reward": 0.06276912689208984, + "reward_std": 0.011696373121230863, + "rewards/reward_func_1": 0.06276912689208984, + "step": 2640 + }, + { + "completion_length": 2.0, + "epoch": 0.7111170856297889, + "grad_norm": 4.503699074120959e-06, + "kl": 13.875, + "learning_rate": 4.665211027166209e-06, + "loss": 0.5547, + "reward": 0.059120559692382814, + "reward_std": 0.011374600145063595, + "rewards/reward_func_1": 0.059120559692382814, + "step": 2645 + }, + { + "completion_length": 2.0, + "epoch": 0.7124613523323027, + "grad_norm": 9.90547505352879e-06, + "kl": 13.6984375, + "learning_rate": 4.625574658108073e-06, + "loss": 0.5478, + "reward": 0.057414674758911134, + "reward_std": 0.010342052261330536, + "rewards/reward_func_1": 0.057414674758911134, + "step": 2650 + }, + { + "completion_length": 2.0, + "epoch": 0.7138056190348165, + "grad_norm": 2.0887078790110536e-05, + "kl": 13.509375, + "learning_rate": 4.586056663933969e-06, + "loss": 0.5406, + "reward": 0.05762338638305664, + "reward_std": 0.014504123894221265, + "rewards/reward_func_1": 0.05762338638305664, + "step": 2655 + }, + { + "completion_length": 2.0, + "epoch": 0.7151498857373303, + "grad_norm": 3.3340979825879913e-06, + "kl": 13.7078125, + "learning_rate": 4.546657915050988e-06, + "loss": 0.5483, + "reward": 0.06230294108390808, + "reward_std": 0.009969272715534317, + "rewards/reward_func_1": 0.06230294108390808, + "step": 2660 + }, + { + "completion_length": 2.0, + "epoch": 0.716494152439844, + "grad_norm": 3.2463603929500096e-06, + "kl": 13.7390625, + "learning_rate": 4.507379279239791e-06, + "loss": 0.5496, + "reward": 0.05877430438995361, + "reward_std": 0.011900619864900364, + "rewards/reward_func_1": 0.05877430438995361, + "step": 2665 + }, + { + "completion_length": 2.0, + "epoch": 0.7178384191423578, + "grad_norm": 3.1261215553968213e-06, + "kl": 13.99375, + "learning_rate": 4.468221621635462e-06, + "loss": 0.5597, + "reward": 0.05568780899047852, + "reward_std": 0.008807793819141808, + "rewards/reward_func_1": 0.05568780899047852, + "step": 2670 + }, + { + "completion_length": 2.0, + "epoch": 0.7191826858448717, + "grad_norm": 2.6415564207127318e-06, + "kl": 13.9015625, + "learning_rate": 4.42918580470848e-06, + "loss": 0.5562, + "reward": 0.060968208312988284, + "reward_std": 0.009969678838388063, + "rewards/reward_func_1": 0.060968208312988284, + "step": 2675 + }, + { + "completion_length": 2.0, + "epoch": 0.7205269525473854, + "grad_norm": 2.9144048312446102e-06, + "kl": 14.0859375, + "learning_rate": 4.39027268824571e-06, + "loss": 0.5633, + "reward": 0.06536164283752441, + "reward_std": 0.011233370206900873, + "rewards/reward_func_1": 0.06536164283752441, + "step": 2680 + }, + { + "completion_length": 2.0, + "epoch": 0.7218712192498992, + "grad_norm": 3.608389533837908e-06, + "kl": 14.0390625, + "learning_rate": 4.351483129331458e-06, + "loss": 0.5612, + "reward": 0.06538281440734864, + "reward_std": 0.013752934670628747, + "rewards/reward_func_1": 0.06538281440734864, + "step": 2685 + }, + { + "completion_length": 2.0, + "epoch": 0.7232154859524129, + "grad_norm": 6.397221568477107e-06, + "kl": 13.865625, + "learning_rate": 4.312817982328612e-06, + "loss": 0.5546, + "reward": 0.06181436069309711, + "reward_std": 0.011076993081223918, + "rewards/reward_func_1": 0.06181436069309711, + "step": 2690 + }, + { + "completion_length": 2.0, + "epoch": 0.7245597526549268, + "grad_norm": 9.608173968445044e-06, + "kl": 13.825, + "learning_rate": 4.2742780988598145e-06, + "loss": 0.5534, + "reward": 0.06040000915527344, + "reward_std": 0.012127826601499692, + "rewards/reward_func_1": 0.06040000915527344, + "step": 2695 + }, + { + "completion_length": 2.0, + "epoch": 0.7259040193574405, + "grad_norm": 7.110948445188114e-06, + "kl": 14.2703125, + "learning_rate": 4.235864327788692e-06, + "loss": 0.5708, + "reward": 0.06447288990020753, + "reward_std": 0.00979026438217261, + "rewards/reward_func_1": 0.06447288990020753, + "step": 2700 + }, + { + "completion_length": 2.0, + "epoch": 0.7272482860599543, + "grad_norm": 4.501914645516081e-06, + "kl": 13.8, + "learning_rate": 4.197577515201191e-06, + "loss": 0.5523, + "reward": 0.0603661984205246, + "reward_std": 0.011186352090589935, + "rewards/reward_func_1": 0.0603661984205246, + "step": 2705 + }, + { + "completion_length": 2.0, + "epoch": 0.728592552762468, + "grad_norm": 7.746289156784769e-06, + "kl": 13.478125, + "learning_rate": 4.159418504386904e-06, + "loss": 0.5393, + "reward": 0.057269958406686784, + "reward_std": 0.01241556809945905, + "rewards/reward_func_1": 0.057269958406686784, + "step": 2710 + }, + { + "completion_length": 2.0, + "epoch": 0.7299368194649819, + "grad_norm": 6.022199613653356e-06, + "kl": 13.8953125, + "learning_rate": 4.1213881358205275e-06, + "loss": 0.5558, + "reward": 0.0635772705078125, + "reward_std": 0.009903606217267224, + "rewards/reward_func_1": 0.0635772705078125, + "step": 2715 + }, + { + "completion_length": 2.0, + "epoch": 0.7312810861674957, + "grad_norm": 4.5010624489805195e-06, + "kl": 13.7203125, + "learning_rate": 4.083487247143326e-06, + "loss": 0.5486, + "reward": 0.06045243740081787, + "reward_std": 0.011131673593808955, + "rewards/reward_func_1": 0.06045243740081787, + "step": 2720 + }, + { + "completion_length": 2.0, + "epoch": 0.7326253528700094, + "grad_norm": 4.66905157736619e-06, + "kl": 13.7625, + "learning_rate": 4.045716673144706e-06, + "loss": 0.5505, + "reward": 0.061006355285644534, + "reward_std": 0.011753415851853789, + "rewards/reward_func_1": 0.061006355285644534, + "step": 2725 + }, + { + "completion_length": 2.0, + "epoch": 0.7339696195725232, + "grad_norm": 3.085681328229839e-06, + "kl": 13.7109375, + "learning_rate": 4.008077245743801e-06, + "loss": 0.5486, + "reward": 0.06153240203857422, + "reward_std": 0.011887542959448183, + "rewards/reward_func_1": 0.06153240203857422, + "step": 2730 + }, + { + "completion_length": 2.0, + "epoch": 0.735313886275037, + "grad_norm": 2.592519194877241e-06, + "kl": 14.075, + "learning_rate": 3.970569793971178e-06, + "loss": 0.5628, + "reward": 0.06015148162841797, + "reward_std": 0.011894370408845134, + "rewards/reward_func_1": 0.06015148162841797, + "step": 2735 + }, + { + "completion_length": 2.0, + "epoch": 0.7366581529775508, + "grad_norm": 1.798785774553835e-06, + "kl": 13.7875, + "learning_rate": 3.933195143950551e-06, + "loss": 0.5514, + "reward": 0.06206645965576172, + "reward_std": 0.014155591612507124, + "rewards/reward_func_1": 0.06206645965576172, + "step": 2740 + }, + { + "completion_length": 2.0, + "epoch": 0.7380024196800645, + "grad_norm": 3.439082775003044e-06, + "kl": 13.7390625, + "learning_rate": 3.89595411888061e-06, + "loss": 0.5497, + "reward": 0.06377677917480469, + "reward_std": 0.010366774378053379, + "rewards/reward_func_1": 0.06377677917480469, + "step": 2745 + }, + { + "completion_length": 2.0, + "epoch": 0.7393466863825783, + "grad_norm": 2.5652859676483786e-06, + "kl": 13.728125, + "learning_rate": 3.85884753901686e-06, + "loss": 0.5493, + "reward": 0.06238212585449219, + "reward_std": 0.009483605425339192, + "rewards/reward_func_1": 0.06238212585449219, + "step": 2750 + }, + { + "completion_length": 2.0, + "epoch": 0.740690953085092, + "grad_norm": 3.275632843724452e-05, + "kl": 13.8734375, + "learning_rate": 3.82187622165359e-06, + "loss": 0.5549, + "reward": 0.05930185317993164, + "reward_std": 0.010815516777802259, + "rewards/reward_func_1": 0.05930185317993164, + "step": 2755 + }, + { + "completion_length": 2.0, + "epoch": 0.7420352197876059, + "grad_norm": 5.840865014761221e-06, + "kl": 13.6109375, + "learning_rate": 3.7850409811058343e-06, + "loss": 0.5445, + "reward": 0.05905466079711914, + "reward_std": 0.008731590279785451, + "rewards/reward_func_1": 0.05905466079711914, + "step": 2760 + }, + { + "completion_length": 2.0, + "epoch": 0.7433794864901196, + "grad_norm": 1.5586972949677147e-05, + "kl": 13.8890625, + "learning_rate": 3.7483426286914705e-06, + "loss": 0.5557, + "reward": 0.0615997314453125, + "reward_std": 0.012081135442713276, + "rewards/reward_func_1": 0.0615997314453125, + "step": 2765 + }, + { + "completion_length": 2.0, + "epoch": 0.7447237531926334, + "grad_norm": 6.052292064850917e-06, + "kl": 13.45, + "learning_rate": 3.7117819727133254e-06, + "loss": 0.5381, + "reward": 0.06048717498779297, + "reward_std": 0.008764956895902287, + "rewards/reward_func_1": 0.06048717498779297, + "step": 2770 + }, + { + "completion_length": 2.0, + "epoch": 0.7460680198951471, + "grad_norm": 3.043762262677774e-06, + "kl": 13.8125, + "learning_rate": 3.6753598184413873e-06, + "loss": 0.5528, + "reward": 0.06238512992858887, + "reward_std": 0.012483571946359007, + "rewards/reward_func_1": 0.06238512992858887, + "step": 2775 + }, + { + "completion_length": 2.0, + "epoch": 0.747412286597661, + "grad_norm": 8.626040653325617e-06, + "kl": 13.8703125, + "learning_rate": 3.6390769680950544e-06, + "loss": 0.5549, + "reward": 0.06061878204345703, + "reward_std": 0.012912878150018514, + "rewards/reward_func_1": 0.06061878204345703, + "step": 2780 + }, + { + "completion_length": 2.0, + "epoch": 0.7487565533001748, + "grad_norm": 8.276882908830885e-06, + "kl": 13.959375, + "learning_rate": 3.6029342208254826e-06, + "loss": 0.5585, + "reward": 0.06053438186645508, + "reward_std": 0.012466668507113355, + "rewards/reward_func_1": 0.06053438186645508, + "step": 2785 + }, + { + "completion_length": 2.0, + "epoch": 0.7501008200026885, + "grad_norm": 8.260290996986441e-06, + "kl": 13.8265625, + "learning_rate": 3.5669323726979655e-06, + "loss": 0.5533, + "reward": 0.0611328125, + "reward_std": 0.011878072742911172, + "rewards/reward_func_1": 0.0611328125, + "step": 2790 + }, + { + "completion_length": 2.0, + "epoch": 0.7514450867052023, + "grad_norm": 4.150938366365153e-06, + "kl": 13.6796875, + "learning_rate": 3.531072216674418e-06, + "loss": 0.5471, + "reward": 0.06307134628295899, + "reward_std": 0.01603546408514376, + "rewards/reward_func_1": 0.06307134628295899, + "step": 2795 + }, + { + "completion_length": 2.0, + "epoch": 0.7527893534077161, + "grad_norm": 2.825467163347639e-06, + "kl": 13.9296875, + "learning_rate": 3.4953545425959047e-06, + "loss": 0.557, + "reward": 0.0554865837097168, + "reward_std": 0.012560931847292522, + "rewards/reward_func_1": 0.0554865837097168, + "step": 2800 + }, + { + "completion_length": 2.0, + "epoch": 0.7541336201102299, + "grad_norm": 9.65241724770749e-06, + "kl": 14.05, + "learning_rate": 3.4597801371652296e-06, + "loss": 0.5621, + "reward": 0.061970877647399905, + "reward_std": 0.010358845694281627, + "rewards/reward_func_1": 0.061970877647399905, + "step": 2805 + }, + { + "completion_length": 2.0, + "epoch": 0.7554778868127436, + "grad_norm": 1.7722894654070842e-06, + "kl": 13.821875, + "learning_rate": 3.424349783929636e-06, + "loss": 0.5529, + "reward": 0.061666107177734374, + "reward_std": 0.011269324702152517, + "rewards/reward_func_1": 0.061666107177734374, + "step": 2810 + }, + { + "completion_length": 2.0, + "epoch": 0.7568221535152574, + "grad_norm": 1.5782270565978251e-06, + "kl": 13.9140625, + "learning_rate": 3.3890642632635153e-06, + "loss": 0.5564, + "reward": 0.06216366291046142, + "reward_std": 0.010467645124299452, + "rewards/reward_func_1": 0.06216366291046142, + "step": 2815 + }, + { + "completion_length": 2.0, + "epoch": 0.7581664202177713, + "grad_norm": 9.261582817998715e-06, + "kl": 13.984375, + "learning_rate": 3.353924352351253e-06, + "loss": 0.5595, + "reward": 0.05453653335571289, + "reward_std": 0.010780278017773526, + "rewards/reward_func_1": 0.05453653335571289, + "step": 2820 + }, + { + "completion_length": 2.0, + "epoch": 0.759510686920285, + "grad_norm": 5.424847131507704e-06, + "kl": 13.9875, + "learning_rate": 3.3189308251700825e-06, + "loss": 0.5595, + "reward": 0.057614707946777345, + "reward_std": 0.011758481396100251, + "rewards/reward_func_1": 0.057614707946777345, + "step": 2825 + }, + { + "completion_length": 2.0, + "epoch": 0.7608549536227988, + "grad_norm": 1.4945719158276916e-05, + "kl": 13.875, + "learning_rate": 3.2840844524730577e-06, + "loss": 0.555, + "reward": 0.05519509315490723, + "reward_std": 0.009796513656328897, + "rewards/reward_func_1": 0.05519509315490723, + "step": 2830 + }, + { + "completion_length": 2.0, + "epoch": 0.7621992203253125, + "grad_norm": 2.619063934616861e-06, + "kl": 14.10625, + "learning_rate": 3.2493860017720567e-06, + "loss": 0.5644, + "reward": 0.055352401733398435, + "reward_std": 0.010450466238398803, + "rewards/reward_func_1": 0.055352401733398435, + "step": 2835 + }, + { + "completion_length": 2.0, + "epoch": 0.7635434870278264, + "grad_norm": 1.0150999514735304e-05, + "kl": 13.84375, + "learning_rate": 3.214836237320904e-06, + "loss": 0.5538, + "reward": 0.058098793029785156, + "reward_std": 0.011656674755067797, + "rewards/reward_func_1": 0.058098793029785156, + "step": 2840 + }, + { + "completion_length": 2.0, + "epoch": 0.7648877537303401, + "grad_norm": 4.336788151704241e-06, + "kl": 13.6640625, + "learning_rate": 3.1804359200985056e-06, + "loss": 0.5466, + "reward": 0.05649633407592773, + "reward_std": 0.011210850576026133, + "rewards/reward_func_1": 0.05649633407592773, + "step": 2845 + }, + { + "completion_length": 2.0, + "epoch": 0.7662320204328539, + "grad_norm": 7.680199814785738e-06, + "kl": 13.525, + "learning_rate": 3.14618580779212e-06, + "loss": 0.5411, + "reward": 0.05644134283065796, + "reward_std": 0.01211523166639381, + "rewards/reward_func_1": 0.05644134283065796, + "step": 2850 + }, + { + "completion_length": 2.0, + "epoch": 0.7675762871353676, + "grad_norm": 3.693724920594832e-06, + "kl": 13.6765625, + "learning_rate": 3.1120866547806394e-06, + "loss": 0.547, + "reward": 0.055395317077636716, + "reward_std": 0.012751551120891236, + "rewards/reward_func_1": 0.055395317077636716, + "step": 2855 + }, + { + "completion_length": 2.0, + "epoch": 0.7689205538378814, + "grad_norm": 6.3646293710917234e-06, + "kl": 14.1234375, + "learning_rate": 3.0781392121179986e-06, + "loss": 0.5649, + "reward": 0.05985813140869141, + "reward_std": 0.010235290192213142, + "rewards/reward_func_1": 0.05985813140869141, + "step": 2860 + }, + { + "completion_length": 2.0, + "epoch": 0.7702648205403952, + "grad_norm": 1.4028549230715726e-05, + "kl": 14.3015625, + "learning_rate": 3.0443442275166226e-06, + "loss": 0.5718, + "reward": 0.05446624755859375, + "reward_std": 0.009798991409479641, + "rewards/reward_func_1": 0.05446624755859375, + "step": 2865 + }, + { + "completion_length": 2.0, + "epoch": 0.771609087242909, + "grad_norm": 3.3037556477211183e-06, + "kl": 13.9109375, + "learning_rate": 3.0107024453309486e-06, + "loss": 0.5564, + "reward": 0.054990959167480466, + "reward_std": 0.009734460682375356, + "rewards/reward_func_1": 0.054990959167480466, + "step": 2870 + }, + { + "completion_length": 2.0, + "epoch": 0.7729533539454227, + "grad_norm": 3.8873076846357435e-06, + "kl": 13.7890625, + "learning_rate": 2.9772146065410477e-06, + "loss": 0.5516, + "reward": 0.057455134391784665, + "reward_std": 0.010452806322427932, + "rewards/reward_func_1": 0.057455134391784665, + "step": 2875 + }, + { + "completion_length": 2.0, + "epoch": 0.7742976206479365, + "grad_norm": 8.293524842883926e-06, + "kl": 14.35, + "learning_rate": 2.943881448736301e-06, + "loss": 0.5742, + "reward": 0.062497615814208984, + "reward_std": 0.00955452322596102, + "rewards/reward_func_1": 0.062497615814208984, + "step": 2880 + }, + { + "completion_length": 2.0, + "epoch": 0.7756418873504504, + "grad_norm": 2.3304564820136875e-06, + "kl": 13.9890625, + "learning_rate": 2.910703706099137e-06, + "loss": 0.5594, + "reward": 0.06021251678466797, + "reward_std": 0.012565446839289507, + "rewards/reward_func_1": 0.06021251678466797, + "step": 2885 + }, + { + "completion_length": 2.0, + "epoch": 0.7769861540529641, + "grad_norm": 2.393172962911194e-06, + "kl": 13.796875, + "learning_rate": 2.8776821093888883e-06, + "loss": 0.552, + "reward": 0.06193351745605469, + "reward_std": 0.010560587099462282, + "rewards/reward_func_1": 0.06193351745605469, + "step": 2890 + }, + { + "completion_length": 2.0, + "epoch": 0.7783304207554779, + "grad_norm": 8.713544048077893e-06, + "kl": 14.021875, + "learning_rate": 2.8448173859256665e-06, + "loss": 0.5609, + "reward": 0.060492420196533205, + "reward_std": 0.01069757735276653, + "rewards/reward_func_1": 0.060492420196533205, + "step": 2895 + }, + { + "completion_length": 2.0, + "epoch": 0.7796746874579916, + "grad_norm": 3.608857878134586e-05, + "kl": 13.4609375, + "learning_rate": 2.8121102595743732e-06, + "loss": 0.5384, + "reward": 0.05852642059326172, + "reward_std": 0.010727309926369343, + "rewards/reward_func_1": 0.05852642059326172, + "step": 2900 + }, + { + "completion_length": 2.0, + "epoch": 0.7810189541605055, + "grad_norm": 2.3314752979786135e-05, + "kl": 13.8453125, + "learning_rate": 2.779561450728725e-06, + "loss": 0.5537, + "reward": 0.06407814025878907, + "reward_std": 0.010947771910286975, + "rewards/reward_func_1": 0.06407814025878907, + "step": 2905 + }, + { + "completion_length": 2.0, + "epoch": 0.7823632208630192, + "grad_norm": 3.858524905808736e-06, + "kl": 13.8734375, + "learning_rate": 2.7471716762954183e-06, + "loss": 0.5551, + "reward": 0.05899543762207031, + "reward_std": 0.012197423033649102, + "rewards/reward_func_1": 0.05899543762207031, + "step": 2910 + }, + { + "completion_length": 2.0, + "epoch": 0.783707487565533, + "grad_norm": 9.215535101247951e-06, + "kl": 13.7046875, + "learning_rate": 2.7149416496783055e-06, + "loss": 0.5481, + "reward": 0.06349143981933594, + "reward_std": 0.010476895228566718, + "rewards/reward_func_1": 0.06349143981933594, + "step": 2915 + }, + { + "completion_length": 2.0, + "epoch": 0.7850517542680467, + "grad_norm": 1.8621502704263548e-06, + "kl": 13.9234375, + "learning_rate": 2.6828720807627173e-06, + "loss": 0.5572, + "reward": 0.05804300308227539, + "reward_std": 0.010599679932784056, + "rewards/reward_func_1": 0.05804300308227539, + "step": 2920 + }, + { + "completion_length": 2.0, + "epoch": 0.7863960209705606, + "grad_norm": 2.1705293420382077e-06, + "kl": 13.7421875, + "learning_rate": 2.6509636758997914e-06, + "loss": 0.5496, + "reward": 0.06185646057128906, + "reward_std": 0.012096992944861995, + "rewards/reward_func_1": 0.06185646057128906, + "step": 2925 + }, + { + "completion_length": 2.0, + "epoch": 0.7877402876730744, + "grad_norm": 3.4049091937049525e-06, + "kl": 13.90625, + "learning_rate": 2.619217137890949e-06, + "loss": 0.5562, + "reward": 0.06344146728515625, + "reward_std": 0.010752197249166784, + "rewards/reward_func_1": 0.06344146728515625, + "step": 2930 + }, + { + "completion_length": 2.0, + "epoch": 0.7890845543755881, + "grad_norm": 5.779114417236997e-06, + "kl": 13.809375, + "learning_rate": 2.587633165972384e-06, + "loss": 0.5523, + "reward": 0.061241436004638675, + "reward_std": 0.011485271743731573, + "rewards/reward_func_1": 0.061241436004638675, + "step": 2935 + }, + { + "completion_length": 2.0, + "epoch": 0.7904288210781019, + "grad_norm": 1.907208570628427e-05, + "kl": 13.6140625, + "learning_rate": 2.556212455799688e-06, + "loss": 0.5447, + "reward": 0.05872478485107422, + "reward_std": 0.01105458896199707, + "rewards/reward_func_1": 0.05872478485107422, + "step": 2940 + }, + { + "completion_length": 2.0, + "epoch": 0.7917730877806157, + "grad_norm": 1.2913329555885866e-05, + "kl": 13.5984375, + "learning_rate": 2.5249556994325063e-06, + "loss": 0.5443, + "reward": 0.05844389796257019, + "reward_std": 0.011415049015340628, + "rewards/reward_func_1": 0.05844389796257019, + "step": 2945 + }, + { + "completion_length": 2.0, + "epoch": 0.7931173544831295, + "grad_norm": 7.899559022916947e-06, + "kl": 13.740625, + "learning_rate": 2.4938635853193127e-06, + "loss": 0.5495, + "reward": 0.060787391662597653, + "reward_std": 0.011730345609248616, + "rewards/reward_func_1": 0.060787391662597653, + "step": 2950 + }, + { + "completion_length": 2.0, + "epoch": 0.7944616211856432, + "grad_norm": 2.142528501281049e-06, + "kl": 14.0171875, + "learning_rate": 2.462936798282236e-06, + "loss": 0.5608, + "reward": 0.05785312652587891, + "reward_std": 0.012231780852016528, + "rewards/reward_func_1": 0.05785312652587891, + "step": 2955 + }, + { + "completion_length": 2.0, + "epoch": 0.795805887888157, + "grad_norm": 2.0210850379953627e-06, + "kl": 13.615625, + "learning_rate": 2.4321760195019807e-06, + "loss": 0.5444, + "reward": 0.058881378173828124, + "reward_std": 0.009882683275645832, + "rewards/reward_func_1": 0.058881378173828124, + "step": 2960 + }, + { + "completion_length": 2.0, + "epoch": 0.7971501545906707, + "grad_norm": 4.331094714871142e-06, + "kl": 13.8859375, + "learning_rate": 2.401581926502814e-06, + "loss": 0.5553, + "reward": 0.06248741149902344, + "reward_std": 0.01132394474479952, + "rewards/reward_func_1": 0.06248741149902344, + "step": 2965 + }, + { + "completion_length": 2.0, + "epoch": 0.7984944212931846, + "grad_norm": 2.8427843972167466e-06, + "kl": 14.0859375, + "learning_rate": 2.371155193137662e-06, + "loss": 0.5634, + "reward": 0.06317214965820313, + "reward_std": 0.011330111461211346, + "rewards/reward_func_1": 0.06317214965820313, + "step": 2970 + }, + { + "completion_length": 2.0, + "epoch": 0.7998386879956983, + "grad_norm": 3.5108828342345078e-06, + "kl": 13.83125, + "learning_rate": 2.3408964895732433e-06, + "loss": 0.5533, + "reward": 0.05587625503540039, + "reward_std": 0.009793595474184258, + "rewards/reward_func_1": 0.05587625503540039, + "step": 2975 + }, + { + "completion_length": 2.0, + "epoch": 0.8011829546982121, + "grad_norm": 6.495894467661856e-06, + "kl": 13.9984375, + "learning_rate": 2.310806482275336e-06, + "loss": 0.5598, + "reward": 0.05911798477172851, + "reward_std": 0.010464892169329687, + "rewards/reward_func_1": 0.05911798477172851, + "step": 2980 + }, + { + "completion_length": 2.0, + "epoch": 0.8025272214007259, + "grad_norm": 1.9022724018213921e-06, + "kl": 14.0625, + "learning_rate": 2.2808858339940696e-06, + "loss": 0.5627, + "reward": 0.06507339477539062, + "reward_std": 0.010237712813250255, + "rewards/reward_func_1": 0.06507339477539062, + "step": 2985 + }, + { + "completion_length": 2.0, + "epoch": 0.8038714881032397, + "grad_norm": 8.21357753011398e-06, + "kl": 14.1390625, + "learning_rate": 2.251135203749353e-06, + "loss": 0.5655, + "reward": 0.054758310317993164, + "reward_std": 0.010143174163385994, + "rewards/reward_func_1": 0.054758310317993164, + "step": 2990 + }, + { + "completion_length": 2.0, + "epoch": 0.8052157548057535, + "grad_norm": 2.0820609734073514e-06, + "kl": 13.9046875, + "learning_rate": 2.221555246816335e-06, + "loss": 0.5563, + "reward": 0.05632228851318359, + "reward_std": 0.007249694373967941, + "rewards/reward_func_1": 0.05632228851318359, + "step": 2995 + }, + { + "completion_length": 2.0, + "epoch": 0.8065600215082672, + "grad_norm": 3.5432799450063612e-06, + "kl": 13.8859375, + "learning_rate": 2.1921466147109995e-06, + "loss": 0.555, + "reward": 0.06329879760742188, + "reward_std": 0.011629640086903236, + "rewards/reward_func_1": 0.06329879760742188, + "step": 3000 + }, + { + "completion_length": 2.0, + "epoch": 0.807904288210781, + "grad_norm": 5.990676982037257e-06, + "kl": 13.85, + "learning_rate": 2.162909955175786e-06, + "loss": 0.5541, + "reward": 0.059543299674987796, + "reward_std": 0.011717213732481468, + "rewards/reward_func_1": 0.059543299674987796, + "step": 3005 + }, + { + "completion_length": 2.0, + "epoch": 0.8092485549132948, + "grad_norm": 3.0730845992366085e-06, + "kl": 14.1765625, + "learning_rate": 2.1338459121653467e-06, + "loss": 0.5671, + "reward": 0.0583465576171875, + "reward_std": 0.010720923148619476, + "rewards/reward_func_1": 0.0583465576171875, + "step": 3010 + }, + { + "completion_length": 2.0, + "epoch": 0.8105928216158086, + "grad_norm": 4.65749917566427e-06, + "kl": 13.7890625, + "learning_rate": 2.1049551258323466e-06, + "loss": 0.5514, + "reward": 0.05720829963684082, + "reward_std": 0.01134266530716559, + "rewards/reward_func_1": 0.05720829963684082, + "step": 3015 + }, + { + "completion_length": 2.0, + "epoch": 0.8119370883183223, + "grad_norm": 1.0969602044497151e-05, + "kl": 13.865625, + "learning_rate": 2.076238232513377e-06, + "loss": 0.5545, + "reward": 0.05492105484008789, + "reward_std": 0.009651319341355703, + "rewards/reward_func_1": 0.05492105484008789, + "step": 3020 + }, + { + "completion_length": 2.0, + "epoch": 0.8132813550208361, + "grad_norm": 2.167436605304829e-06, + "kl": 14.128125, + "learning_rate": 2.0476958647149235e-06, + "loss": 0.5653, + "reward": 0.062408828735351564, + "reward_std": 0.011062738049804465, + "rewards/reward_func_1": 0.062408828735351564, + "step": 3025 + }, + { + "completion_length": 2.0, + "epoch": 0.81462562172335, + "grad_norm": 6.903650046297116e-06, + "kl": 13.81875, + "learning_rate": 2.019328651099458e-06, + "loss": 0.5526, + "reward": 0.06055660247802734, + "reward_std": 0.01089865797512175, + "rewards/reward_func_1": 0.06055660247802734, + "step": 3030 + }, + { + "completion_length": 2.0, + "epoch": 0.8159698884258637, + "grad_norm": 9.66745847108541e-06, + "kl": 13.9421875, + "learning_rate": 1.9911372164715617e-06, + "loss": 0.558, + "reward": 0.060862159729003905, + "reward_std": 0.010128252705180784, + "rewards/reward_func_1": 0.060862159729003905, + "step": 3035 + }, + { + "completion_length": 2.0, + "epoch": 0.8173141551283775, + "grad_norm": 2.5966969587898348e-06, + "kl": 13.865625, + "learning_rate": 1.963122181764194e-06, + "loss": 0.5547, + "reward": 0.05717315673828125, + "reward_std": 0.010695815431245138, + "rewards/reward_func_1": 0.05717315673828125, + "step": 3040 + }, + { + "completion_length": 2.0, + "epoch": 0.8186584218308912, + "grad_norm": 1.2404716471792199e-05, + "kl": 13.609375, + "learning_rate": 1.935284164024995e-06, + "loss": 0.5443, + "reward": 0.05703325271606445, + "reward_std": 0.00922305959957157, + "rewards/reward_func_1": 0.05703325271606445, + "step": 3045 + }, + { + "completion_length": 2.0, + "epoch": 0.8200026885334051, + "grad_norm": 3.450983740549418e-06, + "kl": 13.8828125, + "learning_rate": 1.9076237764027096e-06, + "loss": 0.5555, + "reward": 0.060849010944366455, + "reward_std": 0.010863185320158664, + "rewards/reward_func_1": 0.060849010944366455, + "step": 3050 + }, + { + "completion_length": 2.0, + "epoch": 0.8213469552359188, + "grad_norm": 2.398985998297576e-05, + "kl": 14.0265625, + "learning_rate": 1.8801416281336593e-06, + "loss": 0.5611, + "reward": 0.05967788696289063, + "reward_std": 0.011869262975233141, + "rewards/reward_func_1": 0.05967788696289063, + "step": 3055 + }, + { + "completion_length": 2.0, + "epoch": 0.8226912219384326, + "grad_norm": 2.969979050249094e-06, + "kl": 13.9078125, + "learning_rate": 1.8528383245283565e-06, + "loss": 0.5565, + "reward": 0.05824851989746094, + "reward_std": 0.008635234561734251, + "rewards/reward_func_1": 0.05824851989746094, + "step": 3060 + }, + { + "completion_length": 2.0, + "epoch": 0.8240354886409463, + "grad_norm": 4.690632522397209e-06, + "kl": 13.828125, + "learning_rate": 1.8257144669581405e-06, + "loss": 0.5533, + "reward": 0.06130073070526123, + "reward_std": 0.011460411777079571, + "rewards/reward_func_1": 0.06130073070526123, + "step": 3065 + }, + { + "completion_length": 2.0, + "epoch": 0.8253797553434602, + "grad_norm": 2.5304858354502358e-05, + "kl": 13.7609375, + "learning_rate": 1.7987706528419547e-06, + "loss": 0.5505, + "reward": 0.058181381225585936, + "reward_std": 0.010406417903141119, + "rewards/reward_func_1": 0.058181381225585936, + "step": 3070 + }, + { + "completion_length": 2.0, + "epoch": 0.826724022045974, + "grad_norm": 3.5739954000746366e-06, + "kl": 13.9765625, + "learning_rate": 1.7720074756331796e-06, + "loss": 0.5591, + "reward": 0.06419677734375, + "reward_std": 0.010990058263996617, + "rewards/reward_func_1": 0.06419677734375, + "step": 3075 + }, + { + "completion_length": 2.0, + "epoch": 0.8280682887484877, + "grad_norm": 2.814767640302307e-06, + "kl": 13.90625, + "learning_rate": 1.745425524806552e-06, + "loss": 0.5562, + "reward": 0.06376209259033203, + "reward_std": 0.009371502423891797, + "rewards/reward_func_1": 0.06376209259033203, + "step": 3080 + }, + { + "completion_length": 2.0, + "epoch": 0.8294125554510015, + "grad_norm": 4.085266937181586e-06, + "kl": 14.1734375, + "learning_rate": 1.7190253858452032e-06, + "loss": 0.5674, + "reward": 0.07096824645996094, + "reward_std": 0.011460935025388608, + "rewards/reward_func_1": 0.07096824645996094, + "step": 3085 + }, + { + "completion_length": 2.0, + "epoch": 0.8307568221535152, + "grad_norm": 2.656307515280787e-06, + "kl": 13.6828125, + "learning_rate": 1.6928076402277404e-06, + "loss": 0.5474, + "reward": 0.060984134674072266, + "reward_std": 0.011912760638369945, + "rewards/reward_func_1": 0.060984134674072266, + "step": 3090 + }, + { + "completion_length": 2.0, + "epoch": 0.8321010888560291, + "grad_norm": 7.214829565782566e-06, + "kl": 13.8453125, + "learning_rate": 1.666772865415458e-06, + "loss": 0.5536, + "reward": 0.061499595642089844, + "reward_std": 0.010504865943221375, + "rewards/reward_func_1": 0.061499595642089844, + "step": 3095 + }, + { + "completion_length": 2.0, + "epoch": 0.8334453555585428, + "grad_norm": 2.9270854611240793e-06, + "kl": 13.5703125, + "learning_rate": 1.640921634839605e-06, + "loss": 0.5428, + "reward": 0.05861034393310547, + "reward_std": 0.012625352442410077, + "rewards/reward_func_1": 0.05861034393310547, + "step": 3100 + }, + { + "completion_length": 2.0, + "epoch": 0.8347896222610566, + "grad_norm": 3.179326768076862e-06, + "kl": 14.1953125, + "learning_rate": 1.6152545178887657e-06, + "loss": 0.568, + "reward": 0.055633163452148436, + "reward_std": 0.012100516646751203, + "rewards/reward_func_1": 0.055633163452148436, + "step": 3105 + }, + { + "completion_length": 2.0, + "epoch": 0.8361338889635703, + "grad_norm": 4.799480848305393e-06, + "kl": 13.778125, + "learning_rate": 1.5897720798963079e-06, + "loss": 0.5512, + "reward": 0.06492023468017578, + "reward_std": 0.010558164384565315, + "rewards/reward_func_1": 0.06492023468017578, + "step": 3110 + }, + { + "completion_length": 2.0, + "epoch": 0.8374781556660842, + "grad_norm": 3.970403668063227e-06, + "kl": 13.9, + "learning_rate": 1.5644748821279409e-06, + "loss": 0.5563, + "reward": 0.05761244297027588, + "reward_std": 0.01134748296753969, + "rewards/reward_func_1": 0.05761244297027588, + "step": 3115 + }, + { + "completion_length": 2.0, + "epoch": 0.8388224223685979, + "grad_norm": 1.801868620532332e-06, + "kl": 14.128125, + "learning_rate": 1.5393634817693437e-06, + "loss": 0.5652, + "reward": 0.06230869293212891, + "reward_std": 0.012545625171333086, + "rewards/reward_func_1": 0.06230869293212891, + "step": 3120 + }, + { + "completion_length": 2.0, + "epoch": 0.8401666890711117, + "grad_norm": 2.9992136205692077e-06, + "kl": 13.9578125, + "learning_rate": 1.514438431913907e-06, + "loss": 0.5582, + "reward": 0.06606597900390625, + "reward_std": 0.010844698862638325, + "rewards/reward_func_1": 0.06606597900390625, + "step": 3125 + }, + { + "completion_length": 2.0, + "epoch": 0.8415109557736254, + "grad_norm": 5.0372464102110825e-06, + "kl": 14.0875, + "learning_rate": 1.4897002815505314e-06, + "loss": 0.5638, + "reward": 0.06385841369628906, + "reward_std": 0.011049523478141055, + "rewards/reward_func_1": 0.06385841369628906, + "step": 3130 + }, + { + "completion_length": 2.0, + "epoch": 0.8428552224761393, + "grad_norm": 7.81911876401864e-06, + "kl": 13.8375, + "learning_rate": 1.4651495755515522e-06, + "loss": 0.5535, + "reward": 0.06193408966064453, + "reward_std": 0.010152038796877604, + "rewards/reward_func_1": 0.06193408966064453, + "step": 3135 + }, + { + "completion_length": 2.0, + "epoch": 0.8441994891786531, + "grad_norm": 4.062687366968021e-05, + "kl": 14.0359375, + "learning_rate": 1.4407868546607319e-06, + "loss": 0.5615, + "reward": 0.06457939147949218, + "reward_std": 0.012157779483823105, + "rewards/reward_func_1": 0.06457939147949218, + "step": 3140 + }, + { + "completion_length": 2.0, + "epoch": 0.8455437558811668, + "grad_norm": 2.0320292151154717e-06, + "kl": 13.8375, + "learning_rate": 1.4166126554813508e-06, + "loss": 0.5534, + "reward": 0.055645179748535153, + "reward_std": 0.009349037745414535, + "rewards/reward_func_1": 0.055645179748535153, + "step": 3145 + }, + { + "completion_length": 2.0, + "epoch": 0.8468880225836806, + "grad_norm": 3.012095476151444e-06, + "kl": 13.8390625, + "learning_rate": 1.3926275104643816e-06, + "loss": 0.5534, + "reward": 0.06417160034179688, + "reward_std": 0.011896352579060476, + "rewards/reward_func_1": 0.06417160034179688, + "step": 3150 + }, + { + "completion_length": 2.0, + "epoch": 0.8482322892861944, + "grad_norm": 8.71294741955353e-06, + "kl": 13.71875, + "learning_rate": 1.3688319478967772e-06, + "loss": 0.5486, + "reward": 0.057692861557006835, + "reward_std": 0.008569657542102505, + "rewards/reward_func_1": 0.057692861557006835, + "step": 3155 + }, + { + "completion_length": 2.0, + "epoch": 0.8495765559887082, + "grad_norm": 2.56103658102802e-06, + "kl": 13.7125, + "learning_rate": 1.345226491889815e-06, + "loss": 0.5482, + "reward": 0.05779485702514649, + "reward_std": 0.011118789602187462, + "rewards/reward_func_1": 0.05779485702514649, + "step": 3160 + }, + { + "completion_length": 2.0, + "epoch": 0.8509208226912219, + "grad_norm": 2.6276434255123604e-06, + "kl": 13.7640625, + "learning_rate": 1.3218116623675737e-06, + "loss": 0.5509, + "reward": 0.055435562133789064, + "reward_std": 0.010689648687912268, + "rewards/reward_func_1": 0.055435562133789064, + "step": 3165 + }, + { + "completion_length": 2.0, + "epoch": 0.8522650893937357, + "grad_norm": 3.175247911713086e-06, + "kl": 13.7875, + "learning_rate": 1.298587975055462e-06, + "loss": 0.5517, + "reward": 0.058620452880859375, + "reward_std": 0.012298734129581134, + "rewards/reward_func_1": 0.058620452880859375, + "step": 3170 + }, + { + "completion_length": 2.0, + "epoch": 0.8536093560962495, + "grad_norm": 6.266296622925438e-06, + "kl": 13.953125, + "learning_rate": 1.2755559414688766e-06, + "loss": 0.5581, + "reward": 0.05827016830444336, + "reward_std": 0.011117468139855192, + "rewards/reward_func_1": 0.05827016830444336, + "step": 3175 + }, + { + "completion_length": 2.0, + "epoch": 0.8549536227987633, + "grad_norm": 1.6521969882887788e-05, + "kl": 13.865625, + "learning_rate": 1.2527160689019202e-06, + "loss": 0.5546, + "reward": 0.05954210758209229, + "reward_std": 0.009821428551731515, + "rewards/reward_func_1": 0.05954210758209229, + "step": 3180 + }, + { + "completion_length": 2.0, + "epoch": 0.856297889501277, + "grad_norm": 3.3688741041260073e-06, + "kl": 13.9859375, + "learning_rate": 1.2300688604162458e-06, + "loss": 0.5597, + "reward": 0.05932321548461914, + "reward_std": 0.01184624767920468, + "rewards/reward_func_1": 0.05932321548461914, + "step": 3185 + }, + { + "completion_length": 2.0, + "epoch": 0.8576421562037908, + "grad_norm": 2.135618524334859e-05, + "kl": 14.153125, + "learning_rate": 1.207614814829956e-06, + "loss": 0.5663, + "reward": 0.060421180725097653, + "reward_std": 0.010734908378799446, + "rewards/reward_func_1": 0.060421180725097653, + "step": 3190 + }, + { + "completion_length": 2.0, + "epoch": 0.8589864229063046, + "grad_norm": 3.3278847695328295e-06, + "kl": 13.675, + "learning_rate": 1.1853544267066353e-06, + "loss": 0.547, + "reward": 0.05666627883911133, + "reward_std": 0.009691183110044221, + "rewards/reward_func_1": 0.05666627883911133, + "step": 3195 + }, + { + "completion_length": 2.0, + "epoch": 0.8603306896088184, + "grad_norm": 2.5311237550340593e-05, + "kl": 13.7171875, + "learning_rate": 1.1632881863444412e-06, + "loss": 0.5485, + "reward": 0.0567962646484375, + "reward_std": 0.010639874120533932, + "rewards/reward_func_1": 0.0567962646484375, + "step": 3200 + }, + { + "completion_length": 2.0, + "epoch": 0.8616749563113322, + "grad_norm": 3.2120556170411874e-06, + "kl": 13.6140625, + "learning_rate": 1.141416579765321e-06, + "loss": 0.5443, + "reward": 0.05926952362060547, + "reward_std": 0.011298176337732002, + "rewards/reward_func_1": 0.05926952362060547, + "step": 3205 + }, + { + "completion_length": 2.0, + "epoch": 0.8630192230138459, + "grad_norm": 1.2166214219178073e-05, + "kl": 13.803125, + "learning_rate": 1.1197400887042876e-06, + "loss": 0.552, + "reward": 0.05761222839355469, + "reward_std": 0.01062401667368249, + "rewards/reward_func_1": 0.05761222839355469, + "step": 3210 + }, + { + "completion_length": 2.0, + "epoch": 0.8643634897163597, + "grad_norm": 8.964575499703642e-06, + "kl": 13.70625, + "learning_rate": 1.0982591905988304e-06, + "loss": 0.5486, + "reward": 0.05393571853637695, + "reward_std": 0.01401166350406129, + "rewards/reward_func_1": 0.05393571853637695, + "step": 3215 + }, + { + "completion_length": 2.0, + "epoch": 0.8657077564188735, + "grad_norm": 3.3451262879680144e-06, + "kl": 13.8015625, + "learning_rate": 1.076974358578381e-06, + "loss": 0.5522, + "reward": 0.05769004821777344, + "reward_std": 0.010371179192588897, + "rewards/reward_func_1": 0.05769004821777344, + "step": 3220 + }, + { + "completion_length": 2.0, + "epoch": 0.8670520231213873, + "grad_norm": 5.8671166698331945e-06, + "kl": 13.8546875, + "learning_rate": 1.0558860614539013e-06, + "loss": 0.554, + "reward": 0.06081085205078125, + "reward_std": 0.01084117493883241, + "rewards/reward_func_1": 0.06081085205078125, + "step": 3225 + }, + { + "completion_length": 2.0, + "epoch": 0.868396289823901, + "grad_norm": 9.06308378034737e-06, + "kl": 13.728125, + "learning_rate": 1.034994763707562e-06, + "loss": 0.5495, + "reward": 0.058400535583496095, + "reward_std": 0.010280990242608822, + "rewards/reward_func_1": 0.058400535583496095, + "step": 3230 + }, + { + "completion_length": 2.0, + "epoch": 0.8697405565264148, + "grad_norm": 2.764769988061744e-06, + "kl": 13.7484375, + "learning_rate": 1.014300925482501e-06, + "loss": 0.5501, + "reward": 0.06383857727050782, + "reward_std": 0.011695932724978774, + "rewards/reward_func_1": 0.06383857727050782, + "step": 3235 + }, + { + "completion_length": 2.0, + "epoch": 0.8710848232289287, + "grad_norm": 4.055384579260135e-06, + "kl": 14.325, + "learning_rate": 9.93805002572692e-07, + "loss": 0.5734, + "reward": 0.06781425476074218, + "reward_std": 0.010846901237528073, + "rewards/reward_func_1": 0.06781425476074218, + "step": 3240 + }, + { + "completion_length": 2.0, + "epoch": 0.8724290899314424, + "grad_norm": 3.197312707925448e-06, + "kl": 13.928125, + "learning_rate": 9.735074464129156e-07, + "loss": 0.5572, + "reward": 0.05835247039794922, + "reward_std": 0.011373498971806839, + "rewards/reward_func_1": 0.05835247039794922, + "step": 3245 + }, + { + "completion_length": 2.0, + "epoch": 0.8737733566339562, + "grad_norm": 2.8454533094190992e-06, + "kl": 13.928125, + "learning_rate": 9.534087040687978e-07, + "loss": 0.5575, + "reward": 0.058566713333129884, + "reward_std": 0.012191201363748405, + "rewards/reward_func_1": 0.058566713333129884, + "step": 3250 + }, + { + "completion_length": 2.0, + "epoch": 0.8751176233364699, + "grad_norm": 6.40214238956105e-06, + "kl": 13.8328125, + "learning_rate": 9.335092182269823e-07, + "loss": 0.5531, + "reward": 0.05842547416687012, + "reward_std": 0.013222923100693151, + "rewards/reward_func_1": 0.05842547416687012, + "step": 3255 + }, + { + "completion_length": 2.0, + "epoch": 0.8764618900389838, + "grad_norm": 3.3787041502364445e-06, + "kl": 13.7484375, + "learning_rate": 9.138094271853626e-07, + "loss": 0.5499, + "reward": 0.057961654663085935, + "reward_std": 0.010742506683163811, + "rewards/reward_func_1": 0.057961654663085935, + "step": 3260 + }, + { + "completion_length": 2.0, + "epoch": 0.8778061567414975, + "grad_norm": 3.2537961942580296e-06, + "kl": 13.9765625, + "learning_rate": 8.943097648434451e-07, + "loss": 0.5591, + "reward": 0.055088233947753903, + "reward_std": 0.011486923421034589, + "rewards/reward_func_1": 0.055088233947753903, + "step": 3265 + }, + { + "completion_length": 2.0, + "epoch": 0.8791504234440113, + "grad_norm": 4.171860564383678e-05, + "kl": 13.75, + "learning_rate": 8.750106606927756e-07, + "loss": 0.5501, + "reward": 0.06052291393280029, + "reward_std": 0.010141026746714488, + "rewards/reward_func_1": 0.06052291393280029, + "step": 3270 + }, + { + "completion_length": 2.0, + "epoch": 0.880494690146525, + "grad_norm": 5.266811058390886e-06, + "kl": 13.8078125, + "learning_rate": 8.559125398074941e-07, + "loss": 0.5522, + "reward": 0.06266212463378906, + "reward_std": 0.011405433918116614, + "rewards/reward_func_1": 0.06266212463378906, + "step": 3275 + }, + { + "completion_length": 2.0, + "epoch": 0.8818389568490389, + "grad_norm": 1.9777207853621803e-06, + "kl": 13.625, + "learning_rate": 8.370158228349611e-07, + "loss": 0.5449, + "reward": 0.059973645210266116, + "reward_std": 0.01014815697853919, + "rewards/reward_func_1": 0.059973645210266116, + "step": 3280 + }, + { + "completion_length": 2.0, + "epoch": 0.8831832235515527, + "grad_norm": 3.1561939977109432e-06, + "kl": 14.0640625, + "learning_rate": 8.18320925986501e-07, + "loss": 0.5626, + "reward": 0.061003684997558594, + "reward_std": 0.011469524375570472, + "rewards/reward_func_1": 0.061003684997558594, + "step": 3285 + }, + { + "completion_length": 2.0, + "epoch": 0.8845274902540664, + "grad_norm": 3.982539965363685e-06, + "kl": 13.9203125, + "learning_rate": 7.998282610282282e-07, + "loss": 0.5569, + "reward": 0.05995340347290039, + "reward_std": 0.009807140480552335, + "rewards/reward_func_1": 0.05995340347290039, + "step": 3290 + }, + { + "completion_length": 2.0, + "epoch": 0.8858717569565802, + "grad_norm": 5.228612280916423e-06, + "kl": 13.903125, + "learning_rate": 7.815382352719836e-07, + "loss": 0.5559, + "reward": 0.06256370544433594, + "reward_std": 0.010237712755042594, + "rewards/reward_func_1": 0.06256370544433594, + "step": 3295 + }, + { + "completion_length": 2.0, + "epoch": 0.8872160236590939, + "grad_norm": 3.3338096727675293e-06, + "kl": 13.9515625, + "learning_rate": 7.63451251566355e-07, + "loss": 0.5578, + "reward": 0.059042739868164065, + "reward_std": 0.011339801916619763, + "rewards/reward_func_1": 0.059042739868164065, + "step": 3300 + }, + { + "completion_length": 2.0, + "epoch": 0.8885602903616078, + "grad_norm": 1.969355253095273e-06, + "kl": 13.7625, + "learning_rate": 7.455677082878144e-07, + "loss": 0.5507, + "reward": 0.05534934997558594, + "reward_std": 0.011414243758190423, + "rewards/reward_func_1": 0.05534934997558594, + "step": 3305 + }, + { + "completion_length": 2.0, + "epoch": 0.8899045570641215, + "grad_norm": 2.7435919491836103e-06, + "kl": 13.825, + "learning_rate": 7.278879993319399e-07, + "loss": 0.5528, + "reward": 0.0598332405090332, + "reward_std": 0.012269772328363616, + "rewards/reward_func_1": 0.0598332405090332, + "step": 3310 + }, + { + "completion_length": 2.0, + "epoch": 0.8912488237666353, + "grad_norm": 1.6360121435354813e-06, + "kl": 13.8203125, + "learning_rate": 7.104125141047314e-07, + "loss": 0.5529, + "reward": 0.06058578491210938, + "reward_std": 0.0112239549322112, + "rewards/reward_func_1": 0.06058578491210938, + "step": 3315 + }, + { + "completion_length": 2.0, + "epoch": 0.892593090469149, + "grad_norm": 5.12127780893934e-06, + "kl": 14.0453125, + "learning_rate": 6.931416375140465e-07, + "loss": 0.5618, + "reward": 0.05910205841064453, + "reward_std": 0.010322065434593242, + "rewards/reward_func_1": 0.05910205841064453, + "step": 3320 + }, + { + "completion_length": 2.0, + "epoch": 0.8939373571716629, + "grad_norm": 1.6029716789489612e-05, + "kl": 13.6296875, + "learning_rate": 6.760757499611193e-07, + "loss": 0.5452, + "reward": 0.05997223854064941, + "reward_std": 0.010195591623778455, + "rewards/reward_func_1": 0.05997223854064941, + "step": 3325 + }, + { + "completion_length": 2.0, + "epoch": 0.8952816238741766, + "grad_norm": 9.6236435638275e-06, + "kl": 13.7984375, + "learning_rate": 6.592152273321706e-07, + "loss": 0.5521, + "reward": 0.051597309112548825, + "reward_std": 0.011377132889901987, + "rewards/reward_func_1": 0.051597309112548825, + "step": 3330 + }, + { + "completion_length": 2.0, + "epoch": 0.8966258905766904, + "grad_norm": 3.594194595279987e-06, + "kl": 13.5984375, + "learning_rate": 6.425604409901454e-07, + "loss": 0.5443, + "reward": 0.06611251831054688, + "reward_std": 0.009431628473248566, + "rewards/reward_func_1": 0.06611251831054688, + "step": 3335 + }, + { + "completion_length": 2.0, + "epoch": 0.8979701572792042, + "grad_norm": 5.3717056289315224e-05, + "kl": 14.0, + "learning_rate": 6.261117577665254e-07, + "loss": 0.5599, + "reward": 0.05563621520996094, + "reward_std": 0.011836006561861723, + "rewards/reward_func_1": 0.05563621520996094, + "step": 3340 + }, + { + "completion_length": 2.0, + "epoch": 0.899314423981718, + "grad_norm": 2.7849052912642946e-06, + "kl": 14.0578125, + "learning_rate": 6.098695399532451e-07, + "loss": 0.5625, + "reward": 0.06023540496826172, + "reward_std": 0.012114391791692469, + "rewards/reward_func_1": 0.06023540496826172, + "step": 3345 + }, + { + "completion_length": 2.0, + "epoch": 0.9006586906842318, + "grad_norm": 2.9332081794564147e-06, + "kl": 14.121875, + "learning_rate": 5.938341452947227e-07, + "loss": 0.5648, + "reward": 0.059270381927490234, + "reward_std": 0.010218441683537093, + "rewards/reward_func_1": 0.059270381927490234, + "step": 3350 + }, + { + "completion_length": 2.0, + "epoch": 0.9020029573867455, + "grad_norm": 2.1531218408199493e-06, + "kl": 13.9484375, + "learning_rate": 5.780059269799676e-07, + "loss": 0.5583, + "reward": 0.06291056871414184, + "reward_std": 0.013561049330746755, + "rewards/reward_func_1": 0.06291056871414184, + "step": 3355 + }, + { + "completion_length": 2.0, + "epoch": 0.9033472240892593, + "grad_norm": 1.5980087482603267e-05, + "kl": 13.7625, + "learning_rate": 5.623852336348156e-07, + "loss": 0.5505, + "reward": 0.06374626159667969, + "reward_std": 0.011463577805261593, + "rewards/reward_func_1": 0.06374626159667969, + "step": 3360 + }, + { + "completion_length": 2.0, + "epoch": 0.9046914907917731, + "grad_norm": 3.6081160033063497e-06, + "kl": 13.6828125, + "learning_rate": 5.469724093142359e-07, + "loss": 0.5474, + "reward": 0.058330869674682616, + "reward_std": 0.010767394045251422, + "rewards/reward_func_1": 0.058330869674682616, + "step": 3365 + }, + { + "completion_length": 2.0, + "epoch": 0.9060357574942869, + "grad_norm": 3.7672652979381382e-06, + "kl": 14.1046875, + "learning_rate": 5.317677934947652e-07, + "loss": 0.5643, + "reward": 0.06058921813964844, + "reward_std": 0.01233749669700046, + "rewards/reward_func_1": 0.06058921813964844, + "step": 3370 + }, + { + "completion_length": 2.0, + "epoch": 0.9073800241968006, + "grad_norm": 6.598625532205915e-06, + "kl": 13.7140625, + "learning_rate": 5.167717210670232e-07, + "loss": 0.5486, + "reward": 0.06335010528564453, + "reward_std": 0.01017670587534667, + "rewards/reward_func_1": 0.06335010528564453, + "step": 3375 + }, + { + "completion_length": 2.0, + "epoch": 0.9087242908993144, + "grad_norm": 6.818109341111267e-06, + "kl": 14.1703125, + "learning_rate": 5.019845223283393e-07, + "loss": 0.5669, + "reward": 0.06114330291748047, + "reward_std": 0.012632400382426568, + "rewards/reward_func_1": 0.06114330291748047, + "step": 3380 + }, + { + "completion_length": 2.0, + "epoch": 0.9100685576018283, + "grad_norm": 4.953264578944072e-06, + "kl": 13.8640625, + "learning_rate": 4.874065229754743e-07, + "loss": 0.5543, + "reward": 0.05895808935165405, + "reward_std": 0.009361990720572066, + "rewards/reward_func_1": 0.05895808935165405, + "step": 3385 + }, + { + "completion_length": 2.0, + "epoch": 0.911412824304342, + "grad_norm": 3.3497417462058365e-06, + "kl": 13.8671875, + "learning_rate": 4.730380440974536e-07, + "loss": 0.5548, + "reward": 0.05743751525878906, + "reward_std": 0.011488685388758312, + "rewards/reward_func_1": 0.05743751525878906, + "step": 3390 + }, + { + "completion_length": 2.0, + "epoch": 0.9127570910068558, + "grad_norm": 1.907120349642355e-05, + "kl": 13.7640625, + "learning_rate": 4.588794021684861e-07, + "loss": 0.5505, + "reward": 0.05307474136352539, + "reward_std": 0.011330662002728786, + "rewards/reward_func_1": 0.05307474136352539, + "step": 3395 + }, + { + "completion_length": 2.0, + "epoch": 0.9141013577093695, + "grad_norm": 1.0828506674442906e-05, + "kl": 13.784375, + "learning_rate": 4.4493090904100366e-07, + "loss": 0.5516, + "reward": 0.0565185546875, + "reward_std": 0.009959327296382981, + "rewards/reward_func_1": 0.0565185546875, + "step": 3400 + }, + { + "completion_length": 2.0, + "epoch": 0.9154456244118833, + "grad_norm": 6.197794391482603e-06, + "kl": 13.771875, + "learning_rate": 4.3119287193878035e-07, + "loss": 0.5511, + "reward": 0.059843674302101135, + "reward_std": 0.013095743974554352, + "rewards/reward_func_1": 0.059843674302101135, + "step": 3405 + }, + { + "completion_length": 2.0, + "epoch": 0.9167898911143971, + "grad_norm": 1.7243120282728341e-06, + "kl": 13.775, + "learning_rate": 4.176655934501783e-07, + "loss": 0.5513, + "reward": 0.060840415954589847, + "reward_std": 0.009967916857567616, + "rewards/reward_func_1": 0.060840415954589847, + "step": 3410 + }, + { + "completion_length": 2.0, + "epoch": 0.9181341578169109, + "grad_norm": 3.116844027317711e-06, + "kl": 13.9265625, + "learning_rate": 4.04349371521473e-07, + "loss": 0.5572, + "reward": 0.06282119750976563, + "reward_std": 0.010909009316674202, + "rewards/reward_func_1": 0.06282119750976563, + "step": 3415 + }, + { + "completion_length": 2.0, + "epoch": 0.9194784245194246, + "grad_norm": 3.4594206681504147e-06, + "kl": 13.4734375, + "learning_rate": 3.912444994503006e-07, + "loss": 0.539, + "reward": 0.055774879455566403, + "reward_std": 0.012623810911463806, + "rewards/reward_func_1": 0.055774879455566403, + "step": 3420 + }, + { + "completion_length": 2.0, + "epoch": 0.9208226912219384, + "grad_norm": 4.940301550959703e-06, + "kl": 13.74375, + "learning_rate": 3.783512658791821e-07, + "loss": 0.55, + "reward": 0.05670597553253174, + "reward_std": 0.01125905594062715, + "rewards/reward_func_1": 0.05670597553253174, + "step": 3425 + }, + { + "completion_length": 2.0, + "epoch": 0.9221669579244522, + "grad_norm": 5.229050202615326e-06, + "kl": 13.6734375, + "learning_rate": 3.6566995478918733e-07, + "loss": 0.547, + "reward": 0.059182238578796384, + "reward_std": 0.0114708733453881, + "rewards/reward_func_1": 0.059182238578796384, + "step": 3430 + }, + { + "completion_length": 2.0, + "epoch": 0.923511224626966, + "grad_norm": 4.601683485816466e-06, + "kl": 13.7171875, + "learning_rate": 3.5320084549365864e-07, + "loss": 0.5489, + "reward": 0.059515857696533205, + "reward_std": 0.01207662059168797, + "rewards/reward_func_1": 0.059515857696533205, + "step": 3435 + }, + { + "completion_length": 2.0, + "epoch": 0.9248554913294798, + "grad_norm": 7.413936600642046e-06, + "kl": 14.0421875, + "learning_rate": 3.409442126320761e-07, + "loss": 0.5622, + "reward": 0.0631723403930664, + "reward_std": 0.011003713330137544, + "rewards/reward_func_1": 0.0631723403930664, + "step": 3440 + }, + { + "completion_length": 2.0, + "epoch": 0.9261997580319935, + "grad_norm": 7.578887107229093e-06, + "kl": 14.0140625, + "learning_rate": 3.289003261639978e-07, + "loss": 0.5607, + "reward": 0.06122303009033203, + "reward_std": 0.008386581853119423, + "rewards/reward_func_1": 0.06122303009033203, + "step": 3445 + }, + { + "completion_length": 2.0, + "epoch": 0.9275440247345074, + "grad_norm": 2.735734824454994e-06, + "kl": 13.5484375, + "learning_rate": 3.170694513631178e-07, + "loss": 0.5421, + "reward": 0.05534172058105469, + "reward_std": 0.01105921419657534, + "rewards/reward_func_1": 0.05534172058105469, + "step": 3450 + }, + { + "completion_length": 2.0, + "epoch": 0.9288882914370211, + "grad_norm": 6.3263983065553475e-06, + "kl": 13.9046875, + "learning_rate": 3.054518488114211e-07, + "loss": 0.5563, + "reward": 0.054802989959716795, + "reward_std": 0.011252586312184575, + "rewards/reward_func_1": 0.054802989959716795, + "step": 3455 + }, + { + "completion_length": 2.0, + "epoch": 0.9302325581395349, + "grad_norm": 1.8264050595462322e-05, + "kl": 13.9765625, + "learning_rate": 2.9404777439345e-07, + "loss": 0.5592, + "reward": 0.06266040802001953, + "reward_std": 0.011020671827282058, + "rewards/reward_func_1": 0.06266040802001953, + "step": 3460 + }, + { + "completion_length": 2.0, + "epoch": 0.9315768248420486, + "grad_norm": 4.062319021613803e-06, + "kl": 13.3328125, + "learning_rate": 2.828574792906602e-07, + "loss": 0.5334, + "reward": 0.05159635543823242, + "reward_std": 0.012718734997179126, + "rewards/reward_func_1": 0.05159635543823242, + "step": 3465 + }, + { + "completion_length": 2.0, + "epoch": 0.9329210915445625, + "grad_norm": 3.980569545092294e-06, + "kl": 14.04375, + "learning_rate": 2.718812099758927e-07, + "loss": 0.5621, + "reward": 0.059010887145996095, + "reward_std": 0.011960663207719335, + "rewards/reward_func_1": 0.059010887145996095, + "step": 3470 + }, + { + "completion_length": 2.0, + "epoch": 0.9342653582470762, + "grad_norm": 7.070525043673115e-06, + "kl": 13.7671875, + "learning_rate": 2.61119208207945e-07, + "loss": 0.5507, + "reward": 0.06236776113510132, + "reward_std": 0.012535562692573877, + "rewards/reward_func_1": 0.06236776113510132, + "step": 3475 + }, + { + "completion_length": 2.0, + "epoch": 0.93560962494959, + "grad_norm": 1.0806640602822881e-05, + "kl": 14.009375, + "learning_rate": 2.5057171102624623e-07, + "loss": 0.5605, + "reward": 0.06075210571289062, + "reward_std": 0.010957903016242198, + "rewards/reward_func_1": 0.06075210571289062, + "step": 3480 + }, + { + "completion_length": 2.0, + "epoch": 0.9369538916521037, + "grad_norm": 7.847236702218652e-06, + "kl": 13.5546875, + "learning_rate": 2.4023895074563266e-07, + "loss": 0.5421, + "reward": 0.05129318237304688, + "reward_std": 0.011580966626206645, + "rewards/reward_func_1": 0.05129318237304688, + "step": 3485 + }, + { + "completion_length": 2.0, + "epoch": 0.9382981583546176, + "grad_norm": 1.2028275705233682e-05, + "kl": 13.9296875, + "learning_rate": 2.3012115495123944e-07, + "loss": 0.5573, + "reward": 0.060091400146484376, + "reward_std": 0.012295650782471057, + "rewards/reward_func_1": 0.060091400146484376, + "step": 3490 + }, + { + "completion_length": 2.0, + "epoch": 0.9396424250571314, + "grad_norm": 4.723935944639379e-06, + "kl": 13.775, + "learning_rate": 2.2021854649347696e-07, + "loss": 0.551, + "reward": 0.06290969848632813, + "reward_std": 0.01066894597352075, + "rewards/reward_func_1": 0.06290969848632813, + "step": 3495 + }, + { + "completion_length": 2.0, + "epoch": 0.9409866917596451, + "grad_norm": 2.703375912460615e-06, + "kl": 14.0421875, + "learning_rate": 2.105313434831302e-07, + "loss": 0.5619, + "reward": 0.061441230773925784, + "reward_std": 0.01024222782725701, + "rewards/reward_func_1": 0.061441230773925784, + "step": 3500 + }, + { + "completion_length": 2.0, + "epoch": 0.9423309584621589, + "grad_norm": 5.110204710945254e-06, + "kl": 14.0078125, + "learning_rate": 2.0105975928655154e-07, + "loss": 0.5603, + "reward": 0.05853328704833984, + "reward_std": 0.011592859703523573, + "rewards/reward_func_1": 0.05853328704833984, + "step": 3505 + }, + { + "completion_length": 2.0, + "epoch": 0.9436752251646727, + "grad_norm": 2.7313865302858176e-06, + "kl": 13.821875, + "learning_rate": 1.9180400252096332e-07, + "loss": 0.5529, + "reward": 0.05967512130737305, + "reward_std": 0.010000402305377066, + "rewards/reward_func_1": 0.05967512130737305, + "step": 3510 + }, + { + "completion_length": 2.0, + "epoch": 0.9450194918671865, + "grad_norm": 5.0758940233208705e-06, + "kl": 14.1890625, + "learning_rate": 1.8276427704985944e-07, + "loss": 0.5674, + "reward": 0.06556577682495117, + "reward_std": 0.01243010827965918, + "rewards/reward_func_1": 0.06556577682495117, + "step": 3515 + }, + { + "completion_length": 2.0, + "epoch": 0.9463637585697002, + "grad_norm": 4.3392617953941226e-05, + "kl": 13.828125, + "learning_rate": 1.7394078197851883e-07, + "loss": 0.5531, + "reward": 0.06168599128723144, + "reward_std": 0.011275436536379857, + "rewards/reward_func_1": 0.06168599128723144, + "step": 3520 + }, + { + "completion_length": 2.0, + "epoch": 0.947708025272214, + "grad_norm": 3.3501562484161695e-06, + "kl": 13.7171875, + "learning_rate": 1.6533371164961675e-07, + "loss": 0.5485, + "reward": 0.05614547729492188, + "reward_std": 0.012279793498601066, + "rewards/reward_func_1": 0.05614547729492188, + "step": 3525 + }, + { + "completion_length": 2.0, + "epoch": 0.9490522919747277, + "grad_norm": 1.6229182620008942e-06, + "kl": 13.9203125, + "learning_rate": 1.569432556389494e-07, + "loss": 0.5568, + "reward": 0.059717750549316405, + "reward_std": 0.010331755940569565, + "rewards/reward_func_1": 0.059717750549316405, + "step": 3530 + }, + { + "completion_length": 2.0, + "epoch": 0.9503965586772416, + "grad_norm": 4.219915808789665e-06, + "kl": 14.1, + "learning_rate": 1.4876959875125163e-07, + "loss": 0.5642, + "reward": 0.06116485595703125, + "reward_std": 0.010313475892326096, + "rewards/reward_func_1": 0.06116485595703125, + "step": 3535 + }, + { + "completion_length": 2.0, + "epoch": 0.9517408253797554, + "grad_norm": 2.8452825517888414e-06, + "kl": 13.8171875, + "learning_rate": 1.4081292101613241e-07, + "loss": 0.5527, + "reward": 0.05729732513427734, + "reward_std": 0.010209962361841463, + "rewards/reward_func_1": 0.05729732513427734, + "step": 3540 + }, + { + "completion_length": 2.0, + "epoch": 0.9530850920822691, + "grad_norm": 7.0745595621701796e-06, + "kl": 13.7015625, + "learning_rate": 1.3307339768410365e-07, + "loss": 0.5482, + "reward": 0.05454435348510742, + "reward_std": 0.011191799660446122, + "rewards/reward_func_1": 0.05454435348510742, + "step": 3545 + }, + { + "completion_length": 2.0, + "epoch": 0.9544293587847829, + "grad_norm": 5.309683274390409e-06, + "kl": 14.0046875, + "learning_rate": 1.2555119922272762e-07, + "loss": 0.56, + "reward": 0.060787296295166014, + "reward_std": 0.011707495429436676, + "rewards/reward_func_1": 0.060787296295166014, + "step": 3550 + }, + { + "completion_length": 2.0, + "epoch": 0.9557736254872967, + "grad_norm": 2.6409459223941667e-06, + "kl": 13.8515625, + "learning_rate": 1.182464913128556e-07, + "loss": 0.5541, + "reward": 0.05850715637207031, + "reward_std": 0.011938859339716145, + "rewards/reward_func_1": 0.05850715637207031, + "step": 3555 + }, + { + "completion_length": 2.0, + "epoch": 0.9571178921898105, + "grad_norm": 2.5191229724441655e-05, + "kl": 13.9328125, + "learning_rate": 1.1115943484498292e-07, + "loss": 0.5573, + "reward": 0.060850906372070315, + "reward_std": 0.008408385679285858, + "rewards/reward_func_1": 0.060850906372070315, + "step": 3560 + }, + { + "completion_length": 2.0, + "epoch": 0.9584621588923242, + "grad_norm": 2.427203526167432e-06, + "kl": 13.684375, + "learning_rate": 1.0429018591570195e-07, + "loss": 0.5472, + "reward": 0.062432861328125, + "reward_std": 0.011829619569471105, + "rewards/reward_func_1": 0.062432861328125, + "step": 3565 + }, + { + "completion_length": 2.0, + "epoch": 0.959806425594838, + "grad_norm": 6.474336259998381e-06, + "kl": 13.7390625, + "learning_rate": 9.7638895824268e-08, + "loss": 0.5494, + "reward": 0.06272506713867188, + "reward_std": 0.010429763507272583, + "rewards/reward_func_1": 0.06272506713867188, + "step": 3570 + }, + { + "completion_length": 2.0, + "epoch": 0.9611506922973518, + "grad_norm": 3.807508164754836e-06, + "kl": 13.89375, + "learning_rate": 9.120571106926212e-08, + "loss": 0.5563, + "reward": 0.05861101150512695, + "reward_std": 0.012294219210161828, + "rewards/reward_func_1": 0.05861101150512695, + "step": 3575 + }, + { + "completion_length": 2.0, + "epoch": 0.9624949589998656, + "grad_norm": 2.37896620092215e-06, + "kl": 14.1109375, + "learning_rate": 8.499077334536921e-08, + "loss": 0.5646, + "reward": 0.0572235107421875, + "reward_std": 0.01089623533844133, + "rewards/reward_func_1": 0.0572235107421875, + "step": 3580 + }, + { + "completion_length": 2.0, + "epoch": 0.9638392257023793, + "grad_norm": 3.802741503022844e-06, + "kl": 13.753125, + "learning_rate": 7.899421954025266e-08, + "loss": 0.5501, + "reward": 0.0643655776977539, + "reward_std": 0.010973099654074758, + "rewards/reward_func_1": 0.0643655776977539, + "step": 3585 + }, + { + "completion_length": 2.0, + "epoch": 0.9651834924048931, + "grad_norm": 5.4865031415829435e-06, + "kl": 13.7015625, + "learning_rate": 7.321618173154466e-08, + "loss": 0.5481, + "reward": 0.05659542083740234, + "reward_std": 0.011230782363782055, + "rewards/reward_func_1": 0.05659542083740234, + "step": 3590 + }, + { + "completion_length": 2.0, + "epoch": 0.966527759107407, + "grad_norm": 1.9297044673294295e-06, + "kl": 14.0859375, + "learning_rate": 6.765678718392843e-08, + "loss": 0.5633, + "reward": 0.0672616958618164, + "reward_std": 0.013510283493087628, + "rewards/reward_func_1": 0.0672616958618164, + "step": 3595 + }, + { + "completion_length": 2.0, + "epoch": 0.9678720258099207, + "grad_norm": 3.233315283068805e-06, + "kl": 13.9375, + "learning_rate": 6.231615834634497e-08, + "loss": 0.5572, + "reward": 0.06126976013183594, + "reward_std": 0.010508609988391981, + "rewards/reward_func_1": 0.06126976013183594, + "step": 3600 + }, + { + "completion_length": 2.0, + "epoch": 0.9692162925124345, + "grad_norm": 3.4369802506262204e-06, + "kl": 13.734375, + "learning_rate": 5.719441284929073e-08, + "loss": 0.5495, + "reward": 0.06183929443359375, + "reward_std": 0.009256316086975858, + "rewards/reward_func_1": 0.06183929443359375, + "step": 3605 + }, + { + "completion_length": 2.0, + "epoch": 0.9705605592149482, + "grad_norm": 2.0105403564230073e-06, + "kl": 13.815625, + "learning_rate": 5.229166350222747e-08, + "loss": 0.5525, + "reward": 0.06485710144042969, + "reward_std": 0.010634588305765646, + "rewards/reward_func_1": 0.06485710144042969, + "step": 3610 + }, + { + "completion_length": 2.0, + "epoch": 0.9719048259174621, + "grad_norm": 1.3547062735597137e-05, + "kl": 13.8625, + "learning_rate": 4.760801829109763e-08, + "loss": 0.5546, + "reward": 0.0634115219116211, + "reward_std": 0.010813204231817508, + "rewards/reward_func_1": 0.0634115219116211, + "step": 3615 + }, + { + "completion_length": 2.0, + "epoch": 0.9732490926199758, + "grad_norm": 1.8916140334113152e-06, + "kl": 13.721875, + "learning_rate": 4.3143580375945016e-08, + "loss": 0.549, + "reward": 0.05869293212890625, + "reward_std": 0.010853508513537235, + "rewards/reward_func_1": 0.05869293212890625, + "step": 3620 + }, + { + "completion_length": 2.0, + "epoch": 0.9745933593224896, + "grad_norm": 4.1851594687614124e-06, + "kl": 13.4625, + "learning_rate": 3.889844808864451e-08, + "loss": 0.5387, + "reward": 0.059955787658691403, + "reward_std": 0.010277136050717672, + "rewards/reward_func_1": 0.059955787658691403, + "step": 3625 + }, + { + "completion_length": 2.0, + "epoch": 0.9759376260250033, + "grad_norm": 7.070749688864453e-06, + "kl": 13.9234375, + "learning_rate": 3.487271493073596e-08, + "loss": 0.5572, + "reward": 0.0633173942565918, + "reward_std": 0.008260493339184905, + "rewards/reward_func_1": 0.0633173942565918, + "step": 3630 + }, + { + "completion_length": 2.0, + "epoch": 0.9772818927275171, + "grad_norm": 2.236297405033838e-06, + "kl": 13.834375, + "learning_rate": 3.106646957136472e-08, + "loss": 0.5532, + "reward": 0.06124534606933594, + "reward_std": 0.011660033430234761, + "rewards/reward_func_1": 0.06124534606933594, + "step": 3635 + }, + { + "completion_length": 2.0, + "epoch": 0.978626159430031, + "grad_norm": 2.2983462258707732e-05, + "kl": 13.9046875, + "learning_rate": 2.7479795845324342e-08, + "loss": 0.5563, + "reward": 0.05750617980957031, + "reward_std": 0.01097970688406349, + "rewards/reward_func_1": 0.05750617980957031, + "step": 3640 + }, + { + "completion_length": 2.0, + "epoch": 0.9799704261325447, + "grad_norm": 3.4399413380015176e-06, + "kl": 13.965625, + "learning_rate": 2.411277275121915e-08, + "loss": 0.5586, + "reward": 0.062087726593017575, + "reward_std": 0.016095589974429458, + "rewards/reward_func_1": 0.062087726593017575, + "step": 3645 + }, + { + "completion_length": 2.0, + "epoch": 0.9813146928350585, + "grad_norm": 5.6823751037882175e-06, + "kl": 13.8625, + "learning_rate": 2.096547444971453e-08, + "loss": 0.5547, + "reward": 0.06416492462158203, + "reward_std": 0.012935893231770024, + "rewards/reward_func_1": 0.06416492462158203, + "step": 3650 + }, + { + "completion_length": 2.0, + "epoch": 0.9826589595375722, + "grad_norm": 3.1565050448989496e-06, + "kl": 13.8734375, + "learning_rate": 1.8037970261909343e-08, + "loss": 0.5549, + "reward": 0.060312080383300784, + "reward_std": 0.01092442618610221, + "rewards/reward_func_1": 0.060312080383300784, + "step": 3655 + }, + { + "completion_length": 2.0, + "epoch": 0.9840032262400861, + "grad_norm": 3.1137699352257187e-06, + "kl": 13.7796875, + "learning_rate": 1.533032466780826e-08, + "loss": 0.5516, + "reward": 0.06079998016357422, + "reward_std": 0.011588454757293221, + "rewards/reward_func_1": 0.06079998016357422, + "step": 3660 + }, + { + "completion_length": 2.0, + "epoch": 0.9853474929425998, + "grad_norm": 1.9204537238692865e-05, + "kl": 13.83125, + "learning_rate": 1.2842597304901783e-08, + "loss": 0.5532, + "reward": 0.05525150299072266, + "reward_std": 0.008975948392617283, + "rewards/reward_func_1": 0.05525150299072266, + "step": 3665 + }, + { + "completion_length": 2.0, + "epoch": 0.9866917596451136, + "grad_norm": 3.472564230833086e-06, + "kl": 13.6734375, + "learning_rate": 1.057484296684841e-08, + "loss": 0.5469, + "reward": 0.06324386596679688, + "reward_std": 0.013345763047982472, + "rewards/reward_func_1": 0.06324386596679688, + "step": 3670 + }, + { + "completion_length": 2.0, + "epoch": 0.9880360263476273, + "grad_norm": 2.0865973056061193e-06, + "kl": 13.6015625, + "learning_rate": 8.527111602273375e-09, + "loss": 0.5439, + "reward": 0.057076644897460935, + "reward_std": 0.01272732454162906, + "rewards/reward_func_1": 0.057076644897460935, + "step": 3675 + }, + { + "completion_length": 2.0, + "epoch": 0.9893802930501412, + "grad_norm": 5.3152875807427336e-06, + "kl": 13.7734375, + "learning_rate": 6.699448313668422e-09, + "loss": 0.5508, + "reward": 0.06054000854492188, + "reward_std": 0.010864520556788193, + "rewards/reward_func_1": 0.06054000854492188, + "step": 3680 + }, + { + "completion_length": 2.0, + "epoch": 0.9907245597526549, + "grad_norm": 1.824535843297781e-06, + "kl": 14.0265625, + "learning_rate": 5.0918933563914866e-09, + "loss": 0.561, + "reward": 0.05926017761230469, + "reward_std": 0.011259193480873364, + "rewards/reward_func_1": 0.05926017761230469, + "step": 3685 + }, + { + "completion_length": 2.0, + "epoch": 0.9920688264551687, + "grad_norm": 3.396847887415788e-06, + "kl": 13.934375, + "learning_rate": 3.7044821377896225e-09, + "loss": 0.5574, + "reward": 0.058501815795898436, + "reward_std": 0.010197188252641353, + "rewards/reward_func_1": 0.058501815795898436, + "step": 3690 + }, + { + "completion_length": 2.0, + "epoch": 0.9934130931576824, + "grad_norm": 2.0717141069326317e-06, + "kl": 13.6671875, + "learning_rate": 2.537245216410744e-09, + "loss": 0.5469, + "reward": 0.06255474090576171, + "reward_std": 0.010947992081491975, + "rewards/reward_func_1": 0.06255474090576171, + "step": 3695 + }, + { + "completion_length": 2.0, + "epoch": 0.9947573598601963, + "grad_norm": 4.1668949961604085e-06, + "kl": 13.8796875, + "learning_rate": 1.590208301335272e-09, + "loss": 0.5552, + "reward": 0.06215476989746094, + "reward_std": 0.011439351307490141, + "rewards/reward_func_1": 0.06215476989746094, + "step": 3700 + }, + { + "completion_length": 2.0, + "epoch": 0.9961016265627101, + "grad_norm": 4.461783646547701e-06, + "kl": 14.01875, + "learning_rate": 8.633922516110283e-10, + "loss": 0.5609, + "reward": 0.05896968841552734, + "reward_std": 0.01078853727231035, + "rewards/reward_func_1": 0.05896968841552734, + "step": 3705 + }, + { + "completion_length": 2.0, + "epoch": 0.9974458932652238, + "grad_norm": 3.0964040433900664e-06, + "kl": 13.6203125, + "learning_rate": 3.568130757880539e-10, + "loss": 0.545, + "reward": 0.056448173522949216, + "reward_std": 0.009608041982573923, + "rewards/reward_func_1": 0.056448173522949216, + "step": 3710 + }, + { + "completion_length": 2.0, + "epoch": 0.9987901599677376, + "grad_norm": 6.054201548977289e-06, + "kl": 13.7796875, + "learning_rate": 7.048193157221939e-11, + "loss": 0.5512, + "reward": 0.06598529815673829, + "reward_std": 0.011622372209239984, + "rewards/reward_func_1": 0.06598529815673829, + "step": 3715 + }, + { + "completion_length": 2.0, + "epoch": 0.9998655733297486, + "kl": 13.673828125, + "reward": 0.06168794631958008, + "reward_std": 0.012013631283480208, + "rewards/reward_func_1": 0.06168794631958008, + "step": 3719, + "total_flos": 0.0, + "train_loss": 604316576171.3622, + "train_runtime": 47153.6794, + "train_samples_per_second": 1.262, + "train_steps_per_second": 0.079 + } + ], + "logging_steps": 5, + "max_steps": 3719, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}