diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.112048827111721, + "advantage_mean": -2.2972623692218974e-08, + "advantage_min": -1.0226236283779144, + "advantage_std": 0.809523094445467, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.14223624765872955, + "kl": 0.0, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2e-08, + "loss": 0.0762, + "reward": 0.383966077119112, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.383966077119112, + "reward_after_std": 0.8095231093466282, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.000140361487865448, + "reward_change_mean": -0.10579865705221891, + "reward_change_min": -0.2073100507259369, + "reward_change_std": 0.08411919022910297, + "reward_std": 0.8095231391489506, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 0.5256818607449532, + "advantage_mean": -1.7384688744126464e-08, + "advantage_min": -0.5523970872163773, + "advantage_std": 0.42011943086981773, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.0626918151974678, + "kl": 0.0, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4e-08, + "loss": 0.0254, + "reward": 0.17750850692391396, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17750850692391396, + "reward_after_std": 0.42011942341923714, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.0003265589475631714, + "reward_change_mean": -0.09788906387984753, + "reward_change_min": -0.1594111192971468, + "reward_change_std": 0.06503142253495753, + "reward_std": 0.42011944204568863, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 0.855322539806366, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.6352410092949867, + "advantage_std": 0.5779101513326168, + "completion_length": 3328.5416717529297, + "epoch": 0.0034285714285714284, + "grad_norm": 0.09340520948171616, + "kl": 4.819035530090332e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6e-08, + "loss": -0.0049, + "reward": -0.1236064094118774, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1236064094118774, + "reward_after_std": 0.5779101625084877, + "reward_before_mean": -0.05990193039178848, + "reward_before_std": 0.5883516669273376, + "reward_change_max": 0.0, + "reward_change_mean": -0.06370447622612119, + "reward_change_min": -0.1256130812689662, + "reward_change_std": 0.05005356459878385, + "reward_std": 0.5779101811349392, + "rewards/cosine_scaled_reward": -0.13411763461772352, + "rewards/format_reward": 0.2083333358168602, + "step": 3 + }, + { + "advantage_max": 1.494198601692915, + "advantage_mean": -2.1730860888524717e-08, + "advantage_min": -0.9187480844557285, + "advantage_std": 0.9113606847822666, + "completion_length": 2199.1667098999023, + "epoch": 0.004571428571428572, + "grad_norm": 0.1944754272699356, + "kl": 3.168359398841858e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8e-08, + "loss": 0.0598, + "reward": 0.5572676844894886, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5572676844894886, + "reward_after_std": 0.9113606996834278, + "reward_before_mean": 0.672414306551218, + "reward_before_std": 0.9033316560089588, + "reward_change_max": 7.709860801696777e-05, + "reward_change_mean": -0.11514659691601992, + "reward_change_min": -0.20749736204743385, + "reward_change_std": 0.07951459102332592, + "reward_std": 0.9113607443869114, + "rewards/cosine_scaled_reward": -0.017959539778530598, + "rewards/format_reward": 0.7083333432674408, + "step": 4 + }, + { + "advantage_max": 1.2024615556001663, + "advantage_mean": 7.140139812733537e-09, + "advantage_min": -0.6320518404245377, + "advantage_std": 0.6965500302612782, + "completion_length": 3183.3958740234375, + "epoch": 0.005714285714285714, + "grad_norm": 0.12876158952713013, + "kl": 4.059821367263794e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1e-07, + "loss": 0.04, + "reward": -0.1184095498174429, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1184095498174429, + "reward_after_std": 0.6965500302612782, + "reward_before_mean": -0.06059689383255318, + "reward_before_std": 0.6977124996483326, + "reward_change_max": 0.00010875612497329712, + "reward_change_mean": -0.05781265441328287, + "reward_change_min": -0.12108294386416674, + "reward_change_std": 0.04674133914522827, + "reward_std": 0.6965500451624393, + "rewards/cosine_scaled_reward": -0.21779845468699932, + "rewards/format_reward": 0.37500000558793545, + "step": 5 + }, + { + "advantage_max": 1.084119837731123, + "advantage_mean": -1.2417634476236117e-08, + "advantage_min": -0.7321569994091988, + "advantage_std": 0.7003737837076187, + "completion_length": 3075.5833892822266, + "epoch": 0.006857142857142857, + "grad_norm": 0.1684904843568802, + "kl": 4.678219556808472e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2e-07, + "loss": 0.0172, + "reward": 0.0019676077645272017, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0019676077645272017, + "reward_after_std": 0.700373774394393, + "reward_before_mean": 0.07271714322268963, + "reward_before_std": 0.708720869384706, + "reward_change_max": 0.00048591941595077515, + "reward_change_mean": -0.07074952917173505, + "reward_change_min": -0.14616843592375517, + "reward_change_std": 0.06034345901571214, + "reward_std": 0.7003737911581993, + "rewards/cosine_scaled_reward": -0.13030810561031103, + "rewards/format_reward": 0.3333333395421505, + "step": 6 + }, + { + "advantage_max": 1.569422885775566, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -1.1949431672692299, + "advantage_std": 1.095862664282322, + "completion_length": 3141.2084197998047, + "epoch": 0.008, + "grad_norm": 0.21224753558635712, + "kl": 3.159046173095703e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4e-07, + "loss": 0.0914, + "reward": 0.26081612706184387, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26081612706184387, + "reward_after_std": 1.0958626568317413, + "reward_before_mean": 0.3473575795069337, + "reward_before_std": 1.130651280283928, + "reward_change_max": 9.550899267196655e-05, + "reward_change_mean": -0.08654144563479349, + "reward_change_min": -0.18773514311760664, + "reward_change_std": 0.08604789082892239, + "reward_std": 1.0958627052605152, + "rewards/cosine_scaled_reward": -0.045071213971823454, + "rewards/format_reward": 0.43750000558793545, + "step": 7 + }, + { + "advantage_max": 1.1458450332283974, + "advantage_mean": -3.849466767569254e-08, + "advantage_min": -0.843430656939745, + "advantage_std": 0.7730266377329826, + "completion_length": 2811.3333740234375, + "epoch": 0.009142857142857144, + "grad_norm": 0.13289576768875122, + "kl": 1.934915781021118e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6e-07, + "loss": 0.0725, + "reward": 0.418302733451128, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.418302733451128, + "reward_after_std": 0.7730266749858856, + "reward_before_mean": 0.5264986455440521, + "reward_before_std": 0.7791682276874781, + "reward_change_max": 6.0439109802246094e-05, + "reward_change_mean": -0.10819594142958522, + "reward_change_min": -0.2019729232415557, + "reward_change_std": 0.08076721499674022, + "reward_std": 0.7730266973376274, + "rewards/cosine_scaled_reward": 0.06533264694735408, + "rewards/format_reward": 0.39583333767950535, + "step": 8 + }, + { + "advantage_max": 0.9698346555233002, + "advantage_mean": 9.002785295031401e-09, + "advantage_min": -0.8632857594639063, + "advantage_std": 0.7153309807181358, + "completion_length": 3139.7709045410156, + "epoch": 0.010285714285714285, + "grad_norm": 0.1402571052312851, + "kl": 4.3511390686035156e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8e-07, + "loss": 0.0534, + "reward": 0.06348151830025017, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06348151830025017, + "reward_after_std": 0.715330995619297, + "reward_before_mean": 0.14234677236527205, + "reward_before_std": 0.7345745638012886, + "reward_change_max": 0.0003286302089691162, + "reward_change_mean": -0.07886525010690093, + "reward_change_min": -0.1620482699945569, + "reward_change_std": 0.06977574108168483, + "reward_std": 0.7153310105204582, + "rewards/cosine_scaled_reward": -0.13715994078665972, + "rewards/format_reward": 0.416666679084301, + "step": 9 + }, + { + "advantage_max": 1.0015601068735123, + "advantage_mean": -1.2417630257388623e-09, + "advantage_min": -0.5780720449984074, + "advantage_std": 0.6456005871295929, + "completion_length": 2626.4791831970215, + "epoch": 0.011428571428571429, + "grad_norm": 0.07943809777498245, + "kl": 3.2432377338409424e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2e-07, + "loss": 0.0169, + "reward": -0.04100732016377151, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.04100732016377151, + "reward_after_std": 0.6456005834043026, + "reward_before_mean": 0.02712206542491913, + "reward_before_std": 0.6530085504055023, + "reward_change_max": 0.00024922192096710205, + "reward_change_mean": -0.06812940072268248, + "reward_change_min": -0.14841055870056152, + "reward_change_std": 0.05704877176322043, + "reward_std": 0.6456005908548832, + "rewards/cosine_scaled_reward": -0.18435563705861568, + "rewards/format_reward": 0.39583333767950535, + "step": 10 + }, + { + "advantage_max": 1.211370985955, + "advantage_mean": -6.829699028543246e-09, + "advantage_min": -0.7788897380232811, + "advantage_std": 0.8170069120824337, + "completion_length": 3313.9791870117188, + "epoch": 0.012571428571428572, + "grad_norm": 0.16173361241817474, + "kl": 3.5762786865234375e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.1018, + "reward": -0.009695451706647873, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.009695451706647873, + "reward_after_std": 0.8170069074258208, + "reward_before_mean": 0.057828957214951515, + "reward_before_std": 0.8377649770118296, + "reward_change_max": 7.179379463195801e-05, + "reward_change_mean": -0.06752440868876874, + "reward_change_min": -0.17309920210391283, + "reward_change_std": 0.06957878917455673, + "reward_std": 0.8170069148764014, + "rewards/cosine_scaled_reward": -0.09608553373254836, + "rewards/format_reward": 0.25000000558793545, + "step": 11 + }, + { + "advantage_max": 1.1693915463984013, + "advantage_mean": -2.6697914712325854e-08, + "advantage_min": -0.9561029076576233, + "advantage_std": 0.811566423624754, + "completion_length": 2493.3959197998047, + "epoch": 0.013714285714285714, + "grad_norm": 0.11758553236722946, + "kl": 3.711692988872528e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.4e-07, + "loss": 0.0303, + "reward": 0.4016623003408313, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4016623003408313, + "reward_after_std": 0.8115664124488831, + "reward_before_mean": 0.5080796424299479, + "reward_before_std": 0.8227832280099392, + "reward_change_max": 0.0005908533930778503, + "reward_change_mean": -0.10641733650118113, + "reward_change_min": -0.20470311492681503, + "reward_change_std": 0.08011856814846396, + "reward_std": 0.811566423624754, + "rewards/cosine_scaled_reward": -0.06887686066329479, + "rewards/format_reward": 0.6458333469927311, + "step": 12 + }, + { + "advantage_max": 0.9242342077195644, + "advantage_mean": 1.024454859832602e-08, + "advantage_min": -0.8469453305006027, + "advantage_std": 0.6706404201686382, + "completion_length": 2971.375030517578, + "epoch": 0.014857142857142857, + "grad_norm": 0.11529818177223206, + "kl": 3.364682197570801e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.6e-07, + "loss": 0.0336, + "reward": 0.17752011120319366, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17752011120319366, + "reward_after_std": 0.6706404276192188, + "reward_before_mean": 0.267868370981887, + "reward_before_std": 0.6849480085074902, + "reward_change_max": 7.892400026321411e-05, + "reward_change_mean": -0.09034825977869332, + "reward_change_min": -0.1780348438769579, + "reward_change_std": 0.07077532494440675, + "reward_std": 0.6706404536962509, + "rewards/cosine_scaled_reward": -0.07439915277063847, + "rewards/format_reward": 0.41666667722165585, + "step": 13 + }, + { + "advantage_max": 0.9090108498930931, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -0.8783823177218437, + "advantage_std": 0.6800604276359081, + "completion_length": 2952.2708892822266, + "epoch": 0.016, + "grad_norm": 0.13060258328914642, + "kl": 2.4463282898068428e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.8e-07, + "loss": 0.0531, + "reward": 0.250303965061903, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.250303965061903, + "reward_after_std": 0.6800604052841663, + "reward_before_mean": 0.34783143922686577, + "reward_before_std": 0.6966002192348242, + "reward_change_max": 0.0002243444323539734, + "reward_change_mean": -0.09752750615007244, + "reward_change_min": -0.18189744092524052, + "reward_change_std": 0.07319995859870687, + "reward_std": 0.6800604090094566, + "rewards/cosine_scaled_reward": -0.024000946432352066, + "rewards/format_reward": 0.39583334513008595, + "step": 14 + }, + { + "advantage_max": 0.8086681701242924, + "advantage_mean": -1.0554989271494009e-08, + "advantage_min": -0.8142276704311371, + "advantage_std": 0.6346138492226601, + "completion_length": 2722.041702270508, + "epoch": 0.017142857142857144, + "grad_norm": 0.06529524177312851, + "kl": 1.6065314412117004e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3e-07, + "loss": 0.0207, + "reward": 0.2515428556362167, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2515428556362167, + "reward_after_std": 0.6346138343214989, + "reward_before_mean": 0.35106278862804174, + "reward_before_std": 0.6507196612656116, + "reward_change_max": 0.00019127130508422852, + "reward_change_mean": -0.09951994521543384, + "reward_change_min": -0.1813431465998292, + "reward_change_std": 0.0730478495825082, + "reward_std": 0.6346138529479504, + "rewards/cosine_scaled_reward": -0.02238526940345764, + "rewards/format_reward": 0.39583333767950535, + "step": 15 + }, + { + "advantage_max": 1.0669633708894253, + "advantage_mean": 2.2972624136308184e-08, + "advantage_min": -0.6060903668403625, + "advantage_std": 0.672724723815918, + "completion_length": 3512.1666870117188, + "epoch": 0.018285714285714287, + "grad_norm": 0.12345141172409058, + "kl": 4.026293754577637e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.2e-07, + "loss": 0.0155, + "reward": -0.3499115873128176, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3499115873128176, + "reward_after_std": 0.6727247424423695, + "reward_before_mean": -0.31081429310142994, + "reward_before_std": 0.68895673006773, + "reward_change_max": 0.00038511306047439575, + "reward_change_mean": -0.03909728996222839, + "reward_change_min": -0.10860877297818661, + "reward_change_std": 0.04502260871231556, + "reward_std": 0.6727247759699821, + "rewards/cosine_scaled_reward": -0.20749048702418804, + "rewards/format_reward": 0.10416666977107525, + "step": 16 + }, + { + "advantage_max": 0.855644017457962, + "advantage_mean": -1.4280279181377864e-08, + "advantage_min": -0.8395511396229267, + "advantage_std": 0.6393131166696548, + "completion_length": 2412.354179382324, + "epoch": 0.019428571428571427, + "grad_norm": 0.112917959690094, + "kl": 3.923475742340088e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0264, + "reward": 0.4075395166873932, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4075395166873932, + "reward_after_std": 0.6393131166696548, + "reward_before_mean": 0.5204492211341858, + "reward_before_std": 0.6488973777741194, + "reward_change_max": 0.00010866671800613403, + "reward_change_mean": -0.11290970025584102, + "reward_change_min": -0.21267955005168915, + "reward_change_std": 0.07953456928953528, + "reward_std": 0.639313155785203, + "rewards/cosine_scaled_reward": -0.02102540386840701, + "rewards/format_reward": 0.5625000074505806, + "step": 17 + }, + { + "advantage_max": 1.1588939391076565, + "advantage_mean": -1.3038515989105548e-08, + "advantage_min": -0.6808314770460129, + "advantage_std": 0.7133111041039228, + "completion_length": 2964.750030517578, + "epoch": 0.02057142857142857, + "grad_norm": 0.18582671880722046, + "kl": 2.3312866687774658e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.6e-07, + "loss": 0.0591, + "reward": 0.21607510233297944, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21607510233297944, + "reward_after_std": 0.713311119005084, + "reward_before_mean": 0.3058173172175884, + "reward_before_std": 0.7081319317221642, + "reward_change_max": 0.0002251937985420227, + "reward_change_mean": -0.08974221721291542, + "reward_change_min": -0.17165155429393053, + "reward_change_std": 0.06901370617561042, + "reward_std": 0.7133111339062452, + "rewards/cosine_scaled_reward": -0.04500802047550678, + "rewards/format_reward": 0.39583333767950535, + "step": 18 + }, + { + "advantage_max": 1.3655128255486488, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -0.9774691089987755, + "advantage_std": 0.9219000674784184, + "completion_length": 2742.479232788086, + "epoch": 0.021714285714285714, + "grad_norm": 0.16909454762935638, + "kl": 2.4370849132537842e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0466, + "reward": 0.8763818801380694, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8763818801380694, + "reward_after_std": 0.921900074928999, + "reward_before_mean": 1.0231477841734886, + "reward_before_std": 0.921125877648592, + "reward_change_max": 0.0003154948353767395, + "reward_change_mean": -0.1467658975161612, + "reward_change_min": -0.25718063302338123, + "reward_change_std": 0.10302350029814988, + "reward_std": 0.9219001047313213, + "rewards/cosine_scaled_reward": 0.23032389022409916, + "rewards/format_reward": 0.562500013038516, + "step": 19 + }, + { + "advantage_max": 1.1179804876446724, + "advantage_mean": -2.9181441429937394e-08, + "advantage_min": -0.9255211316049099, + "advantage_std": 0.8370347116142511, + "completion_length": 2553.62508392334, + "epoch": 0.022857142857142857, + "grad_norm": 0.15487366914749146, + "kl": 1.2110918760299683e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4e-07, + "loss": 0.1127, + "reward": 0.6232295744121075, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6232295744121075, + "reward_after_std": 0.8370346948504448, + "reward_before_mean": 0.7511535082012415, + "reward_before_std": 0.8457692014053464, + "reward_change_max": 0.00010447204113006592, + "reward_change_mean": -0.12792393937706947, + "reward_change_min": -0.24280552193522453, + "reward_change_std": 0.10140588087961078, + "reward_std": 0.8370347060263157, + "rewards/cosine_scaled_reward": 0.05266007035970688, + "rewards/format_reward": 0.6458333395421505, + "step": 20 + }, + { + "advantage_max": 0.8383369371294975, + "advantage_mean": -6.208816238917336e-10, + "advantage_min": -0.7214223481714725, + "advantage_std": 0.5726866647601128, + "completion_length": 2885.1250610351562, + "epoch": 0.024, + "grad_norm": 0.09243390709161758, + "kl": 3.4226104617118835e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0314, + "reward": 0.13582478649914265, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13582478649914265, + "reward_after_std": 0.5726866982877254, + "reward_before_mean": 0.22373342886567116, + "reward_before_std": 0.5761883333325386, + "reward_change_max": 0.0, + "reward_change_mean": -0.08790865843184292, + "reward_change_min": -0.14482644200325012, + "reward_change_std": 0.06016037776134908, + "reward_std": 0.5726867038756609, + "rewards/cosine_scaled_reward": -0.08604994229972363, + "rewards/format_reward": 0.39583334513008595, + "step": 21 + }, + { + "advantage_max": 0.9808385744690895, + "advantage_mean": -2.6077032422300306e-08, + "advantage_min": -0.7824203744530678, + "advantage_std": 0.6632657460868359, + "completion_length": 1853.5833587646484, + "epoch": 0.025142857142857144, + "grad_norm": 0.15798239409923553, + "kl": 3.5993754863739014e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0177, + "reward": 0.6410952005535364, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6410952005535364, + "reward_after_std": 0.6632657647132874, + "reward_before_mean": 0.7731111478060484, + "reward_before_std": 0.6540880165994167, + "reward_change_max": 0.00019576400518417358, + "reward_change_mean": -0.1320159137248993, + "reward_change_min": -0.22317629400640726, + "reward_change_std": 0.0877161487005651, + "reward_std": 0.6632657833397388, + "rewards/cosine_scaled_reward": 0.021972209215164185, + "rewards/format_reward": 0.7291666753590107, + "step": 22 + }, + { + "advantage_max": 0.7594370692968369, + "advantage_mean": 6.829699139565548e-09, + "advantage_min": -0.5250892378389835, + "advantage_std": 0.4808049462735653, + "completion_length": 2642.6458740234375, + "epoch": 0.026285714285714287, + "grad_norm": 0.0673590898513794, + "kl": 3.293342888355255e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.6e-07, + "loss": 0.047, + "reward": -0.059821839444339275, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.059821839444339275, + "reward_after_std": 0.4808049499988556, + "reward_before_mean": 0.011269157752394676, + "reward_before_std": 0.47694574669003487, + "reward_change_max": 0.0, + "reward_change_mean": -0.07109098765067756, + "reward_change_min": -0.13114846032112837, + "reward_change_std": 0.048369639087468386, + "reward_std": 0.4808049723505974, + "rewards/cosine_scaled_reward": -0.19228209322318435, + "rewards/format_reward": 0.3958333358168602, + "step": 23 + }, + { + "advantage_max": 1.3633764162659645, + "advantage_mean": -7.450580596923828e-09, + "advantage_min": -1.2567225024104118, + "advantage_std": 1.080951388925314, + "completion_length": 3095.5833740234375, + "epoch": 0.027428571428571427, + "grad_norm": 0.18176376819610596, + "kl": 2.3409724235534668e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.8e-07, + "loss": 0.085, + "reward": 0.42115747928619385, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.42115747928619385, + "reward_after_std": 1.0809513852000237, + "reward_before_mean": 0.5261210352182388, + "reward_before_std": 1.12699656188488, + "reward_change_max": 9.399652481079102e-05, + "reward_change_mean": -0.10496355732902884, + "reward_change_min": -0.24463928490877151, + "reward_change_std": 0.10649053333327174, + "reward_std": 1.0809514187276363, + "rewards/cosine_scaled_reward": 0.03389384981710464, + "rewards/format_reward": 0.4583333432674408, + "step": 24 + }, + { + "advantage_max": 0.9184271581470966, + "advantage_mean": -2.2351742401394148e-08, + "advantage_min": -0.8210932016372681, + "advantage_std": 0.6771118529140949, + "completion_length": 2621.937530517578, + "epoch": 0.02857142857142857, + "grad_norm": 0.0775315910577774, + "kl": 3.3681513741612434e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5e-07, + "loss": 0.0084, + "reward": 0.45451467111706734, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.45451467111706734, + "reward_after_std": 0.6771118529140949, + "reward_before_mean": 0.5703998990356922, + "reward_before_std": 0.6853658109903336, + "reward_change_max": 0.0002382621169090271, + "reward_change_mean": -0.11588521907106042, + "reward_change_min": -0.2103752288967371, + "reward_change_std": 0.0814046454615891, + "reward_std": 0.6771118678152561, + "rewards/cosine_scaled_reward": 0.024783269211184233, + "rewards/format_reward": 0.520833333954215, + "step": 25 + }, + { + "advantage_max": 0.646996196359396, + "advantage_mean": -1.6453366169510986e-08, + "advantage_min": -0.7164939921349287, + "advantage_std": 0.5295879691839218, + "completion_length": 3004.125030517578, + "epoch": 0.029714285714285714, + "grad_norm": 0.08456138521432877, + "kl": 2.5331974029541016e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.2e-07, + "loss": 0.0187, + "reward": 0.19068976771086454, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19068976771086454, + "reward_after_std": 0.5295879682525992, + "reward_before_mean": 0.28746249340474606, + "reward_before_std": 0.541738043539226, + "reward_change_max": 0.0001690015196800232, + "reward_change_mean": -0.09677271894179285, + "reward_change_min": -0.17185449041426182, + "reward_change_std": 0.06900891847908497, + "reward_std": 0.5295879831537604, + "rewards/cosine_scaled_reward": -0.06460210494697094, + "rewards/format_reward": 0.416666679084301, + "step": 26 + }, + { + "advantage_max": 1.1920481473207474, + "advantage_mean": 1.8626450937198058e-09, + "advantage_min": -0.8214569389820099, + "advantage_std": 0.7810939662158489, + "completion_length": 2973.958396911621, + "epoch": 0.030857142857142857, + "grad_norm": 0.17237314581871033, + "kl": 2.349168062210083e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.4e-07, + "loss": 0.0396, + "reward": 0.12759432382881641, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12759432382881641, + "reward_after_std": 0.7810939662158489, + "reward_before_mean": 0.20839167991653085, + "reward_before_std": 0.7917383573949337, + "reward_change_max": 0.0, + "reward_change_mean": -0.08079732768237591, + "reward_change_min": -0.15045349579304457, + "reward_change_std": 0.06384936673566699, + "reward_std": 0.781093992292881, + "rewards/cosine_scaled_reward": -0.09372084098868072, + "rewards/format_reward": 0.39583333767950535, + "step": 27 + }, + { + "advantage_max": 0.9704475104808807, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.9775747060775757, + "advantage_std": 0.707570880651474, + "completion_length": 2929.395896911621, + "epoch": 0.032, + "grad_norm": 0.10196535289287567, + "kl": 1.9277911633253098e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.6e-07, + "loss": 0.0309, + "reward": 0.24632756784558296, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24632756784558296, + "reward_after_std": 0.7075709030032158, + "reward_before_mean": 0.34219497814774513, + "reward_before_std": 0.7241467647254467, + "reward_change_max": 0.0002382919192314148, + "reward_change_mean": -0.09586740791564807, + "reward_change_min": -0.16923209372907877, + "reward_change_std": 0.0729997109156102, + "reward_std": 0.7075709067285061, + "rewards/cosine_scaled_reward": -0.02681917743757367, + "rewards/format_reward": 0.39583334140479565, + "step": 28 + }, + { + "advantage_max": 0.8871082998812199, + "advantage_mean": 2.2817403591557373e-08, + "advantage_min": -0.593153104186058, + "advantage_std": 0.561270035803318, + "completion_length": 3434.416717529297, + "epoch": 0.03314285714285714, + "grad_norm": 0.11913178116083145, + "kl": 1.4252960681915283e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.8e-07, + "loss": 0.0478, + "reward": -0.35079627111554146, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35079627111554146, + "reward_after_std": 0.5612700264900923, + "reward_before_mean": -0.3084347862750292, + "reward_before_std": 0.5713800620287657, + "reward_change_max": 0.00014052540063858032, + "reward_change_mean": -0.042361461790278554, + "reward_change_min": -0.10619675740599632, + "reward_change_std": 0.04429285158403218, + "reward_std": 0.5612700562924147, + "rewards/cosine_scaled_reward": -0.2375507289543748, + "rewards/format_reward": 0.16666666977107525, + "step": 29 + }, + { + "advantage_max": 1.0437628850340843, + "advantage_mean": -1.0554989215982857e-08, + "advantage_min": -0.8979457393288612, + "advantage_std": 0.6987891010940075, + "completion_length": 2934.1250610351562, + "epoch": 0.03428571428571429, + "grad_norm": 0.1300898790359497, + "kl": 2.6658177375793457e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6e-07, + "loss": 0.0668, + "reward": 0.4072896996513009, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4072896996513009, + "reward_after_std": 0.6987890899181366, + "reward_before_mean": 0.516874086111784, + "reward_before_std": 0.7041825018823147, + "reward_change_max": 0.0003464892506599426, + "reward_change_mean": -0.1095843828516081, + "reward_change_min": -0.17533477023243904, + "reward_change_std": 0.07200261077377945, + "reward_std": 0.6987891085445881, + "rewards/cosine_scaled_reward": 0.008437026292085648, + "rewards/format_reward": 0.5000000167638063, + "step": 30 + }, + { + "advantage_max": 1.0937358774244785, + "advantage_mean": -1.2417634420724966e-08, + "advantage_min": -0.7162820585072041, + "advantage_std": 0.7303737886250019, + "completion_length": 3102.354202270508, + "epoch": 0.03542857142857143, + "grad_norm": 0.12131313979625702, + "kl": 3.250502049922943e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.2e-07, + "loss": 0.0399, + "reward": -0.049326577223837376, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.049326577223837376, + "reward_after_std": 0.7303738072514534, + "reward_before_mean": 0.016891899227630347, + "reward_before_std": 0.7446071989834309, + "reward_change_max": 0.0001913905143737793, + "reward_change_mean": -0.06621849071234465, + "reward_change_min": -0.14599811471998692, + "reward_change_std": 0.060666448436677456, + "reward_std": 0.7303738184273243, + "rewards/cosine_scaled_reward": -0.12697071363800205, + "rewards/format_reward": 0.2708333432674408, + "step": 31 + }, + { + "advantage_max": 1.0439314730465412, + "advantage_mean": -2.220446049250313e-16, + "advantage_min": -0.8557010069489479, + "advantage_std": 0.717094523832202, + "completion_length": 3120.104217529297, + "epoch": 0.036571428571428574, + "grad_norm": 0.13845178484916687, + "kl": 3.4399330615997314e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.4e-07, + "loss": 0.034, + "reward": 0.10101676848717034, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10101676848717034, + "reward_after_std": 0.717094536870718, + "reward_before_mean": 0.18201410118490458, + "reward_before_std": 0.7300637681037188, + "reward_change_max": 0.00033176690340042114, + "reward_change_mean": -0.0809973543509841, + "reward_change_min": -0.15452067088335752, + "reward_change_std": 0.06580975430551916, + "reward_std": 0.7170945592224598, + "rewards/cosine_scaled_reward": -0.0964929424226284, + "rewards/format_reward": 0.3750000111758709, + "step": 32 + }, + { + "advantage_max": 1.3146421052515507, + "advantage_mean": 4.346171533775589e-09, + "advantage_min": -0.9072512723505497, + "advantage_std": 0.9102408867329359, + "completion_length": 3378.291717529297, + "epoch": 0.037714285714285714, + "grad_norm": 0.13238871097564697, + "kl": 3.647804260253906e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.6e-07, + "loss": 0.0247, + "reward": -0.030413204804062843, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.030413204804062843, + "reward_after_std": 0.9102408867329359, + "reward_before_mean": 0.03352950140833855, + "reward_before_std": 0.9408820159733295, + "reward_change_max": 0.0003842562437057495, + "reward_change_mean": -0.06394269852899015, + "reward_change_min": -0.1856099097058177, + "reward_change_std": 0.07536959834396839, + "reward_std": 0.910240899771452, + "rewards/cosine_scaled_reward": -0.10823525360319763, + "rewards/format_reward": 0.25000000931322575, + "step": 33 + }, + { + "advantage_max": 1.2045657709240913, + "advantage_mean": -1.3038516377683607e-08, + "advantage_min": -1.0522035360336304, + "advantage_std": 0.8648568205535412, + "completion_length": 2520.2708587646484, + "epoch": 0.038857142857142854, + "grad_norm": 0.17876236140727997, + "kl": 0.00010145828127861023, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0492, + "reward": 0.6656378395855427, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6656378395855427, + "reward_after_std": 0.8648568540811539, + "reward_before_mean": 0.7960071973502636, + "reward_before_std": 0.8760812990367413, + "reward_change_max": 9.147077798843384e-05, + "reward_change_mean": -0.13036933494731784, + "reward_change_min": -0.2507061818614602, + "reward_change_std": 0.09630749723874032, + "reward_std": 0.8648568838834763, + "rewards/cosine_scaled_reward": 0.1271702533122152, + "rewards/format_reward": 0.5416666772216558, + "step": 34 + }, + { + "advantage_max": 1.3862405456602573, + "advantage_mean": -7.450580596923828e-09, + "advantage_min": -0.8664616197347641, + "advantage_std": 0.918156361207366, + "completion_length": 2984.1666870117188, + "epoch": 0.04, + "grad_norm": 0.13064010441303253, + "kl": 8.083879947662354e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7e-07, + "loss": 0.0494, + "reward": 0.1301069421460852, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1301069421460852, + "reward_after_std": 0.918156361207366, + "reward_before_mean": 0.20769228972494602, + "reward_before_std": 0.9387262333184481, + "reward_change_max": 0.0002269744873046875, + "reward_change_mean": -0.077585359220393, + "reward_change_min": -0.19662514980882406, + "reward_change_std": 0.07557157322298735, + "reward_std": 0.9181563761085272, + "rewards/cosine_scaled_reward": -0.07323719232226722, + "rewards/format_reward": 0.3541666716337204, + "step": 35 + }, + { + "advantage_max": 0.5974312610924244, + "advantage_mean": 0.0, + "advantage_min": -0.4941564276814461, + "advantage_std": 0.41885973513126373, + "completion_length": 3372.6666870117188, + "epoch": 0.04114285714285714, + "grad_norm": 0.07860232889652252, + "kl": 0.00011217966675758362, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.2e-07, + "loss": 0.037, + "reward": -0.4468262065201998, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4468262065201998, + "reward_after_std": 0.41885972395539284, + "reward_before_mean": -0.40820746310055256, + "reward_before_std": 0.4311361648142338, + "reward_change_max": 0.0005601868033409119, + "reward_change_mean": -0.03861874993890524, + "reward_change_min": -0.09212729427963495, + "reward_change_std": 0.03797971783205867, + "reward_std": 0.41885973140597343, + "rewards/cosine_scaled_reward": -0.29785373993217945, + "rewards/format_reward": 0.18750000558793545, + "step": 36 + }, + { + "advantage_max": 0.7455697171390057, + "advantage_mean": -1.2417633032946185e-09, + "advantage_min": -0.5482769273221493, + "advantage_std": 0.47954942658543587, + "completion_length": 3295.4166870117188, + "epoch": 0.04228571428571429, + "grad_norm": 0.09077224880456924, + "kl": 6.353668868541718e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.4e-07, + "loss": 0.0224, + "reward": -0.30808172933757305, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.30808172933757305, + "reward_after_std": 0.47954943403601646, + "reward_before_mean": -0.259306862950325, + "reward_before_std": 0.4846712723374367, + "reward_change_max": 6.873160600662231e-05, + "reward_change_mean": -0.048774888389743865, + "reward_change_min": -0.09794063959270716, + "reward_change_std": 0.038542215479537845, + "reward_std": 0.47954943776130676, + "rewards/cosine_scaled_reward": -0.2338200956583023, + "rewards/format_reward": 0.2083333432674408, + "step": 37 + }, + { + "advantage_max": 0.9543525539338589, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.537376195192337, + "advantage_std": 0.5838014744222164, + "completion_length": 3174.2083435058594, + "epoch": 0.04342857142857143, + "grad_norm": 0.10276984423398972, + "kl": 0.00011193100363016129, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0019, + "reward": 0.018349453806877136, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.018349453806877136, + "reward_after_std": 0.5838014855980873, + "reward_before_mean": 0.09325479716062546, + "reward_before_std": 0.5771887041628361, + "reward_change_max": 0.00024618208408355713, + "reward_change_mean": -0.0749053421895951, + "reward_change_min": -0.15158047154545784, + "reward_change_std": 0.05743449926376343, + "reward_std": 0.5838015079498291, + "rewards/cosine_scaled_reward": -0.07837261259555817, + "rewards/format_reward": 0.2500000037252903, + "step": 38 + }, + { + "advantage_max": 0.6583370007574558, + "advantage_mean": 2.1730860333413204e-08, + "advantage_min": -0.6974942088127136, + "advantage_std": 0.5173010732978582, + "completion_length": 2943.625030517578, + "epoch": 0.044571428571428574, + "grad_norm": 0.0883026197552681, + "kl": 8.260644972324371e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0277, + "reward": 0.2838073279708624, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2838073279708624, + "reward_after_std": 0.5173010714352131, + "reward_before_mean": 0.38903780886903405, + "reward_before_std": 0.519108316861093, + "reward_change_max": 0.000254705548286438, + "reward_change_mean": -0.10523045598529279, + "reward_change_min": -0.18225091230124235, + "reward_change_std": 0.07373343408107758, + "reward_std": 0.5173010863363743, + "rewards/cosine_scaled_reward": -0.04506444372236729, + "rewards/format_reward": 0.479166679084301, + "step": 39 + }, + { + "advantage_max": 0.8892773240804672, + "advantage_mean": -1.3659398168108794e-08, + "advantage_min": -0.722268171608448, + "advantage_std": 0.6039473228156567, + "completion_length": 2825.562545776367, + "epoch": 0.045714285714285714, + "grad_norm": 0.10289324820041656, + "kl": 0.0005248039960861206, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8e-07, + "loss": 0.0336, + "reward": 0.20918016554787755, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20918016554787755, + "reward_after_std": 0.6039473339915276, + "reward_before_mean": 0.30307046277448535, + "reward_before_std": 0.6087586358189583, + "reward_change_max": 0.0, + "reward_change_mean": -0.09389030793681741, + "reward_change_min": -0.1630670754238963, + "reward_change_std": 0.06309279799461365, + "reward_std": 0.6039473377168179, + "rewards/cosine_scaled_reward": -0.06721477210521698, + "rewards/format_reward": 0.4375000037252903, + "step": 40 + }, + { + "advantage_max": 1.5364714190363884, + "advantage_mean": -5.587935503204022e-09, + "advantage_min": -0.7693024277687073, + "advantage_std": 0.8801210299134254, + "completion_length": 3060.625030517578, + "epoch": 0.046857142857142854, + "grad_norm": 0.17200098931789398, + "kl": 0.00013019144535064697, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0641, + "reward": -0.11428980063647032, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11428980063647032, + "reward_after_std": 0.8801210466772318, + "reward_before_mean": -0.06253744126297534, + "reward_before_std": 0.8866547737270594, + "reward_change_max": 0.0007996335625648499, + "reward_change_mean": -0.051752346043940634, + "reward_change_min": -0.13621089048683643, + "reward_change_std": 0.052859612624160945, + "reward_std": 0.8801210653036833, + "rewards/cosine_scaled_reward": -0.19793538935482502, + "rewards/format_reward": 0.33333334513008595, + "step": 41 + }, + { + "advantage_max": 0.47750604152679443, + "advantage_mean": 1.614292477469803e-08, + "advantage_min": -0.4910551328212023, + "advantage_std": 0.38328032568097115, + "completion_length": 2901.8541774749756, + "epoch": 0.048, + "grad_norm": 0.057079702615737915, + "kl": 8.524954319000244e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0038, + "reward": -0.3438019538298249, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3438019538298249, + "reward_after_std": 0.38328034430742264, + "reward_before_mean": -0.2935850992798805, + "reward_before_std": 0.3953288784250617, + "reward_change_max": 0.0003351941704750061, + "reward_change_mean": -0.05021684942767024, + "reward_change_min": -0.10241542104631662, + "reward_change_std": 0.04318908369168639, + "reward_std": 0.38328035920858383, + "rewards/cosine_scaled_reward": -0.2613758873194456, + "rewards/format_reward": 0.2291666716337204, + "step": 42 + }, + { + "advantage_max": 0.915099672973156, + "advantage_mean": 6.829699028543246e-09, + "advantage_min": -0.6373906396329403, + "advantage_std": 0.6149002127349377, + "completion_length": 3080.7500228881836, + "epoch": 0.04914285714285714, + "grad_norm": 0.10517507046461105, + "kl": 0.00013168156147003174, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0442, + "reward": -0.11758936569094658, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11758936569094658, + "reward_after_std": 0.6149002015590668, + "reward_before_mean": -0.054422732442617416, + "reward_before_std": 0.626342048868537, + "reward_change_max": 0.0004749596118927002, + "reward_change_mean": -0.06316663301549852, + "reward_change_min": -0.13307728618383408, + "reward_change_std": 0.05423153773881495, + "reward_std": 0.6149002015590668, + "rewards/cosine_scaled_reward": -0.1417947057634592, + "rewards/format_reward": 0.22916666977107525, + "step": 43 + }, + { + "advantage_max": 0.6789183877408504, + "advantage_mean": -2.3593506592867186e-08, + "advantage_min": -0.6343220472335815, + "advantage_std": 0.5155519731342793, + "completion_length": 2674.687530517578, + "epoch": 0.05028571428571429, + "grad_norm": 0.07515233010053635, + "kl": 0.00036903470754623413, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.799999999999999e-07, + "loss": -0.0067, + "reward": 0.3380686726886779, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3380686726886779, + "reward_after_std": 0.5155519805848598, + "reward_before_mean": 0.4477055589668453, + "reward_before_std": 0.5174354799091816, + "reward_change_max": 0.00022473931312561035, + "reward_change_mean": -0.10963688185438514, + "reward_change_min": -0.1835013646632433, + "reward_change_std": 0.07373402267694473, + "reward_std": 0.5155520141124725, + "rewards/cosine_scaled_reward": -0.02614724636077881, + "rewards/format_reward": 0.5000000037252903, + "step": 44 + }, + { + "advantage_max": 0.9863204881548882, + "advantage_mean": -4.967054212379196e-09, + "advantage_min": -0.8636655509471893, + "advantage_std": 0.7028949670493603, + "completion_length": 3406.4791870117188, + "epoch": 0.05142857142857143, + "grad_norm": 0.10877612978219986, + "kl": 0.00013073720037937164, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9e-07, + "loss": 0.0269, + "reward": -0.03182817902415991, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03182817902415991, + "reward_after_std": 0.70289496332407, + "reward_before_mean": 0.03812003450002521, + "reward_before_std": 0.7237787656486034, + "reward_change_max": 0.00034902244806289673, + "reward_change_mean": -0.06994821969419718, + "reward_change_min": -0.15288027469068766, + "reward_change_std": 0.06384195922873914, + "reward_std": 0.7028949670493603, + "rewards/cosine_scaled_reward": -0.11635664664208889, + "rewards/format_reward": 0.27083334140479565, + "step": 45 + }, + { + "advantage_max": 0.7321793995797634, + "advantage_mean": 4.346172088887101e-09, + "advantage_min": -0.5258349291980267, + "advantage_std": 0.4733831323683262, + "completion_length": 3226.1875, + "epoch": 0.052571428571428575, + "grad_norm": 0.0635356530547142, + "kl": 0.0003596842288970947, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.2e-07, + "loss": 0.0042, + "reward": -0.3120793215930462, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.3120793215930462, + "reward_after_std": 0.4733831509947777, + "reward_before_mean": -0.26340748369693756, + "reward_before_std": 0.4777502194046974, + "reward_change_max": 0.00038643181324005127, + "reward_change_mean": -0.0486718516331166, + "reward_change_min": -0.09745941311120987, + "reward_change_std": 0.04000273603014648, + "reward_std": 0.4733831658959389, + "rewards/cosine_scaled_reward": -0.20462040696293116, + "rewards/format_reward": 0.14583333395421505, + "step": 46 + }, + { + "advantage_max": 1.245551098138094, + "advantage_mean": 1.3038516488705909e-08, + "advantage_min": -1.1025776639580727, + "advantage_std": 0.9889654777944088, + "completion_length": 2921.208396911621, + "epoch": 0.053714285714285714, + "grad_norm": 0.13856422901153564, + "kl": 0.0002876073122024536, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0558, + "reward": 0.4111839346587658, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4111839346587658, + "reward_after_std": 0.9889654964208603, + "reward_before_mean": 0.5171041414141655, + "reward_before_std": 1.0297090746462345, + "reward_change_max": 0.0004226118326187134, + "reward_change_mean": -0.10592022282071412, + "reward_change_min": -0.24152968171983957, + "reward_change_std": 0.10095622227527201, + "reward_std": 0.9889655113220215, + "rewards/cosine_scaled_reward": 0.029385413974523544, + "rewards/format_reward": 0.4583333395421505, + "step": 47 + }, + { + "advantage_max": 1.1130247823894024, + "advantage_mean": -2.5456151464542387e-08, + "advantage_min": -0.6210677027702332, + "advantage_std": 0.6353725697845221, + "completion_length": 2762.416717529297, + "epoch": 0.054857142857142854, + "grad_norm": 0.13798625767230988, + "kl": 0.000944383442401886, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.6e-07, + "loss": 0.0428, + "reward": 0.0739603266119957, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0739603266119957, + "reward_after_std": 0.6353725772351027, + "reward_before_mean": 0.15098485595081002, + "reward_before_std": 0.6252976432442665, + "reward_change_max": 0.0002612695097923279, + "reward_change_mean": -0.0770245383027941, + "reward_change_min": -0.13992419466376305, + "reward_change_std": 0.05226586083881557, + "reward_std": 0.6353725884109735, + "rewards/cosine_scaled_reward": -0.11200757790356874, + "rewards/format_reward": 0.3750000037252903, + "step": 48 + }, + { + "advantage_max": 1.1139013655483723, + "advantage_mean": -2.483527011820641e-08, + "advantage_min": -0.7361991293728352, + "advantage_std": 0.7340975552797318, + "completion_length": 2435.8125610351562, + "epoch": 0.056, + "grad_norm": 0.11226435005664825, + "kl": 0.0004952177405357361, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.8e-07, + "loss": 0.0351, + "reward": 0.22510142996907234, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22510142996907234, + "reward_after_std": 0.7340975552797318, + "reward_before_mean": 0.31644671969115734, + "reward_before_std": 0.7392865046858788, + "reward_change_max": 7.228553295135498e-05, + "reward_change_mean": -0.09134529763832688, + "reward_change_min": -0.18282661493867636, + "reward_change_std": 0.06951131741516292, + "reward_std": 0.7340975776314735, + "rewards/cosine_scaled_reward": -0.1021933276206255, + "rewards/format_reward": 0.5208333432674408, + "step": 49 + }, + { + "advantage_max": 0.8116575442254543, + "advantage_mean": -1.9868215128671096e-08, + "advantage_min": -0.7131141312420368, + "advantage_std": 0.5905804056674242, + "completion_length": 2969.750030517578, + "epoch": 0.05714285714285714, + "grad_norm": 0.11635003983974457, + "kl": 0.0006887298077344894, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1e-06, + "loss": 0.0295, + "reward": 0.14436959475278854, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14436959475278854, + "reward_after_std": 0.5905804093927145, + "reward_before_mean": 0.23427366465330124, + "reward_before_std": 0.5951451063156128, + "reward_change_max": 0.00026381760835647583, + "reward_change_mean": -0.08990406547673047, + "reward_change_min": -0.17287831474095583, + "reward_change_std": 0.07043572561815381, + "reward_std": 0.5905804317444563, + "rewards/cosine_scaled_reward": -0.039113187696784735, + "rewards/format_reward": 0.31250000558793545, + "step": 50 + }, + { + "advantage_max": 1.1374978050589561, + "advantage_mean": -2.0333876360467684e-08, + "advantage_min": -0.7362420186400414, + "advantage_std": 0.7058875262737274, + "completion_length": 2325.437557220459, + "epoch": 0.05828571428571429, + "grad_norm": 0.10930530726909637, + "kl": 0.00194627046585083, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.999890338174275e-07, + "loss": 0.033, + "reward": 0.18171239644289017, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18171239644289017, + "reward_after_std": 0.7058875374495983, + "reward_before_mean": 0.2686680965125561, + "reward_before_std": 0.7034656405448914, + "reward_change_max": 0.0003391280770301819, + "reward_change_mean": -0.08695570658892393, + "reward_change_min": -0.17610237654298544, + "reward_change_std": 0.06442677089944482, + "reward_std": 0.7058875598013401, + "rewards/cosine_scaled_reward": -0.12608262081630528, + "rewards/format_reward": 0.5208333376795053, + "step": 51 + }, + { + "advantage_max": 1.2928729727864265, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -0.9549924209713936, + "advantage_std": 0.9508500788360834, + "completion_length": 2870.7500534057617, + "epoch": 0.05942857142857143, + "grad_norm": 0.14845716953277588, + "kl": 0.0017962455749511719, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0508, + "reward": 0.34414372593164444, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34414372593164444, + "reward_after_std": 0.9508501011878252, + "reward_before_mean": 0.44306235015392303, + "reward_before_std": 0.98055231384933, + "reward_change_max": 0.00012401491403579712, + "reward_change_mean": -0.09891863886150531, + "reward_change_min": -0.2104501435533166, + "reward_change_std": 0.08957649956573732, + "reward_std": 0.950850136578083, + "rewards/cosine_scaled_reward": 0.023614494362846017, + "rewards/format_reward": 0.3958333358168602, + "step": 52 + }, + { + "advantage_max": 1.2351016141474247, + "advantage_mean": 6.208818459363386e-10, + "advantage_min": -1.186292514204979, + "advantage_std": 0.9753887392580509, + "completion_length": 2831.3333587646484, + "epoch": 0.060571428571428575, + "grad_norm": 0.16659170389175415, + "kl": 0.003074079751968384, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0531, + "reward": 0.5398093909025192, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5398093909025192, + "reward_after_std": 0.9753887094557285, + "reward_before_mean": 0.6579989977180958, + "reward_before_std": 1.008890239521861, + "reward_change_max": 0.00019691884517669678, + "reward_change_mean": -0.11818960297387093, + "reward_change_min": -0.25947624258697033, + "reward_change_std": 0.10222019837237895, + "reward_std": 0.9753887467086315, + "rewards/cosine_scaled_reward": 0.047749497927725315, + "rewards/format_reward": 0.5625000111758709, + "step": 53 + }, + { + "advantage_max": 1.079960823059082, + "advantage_mean": -2.3593505593666464e-08, + "advantage_min": -1.5959435552358627, + "advantage_std": 0.9909502565860748, + "completion_length": 2885.0000610351562, + "epoch": 0.061714285714285715, + "grad_norm": 0.19337964057922363, + "kl": 0.0005113296210765839, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.998245517681593e-07, + "loss": 0.1239, + "reward": 0.781525231897831, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.781525231897831, + "reward_after_std": 0.9909502565860748, + "reward_before_mean": 0.9249188676476479, + "reward_before_std": 1.039504911750555, + "reward_change_max": 0.0005872845649719238, + "reward_change_mean": -0.14339364634361118, + "reward_change_min": -0.26627498492598534, + "reward_change_std": 0.11663470219355077, + "reward_std": 0.9909502938389778, + "rewards/cosine_scaled_reward": 0.19162609428167343, + "rewards/format_reward": 0.5416666865348816, + "step": 54 + }, + { + "advantage_max": 1.3312099613249302, + "advantage_mean": 1.8626449826975033e-09, + "advantage_min": -0.5829783342778683, + "advantage_std": 0.7693915814161301, + "completion_length": 3077.0208587646484, + "epoch": 0.06285714285714286, + "grad_norm": 0.13275492191314697, + "kl": 0.0009839534759521484, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0549, + "reward": 0.10769246646668762, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10769246646668762, + "reward_after_std": 0.7693915888667107, + "reward_before_mean": 0.1841644076630473, + "reward_before_std": 0.7638834398239851, + "reward_change_max": 5.056709051132202e-05, + "reward_change_mean": -0.07647194608580321, + "reward_change_min": -0.15568101592361927, + "reward_change_std": 0.060355184017680585, + "reward_std": 0.7693916037678719, + "rewards/cosine_scaled_reward": -0.07458446500822902, + "rewards/format_reward": 0.3333333358168602, + "step": 55 + }, + { + "advantage_max": 0.6472447663545609, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.5673340857028961, + "advantage_std": 0.4818668272346258, + "completion_length": 2857.8542098999023, + "epoch": 0.064, + "grad_norm": 0.07451209425926208, + "kl": 0.000834345817565918, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.996052735444862e-07, + "loss": 0.004, + "reward": 0.1308621042408049, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1308621042408049, + "reward_after_std": 0.48186682537198067, + "reward_before_mean": 0.2224470037035644, + "reward_before_std": 0.4868644941598177, + "reward_change_max": 0.0003842562437057495, + "reward_change_mean": -0.09158491797279567, + "reward_change_min": -0.1648717550560832, + "reward_change_std": 0.06347237096633762, + "reward_std": 0.48186685517430305, + "rewards/cosine_scaled_reward": -0.0971098318696022, + "rewards/format_reward": 0.416666679084301, + "step": 56 + }, + { + "advantage_max": 0.9662680625915527, + "advantage_mean": 3.104408563547878e-09, + "advantage_min": -0.8334208503365517, + "advantage_std": 0.7065017186105251, + "completion_length": 3336.7708740234375, + "epoch": 0.06514285714285714, + "grad_norm": 0.25560063123703003, + "kl": 0.026805490255355835, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0182, + "reward": -0.058415647596120834, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.058415647596120834, + "reward_after_std": 0.7065017186105251, + "reward_before_mean": 0.009561575949192047, + "reward_before_std": 0.7297605946660042, + "reward_change_max": 0.00029649585485458374, + "reward_change_mean": -0.06797723285853863, + "reward_change_min": -0.15673517063260078, + "reward_change_std": 0.06704360526055098, + "reward_std": 0.7065017279237509, + "rewards/cosine_scaled_reward": -0.14105254039168358, + "rewards/format_reward": 0.29166667722165585, + "step": 57 + }, + { + "advantage_max": 1.276023730635643, + "advantage_mean": -2.110997909809953e-08, + "advantage_min": -0.9838252775371075, + "advantage_std": 0.9304085336625576, + "completion_length": 2217.479217529297, + "epoch": 0.06628571428571428, + "grad_norm": 0.19546259939670563, + "kl": 0.003041982650756836, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0658, + "reward": 0.7234169393777847, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7234169393777847, + "reward_after_std": 0.9304085709154606, + "reward_before_mean": 0.8578736670315266, + "reward_before_std": 0.9398499056696892, + "reward_change_max": 0.0002982392907142639, + "reward_change_mean": -0.134456732776016, + "reward_change_min": -0.2525649508461356, + "reward_change_std": 0.10482977563515306, + "reward_std": 0.9304086118936539, + "rewards/cosine_scaled_reward": 0.06435349676758051, + "rewards/format_reward": 0.7291666697710752, + "step": 58 + }, + { + "advantage_max": 0.8459695763885975, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.7203714326024055, + "advantage_std": 0.5940009132027626, + "completion_length": 3036.3125076293945, + "epoch": 0.06742857142857143, + "grad_norm": 0.10515865683555603, + "kl": 0.0018346011638641357, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0007, + "reward": 0.016265645623207092, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.016265645623207092, + "reward_after_std": 0.5940009374171495, + "reward_before_mean": 0.09327167272567749, + "reward_before_std": 0.6062601394951344, + "reward_change_max": 0.00019018352031707764, + "reward_change_mean": -0.07700600824318826, + "reward_change_min": -0.1557639017701149, + "reward_change_std": 0.06312599789816886, + "reward_std": 0.594000943005085, + "rewards/cosine_scaled_reward": -0.0783641804009676, + "rewards/format_reward": 0.2500000037252903, + "step": 59 + }, + { + "advantage_max": 1.0619999282062054, + "advantage_mean": 6.208817238118058e-09, + "advantage_min": -0.6794852465391159, + "advantage_std": 0.6504358239471912, + "completion_length": 2965.6458740234375, + "epoch": 0.06857142857142857, + "grad_norm": 0.10484857112169266, + "kl": 0.0010578781366348267, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0099, + "reward": -0.017510805279016495, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.017510805279016495, + "reward_after_std": 0.6504357941448689, + "reward_before_mean": 0.05231862887740135, + "reward_before_std": 0.6509438417851925, + "reward_change_max": 0.0002585947513580322, + "reward_change_mean": -0.06982943858020008, + "reward_change_min": -0.14922125078737736, + "reward_change_std": 0.05565405311062932, + "reward_std": 0.6504357997328043, + "rewards/cosine_scaled_reward": -0.1613406909746118, + "rewards/format_reward": 0.37500000186264515, + "step": 60 + }, + { + "advantage_max": 0.9533044211566448, + "advantage_mean": 1.241763691872677e-09, + "advantage_min": -0.7701556235551834, + "advantage_std": 0.6510935872793198, + "completion_length": 3141.937530517578, + "epoch": 0.06971428571428571, + "grad_norm": 0.12680523097515106, + "kl": 0.001262512058019638, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0681, + "reward": 0.3617987995967269, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3617987995967269, + "reward_after_std": 0.6510935984551907, + "reward_before_mean": 0.46881416253745556, + "reward_before_std": 0.6530401557683945, + "reward_change_max": 0.00017626583576202393, + "reward_change_mean": -0.10701534803956747, + "reward_change_min": -0.1809440078213811, + "reward_change_std": 0.0730011141858995, + "reward_std": 0.6510936245322227, + "rewards/cosine_scaled_reward": -0.015592940151691437, + "rewards/format_reward": 0.5000000074505806, + "step": 61 + }, + { + "advantage_max": 1.4476277567446232, + "advantage_mean": 1.2417638028949796e-09, + "advantage_min": -1.1485518887639046, + "advantage_std": 1.023965161293745, + "completion_length": 2572.458427429199, + "epoch": 0.07085714285714285, + "grad_norm": 0.12858064472675323, + "kl": 0.0027109384536743164, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.98421786662277e-07, + "loss": 0.032, + "reward": 0.4915749344509095, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4915749344509095, + "reward_after_std": 1.0239651463925838, + "reward_before_mean": 0.6014097668230534, + "reward_before_std": 1.0463957451283932, + "reward_change_max": 1.6786158084869385e-05, + "reward_change_mean": -0.10983482981100678, + "reward_change_min": -0.23927107453346252, + "reward_change_std": 0.09747252892702818, + "reward_std": 1.0239651799201965, + "rewards/cosine_scaled_reward": -0.022211784962564707, + "rewards/format_reward": 0.6458333469927311, + "step": 62 + }, + { + "advantage_max": 1.250469371676445, + "advantage_mean": -1.1175871006408045e-08, + "advantage_min": -1.0716526806354523, + "advantage_std": 0.8724186569452286, + "completion_length": 2255.791717529297, + "epoch": 0.072, + "grad_norm": 0.138187974691391, + "kl": 0.0027687549591064453, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0289, + "reward": 0.7566146403551102, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7566146403551102, + "reward_after_std": 0.8724186569452286, + "reward_before_mean": 0.8947709426283836, + "reward_before_std": 0.8801536336541176, + "reward_change_max": 0.0, + "reward_change_mean": -0.1381562864407897, + "reward_change_min": -0.2546289935708046, + "reward_change_std": 0.09683123417198658, + "reward_std": 0.8724186718463898, + "rewards/cosine_scaled_reward": 0.08280211873352528, + "rewards/format_reward": 0.7291666828095913, + "step": 63 + }, + { + "advantage_max": 1.065049335360527, + "advantage_mean": -1.676380662063437e-08, + "advantage_min": -0.9578208141028881, + "advantage_std": 0.8730638399720192, + "completion_length": 2977.041732788086, + "epoch": 0.07314285714285715, + "grad_norm": 0.16038207709789276, + "kl": 0.0029485225677490234, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.97852329991824e-07, + "loss": 0.1249, + "reward": 0.17387355864048004, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17387355864048004, + "reward_after_std": 0.8730638436973095, + "reward_before_mean": 0.26105861994437873, + "reward_before_std": 0.9116242602467537, + "reward_change_max": 0.00032076984643936157, + "reward_change_mean": -0.08718509506434202, + "reward_change_min": -0.205215728841722, + "reward_change_std": 0.08837446081452072, + "reward_std": 0.873063862323761, + "rewards/cosine_scaled_reward": -0.056970683857798576, + "rewards/format_reward": 0.3750000111758709, + "step": 64 + }, + { + "advantage_max": 1.068474367260933, + "advantage_mean": -7.450581263057643e-09, + "advantage_min": -0.6501489132642746, + "advantage_std": 0.6808883212506771, + "completion_length": 2789.729221343994, + "epoch": 0.07428571428571429, + "grad_norm": 0.14595411717891693, + "kl": 0.005219936370849609, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0022, + "reward": 0.16831020638346672, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16831020638346672, + "reward_after_std": 0.6808883063495159, + "reward_before_mean": 0.25501670874655247, + "reward_before_std": 0.6813295837491751, + "reward_change_max": 4.532933235168457e-05, + "reward_change_mean": -0.08670650841668248, + "reward_change_min": -0.17894641030579805, + "reward_change_std": 0.06918191979639232, + "reward_std": 0.6808883361518383, + "rewards/cosine_scaled_reward": -0.10165832610800862, + "rewards/format_reward": 0.45833333767950535, + "step": 65 + }, + { + "advantage_max": 0.7757134735584259, + "advantage_mean": -1.6763806787167823e-08, + "advantage_min": -0.6671213656663895, + "advantage_std": 0.5757374875247478, + "completion_length": 2300.583351135254, + "epoch": 0.07542857142857143, + "grad_norm": 0.06580257415771484, + "kl": 0.0022726058959960938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0312, + "reward": 0.38516049087047577, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38516049087047577, + "reward_after_std": 0.5757375098764896, + "reward_before_mean": 0.49741994962096214, + "reward_before_std": 0.5758095439523458, + "reward_change_max": 0.00032460689544677734, + "reward_change_mean": -0.11225945455953479, + "reward_change_min": -0.19416429102420807, + "reward_change_std": 0.0764637878164649, + "reward_std": 0.5757375229150057, + "rewards/cosine_scaled_reward": -0.0012900326400995255, + "rewards/format_reward": 0.5, + "step": 66 + }, + { + "advantage_max": 0.7150251120328903, + "advantage_mean": 1.4280279625467074e-08, + "advantage_min": -0.47850729525089264, + "advantage_std": 0.4646703340113163, + "completion_length": 3501.041717529297, + "epoch": 0.07657142857142857, + "grad_norm": 0.06360700726509094, + "kl": 0.0019628703594207764, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0201, + "reward": -0.5012560524046421, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5012560524046421, + "reward_after_std": 0.46467033214867115, + "reward_before_mean": -0.46999809332191944, + "reward_before_std": 0.47615547478199005, + "reward_change_max": 0.0004980117082595825, + "reward_change_mean": -0.03125794976949692, + "reward_change_min": -0.08866909518837929, + "reward_change_std": 0.03602162795141339, + "reward_std": 0.46467035450041294, + "rewards/cosine_scaled_reward": -0.29749905318021774, + "rewards/format_reward": 0.1250000037252903, + "step": 67 + }, + { + "advantage_max": 1.1670538075268269, + "advantage_mean": -1.6763806898190126e-08, + "advantage_min": -0.8921488225460052, + "advantage_std": 0.8203412033617496, + "completion_length": 2412.750045776367, + "epoch": 0.07771428571428571, + "grad_norm": 0.13555291295051575, + "kl": 0.007524013519287109, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0309, + "reward": 0.35750674456357956, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35750674456357956, + "reward_after_std": 0.8203412406146526, + "reward_before_mean": 0.4597671739757061, + "reward_before_std": 0.8346320576965809, + "reward_change_max": 8.17328691482544e-05, + "reward_change_mean": -0.10226041614077985, + "reward_change_min": -0.20731043443083763, + "reward_change_std": 0.08281638938933611, + "reward_std": 0.8203412480652332, + "rewards/cosine_scaled_reward": -0.051366430008783937, + "rewards/format_reward": 0.5625000037252903, + "step": 68 + }, + { + "advantage_max": 1.0627647154033184, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.5945528820157051, + "advantage_std": 0.658388938754797, + "completion_length": 2798.000045776367, + "epoch": 0.07885714285714286, + "grad_norm": 0.14615222811698914, + "kl": 0.0053558349609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0488, + "reward": -0.11247947625815868, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11247947625815868, + "reward_after_std": 0.6583889536559582, + "reward_before_mean": -0.052137549966573715, + "reward_before_std": 0.6629227660596371, + "reward_change_max": 0.00019734352827072144, + "reward_change_mean": -0.06034192198421806, + "reward_change_min": -0.1471708407625556, + "reward_change_std": 0.05584263487253338, + "reward_std": 0.6583889685571194, + "rewards/cosine_scaled_reward": -0.20315211405977607, + "rewards/format_reward": 0.35416666977107525, + "step": 69 + }, + { + "advantage_max": 1.1941916979849339, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.6053403690457344, + "advantage_std": 0.710853785276413, + "completion_length": 3077.541702270508, + "epoch": 0.08, + "grad_norm": 0.10297655314207077, + "kl": 0.009349465370178223, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.956206309337066e-07, + "loss": 0.016, + "reward": -0.00988800823688507, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.00988800823688507, + "reward_after_std": 0.7108537815511227, + "reward_before_mean": 0.05815399810671806, + "reward_before_std": 0.7120511084794998, + "reward_change_max": 0.0005014985799789429, + "reward_change_mean": -0.06804199749603868, + "reward_change_min": -0.13841038011014462, + "reward_change_std": 0.05216056061908603, + "reward_std": 0.710853811353445, + "rewards/cosine_scaled_reward": -0.12717300606891513, + "rewards/format_reward": 0.31250000186264515, + "step": 70 + }, + { + "advantage_max": 0.8809920065104961, + "advantage_mean": -1.6142924941231485e-08, + "advantage_min": -0.8779931887984276, + "advantage_std": 0.6846279054880142, + "completion_length": 2682.1875228881836, + "epoch": 0.08114285714285714, + "grad_norm": 0.1809944361448288, + "kl": 0.011813968420028687, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0711, + "reward": 0.2543748412281275, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2543748412281275, + "reward_after_std": 0.6846279166638851, + "reward_before_mean": 0.35289028100669384, + "reward_before_std": 0.7065125461667776, + "reward_change_max": 0.0002439543604850769, + "reward_change_mean": -0.09851543279364705, + "reward_change_min": -0.19406738318502903, + "reward_change_std": 0.0774632137035951, + "reward_std": 0.684627927839756, + "rewards/cosine_scaled_reward": -0.04230487486347556, + "rewards/format_reward": 0.43750000558793545, + "step": 71 + }, + { + "advantage_max": 0.9895018897950649, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.6588705629110336, + "advantage_std": 0.6476662866771221, + "completion_length": 3254.2291870117188, + "epoch": 0.08228571428571428, + "grad_norm": 0.13209903240203857, + "kl": 0.002953052520751953, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0271, + "reward": -0.14486753195524216, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.14486753195524216, + "reward_after_std": 0.6476662829518318, + "reward_before_mean": -0.08536670729517937, + "reward_before_std": 0.6584763266146183, + "reward_change_max": 0.00038442760705947876, + "reward_change_mean": -0.05950081581249833, + "reward_change_min": -0.1290422910824418, + "reward_change_std": 0.051864347769878805, + "reward_std": 0.6476662904024124, + "rewards/cosine_scaled_reward": -0.17810002015903592, + "rewards/format_reward": 0.2708333395421505, + "step": 72 + }, + { + "advantage_max": 1.1704095806926489, + "advantage_mean": -1.0244548209747961e-08, + "advantage_min": -0.8206432610750198, + "advantage_std": 0.8143216799944639, + "completion_length": 3489.2291870117188, + "epoch": 0.08342857142857144, + "grad_norm": 0.13383091986179352, + "kl": 0.0011025667190551758, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0327, + "reward": -0.05213266983628273, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05213266983628273, + "reward_after_std": 0.8143216762691736, + "reward_before_mean": 0.01244833879172802, + "reward_before_std": 0.8403085879981518, + "reward_change_max": 0.00028943270444869995, + "reward_change_mean": -0.06458103121258318, + "reward_change_min": -0.18091739807277918, + "reward_change_std": 0.07164288056083024, + "reward_std": 0.8143216967582703, + "rewards/cosine_scaled_reward": -0.09794248826801777, + "rewards/format_reward": 0.2083333395421505, + "step": 73 + }, + { + "advantage_max": 0.8129484131932259, + "advantage_mean": -4.346172199909404e-09, + "advantage_min": -0.45292046666145325, + "advantage_std": 0.48586198315024376, + "completion_length": 3319.2083435058594, + "epoch": 0.08457142857142858, + "grad_norm": 0.07312816381454468, + "kl": 0.002932727336883545, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0273, + "reward": -0.18033896386623383, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18033896386623383, + "reward_after_std": 0.4858619924634695, + "reward_before_mean": -0.12051686272025108, + "reward_before_std": 0.48246172443032265, + "reward_change_max": 0.0002520233392715454, + "reward_change_mean": -0.05982208307250403, + "reward_change_min": -0.1109585091471672, + "reward_change_std": 0.044366022309986874, + "reward_std": 0.48586202412843704, + "rewards/cosine_scaled_reward": -0.1540084406733513, + "rewards/format_reward": 0.18750000186264515, + "step": 74 + }, + { + "advantage_max": 1.102239165455103, + "advantage_mean": -9.313226134732844e-09, + "advantage_min": -0.6440135687589645, + "advantage_std": 0.6835919357836246, + "completion_length": 3212.562515258789, + "epoch": 0.08571428571428572, + "grad_norm": 0.12819837033748627, + "kl": 0.0043468475341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0585, + "reward": 0.17928399704396725, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17928399704396725, + "reward_after_std": 0.6835919404402375, + "reward_before_mean": 0.2666355334222317, + "reward_before_std": 0.6772829368710518, + "reward_change_max": 0.00011337548494338989, + "reward_change_mean": -0.08735153428278863, + "reward_change_min": -0.17496745940297842, + "reward_change_std": 0.07009288971312344, + "reward_std": 0.6835919748991728, + "rewards/cosine_scaled_reward": 0.008317755535244942, + "rewards/format_reward": 0.25000000186264515, + "step": 75 + }, + { + "advantage_max": 1.0620285347104073, + "advantage_mean": 5.587935503204022e-09, + "advantage_min": -0.6922560930252075, + "advantage_std": 0.6782711632549763, + "completion_length": 2798.0625762939453, + "epoch": 0.08685714285714285, + "grad_norm": 0.12901480495929718, + "kl": 0.0019115209579467773, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0855, + "reward": -0.03348325379192829, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03348325379192829, + "reward_after_std": 0.6782711483538151, + "reward_before_mean": 0.034606458619236946, + "reward_before_std": 0.6863775365054607, + "reward_change_max": 0.00021073222160339355, + "reward_change_mean": -0.06808969052508473, + "reward_change_min": -0.1497401585802436, + "reward_change_std": 0.0586240931879729, + "reward_std": 0.6782711558043957, + "rewards/cosine_scaled_reward": -0.19103011582046747, + "rewards/format_reward": 0.41666667722165585, + "step": 76 + }, + { + "advantage_max": 0.7430855147540569, + "advantage_mean": 6.829699139565548e-09, + "advantage_min": -0.6691855564713478, + "advantage_std": 0.5303375329822302, + "completion_length": 3146.0833587646484, + "epoch": 0.088, + "grad_norm": 0.09785456210374832, + "kl": 0.001885056495666504, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0195, + "reward": -0.005782470107078552, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.005782470107078552, + "reward_after_std": 0.5303375329822302, + "reward_before_mean": 0.07121062092483044, + "reward_before_std": 0.5396563746035099, + "reward_change_max": 0.0004342496395111084, + "reward_change_mean": -0.07699309976305813, + "reward_change_min": -0.1357054617255926, + "reward_change_std": 0.05527487176004797, + "reward_std": 0.5303375385701656, + "rewards/cosine_scaled_reward": -0.12064469419419765, + "rewards/format_reward": 0.3125000111758709, + "step": 77 + }, + { + "advantage_max": 1.2232607677578926, + "advantage_mean": 1.5522040319737584e-09, + "advantage_min": -1.0113331899046898, + "advantage_std": 0.8812691904604435, + "completion_length": 3311.5208740234375, + "epoch": 0.08914285714285715, + "grad_norm": 0.13331255316734314, + "kl": 0.002320528030395508, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.91429819907136e-07, + "loss": 0.048, + "reward": 0.2586913288978394, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2586913288978394, + "reward_after_std": 0.8812691792845726, + "reward_before_mean": 0.35136597882956266, + "reward_before_std": 0.9065258577466011, + "reward_change_max": 0.00028318166732788086, + "reward_change_mean": -0.09267466515302658, + "reward_change_min": -0.20396779384464025, + "reward_change_std": 0.08194569800980389, + "reward_std": 0.8812692165374756, + "rewards/cosine_scaled_reward": -0.0014003375545144081, + "rewards/format_reward": 0.35416667722165585, + "step": 78 + }, + { + "advantage_max": 0.9030539467930794, + "advantage_mean": 9.313226190243995e-09, + "advantage_min": -0.963399812579155, + "advantage_std": 0.7022213935852051, + "completion_length": 2346.0833587646484, + "epoch": 0.09028571428571429, + "grad_norm": 0.10168980807065964, + "kl": 0.0042231082916259766, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.908088623197048e-07, + "loss": 0.038, + "reward": 0.38052323646843433, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.38052323646843433, + "reward_after_std": 0.7022214084863663, + "reward_before_mean": 0.48965731263160706, + "reward_before_std": 0.7216234467923641, + "reward_change_max": 0.000313684344291687, + "reward_change_mean": -0.10913406126201153, + "reward_change_min": -0.2034011334180832, + "reward_change_std": 0.08371774014085531, + "reward_std": 0.7022214457392693, + "rewards/cosine_scaled_reward": -0.03642135614063591, + "rewards/format_reward": 0.5625000074505806, + "step": 79 + }, + { + "advantage_max": 0.9310047589242458, + "advantage_mean": 1.1175870895385742e-08, + "advantage_min": -0.7848172262310982, + "advantage_std": 0.7024698052555323, + "completion_length": 3314.250045776367, + "epoch": 0.09142857142857143, + "grad_norm": 0.1139289066195488, + "kl": 0.003377556800842285, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0271, + "reward": 0.0665772594511509, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0665772594511509, + "reward_after_std": 0.7024698089808226, + "reward_before_mean": 0.14694275334477425, + "reward_before_std": 0.7254483439028263, + "reward_change_max": 0.0002681538462638855, + "reward_change_mean": -0.08036548434756696, + "reward_change_min": -0.18215056881308556, + "reward_change_std": 0.07275031437166035, + "reward_std": 0.7024698238819838, + "rewards/cosine_scaled_reward": -0.09319528564810753, + "rewards/format_reward": 0.3333333395421505, + "step": 80 + }, + { + "advantage_max": 1.0726732574403286, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.6589022949337959, + "advantage_std": 0.7009899169206619, + "completion_length": 3062.2708892822266, + "epoch": 0.09257142857142857, + "grad_norm": 0.15159904956817627, + "kl": 0.009969711303710938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0548, + "reward": -0.11807727441191673, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11807727441191673, + "reward_after_std": 0.7009899355471134, + "reward_before_mean": -0.05808864161372185, + "reward_before_std": 0.7156584821641445, + "reward_change_max": 8.110702037811279e-05, + "reward_change_mean": -0.059988636523485184, + "reward_change_min": -0.14878776855766773, + "reward_change_std": 0.05785088497214019, + "reward_std": 0.7009899355471134, + "rewards/cosine_scaled_reward": -0.17487765941768885, + "rewards/format_reward": 0.2916666716337204, + "step": 81 + }, + { + "advantage_max": 1.000360194593668, + "advantage_mean": -1.6763806509612067e-08, + "advantage_min": -0.7447901144623756, + "advantage_std": 0.676895584911108, + "completion_length": 2876.7083740234375, + "epoch": 0.09371428571428571, + "grad_norm": 0.1397034078836441, + "kl": 0.0031021833419799805, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0471, + "reward": 0.17887724190950394, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17887724190950394, + "reward_after_std": 0.6768955774605274, + "reward_before_mean": 0.26815129816532135, + "reward_before_std": 0.6837855763733387, + "reward_change_max": 7.466226816177368e-05, + "reward_change_mean": -0.08927406487055123, + "reward_change_min": -0.18324447609484196, + "reward_change_std": 0.07135608559474349, + "reward_std": 0.6768956035375595, + "rewards/cosine_scaled_reward": -0.07425769325345755, + "rewards/format_reward": 0.41666667722165585, + "step": 82 + }, + { + "advantage_max": 0.5430999919772148, + "advantage_mean": 1.0554989271494009e-08, + "advantage_min": -0.5749870277941227, + "advantage_std": 0.4309024289250374, + "completion_length": 2885.1458435058594, + "epoch": 0.09485714285714286, + "grad_norm": 0.07022266834974289, + "kl": 0.004255771636962891, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0056, + "reward": -0.10269813984632492, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10269813984632492, + "reward_after_std": 0.4309024289250374, + "reward_before_mean": -0.03149991109967232, + "reward_before_std": 0.44176167342811823, + "reward_change_max": 0.0, + "reward_change_mean": -0.07119821500964463, + "reward_change_min": -0.12623351998627186, + "reward_change_std": 0.052647512522526085, + "reward_std": 0.4309024512767792, + "rewards/cosine_scaled_reward": -0.1615832932293415, + "rewards/format_reward": 0.2916666679084301, + "step": 83 + }, + { + "advantage_max": 1.1471601538360119, + "advantage_mean": -3.352761324126874e-08, + "advantage_min": -0.9769178181886673, + "advantage_std": 0.8521611541509628, + "completion_length": 3055.7083740234375, + "epoch": 0.096, + "grad_norm": 0.13345955312252045, + "kl": 0.0014927387237548828, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0476, + "reward": 0.539270993322134, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.539270993322134, + "reward_after_std": 0.8521611448377371, + "reward_before_mean": 0.6591531746089458, + "reward_before_std": 0.87022246979177, + "reward_change_max": 0.0005726292729377747, + "reward_change_mean": -0.11988217453472316, + "reward_change_min": -0.22343150340020657, + "reward_change_std": 0.09291968471370637, + "reward_std": 0.8521611541509628, + "rewards/cosine_scaled_reward": 0.11082656681537628, + "rewards/format_reward": 0.43750000558793545, + "step": 84 + }, + { + "advantage_max": 1.1944213286042213, + "advantage_mean": -2.4214387883692012e-08, + "advantage_min": -0.78194659948349, + "advantage_std": 0.7695669606328011, + "completion_length": 3144.687545776367, + "epoch": 0.09714285714285714, + "grad_norm": 0.11960656940937042, + "kl": 0.0030364990234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0324, + "reward": 0.35563176590949297, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.35563176590949297, + "reward_after_std": 0.7695669494569302, + "reward_before_mean": 0.4578101532533765, + "reward_before_std": 0.7720634564757347, + "reward_change_max": 0.0002278536558151245, + "reward_change_mean": -0.10217837616801262, + "reward_change_min": -0.17799327243119478, + "reward_change_std": 0.07303832424804568, + "reward_std": 0.7695669531822205, + "rewards/cosine_scaled_reward": -0.00026161037385463715, + "rewards/format_reward": 0.4583333432674408, + "step": 85 + }, + { + "advantage_max": 0.8763966374099255, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.4972258657217026, + "advantage_std": 0.5113778002560139, + "completion_length": 3030.8125762939453, + "epoch": 0.09828571428571428, + "grad_norm": 0.10097847878932953, + "kl": 0.0046710968017578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.85862422507884e-07, + "loss": 0.032, + "reward": 0.045074569061398506, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.045074569061398506, + "reward_after_std": 0.5113777853548527, + "reward_before_mean": 0.12412225641310215, + "reward_before_std": 0.49582573771476746, + "reward_change_max": 0.0003281235694885254, + "reward_change_mean": -0.07904768036678433, + "reward_change_min": -0.13752009812742472, + "reward_change_std": 0.053194016218185425, + "reward_std": 0.511377789080143, + "rewards/cosine_scaled_reward": -0.12543888704385608, + "rewards/format_reward": 0.37500000186264515, + "step": 86 + }, + { + "advantage_max": 1.0924755409359932, + "advantage_mean": -5.277494885547185e-09, + "advantage_min": -1.026725873351097, + "advantage_std": 0.8062815628945827, + "completion_length": 2812.083366394043, + "epoch": 0.09942857142857142, + "grad_norm": 0.14117565751075745, + "kl": 0.007817387580871582, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0421, + "reward": 0.33314487524330616, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33314487524330616, + "reward_after_std": 0.8062815628945827, + "reward_before_mean": 0.4346958175301552, + "reward_before_std": 0.8250590972602367, + "reward_change_max": 0.0003003552556037903, + "reward_change_mean": -0.10155095206573606, + "reward_change_min": -0.20147591084241867, + "reward_change_std": 0.08329685684293509, + "reward_std": 0.8062815852463245, + "rewards/cosine_scaled_reward": -0.06390209496021271, + "rewards/format_reward": 0.5625000093132257, + "step": 87 + }, + { + "advantage_max": 1.5729316174983978, + "advantage_mean": 1.9868215073159945e-08, + "advantage_min": -1.1915039494633675, + "advantage_std": 1.0524433217942715, + "completion_length": 3050.3125610351562, + "epoch": 0.10057142857142858, + "grad_norm": 0.224959596991539, + "kl": 0.009051322937011719, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0922, + "reward": 0.32696538232266903, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32696538232266903, + "reward_after_std": 1.0524433366954327, + "reward_before_mean": 0.41934662498533726, + "reward_before_std": 1.0758509896695614, + "reward_change_max": 0.00041228532791137695, + "reward_change_mean": -0.09238121099770069, + "reward_change_min": -0.19882145337760448, + "reward_change_std": 0.08465232839807868, + "reward_std": 1.0524433702230453, + "rewards/cosine_scaled_reward": 0.011756634339690208, + "rewards/format_reward": 0.39583334513008595, + "step": 88 + }, + { + "advantage_max": 0.6932465061545372, + "advantage_mean": 4.346172310931706e-09, + "advantage_min": -0.7988357171416283, + "advantage_std": 0.5768093653023243, + "completion_length": 3378.166717529297, + "epoch": 0.10171428571428572, + "grad_norm": 0.14413271844387054, + "kl": 0.004611492156982422, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0403, + "reward": -0.06333770230412483, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06333770230412483, + "reward_after_std": 0.57680937461555, + "reward_before_mean": 0.009203372988849878, + "reward_before_std": 0.6004473958164454, + "reward_change_max": 0.00012886524200439453, + "reward_change_mean": -0.07254105247557163, + "reward_change_min": -0.14528132881969213, + "reward_change_std": 0.06309301522560418, + "reward_std": 0.5768093969672918, + "rewards/cosine_scaled_reward": -0.1203983323648572, + "rewards/format_reward": 0.2500000074505806, + "step": 89 + }, + { + "advantage_max": 0.7317495383322239, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.6038379520177841, + "advantage_std": 0.502998935058713, + "completion_length": 2579.770851135254, + "epoch": 0.10285714285714286, + "grad_norm": 0.09057570993900299, + "kl": 0.012523651123046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0084, + "reward": 0.053218359127640724, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.053218359127640724, + "reward_after_std": 0.5029989331960678, + "reward_before_mean": 0.13523414358496666, + "reward_before_std": 0.5063311252743006, + "reward_change_max": 0.0001977384090423584, + "reward_change_mean": -0.08201578538864851, + "reward_change_min": -0.13647860940545797, + "reward_change_std": 0.053851797711104155, + "reward_std": 0.5029989369213581, + "rewards/cosine_scaled_reward": -0.17196626088116318, + "rewards/format_reward": 0.47916666977107525, + "step": 90 + }, + { + "advantage_max": 0.8002164922654629, + "advantage_mean": 2.1109978487476866e-08, + "advantage_min": -0.663936335593462, + "advantage_std": 0.598348455503583, + "completion_length": 3102.229202270508, + "epoch": 0.104, + "grad_norm": 0.1666155755519867, + "kl": 0.007025480270385742, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0472, + "reward": -0.064543966203928, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.064543966203928, + "reward_after_std": 0.5983484406024218, + "reward_before_mean": 0.005597323179244995, + "reward_before_std": 0.616460170596838, + "reward_change_max": 0.0004862844944000244, + "reward_change_mean": -0.07014129433082417, + "reward_change_min": -0.15747894253581762, + "reward_change_std": 0.060970506398007274, + "reward_std": 0.5983484499156475, + "rewards/cosine_scaled_reward": -0.1847013352671638, + "rewards/format_reward": 0.3750000037252903, + "step": 91 + }, + { + "advantage_max": 0.9697154983878136, + "advantage_mean": 6.829698917520943e-09, + "advantage_min": -0.8269370794296265, + "advantage_std": 0.7136020623147488, + "completion_length": 2876.666717529297, + "epoch": 0.10514285714285715, + "grad_norm": 0.11581981927156448, + "kl": 0.008868694305419922, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0375, + "reward": 0.10207435674965382, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10207435674965382, + "reward_after_std": 0.7136020660400391, + "reward_before_mean": 0.18458046624436975, + "reward_before_std": 0.7316001541912556, + "reward_change_max": 0.00030371546745300293, + "reward_change_mean": -0.0825060837669298, + "reward_change_min": -0.16981272120028734, + "reward_change_std": 0.072384690400213, + "reward_std": 0.7136020846664906, + "rewards/cosine_scaled_reward": -0.1577097848057747, + "rewards/format_reward": 0.5000000111758709, + "step": 92 + }, + { + "advantage_max": 0.5377812571823597, + "advantage_mean": 9.623667057701013e-09, + "advantage_min": -0.44808217138051987, + "advantage_std": 0.37609364092350006, + "completion_length": 3464.3958435058594, + "epoch": 0.10628571428571429, + "grad_norm": 0.09607281535863876, + "kl": 0.0065135955810546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0231, + "reward": -0.473403861746192, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.473403861746192, + "reward_after_std": 0.37609364092350006, + "reward_before_mean": -0.43631402403116226, + "reward_before_std": 0.38595324009656906, + "reward_change_max": 0.00021993368864059448, + "reward_change_mean": -0.03708983049727976, + "reward_change_min": -0.08134532067924738, + "reward_change_std": 0.0337000098079443, + "reward_std": 0.37609364464879036, + "rewards/cosine_scaled_reward": -0.23899034783244133, + "rewards/format_reward": 0.0416666679084301, + "step": 93 + }, + { + "advantage_max": 0.7897338904440403, + "advantage_mean": 1.614292521878724e-08, + "advantage_min": -0.6556259952485561, + "advantage_std": 0.53734415397048, + "completion_length": 3259.875030517578, + "epoch": 0.10742857142857143, + "grad_norm": 0.08970505744218826, + "kl": 0.008943557739257812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.78935800506826e-07, + "loss": 0.013, + "reward": -0.11383907869458199, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.11383907869458199, + "reward_after_std": 0.5373441409319639, + "reward_before_mean": -0.04790402948856354, + "reward_before_std": 0.5454462002962828, + "reward_change_max": 0.0002677515149116516, + "reward_change_mean": -0.06593502941541374, + "reward_change_min": -0.12173356115818024, + "reward_change_std": 0.04828887898474932, + "reward_std": 0.5373441465198994, + "rewards/cosine_scaled_reward": -0.10728535335510969, + "rewards/format_reward": 0.1666666679084301, + "step": 94 + }, + { + "advantage_max": 0.7336770445108414, + "advantage_mean": 1.7384688466570708e-08, + "advantage_min": -0.5926066339015961, + "advantage_std": 0.5384083883836865, + "completion_length": 3437.1250610351562, + "epoch": 0.10857142857142857, + "grad_norm": 0.09171445667743683, + "kl": 0.0028939247131347656, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0225, + "reward": -0.22875665500760078, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.22875665500760078, + "reward_after_std": 0.538408393971622, + "reward_before_mean": -0.17224636115133762, + "reward_before_std": 0.5542042218148708, + "reward_change_max": 0.00020701438188552856, + "reward_change_mean": -0.056510292924940586, + "reward_change_min": -0.12093199416995049, + "reward_change_std": 0.0512371362419799, + "reward_std": 0.5384083949029446, + "rewards/cosine_scaled_reward": -0.20070651546120644, + "rewards/format_reward": 0.2291666753590107, + "step": 95 + }, + { + "advantage_max": 1.1119077168405056, + "advantage_mean": -3.1044084525255755e-09, + "advantage_min": -0.8286112174391747, + "advantage_std": 0.8079782947897911, + "completion_length": 3218.187545776367, + "epoch": 0.10971428571428571, + "grad_norm": 0.1471080780029297, + "kl": 0.008174657821655273, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0564, + "reward": 0.08850634610280395, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08850634610280395, + "reward_after_std": 0.8079782761633396, + "reward_before_mean": 0.16710020136088133, + "reward_before_std": 0.8330794721841812, + "reward_change_max": 0.00022210925817489624, + "reward_change_mean": -0.07859386596828699, + "reward_change_min": -0.19585560448467731, + "reward_change_std": 0.07616083696484566, + "reward_std": 0.8079783134162426, + "rewards/cosine_scaled_reward": -0.0726998969912529, + "rewards/format_reward": 0.31250000558793545, + "step": 96 + }, + { + "advantage_max": 0.774888951331377, + "advantage_mean": 8.07146216530441e-09, + "advantage_min": -0.7759076952934265, + "advantage_std": 0.6380616500973701, + "completion_length": 3350.541717529297, + "epoch": 0.11085714285714286, + "grad_norm": 0.13446907699108124, + "kl": 0.0053157806396484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0727, + "reward": -0.1549377404153347, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1549377404153347, + "reward_after_std": 0.638061661273241, + "reward_before_mean": -0.09221046045422554, + "reward_before_std": 0.6677076295018196, + "reward_change_max": 0.00012560933828353882, + "reward_change_mean": -0.06272726762108505, + "reward_change_min": -0.14727217331528664, + "reward_change_std": 0.06415313272736967, + "reward_std": 0.6380616910755634, + "rewards/cosine_scaled_reward": -0.13985523395240307, + "rewards/format_reward": 0.1875, + "step": 97 + }, + { + "advantage_max": 0.544066796079278, + "advantage_mean": 9.934107592091124e-09, + "advantage_min": -0.6370603069663048, + "advantage_std": 0.4388393219560385, + "completion_length": 2994.9583892822266, + "epoch": 0.112, + "grad_norm": 0.08746904134750366, + "kl": 0.004663944244384766, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0244, + "reward": -0.046972109004855156, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.046972109004855156, + "reward_after_std": 0.4388393219560385, + "reward_before_mean": 0.029481630073860288, + "reward_before_std": 0.45071091689169407, + "reward_change_max": 8.702278137207031e-06, + "reward_change_mean": -0.07645371626131237, + "reward_change_min": -0.13023042678833008, + "reward_change_std": 0.05339970113709569, + "reward_std": 0.4388393349945545, + "rewards/cosine_scaled_reward": -0.183175852522254, + "rewards/format_reward": 0.3958333507180214, + "step": 98 + }, + { + "advantage_max": 0.8721281476318836, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.6215038821101189, + "advantage_std": 0.5652411505579948, + "completion_length": 2965.500015258789, + "epoch": 0.11314285714285714, + "grad_norm": 0.10346469283103943, + "kl": 0.007491111755371094, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0261, + "reward": 0.052828481420874596, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.052828481420874596, + "reward_after_std": 0.5652411617338657, + "reward_before_mean": 0.1324861806933768, + "reward_before_std": 0.5641120858490467, + "reward_change_max": 5.237758159637451e-05, + "reward_change_mean": -0.07965772063471377, + "reward_change_min": -0.1496228538453579, + "reward_change_std": 0.06120690796524286, + "reward_std": 0.5652411766350269, + "rewards/cosine_scaled_reward": -0.06917357502970845, + "rewards/format_reward": 0.2708333395421505, + "step": 99 + }, + { + "advantage_max": 1.177578266710043, + "advantage_mean": -6.20881640545079e-09, + "advantage_min": -1.0268895998597145, + "advantage_std": 0.9141651093959808, + "completion_length": 2879.3958740234375, + "epoch": 0.11428571428571428, + "grad_norm": 0.16082407534122467, + "kl": 0.011318206787109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0754, + "reward": 0.26343181263655424, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26343181263655424, + "reward_after_std": 0.914165124297142, + "reward_before_mean": 0.3574203960597515, + "reward_before_std": 0.9503813628107309, + "reward_change_max": 0.0003396347165107727, + "reward_change_mean": -0.09398859040811658, + "reward_change_min": -0.20327434316277504, + "reward_change_std": 0.08825246221385896, + "reward_std": 0.9141651410609484, + "rewards/cosine_scaled_reward": -0.02962313499301672, + "rewards/format_reward": 0.41666667722165585, + "step": 100 + }, + { + "advantage_max": 0.7537824623286724, + "advantage_mean": 3.290673178391046e-08, + "advantage_min": -0.5371817983686924, + "advantage_std": 0.48059073835611343, + "completion_length": 2831.1041870117188, + "epoch": 0.11542857142857142, + "grad_norm": 0.10156463831663132, + "kl": 0.005992412567138672, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0389, + "reward": 0.06839887425303459, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06839887425303459, + "reward_after_std": 0.4805907569825649, + "reward_before_mean": 0.15205013938248158, + "reward_before_std": 0.47612071223556995, + "reward_change_max": 0.0001445487141609192, + "reward_change_mean": -0.08365123742260039, + "reward_change_min": -0.146190470084548, + "reward_change_std": 0.05711901048198342, + "reward_std": 0.4805907681584358, + "rewards/cosine_scaled_reward": -0.09064160846173763, + "rewards/format_reward": 0.33333334140479565, + "step": 101 + }, + { + "advantage_max": 1.2887723445892334, + "advantage_mean": 1.862645426786713e-09, + "advantage_min": -0.8732288219034672, + "advantage_std": 0.7983098104596138, + "completion_length": 3045.4584350585938, + "epoch": 0.11657142857142858, + "grad_norm": 0.17374233901500702, + "kl": 0.013805389404296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0493, + "reward": 0.11814034357666969, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11814034357666969, + "reward_after_std": 0.7983098365366459, + "reward_before_mean": 0.1961025595664978, + "reward_before_std": 0.8020947612822056, + "reward_change_max": 0.0001891404390335083, + "reward_change_mean": -0.07796221878379583, + "reward_change_min": -0.15777896530926228, + "reward_change_std": 0.06380243599414825, + "reward_std": 0.7983098588883877, + "rewards/cosine_scaled_reward": -0.1311153913848102, + "rewards/format_reward": 0.45833334140479565, + "step": 102 + }, + { + "advantage_max": 1.012158952653408, + "advantage_mean": 4.346171922353648e-09, + "advantage_min": -0.9280484542250633, + "advantage_std": 0.806487213820219, + "completion_length": 3334.104248046875, + "epoch": 0.11771428571428572, + "grad_norm": 0.16222251951694489, + "kl": 0.010880470275878906, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0803, + "reward": -0.02617565030232072, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02617565030232072, + "reward_after_std": 0.8064871914684772, + "reward_before_mean": 0.043522678315639496, + "reward_before_std": 0.8436622954905033, + "reward_change_max": 0.0005996227264404297, + "reward_change_mean": -0.06969833420589566, + "reward_change_min": -0.17376555316150188, + "reward_change_std": 0.07702636765316129, + "reward_std": 0.8064872398972511, + "rewards/cosine_scaled_reward": -0.1240720006171614, + "rewards/format_reward": 0.29166666977107525, + "step": 103 + }, + { + "advantage_max": 0.4506031647324562, + "advantage_mean": 1.98682153507157e-08, + "advantage_min": -0.49186787754297256, + "advantage_std": 0.3530017454177141, + "completion_length": 2766.1041870117188, + "epoch": 0.11885714285714286, + "grad_norm": 0.06478388607501984, + "kl": 0.013357162475585938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0074, + "reward": 0.146314088255167, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.146314088255167, + "reward_after_std": 0.3530017528682947, + "reward_before_mean": 0.2424488589167595, + "reward_before_std": 0.35200561955571175, + "reward_change_max": 0.00045055896043777466, + "reward_change_mean": -0.09613475622609258, + "reward_change_min": -0.14989042840898037, + "reward_change_std": 0.059963473584502935, + "reward_std": 0.35300176963210106, + "rewards/cosine_scaled_reward": -0.06627557054162025, + "rewards/format_reward": 0.375, + "step": 104 + }, + { + "advantage_max": 1.1343233175575733, + "advantage_mean": -1.92473328941567e-08, + "advantage_min": -0.9812361150979996, + "advantage_std": 0.786447387188673, + "completion_length": 2954.8333740234375, + "epoch": 0.12, + "grad_norm": 0.1910223364830017, + "kl": 0.008061408996582031, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0804, + "reward": 0.12759940326213837, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12759940326213837, + "reward_after_std": 0.7864474020898342, + "reward_before_mean": 0.20970958936959505, + "reward_before_std": 0.804718591272831, + "reward_change_max": 0.00041823089122772217, + "reward_change_mean": -0.08211018680594862, + "reward_change_min": -0.166708511300385, + "reward_change_std": 0.07057785498909652, + "reward_std": 0.7864474169909954, + "rewards/cosine_scaled_reward": -0.0826452155597508, + "rewards/format_reward": 0.3750000149011612, + "step": 105 + }, + { + "advantage_max": 1.1130489706993103, + "advantage_mean": -3.228584932735146e-08, + "advantage_min": -1.018290713429451, + "advantage_std": 0.8191616833209991, + "completion_length": 2387.500057220459, + "epoch": 0.12114285714285715, + "grad_norm": 0.11806398630142212, + "kl": 0.013179779052734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0162, + "reward": 0.8837066609412432, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8837066609412432, + "reward_after_std": 0.8191616646945477, + "reward_before_mean": 1.036157036665827, + "reward_before_std": 0.8276788517832756, + "reward_change_max": 0.0, + "reward_change_mean": -0.15245038806460798, + "reward_change_min": -0.2699962416663766, + "reward_change_std": 0.10578735335730016, + "reward_std": 0.8191616907715797, + "rewards/cosine_scaled_reward": 0.18474517948925495, + "rewards/format_reward": 0.666666679084301, + "step": 106 + }, + { + "advantage_max": 0.8276206701993942, + "advantage_mean": -1.9247333005179e-08, + "advantage_min": -0.7223168984055519, + "advantage_std": 0.5651622377336025, + "completion_length": 2848.854232788086, + "epoch": 0.12228571428571429, + "grad_norm": 0.08836143463850021, + "kl": 0.00736236572265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0252, + "reward": 0.30740308575332165, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.30740308575332165, + "reward_after_std": 0.5651622079312801, + "reward_before_mean": 0.41169095039367676, + "reward_before_std": 0.5657424293458462, + "reward_change_max": 7.306039333343506e-05, + "reward_change_mean": -0.1042878954904154, + "reward_change_min": -0.16713330894708633, + "reward_change_std": 0.06733074027579278, + "reward_std": 0.5651622265577316, + "rewards/cosine_scaled_reward": -0.044154517352581024, + "rewards/format_reward": 0.5000000111758709, + "step": 107 + }, + { + "advantage_max": 1.0464369431138039, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.920787513256073, + "advantage_std": 0.8237064126878977, + "completion_length": 3022.750030517578, + "epoch": 0.12342857142857143, + "grad_norm": 5.710745811462402, + "kl": 0.679987907409668, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.636109026648554e-07, + "loss": 0.1282, + "reward": -0.0018061436712741852, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0018061436712741852, + "reward_after_std": 0.8237064089626074, + "reward_before_mean": 0.06948780082166195, + "reward_before_std": 0.8604318238794804, + "reward_change_max": 0.0007350221276283264, + "reward_change_mean": -0.07129394076764584, + "reward_change_min": -0.18075018282979727, + "reward_change_std": 0.07853512931615114, + "reward_std": 0.8237064350396395, + "rewards/cosine_scaled_reward": -0.12150610354728997, + "rewards/format_reward": 0.3125000074505806, + "step": 108 + }, + { + "advantage_max": 0.8055699914693832, + "advantage_mean": -6.2088179597630244e-09, + "advantage_min": -0.5043129064142704, + "advantage_std": 0.495711550116539, + "completion_length": 3027.312530517578, + "epoch": 0.12457142857142857, + "grad_norm": 0.0767723098397255, + "kl": 0.006561279296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0116, + "reward": 0.04255384439602494, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04255384439602494, + "reward_after_std": 0.495711550116539, + "reward_before_mean": 0.12285317666828632, + "reward_before_std": 0.49004461243748665, + "reward_change_max": 0.00023395568132400513, + "reward_change_mean": -0.08029932924546301, + "reward_change_min": -0.13384640123695135, + "reward_change_std": 0.053254172671586275, + "reward_std": 0.4957115687429905, + "rewards/cosine_scaled_reward": -0.10524008236825466, + "rewards/format_reward": 0.33333334140479565, + "step": 109 + }, + { + "advantage_max": 0.9220863282680511, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.5900181010365486, + "advantage_std": 0.6116988677531481, + "completion_length": 3102.7916717529297, + "epoch": 0.12571428571428572, + "grad_norm": 0.10041435062885284, + "kl": 0.008144378662109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0172, + "reward": -0.07845552056096494, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07845552056096494, + "reward_after_std": 0.6116988696157932, + "reward_before_mean": -0.011700557777658105, + "reward_before_std": 0.6220829505473375, + "reward_change_max": 0.0005408599972724915, + "reward_change_mean": -0.06675496569368988, + "reward_change_min": -0.15565920621156693, + "reward_change_std": 0.05837497836910188, + "reward_std": 0.6116988770663738, + "rewards/cosine_scaled_reward": -0.17251694481819868, + "rewards/format_reward": 0.33333333767950535, + "step": 110 + }, + { + "advantage_max": 0.8496062196791172, + "advantage_mean": -1.2107193692045826e-08, + "advantage_min": -0.7639882974326611, + "advantage_std": 0.6249940134584904, + "completion_length": 3443.8541870117188, + "epoch": 0.12685714285714286, + "grad_norm": 0.11901643127202988, + "kl": 0.010395050048828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0361, + "reward": -0.019697923213243484, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.019697923213243484, + "reward_after_std": 0.6249940060079098, + "reward_before_mean": 0.05418802239000797, + "reward_before_std": 0.64259023219347, + "reward_change_max": 9.847432374954224e-05, + "reward_change_mean": -0.07388598122633994, + "reward_change_min": -0.14417694509029388, + "reward_change_std": 0.060518967104144394, + "reward_std": 0.6249940097332001, + "rewards/cosine_scaled_reward": -0.05623931344598532, + "rewards/format_reward": 0.1666666716337204, + "step": 111 + }, + { + "advantage_max": 1.38174469769001, + "advantage_mean": 7.761021492136422e-09, + "advantage_min": -0.975238636136055, + "advantage_std": 0.8997639156877995, + "completion_length": 3331.937530517578, + "epoch": 0.128, + "grad_norm": 0.16729916632175446, + "kl": 0.0065631866455078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0344, + "reward": 0.40944708324968815, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40944708324968815, + "reward_after_std": 0.8997639268636703, + "reward_before_mean": 0.5130116026848555, + "reward_before_std": 0.9069201238453388, + "reward_change_max": 0.0003801584243774414, + "reward_change_mean": -0.10356450360268354, + "reward_change_min": -0.21430773753672838, + "reward_change_std": 0.08684224355965853, + "reward_std": 0.899763960391283, + "rewards/cosine_scaled_reward": 0.06900579854846, + "rewards/format_reward": 0.37500000558793545, + "step": 112 + }, + { + "advantage_max": 1.1800456158816814, + "advantage_mean": 1.8626449826975033e-09, + "advantage_min": -0.7997603341937065, + "advantage_std": 0.7590612880885601, + "completion_length": 2949.9583740234375, + "epoch": 0.12914285714285714, + "grad_norm": 0.21318159997463226, + "kl": 0.01378631591796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0792, + "reward": -0.0245030396617949, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0245030396617949, + "reward_after_std": 0.7590613029897213, + "reward_before_mean": 0.04212821088731289, + "reward_before_std": 0.7715008705854416, + "reward_change_max": 0.00035149604082107544, + "reward_change_mean": -0.06663124216720462, + "reward_change_min": -0.15565018076449633, + "reward_change_std": 0.0617706241318956, + "reward_std": 0.7590613178908825, + "rewards/cosine_scaled_reward": -0.17685257643461227, + "rewards/format_reward": 0.3958333395421505, + "step": 113 + }, + { + "advantage_max": 0.80317697301507, + "advantage_mean": -2.4835269951672956e-09, + "advantage_min": -0.6111781224608421, + "advantage_std": 0.5317833982408047, + "completion_length": 2686.229217529297, + "epoch": 0.13028571428571428, + "grad_norm": 0.08997328579425812, + "kl": 0.0075283050537109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0055, + "reward": 0.14073886536061764, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14073886536061764, + "reward_after_std": 0.5317834094166756, + "reward_before_mean": 0.2300937306135893, + "reward_before_std": 0.5275977291166782, + "reward_change_max": 0.0, + "reward_change_mean": -0.08935487153939903, + "reward_change_min": -0.16146521922200918, + "reward_change_std": 0.059935636119917035, + "reward_std": 0.5317834354937077, + "rewards/cosine_scaled_reward": -0.17661980912089348, + "rewards/format_reward": 0.5833333488553762, + "step": 114 + }, + { + "advantage_max": 1.2508711740374565, + "advantage_mean": 1.4280279958533981e-08, + "advantage_min": -0.6822191178798676, + "advantage_std": 0.8110570348799229, + "completion_length": 2879.437530517578, + "epoch": 0.13142857142857142, + "grad_norm": 0.1294396072626114, + "kl": 0.0072422027587890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0491, + "reward": -0.02063230169005692, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02063230169005692, + "reward_after_std": 0.8110570386052132, + "reward_before_mean": 0.04543160554021597, + "reward_before_std": 0.8268074579536915, + "reward_change_max": 6.017833948135376e-05, + "reward_change_mean": -0.06606390746310353, + "reward_change_min": -0.18289340753108263, + "reward_change_std": 0.06786925066262484, + "reward_std": 0.8110570646822453, + "rewards/cosine_scaled_reward": -0.16478420107159764, + "rewards/format_reward": 0.3750000074505806, + "step": 115 + }, + { + "advantage_max": 0.9620575942099094, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -0.7253379821777344, + "advantage_std": 0.6947087794542313, + "completion_length": 3391.4166870117188, + "epoch": 0.13257142857142856, + "grad_norm": 0.1599225401878357, + "kl": 0.00789642333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0384, + "reward": -0.1465127021074295, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1465127021074295, + "reward_after_std": 0.6947087645530701, + "reward_before_mean": -0.086684700101614, + "reward_before_std": 0.7185013070702553, + "reward_change_max": 0.0002536848187446594, + "reward_change_mean": -0.05982800526544452, + "reward_change_min": -0.14509317371994257, + "reward_change_std": 0.0610161469085142, + "reward_std": 0.694708775728941, + "rewards/cosine_scaled_reward": -0.1266756821423769, + "rewards/format_reward": 0.16666666977107525, + "step": 116 + }, + { + "advantage_max": 0.9573287703096867, + "advantage_mean": -6.829699028543246e-09, + "advantage_min": -0.5708078518509865, + "advantage_std": 0.608824010938406, + "completion_length": 3233.687530517578, + "epoch": 0.1337142857142857, + "grad_norm": 0.11774991452693939, + "kl": 0.01103973388671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0571, + "reward": -0.3432406187057495, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3432406187057495, + "reward_after_std": 0.6088240072131157, + "reward_before_mean": -0.3020863775163889, + "reward_before_std": 0.62282644957304, + "reward_change_max": 0.00045599788427352905, + "reward_change_mean": -0.04115423874463886, + "reward_change_min": -0.10824007168412209, + "reward_change_std": 0.04387360031250864, + "reward_std": 0.6088240295648575, + "rewards/cosine_scaled_reward": -0.23437653545988724, + "rewards/format_reward": 0.16666666977107525, + "step": 117 + }, + { + "advantage_max": 1.032807346433401, + "advantage_mean": -4.967053324200776e-09, + "advantage_min": -0.780588660389185, + "advantage_std": 0.7265257742255926, + "completion_length": 2903.208366394043, + "epoch": 0.13485714285714287, + "grad_norm": 0.11558439582586288, + "kl": 0.00542449951171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0019, + "reward": 0.364423219114542, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.364423219114542, + "reward_after_std": 0.7265257630497217, + "reward_before_mean": 0.4703831188380718, + "reward_before_std": 0.7357173040509224, + "reward_change_max": 0.0, + "reward_change_mean": -0.10595988691784441, + "reward_change_min": -0.21627100184559822, + "reward_change_std": 0.0819319833535701, + "reward_std": 0.7265257872641087, + "rewards/cosine_scaled_reward": 0.006024882197380066, + "rewards/format_reward": 0.4583333395421505, + "step": 118 + }, + { + "advantage_max": 0.9601045697927475, + "advantage_mean": -2.2972624136308184e-08, + "advantage_min": -0.8546690195798874, + "advantage_std": 0.7206116262823343, + "completion_length": 2613.458351135254, + "epoch": 0.136, + "grad_norm": 0.28752654790878296, + "kl": 0.11657333374023438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0707, + "reward": 0.3269413001835346, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3269413001835346, + "reward_after_std": 0.7206116206943989, + "reward_before_mean": 0.4298500046133995, + "reward_before_std": 0.7387393917888403, + "reward_change_max": 0.0004165545105934143, + "reward_change_mean": -0.1029087018687278, + "reward_change_min": -0.19583096075803041, + "reward_change_std": 0.07978490833193064, + "reward_std": 0.7206116504967213, + "rewards/cosine_scaled_reward": -0.02465835213661194, + "rewards/format_reward": 0.4791666716337204, + "step": 119 + }, + { + "advantage_max": 1.0327376946806908, + "advantage_mean": -2.4835267176115394e-09, + "advantage_min": -0.8866618499159813, + "advantage_std": 0.7398336865007877, + "completion_length": 2519.7708892822266, + "epoch": 0.13714285714285715, + "grad_norm": 0.16047891974449158, + "kl": 0.01155853271484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0541, + "reward": 0.4021491319872439, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4021491319872439, + "reward_after_std": 0.7398336827754974, + "reward_before_mean": 0.5111243925057352, + "reward_before_std": 0.7496637143194675, + "reward_change_max": 0.00031457841396331787, + "reward_change_mean": -0.10897527076303959, + "reward_change_min": -0.21183301974087954, + "reward_change_std": 0.08238738542422652, + "reward_std": 0.7398337163031101, + "rewards/cosine_scaled_reward": 0.005562208592891693, + "rewards/format_reward": 0.5000000093132257, + "step": 120 + }, + { + "advantage_max": 1.0117009952664375, + "advantage_mean": 1.5522044760629683e-09, + "advantage_min": -0.8701439537107944, + "advantage_std": 0.7287258952856064, + "completion_length": 2291.4166870117188, + "epoch": 0.1382857142857143, + "grad_norm": 0.15173783898353577, + "kl": 0.01496124267578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0628, + "reward": 0.2898542070761323, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2898542070761323, + "reward_after_std": 0.7287258952856064, + "reward_before_mean": 0.38868426950648427, + "reward_before_std": 0.7429370507597923, + "reward_change_max": 0.0002674087882041931, + "reward_change_mean": -0.0988300465978682, + "reward_change_min": -0.1901166085153818, + "reward_change_std": 0.07626715092919767, + "reward_std": 0.7287259213626385, + "rewards/cosine_scaled_reward": -0.1181578729301691, + "rewards/format_reward": 0.6250000093132257, + "step": 121 + }, + { + "advantage_max": 0.6881704181432724, + "advantage_mean": -6.829698862009792e-09, + "advantage_min": -0.5806205496191978, + "advantage_std": 0.5137464459985495, + "completion_length": 2758.06258392334, + "epoch": 0.13942857142857143, + "grad_norm": 0.12575501203536987, + "kl": 0.008263587951660156, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.443380060197385e-07, + "loss": 0.049, + "reward": 0.34448036178946495, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34448036178946495, + "reward_after_std": 0.5137464534491301, + "reward_before_mean": 0.454755075275898, + "reward_before_std": 0.5069625999312848, + "reward_change_max": 0.0003061145544052124, + "reward_change_mean": -0.11027471465058625, + "reward_change_min": -0.18773560784757137, + "reward_change_std": 0.07475250354036689, + "reward_std": 0.5137464664876461, + "rewards/cosine_scaled_reward": 0.008627532981336117, + "rewards/format_reward": 0.4375000037252903, + "step": 122 + }, + { + "advantage_max": 0.8377508036792278, + "advantage_mean": -2.79396769609086e-09, + "advantage_min": -0.789555948227644, + "advantage_std": 0.6173290908336639, + "completion_length": 2958.666717529297, + "epoch": 0.14057142857142857, + "grad_norm": 0.1022125631570816, + "kl": 0.008504867553710938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0552, + "reward": 0.09822776913642883, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09822776913642883, + "reward_after_std": 0.6173290759325027, + "reward_before_mean": 0.18268409557640553, + "reward_before_std": 0.6303858272731304, + "reward_change_max": 0.0, + "reward_change_mean": -0.08445633691735566, + "reward_change_min": -0.16947042290121317, + "reward_change_std": 0.06656381417997181, + "reward_std": 0.6173290759325027, + "rewards/cosine_scaled_reward": -0.10657462244853377, + "rewards/format_reward": 0.3958333395421505, + "step": 123 + }, + { + "advantage_max": 1.3881800100207329, + "advantage_mean": -2.3903946710923663e-08, + "advantage_min": -0.8908319100737572, + "advantage_std": 0.8987450338900089, + "completion_length": 2499.6458892822266, + "epoch": 0.1417142857142857, + "grad_norm": 0.12640397250652313, + "kl": 0.01064300537109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0231, + "reward": 0.2822089372202754, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2822089372202754, + "reward_after_std": 0.8987450078129768, + "reward_before_mean": 0.37344337720423937, + "reward_before_std": 0.9098445884883404, + "reward_change_max": 0.0, + "reward_change_mean": -0.09123445488512516, + "reward_change_min": -0.18366647511720657, + "reward_change_std": 0.07298110821284354, + "reward_std": 0.8987450487911701, + "rewards/cosine_scaled_reward": -0.1049449909478426, + "rewards/format_reward": 0.583333345130086, + "step": 124 + }, + { + "advantage_max": 0.9330607615411282, + "advantage_mean": -1.986821573929376e-08, + "advantage_min": -0.5653700307011604, + "advantage_std": 0.5595428682863712, + "completion_length": 2889.250030517578, + "epoch": 0.14285714285714285, + "grad_norm": 0.07559899240732193, + "kl": 0.008258819580078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.397114317029974e-07, + "loss": 0.005, + "reward": 0.06734631024301052, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06734631024301052, + "reward_after_std": 0.5595428682863712, + "reward_before_mean": 0.14727872982621193, + "reward_before_std": 0.549531988799572, + "reward_change_max": 0.0004915222525596619, + "reward_change_mean": -0.07993244659155607, + "reward_change_min": -0.14166234154254198, + "reward_change_std": 0.054563989629969, + "reward_std": 0.5595428794622421, + "rewards/cosine_scaled_reward": -0.07219396787695587, + "rewards/format_reward": 0.2916666679084301, + "step": 125 + }, + { + "advantage_max": 1.0017745271325111, + "advantage_mean": -1.800557009046244e-08, + "advantage_min": -0.9506868049502373, + "advantage_std": 0.7525536194443703, + "completion_length": 2789.166717529297, + "epoch": 0.144, + "grad_norm": 0.18889941275119781, + "kl": 0.006542205810546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0565, + "reward": 0.3237967677414417, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3237967677414417, + "reward_after_std": 0.7525536343455315, + "reward_before_mean": 0.42645698226988316, + "reward_before_std": 0.7722005806863308, + "reward_change_max": 9.433180093765259e-05, + "reward_change_mean": -0.10266020614653826, + "reward_change_min": -0.20530739519745111, + "reward_change_std": 0.08366158325225115, + "reward_std": 0.7525536641478539, + "rewards/cosine_scaled_reward": -0.03677151817828417, + "rewards/format_reward": 0.5000000037252903, + "step": 126 + }, + { + "advantage_max": 1.0069448873400688, + "advantage_mean": -2.7755575615628914e-16, + "advantage_min": -0.7727540284395218, + "advantage_std": 0.6543919891119003, + "completion_length": 3197.250015258789, + "epoch": 0.14514285714285713, + "grad_norm": 0.10907334089279175, + "kl": 0.01126861572265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0366, + "reward": -0.1559242196381092, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1559242196381092, + "reward_after_std": 0.6543919928371906, + "reward_before_mean": -0.0979102123528719, + "reward_before_std": 0.6656555905938148, + "reward_change_max": 0.0006112977862358093, + "reward_change_mean": -0.058013999834656715, + "reward_change_min": -0.12248460203409195, + "reward_change_std": 0.05088945245370269, + "reward_std": 0.654392022639513, + "rewards/cosine_scaled_reward": -0.18437177990563214, + "rewards/format_reward": 0.27083334140479565, + "step": 127 + }, + { + "advantage_max": 0.896282397210598, + "advantage_mean": 8.692343733684993e-09, + "advantage_min": -1.1638800874352455, + "advantage_std": 0.8231517635285854, + "completion_length": 2812.562530517578, + "epoch": 0.1462857142857143, + "grad_norm": 0.13105569779872894, + "kl": 0.01023101806640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0184, + "reward": 0.4567085765302181, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4567085765302181, + "reward_after_std": 0.8231517747044563, + "reward_before_mean": 0.5736916027963161, + "reward_before_std": 0.8603745512664318, + "reward_change_max": 0.0002776235342025757, + "reward_change_mean": -0.11698298552073538, + "reward_change_min": -0.2090506199747324, + "reward_change_std": 0.0941194579936564, + "reward_std": 0.8231517784297466, + "rewards/cosine_scaled_reward": 0.05767912045121193, + "rewards/format_reward": 0.4583333432674408, + "step": 128 + }, + { + "advantage_max": 1.215379349887371, + "advantage_mean": -2.23517425679276e-08, + "advantage_min": -0.631638377904892, + "advantage_std": 0.6916131414473057, + "completion_length": 3391.3958435058594, + "epoch": 0.14742857142857144, + "grad_norm": 0.13277558982372284, + "kl": 0.011829376220703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0186, + "reward": -0.27772839553654194, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.27772839553654194, + "reward_after_std": 0.6916131395846605, + "reward_before_mean": -0.23465027287602425, + "reward_before_std": 0.6941855438053608, + "reward_change_max": 0.0004948228597640991, + "reward_change_mean": -0.04307813826017082, + "reward_change_min": -0.10449518729001284, + "reward_change_std": 0.04153990955092013, + "reward_std": 0.6916131433099508, + "rewards/cosine_scaled_reward": -0.20065847536170622, + "rewards/format_reward": 0.1666666716337204, + "step": 129 + }, + { + "advantage_max": 0.8671197295188904, + "advantage_mean": 6.829699028543246e-09, + "advantage_min": -0.6465287543833256, + "advantage_std": 0.5738713406026363, + "completion_length": 3142.7291717529297, + "epoch": 0.14857142857142858, + "grad_norm": 0.10791204124689102, + "kl": 0.01093292236328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0106, + "reward": -0.05407215282320976, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05407215282320976, + "reward_after_std": 0.5738713443279266, + "reward_before_mean": 0.01597772934474051, + "reward_before_std": 0.5806875862181187, + "reward_change_max": 0.0, + "reward_change_mean": -0.07004988705739379, + "reward_change_min": -0.13436190504580736, + "reward_change_std": 0.05401943693868816, + "reward_std": 0.5738713517785072, + "rewards/cosine_scaled_reward": -0.10659446427598596, + "rewards/format_reward": 0.22916666977107525, + "step": 130 + }, + { + "advantage_max": 1.1662424430251122, + "advantage_mean": -1.2417634920325327e-08, + "advantage_min": -0.8795076087117195, + "advantage_std": 0.807879064232111, + "completion_length": 2856.1667404174805, + "epoch": 0.14971428571428572, + "grad_norm": 0.15617187321186066, + "kl": 0.013149261474609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.299475664759068e-07, + "loss": 0.083, + "reward": 0.32507515139877796, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.32507515139877796, + "reward_after_std": 0.8078790456056595, + "reward_before_mean": 0.42472299188375473, + "reward_before_std": 0.8255997374653816, + "reward_change_max": 0.00011242181062698364, + "reward_change_mean": -0.09964784735348076, + "reward_change_min": -0.20078612212091684, + "reward_change_std": 0.08023856161162257, + "reward_std": 0.8078790530562401, + "rewards/cosine_scaled_reward": -0.006388511508703232, + "rewards/format_reward": 0.4375000074505806, + "step": 131 + }, + { + "advantage_max": 0.7666987664997578, + "advantage_mean": 2.4835269507583746e-08, + "advantage_min": -0.7063778378069401, + "advantage_std": 0.5695809759199619, + "completion_length": 2699.0625228881836, + "epoch": 0.15085714285714286, + "grad_norm": 0.08800819516181946, + "kl": 0.0089569091796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.282549715730579e-07, + "loss": -0.0231, + "reward": 0.20854278281331062, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20854278281331062, + "reward_after_std": 0.5695809768512845, + "reward_before_mean": 0.3043249621987343, + "reward_before_std": 0.577835189178586, + "reward_change_max": 9.389221668243408e-05, + "reward_change_mean": -0.09578215330839157, + "reward_change_min": -0.17351566720753908, + "reward_change_std": 0.06650701339822263, + "reward_std": 0.5695809926837683, + "rewards/cosine_scaled_reward": -0.03533751145005226, + "rewards/format_reward": 0.375, + "step": 132 + }, + { + "advantage_max": 0.8124303817749023, + "advantage_mean": 3.104408841103634e-09, + "advantage_min": -0.5613624155521393, + "advantage_std": 0.5233605541288853, + "completion_length": 3215.000030517578, + "epoch": 0.152, + "grad_norm": 0.11630722880363464, + "kl": 0.013019561767578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0422, + "reward": -0.26990117644891143, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.26990117644891143, + "reward_after_std": 0.5233605578541756, + "reward_before_mean": -0.21911684470251203, + "reward_before_std": 0.5289643295109272, + "reward_change_max": 0.00056438148021698, + "reward_change_mean": -0.05078433989547193, + "reward_change_min": -0.10732162557542324, + "reward_change_std": 0.0434577246196568, + "reward_std": 0.5233605690300465, + "rewards/cosine_scaled_reward": -0.23455842956900597, + "rewards/format_reward": 0.25000000931322575, + "step": 133 + }, + { + "advantage_max": 1.3054607100784779, + "advantage_mean": 4.96705393482344e-09, + "advantage_min": -0.8411310873925686, + "advantage_std": 0.8696665279567242, + "completion_length": 2491.3333587646484, + "epoch": 0.15314285714285714, + "grad_norm": 0.17809806764125824, + "kl": 0.013080596923828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0184, + "reward": 0.2889123857021332, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2889123857021332, + "reward_after_std": 0.8696665279567242, + "reward_before_mean": 0.38274707458913326, + "reward_before_std": 0.8822485581040382, + "reward_change_max": 0.0, + "reward_change_mean": -0.09383466956205666, + "reward_change_min": -0.18727249279618263, + "reward_change_std": 0.07527722232043743, + "reward_std": 0.869666550308466, + "rewards/cosine_scaled_reward": -0.08987647667527199, + "rewards/format_reward": 0.5625000055879354, + "step": 134 + }, + { + "advantage_max": 1.1486023664474487, + "advantage_mean": -2.0489097529718947e-08, + "advantage_min": -1.1527246832847595, + "advantage_std": 0.9214292168617249, + "completion_length": 2233.895851135254, + "epoch": 0.15428571428571428, + "grad_norm": 0.4533507227897644, + "kl": 0.012312889099121094, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.230669076497687e-07, + "loss": 0.086, + "reward": 0.7777070086449385, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7777070086449385, + "reward_after_std": 0.9214292392134666, + "reward_before_mean": 0.9196147546172142, + "reward_before_std": 0.9462558915838599, + "reward_change_max": 0.00037839263677597046, + "reward_change_mean": -0.14190778695046902, + "reward_change_min": -0.27010027691721916, + "reward_change_std": 0.11189589183777571, + "reward_std": 0.9214292727410793, + "rewards/cosine_scaled_reward": 0.16814071801491082, + "rewards/format_reward": 0.5833333395421505, + "step": 135 + }, + { + "advantage_max": 1.1181330271065235, + "advantage_mean": -4.996003610813204e-16, + "advantage_min": -1.023313906043768, + "advantage_std": 0.7946756221354008, + "completion_length": 2813.791748046875, + "epoch": 0.15542857142857142, + "grad_norm": 0.17351149022579193, + "kl": 0.015716552734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0675, + "reward": 0.4721411466598511, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4721411466598511, + "reward_after_std": 0.7946756184101105, + "reward_before_mean": 0.5856084409169853, + "reward_before_std": 0.80930364318192, + "reward_change_max": 2.7738511562347412e-05, + "reward_change_mean": -0.11346728634089231, + "reward_change_min": -0.1980165708810091, + "reward_change_std": 0.08056956913787872, + "reward_std": 0.7946756482124329, + "rewards/cosine_scaled_reward": 0.042804209515452385, + "rewards/format_reward": 0.5000000149011612, + "step": 136 + }, + { + "advantage_max": 1.137401022017002, + "advantage_mean": 1.3969838702498905e-08, + "advantage_min": -0.7347718961536884, + "advantage_std": 0.7254753410816193, + "completion_length": 3117.041717529297, + "epoch": 0.15657142857142858, + "grad_norm": 0.14943963289260864, + "kl": 0.013874053955078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.195171441101668e-07, + "loss": 0.059, + "reward": -0.13048943784087896, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13048943784087896, + "reward_after_std": 0.7254753559827805, + "reward_before_mean": -0.07333034556359053, + "reward_before_std": 0.7382145449519157, + "reward_change_max": 0.0002055242657661438, + "reward_change_mean": -0.0571591054322198, + "reward_change_min": -0.13340783957391977, + "reward_change_std": 0.0530292927287519, + "reward_std": 0.7254753783345222, + "rewards/cosine_scaled_reward": -0.18249849835410714, + "rewards/format_reward": 0.2916666753590107, + "step": 137 + }, + { + "advantage_max": 1.3317741975188255, + "advantage_mean": 1.676380706472358e-08, + "advantage_min": -0.9980174265801907, + "advantage_std": 0.9026450999081135, + "completion_length": 2635.625045776367, + "epoch": 0.15771428571428572, + "grad_norm": 0.17180199921131134, + "kl": 0.014453887939453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0665, + "reward": 0.4164327224716544, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4164327224716544, + "reward_after_std": 0.9026450850069523, + "reward_before_mean": 0.5212587993592024, + "reward_before_std": 0.916192002594471, + "reward_change_max": 0.00020529329776763916, + "reward_change_mean": -0.10482604894787073, + "reward_change_min": -0.20750916376709938, + "reward_change_std": 0.08401510119438171, + "reward_std": 0.9026451222598553, + "rewards/cosine_scaled_reward": 0.00021273503080010414, + "rewards/format_reward": 0.5208333376795053, + "step": 138 + }, + { + "advantage_max": 1.2564072161912918, + "advantage_mean": -1.2417634254191512e-08, + "advantage_min": -0.9992383420467377, + "advantage_std": 0.918564435094595, + "completion_length": 3073.0834045410156, + "epoch": 0.15885714285714286, + "grad_norm": 0.1684163361787796, + "kl": 0.01435089111328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0436, + "reward": 0.3841327941045165, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3841327941045165, + "reward_after_std": 0.9185644574463367, + "reward_before_mean": 0.4876405708491802, + "reward_before_std": 0.9425558559596539, + "reward_change_max": 0.00016095489263534546, + "reward_change_mean": -0.10350779164582491, + "reward_change_min": -0.2199224689975381, + "reward_change_std": 0.09149896074086428, + "reward_std": 0.9185644909739494, + "rewards/cosine_scaled_reward": -0.04784638062119484, + "rewards/format_reward": 0.5833333414047956, + "step": 139 + }, + { + "advantage_max": 1.2031887620687485, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.7054705396294594, + "advantage_std": 0.7336429953575134, + "completion_length": 2998.2084045410156, + "epoch": 0.16, + "grad_norm": 0.6675270795822144, + "kl": 0.020298004150390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.140576474687263e-07, + "loss": 0.12, + "reward": 0.07706247363239527, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07706247363239527, + "reward_after_std": 0.7336429879069328, + "reward_before_mean": 0.15163643937557936, + "reward_before_std": 0.7301466800272465, + "reward_change_max": 0.0005590468645095825, + "reward_change_mean": -0.07457395037636161, + "reward_change_min": -0.1526154913008213, + "reward_change_std": 0.06238272855989635, + "reward_std": 0.7336430214345455, + "rewards/cosine_scaled_reward": -0.07001511752605438, + "rewards/format_reward": 0.29166667349636555, + "step": 140 + }, + { + "advantage_max": 1.0287379138171673, + "advantage_mean": -1.3659398501175701e-08, + "advantage_min": -0.7826720699667931, + "advantage_std": 0.7179284952580929, + "completion_length": 2630.1459045410156, + "epoch": 0.16114285714285714, + "grad_norm": 0.13093984127044678, + "kl": 0.01914215087890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0512, + "reward": 0.1532113216817379, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1532113216817379, + "reward_after_std": 0.717928521335125, + "reward_before_mean": 0.23904765769839287, + "reward_before_std": 0.7325572483241558, + "reward_change_max": 0.00014876574277877808, + "reward_change_mean": -0.085836345795542, + "reward_change_min": -0.18532127793878317, + "reward_change_std": 0.07249903492629528, + "reward_std": 0.7179285399615765, + "rewards/cosine_scaled_reward": -0.15130950883030891, + "rewards/format_reward": 0.5416666753590107, + "step": 141 + }, + { + "advantage_max": 0.9296289533376694, + "advantage_mean": -1.862645149230957e-09, + "advantage_min": -0.812540490180254, + "advantage_std": 0.6487125307321548, + "completion_length": 2815.1875610351562, + "epoch": 0.16228571428571428, + "grad_norm": 0.13400672376155853, + "kl": 0.01612091064453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0648, + "reward": 0.33853449299931526, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.33853449299931526, + "reward_after_std": 0.6487125344574451, + "reward_before_mean": 0.4438725169748068, + "reward_before_std": 0.6510545462369919, + "reward_change_max": 0.00021427124738693237, + "reward_change_mean": -0.10533801536075771, + "reward_change_min": -0.18465242069214582, + "reward_change_std": 0.07665713713504374, + "reward_std": 0.6487125381827354, + "rewards/cosine_scaled_reward": -0.05931374244391918, + "rewards/format_reward": 0.5625000111758709, + "step": 142 + }, + { + "advantage_max": 1.0350090842694044, + "advantage_mean": -2.949188115941581e-09, + "advantage_min": -0.6123997960239649, + "advantage_std": 0.6286649722605944, + "completion_length": 2656.666732788086, + "epoch": 0.16342857142857142, + "grad_norm": 0.1824156790971756, + "kl": 0.0194549560546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0744, + "reward": 0.09283385192975402, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09283385192975402, + "reward_after_std": 0.6286649648100138, + "reward_before_mean": 0.17288653645664454, + "reward_before_std": 0.6227559931576252, + "reward_change_max": 0.0002013370394706726, + "reward_change_mean": -0.08005267137195915, + "reward_change_min": -0.1376419337466359, + "reward_change_std": 0.05598044104408473, + "reward_std": 0.6286649834364653, + "rewards/cosine_scaled_reward": -0.16355674155056477, + "rewards/format_reward": 0.5000000074505806, + "step": 143 + }, + { + "advantage_max": 1.0671398043632507, + "advantage_mean": 2.359350581571107e-08, + "advantage_min": -0.9071869850158691, + "advantage_std": 0.8145530968904495, + "completion_length": 2874.708396911621, + "epoch": 0.16457142857142856, + "grad_norm": 0.14423750340938568, + "kl": 0.016429901123046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0199, + "reward": 0.361312011256814, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.361312011256814, + "reward_after_std": 0.8145530857145786, + "reward_before_mean": 0.4656823016703129, + "reward_before_std": 0.8409868739545345, + "reward_change_max": 0.0002870485186576843, + "reward_change_mean": -0.10437025898136199, + "reward_change_min": -0.2307471977546811, + "reward_change_std": 0.08893731469288468, + "reward_std": 0.8145531080663204, + "rewards/cosine_scaled_reward": 0.02450781175866723, + "rewards/format_reward": 0.4166666679084301, + "step": 144 + }, + { + "advantage_max": 1.0821139886975288, + "advantage_mean": 3.725291630729544e-09, + "advantage_min": -0.6549829691648483, + "advantage_std": 0.6488157249987125, + "completion_length": 2207.770866394043, + "epoch": 0.1657142857142857, + "grad_norm": 0.11618901789188385, + "kl": 0.01737213134765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0195, + "reward": 0.41563169774599373, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41563169774599373, + "reward_after_std": 0.648815743625164, + "reward_before_mean": 0.5244897492229939, + "reward_before_std": 0.6296542976051569, + "reward_change_max": 0.0001682192087173462, + "reward_change_mean": -0.10885802179109305, + "reward_change_min": -0.17047070525586605, + "reward_change_std": 0.0684051540447399, + "reward_std": 0.6488157473504543, + "rewards/cosine_scaled_reward": -0.039838479831814766, + "rewards/format_reward": 0.6041666697710752, + "step": 145 + }, + { + "advantage_max": 1.0146339759230614, + "advantage_mean": -3.414849431004896e-09, + "advantage_min": -0.8650781996548176, + "advantage_std": 0.7028192803263664, + "completion_length": 2288.000045776367, + "epoch": 0.16685714285714287, + "grad_norm": 0.20492036640644073, + "kl": 0.01233673095703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.026620557966279e-07, + "loss": 0.1043, + "reward": 0.12842521956190467, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12842521956190467, + "reward_after_std": 0.7028192915022373, + "reward_before_mean": 0.21201793756335974, + "reward_before_std": 0.7153302617371082, + "reward_change_max": 0.0005949661135673523, + "reward_change_mean": -0.0835927234729752, + "reward_change_min": -0.17898117750883102, + "reward_change_std": 0.06886330037377775, + "reward_std": 0.7028193026781082, + "rewards/cosine_scaled_reward": -0.2169077042490244, + "rewards/format_reward": 0.6458333432674408, + "step": 146 + }, + { + "advantage_max": 1.0399056002497673, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6083156391978264, + "advantage_std": 0.6332942806184292, + "completion_length": 2729.125015258789, + "epoch": 0.168, + "grad_norm": 0.1052529439330101, + "kl": 0.021240234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0038, + "reward": -0.017400827258825302, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.017400827258825302, + "reward_after_std": 0.6332942768931389, + "reward_before_mean": 0.052338654175400734, + "reward_before_std": 0.6333304084837437, + "reward_change_max": 0.00016516447067260742, + "reward_change_mean": -0.06973948935046792, + "reward_change_min": -0.1433351282030344, + "reward_change_std": 0.052607121178880334, + "reward_std": 0.633294302970171, + "rewards/cosine_scaled_reward": -0.18216401617974043, + "rewards/format_reward": 0.41666667349636555, + "step": 147 + }, + { + "advantage_max": 0.816329799592495, + "advantage_mean": -3.787378699549038e-08, + "advantage_min": -0.718300499022007, + "advantage_std": 0.5935985185205936, + "completion_length": 2441.3750534057617, + "epoch": 0.16914285714285715, + "grad_norm": 0.14333200454711914, + "kl": 0.019672393798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0355, + "reward": 0.3582087382674217, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3582087382674217, + "reward_after_std": 0.5935985222458839, + "reward_before_mean": 0.46716008335351944, + "reward_before_std": 0.598968580365181, + "reward_change_max": 0.0, + "reward_change_mean": -0.10895138035994023, + "reward_change_min": -0.1871479470282793, + "reward_change_std": 0.07555282616522163, + "reward_std": 0.5935985259711742, + "rewards/cosine_scaled_reward": -0.05808662064373493, + "rewards/format_reward": 0.5833333432674408, + "step": 148 + }, + { + "advantage_max": 1.1604133136570454, + "advantage_mean": -2.110997909809953e-08, + "advantage_min": -0.7638447806239128, + "advantage_std": 0.7076913714408875, + "completion_length": 2790.479232788086, + "epoch": 0.1702857142857143, + "grad_norm": 0.18803834915161133, + "kl": 0.016819000244140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0947, + "reward": 0.3587049674242735, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3587049674242735, + "reward_after_std": 0.7076913937926292, + "reward_before_mean": 0.46114486269652843, + "reward_before_std": 0.6992642693221569, + "reward_change_max": 0.0007874518632888794, + "reward_change_mean": -0.10243987792637199, + "reward_change_min": -0.16466401610523462, + "reward_change_std": 0.0671718236990273, + "reward_std": 0.7076914049685001, + "rewards/cosine_scaled_reward": -0.02984423842281103, + "rewards/format_reward": 0.5208333395421505, + "step": 149 + }, + { + "advantage_max": 1.2045219093561172, + "advantage_mean": 2.2972623692218974e-08, + "advantage_min": -1.143689103424549, + "advantage_std": 0.9179808422923088, + "completion_length": 2979.7083740234375, + "epoch": 0.17142857142857143, + "grad_norm": 0.3000147342681885, + "kl": 0.02593994140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.9471999940354e-07, + "loss": 0.079, + "reward": 0.2739081159234047, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2739081159234047, + "reward_after_std": 0.9179808460175991, + "reward_before_mean": 0.3676854632794857, + "reward_before_std": 0.9514680132269859, + "reward_change_max": 0.0006517991423606873, + "reward_change_mean": -0.09377730148844421, + "reward_change_min": -0.2144545027986169, + "reward_change_std": 0.08906646957620978, + "reward_std": 0.9179808907210827, + "rewards/cosine_scaled_reward": -0.02449061779771, + "rewards/format_reward": 0.41666668467223644, + "step": 150 + }, + { + "advantage_max": 1.5817717239260674, + "advantage_mean": -2.2506963848201167e-08, + "advantage_min": -0.9173250868916512, + "advantage_std": 0.9457920156419277, + "completion_length": 2821.354232788086, + "epoch": 0.17257142857142857, + "grad_norm": 0.23507796227931976, + "kl": 0.02570343017578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0797, + "reward": 0.46917635947465897, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.46917635947465897, + "reward_after_std": 0.9457920081913471, + "reward_before_mean": 0.5744377570226789, + "reward_before_std": 0.941125662997365, + "reward_change_max": 0.00025819987058639526, + "reward_change_mean": -0.1052614112268202, + "reward_change_min": -0.1981993718072772, + "reward_change_std": 0.07828537304885685, + "reward_std": 0.9457920119166374, + "rewards/cosine_scaled_reward": 0.047635551696657785, + "rewards/format_reward": 0.47916668094694614, + "step": 151 + }, + { + "advantage_max": 0.9127373062074184, + "advantage_mean": -3.1044122827950105e-10, + "advantage_min": -0.7578173317015171, + "advantage_std": 0.6449372805655003, + "completion_length": 2599.1041870117188, + "epoch": 0.1737142857142857, + "grad_norm": 0.1737074851989746, + "kl": 0.02773284912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0573, + "reward": 0.0322701595723629, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0322701595723629, + "reward_after_std": 0.6449372880160809, + "reward_before_mean": 0.10861435905098915, + "reward_before_std": 0.6586843468248844, + "reward_change_max": 0.0002729371190071106, + "reward_change_mean": -0.07634419621899724, + "reward_change_min": -0.1607343116775155, + "reward_change_std": 0.06475975178182125, + "reward_std": 0.6449373215436935, + "rewards/cosine_scaled_reward": -0.12277615629136562, + "rewards/format_reward": 0.3541666679084301, + "step": 152 + }, + { + "advantage_max": 1.0964186489582062, + "advantage_mean": 1.3038516599728212e-08, + "advantage_min": -0.8900428488850594, + "advantage_std": 0.767373725771904, + "completion_length": 2993.458366394043, + "epoch": 0.17485714285714285, + "grad_norm": 0.20991554856300354, + "kl": 0.039215087890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0699, + "reward": -0.014602228999137878, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.014602228999137878, + "reward_after_std": 0.7673737443983555, + "reward_before_mean": 0.05484032817184925, + "reward_before_std": 0.7900453805923462, + "reward_change_max": 0.0001817941665649414, + "reward_change_mean": -0.06944253481924534, + "reward_change_min": -0.16902944073081017, + "reward_change_std": 0.06724012573249638, + "reward_std": 0.7673737592995167, + "rewards/cosine_scaled_reward": -0.118413170799613, + "rewards/format_reward": 0.29166667722165585, + "step": 153 + }, + { + "advantage_max": 1.2021274827420712, + "advantage_mean": 3.476937660007451e-08, + "advantage_min": -1.179232008755207, + "advantage_std": 0.9691634774208069, + "completion_length": 3311.541748046875, + "epoch": 0.176, + "grad_norm": 0.3098473846912384, + "kl": 0.02216339111328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0688, + "reward": 0.3431315952911973, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3431315952911973, + "reward_after_std": 0.9691634848713875, + "reward_before_mean": 0.44397585839033127, + "reward_before_std": 1.010753821581602, + "reward_change_max": 0.00024543702602386475, + "reward_change_mean": -0.10084426286630332, + "reward_change_min": -0.22217622213065624, + "reward_change_std": 0.09855047892779112, + "reward_std": 0.969163540750742, + "rewards/cosine_scaled_reward": 0.04490460641682148, + "rewards/format_reward": 0.35416667349636555, + "step": 154 + }, + { + "advantage_max": 1.155366025865078, + "advantage_mean": -3.601113995888028e-08, + "advantage_min": -0.9142054095864296, + "advantage_std": 0.8050542362034321, + "completion_length": 2577.333366394043, + "epoch": 0.17714285714285713, + "grad_norm": 0.15611477196216583, + "kl": 0.0316009521484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0187, + "reward": 0.43145788833498955, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.43145788833498955, + "reward_after_std": 0.8050542436540127, + "reward_before_mean": 0.5406967643648386, + "reward_before_std": 0.8168010376393795, + "reward_change_max": 0.00125044584274292, + "reward_change_mean": -0.10923890233971179, + "reward_change_min": -0.224298395216465, + "reward_change_std": 0.08746692817658186, + "reward_std": 0.8050542436540127, + "rewards/cosine_scaled_reward": -0.0004849610850214958, + "rewards/format_reward": 0.5416666716337204, + "step": 155 + }, + { + "advantage_max": 1.3044027090072632, + "advantage_mean": 7.45058109652419e-09, + "advantage_min": -0.7066097818315029, + "advantage_std": 0.7796169742941856, + "completion_length": 3190.8333587646484, + "epoch": 0.1782857142857143, + "grad_norm": 0.21351027488708496, + "kl": 0.02843475341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0818, + "reward": -0.1954272324219346, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1954272324219346, + "reward_after_std": 0.7796169780194759, + "reward_before_mean": -0.14665334671735764, + "reward_before_std": 0.7897529415786266, + "reward_change_max": 0.0005297735333442688, + "reward_change_mean": -0.048773885355331004, + "reward_change_min": -0.13390359189361334, + "reward_change_std": 0.05359873874112964, + "reward_std": 0.7796170227229595, + "rewards/cosine_scaled_reward": -0.18791001569479704, + "rewards/format_reward": 0.2291666716337204, + "step": 156 + }, + { + "advantage_max": 0.6623891666531563, + "advantage_mean": -1.4280279514444771e-08, + "advantage_min": -0.7862861528992653, + "advantage_std": 0.5497554168105125, + "completion_length": 3102.7083740234375, + "epoch": 0.17942857142857144, + "grad_norm": 0.10205285996198654, + "kl": 0.03250885009765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0245, + "reward": -0.04365145298652351, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.04365145298652351, + "reward_after_std": 0.5497554168105125, + "reward_before_mean": 0.03132460406050086, + "reward_before_std": 0.5714639872312546, + "reward_change_max": 0.00042669475078582764, + "reward_change_mean": -0.07497606868855655, + "reward_change_min": -0.14072032365947962, + "reward_change_std": 0.061189956264570355, + "reward_std": 0.549755435436964, + "rewards/cosine_scaled_reward": -0.1405876912176609, + "rewards/format_reward": 0.31250001303851604, + "step": 157 + }, + { + "advantage_max": 1.007291927933693, + "advantage_mean": -1.6142925052253787e-08, + "advantage_min": -0.8780169375240803, + "advantage_std": 0.7204512059688568, + "completion_length": 2872.750045776367, + "epoch": 0.18057142857142858, + "grad_norm": 0.1799384355545044, + "kl": 0.0306549072265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0455, + "reward": 0.6302909525111318, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6302909525111318, + "reward_after_std": 0.7204512022435665, + "reward_before_mean": 0.7603835063055158, + "reward_before_std": 0.7301176488399506, + "reward_change_max": 0.0009801387786865234, + "reward_change_mean": -0.13009255612269044, + "reward_change_min": -0.22687509935349226, + "reward_change_std": 0.09314224496483803, + "reward_std": 0.7204512171447277, + "rewards/cosine_scaled_reward": 0.1301917377859354, + "rewards/format_reward": 0.5000000111758709, + "step": 158 + }, + { + "advantage_max": 1.2010982930660248, + "advantage_mean": 1.614292521878724e-08, + "advantage_min": -0.7741870544850826, + "advantage_std": 0.760103264823556, + "completion_length": 3368.625, + "epoch": 0.18171428571428572, + "grad_norm": 0.16853055357933044, + "kl": 0.0409698486328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0166, + "reward": -0.1673340299166739, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1673340299166739, + "reward_after_std": 0.7601032964885235, + "reward_before_mean": -0.11435879208147526, + "reward_before_std": 0.7753438986837864, + "reward_change_max": 0.0008056163787841797, + "reward_change_mean": -0.05297523224726319, + "reward_change_min": -0.14327362179756165, + "reward_change_std": 0.05677987774834037, + "reward_std": 0.760103328153491, + "rewards/cosine_scaled_reward": -0.16134606953710318, + "rewards/format_reward": 0.2083333395421505, + "step": 159 + }, + { + "advantage_max": 1.3550818711519241, + "advantage_mean": -2.173086155465853e-09, + "advantage_min": -0.8745719566941261, + "advantage_std": 0.8815088830888271, + "completion_length": 2607.3333740234375, + "epoch": 0.18285714285714286, + "grad_norm": 0.17933402955532074, + "kl": 0.04034423828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0235, + "reward": 0.2858357895165682, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2858357895165682, + "reward_after_std": 0.8815088346600533, + "reward_before_mean": 0.37776060961186886, + "reward_before_std": 0.8969038352370262, + "reward_change_max": 0.00037732720375061035, + "reward_change_mean": -0.09192477163742296, + "reward_change_min": -0.19541947543621063, + "reward_change_std": 0.07548659306485206, + "reward_std": 0.8815088532865047, + "rewards/cosine_scaled_reward": 0.053463623858988285, + "rewards/format_reward": 0.27083334140479565, + "step": 160 + }, + { + "advantage_max": 0.843145627528429, + "advantage_mean": -4.0357312713901194e-08, + "advantage_min": -0.7640031352639198, + "advantage_std": 0.6416959650814533, + "completion_length": 3008.541732788086, + "epoch": 0.184, + "grad_norm": 0.17466787993907928, + "kl": 0.0492401123046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0098, + "reward": 0.4757814444601536, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4757814444601536, + "reward_after_std": 0.6416959837079048, + "reward_before_mean": 0.5949108861386776, + "reward_before_std": 0.6474121138453484, + "reward_change_max": 7.144361734390259e-05, + "reward_change_mean": -0.11912947986274958, + "reward_change_min": -0.21004556119441986, + "reward_change_std": 0.0840856155846268, + "reward_std": 0.6416960023343563, + "rewards/cosine_scaled_reward": 0.07870544213801622, + "rewards/format_reward": 0.4375000074505806, + "step": 161 + }, + { + "advantage_max": 1.1495609432458878, + "advantage_mean": -1.862645426786713e-09, + "advantage_min": -0.6219765916466713, + "advantage_std": 0.679344154894352, + "completion_length": 3318.291702270508, + "epoch": 0.18514285714285714, + "grad_norm": 0.19393867254257202, + "kl": 0.05615234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0367, + "reward": -0.14834206318482757, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14834206318482757, + "reward_after_std": 0.679344154894352, + "reward_before_mean": -0.09237463027238846, + "reward_before_std": 0.6833413615822792, + "reward_change_max": 0.0, + "reward_change_mean": -0.055967450607568026, + "reward_change_min": -0.10517737921327353, + "reward_change_std": 0.042807565070688725, + "reward_std": 0.6793441660702229, + "rewards/cosine_scaled_reward": -0.1607706407085061, + "rewards/format_reward": 0.22916666977107525, + "step": 162 + }, + { + "advantage_max": 0.818055797368288, + "advantage_mean": -1.6653345369377348e-16, + "advantage_min": -0.8302821703255177, + "advantage_std": 0.6327940300107002, + "completion_length": 2528.3541717529297, + "epoch": 0.18628571428571428, + "grad_norm": 0.20737354457378387, + "kl": 0.0474853515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.670853944836176e-07, + "loss": -0.0322, + "reward": 0.3084873203188181, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3084873203188181, + "reward_after_std": 0.6327940560877323, + "reward_before_mean": 0.4131042119115591, + "reward_before_std": 0.6455821953713894, + "reward_change_max": 0.0003267824649810791, + "reward_change_mean": -0.1046168792527169, + "reward_change_min": -0.18456690851598978, + "reward_change_std": 0.07689245650544763, + "reward_std": 0.6327940858900547, + "rewards/cosine_scaled_reward": 0.01905210316181183, + "rewards/format_reward": 0.37500000931322575, + "step": 163 + }, + { + "advantage_max": 0.904144998639822, + "advantage_mean": -3.476937709967487e-08, + "advantage_min": -0.9024236872792244, + "advantage_std": 0.693291537463665, + "completion_length": 2415.916717529297, + "epoch": 0.18742857142857142, + "grad_norm": 0.1441878080368042, + "kl": 0.0462646484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.648485032310144e-07, + "loss": -0.0149, + "reward": 0.39764015562832355, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.39764015562832355, + "reward_after_std": 0.693291537463665, + "reward_before_mean": 0.508028662065044, + "reward_before_std": 0.7092392519116402, + "reward_change_max": 0.0002870708703994751, + "reward_change_mean": -0.11038849456235766, + "reward_change_min": -0.19456447195261717, + "reward_change_std": 0.08194146119058132, + "reward_std": 0.6932915560901165, + "rewards/cosine_scaled_reward": 0.07693097367882729, + "rewards/format_reward": 0.35416666977107525, + "step": 164 + }, + { + "advantage_max": 1.1269632391631603, + "advantage_mean": 8.692344732885715e-09, + "advantage_min": -0.7582625076174736, + "advantage_std": 0.7576115392148495, + "completion_length": 2974.5000610351562, + "epoch": 0.18857142857142858, + "grad_norm": 0.2055661678314209, + "kl": 0.053497314453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0408, + "reward": -0.031486984342336655, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.031486984342336655, + "reward_after_std": 0.7576115392148495, + "reward_before_mean": 0.0358324833214283, + "reward_before_std": 0.776401586830616, + "reward_change_max": 0.00020839273929595947, + "reward_change_mean": -0.06731945066712797, + "reward_change_min": -0.1540833543986082, + "reward_change_std": 0.0621999790892005, + "reward_std": 0.7576115429401398, + "rewards/cosine_scaled_reward": -0.07583376299589872, + "rewards/format_reward": 0.1875000037252903, + "step": 165 + }, + { + "advantage_max": 1.1014218032360077, + "advantage_mean": 1.2417634920325327e-08, + "advantage_min": -0.7755156680941582, + "advantage_std": 0.7053066603839397, + "completion_length": 3034.6041717529297, + "epoch": 0.18971428571428572, + "grad_norm": 0.18134304881095886, + "kl": 0.047393798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0598, + "reward": 0.07770548108965158, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07770548108965158, + "reward_after_std": 0.70530666410923, + "reward_before_mean": 0.15491134487092495, + "reward_before_std": 0.7133532389998436, + "reward_change_max": 0.0005127936601638794, + "reward_change_mean": -0.0772058351431042, + "reward_change_min": -0.15264319721609354, + "reward_change_std": 0.05929103307425976, + "reward_std": 0.7053066939115524, + "rewards/cosine_scaled_reward": -0.037127673625946045, + "rewards/format_reward": 0.2291666753590107, + "step": 166 + }, + { + "advantage_max": 1.3006695583462715, + "advantage_mean": -6.829699028543246e-09, + "advantage_min": -0.8689005374908447, + "advantage_std": 0.8276662826538086, + "completion_length": 2107.291748046875, + "epoch": 0.19085714285714286, + "grad_norm": 0.17068101465702057, + "kl": 0.047607421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0175, + "reward": 0.23405415192246437, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23405415192246437, + "reward_after_std": 0.827666288241744, + "reward_before_mean": 0.32163101993501186, + "reward_before_std": 0.8379855081439018, + "reward_change_max": 0.0003604292869567871, + "reward_change_mean": -0.08757684961892664, + "reward_change_min": -0.18768702819943428, + "reward_change_std": 0.07284934795461595, + "reward_std": 0.8276663199067116, + "rewards/cosine_scaled_reward": -0.057934501208364964, + "rewards/format_reward": 0.4375000111758709, + "step": 167 + }, + { + "advantage_max": 0.8111299090087414, + "advantage_mean": 1.3969838424943148e-08, + "advantage_min": -1.0020024217665195, + "advantage_std": 0.6965993195772171, + "completion_length": 2974.041717529297, + "epoch": 0.192, + "grad_norm": 0.20416678488254547, + "kl": 0.050445556640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0191, + "reward": 0.3391971345990896, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3391971345990896, + "reward_after_std": 0.6965993344783783, + "reward_before_mean": 0.4466320239007473, + "reward_before_std": 0.722343236207962, + "reward_change_max": 0.0, + "reward_change_mean": -0.10743483621627092, + "reward_change_min": -0.19180598575621843, + "reward_change_std": 0.0832341960631311, + "reward_std": 0.6965993642807007, + "rewards/cosine_scaled_reward": 0.014982646331191063, + "rewards/format_reward": 0.41666668094694614, + "step": 168 + }, + { + "advantage_max": 1.1438056454062462, + "advantage_mean": -3.04232063430554e-08, + "advantage_min": -1.1217550933361053, + "advantage_std": 0.8912684917449951, + "completion_length": 2452.6250762939453, + "epoch": 0.19314285714285714, + "grad_norm": 0.25557219982147217, + "kl": 0.0565338134765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0333, + "reward": 1.0176847303519025, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.0176847303519025, + "reward_after_std": 0.8912684805691242, + "reward_before_mean": 1.1816631648689508, + "reward_before_std": 0.9058678522706032, + "reward_change_max": 0.0, + "reward_change_mean": -0.16397839970886707, + "reward_change_min": -0.2810376714915037, + "reward_change_std": 0.11516173789277673, + "reward_std": 0.8912685364484787, + "rewards/cosine_scaled_reward": 0.28874821588397026, + "rewards/format_reward": 0.6041666697710752, + "step": 169 + }, + { + "advantage_max": 0.8939481191337109, + "advantage_mean": -2.545615107596433e-08, + "advantage_min": -0.827484168112278, + "advantage_std": 0.6734241135418415, + "completion_length": 2655.2500762939453, + "epoch": 0.19428571428571428, + "grad_norm": 0.1779620349407196, + "kl": 0.061553955078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0345, + "reward": 0.4281429387629032, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4281429387629032, + "reward_after_std": 0.6734241284430027, + "reward_before_mean": 0.5414133286103606, + "reward_before_std": 0.6823275052011013, + "reward_change_max": 0.00027061253786087036, + "reward_change_mean": -0.11327041126787663, + "reward_change_min": -0.20588709693402052, + "reward_change_std": 0.0820755553431809, + "reward_std": 0.6734241358935833, + "rewards/cosine_scaled_reward": 0.0936233289539814, + "rewards/format_reward": 0.3541666753590107, + "step": 170 + }, + { + "advantage_max": 0.9380842223763466, + "advantage_mean": 9.313225579621331e-09, + "advantage_min": -0.9550211504101753, + "advantage_std": 0.7192481569945812, + "completion_length": 2867.312530517578, + "epoch": 0.19542857142857142, + "grad_norm": 0.2206490933895111, + "kl": 0.05462646484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.487667956935087e-07, + "loss": 0.051, + "reward": 0.11909876018762589, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11909876018762589, + "reward_after_std": 0.7192481234669685, + "reward_before_mean": 0.20382621884346008, + "reward_before_std": 0.7419612966477871, + "reward_change_max": 0.0008624270558357239, + "reward_change_mean": -0.0847274367697537, + "reward_change_min": -0.17237501591444016, + "reward_change_std": 0.07184617791790515, + "reward_std": 0.7192481495440006, + "rewards/cosine_scaled_reward": -0.05433690547943115, + "rewards/format_reward": 0.31250001303851604, + "step": 171 + }, + { + "advantage_max": 1.2377153486013412, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -1.0486519858241081, + "advantage_std": 0.9159692004323006, + "completion_length": 2520.37508392334, + "epoch": 0.19657142857142856, + "grad_norm": 0.36089661717414856, + "kl": 0.061431884765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0775, + "reward": 0.4834242947399616, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4834242947399616, + "reward_after_std": 0.9159692041575909, + "reward_before_mean": 0.5955887548625469, + "reward_before_std": 0.9399407245218754, + "reward_change_max": 6.854534149169922e-05, + "reward_change_mean": -0.11216441867873073, + "reward_change_min": -0.23346925619989634, + "reward_change_std": 0.09769316925667226, + "reward_std": 0.9159692190587521, + "rewards/cosine_scaled_reward": 0.16237769648432732, + "rewards/format_reward": 0.27083333767950535, + "step": 172 + }, + { + "advantage_max": 1.0018565282225609, + "advantage_mean": 1.0554989493538613e-08, + "advantage_min": -0.613800659775734, + "advantage_std": 0.6544171050190926, + "completion_length": 1890.8750457763672, + "epoch": 0.1977142857142857, + "grad_norm": 0.2708069980144501, + "kl": 0.058868408203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0501, + "reward": -0.31603203853592277, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.31603203853592277, + "reward_after_std": 0.6544171161949635, + "reward_before_mean": -0.27385718561708927, + "reward_before_std": 0.6710513979196548, + "reward_change_max": 0.0004888400435447693, + "reward_change_mean": -0.042174856178462505, + "reward_change_min": -0.1296614371240139, + "reward_change_std": 0.051378472126089036, + "reward_std": 0.6544171385467052, + "rewards/cosine_scaled_reward": -0.2306785937398672, + "rewards/format_reward": 0.18750000558793545, + "step": 173 + }, + { + "advantage_max": 0.8003799468278885, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.7577654309570789, + "advantage_std": 0.5632036030292511, + "completion_length": 2550.3334350585938, + "epoch": 0.19885714285714284, + "grad_norm": 0.18074588477611542, + "kl": 0.07696533203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.416539554784089e-07, + "loss": -0.0175, + "reward": 0.2051242869347334, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2051242869347334, + "reward_after_std": 0.5632036104798317, + "reward_before_mean": 0.3000453729182482, + "reward_before_std": 0.5662722326815128, + "reward_change_max": 0.0, + "reward_change_mean": -0.09492109343409538, + "reward_change_min": -0.16723101679235697, + "reward_change_std": 0.06537155085243285, + "reward_std": 0.5632036216557026, + "rewards/cosine_scaled_reward": -0.07914398796856403, + "rewards/format_reward": 0.45833333767950535, + "step": 174 + }, + { + "advantage_max": 0.8833422213792801, + "advantage_mean": 1.0554988993938252e-08, + "advantage_min": -0.8566731512546539, + "advantage_std": 0.6538542471826077, + "completion_length": 2570.6666870117188, + "epoch": 0.2, + "grad_norm": 0.23281656205654144, + "kl": 0.0728302001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0225, + "reward": 0.17386609059758484, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17386609059758484, + "reward_after_std": 0.6538542248308659, + "reward_before_mean": 0.2645402289927006, + "reward_before_std": 0.6693241335451603, + "reward_change_max": 4.032999277114868e-05, + "reward_change_mean": -0.09067412395961583, + "reward_change_min": -0.1672326810657978, + "reward_change_std": 0.07087655737996101, + "reward_std": 0.6538542471826077, + "rewards/cosine_scaled_reward": -0.034396563190966845, + "rewards/format_reward": 0.33333333767950535, + "step": 175 + }, + { + "advantage_max": 1.658276230096817, + "advantage_mean": -1.1796752852344383e-08, + "advantage_min": -1.1352946013212204, + "advantage_std": 1.1043973043560982, + "completion_length": 2494.0208892822266, + "epoch": 0.20114285714285715, + "grad_norm": 0.35480260848999023, + "kl": 0.069122314453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.368407953869103e-07, + "loss": -0.0014, + "reward": 0.17692266777157784, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17692266777157784, + "reward_after_std": 1.1043973118066788, + "reward_before_mean": 0.2529789046384394, + "reward_before_std": 1.135535355657339, + "reward_change_max": 0.00035287439823150635, + "reward_change_mean": -0.07605624804273248, + "reward_change_min": -0.19880263041704893, + "reward_change_std": 0.08487494708970189, + "reward_std": 1.1043973341584206, + "rewards/cosine_scaled_reward": -0.008927222341299057, + "rewards/format_reward": 0.2708333358168602, + "step": 176 + }, + { + "advantage_max": 1.333267793059349, + "advantage_mean": -5.898376537194494e-09, + "advantage_min": -1.057743813842535, + "advantage_std": 0.9380174241960049, + "completion_length": 2683.3126068115234, + "epoch": 0.2022857142857143, + "grad_norm": 0.3615868389606476, + "kl": 0.081451416015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.344131861991828e-07, + "loss": -0.0352, + "reward": 0.2688114494085312, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2688114494085312, + "reward_after_std": 0.9380174838006496, + "reward_before_mean": 0.35989896580576897, + "reward_before_std": 0.9653622359037399, + "reward_change_max": 0.0009453520178794861, + "reward_change_mean": -0.0910875026602298, + "reward_change_min": -0.19741658121347427, + "reward_change_std": 0.080465252045542, + "reward_std": 0.9380174949765205, + "rewards/cosine_scaled_reward": -0.028383860364556313, + "rewards/format_reward": 0.4166666828095913, + "step": 177 + }, + { + "advantage_max": 0.7999058216810226, + "advantage_mean": 1.8471231655325937e-08, + "advantage_min": -0.5875363126397133, + "advantage_std": 0.525285542011261, + "completion_length": 2208.791721343994, + "epoch": 0.20342857142857143, + "grad_norm": 0.24783383309841156, + "kl": 0.094207763671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.319717151140072e-07, + "loss": -0.0245, + "reward": -0.32124644331634045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32124644331634045, + "reward_after_std": 0.5252855531871319, + "reward_before_mean": -0.2753131858771667, + "reward_before_std": 0.5358833186328411, + "reward_change_max": 0.0009209141135215759, + "reward_change_mean": -0.045933238812722266, + "reward_change_min": -0.1145911905914545, + "reward_change_std": 0.04406271001789719, + "reward_std": 0.5252855755388737, + "rewards/cosine_scaled_reward": -0.18973992549581453, + "rewards/format_reward": 0.10416666977107525, + "step": 178 + }, + { + "advantage_max": 0.9616053961217403, + "advantage_mean": -6.208815128694312e-10, + "advantage_min": -0.5495030656456947, + "advantage_std": 0.5631580613553524, + "completion_length": 2417.0833740234375, + "epoch": 0.20457142857142857, + "grad_norm": 0.2671527862548828, + "kl": 0.072052001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0493, + "reward": -0.20262894342886284, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.20262894342886284, + "reward_after_std": 0.563158068805933, + "reward_before_mean": -0.14828139916062355, + "reward_before_std": 0.5615837536752224, + "reward_change_max": 0.0001543089747428894, + "reward_change_mean": -0.054347540717571974, + "reward_change_min": -0.10567384958267212, + "reward_change_std": 0.041040402837097645, + "reward_std": 0.5631580837070942, + "rewards/cosine_scaled_reward": -0.18872403725981712, + "rewards/format_reward": 0.22916666977107525, + "step": 179 + }, + { + "advantage_max": 1.3213228471577168, + "advantage_mean": 3.1044087300813317e-09, + "advantage_min": -0.9479734636843204, + "advantage_std": 0.8566804938018322, + "completion_length": 2123.750011444092, + "epoch": 0.2057142857142857, + "grad_norm": 0.42133620381355286, + "kl": 0.0872802734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0471, + "reward": 0.38085563108325005, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.38085563108325005, + "reward_after_std": 0.8566805012524128, + "reward_before_mean": 0.48239467665553093, + "reward_before_std": 0.8608980290591717, + "reward_change_max": 0.00010352581739425659, + "reward_change_mean": -0.10153904324397445, + "reward_change_min": -0.20027936436235905, + "reward_change_std": 0.08030586317181587, + "reward_std": 0.8566805496811867, + "rewards/cosine_scaled_reward": 0.13703067554160953, + "rewards/format_reward": 0.2083333358168602, + "step": 180 + }, + { + "advantage_max": 0.93392089381814, + "advantage_mean": 9.313225857177088e-09, + "advantage_min": -0.6315050572156906, + "advantage_std": 0.6295647770166397, + "completion_length": 2717.8958892822266, + "epoch": 0.20685714285714285, + "grad_norm": 0.26544320583343506, + "kl": 0.094696044921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0447, + "reward": -0.0804877057671547, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0804877057671547, + "reward_after_std": 0.6295647732913494, + "reward_before_mean": -0.014551796950399876, + "reward_before_std": 0.6384981628507376, + "reward_change_max": 0.00037204474210739136, + "reward_change_mean": -0.06593590078409761, + "reward_change_min": -0.15629394073039293, + "reward_change_std": 0.06047847680747509, + "reward_std": 0.6295647993683815, + "rewards/cosine_scaled_reward": -0.08019256498664618, + "rewards/format_reward": 0.1458333358168602, + "step": 181 + }, + { + "advantage_max": 1.3371423408389091, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.9195161461830139, + "advantage_std": 0.9004073217511177, + "completion_length": 2558.1667098999023, + "epoch": 0.208, + "grad_norm": 0.335359662771225, + "kl": 0.093170166015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0294, + "reward": 0.4829629212617874, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4829629212617874, + "reward_after_std": 0.9004073329269886, + "reward_before_mean": 0.5930858813226223, + "reward_before_std": 0.9114826694130898, + "reward_change_max": 0.0, + "reward_change_mean": -0.1101229446940124, + "reward_change_min": -0.2271582130342722, + "reward_change_std": 0.08647168381139636, + "reward_std": 0.9004073478281498, + "rewards/cosine_scaled_reward": 0.10904293693602085, + "rewards/format_reward": 0.3750000111758709, + "step": 182 + }, + { + "advantage_max": 1.254299134016037, + "advantage_mean": -1.490116174895917e-08, + "advantage_min": -0.9943030625581741, + "advantage_std": 0.8474318087100983, + "completion_length": 2121.4375915527344, + "epoch": 0.20914285714285713, + "grad_norm": 0.3153325021266937, + "kl": 0.08837890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0504, + "reward": 0.1891594659537077, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1891594659537077, + "reward_after_std": 0.8474318385124207, + "reward_before_mean": 0.2741537671536207, + "reward_before_std": 0.8667086064815521, + "reward_change_max": 0.000354960560798645, + "reward_change_mean": -0.08499433984979987, + "reward_change_min": -0.18232434801757336, + "reward_change_std": 0.07570598181337118, + "reward_std": 0.8474318720400333, + "rewards/cosine_scaled_reward": 0.0016602184623479843, + "rewards/format_reward": 0.27083333767950535, + "step": 183 + }, + { + "advantage_max": 1.1184355579316616, + "advantage_mean": 4.346172532976311e-09, + "advantage_min": -0.5413721092045307, + "advantage_std": 0.6433587558567524, + "completion_length": 2671.7291984558105, + "epoch": 0.2102857142857143, + "grad_norm": 0.22072502970695496, + "kl": 0.11517333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0022, + "reward": -0.31343852914869785, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.31343852914869785, + "reward_after_std": 0.6433587558567524, + "reward_before_mean": -0.27286421274766326, + "reward_before_std": 0.6472878158092499, + "reward_change_max": 0.0005966871976852417, + "reward_change_mean": -0.04057432198897004, + "reward_change_min": -0.10668485425412655, + "reward_change_std": 0.04195940599311143, + "reward_std": 0.6433587614446878, + "rewards/cosine_scaled_reward": -0.19893210940063, + "rewards/format_reward": 0.1250000037252903, + "step": 184 + }, + { + "advantage_max": 1.1658845506608486, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -0.6802770979702473, + "advantage_std": 0.744084857404232, + "completion_length": 2459.6875534057617, + "epoch": 0.21142857142857144, + "grad_norm": 0.35991156101226807, + "kl": 0.111419677734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0663, + "reward": -0.05943065322935581, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05943065322935581, + "reward_after_std": 0.7440848480910063, + "reward_before_mean": 0.004029544070363045, + "reward_before_std": 0.7564304284751415, + "reward_change_max": 0.0, + "reward_change_mean": -0.06346021476201713, + "reward_change_min": -0.1444684062153101, + "reward_change_std": 0.05615873821079731, + "reward_std": 0.7440848704427481, + "rewards/cosine_scaled_reward": -0.11256856098771095, + "rewards/format_reward": 0.22916666977107525, + "step": 185 + }, + { + "advantage_max": 0.811072587966919, + "advantage_mean": 2.17308601113686e-08, + "advantage_min": -0.6112252026796341, + "advantage_std": 0.6088230907917023, + "completion_length": 2535.125045776367, + "epoch": 0.21257142857142858, + "grad_norm": 0.3562098443508148, + "kl": 0.129058837890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.119553365707802e-07, + "loss": -0.011, + "reward": 0.07544249296188354, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07544249296188354, + "reward_after_std": 0.608823087066412, + "reward_before_mean": 0.15811622887849808, + "reward_before_std": 0.6218390204012394, + "reward_change_max": 0.00039686262607574463, + "reward_change_mean": -0.08267371519468725, + "reward_change_min": -0.16685685515403748, + "reward_change_std": 0.06896232924191281, + "reward_std": 0.6088231094181538, + "rewards/cosine_scaled_reward": -0.01469188928604126, + "rewards/format_reward": 0.1875000037252903, + "step": 186 + }, + { + "advantage_max": 1.175539281219244, + "advantage_mean": 1.5522043539384356e-08, + "advantage_min": -0.8305178135633469, + "advantage_std": 0.783175390213728, + "completion_length": 2573.791732788086, + "epoch": 0.21371428571428572, + "grad_norm": 0.33450013399124146, + "kl": 0.130859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.093945422764069e-07, + "loss": -0.005, + "reward": -0.05765972752124071, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05765972752124071, + "reward_after_std": 0.783175390213728, + "reward_before_mean": 0.0057212114334106445, + "reward_before_std": 0.8013257309794426, + "reward_change_max": 0.00036757439374923706, + "reward_change_mean": -0.06338093103840947, + "reward_change_min": -0.15380204655230045, + "reward_change_std": 0.061657859245315194, + "reward_std": 0.7831753939390182, + "rewards/cosine_scaled_reward": -0.10130605194717646, + "rewards/format_reward": 0.2083333395421505, + "step": 187 + }, + { + "advantage_max": 0.6556166559457779, + "advantage_mean": 5.587935225648266e-09, + "advantage_min": -0.505927637219429, + "advantage_std": 0.4643473494797945, + "completion_length": 3036.4583435058594, + "epoch": 0.21485714285714286, + "grad_norm": 0.2525724470615387, + "kl": 0.1484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0062, + "reward": -0.31691102124750614, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.31691102124750614, + "reward_after_std": 0.46434734389185905, + "reward_before_mean": -0.26725206710398197, + "reward_before_std": 0.47667811438441277, + "reward_change_max": 0.0005693808197975159, + "reward_change_mean": -0.04965895973145962, + "reward_change_min": -0.11332780588418245, + "reward_change_std": 0.04475319117773324, + "reward_std": 0.4643473718315363, + "rewards/cosine_scaled_reward": -0.15445936284959316, + "rewards/format_reward": 0.0416666679084301, + "step": 188 + }, + { + "advantage_max": 0.8341766893863678, + "advantage_mean": -9.313224635931761e-10, + "advantage_min": -0.7918973080813885, + "advantage_std": 0.60749601572752, + "completion_length": 2524.3958587646484, + "epoch": 0.216, + "grad_norm": 0.2288750410079956, + "kl": 0.1602783203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0357, + "reward": 0.26920123770833015, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.26920123770833015, + "reward_after_std": 0.6074960138648748, + "reward_before_mean": 0.36930515244603157, + "reward_before_std": 0.6160230301320553, + "reward_change_max": 0.00011741369962692261, + "reward_change_mean": -0.10010391753166914, + "reward_change_min": -0.17794792912900448, + "reward_change_std": 0.06970303924754262, + "reward_std": 0.6074960231781006, + "rewards/cosine_scaled_reward": -0.013264082372188568, + "rewards/format_reward": 0.3958333507180214, + "step": 189 + }, + { + "advantage_max": 0.8611217215657234, + "advantage_mean": 4.967053768289986e-09, + "advantage_min": -0.8677000142633915, + "advantage_std": 0.6190670412033796, + "completion_length": 2455.5833587646484, + "epoch": 0.21714285714285714, + "grad_norm": 0.1976422667503357, + "kl": 0.13385009765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0189, + "reward": 0.12271896377205849, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12271896377205849, + "reward_after_std": 0.6190670412033796, + "reward_before_mean": 0.2086525820195675, + "reward_before_std": 0.6303278524428606, + "reward_change_max": 0.0020866915583610535, + "reward_change_mean": -0.08593360241502523, + "reward_change_min": -0.15425695106387138, + "reward_change_std": 0.0642830905271694, + "reward_std": 0.6190670430660248, + "rewards/cosine_scaled_reward": -0.04150705365464091, + "rewards/format_reward": 0.29166667722165585, + "step": 190 + }, + { + "advantage_max": 0.8121273927390575, + "advantage_mean": -1.3659397779530735e-08, + "advantage_min": -0.6374476179480553, + "advantage_std": 0.5626640729606152, + "completion_length": 2784.687545776367, + "epoch": 0.21828571428571428, + "grad_norm": 0.2620941698551178, + "kl": 0.16229248046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0151, + "reward": 0.05612104572355747, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05612104572355747, + "reward_after_std": 0.5626640692353249, + "reward_before_mean": 0.13682471262291074, + "reward_before_std": 0.56803297996521, + "reward_change_max": 0.00027626752853393555, + "reward_change_mean": -0.08070366689935327, + "reward_change_min": -0.15503592789173126, + "reward_change_std": 0.0627133441157639, + "reward_std": 0.5626640915870667, + "rewards/cosine_scaled_reward": -0.05658765323460102, + "rewards/format_reward": 0.25000000186264515, + "step": 191 + }, + { + "advantage_max": 0.9729629419744015, + "advantage_mean": 1.1796752907855534e-08, + "advantage_min": -0.7198412269353867, + "advantage_std": 0.6481819711625576, + "completion_length": 3039.5208740234375, + "epoch": 0.21942857142857142, + "grad_norm": 0.27418825030326843, + "kl": 0.13934326171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0349, + "reward": -0.16575850173830986, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.16575850173830986, + "reward_after_std": 0.648181963711977, + "reward_before_mean": -0.10841276589781046, + "reward_before_std": 0.6632645688951015, + "reward_change_max": 0.00047516077756881714, + "reward_change_mean": -0.057345751440152526, + "reward_change_min": -0.13206746894866228, + "reward_change_std": 0.05364243150688708, + "reward_std": 0.6481819748878479, + "rewards/cosine_scaled_reward": -0.1479563768953085, + "rewards/format_reward": 0.18750000558793545, + "step": 192 + }, + { + "advantage_max": 0.9263895452022552, + "advantage_mean": 2.9181441985048906e-08, + "advantage_min": -0.5460419282317162, + "advantage_std": 0.5525044910609722, + "completion_length": 3144.125045776367, + "epoch": 0.22057142857142858, + "grad_norm": 0.27585750818252563, + "kl": 0.16455078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0501, + "reward": -0.3938119038939476, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3938119038939476, + "reward_after_std": 0.5525044947862625, + "reward_before_mean": -0.3569503426551819, + "reward_before_std": 0.5581550113856792, + "reward_change_max": 0.0011642500758171082, + "reward_change_mean": -0.0368615499464795, + "reward_change_min": -0.07977760676294565, + "reward_change_std": 0.03481626545544714, + "reward_std": 0.5525045059621334, + "rewards/cosine_scaled_reward": -0.2618085015565157, + "rewards/format_reward": 0.1666666716337204, + "step": 193 + }, + { + "advantage_max": 1.1692192666232586, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -0.9652174562215805, + "advantage_std": 0.8269610479474068, + "completion_length": 2549.2708740234375, + "epoch": 0.22171428571428572, + "grad_norm": 0.5415308475494385, + "kl": 0.110260009765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0546, + "reward": 0.4768530046567321, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4768530046567321, + "reward_after_std": 0.8269610404968262, + "reward_before_mean": 0.5903498325496912, + "reward_before_std": 0.8395131379365921, + "reward_change_max": 0.0006142705678939819, + "reward_change_mean": -0.11349680949933827, + "reward_change_min": -0.21761109866201878, + "reward_change_std": 0.09163685445673764, + "reward_std": 0.826961062848568, + "rewards/cosine_scaled_reward": 0.07642490416765213, + "rewards/format_reward": 0.4375, + "step": 194 + }, + { + "advantage_max": 1.4493412896990776, + "advantage_mean": -2.1730860888524717e-08, + "advantage_min": -1.0076495185494423, + "advantage_std": 0.9671072959899902, + "completion_length": 2750.6250610351562, + "epoch": 0.22285714285714286, + "grad_norm": 1.0602513551712036, + "kl": 0.14349365234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.884636689049422e-07, + "loss": 0.1005, + "reward": 0.08260958641767502, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08260958641767502, + "reward_after_std": 0.9671072959899902, + "reward_before_mean": 0.15350980064249597, + "reward_before_std": 0.9919899739325047, + "reward_change_max": 0.00022814422845840454, + "reward_change_mean": -0.07090024533681571, + "reward_change_min": -0.17355335224419832, + "reward_change_std": 0.0735201274510473, + "reward_std": 0.9671073220670223, + "rewards/cosine_scaled_reward": -0.05866176821291447, + "rewards/format_reward": 0.2708333395421505, + "step": 195 + }, + { + "advantage_max": 1.2421837113797665, + "advantage_mean": -3.1664969646350016e-08, + "advantage_min": -0.8666118606925011, + "advantage_std": 0.7809932995587587, + "completion_length": 2964.3333740234375, + "epoch": 0.224, + "grad_norm": 0.564656674861908, + "kl": 0.175201416015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0615, + "reward": 0.04462842829525471, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04462842829525471, + "reward_after_std": 0.7809933163225651, + "reward_before_mean": 0.11695368587970734, + "reward_before_std": 0.7898044027388096, + "reward_change_max": 0.0002730339765548706, + "reward_change_mean": -0.07232528855092824, + "reward_change_min": -0.14141991455107927, + "reward_change_std": 0.05830873898230493, + "reward_std": 0.780993327498436, + "rewards/cosine_scaled_reward": -0.0873564900830388, + "rewards/format_reward": 0.29166667722165585, + "step": 196 + }, + { + "advantage_max": 1.8417157232761383, + "advantage_mean": -2.359350548264416e-08, + "advantage_min": -1.2466395795345306, + "advantage_std": 1.2031425386667252, + "completion_length": 2360.041702270508, + "epoch": 0.22514285714285714, + "grad_norm": 0.5715523958206177, + "kl": 0.1849365234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0325, + "reward": 0.5395329678431153, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5395329678431153, + "reward_after_std": 1.2031425312161446, + "reward_before_mean": 0.6457790993154049, + "reward_before_std": 1.2249048128724098, + "reward_change_max": 0.00048523396253585815, + "reward_change_mean": -0.10624613519757986, + "reward_change_min": -0.24607283994555473, + "reward_change_std": 0.0999097554013133, + "reward_std": 1.2031425833702087, + "rewards/cosine_scaled_reward": 0.1249728761613369, + "rewards/format_reward": 0.3958333395421505, + "step": 197 + }, + { + "advantage_max": 0.6936135701835155, + "advantage_mean": 6.208814573582799e-10, + "advantage_min": -0.8528610952198505, + "advantage_std": 0.556768286973238, + "completion_length": 2614.0625762939453, + "epoch": 0.22628571428571428, + "grad_norm": 0.4623723328113556, + "kl": 0.2156982421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.804192891917571e-07, + "loss": -0.0034, + "reward": 0.06352788954973221, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06352788954973221, + "reward_after_std": 0.5567682851105928, + "reward_before_mean": 0.1473788940347731, + "reward_before_std": 0.5750712193548679, + "reward_change_max": 0.0010342374444007874, + "reward_change_mean": -0.0838509879540652, + "reward_change_min": -0.14691872242838144, + "reward_change_std": 0.06345773441717029, + "reward_std": 0.556768300011754, + "rewards/cosine_scaled_reward": -0.08256056532263756, + "rewards/format_reward": 0.31250000558793545, + "step": 198 + }, + { + "advantage_max": 1.3256859108805656, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.6804698929190636, + "advantage_std": 0.7932882234454155, + "completion_length": 2771.416717529297, + "epoch": 0.22742857142857142, + "grad_norm": 0.39074909687042236, + "kl": 0.2138671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0087, + "reward": -0.22427453845739365, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.22427453845739365, + "reward_after_std": 0.7932882159948349, + "reward_before_mean": -0.1793069913983345, + "reward_before_std": 0.8059480749070644, + "reward_change_max": 0.002735838294029236, + "reward_change_mean": -0.044967556837946177, + "reward_change_min": -0.13585915230214596, + "reward_change_std": 0.053579527186229825, + "reward_std": 0.7932882308959961, + "rewards/cosine_scaled_reward": -0.1625701580196619, + "rewards/format_reward": 0.14583333767950535, + "step": 199 + }, + { + "advantage_max": 1.051039144396782, + "advantage_mean": -1.5522042928761692e-08, + "advantage_min": -0.7579713463783264, + "advantage_std": 0.6761558391153812, + "completion_length": 2306.9583740234375, + "epoch": 0.22857142857142856, + "grad_norm": 0.4172566831111908, + "kl": 0.211669921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.75e-07, + "loss": -0.008, + "reward": 0.3839730089530349, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3839730089530349, + "reward_after_std": 0.6761558391153812, + "reward_before_mean": 0.4907239656895399, + "reward_before_std": 0.671320891007781, + "reward_change_max": 0.000662676990032196, + "reward_change_mean": -0.10675097140483558, + "reward_change_min": -0.18421044945716858, + "reward_change_std": 0.07500778371468186, + "reward_std": 0.6761558465659618, + "rewards/cosine_scaled_reward": 0.01619531214237213, + "rewards/format_reward": 0.4583333432674408, + "step": 200 + }, + { + "advantage_max": 1.002537775784731, + "advantage_mean": -4.594524793954946e-08, + "advantage_min": -0.9626957811415195, + "advantage_std": 0.7547497116029263, + "completion_length": 2121.229179382324, + "epoch": 0.2297142857142857, + "grad_norm": 0.3952004015445709, + "kl": 0.17352294921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.72273839962904e-07, + "loss": -0.0059, + "reward": 0.8897911142557859, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8897911142557859, + "reward_after_std": 0.7547497153282166, + "reward_before_mean": 1.044318351894617, + "reward_before_std": 0.7571042329072952, + "reward_change_max": 0.0001737847924232483, + "reward_change_mean": -0.15452725533396006, + "reward_change_min": -0.2652826514095068, + "reward_change_std": 0.1033217788208276, + "reward_std": 0.7547497302293777, + "rewards/cosine_scaled_reward": 0.25132583547383547, + "rewards/format_reward": 0.5416666697710752, + "step": 201 + }, + { + "advantage_max": 0.7856088727712631, + "advantage_mean": -4.5945247717504856e-08, + "advantage_min": -0.6831889897584915, + "advantage_std": 0.5087563134729862, + "completion_length": 2350.479202270508, + "epoch": 0.23085714285714284, + "grad_norm": 0.30211445689201355, + "kl": 0.24383544921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0061, + "reward": 0.8790129721164703, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8790129721164703, + "reward_after_std": 0.5087563041597605, + "reward_before_mean": 1.0367812784388661, + "reward_before_std": 0.4771445747464895, + "reward_change_max": 0.00018440932035446167, + "reward_change_mean": -0.1577683356590569, + "reward_change_min": -0.23149394802749157, + "reward_change_std": 0.08965065004304051, + "reward_std": 0.5087563060224056, + "rewards/cosine_scaled_reward": 0.24755729362368584, + "rewards/format_reward": 0.5416666734963655, + "step": 202 + }, + { + "advantage_max": 1.0101400669664145, + "advantage_mean": 7.45058115203534e-09, + "advantage_min": -0.6545524224638939, + "advantage_std": 0.6288202814757824, + "completion_length": 2820.7291870117188, + "epoch": 0.232, + "grad_norm": 0.4497270882129669, + "kl": 0.27264404296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0651, + "reward": 0.19614388910122216, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19614388910122216, + "reward_after_std": 0.6288202852010727, + "reward_before_mean": 0.28668341506272554, + "reward_before_std": 0.6246930155903101, + "reward_change_max": 0.0, + "reward_change_mean": -0.09053952293470502, + "reward_change_min": -0.16773551888763905, + "reward_change_std": 0.06451876182109118, + "reward_std": 0.6288202926516533, + "rewards/cosine_scaled_reward": 0.04959171311929822, + "rewards/format_reward": 0.18750000186264515, + "step": 203 + }, + { + "advantage_max": 0.8654856495559216, + "advantage_mean": -1.0554989493538613e-08, + "advantage_min": -0.9278236515820026, + "advantage_std": 0.6752170845866203, + "completion_length": 2224.7083892822266, + "epoch": 0.23314285714285715, + "grad_norm": 0.5278440713882446, + "kl": 0.205322265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0108, + "reward": 0.5590728987008333, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5590728987008333, + "reward_after_std": 0.6752170845866203, + "reward_before_mean": 0.6843808209523559, + "reward_before_std": 0.6862643286585808, + "reward_change_max": 0.001576930284500122, + "reward_change_mean": -0.1253079129382968, + "reward_change_min": -0.2180102663114667, + "reward_change_std": 0.08850083267316222, + "reward_std": 0.6752171032130718, + "rewards/cosine_scaled_reward": 0.008857070934027433, + "rewards/format_reward": 0.666666679084301, + "step": 204 + }, + { + "advantage_max": 1.3751740790903568, + "advantage_mean": -8.071461887748654e-09, + "advantage_min": -1.3816149085760117, + "advantage_std": 1.0784570425748825, + "completion_length": 2667.6459197998047, + "epoch": 0.2342857142857143, + "grad_norm": 0.509384036064148, + "kl": 0.229400634765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.612622032536507e-07, + "loss": 0.032, + "reward": 0.587292967364192, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.587292967364192, + "reward_after_std": 1.078457035124302, + "reward_before_mean": 0.7071726471185684, + "reward_before_std": 1.1184897869825363, + "reward_change_max": 8.384138345718384e-05, + "reward_change_mean": -0.11987964902073145, + "reward_change_min": -0.24095893651247025, + "reward_change_std": 0.10655779344961047, + "reward_std": 1.0784570574760437, + "rewards/cosine_scaled_reward": 0.176502981223166, + "rewards/format_reward": 0.35416667722165585, + "step": 205 + }, + { + "advantage_max": 1.5005393326282501, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.8025929555296898, + "advantage_std": 0.8781850822269917, + "completion_length": 3067.854248046875, + "epoch": 0.23542857142857143, + "grad_norm": 0.41775602102279663, + "kl": 0.2396240234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0312, + "reward": 0.10771947354078293, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10771947354078293, + "reward_after_std": 0.8781850971281528, + "reward_before_mean": 0.18033906631171703, + "reward_before_std": 0.8780070766806602, + "reward_change_max": 2.4966895580291748e-05, + "reward_change_mean": -0.07261957018636167, + "reward_change_min": -0.1609745966270566, + "reward_change_std": 0.061680351849645376, + "reward_std": 0.8781851306557655, + "rewards/cosine_scaled_reward": -0.07649714685976505, + "rewards/format_reward": 0.33333333767950535, + "step": 206 + }, + { + "advantage_max": 0.859190009534359, + "advantage_mean": 3.725290353973065e-09, + "advantage_min": -0.7211577333509922, + "advantage_std": 0.5771542005240917, + "completion_length": 3049.291717529297, + "epoch": 0.23657142857142857, + "grad_norm": 0.35315126180648804, + "kl": 0.32470703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0438, + "reward": -0.008801928721368313, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.008801928721368313, + "reward_after_std": 0.5771542023867369, + "reward_before_mean": 0.06464581179898232, + "reward_before_std": 0.5813978333026171, + "reward_change_max": 0.0005838200449943542, + "reward_change_mean": -0.07344775041565299, + "reward_change_min": -0.14188366383314133, + "reward_change_std": 0.05655882274731994, + "reward_std": 0.5771542396396399, + "rewards/cosine_scaled_reward": -0.13434376008808613, + "rewards/format_reward": 0.3333333432674408, + "step": 207 + }, + { + "advantage_max": 1.1458739191293716, + "advantage_mean": -1.8005571034152013e-08, + "advantage_min": -0.638024490326643, + "advantage_std": 0.6989194564521313, + "completion_length": 2599.7500610351562, + "epoch": 0.2377142857142857, + "grad_norm": 0.6301652789115906, + "kl": 0.255859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0427, + "reward": 0.1499016471207142, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1499016471207142, + "reward_after_std": 0.6989194788038731, + "reward_before_mean": 0.23315771110355854, + "reward_before_std": 0.6953848674893379, + "reward_change_max": 0.0002986416220664978, + "reward_change_mean": -0.08325609937310219, + "reward_change_min": -0.16083112079650164, + "reward_change_std": 0.06415085797198117, + "reward_std": 0.6989195011556149, + "rewards/cosine_scaled_reward": -0.0917544849216938, + "rewards/format_reward": 0.4166666716337204, + "step": 208 + }, + { + "advantage_max": 0.9560815170407295, + "advantage_mean": -1.3038516155639002e-08, + "advantage_min": -0.669091060757637, + "advantage_std": 0.6263339519500732, + "completion_length": 2470.625045776367, + "epoch": 0.23885714285714285, + "grad_norm": 0.47854506969451904, + "kl": 0.2359619140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.500858306332172e-07, + "loss": -0.0121, + "reward": 0.574387613683939, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.574387613683939, + "reward_after_std": 0.6263339295983315, + "reward_before_mean": 0.7000777684152126, + "reward_before_std": 0.612229947000742, + "reward_change_max": 0.0, + "reward_change_mean": -0.12569017568603158, + "reward_change_min": -0.2115925382822752, + "reward_change_std": 0.08066713158041239, + "reward_std": 0.6263339556753635, + "rewards/cosine_scaled_reward": 0.07920554932206869, + "rewards/format_reward": 0.5416666828095913, + "step": 209 + }, + { + "advantage_max": 0.6764157526195049, + "advantage_mean": 2.5456150853919723e-08, + "advantage_min": -0.6963140219449997, + "advantage_std": 0.5000756457448006, + "completion_length": 2879.4583740234375, + "epoch": 0.24, + "grad_norm": 0.2873426675796509, + "kl": 0.24578857421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.472670160550848e-07, + "loss": 0.027, + "reward": 0.2614448294043541, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2614448294043541, + "reward_after_std": 0.50007563829422, + "reward_before_mean": 0.364066656678915, + "reward_before_std": 0.4979663249105215, + "reward_change_max": 0.00027251243591308594, + "reward_change_mean": -0.10262180212885141, + "reward_change_min": -0.1705446420237422, + "reward_change_std": 0.06758826458826661, + "reward_std": 0.5000756569206715, + "rewards/cosine_scaled_reward": 0.00494999997317791, + "rewards/format_reward": 0.3541666753590107, + "step": 210 + }, + { + "advantage_max": 1.1394590884447098, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.8035576418042183, + "advantage_std": 0.7197157442569733, + "completion_length": 2390.9791870117188, + "epoch": 0.24114285714285713, + "grad_norm": 0.29365745186805725, + "kl": 0.21905517578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.444385869608921e-07, + "loss": 0.001, + "reward": 0.21905515156686306, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21905515156686306, + "reward_after_std": 0.719715740531683, + "reward_before_mean": 0.3086434584110975, + "reward_before_std": 0.723445076495409, + "reward_change_max": 0.00035256147384643555, + "reward_change_mean": -0.08958828420145437, + "reward_change_min": -0.16625287476927042, + "reward_change_std": 0.06521394196897745, + "reward_std": 0.7197157703340054, + "rewards/cosine_scaled_reward": -0.043594954535365105, + "rewards/format_reward": 0.3958333507180214, + "step": 211 + }, + { + "advantage_max": 1.094607064500451, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.6028978629037738, + "advantage_std": 0.6431901277974248, + "completion_length": 2397.0833892822266, + "epoch": 0.2422857142857143, + "grad_norm": 0.32407593727111816, + "kl": 0.22100830078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0192, + "reward": 0.5966612412594259, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5966612412594259, + "reward_after_std": 0.6431901091709733, + "reward_before_mean": 0.7225769180804491, + "reward_before_std": 0.6182768186554313, + "reward_change_max": 0.0001141279935836792, + "reward_change_mean": -0.12591569544747472, + "reward_change_min": -0.20083122700452805, + "reward_change_std": 0.07545896037481725, + "reward_std": 0.6431901175528765, + "rewards/cosine_scaled_reward": 0.06962178461253643, + "rewards/format_reward": 0.5833333395421505, + "step": 212 + }, + { + "advantage_max": 1.282714981585741, + "advantage_mean": -9.313226023710541e-10, + "advantage_min": -1.1995286270976067, + "advantage_std": 0.9880655072629452, + "completion_length": 2257.3750610351562, + "epoch": 0.24342857142857144, + "grad_norm": 0.42579054832458496, + "kl": 0.225311279296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0334, + "reward": 0.43806671584025025, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.43806671584025025, + "reward_after_std": 0.9880655221641064, + "reward_before_mean": 0.5443241535685956, + "reward_before_std": 1.0230080299079418, + "reward_change_max": 0.0003249123692512512, + "reward_change_mean": -0.1062574377283454, + "reward_change_min": -0.23165474273264408, + "reward_change_std": 0.09968985430896282, + "reward_std": 0.9880655538290739, + "rewards/cosine_scaled_reward": 0.0013287439942359924, + "rewards/format_reward": 0.5416666734963655, + "step": 213 + }, + { + "advantage_max": 1.1717126108705997, + "advantage_mean": 6.053596915411852e-09, + "advantage_min": -1.053213369101286, + "advantage_std": 0.842260368168354, + "completion_length": 2521.250045776367, + "epoch": 0.24457142857142858, + "grad_norm": 0.41738176345825195, + "kl": 0.2025146484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0276, + "reward": 0.4556270924513228, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4556270924513228, + "reward_after_std": 0.8422603793442249, + "reward_before_mean": 0.5666961893439293, + "reward_before_std": 0.8589375950396061, + "reward_change_max": 0.0005218684673309326, + "reward_change_mean": -0.11106909299269319, + "reward_change_min": -0.21090741362422705, + "reward_change_std": 0.08776850858703256, + "reward_std": 0.8422603905200958, + "rewards/cosine_scaled_reward": 0.0020980946719646454, + "rewards/format_reward": 0.5625000149011612, + "step": 214 + }, + { + "advantage_max": 0.7589559145271778, + "advantage_mean": 2.048909669705168e-08, + "advantage_min": -0.7794839665293694, + "advantage_std": 0.5524455606937408, + "completion_length": 2386.7708854675293, + "epoch": 0.24571428571428572, + "grad_norm": 0.28002220392227173, + "kl": 0.220703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0386, + "reward": 0.11122296750545502, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11122296750545502, + "reward_after_std": 0.5524455606937408, + "reward_before_mean": 0.19726988906040788, + "reward_before_std": 0.5620923303067684, + "reward_change_max": 0.00015363842248916626, + "reward_change_mean": -0.08604689175263047, + "reward_change_min": -0.15219917241483927, + "reward_change_std": 0.062479326501488686, + "reward_std": 0.5524455681443214, + "rewards/cosine_scaled_reward": -0.0992817347869277, + "rewards/format_reward": 0.39583334140479565, + "step": 215 + }, + { + "advantage_max": 1.3616463989019394, + "advantage_mean": -1.862645188088763e-08, + "advantage_min": -0.9499138370156288, + "advantage_std": 0.8744436055421829, + "completion_length": 2387.5625762939453, + "epoch": 0.24685714285714286, + "grad_norm": 0.42455509305000305, + "kl": 0.2247314453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0104, + "reward": 0.495633814483881, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.495633814483881, + "reward_after_std": 0.8744436204433441, + "reward_before_mean": 0.6068383371457458, + "reward_before_std": 0.8772630579769611, + "reward_change_max": 0.0, + "reward_change_mean": -0.11120450543239713, + "reward_change_min": -0.20330642815679312, + "reward_change_std": 0.07988221733830869, + "reward_std": 0.8744436576962471, + "rewards/cosine_scaled_reward": -0.009080850519239902, + "rewards/format_reward": 0.6250000093132257, + "step": 216 + }, + { + "advantage_max": 1.5420643910765648, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -1.0352942943572998, + "advantage_std": 1.060675971210003, + "completion_length": 2722.8125610351562, + "epoch": 0.248, + "grad_norm": 0.7412684559822083, + "kl": 0.2513427734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0535, + "reward": 0.6235943953506649, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6235943953506649, + "reward_after_std": 1.0606759525835514, + "reward_before_mean": 0.7431438378989697, + "reward_before_std": 1.0822565481066704, + "reward_change_max": 2.171844244003296e-05, + "reward_change_mean": -0.11954941879957914, + "reward_change_min": -0.25604639016091824, + "reward_change_std": 0.09751309640705585, + "reward_std": 1.0606759898364544, + "rewards/cosine_scaled_reward": 0.09032191522419453, + "rewards/format_reward": 0.5625000055879354, + "step": 217 + }, + { + "advantage_max": 1.2730883322656155, + "advantage_mean": -2.4214387217558198e-08, + "advantage_min": -0.7694863900542259, + "advantage_std": 0.7907967679202557, + "completion_length": 2718.5833587646484, + "epoch": 0.24914285714285714, + "grad_norm": 0.3669975996017456, + "kl": 0.41796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0147, + "reward": 0.1338215246796608, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1338215246796608, + "reward_after_std": 0.7907967641949654, + "reward_before_mean": 0.21257536113262177, + "reward_before_std": 0.7941794954240322, + "reward_change_max": 6.996095180511475e-06, + "reward_change_mean": -0.07875384530052543, + "reward_change_min": -0.1696187872439623, + "reward_change_std": 0.06339895771816373, + "reward_std": 0.7907967828214169, + "rewards/cosine_scaled_reward": -0.09162899415241554, + "rewards/format_reward": 0.39583334513008595, + "step": 218 + }, + { + "advantage_max": 1.155998595058918, + "advantage_mean": -5.2154065066645217e-08, + "advantage_min": -1.1375319361686707, + "advantage_std": 0.8279825672507286, + "completion_length": 2491.791763305664, + "epoch": 0.2502857142857143, + "grad_norm": 0.8123669624328613, + "kl": 0.228179931640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0692, + "reward": 0.6602697218768299, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6602697218768299, + "reward_after_std": 0.8279825747013092, + "reward_before_mean": 0.7905087154358625, + "reward_before_std": 0.8417788743972778, + "reward_change_max": 0.0013681799173355103, + "reward_change_mean": -0.13023906177841127, + "reward_change_min": -0.2316719852387905, + "reward_change_std": 0.09396075457334518, + "reward_std": 0.8279825821518898, + "rewards/cosine_scaled_reward": 0.030671026557683945, + "rewards/format_reward": 0.7291666846722364, + "step": 219 + }, + { + "advantage_max": 0.7060927897691727, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.6109671518206596, + "advantage_std": 0.47745291143655777, + "completion_length": 2906.229217529297, + "epoch": 0.25142857142857145, + "grad_norm": 0.3627898395061493, + "kl": 0.3309326171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0493, + "reward": -0.19565529376268387, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19565529376268387, + "reward_after_std": 0.47745292633771896, + "reward_before_mean": -0.13670195546001196, + "reward_before_std": 0.48440743423998356, + "reward_change_max": 0.00014778226613998413, + "reward_change_mean": -0.05895334016531706, + "reward_change_min": -0.1121221724897623, + "reward_change_std": 0.04572835494764149, + "reward_std": 0.477452939376235, + "rewards/cosine_scaled_reward": -0.24543431401252747, + "rewards/format_reward": 0.35416667349636555, + "step": 220 + }, + { + "advantage_max": 1.122801061719656, + "advantage_mean": -3.7873785246889113e-08, + "advantage_min": -0.9494878984987736, + "advantage_std": 0.7546558827161789, + "completion_length": 2318.7083740234375, + "epoch": 0.25257142857142856, + "grad_norm": 0.3165279030799866, + "kl": 0.25885009765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.156560487081051e-07, + "loss": 0.042, + "reward": 0.46347523806616664, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46347523806616664, + "reward_after_std": 0.7546558603644371, + "reward_before_mean": 0.5765351159498096, + "reward_before_std": 0.7582212314009666, + "reward_change_max": 0.0001964569091796875, + "reward_change_mean": -0.11305988254025578, + "reward_change_min": -0.189128577709198, + "reward_change_std": 0.07670311396941543, + "reward_std": 0.7546558678150177, + "rewards/cosine_scaled_reward": 0.038267549593001604, + "rewards/format_reward": 0.5000000055879354, + "step": 221 + }, + { + "advantage_max": 1.036661870777607, + "advantage_mean": 5.58793539218172e-09, + "advantage_min": -1.0472398027777672, + "advantage_std": 0.75783945992589, + "completion_length": 2533.145881652832, + "epoch": 0.2537142857142857, + "grad_norm": 0.44233042001724243, + "kl": 0.31976318359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0246, + "reward": 0.4872835408896208, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4872835408896208, + "reward_after_std": 0.7578394636511803, + "reward_before_mean": 0.60423843562603, + "reward_before_std": 0.7705888003110886, + "reward_change_max": 0.00047094374895095825, + "reward_change_mean": -0.11695488588884473, + "reward_change_min": -0.21127595845609903, + "reward_change_std": 0.08577916398644447, + "reward_std": 0.7578394673764706, + "rewards/cosine_scaled_reward": -0.03121412917971611, + "rewards/format_reward": 0.6666666772216558, + "step": 222 + }, + { + "advantage_max": 0.7724505327641964, + "advantage_mean": -5.587935669737476e-09, + "advantage_min": -0.7214891128242016, + "advantage_std": 0.60935864970088, + "completion_length": 2594.416717529297, + "epoch": 0.25485714285714284, + "grad_norm": 0.3700931966304779, + "kl": 0.373779296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0301, + "reward": 0.4279163293540478, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4279163293540478, + "reward_after_std": 0.60935864970088, + "reward_before_mean": 0.5440852921456099, + "reward_before_std": 0.6187305934727192, + "reward_change_max": 0.00026647746562957764, + "reward_change_mean": -0.11616896372288465, + "reward_change_min": -0.20987431332468987, + "reward_change_std": 0.08221541903913021, + "reward_std": 0.60935864970088, + "rewards/cosine_scaled_reward": -0.04045736789703369, + "rewards/format_reward": 0.6250000111758709, + "step": 223 + }, + { + "advantage_max": 0.9514374248683453, + "advantage_mean": -2.1730860055857448e-08, + "advantage_min": -0.6713517233729362, + "advantage_std": 0.6334418430924416, + "completion_length": 3012.479232788086, + "epoch": 0.256, + "grad_norm": 0.5133031010627747, + "kl": 0.428955078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0498, + "reward": 0.3854692354798317, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3854692354798317, + "reward_after_std": 0.6334418393671513, + "reward_before_mean": 0.4944905759766698, + "reward_before_std": 0.6296762898564339, + "reward_change_max": 0.0, + "reward_change_mean": -0.10902136843651533, + "reward_change_min": -0.19977211859077215, + "reward_change_std": 0.07383694127202034, + "reward_std": 0.633441861718893, + "rewards/cosine_scaled_reward": 0.03891195636242628, + "rewards/format_reward": 0.41666667722165585, + "step": 224 + }, + { + "advantage_max": 0.8476278707385063, + "advantage_mean": -1.0554989937627823e-08, + "advantage_min": -0.9130581766366959, + "advantage_std": 0.6610419899225235, + "completion_length": 2864.3333740234375, + "epoch": 0.2571428571428571, + "grad_norm": 0.6178968548774719, + "kl": 0.413818359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0233, + "reward": 0.24999480694532394, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24999480694532394, + "reward_after_std": 0.6610419973731041, + "reward_before_mean": 0.3482043823460117, + "reward_before_std": 0.6777895800769329, + "reward_change_max": 0.0002451390027999878, + "reward_change_mean": -0.09820958506315947, + "reward_change_min": -0.17957793455570936, + "reward_change_std": 0.0756394388154149, + "reward_std": 0.6610420010983944, + "rewards/cosine_scaled_reward": -0.07589781284332275, + "rewards/format_reward": 0.5000000055879354, + "step": 225 + }, + { + "advantage_max": 0.9860688522458076, + "advantage_mean": -1.3659397779530735e-08, + "advantage_min": -0.8861872833222151, + "advantage_std": 0.7225923500955105, + "completion_length": 2687.875045776367, + "epoch": 0.2582857142857143, + "grad_norm": 0.42785775661468506, + "kl": 0.431304931640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0251, + "reward": 0.3786590788513422, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3786590788513422, + "reward_after_std": 0.7225923649966717, + "reward_before_mean": 0.48628670489415526, + "reward_before_std": 0.7363221980631351, + "reward_change_max": 0.00018220394849777222, + "reward_change_mean": -0.10762762138620019, + "reward_change_min": -0.20438862685114145, + "reward_change_std": 0.08181980717927217, + "reward_std": 0.7225923947989941, + "rewards/cosine_scaled_reward": 0.034810012206435204, + "rewards/format_reward": 0.41666667722165585, + "step": 226 + }, + { + "advantage_max": 1.3538540825247765, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -1.0006621703505516, + "advantage_std": 0.8676325753331184, + "completion_length": 2686.437530517578, + "epoch": 0.25942857142857145, + "grad_norm": 0.5147333741188049, + "kl": 0.4132080078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0256, + "reward": 0.4257585988380015, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4257585988380015, + "reward_after_std": 0.8676326051354408, + "reward_before_mean": 0.5303960796445608, + "reward_before_std": 0.8729932010173798, + "reward_change_max": 0.0, + "reward_change_mean": -0.10463747382164001, + "reward_change_min": -0.2075261939316988, + "reward_change_std": 0.08033762080594897, + "reward_std": 0.8676326274871826, + "rewards/cosine_scaled_reward": -0.047301971819251776, + "rewards/format_reward": 0.6250000204890966, + "step": 227 + }, + { + "advantage_max": 0.8109984621405602, + "advantage_mean": -3.91155482448724e-08, + "advantage_min": -0.8960036411881447, + "advantage_std": 0.6084200330078602, + "completion_length": 2570.0000610351562, + "epoch": 0.26057142857142856, + "grad_norm": 0.45605042576789856, + "kl": 0.366058349609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0226, + "reward": 0.5789503455162048, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5789503455162048, + "reward_after_std": 0.6084200479090214, + "reward_before_mean": 0.7085773483850062, + "reward_before_std": 0.6109695844352245, + "reward_change_max": 0.00010479241609573364, + "reward_change_mean": -0.12962702754884958, + "reward_change_min": -0.21464707050472498, + "reward_change_std": 0.08285582205280662, + "reward_std": 0.6084200702607632, + "rewards/cosine_scaled_reward": 0.08345534279942513, + "rewards/format_reward": 0.5416666753590107, + "step": 228 + }, + { + "advantage_max": 1.1666472516953945, + "advantage_mean": 4.967053657267684e-09, + "advantage_min": -0.8106258250772953, + "advantage_std": 0.7726855352520943, + "completion_length": 2959.2084045410156, + "epoch": 0.26171428571428573, + "grad_norm": 0.5730072259902954, + "kl": 0.5205078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0382, + "reward": 0.43685874436050653, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.43685874436050653, + "reward_after_std": 0.7726855203509331, + "reward_before_mean": 0.5464709792286158, + "reward_before_std": 0.7791089043021202, + "reward_change_max": 4.570186138153076e-05, + "reward_change_mean": -0.10961220692843199, + "reward_change_min": -0.19915625173598528, + "reward_change_std": 0.08033318631350994, + "reward_std": 0.7726855352520943, + "rewards/cosine_scaled_reward": 0.03365214308723807, + "rewards/format_reward": 0.47916667349636555, + "step": 229 + }, + { + "advantage_max": 0.9596210494637489, + "advantage_mean": 1.30385160446167e-08, + "advantage_min": -0.6449937969446182, + "advantage_std": 0.6544265672564507, + "completion_length": 3251.0209045410156, + "epoch": 0.26285714285714284, + "grad_norm": 0.5122941732406616, + "kl": 0.48828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0303, + "reward": -0.04866902204230428, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.04866902204230428, + "reward_after_std": 0.6544265672564507, + "reward_before_mean": 0.01986833941191435, + "reward_before_std": 0.6688491478562355, + "reward_change_max": 0.0005970969796180725, + "reward_change_mean": -0.06853736680932343, + "reward_change_min": -0.15719961281865835, + "reward_change_std": 0.061842745169997215, + "reward_std": 0.6544266007840633, + "rewards/cosine_scaled_reward": -0.20881584659218788, + "rewards/format_reward": 0.4375000037252903, + "step": 230 + }, + { + "advantage_max": 0.9483978226780891, + "advantage_mean": -1.8936892887122525e-08, + "advantage_min": -0.8996751829981804, + "advantage_std": 0.6520739421248436, + "completion_length": 2641.604202270508, + "epoch": 0.264, + "grad_norm": 0.38522300124168396, + "kl": 0.384124755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0357, + "reward": 0.3406216111034155, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3406216111034155, + "reward_after_std": 0.6520739495754242, + "reward_before_mean": 0.4451865218579769, + "reward_before_std": 0.6562475748360157, + "reward_change_max": 0.00021515041589736938, + "reward_change_mean": -0.104564911685884, + "reward_change_min": -0.18150869477540255, + "reward_change_std": 0.07289927126839757, + "reward_std": 0.6520739682018757, + "rewards/cosine_scaled_reward": -0.08990675210952759, + "rewards/format_reward": 0.6250000149011612, + "step": 231 + }, + { + "advantage_max": 0.7513163685798645, + "advantage_mean": 6.208817238118058e-09, + "advantage_min": -0.4088404346257448, + "advantage_std": 0.4395688883960247, + "completion_length": 3061.7083740234375, + "epoch": 0.2651428571428571, + "grad_norm": 0.5501198768615723, + "kl": 0.492919921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0351, + "reward": -0.35827588848769665, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35827588848769665, + "reward_after_std": 0.4395688883960247, + "reward_before_mean": -0.3145838063210249, + "reward_before_std": 0.4367716293781996, + "reward_change_max": 0.0004380419850349426, + "reward_change_mean": -0.043692084145732224, + "reward_change_min": -0.09585795551538467, + "reward_change_std": 0.03534274536650628, + "reward_std": 0.4395688995718956, + "rewards/cosine_scaled_reward": -0.29270857013761997, + "rewards/format_reward": 0.2708333358168602, + "step": 232 + }, + { + "advantage_max": 1.0172811076045036, + "advantage_mean": 6.829698917520943e-09, + "advantage_min": -0.6046523898839951, + "advantage_std": 0.6398416385054588, + "completion_length": 2803.06258392334, + "epoch": 0.2662857142857143, + "grad_norm": 0.31811264157295227, + "kl": 0.377960205078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0386, + "reward": 0.09723816439509392, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09723816439509392, + "reward_after_std": 0.6398416422307491, + "reward_before_mean": 0.17845547664910555, + "reward_before_std": 0.6404211223125458, + "reward_change_max": 0.00018434226512908936, + "reward_change_mean": -0.08121731854043901, + "reward_change_min": -0.16058277525007725, + "reward_change_std": 0.06039998144842684, + "reward_std": 0.6398416813462973, + "rewards/cosine_scaled_reward": -0.1503555942326784, + "rewards/format_reward": 0.4791666716337204, + "step": 233 + }, + { + "advantage_max": 0.788167878985405, + "advantage_mean": 1.3038516488705909e-08, + "advantage_min": -0.6035764142870903, + "advantage_std": 0.5259315725415945, + "completion_length": 2721.187568664551, + "epoch": 0.2674285714285714, + "grad_norm": 0.27355051040649414, + "kl": 0.3179168701171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0403, + "reward": 0.13820985238999128, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13820985238999128, + "reward_after_std": 0.5259315762668848, + "reward_before_mean": 0.22728153504431248, + "reward_before_std": 0.5203691460192204, + "reward_change_max": 0.00012595951557159424, + "reward_change_mean": -0.08907167753204703, + "reward_change_min": -0.16210692562162876, + "reward_change_std": 0.06338219251483679, + "reward_std": 0.5259315762668848, + "rewards/cosine_scaled_reward": -0.12594256736338139, + "rewards/format_reward": 0.4791666716337204, + "step": 234 + }, + { + "advantage_max": 1.1957035660743713, + "advantage_mean": 2.0489098029319308e-08, + "advantage_min": -1.0198877342045307, + "advantage_std": 0.8198182247579098, + "completion_length": 2557.6667251586914, + "epoch": 0.26857142857142857, + "grad_norm": 0.43036338686943054, + "kl": 0.30059814453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0263, + "reward": 0.5482924771495163, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5482924771495163, + "reward_after_std": 0.8198181949555874, + "reward_before_mean": 0.6676017071586102, + "reward_before_std": 0.8265979290008545, + "reward_change_max": 0.0005790963768959045, + "reward_change_mean": -0.11930918972939253, + "reward_change_min": -0.21050137467682362, + "reward_change_std": 0.08706249436363578, + "reward_std": 0.819818202406168, + "rewards/cosine_scaled_reward": 0.04213416809216142, + "rewards/format_reward": 0.5833333432674408, + "step": 235 + }, + { + "advantage_max": 1.244063027203083, + "advantage_mean": -4.656613428188905e-09, + "advantage_min": -0.9987089484930038, + "advantage_std": 0.884406566619873, + "completion_length": 3065.1458740234375, + "epoch": 0.26971428571428574, + "grad_norm": 0.8933318257331848, + "kl": 0.308349609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0414, + "reward": 0.1958321612328291, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1958321612328291, + "reward_after_std": 0.8844065554440022, + "reward_before_mean": 0.28194663720205426, + "reward_before_std": 0.9100110270082951, + "reward_change_max": 1.2174248695373535e-05, + "reward_change_mean": -0.08611448504962027, + "reward_change_min": -0.20149937644600868, + "reward_change_std": 0.08233956387266517, + "reward_std": 0.8844065703451633, + "rewards/cosine_scaled_reward": -0.10902668349444866, + "rewards/format_reward": 0.5000000055879354, + "step": 236 + }, + { + "advantage_max": 1.2769840061664581, + "advantage_mean": -3.601114062501409e-08, + "advantage_min": -0.9948960766196251, + "advantage_std": 0.8261019103229046, + "completion_length": 2952.625030517578, + "epoch": 0.27085714285714285, + "grad_norm": 0.582528829574585, + "kl": 0.30987548828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0507, + "reward": 0.4712846730835736, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4712846730835736, + "reward_after_std": 0.8261019252240658, + "reward_before_mean": 0.5824574897997081, + "reward_before_std": 0.8286083415150642, + "reward_change_max": 0.00019712001085281372, + "reward_change_mean": -0.11117283953353763, + "reward_change_min": -0.19031432271003723, + "reward_change_std": 0.07673935976345092, + "reward_std": 0.8261019699275494, + "rewards/cosine_scaled_reward": -0.00043792277574539185, + "rewards/format_reward": 0.5833333414047956, + "step": 237 + }, + { + "advantage_max": 1.4660193622112274, + "advantage_mean": -4.221995741904294e-08, + "advantage_min": -1.3478331565856934, + "advantage_std": 1.0443166494369507, + "completion_length": 2659.354217529297, + "epoch": 0.272, + "grad_norm": 1.605945110321045, + "kl": 0.28875732421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0653, + "reward": 0.7084148563444614, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7084148563444614, + "reward_after_std": 1.0443166941404343, + "reward_before_mean": 0.8377973195165396, + "reward_before_std": 1.068930521607399, + "reward_change_max": 4.151463508605957e-05, + "reward_change_mean": -0.1293824641034007, + "reward_change_min": -0.24387627188116312, + "reward_change_std": 0.10105510335415602, + "reward_std": 1.0443167239427567, + "rewards/cosine_scaled_reward": 0.08556529941779445, + "rewards/format_reward": 0.666666679084301, + "step": 238 + }, + { + "advantage_max": 0.8369556181132793, + "advantage_mean": -9.313225635132483e-09, + "advantage_min": -0.6261783614754677, + "advantage_std": 0.5559764578938484, + "completion_length": 2174.7708740234375, + "epoch": 0.27314285714285713, + "grad_norm": 0.5835373997688293, + "kl": 0.25140380859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0381, + "reward": 0.7473947200924158, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7473947200924158, + "reward_after_std": 0.5559764541685581, + "reward_before_mean": 0.8919164468534291, + "reward_before_std": 0.5329598225653172, + "reward_change_max": 0.00044843554496765137, + "reward_change_mean": -0.14452172303572297, + "reward_change_min": -0.2255661329254508, + "reward_change_std": 0.08641267288476229, + "reward_std": 0.555976465344429, + "rewards/cosine_scaled_reward": 0.1751248836517334, + "rewards/format_reward": 0.5416666679084301, + "step": 239 + }, + { + "advantage_max": 0.6366955451667309, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.6315915696322918, + "advantage_std": 0.48512188345193863, + "completion_length": 2778.166717529297, + "epoch": 0.2742857142857143, + "grad_norm": 1.1766997575759888, + "kl": 0.4515380859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0056, + "reward": -0.018587548285722733, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.018587548285722733, + "reward_after_std": 0.48512188345193863, + "reward_before_mean": 0.05906429514288902, + "reward_before_std": 0.496453870087862, + "reward_change_max": 4.595518112182617e-05, + "reward_change_mean": -0.07765184435993433, + "reward_change_min": -0.14331062696874142, + "reward_change_std": 0.056621880736202, + "reward_std": 0.4851218946278095, + "rewards/cosine_scaled_reward": -0.2413011882454157, + "rewards/format_reward": 0.5416666846722364, + "step": 240 + }, + { + "advantage_max": 0.9776230975985527, + "advantage_mean": 4.03573130469681e-09, + "advantage_min": -0.7462376952171326, + "advantage_std": 0.6642108038067818, + "completion_length": 2740.2500610351562, + "epoch": 0.2754285714285714, + "grad_norm": 1.105962872505188, + "kl": 0.43316650390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.558139508961654e-07, + "loss": -0.0129, + "reward": 0.13068385515362024, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13068385515362024, + "reward_after_std": 0.6642108038067818, + "reward_before_mean": 0.2154865637421608, + "reward_before_std": 0.6723109371960163, + "reward_change_max": 0.0003187432885169983, + "reward_change_mean": -0.08480269648134708, + "reward_change_min": -0.17307660542428493, + "reward_change_std": 0.06677990918979049, + "reward_std": 0.6642108112573624, + "rewards/cosine_scaled_reward": -0.13184005906805396, + "rewards/format_reward": 0.47916667722165585, + "step": 241 + }, + { + "advantage_max": 0.7249393723905087, + "advantage_mean": -8.692344732885715e-09, + "advantage_min": -0.7632653787732124, + "advantage_std": 0.5563169829547405, + "completion_length": 2379.2292098999023, + "epoch": 0.2765714285714286, + "grad_norm": 1.055472493171692, + "kl": 0.380096435546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.527578915497951e-07, + "loss": -0.0068, + "reward": 0.6038695313036442, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6038695313036442, + "reward_after_std": 0.5563170053064823, + "reward_before_mean": 0.7374595701694489, + "reward_before_std": 0.5531607791781425, + "reward_change_max": 0.0008292198181152344, + "reward_change_mean": -0.1335900668054819, + "reward_change_min": -0.2169865220785141, + "reward_change_std": 0.0838638637214899, + "reward_std": 0.5563170239329338, + "rewards/cosine_scaled_reward": -0.027103547006845474, + "rewards/format_reward": 0.7916666753590107, + "step": 242 + }, + { + "advantage_max": 1.0612565129995346, + "advantage_mean": -4.470348402563218e-08, + "advantage_min": -1.2809006348252296, + "advantage_std": 0.8737168461084366, + "completion_length": 2800.3125762939453, + "epoch": 0.2777142857142857, + "grad_norm": 0.7638176083564758, + "kl": 0.39288330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.496968239287603e-07, + "loss": 0.057, + "reward": 0.8082469571381807, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8082469571381807, + "reward_after_std": 0.8737168610095978, + "reward_before_mean": 0.9544677063822746, + "reward_before_std": 0.8974614664912224, + "reward_change_max": 2.9906630516052246e-05, + "reward_change_mean": -0.14622076135128736, + "reward_change_min": -0.2652246952056885, + "reward_change_std": 0.10711004957556725, + "reward_std": 0.8737168833613396, + "rewards/cosine_scaled_reward": 0.13348384480923414, + "rewards/format_reward": 0.6875000204890966, + "step": 243 + }, + { + "advantage_max": 1.3663447052240372, + "advantage_mean": -1.552204320631745e-08, + "advantage_min": -1.0774092823266983, + "advantage_std": 0.935070589184761, + "completion_length": 2798.687545776367, + "epoch": 0.27885714285714286, + "grad_norm": 0.5033749938011169, + "kl": 0.48846435546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.466308972251785e-07, + "loss": 0.017, + "reward": 0.47303982824087143, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.47303982824087143, + "reward_after_std": 0.935070589184761, + "reward_before_mean": 0.5819773841649294, + "reward_before_std": 0.9512156844139099, + "reward_change_max": 0.00012916326522827148, + "reward_change_mean": -0.108937568962574, + "reward_change_min": -0.21076048258692026, + "reward_change_std": 0.08802835131064057, + "reward_std": 0.9350706189870834, + "rewards/cosine_scaled_reward": 0.020155361853539944, + "rewards/format_reward": 0.541666679084301, + "step": 244 + }, + { + "advantage_max": 1.3429825454950333, + "advantage_mean": -1.3193736880801055e-08, + "advantage_min": -0.9926909804344177, + "advantage_std": 0.8736944049596786, + "completion_length": 3171.916717529297, + "epoch": 0.28, + "grad_norm": 0.7295897603034973, + "kl": 0.476318359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0051, + "reward": 0.36052384227514267, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.36052384227514267, + "reward_after_std": 0.873694408684969, + "reward_before_mean": 0.46028553135693073, + "reward_before_std": 0.881868090480566, + "reward_change_max": 0.00019543617963790894, + "reward_change_mean": -0.0997616951353848, + "reward_change_min": -0.20378663670271635, + "reward_change_std": 0.08069562003947794, + "reward_std": 0.8736944310367107, + "rewards/cosine_scaled_reward": -0.01985724247060716, + "rewards/format_reward": 0.5000000037252903, + "step": 245 + }, + { + "advantage_max": 1.2377834096550941, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.9669884629547596, + "advantage_std": 0.8542461507022381, + "completion_length": 2991.4584045410156, + "epoch": 0.28114285714285714, + "grad_norm": 1.2443678379058838, + "kl": 0.4268798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0874, + "reward": 0.3200700846500695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3200700846500695, + "reward_after_std": 0.8542461507022381, + "reward_before_mean": 0.4179497007280588, + "reward_before_std": 0.8701425269246101, + "reward_change_max": 0.0, + "reward_change_mean": -0.09787960723042488, + "reward_change_min": -0.21520974393934011, + "reward_change_std": 0.08369475090876222, + "reward_std": 0.8542461581528187, + "rewards/cosine_scaled_reward": -0.09310848778113723, + "rewards/format_reward": 0.6041666828095913, + "step": 246 + }, + { + "advantage_max": 0.7722934186458588, + "advantage_mean": 4.03573130469681e-09, + "advantage_min": -0.6312546953558922, + "advantage_std": 0.5287173539400101, + "completion_length": 3171.2291870117188, + "epoch": 0.2822857142857143, + "grad_norm": 0.8870450258255005, + "kl": 0.4293212890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0132, + "reward": 0.06275376630946994, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06275376630946994, + "reward_after_std": 0.5287173539400101, + "reward_before_mean": 0.14523836690932512, + "reward_before_std": 0.5309392772614956, + "reward_change_max": 0.0001341402530670166, + "reward_change_mean": -0.08248458697926253, + "reward_change_min": -0.15690710861235857, + "reward_change_std": 0.05856957985088229, + "reward_std": 0.5287173762917519, + "rewards/cosine_scaled_reward": -0.25029749423265457, + "rewards/format_reward": 0.6458333432674408, + "step": 247 + }, + { + "advantage_max": 0.9769742004573345, + "advantage_mean": -2.1109978431965715e-08, + "advantage_min": -1.1480994373559952, + "advantage_std": 0.7594380117952824, + "completion_length": 2604.81258392334, + "epoch": 0.2834285714285714, + "grad_norm": 0.4662351608276367, + "kl": 0.339996337890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0193, + "reward": 0.5111812395043671, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5111812395043671, + "reward_after_std": 0.7594380043447018, + "reward_before_mean": 0.6313334191218019, + "reward_before_std": 0.7793891243636608, + "reward_change_max": 0.0005892887711524963, + "reward_change_mean": -0.12015216751024127, + "reward_change_min": -0.209529516287148, + "reward_change_std": 0.08815564028918743, + "reward_std": 0.7594380229711533, + "rewards/cosine_scaled_reward": 0.055250026285648346, + "rewards/format_reward": 0.520833345130086, + "step": 248 + }, + { + "advantage_max": 1.025376234203577, + "advantage_mean": -2.359350625980028e-08, + "advantage_min": -1.2076657563447952, + "advantage_std": 0.8415849506855011, + "completion_length": 2305.395851135254, + "epoch": 0.2845714285714286, + "grad_norm": 0.4987713396549225, + "kl": 0.225341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0169, + "reward": 0.8182168155908585, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8182168155908585, + "reward_after_std": 0.8415849320590496, + "reward_before_mean": 0.9659371078014374, + "reward_before_std": 0.8593480549752712, + "reward_change_max": 0.00013027340173721313, + "reward_change_mean": -0.14772030501626432, + "reward_change_min": -0.24447989743202925, + "reward_change_std": 0.10552188637666404, + "reward_std": 0.8415849320590496, + "rewards/cosine_scaled_reward": 0.1496352255344391, + "rewards/format_reward": 0.6666666772216558, + "step": 249 + }, + { + "advantage_max": 1.0746857300400734, + "advantage_mean": -8.071462664904772e-09, + "advantage_min": -0.7048100084066391, + "advantage_std": 0.6700468733906746, + "completion_length": 2668.5000762939453, + "epoch": 0.2857142857142857, + "grad_norm": 0.6423651576042175, + "kl": 0.287353515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0064, + "reward": 0.3210712969303131, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3210712969303131, + "reward_after_std": 0.6700468808412552, + "reward_before_mean": 0.42153167352080345, + "reward_before_std": 0.6627967059612274, + "reward_change_max": 0.0004504099488258362, + "reward_change_mean": -0.10046036634594202, + "reward_change_min": -0.17996104154735804, + "reward_change_std": 0.06692047603428364, + "reward_std": 0.6700468994677067, + "rewards/cosine_scaled_reward": -0.1538175237365067, + "rewards/format_reward": 0.7291666828095913, + "step": 250 + }, + { + "advantage_max": 1.202012088149786, + "advantage_mean": -3.135452805724803e-08, + "advantage_min": -0.7174399569630623, + "advantage_std": 0.7109669633209705, + "completion_length": 2205.416732788086, + "epoch": 0.28685714285714287, + "grad_norm": 0.6144553422927856, + "kl": 0.200531005859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.25045936022246e-07, + "loss": -0.0299, + "reward": 0.5154064744710922, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5154064744710922, + "reward_after_std": 0.7109669931232929, + "reward_before_mean": 0.6320094037801027, + "reward_before_std": 0.6927180550992489, + "reward_change_max": 0.0, + "reward_change_mean": -0.11660293349996209, + "reward_change_min": -0.1988846454769373, + "reward_change_std": 0.073700116481632, + "reward_std": 0.7109670229256153, + "rewards/cosine_scaled_reward": -0.06941197859123349, + "rewards/format_reward": 0.7708333414047956, + "step": 251 + }, + { + "advantage_max": 0.7281642220914364, + "advantage_mean": -6.208821790032459e-10, + "advantage_min": -0.744192611426115, + "advantage_std": 0.5717556402087212, + "completion_length": 2732.6250762939453, + "epoch": 0.288, + "grad_norm": 0.36412715911865234, + "kl": 0.254913330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.219465344613258e-07, + "loss": -0.0016, + "reward": 0.3161662006750703, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3161662006750703, + "reward_after_std": 0.5717556513845921, + "reward_before_mean": 0.4227669003084884, + "reward_before_std": 0.5833228453993797, + "reward_change_max": 0.00025747716426849365, + "reward_change_mean": -0.10660070087760687, + "reward_change_min": -0.1928198728710413, + "reward_change_std": 0.07527502113953233, + "reward_std": 0.5717556811869144, + "rewards/cosine_scaled_reward": -0.09069989109411836, + "rewards/format_reward": 0.6041666772216558, + "step": 252 + }, + { + "advantage_max": 1.0409824773669243, + "advantage_mean": -1.3814618338159335e-08, + "advantage_min": -1.000199355185032, + "advantage_std": 0.7667047046124935, + "completion_length": 2554.875015258789, + "epoch": 0.28914285714285715, + "grad_norm": 0.5692912936210632, + "kl": 0.2001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0317, + "reward": 0.5202262690290809, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5202262690290809, + "reward_after_std": 0.7667046971619129, + "reward_before_mean": 0.6402729395776987, + "reward_before_std": 0.7784381285309792, + "reward_change_max": 0.0, + "reward_change_mean": -0.12004667799919844, + "reward_change_min": -0.21133162081241608, + "reward_change_std": 0.08583518001250923, + "reward_std": 0.766704723238945, + "rewards/cosine_scaled_reward": 0.0076364679262042046, + "rewards/format_reward": 0.6250000074505806, + "step": 253 + }, + { + "advantage_max": 1.2403109297156334, + "advantage_mean": -9.313229354379615e-10, + "advantage_min": -0.7695105597376823, + "advantage_std": 0.7703098207712173, + "completion_length": 3132.8125915527344, + "epoch": 0.29028571428571426, + "grad_norm": 0.38544797897338867, + "kl": 0.282958984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0181, + "reward": 0.18571746069937944, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18571746069937944, + "reward_after_std": 0.7703097984194756, + "reward_before_mean": 0.27099318616092205, + "reward_before_std": 0.7718499638140202, + "reward_change_max": 0.0, + "reward_change_mean": -0.08527570590376854, + "reward_change_min": -0.16410515550523996, + "reward_change_std": 0.06472577340900898, + "reward_std": 0.7703098133206367, + "rewards/cosine_scaled_reward": -0.10408675856888294, + "rewards/format_reward": 0.47916667349636555, + "step": 254 + }, + { + "advantage_max": 1.3647899143397808, + "advantage_mean": -3.042320501078777e-08, + "advantage_min": -1.0756276212632656, + "advantage_std": 0.9934910573065281, + "completion_length": 2786.395950317383, + "epoch": 0.2914285714285714, + "grad_norm": 0.8177282214164734, + "kl": 0.234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0246, + "reward": 0.36817927472293377, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36817927472293377, + "reward_after_std": 0.9934910237789154, + "reward_before_mean": 0.4680708646774292, + "reward_before_std": 1.0233058147132397, + "reward_change_max": 0.0005884990096092224, + "reward_change_mean": -0.09989159693941474, + "reward_change_min": -0.22440185863524675, + "reward_change_std": 0.09326563123613596, + "reward_std": 0.9934910461306572, + "rewards/cosine_scaled_reward": -0.026381254196166992, + "rewards/format_reward": 0.5208333432674408, + "step": 255 + }, + { + "advantage_max": 1.232586644589901, + "advantage_mean": -2.793967751602011e-08, + "advantage_min": -0.8954048380255699, + "advantage_std": 0.7754578627645969, + "completion_length": 2832.1250610351562, + "epoch": 0.2925714285714286, + "grad_norm": 0.8683269619941711, + "kl": 0.204010009765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0341, + "reward": 0.38214224576950073, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38214224576950073, + "reward_after_std": 0.7754578702151775, + "reward_before_mean": 0.4855796182528138, + "reward_before_std": 0.7749796956777573, + "reward_change_max": 0.00045193731784820557, + "reward_change_mean": -0.1034374050796032, + "reward_change_min": -0.18192225974053144, + "reward_change_std": 0.07434211485087872, + "reward_std": 0.7754578925669193, + "rewards/cosine_scaled_reward": -0.03846019133925438, + "rewards/format_reward": 0.562500013038516, + "step": 256 + }, + { + "advantage_max": 1.3854403644800186, + "advantage_mean": -2.1109979209121832e-08, + "advantage_min": -1.1317920796573162, + "advantage_std": 0.901426050812006, + "completion_length": 3173.6459350585938, + "epoch": 0.2937142857142857, + "grad_norm": 0.5222746133804321, + "kl": 0.26904296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0204, + "reward": 0.4658824288053438, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4658824288053438, + "reward_after_std": 0.9014260284602642, + "reward_before_mean": 0.5744250370189548, + "reward_before_std": 0.9102607406675816, + "reward_change_max": 4.0337443351745605e-05, + "reward_change_mean": -0.10854260809719563, + "reward_change_min": -0.200001772493124, + "reward_change_std": 0.08366946689784527, + "reward_std": 0.9014260694384575, + "rewards/cosine_scaled_reward": -0.025287493132054806, + "rewards/format_reward": 0.6250000149011612, + "step": 257 + }, + { + "advantage_max": 1.3329339772462845, + "advantage_mean": 9.002785128497948e-09, + "advantage_min": -1.2666821628808975, + "advantage_std": 0.9974622689187527, + "completion_length": 3105.6875915527344, + "epoch": 0.2948571428571429, + "grad_norm": 1.931921124458313, + "kl": 0.2685546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0957, + "reward": 0.4899730863980949, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4899730863980949, + "reward_after_std": 0.9974622763693333, + "reward_before_mean": 0.6017860121792182, + "reward_before_std": 1.0291601456701756, + "reward_change_max": 3.0837953090667725e-05, + "reward_change_mean": -0.11181289050728083, + "reward_change_min": -0.23886566143482924, + "reward_change_std": 0.10037148464471102, + "reward_std": 0.9974623024463654, + "rewards/cosine_scaled_reward": -0.022023675497621298, + "rewards/format_reward": 0.645833358168602, + "step": 258 + }, + { + "advantage_max": 0.9765863195061684, + "advantage_mean": -1.7384688688615313e-08, + "advantage_min": -0.8986761644482613, + "advantage_std": 0.6964166983962059, + "completion_length": 2581.208381652832, + "epoch": 0.296, + "grad_norm": 0.5722829699516296, + "kl": 0.2742767333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.001610194928464e-07, + "loss": -0.0139, + "reward": 0.38566339667886496, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.38566339667886496, + "reward_after_std": 0.6964167132973671, + "reward_before_mean": 0.4943455453030765, + "reward_before_std": 0.7021567225456238, + "reward_change_max": 0.0011222511529922485, + "reward_change_mean": -0.10868217144161463, + "reward_change_min": -0.1917388141155243, + "reward_change_std": 0.07943293126299977, + "reward_std": 0.6964167356491089, + "rewards/cosine_scaled_reward": -0.06532722525298595, + "rewards/format_reward": 0.6250000111758709, + "step": 259 + }, + { + "advantage_max": 1.106583446264267, + "advantage_mean": -2.6077032533322608e-08, + "advantage_min": -0.7876567766070366, + "advantage_std": 0.7193886376917362, + "completion_length": 2335.1041870117188, + "epoch": 0.29714285714285715, + "grad_norm": 0.3414742648601532, + "kl": 0.25067138671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0166, + "reward": 0.7363164806738496, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7363164806738496, + "reward_after_std": 0.7193886302411556, + "reward_before_mean": 0.8749077282845974, + "reward_before_std": 0.7035922594368458, + "reward_change_max": 0.00020250678062438965, + "reward_change_mean": -0.13859128253534436, + "reward_change_min": -0.2278941599652171, + "reward_change_std": 0.08999049849808216, + "reward_std": 0.7193886563181877, + "rewards/cosine_scaled_reward": 0.08328720182180405, + "rewards/format_reward": 0.708333345130086, + "step": 260 + }, + { + "advantage_max": 1.001849688589573, + "advantage_mean": -1.3969839007810236e-08, + "advantage_min": -0.8723765462636948, + "advantage_std": 0.7362087070941925, + "completion_length": 3038.8334045410156, + "epoch": 0.29828571428571427, + "grad_norm": 0.6822984218597412, + "kl": 0.32794189453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0571, + "reward": 0.10068884119391441, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.10068884119391441, + "reward_after_std": 0.7362087145447731, + "reward_before_mean": 0.18243083090055734, + "reward_before_std": 0.756432544440031, + "reward_change_max": 0.0, + "reward_change_mean": -0.08174200495705009, + "reward_change_min": -0.18367165885865688, + "reward_change_std": 0.07381555391475558, + "reward_std": 0.7362087294459343, + "rewards/cosine_scaled_reward": -0.21086792647838593, + "rewards/format_reward": 0.6041666809469461, + "step": 261 + }, + { + "advantage_max": 0.9958531409502029, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -1.0813852436840534, + "advantage_std": 0.763180959969759, + "completion_length": 3018.3334045410156, + "epoch": 0.29942857142857143, + "grad_norm": 0.37624964118003845, + "kl": 0.2686767578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0111, + "reward": 0.41146421869052574, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.41146421869052574, + "reward_after_std": 0.7631809748709202, + "reward_before_mean": 0.5222134934738278, + "reward_before_std": 0.783074539154768, + "reward_change_max": 0.00028192996978759766, + "reward_change_mean": -0.11074927542358637, + "reward_change_min": -0.19585992395877838, + "reward_change_std": 0.08221290446817875, + "reward_std": 0.7631809934973717, + "rewards/cosine_scaled_reward": -0.13472658768296242, + "rewards/format_reward": 0.7916666865348816, + "step": 262 + }, + { + "advantage_max": 1.0192478522658348, + "advantage_mean": -3.725290353973065e-09, + "advantage_min": -0.6987932249903679, + "advantage_std": 0.6710875891149044, + "completion_length": 3003.0834045410156, + "epoch": 0.30057142857142854, + "grad_norm": 0.4050155282020569, + "kl": 0.33062744140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0446, + "reward": 0.20357975119259208, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20357975119259208, + "reward_after_std": 0.6710875928401947, + "reward_before_mean": 0.2945442688651383, + "reward_before_std": 0.6724608726799488, + "reward_change_max": 0.0, + "reward_change_mean": -0.09096452733501792, + "reward_change_min": -0.17119490914046764, + "reward_change_std": 0.06640669284388423, + "reward_std": 0.6710876412689686, + "rewards/cosine_scaled_reward": -0.18606120673939586, + "rewards/format_reward": 0.6666666716337204, + "step": 263 + }, + { + "advantage_max": 1.0631650909781456, + "advantage_mean": -3.849466756467024e-08, + "advantage_min": -0.8726494163274765, + "advantage_std": 0.7215870209038258, + "completion_length": 3126.125, + "epoch": 0.3017142857142857, + "grad_norm": 0.8072293400764465, + "kl": 0.355224609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0737, + "reward": 0.5229286458343267, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5229286458343267, + "reward_after_std": 0.7215870320796967, + "reward_before_mean": 0.6427558902651072, + "reward_before_std": 0.718892015516758, + "reward_change_max": 0.0, + "reward_change_mean": -0.11982727702707052, + "reward_change_min": -0.19109745789319277, + "reward_change_std": 0.07728103827685118, + "reward_std": 0.7215870432555676, + "rewards/cosine_scaled_reward": -0.0848720595240593, + "rewards/format_reward": 0.8125000186264515, + "step": 264 + }, + { + "advantage_max": 1.0975913107395172, + "advantage_mean": -3.352761424046946e-08, + "advantage_min": -0.926276370882988, + "advantage_std": 0.7606191523373127, + "completion_length": 2679.354217529297, + "epoch": 0.3028571428571429, + "grad_norm": 0.5886125564575195, + "kl": 0.30853271484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0555, + "reward": 0.6098247529007494, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6098247529007494, + "reward_after_std": 0.7606191374361515, + "reward_before_mean": 0.7371023533632979, + "reward_before_std": 0.7642861232161522, + "reward_change_max": 0.0, + "reward_change_mean": -0.1272775838151574, + "reward_change_min": -0.2192865088582039, + "reward_change_std": 0.08723006211221218, + "reward_std": 0.7606191672384739, + "rewards/cosine_scaled_reward": -0.06894884817302227, + "rewards/format_reward": 0.8750000149011612, + "step": 265 + }, + { + "advantage_max": 0.9084913916885853, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.6256868913769722, + "advantage_std": 0.5924951620399952, + "completion_length": 3102.8334045410156, + "epoch": 0.304, + "grad_norm": 0.4780454933643341, + "kl": 0.46588134765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0411, + "reward": 0.1223881570622325, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1223881570622325, + "reward_after_std": 0.5924951657652855, + "reward_before_mean": 0.20807474991306663, + "reward_before_std": 0.5930629894137383, + "reward_change_max": 0.0004331320524215698, + "reward_change_mean": -0.08568659145385027, + "reward_change_min": -0.16301941219717264, + "reward_change_std": 0.061356313060969114, + "reward_std": 0.5924951769411564, + "rewards/cosine_scaled_reward": -0.22929597226902843, + "rewards/format_reward": 0.666666679084301, + "step": 266 + }, + { + "advantage_max": 1.3950220122933388, + "advantage_mean": 6.829699084054397e-09, + "advantage_min": -0.9353981837630272, + "advantage_std": 0.8997549638152122, + "completion_length": 3407.604217529297, + "epoch": 0.30514285714285716, + "grad_norm": 0.9051889777183533, + "kl": 0.6298828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0458, + "reward": 0.0949839185923338, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0949839185923338, + "reward_after_std": 0.8997549638152122, + "reward_before_mean": 0.16919339634478092, + "reward_before_std": 0.915122464299202, + "reward_change_max": 0.0003774687647819519, + "reward_change_mean": -0.0742094716988504, + "reward_change_min": -0.16107130236923695, + "reward_change_std": 0.06824947381392121, + "reward_std": 0.899754986166954, + "rewards/cosine_scaled_reward": -0.1654033064842224, + "rewards/format_reward": 0.5000000149011612, + "step": 267 + }, + { + "advantage_max": 1.3419854082167149, + "advantage_mean": -2.0489097307674342e-08, + "advantage_min": -0.9137064553797245, + "advantage_std": 0.8581621535122395, + "completion_length": 2647.0834197998047, + "epoch": 0.3062857142857143, + "grad_norm": 0.5118639469146729, + "kl": 0.40020751953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0267, + "reward": 0.2210367638617754, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2210367638617754, + "reward_after_std": 0.858162172138691, + "reward_before_mean": 0.30672881565988064, + "reward_before_std": 0.8692396897822618, + "reward_change_max": 0.0002562180161476135, + "reward_change_mean": -0.0856920457445085, + "reward_change_min": -0.19060960225760937, + "reward_change_std": 0.07353492826223373, + "reward_std": 0.8581621907651424, + "rewards/cosine_scaled_reward": -0.1591356061398983, + "rewards/format_reward": 0.6250000111758709, + "step": 268 + }, + { + "advantage_max": 1.0998243726789951, + "advantage_mean": -3.476937859847595e-08, + "advantage_min": -0.9841247573494911, + "advantage_std": 0.7637926824390888, + "completion_length": 2915.8750610351562, + "epoch": 0.30742857142857144, + "grad_norm": 0.5941024422645569, + "kl": 0.42169189453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0477, + "reward": 0.608371525653638, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.608371525653638, + "reward_after_std": 0.7637926861643791, + "reward_before_mean": 0.7351412307471037, + "reward_before_std": 0.7678135149180889, + "reward_change_max": 0.0, + "reward_change_mean": -0.12676969449967146, + "reward_change_min": -0.22412027046084404, + "reward_change_std": 0.08635794976726174, + "reward_std": 0.7637927196919918, + "rewards/cosine_scaled_reward": -0.03867942001670599, + "rewards/format_reward": 0.8125000149011612, + "step": 269 + }, + { + "advantage_max": 1.4132294282317162, + "advantage_mean": -1.9868216072360667e-08, + "advantage_min": -1.029867060482502, + "advantage_std": 0.9192027598619461, + "completion_length": 2846.041748046875, + "epoch": 0.30857142857142855, + "grad_norm": 0.4943372309207916, + "kl": 0.4290771484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0507, + "reward": 0.6552489723544568, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6552489723544568, + "reward_after_std": 0.9192027673125267, + "reward_before_mean": 0.7806877940893173, + "reward_before_std": 0.920238234102726, + "reward_change_max": 0.00011301040649414062, + "reward_change_mean": -0.12543882615864277, + "reward_change_min": -0.23829853534698486, + "reward_change_std": 0.08876523096114397, + "reward_std": 0.9192028120160103, + "rewards/cosine_scaled_reward": 0.0049272209871560335, + "rewards/format_reward": 0.7708333432674408, + "step": 270 + }, + { + "advantage_max": 0.8533101379871368, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.8463800251483917, + "advantage_std": 0.6255792900919914, + "completion_length": 2484.5625610351562, + "epoch": 0.3097142857142857, + "grad_norm": 0.6052711606025696, + "kl": 0.325164794921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0482, + "reward": 0.8622908256947994, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8622908256947994, + "reward_after_std": 0.625579297542572, + "reward_before_mean": 1.0175694283097982, + "reward_before_std": 0.6174435224384069, + "reward_change_max": 0.0001889541745185852, + "reward_change_mean": -0.15527859865687788, + "reward_change_min": -0.24020841717720032, + "reward_change_std": 0.09653888270258904, + "reward_std": 0.6255793198943138, + "rewards/cosine_scaled_reward": 0.15461806394159794, + "rewards/format_reward": 0.7083333525806665, + "step": 271 + }, + { + "advantage_max": 1.2468843683600426, + "advantage_mean": -2.3283064809476173e-08, + "advantage_min": -1.231340929865837, + "advantage_std": 0.9128948226571083, + "completion_length": 2975.2500915527344, + "epoch": 0.31085714285714283, + "grad_norm": 0.5220039486885071, + "kl": 0.556640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0567, + "reward": 0.5736992135643959, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5736992135643959, + "reward_after_std": 0.9128948375582695, + "reward_before_mean": 0.6949340249411762, + "reward_before_std": 0.9327790774405003, + "reward_change_max": 0.0, + "reward_change_mean": -0.121234814170748, + "reward_change_min": -0.2265043631196022, + "reward_change_std": 0.09475089283660054, + "reward_std": 0.9128948450088501, + "rewards/cosine_scaled_reward": 0.02455033385194838, + "rewards/format_reward": 0.6458333544433117, + "step": 272 + }, + { + "advantage_max": 1.0025320798158646, + "advantage_mean": -3.476937715518602e-08, + "advantage_min": -1.0941287279129028, + "advantage_std": 0.7559049688279629, + "completion_length": 2887.3959350585938, + "epoch": 0.312, + "grad_norm": 0.577610433101654, + "kl": 0.543914794921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0487, + "reward": 0.6015369053930044, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6015369053930044, + "reward_after_std": 0.7559049762785435, + "reward_before_mean": 0.7293371063424274, + "reward_before_std": 0.7717136181890965, + "reward_change_max": 0.0007090941071510315, + "reward_change_mean": -0.1278002504259348, + "reward_change_min": -0.22420413699001074, + "reward_change_std": 0.09308719309046865, + "reward_std": 0.755904994904995, + "rewards/cosine_scaled_reward": -0.010331441648304462, + "rewards/format_reward": 0.7500000186264515, + "step": 273 + }, + { + "advantage_max": 0.98700400441885, + "advantage_mean": -2.7318795781106076e-08, + "advantage_min": -1.1434022709727287, + "advantage_std": 0.8054995872080326, + "completion_length": 2140.0209197998047, + "epoch": 0.31314285714285717, + "grad_norm": 0.5769612193107605, + "kl": 0.376312255859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.531415671340826e-07, + "loss": 0.007, + "reward": 1.085528818424791, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.085528818424791, + "reward_after_std": 0.8054995872080326, + "reward_before_mean": 1.2585789044387639, + "reward_before_std": 0.8196575529873371, + "reward_change_max": 0.0, + "reward_change_mean": -0.17305006738752127, + "reward_change_min": -0.2874143682420254, + "reward_change_std": 0.1149548664689064, + "reward_std": 0.8054995909333229, + "rewards/cosine_scaled_reward": 0.2542894408106804, + "rewards/format_reward": 0.7500000186264515, + "step": 274 + }, + { + "advantage_max": 1.4050886556506157, + "advantage_mean": -1.614292433060882e-08, + "advantage_min": -1.2774573862552643, + "advantage_std": 0.9720547124743462, + "completion_length": 2419.020896911621, + "epoch": 0.3142857142857143, + "grad_norm": 0.4276851415634155, + "kl": 0.441986083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.5e-07, + "loss": 0.0204, + "reward": 0.9169086045585573, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9169086045585573, + "reward_after_std": 0.9720547050237656, + "reward_before_mean": 1.0666476301848888, + "reward_before_std": 0.9818779900670052, + "reward_change_max": 0.00012986361980438232, + "reward_change_mean": -0.14973904564976692, + "reward_change_min": -0.26850674487650394, + "reward_change_std": 0.10206946218386292, + "reward_std": 0.972054734826088, + "rewards/cosine_scaled_reward": 0.17915714625269175, + "rewards/format_reward": 0.708333345130086, + "step": 275 + }, + { + "advantage_max": 1.4430090934038162, + "advantage_mean": -1.8626451658843024e-08, + "advantage_min": -1.3472927510738373, + "advantage_std": 1.0916124135255814, + "completion_length": 2607.895896911621, + "epoch": 0.31542857142857145, + "grad_norm": 0.6251004338264465, + "kl": 0.42620849609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0515, + "reward": 0.9942958541214466, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9942958541214466, + "reward_after_std": 1.0916124545037746, + "reward_before_mean": 1.1503968685865402, + "reward_before_std": 1.116207093000412, + "reward_change_max": 0.0, + "reward_change_mean": -0.15610099490731955, + "reward_change_min": -0.30604325234889984, + "reward_change_std": 0.11970954155549407, + "reward_std": 1.091612495481968, + "rewards/cosine_scaled_reward": 0.14811508357524872, + "rewards/format_reward": 0.8541666865348816, + "step": 276 + }, + { + "advantage_max": 1.238736167550087, + "advantage_mean": -3.104408841103634e-09, + "advantage_min": -0.7994346469640732, + "advantage_std": 0.7560862153768539, + "completion_length": 2522.9583892822266, + "epoch": 0.31657142857142856, + "grad_norm": 0.6035568118095398, + "kl": 0.409759521484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0173, + "reward": 0.3422631425783038, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3422631425783038, + "reward_after_std": 0.7560862153768539, + "reward_before_mean": 0.44170549139380455, + "reward_before_std": 0.7494552433490753, + "reward_change_max": 0.0009563863277435303, + "reward_change_mean": -0.099442342761904, + "reward_change_min": -0.18363922648131847, + "reward_change_std": 0.07015622220933437, + "reward_std": 0.7560862228274345, + "rewards/cosine_scaled_reward": -0.08123059757053852, + "rewards/format_reward": 0.604166679084301, + "step": 277 + }, + { + "advantage_max": 1.2197536677122116, + "advantage_mean": -2.6077032755367213e-08, + "advantage_min": -0.6300609707832336, + "advantage_std": 0.7097632475197315, + "completion_length": 2167.6666870117188, + "epoch": 0.3177142857142857, + "grad_norm": 0.5051394104957581, + "kl": 0.344940185546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.405759110524894e-07, + "loss": -0.012, + "reward": 0.770529605448246, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.770529605448246, + "reward_after_std": 0.7097632475197315, + "reward_before_mean": 0.9097544327378273, + "reward_before_std": 0.6844562515616417, + "reward_change_max": 0.0, + "reward_change_mean": -0.13922483753412962, + "reward_change_min": -0.22356512024998665, + "reward_change_std": 0.0822983281686902, + "reward_std": 0.7097632624208927, + "rewards/cosine_scaled_reward": 0.04862721357494593, + "rewards/format_reward": 0.8125000149011612, + "step": 278 + }, + { + "advantage_max": 1.5004774332046509, + "advantage_mean": -1.4901161971003773e-08, + "advantage_min": -1.1552449837327003, + "advantage_std": 0.99924161657691, + "completion_length": 3084.0000915527344, + "epoch": 0.31885714285714284, + "grad_norm": 1.2428194284439087, + "kl": 0.51416015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0967, + "reward": 0.4770056903362274, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4770056903362274, + "reward_after_std": 0.9992415867745876, + "reward_before_mean": 0.5843611843883991, + "reward_before_std": 1.010837584733963, + "reward_change_max": 0.0008535534143447876, + "reward_change_mean": -0.10735548380762339, + "reward_change_min": -0.21564262732863426, + "reward_change_std": 0.09189391741529107, + "reward_std": 0.9992415942251682, + "rewards/cosine_scaled_reward": -0.09323608374688774, + "rewards/format_reward": 0.770833358168602, + "step": 279 + }, + { + "advantage_max": 1.2737629637122154, + "advantage_mean": -2.4214387439602802e-08, + "advantage_min": -1.3873290121555328, + "advantage_std": 0.9990286827087402, + "completion_length": 2394.5625381469727, + "epoch": 0.32, + "grad_norm": 0.9669297337532043, + "kl": 0.3808135986328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0576, + "reward": 0.9942773611983284, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9942773611983284, + "reward_after_std": 0.999028667807579, + "reward_before_mean": 1.1533079743385315, + "reward_before_std": 1.0210191681981087, + "reward_change_max": 0.0, + "reward_change_mean": -0.15903064515441656, + "reward_change_min": -0.28807317093014717, + "reward_change_std": 0.11587743367999792, + "reward_std": 0.9990286976099014, + "rewards/cosine_scaled_reward": 0.18082065833732486, + "rewards/format_reward": 0.7916666865348816, + "step": 280 + }, + { + "advantage_max": 0.9880325570702553, + "advantage_mean": -1.8626452047421083e-09, + "advantage_min": -0.7407893761992455, + "advantage_std": 0.6236730217933655, + "completion_length": 3392.2709350585938, + "epoch": 0.3211428571428571, + "grad_norm": 1.2373231649398804, + "kl": 0.67236328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0384, + "reward": 0.04596708505414426, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.04596708505414426, + "reward_after_std": 0.6236730217933655, + "reward_before_mean": 0.12323334789834917, + "reward_before_std": 0.6242800801992416, + "reward_change_max": 0.0001795440912246704, + "reward_change_mean": -0.0772662702947855, + "reward_change_min": -0.14489263761788607, + "reward_change_std": 0.05781849008053541, + "reward_std": 0.6236730627715588, + "rewards/cosine_scaled_reward": -0.24046666733920574, + "rewards/format_reward": 0.6041666753590107, + "step": 281 + }, + { + "advantage_max": 1.0304613038897514, + "advantage_mean": -2.731879727990716e-08, + "advantage_min": -0.9215684104710817, + "advantage_std": 0.7275769785046577, + "completion_length": 2536.229248046875, + "epoch": 0.3222857142857143, + "grad_norm": 0.366413950920105, + "kl": 0.36102294921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0364, + "reward": 0.8688683672808111, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8688683672808111, + "reward_after_std": 0.7275770045816898, + "reward_before_mean": 1.0214176895096898, + "reward_before_std": 0.722488921135664, + "reward_change_max": 0.0, + "reward_change_mean": -0.15254933293908834, + "reward_change_min": -0.23969142325222492, + "reward_change_std": 0.09497861238196492, + "reward_std": 0.7275770120322704, + "rewards/cosine_scaled_reward": 0.07320883683860302, + "rewards/format_reward": 0.8750000074505806, + "step": 282 + }, + { + "advantage_max": 1.31711445748806, + "advantage_mean": -8.692343955729598e-09, + "advantage_min": -0.9042372852563858, + "advantage_std": 0.8224817290902138, + "completion_length": 2863.3541870117188, + "epoch": 0.32342857142857145, + "grad_norm": 1.1226824522018433, + "kl": 0.5413818359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0282, + "reward": 0.6211966900154948, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6211966900154948, + "reward_after_std": 0.8224817328155041, + "reward_before_mean": 0.7448102962225676, + "reward_before_std": 0.814110279083252, + "reward_change_max": 0.0, + "reward_change_mean": -0.12361361924558878, + "reward_change_min": -0.21856752410531044, + "reward_change_std": 0.0800158535130322, + "reward_std": 0.8224817700684071, + "rewards/cosine_scaled_reward": -0.01301151653751731, + "rewards/format_reward": 0.7708333469927311, + "step": 283 + }, + { + "advantage_max": 1.0646238774061203, + "advantage_mean": -2.1730860499946658e-08, + "advantage_min": -1.107110746204853, + "advantage_std": 0.7788519412279129, + "completion_length": 2708.2500762939453, + "epoch": 0.32457142857142857, + "grad_norm": 0.45679256319999695, + "kl": 0.387939453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0454, + "reward": 0.8224916737526655, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8224916737526655, + "reward_after_std": 0.7788519263267517, + "reward_before_mean": 0.9693624749779701, + "reward_before_std": 0.784541878849268, + "reward_change_max": 0.0, + "reward_change_mean": -0.14687081146985292, + "reward_change_min": -0.2431420534849167, + "reward_change_std": 0.09596487786620855, + "reward_std": 0.7788519412279129, + "rewards/cosine_scaled_reward": 0.047181230038404465, + "rewards/format_reward": 0.8750000223517418, + "step": 284 + }, + { + "advantage_max": 0.9761894531548023, + "advantage_mean": -2.2351741957304938e-08, + "advantage_min": -1.0390981957316399, + "advantage_std": 0.7664303183555603, + "completion_length": 2226.979232788086, + "epoch": 0.32571428571428573, + "grad_norm": 0.5204533934593201, + "kl": 0.251617431640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.186095868151436e-07, + "loss": -0.0131, + "reward": 0.5884040435776114, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5884040435776114, + "reward_after_std": 0.7664303220808506, + "reward_before_mean": 0.7158968299627304, + "reward_before_std": 0.7851696014404297, + "reward_change_max": 0.0, + "reward_change_mean": -0.1274928255006671, + "reward_change_min": -0.24082167074084282, + "reward_change_std": 0.09329877560958266, + "reward_std": 0.7664303407073021, + "rewards/cosine_scaled_reward": -0.027468256652355194, + "rewards/format_reward": 0.7708333469927311, + "step": 285 + }, + { + "advantage_max": 1.318112462759018, + "advantage_mean": -1.3038516488705909e-08, + "advantage_min": -0.8753121644258499, + "advantage_std": 0.8130339942872524, + "completion_length": 2779.125045776367, + "epoch": 0.32685714285714285, + "grad_norm": 1.1802911758422852, + "kl": 0.4268951416015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.154764373429315e-07, + "loss": 0.073, + "reward": 0.59721265360713, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.59721265360713, + "reward_after_std": 0.813034001737833, + "reward_before_mean": 0.7192092165350914, + "reward_before_std": 0.80387282371521, + "reward_change_max": 5.13419508934021e-05, + "reward_change_mean": -0.12199656944721937, + "reward_change_min": -0.2146667167544365, + "reward_change_std": 0.08022241853177547, + "reward_std": 0.8130340240895748, + "rewards/cosine_scaled_reward": -0.036228728480637074, + "rewards/format_reward": 0.7916666865348816, + "step": 286 + }, + { + "advantage_max": 1.050818793475628, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -1.055874053388834, + "advantage_std": 0.798844076693058, + "completion_length": 2294.9584197998047, + "epoch": 0.328, + "grad_norm": 0.42027631402015686, + "kl": 0.3113861083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.123449705004581e-07, + "loss": 0.018, + "reward": 0.5748773384839296, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5748773384839296, + "reward_after_std": 0.7988440804183483, + "reward_before_mean": 0.6994684183155186, + "reward_before_std": 0.816389974206686, + "reward_change_max": 0.0005205199122428894, + "reward_change_mean": -0.12459108140319586, + "reward_change_min": -0.2359716072678566, + "reward_change_std": 0.09250497492030263, + "reward_std": 0.7988441213965416, + "rewards/cosine_scaled_reward": -0.035682463087141514, + "rewards/format_reward": 0.770833358168602, + "step": 287 + }, + { + "advantage_max": 1.2926170453429222, + "advantage_mean": -9.313226134732844e-09, + "advantage_min": -1.0218810439109802, + "advantage_std": 0.8248549252748489, + "completion_length": 2799.666732788086, + "epoch": 0.3291428571428571, + "grad_norm": 0.7103667855262756, + "kl": 0.32928466796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0461, + "reward": 0.7114497211296111, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7114497211296111, + "reward_after_std": 0.8248548954725266, + "reward_before_mean": 0.8447219170629978, + "reward_before_std": 0.8177867233753204, + "reward_change_max": 0.0, + "reward_change_mean": -0.13327217940241098, + "reward_change_min": -0.22586338967084885, + "reward_change_std": 0.086294736713171, + "reward_std": 0.8248549252748489, + "rewards/cosine_scaled_reward": -0.004722386132925749, + "rewards/format_reward": 0.8541666828095913, + "step": 288 + }, + { + "advantage_max": 0.8171020597219467, + "advantage_mean": -2.4835277168122616e-09, + "advantage_min": -0.8185729384422302, + "advantage_std": 0.5793700739741325, + "completion_length": 2317.0834197998047, + "epoch": 0.3302857142857143, + "grad_norm": 0.2669129967689514, + "kl": 0.31494140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.060876951083828e-07, + "loss": 0.03, + "reward": 0.7267960589379072, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7267960589379072, + "reward_after_std": 0.5793700367212296, + "reward_before_mean": 0.8693213667720556, + "reward_before_std": 0.5672201681882143, + "reward_change_max": 0.0003613904118537903, + "reward_change_mean": -0.1425252864137292, + "reward_change_min": -0.21070224232971668, + "reward_change_std": 0.08446116000413895, + "reward_std": 0.5793700478971004, + "rewards/cosine_scaled_reward": -0.002839326858520508, + "rewards/format_reward": 0.8750000074505806, + "step": 289 + }, + { + "advantage_max": 1.2880224622786045, + "advantage_mean": -2.1575639885806908e-08, + "advantage_min": -0.8768154866993427, + "advantage_std": 0.7828379794955254, + "completion_length": 2832.7709045410156, + "epoch": 0.3314285714285714, + "grad_norm": 1.381922960281372, + "kl": 0.517333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0092, + "reward": 0.6558038564398885, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6558038564398885, + "reward_after_std": 0.7828379720449448, + "reward_before_mean": 0.7833415642380714, + "reward_before_std": 0.7664486858993769, + "reward_change_max": 0.0001693814992904663, + "reward_change_mean": -0.12753771618008614, + "reward_change_min": -0.22125390730798244, + "reward_change_std": 0.08206295082345605, + "reward_std": 0.782837986946106, + "rewards/cosine_scaled_reward": -0.07707922626286745, + "rewards/format_reward": 0.9375000074505806, + "step": 290 + }, + { + "advantage_max": 1.7112552598118782, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -0.9665621928870678, + "advantage_std": 1.038928933441639, + "completion_length": 2836.479248046875, + "epoch": 0.3325714285714286, + "grad_norm": 0.6068376302719116, + "kl": 0.44158935546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0334, + "reward": 0.6122805885970592, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6122805885970592, + "reward_after_std": 1.0389289036393166, + "reward_before_mean": 0.7282662447541952, + "reward_before_std": 1.0348973795771599, + "reward_change_max": 0.0002287551760673523, + "reward_change_mean": -0.11598564963787794, + "reward_change_min": -0.23866364359855652, + "reward_change_std": 0.08767437376081944, + "reward_std": 1.0389289483428001, + "rewards/cosine_scaled_reward": -0.042116889264434576, + "rewards/format_reward": 0.8125000074505806, + "step": 291 + }, + { + "advantage_max": 0.834974117577076, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -0.7933941446244717, + "advantage_std": 0.5692648068070412, + "completion_length": 3146.5625610351562, + "epoch": 0.33371428571428574, + "grad_norm": 0.9853768348693848, + "kl": 0.4609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0284, + "reward": 0.31484566256403923, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.31484566256403923, + "reward_after_std": 0.5692648142576218, + "reward_before_mean": 0.41969927214086056, + "reward_before_std": 0.5654185600578785, + "reward_change_max": 0.0, + "reward_change_mean": -0.10485361609607935, + "reward_change_min": -0.16875915229320526, + "reward_change_std": 0.06821413105353713, + "reward_std": 0.569264829158783, + "rewards/cosine_scaled_reward": -0.1339003685861826, + "rewards/format_reward": 0.6875000186264515, + "step": 292 + }, + { + "advantage_max": 0.906766127794981, + "advantage_mean": -5.122274235325186e-08, + "advantage_min": -0.7590428665280342, + "advantage_std": 0.6250920966267586, + "completion_length": 2332.7709197998047, + "epoch": 0.33485714285714285, + "grad_norm": 1.096217393875122, + "kl": 0.2679443359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.93600044896063e-07, + "loss": -0.0133, + "reward": 0.7127600498497486, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7127600498497486, + "reward_after_std": 0.6250920966267586, + "reward_before_mean": 0.8526535518467426, + "reward_before_std": 0.6178444065153599, + "reward_change_max": 0.0003881379961967468, + "reward_change_mean": -0.13989355321973562, + "reward_change_min": -0.22731606289744377, + "reward_change_std": 0.08642083266749978, + "reward_std": 0.6250921115279198, + "rewards/cosine_scaled_reward": -0.0007565605919808149, + "rewards/format_reward": 0.854166679084301, + "step": 293 + }, + { + "advantage_max": 0.9793417081236839, + "advantage_mean": -2.1730860388924356e-08, + "advantage_min": -0.9472349882125854, + "advantage_std": 0.7169453538954258, + "completion_length": 3070.6875915527344, + "epoch": 0.336, + "grad_norm": 0.9082557559013367, + "kl": 0.422149658203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0235, + "reward": 0.5174050983041525, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5174050983041525, + "reward_after_std": 0.7169453427195549, + "reward_before_mean": 0.6380012258887291, + "reward_before_std": 0.7268225848674774, + "reward_change_max": 0.00013885647058486938, + "reward_change_mean": -0.12059612292796373, + "reward_change_min": -0.21557171922177076, + "reward_change_std": 0.08509104792028666, + "reward_std": 0.7169453501701355, + "rewards/cosine_scaled_reward": -0.01433273358270526, + "rewards/format_reward": 0.6666666809469461, + "step": 294 + }, + { + "advantage_max": 1.2744291499257088, + "advantage_mean": -2.6077031867188794e-08, + "advantage_min": -1.3888452351093292, + "advantage_std": 1.0510571710765362, + "completion_length": 2997.5001068115234, + "epoch": 0.33714285714285713, + "grad_norm": 0.5747594237327576, + "kl": 0.39453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0133, + "reward": 1.1005014963448048, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.1005014963448048, + "reward_after_std": 1.0510571599006653, + "reward_before_mean": 1.2698866855353117, + "reward_before_std": 1.082613728940487, + "reward_change_max": 0.0, + "reward_change_mean": -0.1693851826712489, + "reward_change_min": -0.2982129603624344, + "reward_change_std": 0.1265355981886387, + "reward_std": 1.0510571897029877, + "rewards/cosine_scaled_reward": 0.23910999950021505, + "rewards/format_reward": 0.7916666865348816, + "step": 295 + }, + { + "advantage_max": 0.8809316977858543, + "advantage_mean": -2.0489097363185493e-08, + "advantage_min": -0.9141372889280319, + "advantage_std": 0.693729005753994, + "completion_length": 3155.250030517578, + "epoch": 0.3382857142857143, + "grad_norm": 0.6257061958312988, + "kl": 0.3314208984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0234, + "reward": 0.43978652730584145, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.43978652730584145, + "reward_after_std": 0.6937290169298649, + "reward_before_mean": 0.5553540624678135, + "reward_before_std": 0.7088712975382805, + "reward_change_max": 0.0, + "reward_change_mean": -0.11556753842160106, + "reward_change_min": -0.20852719619870186, + "reward_change_std": 0.08426861232146621, + "reward_std": 0.6937290467321873, + "rewards/cosine_scaled_reward": -0.09732297994196415, + "rewards/format_reward": 0.7500000149011612, + "step": 296 + }, + { + "advantage_max": 1.0527857542037964, + "advantage_mean": -8.071462664904772e-09, + "advantage_min": -0.8540095165371895, + "advantage_std": 0.7035439126193523, + "completion_length": 3335.041717529297, + "epoch": 0.3394285714285714, + "grad_norm": 0.4017558693885803, + "kl": 0.2974853515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0169, + "reward": 0.5707921356661245, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5707921356661245, + "reward_after_std": 0.7035439126193523, + "reward_before_mean": 0.6951440321281552, + "reward_before_std": 0.6948239048942924, + "reward_change_max": 1.8790364265441895e-05, + "reward_change_mean": -0.12435187119990587, + "reward_change_min": -0.21546250581741333, + "reward_change_std": 0.08488783519715071, + "reward_std": 0.7035439349710941, + "rewards/cosine_scaled_reward": -0.01701133605092764, + "rewards/format_reward": 0.7291666902601719, + "step": 297 + }, + { + "advantage_max": 0.8668230138719082, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -0.713488981127739, + "advantage_std": 0.5623467974364758, + "completion_length": 2715.2084045410156, + "epoch": 0.3405714285714286, + "grad_norm": 0.33654358983039856, + "kl": 0.1712646484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.780534655386743e-07, + "loss": -0.0011, + "reward": 0.5386511981487274, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5386511981487274, + "reward_after_std": 0.5623467974364758, + "reward_before_mean": 0.663620114326477, + "reward_before_std": 0.54873913154006, + "reward_change_max": 0.00012120604515075684, + "reward_change_mean": -0.12496891105547547, + "reward_change_min": -0.19604242593050003, + "reward_change_std": 0.07404929213225842, + "reward_std": 0.5623467974364758, + "rewards/cosine_scaled_reward": -0.09527328005060554, + "rewards/format_reward": 0.8541666828095913, + "step": 298 + }, + { + "advantage_max": 1.1117474511265755, + "advantage_mean": -5.277494585786968e-08, + "advantage_min": -0.9139253497123718, + "advantage_std": 0.7722109295427799, + "completion_length": 3096.2084197998047, + "epoch": 0.3417142857142857, + "grad_norm": 0.6103402376174927, + "kl": 0.14898681640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0318, + "reward": 0.7331810034811497, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7331810034811497, + "reward_after_std": 0.7722109220921993, + "reward_before_mean": 0.8717173463664949, + "reward_before_std": 0.7732285261154175, + "reward_change_max": 8.639693260192871e-05, + "reward_change_mean": -0.13853636337444186, + "reward_change_min": -0.24009987153112888, + "reward_change_std": 0.09422993147745728, + "reward_std": 0.7722109258174896, + "rewards/cosine_scaled_reward": 0.08169198967516422, + "rewards/format_reward": 0.7083333469927311, + "step": 299 + }, + { + "advantage_max": 1.3425211235880852, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.9276892244815826, + "advantage_std": 0.9135083742439747, + "completion_length": 3236.479217529297, + "epoch": 0.34285714285714286, + "grad_norm": 1.0381654500961304, + "kl": 0.1722412109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0563, + "reward": 0.4564114101231098, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4564114101231098, + "reward_after_std": 0.9135083518922329, + "reward_before_mean": 0.5646192319691181, + "reward_before_std": 0.9233267642557621, + "reward_change_max": 0.0, + "reward_change_mean": -0.10820781346410513, + "reward_change_min": -0.2130340477451682, + "reward_change_std": 0.08647576486691833, + "reward_std": 0.913508377969265, + "rewards/cosine_scaled_reward": -0.07185705937445164, + "rewards/format_reward": 0.7083333488553762, + "step": 300 + }, + { + "advantage_max": 0.8454348556697369, + "advantage_mean": -3.725290520506519e-09, + "advantage_min": -0.6558514572679996, + "advantage_std": 0.5693494603037834, + "completion_length": 2719.812530517578, + "epoch": 0.344, + "grad_norm": 0.5055859088897705, + "kl": 0.1689453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0221, + "reward": 0.32430399395525455, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32430399395525455, + "reward_after_std": 0.5693494640290737, + "reward_before_mean": 0.42983817867934704, + "reward_before_std": 0.5612027458846569, + "reward_change_max": 5.427747964859009e-05, + "reward_change_mean": -0.10553418751806021, + "reward_change_min": -0.1772037437185645, + "reward_change_std": 0.07288656942546368, + "reward_std": 0.5693494826555252, + "rewards/cosine_scaled_reward": -0.08716425858438015, + "rewards/format_reward": 0.6041666734963655, + "step": 301 + }, + { + "advantage_max": 1.102532796561718, + "advantage_mean": -1.428027990302283e-08, + "advantage_min": -0.8525043353438377, + "advantage_std": 0.7396768815815449, + "completion_length": 2854.2083892822266, + "epoch": 0.34514285714285714, + "grad_norm": 0.38935554027557373, + "kl": 0.149932861328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0121, + "reward": 0.798190388828516, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.798190388828516, + "reward_after_std": 0.7396768666803837, + "reward_before_mean": 0.942897442728281, + "reward_before_std": 0.7333322390913963, + "reward_change_max": 8.501112461090088e-05, + "reward_change_mean": -0.14470706274732947, + "reward_change_min": -0.24838530272245407, + "reward_change_std": 0.09280881565064192, + "reward_std": 0.739676907658577, + "rewards/cosine_scaled_reward": 0.06519871880300343, + "rewards/format_reward": 0.8125000074505806, + "step": 302 + }, + { + "advantage_max": 1.1021673679351807, + "advantage_mean": -5.277494774524882e-09, + "advantage_min": -1.2830260694026947, + "advantage_std": 0.8821544721722603, + "completion_length": 2851.916748046875, + "epoch": 0.3462857142857143, + "grad_norm": 0.24663765728473663, + "kl": 0.149932861328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.6259454195101267e-07, + "loss": -0.0177, + "reward": 0.7762341545894742, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7762341545894742, + "reward_after_std": 0.8821544609963894, + "reward_before_mean": 0.9183368273079395, + "reward_before_std": 0.9060549177229404, + "reward_change_max": 0.0, + "reward_change_mean": -0.1421026550233364, + "reward_change_min": -0.2564028147608042, + "reward_change_std": 0.10370448138564825, + "reward_std": 0.8821544758975506, + "rewards/cosine_scaled_reward": 0.07375173456966877, + "rewards/format_reward": 0.7708333507180214, + "step": 303 + }, + { + "advantage_max": 0.8954240456223488, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.7039155513048172, + "advantage_std": 0.6271420307457447, + "completion_length": 3038.6459350585938, + "epoch": 0.3474285714285714, + "grad_norm": 0.29657599329948425, + "kl": 0.173858642578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0226, + "reward": 0.5110467355698347, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5110467355698347, + "reward_after_std": 0.6271420381963253, + "reward_before_mean": 0.6330133527517319, + "reward_before_std": 0.6232540085911751, + "reward_change_max": 0.0, + "reward_change_mean": -0.12196661904454231, + "reward_change_min": -0.21976127475500107, + "reward_change_std": 0.08127523586153984, + "reward_std": 0.6271420530974865, + "rewards/cosine_scaled_reward": -0.08974333480000496, + "rewards/format_reward": 0.8125000037252903, + "step": 304 + }, + { + "advantage_max": 0.9175121039152145, + "advantage_mean": -8.381903671139668e-09, + "advantage_min": -0.6949810571968555, + "advantage_std": 0.5830536782741547, + "completion_length": 3196.8958740234375, + "epoch": 0.3485714285714286, + "grad_norm": 0.1946064531803131, + "kl": 0.188507080078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0192, + "reward": 0.47706126113189384, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.47706126113189384, + "reward_after_std": 0.5830536782741547, + "reward_before_mean": 0.595546220894903, + "reward_before_std": 0.5727699100971222, + "reward_change_max": 8.014589548110962e-05, + "reward_change_mean": -0.11848497577011585, + "reward_change_min": -0.18940737936645746, + "reward_change_std": 0.07213541446253657, + "reward_std": 0.5830537006258965, + "rewards/cosine_scaled_reward": -0.014726895838975906, + "rewards/format_reward": 0.6250000055879354, + "step": 305 + }, + { + "advantage_max": 1.2761941775679588, + "advantage_mean": -4.004687093051018e-08, + "advantage_min": -1.1722888499498367, + "advantage_std": 0.9643121734261513, + "completion_length": 3052.8334045410156, + "epoch": 0.3497142857142857, + "grad_norm": 0.893444299697876, + "kl": 0.17730712890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0269, + "reward": 0.9540768321603537, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9540768321603537, + "reward_after_std": 0.9643121659755707, + "reward_before_mean": 1.1099904502625577, + "reward_before_std": 0.9816011004149914, + "reward_change_max": 0.0, + "reward_change_mean": -0.15591357089579105, + "reward_change_min": -0.29588284343481064, + "reward_change_std": 0.11661555664613843, + "reward_std": 0.9643121883273125, + "rewards/cosine_scaled_reward": 0.16957851639017463, + "rewards/format_reward": 0.7708333507180214, + "step": 306 + }, + { + "advantage_max": 1.2773448526859283, + "advantage_mean": -1.552204287325054e-08, + "advantage_min": -1.1109627187252045, + "advantage_std": 0.8845279589295387, + "completion_length": 2951.166748046875, + "epoch": 0.35085714285714287, + "grad_norm": 0.8295795321464539, + "kl": 0.1986083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0341, + "reward": 0.4779685065150261, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4779685065150261, + "reward_after_std": 0.8845279365777969, + "reward_before_mean": 0.5894667021930218, + "reward_before_std": 0.8981221094727516, + "reward_change_max": 0.0001471191644668579, + "reward_change_mean": -0.11149819893762469, + "reward_change_min": -0.21481576841324568, + "reward_change_std": 0.08700747834518552, + "reward_std": 0.8845279365777969, + "rewards/cosine_scaled_reward": -0.08026665821671486, + "rewards/format_reward": 0.7500000204890966, + "step": 307 + }, + { + "advantage_max": 1.3166191279888153, + "advantage_mean": -2.4524828390326547e-08, + "advantage_min": -0.987498015165329, + "advantage_std": 0.9139077328145504, + "completion_length": 3266.2709350585938, + "epoch": 0.352, + "grad_norm": 0.409768670797348, + "kl": 0.284423828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0142, + "reward": 0.44744166173040867, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.44744166173040867, + "reward_after_std": 0.9139077588915825, + "reward_before_mean": 0.555396830663085, + "reward_before_std": 0.9306747391819954, + "reward_change_max": 0.0, + "reward_change_mean": -0.10795515077188611, + "reward_change_min": -0.22507263347506523, + "reward_change_std": 0.0874811913818121, + "reward_std": 0.9139078035950661, + "rewards/cosine_scaled_reward": -0.06605160096660256, + "rewards/format_reward": 0.6875000093132257, + "step": 308 + }, + { + "advantage_max": 1.3376531079411507, + "advantage_mean": -1.6453365614399473e-08, + "advantage_min": -1.0297049805521965, + "advantage_std": 0.9688749574124813, + "completion_length": 3037.020950317383, + "epoch": 0.35314285714285715, + "grad_norm": 0.8121221661567688, + "kl": 0.24346923828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0187, + "reward": 0.40955112874507904, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.40955112874507904, + "reward_after_std": 0.9688749276101589, + "reward_before_mean": 0.5138961069751531, + "reward_before_std": 0.99384855479002, + "reward_change_max": 0.0001594945788383484, + "reward_change_mean": -0.10434500779956579, + "reward_change_min": -0.22386514395475388, + "reward_change_std": 0.09169822558760643, + "reward_std": 0.9688749499619007, + "rewards/cosine_scaled_reward": -0.06596861826255918, + "rewards/format_reward": 0.6458333488553762, + "step": 309 + }, + { + "advantage_max": 1.017396479845047, + "advantage_mean": -1.1796752963366686e-08, + "advantage_min": -0.9639165177941322, + "advantage_std": 0.7473810315132141, + "completion_length": 2709.916732788086, + "epoch": 0.35428571428571426, + "grad_norm": 0.41897907853126526, + "kl": 0.2879638671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0421, + "reward": 0.46168462419882417, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46168462419882417, + "reward_after_std": 0.7473810240626335, + "reward_before_mean": 0.5762034375220537, + "reward_before_std": 0.7604282721877098, + "reward_change_max": 0.0, + "reward_change_mean": -0.11451879888772964, + "reward_change_min": -0.22137450985610485, + "reward_change_std": 0.08378444751724601, + "reward_std": 0.7473810315132141, + "rewards/cosine_scaled_reward": -0.07648163288831711, + "rewards/format_reward": 0.729166679084301, + "step": 310 + }, + { + "advantage_max": 1.0443845875561237, + "advantage_mean": -3.818422714130243e-08, + "advantage_min": -1.2661523073911667, + "advantage_std": 0.8653931841254234, + "completion_length": 2787.8126220703125, + "epoch": 0.3554285714285714, + "grad_norm": 1.1151634454727173, + "kl": 0.28533935546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.068, + "reward": 0.7599540562368929, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7599540562368929, + "reward_after_std": 0.8653931654989719, + "reward_before_mean": 0.9020670726895332, + "reward_before_std": 0.8923026695847511, + "reward_change_max": 1.7762184143066406e-05, + "reward_change_mean": -0.14211303647607565, + "reward_change_min": -0.26006680727005005, + "reward_change_std": 0.10649908194318414, + "reward_std": 0.865393191576004, + "rewards/cosine_scaled_reward": 0.076033522374928, + "rewards/format_reward": 0.7500000260770321, + "step": 311 + }, + { + "advantage_max": 0.9848646819591522, + "advantage_mean": -1.241763458725842e-08, + "advantage_min": -0.8820649348199368, + "advantage_std": 0.672922782599926, + "completion_length": 2488.7500915527344, + "epoch": 0.3565714285714286, + "grad_norm": 0.5053475499153137, + "kl": 0.272918701171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0146, + "reward": 1.1891271751374006, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.1891271751374006, + "reward_after_std": 0.6729227676987648, + "reward_before_mean": 1.3720673564821482, + "reward_before_std": 0.6550161205232143, + "reward_change_max": 0.0, + "reward_change_mean": -0.18294014502316713, + "reward_change_min": -0.2811252400279045, + "reward_change_std": 0.10329680563881993, + "reward_std": 0.6729227751493454, + "rewards/cosine_scaled_reward": 0.2172836670652032, + "rewards/format_reward": 0.9375000074505806, + "step": 312 + }, + { + "advantage_max": 1.3496510535478592, + "advantage_mean": -1.1175871006408045e-08, + "advantage_min": -1.0200391113758087, + "advantage_std": 0.8189485613256693, + "completion_length": 2915.5208892822266, + "epoch": 0.3577142857142857, + "grad_norm": 0.46193355321884155, + "kl": 0.361846923828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.02, + "reward": 0.9410210661590099, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9410210661590099, + "reward_after_std": 0.8189485538750887, + "reward_before_mean": 1.0941763501614332, + "reward_before_std": 0.7930347509682178, + "reward_change_max": 0.0, + "reward_change_mean": -0.15315525699406862, + "reward_change_min": -0.24925747141242027, + "reward_change_std": 0.09821537788957357, + "reward_std": 0.8189485874027014, + "rewards/cosine_scaled_reward": 0.1512548227328807, + "rewards/format_reward": 0.7916666865348816, + "step": 313 + }, + { + "advantage_max": 1.112821839749813, + "advantage_mean": -6.643434680153604e-08, + "advantage_min": -0.8538470044732094, + "advantage_std": 0.7443932630121708, + "completion_length": 2624.104248046875, + "epoch": 0.3588571428571429, + "grad_norm": 0.8362889289855957, + "kl": 0.273956298828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0064, + "reward": 1.070881293155253, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.070881293155253, + "reward_after_std": 0.744393277913332, + "reward_before_mean": 1.2406534266774543, + "reward_before_std": 0.7279466539621353, + "reward_change_max": 0.0, + "reward_change_mean": -0.16977215185761452, + "reward_change_min": -0.2630367986857891, + "reward_change_std": 0.1019047787413001, + "reward_std": 0.7443933002650738, + "rewards/cosine_scaled_reward": 0.1724100224673748, + "rewards/format_reward": 0.8958333432674408, + "step": 314 + }, + { + "advantage_max": 1.2765894085168839, + "advantage_mean": -2.359350637082258e-08, + "advantage_min": -1.1392202600836754, + "advantage_std": 0.9222354628145695, + "completion_length": 3017.5000610351562, + "epoch": 0.36, + "grad_norm": 0.67600417137146, + "kl": 0.384124755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0605, + "reward": 0.5837108045816422, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5837108045816422, + "reward_after_std": 0.9222354479134083, + "reward_before_mean": 0.7050627954304218, + "reward_before_std": 0.943105410784483, + "reward_change_max": 0.0, + "reward_change_mean": -0.12135198432952166, + "reward_change_min": -0.23846616130322218, + "reward_change_std": 0.09463956812396646, + "reward_std": 0.9222354926168919, + "rewards/cosine_scaled_reward": -0.012051953002810478, + "rewards/format_reward": 0.7291666828095913, + "step": 315 + }, + { + "advantage_max": 1.2091378793120384, + "advantage_mean": 1.8626449826975033e-09, + "advantage_min": -0.8695778399705887, + "advantage_std": 0.7904097400605679, + "completion_length": 3343.4584350585938, + "epoch": 0.36114285714285715, + "grad_norm": 0.8994598984718323, + "kl": 0.4453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0191, + "reward": 0.16371046472340822, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16371046472340822, + "reward_after_std": 0.790409728884697, + "reward_before_mean": 0.2475222758948803, + "reward_before_std": 0.7998642735183239, + "reward_change_max": 0.00028289854526519775, + "reward_change_mean": -0.08381181326694787, + "reward_change_min": -0.16807855293154716, + "reward_change_std": 0.06676852668169886, + "reward_std": 0.7904097698628902, + "rewards/cosine_scaled_reward": -0.17832219880074263, + "rewards/format_reward": 0.6041666883975267, + "step": 316 + }, + { + "advantage_max": 1.1376716941595078, + "advantage_mean": -1.5522043539384356e-08, + "advantage_min": -1.1290230266749859, + "advantage_std": 0.894180990755558, + "completion_length": 3227.0625915527344, + "epoch": 0.36228571428571427, + "grad_norm": 0.6548424959182739, + "kl": 0.5126953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0612, + "reward": 0.5353844849159941, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5353844849159941, + "reward_after_std": 0.8941810317337513, + "reward_before_mean": 0.6545494571328163, + "reward_before_std": 0.9253632761538029, + "reward_change_max": 9.782612323760986e-05, + "reward_change_mean": -0.11916495487093925, + "reward_change_min": -0.24563197046518326, + "reward_change_std": 0.0988409100100398, + "reward_std": 0.8941810466349125, + "rewards/cosine_scaled_reward": -0.03730860911309719, + "rewards/format_reward": 0.7291666828095913, + "step": 317 + }, + { + "advantage_max": 1.2796707078814507, + "advantage_mean": 6.208814573582799e-10, + "advantage_min": -1.1328086107969284, + "advantage_std": 0.889403197914362, + "completion_length": 2370.916763305664, + "epoch": 0.36342857142857143, + "grad_norm": 0.7421766519546509, + "kl": 0.215087890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1693137748017915e-07, + "loss": -0.0195, + "reward": 0.7891982682049274, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7891982682049274, + "reward_after_std": 0.8894031867384911, + "reward_before_mean": 0.9295575264841318, + "reward_before_std": 0.8947510085999966, + "reward_change_max": 0.000398002564907074, + "reward_change_mean": -0.14035920519381762, + "reward_change_min": -0.25510624423623085, + "reward_change_std": 0.09936856152489781, + "reward_std": 0.8894032202661037, + "rewards/cosine_scaled_reward": 0.04811206506565213, + "rewards/format_reward": 0.8333333544433117, + "step": 318 + }, + { + "advantage_max": 1.3221676275134087, + "advantage_mean": -2.7939677571531263e-08, + "advantage_min": -0.9854774251580238, + "advantage_std": 0.8276299238204956, + "completion_length": 3122.7917404174805, + "epoch": 0.36457142857142855, + "grad_norm": 0.7270869016647339, + "kl": 0.437164306640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0172, + "reward": 0.21932624652981758, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21932624652981758, + "reward_after_std": 0.8276299238204956, + "reward_before_mean": 0.30613830126821995, + "reward_before_std": 0.8316191211342812, + "reward_change_max": 0.00018126517534255981, + "reward_change_mean": -0.08681207243353128, + "reward_change_min": -0.17364921793341637, + "reward_change_std": 0.06764841824769974, + "reward_std": 0.8276299461722374, + "rewards/cosine_scaled_reward": -0.15943086054176092, + "rewards/format_reward": 0.6250000149011612, + "step": 319 + }, + { + "advantage_max": 0.8936111852526665, + "advantage_mean": -1.2107193581023523e-08, + "advantage_min": -0.8252501785755157, + "advantage_std": 0.6222108080983162, + "completion_length": 2639.7083740234375, + "epoch": 0.3657142857142857, + "grad_norm": 0.6508719325065613, + "kl": 0.24078369140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0466, + "reward": 0.7138673812150955, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7138673812150955, + "reward_after_std": 0.6222108118236065, + "reward_before_mean": 0.8545886836946011, + "reward_before_std": 0.6140736099332571, + "reward_change_max": 0.0, + "reward_change_mean": -0.14072128757834435, + "reward_change_min": -0.2278207279741764, + "reward_change_std": 0.08737712493166327, + "reward_std": 0.6222108118236065, + "rewards/cosine_scaled_reward": 0.00021099857985973358, + "rewards/format_reward": 0.8541666753590107, + "step": 320 + }, + { + "advantage_max": 1.420757032930851, + "advantage_mean": -3.290673195044391e-08, + "advantage_min": -0.9632410444319248, + "advantage_std": 0.9021714180707932, + "completion_length": 2570.7084045410156, + "epoch": 0.3668571428571429, + "grad_norm": 0.9943587779998779, + "kl": 0.2659912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0601, + "reward": 0.8740504225715995, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8740504225715995, + "reward_after_std": 0.9021714143455029, + "reward_before_mean": 1.0198165625333786, + "reward_before_std": 0.8919219821691513, + "reward_change_max": 0.0, + "reward_change_mean": -0.14576612878590822, + "reward_change_min": -0.251723725348711, + "reward_change_std": 0.09419048205018044, + "reward_std": 0.9021714515984058, + "rewards/cosine_scaled_reward": 0.1036582519300282, + "rewards/format_reward": 0.8125000111758709, + "step": 321 + }, + { + "advantage_max": 1.508301742374897, + "advantage_mean": -1.5211601978037947e-08, + "advantage_min": -1.0270988121628761, + "advantage_std": 0.931220531463623, + "completion_length": 3147.7083892822266, + "epoch": 0.368, + "grad_norm": 0.33928442001342773, + "kl": 0.297698974609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.033, + "reward": 0.5224417466670275, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5224417466670275, + "reward_after_std": 0.9312205240130424, + "reward_before_mean": 0.634037971496582, + "reward_before_std": 0.9298334568738937, + "reward_change_max": 0.00011686980724334717, + "reward_change_mean": -0.11159622902050614, + "reward_change_min": -0.20448446460068226, + "reward_change_std": 0.08062839088961482, + "reward_std": 0.9312205761671066, + "rewards/cosine_scaled_reward": -0.057981026358902454, + "rewards/format_reward": 0.7500000111758709, + "step": 322 + }, + { + "advantage_max": 0.8821052312850952, + "advantage_mean": -4.159907673884078e-08, + "advantage_min": -0.8327700607478619, + "advantage_std": 0.6381414122879505, + "completion_length": 3029.5000762939453, + "epoch": 0.36914285714285716, + "grad_norm": 0.443503737449646, + "kl": 0.2681427001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0181, + "reward": 0.4748626947402954, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4748626947402954, + "reward_after_std": 0.6381414271891117, + "reward_before_mean": 0.5938120167702436, + "reward_before_std": 0.6410843506455421, + "reward_change_max": 0.00010659545660018921, + "reward_change_mean": -0.11894936440512538, + "reward_change_min": -0.2031552977859974, + "reward_change_std": 0.07823430374264717, + "reward_std": 0.6381414458155632, + "rewards/cosine_scaled_reward": -0.07809399953112006, + "rewards/format_reward": 0.7500000111758709, + "step": 323 + }, + { + "advantage_max": 1.1430136933922768, + "advantage_mean": -1.800557053455165e-08, + "advantage_min": -0.9775372706353664, + "advantage_std": 0.7667670547962189, + "completion_length": 3201.3959045410156, + "epoch": 0.3702857142857143, + "grad_norm": 0.37577441334724426, + "kl": 0.375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0391, + "reward": 0.40068634756607935, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40068634756607935, + "reward_after_std": 0.7667670398950577, + "reward_before_mean": 0.5081015911418945, + "reward_before_std": 0.7719154208898544, + "reward_change_max": 0.0002768710255622864, + "reward_change_mean": -0.1074152598157525, + "reward_change_min": -0.19459333643317223, + "reward_change_std": 0.07792914099991322, + "reward_std": 0.7667670398950577, + "rewards/cosine_scaled_reward": -0.08969920873641968, + "rewards/format_reward": 0.6875000186264515, + "step": 324 + }, + { + "advantage_max": 1.1993984952569008, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.8039500899612904, + "advantage_std": 0.7809208258986473, + "completion_length": 3013.3750534057617, + "epoch": 0.37142857142857144, + "grad_norm": 0.4150097966194153, + "kl": 0.376617431640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0383, + "reward": 0.3445624615997076, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3445624615997076, + "reward_after_std": 0.780920822173357, + "reward_before_mean": 0.44553670287132263, + "reward_before_std": 0.78526845946908, + "reward_change_max": 0.000149555504322052, + "reward_change_mean": -0.10097424406558275, + "reward_change_min": -0.20076271332800388, + "reward_change_std": 0.0749687859788537, + "reward_std": 0.7809208557009697, + "rewards/cosine_scaled_reward": -0.0897316625341773, + "rewards/format_reward": 0.6250000093132257, + "step": 325 + }, + { + "advantage_max": 1.1256632208824158, + "advantage_mean": -5.463759156221215e-08, + "advantage_min": -0.97540083527565, + "advantage_std": 0.7677330262959003, + "completion_length": 2688.729263305664, + "epoch": 0.37257142857142855, + "grad_norm": 0.29752317070961, + "kl": 0.26910400390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0378, + "reward": 0.792833048501052, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.792833048501052, + "reward_after_std": 0.7677330300211906, + "reward_before_mean": 0.936364995315671, + "reward_before_std": 0.7656964734196663, + "reward_change_max": 0.0, + "reward_change_mean": -0.14353200886398554, + "reward_change_min": -0.2429923191666603, + "reward_change_std": 0.09230223717167974, + "reward_std": 0.7677330449223518, + "rewards/cosine_scaled_reward": 0.051515836268663406, + "rewards/format_reward": 0.8333333432674408, + "step": 326 + }, + { + "advantage_max": 1.245759092271328, + "advantage_mean": -5.401671110405459e-08, + "advantage_min": -1.0755857825279236, + "advantage_std": 0.8496082611382008, + "completion_length": 2883.9375534057617, + "epoch": 0.3737142857142857, + "grad_norm": 0.5994224548339844, + "kl": 0.33734130859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0484, + "reward": 0.8499644990079105, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8499644990079105, + "reward_after_std": 0.8496082648634911, + "reward_before_mean": 0.9967611813917756, + "reward_before_std": 0.8455248475074768, + "reward_change_max": 0.0, + "reward_change_mean": -0.14679674478247762, + "reward_change_min": -0.23957818932831287, + "reward_change_std": 0.09821966802701354, + "reward_std": 0.8496082872152328, + "rewards/cosine_scaled_reward": 0.1025472705514403, + "rewards/format_reward": 0.791666679084301, + "step": 327 + }, + { + "advantage_max": 1.5196349136531353, + "advantage_mean": -2.7318796225195285e-08, + "advantage_min": -0.9332630708813667, + "advantage_std": 0.9141910150647163, + "completion_length": 3421.2084045410156, + "epoch": 0.37485714285714283, + "grad_norm": 0.5353078842163086, + "kl": 0.3829345703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0259, + "reward": 0.2595358984544873, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2595358984544873, + "reward_after_std": 0.9141910225152969, + "reward_before_mean": 0.34659624099731445, + "reward_before_std": 0.917808011174202, + "reward_change_max": 0.0003191158175468445, + "reward_change_mean": -0.0870603434741497, + "reward_change_min": -0.1851830156520009, + "reward_change_std": 0.07253103656694293, + "reward_std": 0.9141910411417484, + "rewards/cosine_scaled_reward": -0.0767018897458911, + "rewards/format_reward": 0.5000000093132257, + "step": 328 + }, + { + "advantage_max": 1.1640001758933067, + "advantage_mean": -3.7252904094842165e-08, + "advantage_min": -0.9084269776940346, + "advantage_std": 0.7873306609690189, + "completion_length": 2226.8542308807373, + "epoch": 0.376, + "grad_norm": 0.33948275446891785, + "kl": 0.23187255859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0203, + "reward": 0.8589357230812311, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8589357230812311, + "reward_after_std": 0.7873306758701801, + "reward_before_mean": 1.0080676265060902, + "reward_before_std": 0.779254674911499, + "reward_change_max": 9.436160326004028e-05, + "reward_change_mean": -0.14913191087543964, + "reward_change_min": -0.250965254381299, + "reward_change_std": 0.09604426752775908, + "reward_std": 0.7873307056725025, + "rewards/cosine_scaled_reward": 0.0873671374283731, + "rewards/format_reward": 0.8333333395421505, + "step": 329 + }, + { + "advantage_max": 1.157850719988346, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.9563889093697071, + "advantage_std": 0.7745515741407871, + "completion_length": 2771.8125610351562, + "epoch": 0.37714285714285717, + "grad_norm": 0.6797903180122375, + "kl": 0.32012939453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0015, + "reward": 0.566857360303402, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.566857360303402, + "reward_after_std": 0.7745515815913677, + "reward_before_mean": 0.6887968610972166, + "reward_before_std": 0.7713383361697197, + "reward_change_max": 0.0, + "reward_change_mean": -0.12193948682397604, + "reward_change_min": -0.20816569216549397, + "reward_change_std": 0.0807517715729773, + "reward_std": 0.7745516300201416, + "rewards/cosine_scaled_reward": -0.05143492412753403, + "rewards/format_reward": 0.7916666846722364, + "step": 330 + }, + { + "advantage_max": 1.1063204184174538, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.5477707237005234, + "advantage_std": 0.6514084860682487, + "completion_length": 2844.1458492279053, + "epoch": 0.3782857142857143, + "grad_norm": 0.8791049718856812, + "kl": 0.452423095703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0217, + "reward": -0.13264837488532066, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.13264837488532066, + "reward_after_std": 0.6514084823429585, + "reward_before_mean": -0.07454398460686207, + "reward_before_std": 0.6529114097356796, + "reward_change_max": 0.00013102591037750244, + "reward_change_mean": -0.05810439004562795, + "reward_change_min": -0.11956090945750475, + "reward_change_std": 0.04494661255739629, + "reward_std": 0.6514085009694099, + "rewards/cosine_scaled_reward": -0.24560532672330737, + "rewards/format_reward": 0.4166666753590107, + "step": 331 + }, + { + "advantage_max": 1.065722979605198, + "advantage_mean": 3.3306690738754696e-16, + "advantage_min": -1.053912278264761, + "advantage_std": 0.8020082227885723, + "completion_length": 2842.166717529297, + "epoch": 0.37942857142857145, + "grad_norm": 0.8386214971542358, + "kl": 0.2808074951171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0607, + "reward": 0.7436191029846668, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7436191029846668, + "reward_after_std": 0.8020082227885723, + "reward_before_mean": 0.8833221234381199, + "reward_before_std": 0.8112938217818737, + "reward_change_max": 3.3855438232421875e-05, + "reward_change_mean": -0.13970299996435642, + "reward_change_min": -0.25867921486496925, + "reward_change_std": 0.09962275065481663, + "reward_std": 0.8020082265138626, + "rewards/cosine_scaled_reward": 0.0354110449552536, + "rewards/format_reward": 0.8125000223517418, + "step": 332 + }, + { + "advantage_max": 0.9847873151302338, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.936510294675827, + "advantage_std": 0.7264337800443172, + "completion_length": 2575.6250915527344, + "epoch": 0.38057142857142856, + "grad_norm": 0.5724052786827087, + "kl": 0.20947265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0837, + "reward": 0.7284279093146324, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7284279093146324, + "reward_after_std": 0.7264337874948978, + "reward_before_mean": 0.8686392232775688, + "reward_before_std": 0.732000857591629, + "reward_change_max": 6.556510925292969e-07, + "reward_change_mean": -0.14021131303161383, + "reward_change_min": -0.232681673951447, + "reward_change_std": 0.09069301281124353, + "reward_std": 0.7264338135719299, + "rewards/cosine_scaled_reward": 0.007236262783408165, + "rewards/format_reward": 0.854166679084301, + "step": 333 + }, + { + "advantage_max": 0.7661653384566307, + "advantage_mean": -9.31322685637781e-10, + "advantage_min": -0.93858827278018, + "advantage_std": 0.6230427138507366, + "completion_length": 3269.1875915527344, + "epoch": 0.38171428571428573, + "grad_norm": 0.38517114520072937, + "kl": 0.3935546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0393, + "reward": 0.34002362564206123, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.34002362564206123, + "reward_after_std": 0.6230427138507366, + "reward_before_mean": 0.44862550497055054, + "reward_before_std": 0.6393397077918053, + "reward_change_max": 4.176795482635498e-05, + "reward_change_mean": -0.1086018648929894, + "reward_change_min": -0.1885532783344388, + "reward_change_std": 0.07740613957867026, + "reward_std": 0.6230427287518978, + "rewards/cosine_scaled_reward": -0.14027060009539127, + "rewards/format_reward": 0.7291666902601719, + "step": 334 + }, + { + "advantage_max": 1.0034086257219315, + "advantage_mean": 1.4280279680978225e-08, + "advantage_min": -0.8655779659748077, + "advantage_std": 0.7271784171462059, + "completion_length": 2679.8541717529297, + "epoch": 0.38285714285714284, + "grad_norm": 0.5499774217605591, + "kl": 0.272064208984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0389, + "reward": 0.7168587098713033, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7168587098713033, + "reward_after_std": 0.7271784208714962, + "reward_before_mean": 0.8559580298606306, + "reward_before_std": 0.7216392774134874, + "reward_change_max": 0.00017315149307250977, + "reward_change_mean": -0.13909927383065224, + "reward_change_min": -0.2359151355922222, + "reward_change_std": 0.09495427086949348, + "reward_std": 0.7271784581243992, + "rewards/cosine_scaled_reward": 0.06339564686641097, + "rewards/format_reward": 0.7291666716337204, + "step": 335 + }, + { + "advantage_max": 1.1399415507912636, + "advantage_mean": -3.29067312843101e-08, + "advantage_min": -1.0761992260813713, + "advantage_std": 0.8604688420891762, + "completion_length": 3144.4584350585938, + "epoch": 0.384, + "grad_norm": 0.7100808024406433, + "kl": 0.298583984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0395, + "reward": 0.699290337972343, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.699290337972343, + "reward_after_std": 0.8604688420891762, + "reward_before_mean": 0.8338933810591698, + "reward_before_std": 0.8771174177527428, + "reward_change_max": 0.0, + "reward_change_mean": -0.1346030319109559, + "reward_change_min": -0.24664145708084106, + "reward_change_std": 0.09776118211448193, + "reward_std": 0.860468864440918, + "rewards/cosine_scaled_reward": 0.0002800021320581436, + "rewards/format_reward": 0.833333358168602, + "step": 336 + }, + { + "advantage_max": 1.0692218244075775, + "advantage_mean": -2.8560560083601416e-08, + "advantage_min": -1.02787471935153, + "advantage_std": 0.8040499426424503, + "completion_length": 3072.0000610351562, + "epoch": 0.3851428571428571, + "grad_norm": 0.36398187279701233, + "kl": 0.3551025390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0351, + "reward": 0.6912481244653463, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6912481244653463, + "reward_after_std": 0.8040499426424503, + "reward_before_mean": 0.8266817089170218, + "reward_before_std": 0.8181922435760498, + "reward_change_max": 0.0, + "reward_change_mean": -0.13543362356722355, + "reward_change_min": -0.24755028635263443, + "reward_change_std": 0.09691136796027422, + "reward_std": 0.8040499612689018, + "rewards/cosine_scaled_reward": -0.0033258050680160522, + "rewards/format_reward": 0.8333333507180214, + "step": 337 + }, + { + "advantage_max": 1.2663331478834152, + "advantage_mean": -1.676380706472358e-08, + "advantage_min": -0.9755394570529461, + "advantage_std": 0.8277748636901379, + "completion_length": 2442.7708892822266, + "epoch": 0.3862857142857143, + "grad_norm": 1.02471125125885, + "kl": 0.27752685546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.036, + "reward": 0.5828519398346543, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5828519398346543, + "reward_after_std": 0.8277748636901379, + "reward_before_mean": 0.7042157929390669, + "reward_before_std": 0.8291206769645214, + "reward_change_max": 0.00014644861221313477, + "reward_change_mean": -0.12136386055499315, + "reward_change_min": -0.21317408978939056, + "reward_change_std": 0.08293768810108304, + "reward_std": 0.8277748972177505, + "rewards/cosine_scaled_reward": -0.022892115055583417, + "rewards/format_reward": 0.7500000093132257, + "step": 338 + }, + { + "advantage_max": 1.0612557902932167, + "advantage_mean": 1.924733389335742e-08, + "advantage_min": -0.9199167042970657, + "advantage_std": 0.7026827409863472, + "completion_length": 3077.7709045410156, + "epoch": 0.38742857142857146, + "grad_norm": 0.5753820538520813, + "kl": 0.400177001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0287, + "reward": 0.3159185843542218, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3159185843542218, + "reward_after_std": 0.7026827484369278, + "reward_before_mean": 0.41648441832512617, + "reward_before_std": 0.7069019302725792, + "reward_change_max": 0.00016146153211593628, + "reward_change_mean": -0.10056579299271107, + "reward_change_min": -0.1769854985177517, + "reward_change_std": 0.07176952017471194, + "reward_std": 0.7026827856898308, + "rewards/cosine_scaled_reward": -0.08342447318136692, + "rewards/format_reward": 0.583333345130086, + "step": 339 + }, + { + "advantage_max": 1.4452608078718185, + "advantage_mean": -2.0799538091864633e-08, + "advantage_min": -0.8381898179650307, + "advantage_std": 0.8512439541518688, + "completion_length": 2903.0209197998047, + "epoch": 0.38857142857142857, + "grad_norm": 0.4870928227901459, + "kl": 0.34814453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0612, + "reward": 0.5209975503385067, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5209975503385067, + "reward_after_std": 0.851243931800127, + "reward_before_mean": 0.6333642583340406, + "reward_before_std": 0.8363424316048622, + "reward_change_max": 0.0, + "reward_change_mean": -0.1123666986823082, + "reward_change_min": -0.20301656797528267, + "reward_change_std": 0.07352043082937598, + "reward_std": 0.8512439504265785, + "rewards/cosine_scaled_reward": -0.09998455084860325, + "rewards/format_reward": 0.833333358168602, + "step": 340 + }, + { + "advantage_max": 1.1241364851593971, + "advantage_mean": -5.898376231883162e-09, + "advantage_min": -0.9545042403042316, + "advantage_std": 0.8547794707119465, + "completion_length": 2718.6875610351562, + "epoch": 0.38971428571428574, + "grad_norm": 0.7362974286079407, + "kl": 0.2999114990234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0321, + "reward": 0.787535191513598, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.787535191513598, + "reward_after_std": 0.8547794744372368, + "reward_before_mean": 0.9302486808373942, + "reward_before_std": 0.8745679631829262, + "reward_change_max": 0.0002143457531929016, + "reward_change_mean": -0.1427134550176561, + "reward_change_min": -0.26017661951482296, + "reward_change_std": 0.10592424450442195, + "reward_std": 0.854779489338398, + "rewards/cosine_scaled_reward": 0.10054099000990391, + "rewards/format_reward": 0.7291666753590107, + "step": 341 + }, + { + "advantage_max": 1.2140427753329277, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -1.0092006474733353, + "advantage_std": 0.8508938550949097, + "completion_length": 3101.4584350585938, + "epoch": 0.39085714285714285, + "grad_norm": 0.7306240797042847, + "kl": 0.5538330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0329, + "reward": 0.34557378385216, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.34557378385216, + "reward_after_std": 0.8508938923478127, + "reward_before_mean": 0.44565768260508776, + "reward_before_std": 0.8663280420005322, + "reward_change_max": 0.0, + "reward_change_mean": -0.10008389223366976, + "reward_change_min": -0.20153771713376045, + "reward_change_std": 0.0821488774381578, + "reward_std": 0.8508939146995544, + "rewards/cosine_scaled_reward": -0.08967117220163345, + "rewards/format_reward": 0.6250000149011612, + "step": 342 + }, + { + "advantage_max": 1.0857341140508652, + "advantage_mean": 1.2417635419925688e-08, + "advantage_min": -0.8121387511491776, + "advantage_std": 0.7338540963828564, + "completion_length": 3103.625030517578, + "epoch": 0.392, + "grad_norm": 0.8223617076873779, + "kl": 0.365509033203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0477, + "reward": 0.6110813869163394, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6110813869163394, + "reward_after_std": 0.733854103833437, + "reward_before_mean": 0.7386172357946634, + "reward_before_std": 0.7282578460872173, + "reward_change_max": 0.00015385448932647705, + "reward_change_mean": -0.12753579672425985, + "reward_change_min": -0.22481666505336761, + "reward_change_std": 0.08690722612664104, + "reward_std": 0.7338541373610497, + "rewards/cosine_scaled_reward": 0.0463919285684824, + "rewards/format_reward": 0.6458333488553762, + "step": 343 + }, + { + "advantage_max": 0.8111766427755356, + "advantage_mean": -4.718701207551135e-08, + "advantage_min": -1.0656887590885162, + "advantage_std": 0.7146199978888035, + "completion_length": 2326.5834045410156, + "epoch": 0.3931428571428571, + "grad_norm": 0.4540533125400543, + "kl": 0.2355194091796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0034, + "reward": 1.0658331364393234, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.0658331364393234, + "reward_after_std": 0.7146199941635132, + "reward_before_mean": 1.2407823614776134, + "reward_before_std": 0.7263977602124214, + "reward_change_max": 0.00012203305959701538, + "reward_change_mean": -0.17494926508516073, + "reward_change_min": -0.2789658457040787, + "reward_change_std": 0.11326393391937017, + "reward_std": 0.7146199978888035, + "rewards/cosine_scaled_reward": 0.20372450165450573, + "rewards/format_reward": 0.833333358168602, + "step": 344 + }, + { + "advantage_max": 1.0266596004366875, + "advantage_mean": 5.277494247168946e-09, + "advantage_min": -1.0351220294833183, + "advantage_std": 0.7456030026078224, + "completion_length": 2850.041748046875, + "epoch": 0.3942857142857143, + "grad_norm": 1.002143144607544, + "kl": 0.340423583984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0068, + "reward": 0.8741661226376891, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8741661226376891, + "reward_after_std": 0.7456029951572418, + "reward_before_mean": 1.02674968726933, + "reward_before_std": 0.7466263473033905, + "reward_change_max": 0.0, + "reward_change_mean": -0.1525835506618023, + "reward_change_min": -0.2507946826517582, + "reward_change_std": 0.09826913708820939, + "reward_std": 0.745603010058403, + "rewards/cosine_scaled_reward": 0.10712484084069729, + "rewards/format_reward": 0.8125000074505806, + "step": 345 + }, + { + "advantage_max": 0.791664257645607, + "advantage_mean": -3.414849514271623e-09, + "advantage_min": -0.6630604565143585, + "advantage_std": 0.5060148164629936, + "completion_length": 3213.3958740234375, + "epoch": 0.3954285714285714, + "grad_norm": 1.1361817121505737, + "kl": 0.53509521484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.359691059183761e-07, + "loss": 0.034, + "reward": 0.22787731047719717, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22787731047719717, + "reward_after_std": 0.506014809012413, + "reward_before_mean": 0.3254447274375707, + "reward_before_std": 0.49645309150218964, + "reward_change_max": 7.732957601547241e-05, + "reward_change_mean": -0.09756739297881722, + "reward_change_min": -0.15314494539052248, + "reward_change_std": 0.0625298055820167, + "reward_std": 0.5060148164629936, + "rewards/cosine_scaled_reward": -0.11852765083312988, + "rewards/format_reward": 0.562500013038516, + "step": 346 + }, + { + "advantage_max": 0.7966568246483803, + "advantage_mean": -1.986821579480491e-08, + "advantage_min": -0.7594610974192619, + "advantage_std": 0.5565367415547371, + "completion_length": 3140.479248046875, + "epoch": 0.3965714285714286, + "grad_norm": 0.8332196474075317, + "kl": 0.4498291015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0203, + "reward": 0.19970552437007427, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19970552437007427, + "reward_after_std": 0.5565367415547371, + "reward_before_mean": 0.29436481976881623, + "reward_before_std": 0.5582841373980045, + "reward_change_max": 0.0, + "reward_change_mean": -0.09465931123122573, + "reward_change_min": -0.16640873905271292, + "reward_change_std": 0.06378639955073595, + "reward_std": 0.5565367564558983, + "rewards/cosine_scaled_reward": -0.22781759407371283, + "rewards/format_reward": 0.7500000260770321, + "step": 347 + }, + { + "advantage_max": 1.0456115677952766, + "advantage_mean": -2.359350537162186e-08, + "advantage_min": -0.7629845626652241, + "advantage_std": 0.6946265436708927, + "completion_length": 2854.5000762939453, + "epoch": 0.3977142857142857, + "grad_norm": 0.5373441576957703, + "kl": 0.4082489013671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0432, + "reward": 0.27077578753232956, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.27077578753232956, + "reward_after_std": 0.6946265511214733, + "reward_before_mean": 0.36715321242809296, + "reward_before_std": 0.6996577307581902, + "reward_change_max": 6.26593828201294e-05, + "reward_change_mean": -0.09637744631618261, + "reward_change_min": -0.18722706474363804, + "reward_change_std": 0.07272940431721509, + "reward_std": 0.6946265697479248, + "rewards/cosine_scaled_reward": -0.13934006914496422, + "rewards/format_reward": 0.6458333358168602, + "step": 348 + }, + { + "advantage_max": 1.3121556118130684, + "advantage_mean": -1.1486312123665243e-08, + "advantage_min": -0.8878070712089539, + "advantage_std": 0.7979016825556755, + "completion_length": 3066.3125915527344, + "epoch": 0.39885714285714285, + "grad_norm": 0.48574212193489075, + "kl": 0.22210693359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0252, + "reward": 0.4082105464185588, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4082105464185588, + "reward_after_std": 0.7979016974568367, + "reward_before_mean": 0.5126420352607965, + "reward_before_std": 0.7913182973861694, + "reward_change_max": 0.0002217814326286316, + "reward_change_mean": -0.10443148808553815, + "reward_change_min": -0.18364053964614868, + "reward_change_std": 0.07094059698283672, + "reward_std": 0.7979017086327076, + "rewards/cosine_scaled_reward": -0.1290956644807011, + "rewards/format_reward": 0.7708333469927311, + "step": 349 + }, + { + "advantage_max": 1.0644586831331253, + "advantage_mean": -3.476937715518602e-08, + "advantage_min": -0.9515664502978325, + "advantage_std": 0.7851700074970722, + "completion_length": 2800.104248046875, + "epoch": 0.4, + "grad_norm": 0.5524913668632507, + "kl": 0.424530029296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0579, + "reward": 0.7616480272263288, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7616480272263288, + "reward_after_std": 0.7851700149476528, + "reward_before_mean": 0.9033026099205017, + "reward_before_std": 0.7952011059969664, + "reward_change_max": 0.0, + "reward_change_mean": -0.14165459107607603, + "reward_change_min": -0.25414634868502617, + "reward_change_std": 0.09784402325749397, + "reward_std": 0.7851700223982334, + "rewards/cosine_scaled_reward": 0.08706796666956507, + "rewards/format_reward": 0.7291666716337204, + "step": 350 + }, + { + "advantage_max": 1.0468278601765633, + "advantage_mean": -5.8518101919702303e-08, + "advantage_min": -0.871786467730999, + "advantage_std": 0.7094918079674244, + "completion_length": 2834.916732788086, + "epoch": 0.40114285714285713, + "grad_norm": 0.26139718294143677, + "kl": 0.2466888427734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0272, + "reward": 0.857110857963562, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.857110857963562, + "reward_after_std": 0.7094917967915535, + "reward_before_mean": 1.0084098004736006, + "reward_before_std": 0.6962695913389325, + "reward_change_max": 0.0, + "reward_change_mean": -0.15129899140447378, + "reward_change_min": -0.24284421186894178, + "reward_change_std": 0.0927650211378932, + "reward_std": 0.7094918265938759, + "rewards/cosine_scaled_reward": 0.11878822930157185, + "rewards/format_reward": 0.7708333488553762, + "step": 351 + }, + { + "advantage_max": 1.302818451076746, + "advantage_mean": -2.0489097474207796e-08, + "advantage_min": -1.003556728363037, + "advantage_std": 0.826737018302083, + "completion_length": 2807.3751220703125, + "epoch": 0.4022857142857143, + "grad_norm": 1.261825442314148, + "kl": 0.3189697265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0777, + "reward": 0.48880088748410344, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.48880088748410344, + "reward_after_std": 0.8267370592802763, + "reward_before_mean": 0.6008866727352142, + "reward_before_std": 0.8274815808981657, + "reward_change_max": 0.0, + "reward_change_mean": -0.11208577593788505, + "reward_change_min": -0.19520364236086607, + "reward_change_std": 0.07618892658501863, + "reward_std": 0.8267370741814375, + "rewards/cosine_scaled_reward": 0.008776647970080376, + "rewards/format_reward": 0.5833333469927311, + "step": 352 + }, + { + "advantage_max": 1.1022524684667587, + "advantage_mean": -3.725290242950763e-09, + "advantage_min": -0.8354335837066174, + "advantage_std": 0.7104697525501251, + "completion_length": 2324.833450317383, + "epoch": 0.4034285714285714, + "grad_norm": 0.7030723094940186, + "kl": 0.220916748046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0001, + "reward": 0.7568352874368429, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7568352874368429, + "reward_after_std": 0.7104697562754154, + "reward_before_mean": 0.8968926519155502, + "reward_before_std": 0.6982098333537579, + "reward_change_max": 0.0, + "reward_change_mean": -0.1400573654100299, + "reward_change_min": -0.24266061559319496, + "reward_change_std": 0.08802814176306129, + "reward_std": 0.7104697898030281, + "rewards/cosine_scaled_reward": 0.06302965292707086, + "rewards/format_reward": 0.770833358168602, + "step": 353 + }, + { + "advantage_max": 0.791136309504509, + "advantage_mean": -1.4901161526914564e-08, + "advantage_min": -1.0148145444691181, + "advantage_std": 0.6643821857869625, + "completion_length": 2448.854217529297, + "epoch": 0.4045714285714286, + "grad_norm": 0.360441654920578, + "kl": 0.2796478271484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0113, + "reward": 0.7733204569667578, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7733204569667578, + "reward_after_std": 0.6643821746110916, + "reward_before_mean": 0.9214807010721415, + "reward_before_std": 0.675186101347208, + "reward_change_max": 0.0, + "reward_change_mean": -0.14816024899482727, + "reward_change_min": -0.23208301700651646, + "reward_change_std": 0.0934093315154314, + "reward_std": 0.6643821746110916, + "rewards/cosine_scaled_reward": 0.12740701530128717, + "rewards/format_reward": 0.6666666846722364, + "step": 354 + }, + { + "advantage_max": 1.4501032158732414, + "advantage_mean": -1.98682153507157e-08, + "advantage_min": -1.158358946442604, + "advantage_std": 0.9547172710299492, + "completion_length": 2681.2084045410156, + "epoch": 0.4057142857142857, + "grad_norm": 0.4072975218296051, + "kl": 0.275482177734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0327, + "reward": 0.6503871716558933, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6503871716558933, + "reward_after_std": 0.9547172412276268, + "reward_before_mean": 0.7749169375747442, + "reward_before_std": 0.9606028571724892, + "reward_change_max": 9.2335045337677e-05, + "reward_change_mean": -0.1245297659188509, + "reward_change_min": -0.22305811289697886, + "reward_change_std": 0.09232740569859743, + "reward_std": 0.9547172635793686, + "rewards/cosine_scaled_reward": 0.012458451557904482, + "rewards/format_reward": 0.7500000074505806, + "step": 355 + }, + { + "advantage_max": 0.966142512857914, + "advantage_mean": -8.071462387349015e-09, + "advantage_min": -1.140071079134941, + "advantage_std": 0.7658510934561491, + "completion_length": 2957.479202270508, + "epoch": 0.40685714285714286, + "grad_norm": 0.5873293280601501, + "kl": 0.2852935791015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0161, + "reward": 0.6667291913181543, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6667291913181543, + "reward_after_std": 0.7658511009067297, + "reward_before_mean": 0.8018401209264994, + "reward_before_std": 0.7819232614710927, + "reward_change_max": 0.00015025585889816284, + "reward_change_mean": -0.13511092774569988, + "reward_change_min": -0.23053260147571564, + "reward_change_std": 0.09672127198427916, + "reward_std": 0.7658511158078909, + "rewards/cosine_scaled_reward": 0.0155033846385777, + "rewards/format_reward": 0.7708333469927311, + "step": 356 + }, + { + "advantage_max": 1.0412839315831661, + "advantage_mean": -2.235174231812742e-08, + "advantage_min": -1.0026082322001457, + "advantage_std": 0.763444721698761, + "completion_length": 3193.3541870117188, + "epoch": 0.408, + "grad_norm": 0.664465606212616, + "kl": 0.3111572265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0059, + "reward": 0.4894076222553849, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4894076222553849, + "reward_after_std": 0.7634447365999222, + "reward_before_mean": 0.6064597554504871, + "reward_before_std": 0.775292593985796, + "reward_change_max": 0.0004100501537322998, + "reward_change_mean": -0.11705213971436024, + "reward_change_min": -0.22646293975412846, + "reward_change_std": 0.0875883437693119, + "reward_std": 0.7634447701275349, + "rewards/cosine_scaled_reward": -0.08218680415302515, + "rewards/format_reward": 0.770833358168602, + "step": 357 + }, + { + "advantage_max": 1.1054131537675858, + "advantage_mean": 1.1486311679576033e-08, + "advantage_min": -0.9517468959093094, + "advantage_std": 0.7820233516395092, + "completion_length": 2913.7084045410156, + "epoch": 0.40914285714285714, + "grad_norm": 0.9437838196754456, + "kl": 0.4264373779296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0231, + "reward": 0.6220018891617656, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6220018891617656, + "reward_after_std": 0.7820233516395092, + "reward_before_mean": 0.7498711105436087, + "reward_before_std": 0.787973016500473, + "reward_change_max": 0.0, + "reward_change_mean": -0.12786916503682733, + "reward_change_min": -0.22747263871133327, + "reward_change_std": 0.09117141552269459, + "reward_std": 0.7820233665406704, + "rewards/cosine_scaled_reward": -6.447359919548035e-05, + "rewards/format_reward": 0.7500000223517418, + "step": 358 + }, + { + "advantage_max": 0.9187195301055908, + "advantage_mean": -9.934106759423855e-09, + "advantage_min": -0.8075991421937943, + "advantage_std": 0.6383391171693802, + "completion_length": 2860.8334350585938, + "epoch": 0.4102857142857143, + "grad_norm": 0.2313191443681717, + "kl": 0.248382568359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0254, + "reward": 0.6255717375315726, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6255717375315726, + "reward_after_std": 0.6383391208946705, + "reward_before_mean": 0.7577385175973177, + "reward_before_std": 0.6361598484218121, + "reward_change_max": 0.0001360177993774414, + "reward_change_mean": -0.1321667837910354, + "reward_change_min": -0.2153165964409709, + "reward_change_std": 0.0826862514950335, + "reward_std": 0.6383391432464123, + "rewards/cosine_scaled_reward": -0.027380744460970163, + "rewards/format_reward": 0.8125000149011612, + "step": 359 + }, + { + "advantage_max": 1.344765804708004, + "advantage_mean": -4.47034849138106e-08, + "advantage_min": -1.3960507363080978, + "advantage_std": 1.0359720475971699, + "completion_length": 3082.541778564453, + "epoch": 0.4114285714285714, + "grad_norm": 0.9665197134017944, + "kl": 0.27655029296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0287, + "reward": 0.9328369447030127, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9328369447030127, + "reward_after_std": 1.0359720401465893, + "reward_before_mean": 1.0855822588782758, + "reward_before_std": 1.061440672725439, + "reward_change_max": 0.0002325400710105896, + "reward_change_mean": -0.15274538099765778, + "reward_change_min": -0.2875901088118553, + "reward_change_std": 0.1183041324838996, + "reward_std": 1.0359720438718796, + "rewards/cosine_scaled_reward": 0.1573744739871472, + "rewards/format_reward": 0.770833358168602, + "step": 360 + }, + { + "advantage_max": 1.1589705422520638, + "advantage_mean": 1.2107193525512372e-08, + "advantage_min": -1.2056332975625992, + "advantage_std": 0.8765405416488647, + "completion_length": 3018.666717529297, + "epoch": 0.4125714285714286, + "grad_norm": 1.0897059440612793, + "kl": 0.3499755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0639, + "reward": 0.4580980301834643, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4580980301834643, + "reward_after_std": 0.8765405118465424, + "reward_before_mean": 0.5701448558829725, + "reward_before_std": 0.9030290320515633, + "reward_change_max": 0.0002184361219406128, + "reward_change_mean": -0.1120468145236373, + "reward_change_min": -0.22999682277441025, + "reward_change_std": 0.09206069586798549, + "reward_std": 0.876540519297123, + "rewards/cosine_scaled_reward": -0.03784424933837727, + "rewards/format_reward": 0.6458333469927311, + "step": 361 + }, + { + "advantage_max": 0.8124096095561981, + "advantage_mean": -3.8494666620980666e-08, + "advantage_min": -0.8067378401756287, + "advantage_std": 0.6147194467484951, + "completion_length": 1966.5416717529297, + "epoch": 0.4137142857142857, + "grad_norm": 0.4702957272529602, + "kl": 0.219390869140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0247, + "reward": 0.5928452904336154, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5928452904336154, + "reward_after_std": 0.6147194467484951, + "reward_before_mean": 0.7235520584508777, + "reward_before_std": 0.6164083369076252, + "reward_change_max": 0.0007086694240570068, + "reward_change_mean": -0.1307067759335041, + "reward_change_min": -0.22446970269083977, + "reward_change_std": 0.08686780696734786, + "reward_std": 0.6147194840013981, + "rewards/cosine_scaled_reward": -0.034057313576340675, + "rewards/format_reward": 0.7916666902601719, + "step": 362 + }, + { + "advantage_max": 1.1513478010892868, + "advantage_mean": -1.2417634920325327e-08, + "advantage_min": -1.4183903858065605, + "advantage_std": 0.934955932199955, + "completion_length": 2242.1042251586914, + "epoch": 0.41485714285714287, + "grad_norm": 0.4347902536392212, + "kl": 0.3087310791015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0381, + "reward": 0.7093102987855673, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7093102987855673, + "reward_after_std": 0.934955932199955, + "reward_before_mean": 0.8446380607783794, + "reward_before_std": 0.9661568701267242, + "reward_change_max": 0.00010865926742553711, + "reward_change_mean": -0.13532774476334453, + "reward_change_min": -0.2547372132539749, + "reward_change_std": 0.10575387300923467, + "reward_std": 0.9349559731781483, + "rewards/cosine_scaled_reward": 0.0681523447856307, + "rewards/format_reward": 0.708333358168602, + "step": 363 + }, + { + "advantage_max": 0.9506546705961227, + "advantage_mean": -2.173086155465853e-09, + "advantage_min": -0.6697430238127708, + "advantage_std": 0.5997806861996651, + "completion_length": 2946.916748046875, + "epoch": 0.416, + "grad_norm": 0.42780113220214844, + "kl": 0.3773040771484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0328, + "reward": 0.20062502287328243, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20062502287328243, + "reward_after_std": 0.5997806712985039, + "reward_before_mean": 0.2927093543112278, + "reward_before_std": 0.5955142378807068, + "reward_change_max": 0.0, + "reward_change_mean": -0.09208432491868734, + "reward_change_min": -0.16610389854758978, + "reward_change_std": 0.06054901331663132, + "reward_std": 0.5997806787490845, + "rewards/cosine_scaled_reward": -0.1453120014630258, + "rewards/format_reward": 0.5833333414047956, + "step": 364 + }, + { + "advantage_max": 0.8594930022954941, + "advantage_mean": -3.523503833147146e-08, + "advantage_min": -0.6349957399070263, + "advantage_std": 0.5609443206340075, + "completion_length": 3024.7500915527344, + "epoch": 0.41714285714285715, + "grad_norm": 0.6753454804420471, + "kl": 0.3546142578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0101, + "reward": 0.6145913098007441, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6145913098007441, + "reward_after_std": 0.5609443113207817, + "reward_before_mean": 0.7464557122439146, + "reward_before_std": 0.5405437704175711, + "reward_change_max": 0.0001978650689125061, + "reward_change_mean": -0.13186442031292245, + "reward_change_min": -0.20231307670474052, + "reward_change_std": 0.07916592317633331, + "reward_std": 0.5609443187713623, + "rewards/cosine_scaled_reward": 0.019061174243688583, + "rewards/format_reward": 0.708333345130086, + "step": 365 + }, + { + "advantage_max": 1.1641277149319649, + "advantage_mean": -6.208817460162663e-09, + "advantage_min": -1.2483574375510216, + "advantage_std": 0.9762383066117764, + "completion_length": 2363.6250534057617, + "epoch": 0.41828571428571426, + "grad_norm": 1.7422231435775757, + "kl": 0.303375244140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.829615010283344e-07, + "loss": 0.066, + "reward": 0.6917076036334038, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6917076036334038, + "reward_after_std": 0.9762383252382278, + "reward_before_mean": 0.8244628701359034, + "reward_before_std": 1.0097406208515167, + "reward_change_max": 0.0004345700144767761, + "reward_change_mean": -0.13275525951758027, + "reward_change_min": -0.2532838536426425, + "reward_change_std": 0.11263198498636484, + "reward_std": 0.9762383289635181, + "rewards/cosine_scaled_reward": 0.1309814564883709, + "rewards/format_reward": 0.562500013038516, + "step": 366 + }, + { + "advantage_max": 1.122251644730568, + "advantage_mean": -2.599942261483079e-08, + "advantage_min": -0.8712231889367104, + "advantage_std": 0.7956646084785461, + "completion_length": 3046.1250610351562, + "epoch": 0.41942857142857143, + "grad_norm": 0.8266307711601257, + "kl": 0.40216064453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0425, + "reward": 0.48944999772356823, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.48944999772356823, + "reward_after_std": 0.7956646457314491, + "reward_before_mean": 0.6048614283499774, + "reward_before_std": 0.804796252399683, + "reward_change_max": 0.00014813244342803955, + "reward_change_mean": -0.1154114343225956, + "reward_change_min": -0.2323084594681859, + "reward_change_std": 0.09151481185108423, + "reward_std": 0.7956646531820297, + "rewards/cosine_scaled_reward": -0.010069283656775951, + "rewards/format_reward": 0.6250000074505806, + "step": 367 + }, + { + "advantage_max": 0.7003500536084175, + "advantage_mean": 2.173085961176824e-09, + "advantage_min": -0.5562456995248795, + "advantage_std": 0.4822025038301945, + "completion_length": 3284.3334045410156, + "epoch": 0.4205714285714286, + "grad_norm": 0.9353283047676086, + "kl": 0.426483154296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.0277, + "reward": 0.2110738381743431, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2110738381743431, + "reward_after_std": 0.48220251128077507, + "reward_before_mean": 0.308893536683172, + "reward_before_std": 0.4765991158783436, + "reward_change_max": 0.00024100393056869507, + "reward_change_mean": -0.09781968453899026, + "reward_change_min": -0.16675298567861319, + "reward_change_std": 0.06492633419111371, + "reward_std": 0.48220251500606537, + "rewards/cosine_scaled_reward": -0.14763657189905643, + "rewards/format_reward": 0.6041666679084301, + "step": 368 + }, + { + "advantage_max": 1.342196799814701, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -1.038889728486538, + "advantage_std": 0.8970093280076981, + "completion_length": 2942.5208892822266, + "epoch": 0.4217142857142857, + "grad_norm": 0.43765193223953247, + "kl": 0.38726806640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0149, + "reward": 0.7111932290717959, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7111932290717959, + "reward_after_std": 0.8970093317329884, + "reward_before_mean": 0.8434848883189261, + "reward_before_std": 0.8970467895269394, + "reward_change_max": 0.00014856457710266113, + "reward_change_mean": -0.13229163456708193, + "reward_change_min": -0.2422568015754223, + "reward_change_std": 0.09590382222086191, + "reward_std": 0.8970093578100204, + "rewards/cosine_scaled_reward": 0.07799242623150349, + "rewards/format_reward": 0.6875000149011612, + "step": 369 + }, + { + "advantage_max": 0.7304398790001869, + "advantage_mean": 9.313225579621331e-09, + "advantage_min": -0.5804145596921444, + "advantage_std": 0.4967747814953327, + "completion_length": 3077.9584045410156, + "epoch": 0.4228571428571429, + "grad_norm": 1.005544662475586, + "kl": 0.44821929931640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0229, + "reward": 0.4255966132041067, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4255966132041067, + "reward_after_std": 0.4967747703194618, + "reward_before_mean": 0.5427881754003465, + "reward_before_std": 0.47967731952667236, + "reward_change_max": 0.0, + "reward_change_mean": -0.11719155265018344, + "reward_change_min": -0.18624725379049778, + "reward_change_std": 0.07463060226291418, + "reward_std": 0.496774785220623, + "rewards/cosine_scaled_reward": -0.06193925626575947, + "rewards/format_reward": 0.6666666753590107, + "step": 370 + }, + { + "advantage_max": 0.8285253420472145, + "advantage_mean": -4.346172255420555e-08, + "advantage_min": -0.7580652683973312, + "advantage_std": 0.5659327665343881, + "completion_length": 2190.5417556762695, + "epoch": 0.424, + "grad_norm": 0.43017879128456116, + "kl": 0.3216705322265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0227, + "reward": 0.9253095942549407, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9253095942549407, + "reward_after_std": 0.5659327721223235, + "reward_before_mean": 1.0867008964996785, + "reward_before_std": 0.5481990473344922, + "reward_change_max": 7.56382942199707e-05, + "reward_change_mean": -0.16139132343232632, + "reward_change_min": -0.2438336256891489, + "reward_change_std": 0.09487550053745508, + "reward_std": 0.5659327721223235, + "rewards/cosine_scaled_reward": 0.19960043695755303, + "rewards/format_reward": 0.6875000149011612, + "step": 371 + }, + { + "advantage_max": 1.2706674709916115, + "advantage_mean": -2.1730867105773655e-09, + "advantage_min": -0.9622760713100433, + "advantage_std": 0.8690132163465023, + "completion_length": 3044.3750610351562, + "epoch": 0.42514285714285716, + "grad_norm": 0.9394458532333374, + "kl": 0.3265533447265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0538, + "reward": 0.5415406846441329, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5415406846441329, + "reward_after_std": 0.8690132237970829, + "reward_before_mean": 0.6591636501252651, + "reward_before_std": 0.8758064955472946, + "reward_change_max": 0.000122852623462677, + "reward_change_mean": -0.11762292124330997, + "reward_change_min": -0.2325716745108366, + "reward_change_std": 0.09012753423303366, + "reward_std": 0.8690132312476635, + "rewards/cosine_scaled_reward": 0.027498478069901466, + "rewards/format_reward": 0.6041666753590107, + "step": 372 + }, + { + "advantage_max": 1.1446843966841698, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -1.0722761787474155, + "advantage_std": 0.8411088809370995, + "completion_length": 2142.3125228881836, + "epoch": 0.42628571428571427, + "grad_norm": 0.688064694404602, + "kl": 0.5388641357421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0153, + "reward": 0.7092705629765987, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7092705629765987, + "reward_after_std": 0.8411088772118092, + "reward_before_mean": 0.8444973253645003, + "reward_before_std": 0.8505554907023907, + "reward_change_max": 0.0, + "reward_change_mean": -0.13522674329578876, + "reward_change_min": -0.2483549453318119, + "reward_change_std": 0.09696428989991546, + "reward_std": 0.84110888838768, + "rewards/cosine_scaled_reward": 0.07849864475429058, + "rewards/format_reward": 0.6875000074505806, + "step": 373 + }, + { + "advantage_max": 1.3396911844611168, + "advantage_mean": 6.829699583654758e-09, + "advantage_min": -0.8294327259063721, + "advantage_std": 0.8366260938346386, + "completion_length": 2615.3125762939453, + "epoch": 0.42742857142857144, + "grad_norm": 0.3500214219093323, + "kl": 0.336761474609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0417, + "reward": 0.4654297400265932, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4654297400265932, + "reward_after_std": 0.8366261087357998, + "reward_before_mean": 0.5744652273133397, + "reward_before_std": 0.8310881219804287, + "reward_change_max": 0.0004261508584022522, + "reward_change_mean": -0.10903546074405313, + "reward_change_min": -0.20918164774775505, + "reward_change_std": 0.07874666526913643, + "reward_std": 0.836626123636961, + "rewards/cosine_scaled_reward": -0.014850735664367676, + "rewards/format_reward": 0.604166679084301, + "step": 374 + }, + { + "advantage_max": 1.05471608415246, + "advantage_mean": -4.346171977864799e-09, + "advantage_min": -0.9635727629065514, + "advantage_std": 0.7893678471446037, + "completion_length": 2934.291702270508, + "epoch": 0.42857142857142855, + "grad_norm": 0.8874136209487915, + "kl": 0.328857421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0395, + "reward": 0.6218974577786867, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6218974577786867, + "reward_after_std": 0.7893678620457649, + "reward_before_mean": 0.7510060481727123, + "reward_before_std": 0.7985278442502022, + "reward_change_max": 0.00015664845705032349, + "reward_change_mean": -0.12910857424139977, + "reward_change_min": -0.22664515953511, + "reward_change_std": 0.09451806033030152, + "reward_std": 0.7893679030239582, + "rewards/cosine_scaled_reward": 0.06300301384180784, + "rewards/format_reward": 0.6250000074505806, + "step": 375 + }, + { + "advantage_max": 0.8759348541498184, + "advantage_mean": -2.048909714114089e-08, + "advantage_min": -0.7458357661962509, + "advantage_std": 0.5922916419804096, + "completion_length": 2636.729202270508, + "epoch": 0.4297142857142857, + "grad_norm": 0.45231547951698303, + "kl": 0.308197021484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0328, + "reward": 0.4231463046744466, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4231463046744466, + "reward_after_std": 0.5922916382551193, + "reward_before_mean": 0.5374979162588716, + "reward_before_std": 0.5894507952034473, + "reward_change_max": 8.402764797210693e-05, + "reward_change_mean": -0.1143516362644732, + "reward_change_min": -0.18643983826041222, + "reward_change_std": 0.07226712163537741, + "reward_std": 0.5922916643321514, + "rewards/cosine_scaled_reward": -0.09583438094705343, + "rewards/format_reward": 0.7291666734963655, + "step": 376 + }, + { + "advantage_max": 1.383926510810852, + "advantage_mean": 2.4835269063494536e-08, + "advantage_min": -0.9945981428027153, + "advantage_std": 0.9026396945118904, + "completion_length": 3282.8125915527344, + "epoch": 0.4308571428571429, + "grad_norm": 1.0070041418075562, + "kl": 0.3175048828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0442, + "reward": 0.44021230190992355, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.44021230190992355, + "reward_after_std": 0.902639701962471, + "reward_before_mean": 0.5466061439365149, + "reward_before_std": 0.9098929949104786, + "reward_change_max": 0.00017774105072021484, + "reward_change_mean": -0.10639382712543011, + "reward_change_min": -0.22205941379070282, + "reward_change_std": 0.08372075716033578, + "reward_std": 0.9026397354900837, + "rewards/cosine_scaled_reward": -0.06003026259713806, + "rewards/format_reward": 0.6666666865348816, + "step": 377 + }, + { + "advantage_max": 0.9764475971460342, + "advantage_mean": -5.246450671125835e-08, + "advantage_min": -0.873407207429409, + "advantage_std": 0.67820955067873, + "completion_length": 2588.2500610351562, + "epoch": 0.432, + "grad_norm": 0.4015950858592987, + "kl": 0.2265777587890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0183, + "reward": 0.7747280902694911, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7747280902694911, + "reward_after_std": 0.6782095581293106, + "reward_before_mean": 0.9191812109202147, + "reward_before_std": 0.671349611133337, + "reward_change_max": 0.0, + "reward_change_mean": -0.14445316838100553, + "reward_change_min": -0.24850592855364084, + "reward_change_std": 0.09129061782732606, + "reward_std": 0.6782095953822136, + "rewards/cosine_scaled_reward": 0.09500727988779545, + "rewards/format_reward": 0.729166679084301, + "step": 378 + }, + { + "advantage_max": 1.2924234345555305, + "advantage_mean": -1.6142925329809543e-08, + "advantage_min": -0.9254086911678314, + "advantage_std": 0.8383762203156948, + "completion_length": 3108.2709350585938, + "epoch": 0.43314285714285716, + "grad_norm": 0.3910759687423706, + "kl": 0.40936279296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0498, + "reward": 0.43546567182056606, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.43546567182056606, + "reward_after_std": 0.838376197963953, + "reward_before_mean": 0.542799973860383, + "reward_before_std": 0.8410528190433979, + "reward_change_max": 0.00024215131998062134, + "reward_change_mean": -0.10733431112021208, + "reward_change_min": -0.19753815606236458, + "reward_change_std": 0.0784850474447012, + "reward_std": 0.8383762016892433, + "rewards/cosine_scaled_reward": -0.03068335447460413, + "rewards/format_reward": 0.6041666865348816, + "step": 379 + }, + { + "advantage_max": 0.9396484643220901, + "advantage_mean": -3.91155505208296e-08, + "advantage_min": -0.6655237004160881, + "advantage_std": 0.5877484232187271, + "completion_length": 2655.104217529297, + "epoch": 0.4342857142857143, + "grad_norm": 1.0875959396362305, + "kl": 0.264251708984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.488912271385139e-07, + "loss": -0.0023, + "reward": 0.5554260544013232, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5554260544013232, + "reward_after_std": 0.5877484120428562, + "reward_before_mean": 0.6805648133158684, + "reward_before_std": 0.5702716708183289, + "reward_change_max": 0.0003554224967956543, + "reward_change_mean": -0.12513879756443202, + "reward_change_min": -0.19960128888487816, + "reward_change_std": 0.07485141255892813, + "reward_std": 0.5877484232187271, + "rewards/cosine_scaled_reward": -0.04513426497578621, + "rewards/format_reward": 0.770833333954215, + "step": 380 + }, + { + "advantage_max": 1.091643925756216, + "advantage_mean": -1.4280280735690098e-08, + "advantage_min": -0.9078643172979355, + "advantage_std": 0.7367625050246716, + "completion_length": 3038.1459045410156, + "epoch": 0.43542857142857144, + "grad_norm": 0.650126576423645, + "kl": 0.44171142578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0265, + "reward": 0.2852631863206625, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2852631863206625, + "reward_after_std": 0.7367624826729298, + "reward_before_mean": 0.3824858106672764, + "reward_before_std": 0.7439739629626274, + "reward_change_max": 8.361786603927612e-05, + "reward_change_mean": -0.09722263645380735, + "reward_change_min": -0.18690539337694645, + "reward_change_std": 0.0746005296241492, + "reward_std": 0.7367625087499619, + "rewards/cosine_scaled_reward": -0.13167377142235637, + "rewards/format_reward": 0.6458333469927311, + "step": 381 + }, + { + "advantage_max": 1.0695801936089993, + "advantage_mean": -5.898376675972372e-09, + "advantage_min": -0.8300218358635902, + "advantage_std": 0.7193886451423168, + "completion_length": 2674.8959197998047, + "epoch": 0.43657142857142855, + "grad_norm": 0.5527292490005493, + "kl": 0.3749847412109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0426, + "reward": 0.19800740387290716, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.19800740387290716, + "reward_after_std": 0.7193886376917362, + "reward_before_mean": 0.287336059845984, + "reward_before_std": 0.7278939560055733, + "reward_change_max": 0.0003422573208808899, + "reward_change_mean": -0.08932866249233484, + "reward_change_min": -0.18138791900128126, + "reward_change_std": 0.07123636966571212, + "reward_std": 0.7193886563181877, + "rewards/cosine_scaled_reward": -0.20008197613060474, + "rewards/format_reward": 0.6875000149011612, + "step": 382 + }, + { + "advantage_max": 1.1456727720797062, + "advantage_mean": 1.8626449826975033e-09, + "advantage_min": -1.1615408807992935, + "advantage_std": 0.8968832269310951, + "completion_length": 3055.729232788086, + "epoch": 0.4377142857142857, + "grad_norm": 0.47822508215904236, + "kl": 0.319732666015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0526, + "reward": 0.6082937435712665, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6082937435712665, + "reward_after_std": 0.8968832530081272, + "reward_before_mean": 0.7342727006180212, + "reward_before_std": 0.9225108250975609, + "reward_change_max": 0.00024039298295974731, + "reward_change_mean": -0.12597893876954913, + "reward_change_min": -0.2359147211536765, + "reward_change_std": 0.09706644853577018, + "reward_std": 0.896883275359869, + "rewards/cosine_scaled_reward": 0.023386333137750626, + "rewards/format_reward": 0.6875000186264515, + "step": 383 + }, + { + "advantage_max": 1.2379422560334206, + "advantage_mean": -1.1175872061119918e-08, + "advantage_min": -1.424992460757494, + "advantage_std": 0.9928190894424915, + "completion_length": 2224.7708740234375, + "epoch": 0.43885714285714283, + "grad_norm": 0.6266052722930908, + "kl": 0.1634674072265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.3967120531894857e-07, + "loss": -0.0301, + "reward": 1.2281536404043436, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.2281536404043436, + "reward_after_std": 0.9928190894424915, + "reward_before_mean": 1.4098486751317978, + "reward_before_std": 1.014763057231903, + "reward_change_max": 0.0, + "reward_change_mean": -0.18169502541422844, + "reward_change_min": -0.31081850454211235, + "reward_change_std": 0.1270025339908898, + "reward_std": 0.9928191304206848, + "rewards/cosine_scaled_reward": 0.28825767897069454, + "rewards/format_reward": 0.8333333469927311, + "step": 384 + }, + { + "advantage_max": 1.0953758209943771, + "advantage_mean": -3.60111410691033e-08, + "advantage_min": -0.8757317326962948, + "advantage_std": 0.7891853414475918, + "completion_length": 2996.7084045410156, + "epoch": 0.44, + "grad_norm": 0.6368526220321655, + "kl": 0.202911376953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0445, + "reward": 0.5555390305817127, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5555390305817127, + "reward_after_std": 0.7891853488981724, + "reward_before_mean": 0.6777182146906853, + "reward_before_std": 0.7959320954978466, + "reward_change_max": 0.00031816959381103516, + "reward_change_mean": -0.12217918690294027, + "reward_change_min": -0.23252204339951277, + "reward_change_std": 0.09129771264269948, + "reward_std": 0.7891853675246239, + "rewards/cosine_scaled_reward": 0.005525756627321243, + "rewards/format_reward": 0.6666666734963655, + "step": 385 + }, + { + "advantage_max": 1.217541165649891, + "advantage_mean": -3.97364305904091e-08, + "advantage_min": -1.2796841636300087, + "advantage_std": 0.9395193532109261, + "completion_length": 3087.791702270508, + "epoch": 0.44114285714285717, + "grad_norm": 0.6705446839332581, + "kl": 0.3101806640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0018, + "reward": 0.665900741238147, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.665900741238147, + "reward_after_std": 0.9395193681120872, + "reward_before_mean": 0.7959863543510437, + "reward_before_std": 0.9656849205493927, + "reward_change_max": 0.0, + "reward_change_mean": -0.13008562522009015, + "reward_change_min": -0.24509291164577007, + "reward_change_std": 0.10135661391541362, + "reward_std": 0.939519390463829, + "rewards/cosine_scaled_reward": 0.033409830182790756, + "rewards/format_reward": 0.7291666865348816, + "step": 386 + }, + { + "advantage_max": 1.2698442712426186, + "advantage_mean": -2.9181440930337033e-08, + "advantage_min": -1.0950742289423943, + "advantage_std": 0.8762321844696999, + "completion_length": 3122.6250762939453, + "epoch": 0.4422857142857143, + "grad_norm": 0.5375394821166992, + "kl": 0.3720245361328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0262, + "reward": 0.5023184239398688, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5023184239398688, + "reward_after_std": 0.8762321695685387, + "reward_before_mean": 0.6163734996225685, + "reward_before_std": 0.8881137296557426, + "reward_change_max": 0.0003941729664802551, + "reward_change_mean": -0.11405510362237692, + "reward_change_min": -0.21109131071716547, + "reward_change_std": 0.08576344698667526, + "reward_std": 0.8762321919202805, + "rewards/cosine_scaled_reward": -0.014729912392795086, + "rewards/format_reward": 0.6458333414047956, + "step": 387 + }, + { + "advantage_max": 1.0198524445295334, + "advantage_mean": -1.862645199190993e-08, + "advantage_min": -0.8256707489490509, + "advantage_std": 0.7120313681662083, + "completion_length": 2877.916778564453, + "epoch": 0.44342857142857145, + "grad_norm": 0.4498549699783325, + "kl": 0.2955780029296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0315, + "reward": 0.6390794757753611, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6390794757753611, + "reward_after_std": 0.7120313458144665, + "reward_before_mean": 0.770656397100538, + "reward_before_std": 0.712759368121624, + "reward_change_max": 0.0, + "reward_change_mean": -0.13157692411914468, + "reward_change_min": -0.23201371356844902, + "reward_change_std": 0.0902018048800528, + "reward_std": 0.7120313681662083, + "rewards/cosine_scaled_reward": 0.031161522027105093, + "rewards/format_reward": 0.7083333395421505, + "step": 388 + }, + { + "advantage_max": 1.2392387315630913, + "advantage_mean": -3.352761368535795e-08, + "advantage_min": -1.2367150112986565, + "advantage_std": 0.9424854889512062, + "completion_length": 2731.2084045410156, + "epoch": 0.44457142857142856, + "grad_norm": 0.47933250665664673, + "kl": 0.285736083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0165, + "reward": 0.7998714097775519, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7998714097775519, + "reward_after_std": 0.9424855038523674, + "reward_before_mean": 0.9414779755752534, + "reward_before_std": 0.9657678753137589, + "reward_change_max": 0.0003373771905899048, + "reward_change_mean": -0.14160654600709677, + "reward_change_min": -0.26733815390616655, + "reward_change_std": 0.10602262848988175, + "reward_std": 0.9424855262041092, + "rewards/cosine_scaled_reward": 0.10615562507882714, + "rewards/format_reward": 0.729166679084301, + "step": 389 + }, + { + "advantage_max": 1.5434679314494133, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.987445343285799, + "advantage_std": 0.9908928908407688, + "completion_length": 2833.2501068115234, + "epoch": 0.44571428571428573, + "grad_norm": 0.9356141090393066, + "kl": 0.3245697021484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0495, + "reward": 0.40416749380528927, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.40416749380528927, + "reward_after_std": 0.9908928833901882, + "reward_before_mean": 0.5042689014226198, + "reward_before_std": 1.0035798326134682, + "reward_change_max": 0.0, + "reward_change_mean": -0.10010139970108867, + "reward_change_min": -0.22962007019668818, + "reward_change_std": 0.08653676975518465, + "reward_std": 0.9908929243683815, + "rewards/cosine_scaled_reward": -0.06036556634353474, + "rewards/format_reward": 0.6250000093132257, + "step": 390 + }, + { + "advantage_max": 1.4135627299547195, + "advantage_mean": -6.208816238917336e-10, + "advantage_min": -1.1502118483185768, + "advantage_std": 1.0554303713142872, + "completion_length": 2525.7709045410156, + "epoch": 0.44685714285714284, + "grad_norm": 1.8921399116516113, + "kl": 0.26483154296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0684, + "reward": 0.6798726860433817, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6798726860433817, + "reward_after_std": 1.0554304011166096, + "reward_before_mean": 0.8076426119860116, + "reward_before_std": 1.0806686542928219, + "reward_change_max": 0.000661991536617279, + "reward_change_mean": -0.12776989629492164, + "reward_change_min": -0.2711303811520338, + "reward_change_std": 0.11271528014913201, + "reward_std": 1.0554304346442223, + "rewards/cosine_scaled_reward": 0.07048794813454151, + "rewards/format_reward": 0.6666666753590107, + "step": 391 + }, + { + "advantage_max": 1.0657275505363941, + "advantage_mean": -2.3903946183567726e-08, + "advantage_min": -0.8924349471926689, + "advantage_std": 0.7697158344089985, + "completion_length": 2493.2708892822266, + "epoch": 0.448, + "grad_norm": 0.7100505232810974, + "kl": 0.3263092041015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0461, + "reward": 0.48778675869107246, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.48778675869107246, + "reward_after_std": 0.769715815782547, + "reward_before_mean": 0.6039394419640303, + "reward_before_std": 0.7809349689632654, + "reward_change_max": 0.00030147284269332886, + "reward_change_mean": -0.11615270469337702, + "reward_change_min": -0.24055636301636696, + "reward_change_std": 0.09163055196404457, + "reward_std": 0.7697158381342888, + "rewards/cosine_scaled_reward": -0.03136360924690962, + "rewards/format_reward": 0.6666666753590107, + "step": 392 + }, + { + "advantage_max": 1.5679549127817154, + "advantage_mean": -3.973643114552061e-08, + "advantage_min": -1.5204594507813454, + "advantage_std": 1.2267358228564262, + "completion_length": 2753.729278564453, + "epoch": 0.4491428571428571, + "grad_norm": 1.968538522720337, + "kl": 0.2186279296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.091, + "reward": 1.0800715144723654, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.0800715144723654, + "reward_after_std": 1.2267358228564262, + "reward_before_mean": 1.242100728675723, + "reward_before_std": 1.264978300780058, + "reward_change_max": 8.808821439743042e-05, + "reward_change_mean": -0.16202920861542225, + "reward_change_min": -0.3233691677451134, + "reward_change_std": 0.13310158113017678, + "reward_std": 1.2267358973622322, + "rewards/cosine_scaled_reward": 0.23563368245959282, + "rewards/format_reward": 0.7708333507180214, + "step": 393 + }, + { + "advantage_max": 1.1774890311062336, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7850269712507725, + "advantage_std": 0.7283446006476879, + "completion_length": 3308.5834045410156, + "epoch": 0.4502857142857143, + "grad_norm": 0.6988861560821533, + "kl": 0.4105224609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0356, + "reward": 0.07187341991811991, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07187341991811991, + "reward_after_std": 0.7283445969223976, + "reward_before_mean": 0.14786747004836798, + "reward_before_std": 0.731137853115797, + "reward_change_max": 0.0, + "reward_change_mean": -0.07599404593929648, + "reward_change_min": -0.15443053469061852, + "reward_change_std": 0.060382971074432135, + "reward_std": 0.7283446118235588, + "rewards/cosine_scaled_reward": -0.17606628267094493, + "rewards/format_reward": 0.5000000111758709, + "step": 394 + }, + { + "advantage_max": 1.1270059682428837, + "advantage_mean": -1.7384688244526103e-08, + "advantage_min": -1.0076443776488304, + "advantage_std": 0.7832858189940453, + "completion_length": 2537.8959197998047, + "epoch": 0.4514285714285714, + "grad_norm": 0.5426788926124573, + "kl": 0.230255126953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0156, + "reward": 0.8596542216837406, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8596542216837406, + "reward_after_std": 0.783285815268755, + "reward_before_mean": 1.0088724344968796, + "reward_before_std": 0.7832663394510746, + "reward_change_max": 0.0, + "reward_change_mean": -0.1492181965149939, + "reward_change_min": -0.25157369300723076, + "reward_change_std": 0.09793313452973962, + "reward_std": 0.7832858189940453, + "rewards/cosine_scaled_reward": 0.11901953746564686, + "rewards/format_reward": 0.7708333469927311, + "step": 395 + }, + { + "advantage_max": 1.4208708554506302, + "advantage_mean": -2.9802322942806825e-08, + "advantage_min": -1.0534027591347694, + "advantage_std": 0.9146955572068691, + "completion_length": 3212.0626220703125, + "epoch": 0.45257142857142857, + "grad_norm": 0.7655185461044312, + "kl": 0.42913818359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.134908592756607e-07, + "loss": 0.0721, + "reward": 0.55569236446172, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.55569236446172, + "reward_after_std": 0.9146955460309982, + "reward_before_mean": 0.6718438614625484, + "reward_before_std": 0.9166472069919109, + "reward_change_max": 0.0, + "reward_change_mean": -0.116151487454772, + "reward_change_min": -0.2097308114171028, + "reward_change_std": 0.0818931176327169, + "reward_std": 0.9146955572068691, + "rewards/cosine_scaled_reward": 0.002588571864180267, + "rewards/format_reward": 0.666666692122817, + "step": 396 + }, + { + "advantage_max": 1.170277938246727, + "advantage_mean": -9.934107814135729e-09, + "advantage_min": -0.7947026267647743, + "advantage_std": 0.7339077740907669, + "completion_length": 2839.416748046875, + "epoch": 0.45371428571428574, + "grad_norm": 0.4377634525299072, + "kl": 0.171051025390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0284, + "reward": 0.6288949530571699, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6288949530571699, + "reward_after_std": 0.7339077889919281, + "reward_before_mean": 0.7566788010299206, + "reward_before_std": 0.7199768051505089, + "reward_change_max": 0.00020315498113632202, + "reward_change_mean": -0.1277838561218232, + "reward_change_min": -0.20927806198596954, + "reward_change_std": 0.08163281762972474, + "reward_std": 0.7339078187942505, + "rewards/cosine_scaled_reward": -0.00707725714892149, + "rewards/format_reward": 0.7708333432674408, + "step": 397 + }, + { + "advantage_max": 1.0407202914357185, + "advantage_mean": 9.934108424758392e-09, + "advantage_min": -0.8572842329740524, + "advantage_std": 0.6855289153754711, + "completion_length": 2837.166732788086, + "epoch": 0.45485714285714285, + "grad_norm": 1.068537712097168, + "kl": 0.31781005859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0099, + "reward": 0.3281229701824486, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3281229701824486, + "reward_after_std": 0.6855289153754711, + "reward_before_mean": 0.43000722490251064, + "reward_before_std": 0.6828913427889347, + "reward_change_max": 0.00022487342357635498, + "reward_change_mean": -0.10188420582562685, + "reward_change_min": -0.18412470445036888, + "reward_change_std": 0.07508498663082719, + "reward_std": 0.6855289451777935, + "rewards/cosine_scaled_reward": -0.10791307222098112, + "rewards/format_reward": 0.6458333488553762, + "step": 398 + }, + { + "advantage_max": 1.341847501695156, + "advantage_mean": -4.346172111091562e-08, + "advantage_min": -1.1149200797080994, + "advantage_std": 0.9616120122373104, + "completion_length": 2445.3333740234375, + "epoch": 0.456, + "grad_norm": 1.9017913341522217, + "kl": 0.23044586181640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.098, + "reward": 0.8614312242716551, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8614312242716551, + "reward_after_std": 0.9616120085120201, + "reward_before_mean": 1.0071106106042862, + "reward_before_std": 0.9746308140456676, + "reward_change_max": 0.0, + "reward_change_mean": -0.14567939471453428, + "reward_change_min": -0.28493910282850266, + "reward_change_std": 0.10870950575917959, + "reward_std": 0.9616120085120201, + "rewards/cosine_scaled_reward": 0.10772196669131517, + "rewards/format_reward": 0.7916666865348816, + "step": 399 + }, + { + "advantage_max": 1.374092049896717, + "advantage_mean": -1.924733378233512e-08, + "advantage_min": -1.041805051267147, + "advantage_std": 0.9453696236014366, + "completion_length": 2148.9583740234375, + "epoch": 0.45714285714285713, + "grad_norm": 0.6918766498565674, + "kl": 0.1371002197265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0116, + "reward": 1.2651937678456306, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.2651937678456306, + "reward_after_std": 0.9453696310520172, + "reward_before_mean": 1.4482441246509552, + "reward_before_std": 0.9435032866895199, + "reward_change_max": 0.0, + "reward_change_mean": -0.18305037543177605, + "reward_change_min": -0.3167814239859581, + "reward_change_std": 0.12038358487188816, + "reward_std": 0.9453696794807911, + "rewards/cosine_scaled_reward": 0.2762054104823619, + "rewards/format_reward": 0.8958333432674408, + "step": 400 + }, + { + "advantage_max": 1.067967213690281, + "advantage_mean": -8.071463275527435e-09, + "advantage_min": -1.150337852537632, + "advantage_std": 0.8019844517111778, + "completion_length": 3032.1250915527344, + "epoch": 0.4582857142857143, + "grad_norm": 0.7639958262443542, + "kl": 0.423126220703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0298, + "reward": 0.43492011073976755, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.43492011073976755, + "reward_after_std": 0.8019844405353069, + "reward_before_mean": 0.5461161928251386, + "reward_before_std": 0.8226968869566917, + "reward_change_max": 8.496642112731934e-05, + "reward_change_mean": -0.11119608022272587, + "reward_change_min": -0.19766813702881336, + "reward_change_std": 0.08586510363966227, + "reward_std": 0.8019844740629196, + "rewards/cosine_scaled_reward": -0.06027523800730705, + "rewards/format_reward": 0.6666666865348816, + "step": 401 + }, + { + "advantage_max": 0.9436207935214043, + "advantage_mean": -3.104408685672411e-08, + "advantage_min": -0.7495729178190231, + "advantage_std": 0.6268677823245525, + "completion_length": 2520.229217529297, + "epoch": 0.4594285714285714, + "grad_norm": 0.8622435927391052, + "kl": 0.3449249267578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0176, + "reward": 0.41957162227481604, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41957162227481604, + "reward_after_std": 0.6268677823245525, + "reward_before_mean": 0.5319516197778285, + "reward_before_std": 0.6171875484287739, + "reward_change_max": 8.885562419891357e-05, + "reward_change_mean": -0.11238000728189945, + "reward_change_min": -0.1966311875730753, + "reward_change_std": 0.07510491507127881, + "reward_std": 0.6268677972257137, + "rewards/cosine_scaled_reward": -0.07777421269565821, + "rewards/format_reward": 0.6875000167638063, + "step": 402 + }, + { + "advantage_max": 1.0418415665626526, + "advantage_mean": 4.9670538238011375e-09, + "advantage_min": -1.0355667266994715, + "advantage_std": 0.7518435195088387, + "completion_length": 2340.083366394043, + "epoch": 0.4605714285714286, + "grad_norm": 0.7467604875564575, + "kl": 0.1708984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0142, + "reward": 0.5140412461478263, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5140412461478263, + "reward_after_std": 0.7518434971570969, + "reward_before_mean": 0.6334147532470524, + "reward_before_std": 0.75905573181808, + "reward_change_max": 0.00014617294073104858, + "reward_change_mean": -0.11937349662184715, + "reward_change_min": -0.19436869025230408, + "reward_change_std": 0.08348292578011751, + "reward_std": 0.7518435046076775, + "rewards/cosine_scaled_reward": 0.014624039176851511, + "rewards/format_reward": 0.6041666865348816, + "step": 403 + }, + { + "advantage_max": 0.6259809099137783, + "advantage_mean": -1.7229467852430957e-08, + "advantage_min": -0.5122467614710331, + "advantage_std": 0.43664512410759926, + "completion_length": 2699.7083740234375, + "epoch": 0.4617142857142857, + "grad_norm": 0.715581476688385, + "kl": 0.31291961669921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0129, + "reward": 0.520903637050651, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.520903637050651, + "reward_after_std": 0.43664513528347015, + "reward_before_mean": 0.6486049126833677, + "reward_before_std": 0.41897546872496605, + "reward_change_max": 0.0, + "reward_change_mean": -0.12770125456154346, + "reward_change_min": -0.1968603590503335, + "reward_change_std": 0.07488203165121377, + "reward_std": 0.43664515018463135, + "rewards/cosine_scaled_reward": -0.009030893445014954, + "rewards/format_reward": 0.6666666716337204, + "step": 404 + }, + { + "advantage_max": 1.0945487841963768, + "advantage_mean": -2.7939678071131624e-08, + "advantage_min": -1.289386235177517, + "advantage_std": 0.9108894169330597, + "completion_length": 2435.4792404174805, + "epoch": 0.46285714285714286, + "grad_norm": 0.494119256734848, + "kl": 0.3772735595703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0535, + "reward": 0.7584564303979278, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7584564303979278, + "reward_after_std": 0.9108893796801567, + "reward_before_mean": 0.8993258298141882, + "reward_before_std": 0.942900113761425, + "reward_change_max": 0.0, + "reward_change_mean": -0.14086939627304673, + "reward_change_min": -0.25777516420930624, + "reward_change_std": 0.10722628142684698, + "reward_std": 0.9108894020318985, + "rewards/cosine_scaled_reward": 0.12674623914062977, + "rewards/format_reward": 0.6458333507180214, + "step": 405 + }, + { + "advantage_max": 1.0511068068444729, + "advantage_mean": -6.084641079873165e-08, + "advantage_min": -1.0219291038811207, + "advantage_std": 0.7899200022220612, + "completion_length": 2561.8334045410156, + "epoch": 0.464, + "grad_norm": 0.665023148059845, + "kl": 0.25701904296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0351, + "reward": 1.0501489378511906, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0501489378511906, + "reward_after_std": 0.7899199984967709, + "reward_before_mean": 1.219081237912178, + "reward_before_std": 0.7882814519107342, + "reward_change_max": 0.00011001527309417725, + "reward_change_mean": -0.16893231682479382, + "reward_change_min": -0.282260874286294, + "reward_change_std": 0.10838037542998791, + "reward_std": 0.7899200432002544, + "rewards/cosine_scaled_reward": 0.18245727149769664, + "rewards/format_reward": 0.8541666772216558, + "step": 406 + }, + { + "advantage_max": 0.902554202824831, + "advantage_mean": -3.7252904538931375e-08, + "advantage_min": -0.7991390824317932, + "advantage_std": 0.6586268059909344, + "completion_length": 2573.2291946411133, + "epoch": 0.46514285714285714, + "grad_norm": 0.5235826969146729, + "kl": 0.2978057861328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0504, + "reward": 0.7006958748097531, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7006958748097531, + "reward_after_std": 0.6586268097162247, + "reward_before_mean": 0.8396258531138301, + "reward_before_std": 0.6540814377367496, + "reward_change_max": 0.0, + "reward_change_mean": -0.13892999943345785, + "reward_change_min": -0.2336738519370556, + "reward_change_std": 0.09190351748839021, + "reward_std": 0.6586268320679665, + "rewards/cosine_scaled_reward": 0.07606291398406029, + "rewards/format_reward": 0.6875000037252903, + "step": 407 + }, + { + "advantage_max": 1.0302674248814583, + "advantage_mean": -4.346172205460519e-08, + "advantage_min": -1.0587206855416298, + "advantage_std": 0.7980893142521381, + "completion_length": 2748.6042098999023, + "epoch": 0.4662857142857143, + "grad_norm": 0.49642956256866455, + "kl": 0.37957763671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0551, + "reward": 0.5339444152486976, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5339444152486976, + "reward_after_std": 0.7980893142521381, + "reward_before_mean": 0.6553622125647962, + "reward_before_std": 0.8187281489372253, + "reward_change_max": 0.0005005300045013428, + "reward_change_mean": -0.1214177985675633, + "reward_change_min": -0.23425185028463602, + "reward_change_std": 0.09430251410230994, + "reward_std": 0.7980893328785896, + "rewards/cosine_scaled_reward": 0.015181094408035278, + "rewards/format_reward": 0.6250000111758709, + "step": 408 + }, + { + "advantage_max": 1.1790355741977692, + "advantage_mean": -9.934107536579972e-09, + "advantage_min": -1.1195921525359154, + "advantage_std": 0.8626318871974945, + "completion_length": 3257.8750610351562, + "epoch": 0.4674285714285714, + "grad_norm": 1.0251151323318481, + "kl": 0.440673828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0102, + "reward": 0.34495530603453517, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34495530603453517, + "reward_after_std": 0.8626318797469139, + "reward_before_mean": 0.4460237352177501, + "reward_before_std": 0.8854938633739948, + "reward_change_max": 0.00019879639148712158, + "reward_change_mean": -0.1010684184730053, + "reward_change_min": -0.21322645619511604, + "reward_change_std": 0.0844197073020041, + "reward_std": 0.8626318946480751, + "rewards/cosine_scaled_reward": -0.1311548100784421, + "rewards/format_reward": 0.708333358168602, + "step": 409 + }, + { + "advantage_max": 1.0071330815553665, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.9616048745810986, + "advantage_std": 0.7821291163563728, + "completion_length": 2852.8125610351562, + "epoch": 0.4685714285714286, + "grad_norm": 0.9547456502914429, + "kl": 0.550445556640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0562, + "reward": 0.2552414983510971, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2552414983510971, + "reward_after_std": 0.7821291275322437, + "reward_before_mean": 0.3507663235068321, + "reward_before_std": 0.804282508790493, + "reward_change_max": 6.768107414245605e-05, + "reward_change_mean": -0.09552482329308987, + "reward_change_min": -0.18692217115312815, + "reward_change_std": 0.0792939979583025, + "reward_std": 0.7821291498839855, + "rewards/cosine_scaled_reward": -0.08503350615501404, + "rewards/format_reward": 0.5208333488553762, + "step": 410 + }, + { + "advantage_max": 0.9848299585282803, + "advantage_mean": -1.8936892498544466e-08, + "advantage_min": -1.0076408050954342, + "advantage_std": 0.7584920935332775, + "completion_length": 3145.854278564453, + "epoch": 0.4697142857142857, + "grad_norm": 0.4657355844974518, + "kl": 0.417236328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0387, + "reward": 0.5571842957288027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5571842957288027, + "reward_after_std": 0.758492112159729, + "reward_before_mean": 0.6817128874827176, + "reward_before_std": 0.7747105807065964, + "reward_change_max": 0.00012461096048355103, + "reward_change_mean": -0.12452859850600362, + "reward_change_min": -0.22812595777213573, + "reward_change_std": 0.0898899482563138, + "reward_std": 0.7584921382367611, + "rewards/cosine_scaled_reward": -0.013310234993696213, + "rewards/format_reward": 0.7083333525806665, + "step": 411 + }, + { + "advantage_max": 1.2851012870669365, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -1.1866544671356678, + "advantage_std": 0.9409245103597641, + "completion_length": 3118.4584045410156, + "epoch": 0.47085714285714286, + "grad_norm": 0.8590179085731506, + "kl": 0.3385009765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0475, + "reward": 0.6860095746815205, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6860095746815205, + "reward_after_std": 0.9409245178103447, + "reward_before_mean": 0.8169820861658081, + "reward_before_std": 0.9591667316854, + "reward_change_max": 0.0, + "reward_change_mean": -0.130972508341074, + "reward_change_min": -0.2485770285129547, + "reward_change_std": 0.09931665565818548, + "reward_std": 0.9409245401620865, + "rewards/cosine_scaled_reward": 0.012657706625759602, + "rewards/format_reward": 0.7916666865348816, + "step": 412 + }, + { + "advantage_max": 1.0960227698087692, + "advantage_mean": -2.3593505704688766e-08, + "advantage_min": -1.1873703114688396, + "advantage_std": 0.8447449170053005, + "completion_length": 2740.6251068115234, + "epoch": 0.472, + "grad_norm": 0.8115954995155334, + "kl": 0.342376708984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0565, + "reward": 0.6243542423471808, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6243542423471808, + "reward_after_std": 0.8447449207305908, + "reward_before_mean": 0.7526226807385683, + "reward_before_std": 0.8638904727995396, + "reward_change_max": 0.0, + "reward_change_mean": -0.12826845049858093, + "reward_change_min": -0.22999808378517628, + "reward_change_std": 0.09474454261362553, + "reward_std": 0.8447449542582035, + "rewards/cosine_scaled_reward": 0.022144658491015434, + "rewards/format_reward": 0.7083333469927311, + "step": 413 + }, + { + "advantage_max": 1.248441867530346, + "advantage_mean": 7.76102138111412e-09, + "advantage_min": -0.8844185099005699, + "advantage_std": 0.8054223507642746, + "completion_length": 3167.0209350585938, + "epoch": 0.47314285714285714, + "grad_norm": 0.6134005784988403, + "kl": 0.59326171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.072, + "reward": 0.06549638044089079, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06549638044089079, + "reward_after_std": 0.8054223656654358, + "reward_before_mean": 0.1393390439916402, + "reward_before_std": 0.8182446286082268, + "reward_change_max": 0.0002498254179954529, + "reward_change_mean": -0.07384267030283809, + "reward_change_min": -0.15592787880450487, + "reward_change_std": 0.06589569803327322, + "reward_std": 0.8054223731160164, + "rewards/cosine_scaled_reward": -0.1803304860368371, + "rewards/format_reward": 0.5000000111758709, + "step": 414 + }, + { + "advantage_max": 1.1541544646024704, + "advantage_mean": 9.313226023710541e-09, + "advantage_min": -0.7554533295333385, + "advantage_std": 0.7500776499509811, + "completion_length": 3323.0834045410156, + "epoch": 0.4742857142857143, + "grad_norm": 0.9510299563407898, + "kl": 0.78515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0881, + "reward": 0.026195455342531204, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.026195455342531204, + "reward_after_std": 0.7500776499509811, + "reward_before_mean": 0.0982854962348938, + "reward_before_std": 0.7606666944921017, + "reward_change_max": 0.0, + "reward_change_mean": -0.07209003553725779, + "reward_change_min": -0.15941832214593887, + "reward_change_std": 0.06286179949529469, + "reward_std": 0.7500776574015617, + "rewards/cosine_scaled_reward": -0.12794058211147785, + "rewards/format_reward": 0.3541666716337204, + "step": 415 + }, + { + "advantage_max": 1.1921259351074696, + "advantage_mean": -2.4214388605336978e-08, + "advantage_min": -1.027435451745987, + "advantage_std": 0.8444834239780903, + "completion_length": 2599.604263305664, + "epoch": 0.4754285714285714, + "grad_norm": 0.5963953733444214, + "kl": 0.261688232421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7518544168045524e-07, + "loss": -0.014, + "reward": 0.5983389317989349, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5983389317989349, + "reward_after_std": 0.8444834165275097, + "reward_before_mean": 0.7227395437657833, + "reward_before_std": 0.8563327789306641, + "reward_change_max": 0.0002361685037612915, + "reward_change_mean": -0.1244005998596549, + "reward_change_min": -0.24791191704571247, + "reward_change_std": 0.09307907475158572, + "reward_std": 0.8444834351539612, + "rewards/cosine_scaled_reward": -0.03446358081419021, + "rewards/format_reward": 0.7916666902601719, + "step": 416 + }, + { + "advantage_max": 1.2748055160045624, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.8345907405018806, + "advantage_std": 0.8307083323597908, + "completion_length": 3296.3959045410156, + "epoch": 0.4765714285714286, + "grad_norm": 0.658596396446228, + "kl": 0.4886474609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0529, + "reward": 0.01480916142463684, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.01480916142463684, + "reward_after_std": 0.8307083398103714, + "reward_before_mean": 0.08347079087980092, + "reward_before_std": 0.8477018140256405, + "reward_change_max": 0.0005599185824394226, + "reward_change_mean": -0.06866162805818021, + "reward_change_min": -0.17168361693620682, + "reward_change_std": 0.06843259232118726, + "reward_std": 0.8307083509862423, + "rewards/cosine_scaled_reward": -0.16659794500446878, + "rewards/format_reward": 0.41666667722165585, + "step": 417 + }, + { + "advantage_max": 1.0702146142721176, + "advantage_mean": -2.2972623692218974e-08, + "advantage_min": -1.0531324371695518, + "advantage_std": 0.7828693352639675, + "completion_length": 2456.52091217041, + "epoch": 0.4777142857142857, + "grad_norm": 0.5361673831939697, + "kl": 0.20428466796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0059, + "reward": 0.9526145923882723, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9526145923882723, + "reward_after_std": 0.7828693278133869, + "reward_before_mean": 1.1116907584219007, + "reward_before_std": 0.7802752666175365, + "reward_change_max": 0.00038120150566101074, + "reward_change_mean": -0.15907620172947645, + "reward_change_min": -0.25695937499403954, + "reward_change_std": 0.10400415351614356, + "reward_std": 0.7828693389892578, + "rewards/cosine_scaled_reward": 0.1391787314787507, + "rewards/format_reward": 0.8333333469927311, + "step": 418 + }, + { + "advantage_max": 1.3333582356572151, + "advantage_mean": -2.126519908773883e-08, + "advantage_min": -1.1237758100032806, + "advantage_std": 0.9614992318674922, + "completion_length": 3064.479248046875, + "epoch": 0.47885714285714287, + "grad_norm": 0.8987430930137634, + "kl": 0.388641357421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0697, + "reward": 0.7200150811113417, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7200150811113417, + "reward_after_std": 0.9614992393180728, + "reward_before_mean": 0.8526841946877539, + "reward_before_std": 0.9812450297176838, + "reward_change_max": 7.344037294387817e-05, + "reward_change_mean": -0.1326691498979926, + "reward_change_min": -0.2474001133814454, + "reward_change_std": 0.10230770613998175, + "reward_std": 0.961499254219234, + "rewards/cosine_scaled_reward": 0.10342542547732592, + "rewards/format_reward": 0.645833345130086, + "step": 419 + }, + { + "advantage_max": 0.8133354522287846, + "advantage_mean": -2.6077033421501028e-08, + "advantage_min": -0.9922023713588715, + "advantage_std": 0.6553318947553635, + "completion_length": 2480.4375762939453, + "epoch": 0.48, + "grad_norm": 0.4735735356807709, + "kl": 0.289306640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0418, + "reward": 0.5324579540174454, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5324579540174454, + "reward_after_std": 0.6553318910300732, + "reward_before_mean": 0.657228053547442, + "reward_before_std": 0.6681869141757488, + "reward_change_max": 0.00011757761240005493, + "reward_change_mean": -0.12477011140435934, + "reward_change_min": -0.21673501282930374, + "reward_change_std": 0.08654853561893106, + "reward_std": 0.6553319171071053, + "rewards/cosine_scaled_reward": -0.05680265463888645, + "rewards/format_reward": 0.7708333469927311, + "step": 420 + }, + { + "advantage_max": 1.2419108375906944, + "advantage_mean": -1.9247333726823967e-08, + "advantage_min": -0.9631900601089001, + "advantage_std": 0.8234320022165775, + "completion_length": 3331.9375915527344, + "epoch": 0.48114285714285715, + "grad_norm": 0.5381994843482971, + "kl": 0.4439697265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.048, + "reward": 0.3111051223240793, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3111051223240793, + "reward_after_std": 0.8234320022165775, + "reward_before_mean": 0.40812125336378813, + "reward_before_std": 0.8344125263392925, + "reward_change_max": 6.553530693054199e-05, + "reward_change_mean": -0.09701612405478954, + "reward_change_min": -0.1940639168024063, + "reward_change_std": 0.07709357934072614, + "reward_std": 0.823432020843029, + "rewards/cosine_scaled_reward": -0.1292727179825306, + "rewards/format_reward": 0.666666679084301, + "step": 421 + }, + { + "advantage_max": 0.9083509668707848, + "advantage_mean": 9.313225912688239e-09, + "advantage_min": -0.9410501569509506, + "advantage_std": 0.7073789015412331, + "completion_length": 2946.7084197998047, + "epoch": 0.48228571428571426, + "grad_norm": 0.702170193195343, + "kl": 0.3853759765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0758, + "reward": 0.5041839517652988, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5041839517652988, + "reward_after_std": 0.7073789089918137, + "reward_before_mean": 0.6250940449535847, + "reward_before_std": 0.7243058681488037, + "reward_change_max": 0.00022362172603607178, + "reward_change_mean": -0.12091004336252809, + "reward_change_min": -0.21036983001977205, + "reward_change_std": 0.08578781271353364, + "reward_std": 0.7073789089918137, + "rewards/cosine_scaled_reward": -0.010369660332798958, + "rewards/format_reward": 0.6458333432674408, + "step": 422 + }, + { + "advantage_max": 1.0704151839017868, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.9820942431688309, + "advantage_std": 0.7664654031395912, + "completion_length": 2857.979248046875, + "epoch": 0.48342857142857143, + "grad_norm": 0.4890437126159668, + "kl": 0.3306427001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0261, + "reward": 0.2683728828560561, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2683728828560561, + "reward_after_std": 0.7664653956890106, + "reward_before_mean": 0.36373477918095887, + "reward_before_std": 0.7833064757287502, + "reward_change_max": 0.00026363134384155273, + "reward_change_mean": -0.09536188654601574, + "reward_change_min": -0.189723776653409, + "reward_change_std": 0.07672601472586393, + "reward_std": 0.7664654068648815, + "rewards/cosine_scaled_reward": -0.16188262542709708, + "rewards/format_reward": 0.6875000074505806, + "step": 423 + }, + { + "advantage_max": 1.5122771635651588, + "advantage_mean": -8.8475646253805e-09, + "advantage_min": -0.9919125214219093, + "advantage_std": 0.9600072056055069, + "completion_length": 3214.666778564453, + "epoch": 0.4845714285714286, + "grad_norm": 0.8926342725753784, + "kl": 0.4385986328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0271, + "reward": 0.34624925162643194, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.34624925162643194, + "reward_after_std": 0.9600072205066681, + "reward_before_mean": 0.4413239473942667, + "reward_before_std": 0.9716643802821636, + "reward_change_max": 0.00024952739477157593, + "reward_change_mean": -0.09507470857352018, + "reward_change_min": -0.20797365996986628, + "reward_change_std": 0.07991787604987621, + "reward_std": 0.9600072354078293, + "rewards/cosine_scaled_reward": -0.10225469525903463, + "rewards/format_reward": 0.6458333414047956, + "step": 424 + }, + { + "advantage_max": 0.8413158319890499, + "advantage_mean": 5.5879355587151736e-09, + "advantage_min": -1.0183219015598297, + "advantage_std": 0.6862456165254116, + "completion_length": 2265.0625762939453, + "epoch": 0.4857142857142857, + "grad_norm": 1.4573155641555786, + "kl": 0.28778076171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0909, + "reward": 1.170523855369538, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.170523855369538, + "reward_after_std": 0.6862456277012825, + "reward_before_mean": 1.3542293733917177, + "reward_before_std": 0.6802042722702026, + "reward_change_max": 0.00010611116886138916, + "reward_change_mean": -0.18370548216626048, + "reward_change_min": -0.2846982665359974, + "reward_change_std": 0.11613735929131508, + "reward_std": 0.6862456537783146, + "rewards/cosine_scaled_reward": 0.3021146897226572, + "rewards/format_reward": 0.7500000149011612, + "step": 425 + }, + { + "advantage_max": 1.037292331457138, + "advantage_mean": -4.8428776211473235e-08, + "advantage_min": -0.7733364477753639, + "advantage_std": 0.6561478115618229, + "completion_length": 2578.9375762939453, + "epoch": 0.4868571428571429, + "grad_norm": 0.6892431974411011, + "kl": 0.2954254150390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0218, + "reward": 0.6788006233982742, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6788006233982742, + "reward_after_std": 0.6561478264629841, + "reward_before_mean": 0.8136748373508453, + "reward_before_std": 0.6412300877273083, + "reward_change_max": 4.620850086212158e-05, + "reward_change_mean": -0.13487422419711947, + "reward_change_min": -0.21194105129688978, + "reward_change_std": 0.08242963580414653, + "reward_std": 0.6561478637158871, + "rewards/cosine_scaled_reward": 0.0005873972550034523, + "rewards/format_reward": 0.8125000111758709, + "step": 426 + }, + { + "advantage_max": 1.1418819837272167, + "advantage_mean": -1.3038516377683607e-08, + "advantage_min": -0.6787732392549515, + "advantage_std": 0.7229117751121521, + "completion_length": 3192.9584045410156, + "epoch": 0.488, + "grad_norm": 0.9335200786590576, + "kl": 0.42462158203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0768, + "reward": 0.11853348463773727, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11853348463773727, + "reward_after_std": 0.7229117825627327, + "reward_before_mean": 0.19935083203017712, + "reward_before_std": 0.7258897684514523, + "reward_change_max": 0.0003618001937866211, + "reward_change_mean": -0.08081738196779042, + "reward_change_min": -0.16983817890286446, + "reward_change_std": 0.06711064896080643, + "reward_std": 0.7229118067771196, + "rewards/cosine_scaled_reward": -0.11907458305358887, + "rewards/format_reward": 0.43750000558793545, + "step": 427 + }, + { + "advantage_max": 1.176058478653431, + "advantage_mean": -1.2417634531747268e-08, + "advantage_min": -0.9602077603340149, + "advantage_std": 0.7935724556446075, + "completion_length": 3005.7500915527344, + "epoch": 0.48914285714285716, + "grad_norm": 1.1662989854812622, + "kl": 0.6168212890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0661, + "reward": 0.18571929726749659, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18571929726749659, + "reward_after_std": 0.7935724854469299, + "reward_before_mean": 0.2716755969449878, + "reward_before_std": 0.807749580591917, + "reward_change_max": 0.00014090538024902344, + "reward_change_mean": -0.08595629991032183, + "reward_change_min": -0.16239676997065544, + "reward_change_std": 0.06813311390578747, + "reward_std": 0.7935724891722202, + "rewards/cosine_scaled_reward": -0.12457887083292007, + "rewards/format_reward": 0.5208333469927311, + "step": 428 + }, + { + "advantage_max": 1.0645763352513313, + "advantage_mean": -1.3969839729455202e-09, + "advantage_min": -0.9452930763363838, + "advantage_std": 0.7396648898720741, + "completion_length": 2547.31258392334, + "epoch": 0.49028571428571427, + "grad_norm": 0.6578018665313721, + "kl": 0.35150146484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0201, + "reward": 0.5265534557402134, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5265534557402134, + "reward_after_std": 0.7396649047732353, + "reward_before_mean": 0.6463110996410251, + "reward_before_std": 0.7420226447284222, + "reward_change_max": 0.00036235153675079346, + "reward_change_mean": -0.11975763086229563, + "reward_change_min": -0.21091782487928867, + "reward_change_std": 0.08160475362092257, + "reward_std": 0.7396649271249771, + "rewards/cosine_scaled_reward": -0.0726778069511056, + "rewards/format_reward": 0.7916666828095913, + "step": 429 + }, + { + "advantage_max": 1.0652894973754883, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -0.9615795835852623, + "advantage_std": 0.7271205633878708, + "completion_length": 2463.6666870117188, + "epoch": 0.49142857142857144, + "grad_norm": 1.3026567697525024, + "kl": 0.322784423828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5267358321348285e-07, + "loss": -0.0016, + "reward": 0.6043818918988109, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6043818918988109, + "reward_after_std": 0.727120541036129, + "reward_before_mean": 0.7314365280326456, + "reward_before_std": 0.7297204360365868, + "reward_change_max": 0.0002305358648300171, + "reward_change_mean": -0.12705462612211704, + "reward_change_min": -0.2095300555229187, + "reward_change_std": 0.08553120819851756, + "reward_std": 0.7271205447614193, + "rewards/cosine_scaled_reward": 0.03238492365926504, + "rewards/format_reward": 0.666666679084301, + "step": 430 + }, + { + "advantage_max": 0.7961925268173218, + "advantage_mean": -5.898376398416616e-09, + "advantage_min": -0.7208251170814037, + "advantage_std": 0.5705397799611092, + "completion_length": 2729.104232788086, + "epoch": 0.49257142857142855, + "grad_norm": 1.024469256401062, + "kl": 0.43096160888671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0403, + "reward": 0.13876285403966904, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13876285403966904, + "reward_after_std": 0.5705397836863995, + "reward_before_mean": 0.22783136554062366, + "reward_before_std": 0.5793787594884634, + "reward_change_max": 0.0009065419435501099, + "reward_change_mean": -0.08906851289793849, + "reward_change_min": -0.1641480876132846, + "reward_change_std": 0.06509990012273192, + "reward_std": 0.5705397874116898, + "rewards/cosine_scaled_reward": -0.18816765770316124, + "rewards/format_reward": 0.6041666865348816, + "step": 431 + }, + { + "advantage_max": 0.9430552236735821, + "advantage_mean": -3.104408785592483e-09, + "advantage_min": -0.7989522684365511, + "advantage_std": 0.705777489580214, + "completion_length": 3185.3125915527344, + "epoch": 0.4937142857142857, + "grad_norm": 0.7096803188323975, + "kl": 0.595458984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0778, + "reward": 0.13925552484579384, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13925552484579384, + "reward_after_std": 0.705777489580214, + "reward_before_mean": 0.22575496323406696, + "reward_before_std": 0.7232791353017092, + "reward_change_max": 0.00015789270401000977, + "reward_change_mean": -0.08649945515207946, + "reward_change_min": -0.1785599086433649, + "reward_change_std": 0.07354599726386368, + "reward_std": 0.7057775054126978, + "rewards/cosine_scaled_reward": -0.15795585606247187, + "rewards/format_reward": 0.5416666753590107, + "step": 432 + }, + { + "advantage_max": 0.9957953058183193, + "advantage_mean": -4.0978195170460197e-08, + "advantage_min": -1.1007677465677261, + "advantage_std": 0.7739236429333687, + "completion_length": 2984.6250915527344, + "epoch": 0.4948571428571429, + "grad_norm": 0.754916787147522, + "kl": 0.4191131591796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0321, + "reward": 0.5658143066102639, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5658143066102639, + "reward_after_std": 0.773923646658659, + "reward_before_mean": 0.6907664993777871, + "reward_before_std": 0.7903828285634518, + "reward_change_max": 0.00024560093879699707, + "reward_change_mean": -0.12495221896097064, + "reward_change_min": -0.22821590304374695, + "reward_change_std": 0.09297089325264096, + "reward_std": 0.7739236764609814, + "rewards/cosine_scaled_reward": 0.032883236184716225, + "rewards/format_reward": 0.6250000186264515, + "step": 433 + }, + { + "advantage_max": 0.766203161329031, + "advantage_mean": -5.587935336670569e-09, + "advantage_min": -0.6076318752020597, + "advantage_std": 0.512691916897893, + "completion_length": 3136.812530517578, + "epoch": 0.496, + "grad_norm": 0.6693951487541199, + "kl": 0.37371826171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0292, + "reward": 0.08525129873305559, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08525129873305559, + "reward_after_std": 0.5126919113099575, + "reward_before_mean": 0.16985240951180458, + "reward_before_std": 0.5098276436328888, + "reward_change_max": 0.0, + "reward_change_mean": -0.08460111077874899, + "reward_change_min": -0.14587281458079815, + "reward_change_std": 0.057145274709910154, + "reward_std": 0.5126919187605381, + "rewards/cosine_scaled_reward": -0.19632380595430732, + "rewards/format_reward": 0.562500013038516, + "step": 434 + }, + { + "advantage_max": 0.9805018194019794, + "advantage_mean": -3.352761368535795e-08, + "advantage_min": -0.7217847239226103, + "advantage_std": 0.6718719862401485, + "completion_length": 2380.2500534057617, + "epoch": 0.49714285714285716, + "grad_norm": 1.3378220796585083, + "kl": 0.2252197265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4554267916537495e-07, + "loss": -0.0097, + "reward": 0.570288053364493, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.570288053364493, + "reward_after_std": 0.6718719862401485, + "reward_before_mean": 0.6957142185419798, + "reward_before_std": 0.6674840692430735, + "reward_change_max": 0.0, + "reward_change_mean": -0.12542616669088602, + "reward_change_min": -0.2183425473049283, + "reward_change_std": 0.07983373990282416, + "reward_std": 0.6718719974160194, + "rewards/cosine_scaled_reward": -0.07922623585909605, + "rewards/format_reward": 0.8541666716337204, + "step": 435 + }, + { + "advantage_max": 1.3979903608560562, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.9107687398791313, + "advantage_std": 0.8552838861942291, + "completion_length": 2412.479217529297, + "epoch": 0.4982857142857143, + "grad_norm": 0.7646883726119995, + "kl": 0.275054931640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4417536311769885e-07, + "loss": -0.0185, + "reward": 0.7218849333003163, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7218849333003163, + "reward_after_std": 0.8552838936448097, + "reward_before_mean": 0.8540548323653638, + "reward_before_std": 0.8422790169715881, + "reward_change_max": 9.534507989883423e-05, + "reward_change_mean": -0.13216988369822502, + "reward_change_min": -0.21940491441637278, + "reward_change_std": 0.0889145196415484, + "reward_std": 0.8552839010953903, + "rewards/cosine_scaled_reward": 0.08327740104869008, + "rewards/format_reward": 0.6875000149011612, + "step": 436 + }, + { + "advantage_max": 1.2850229367613792, + "advantage_mean": -1.4901161915492622e-08, + "advantage_min": -0.8939843475818634, + "advantage_std": 0.845024760812521, + "completion_length": 3216.6459045410156, + "epoch": 0.49942857142857144, + "grad_norm": 0.5741217732429504, + "kl": 0.383544921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0534, + "reward": 0.37288548797369003, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37288548797369003, + "reward_after_std": 0.8450247794389725, + "reward_before_mean": 0.4743971526622772, + "reward_before_std": 0.8533979505300522, + "reward_change_max": 0.00014210492372512817, + "reward_change_mean": -0.10151165537536144, + "reward_change_min": -0.2013682834804058, + "reward_change_std": 0.08048270735889673, + "reward_std": 0.8450247906148434, + "rewards/cosine_scaled_reward": -0.09613476321101189, + "rewards/format_reward": 0.6666666809469461, + "step": 437 + }, + { + "advantage_max": 1.011772993952036, + "advantage_mean": 1.8626452047421083e-09, + "advantage_min": -1.0724971368908882, + "advantage_std": 0.7722194865345955, + "completion_length": 3073.229248046875, + "epoch": 0.5005714285714286, + "grad_norm": 0.7125306129455566, + "kl": 0.4136962890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0266, + "reward": 0.41990641644224524, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41990641644224524, + "reward_after_std": 0.7722194865345955, + "reward_before_mean": 0.5311546549201012, + "reward_before_std": 0.7916942909359932, + "reward_change_max": 0.00031879544258117676, + "reward_change_mean": -0.11124823242425919, + "reward_change_min": -0.21365612745285034, + "reward_change_std": 0.08680269494652748, + "reward_std": 0.7722194939851761, + "rewards/cosine_scaled_reward": -0.036506010219454765, + "rewards/format_reward": 0.6041666772216558, + "step": 438 + }, + { + "advantage_max": 1.165678858757019, + "advantage_mean": 1.3969838813521207e-08, + "advantage_min": -0.8330172151327133, + "advantage_std": 0.7271898277103901, + "completion_length": 2788.0208892822266, + "epoch": 0.5017142857142857, + "grad_norm": 0.5193164348602295, + "kl": 0.2929534912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0461, + "reward": 0.04328843858093023, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04328843858093023, + "reward_after_std": 0.727189838886261, + "reward_before_mean": 0.11658047512173653, + "reward_before_std": 0.7333553172647953, + "reward_change_max": 0.0002767816185951233, + "reward_change_mean": -0.07329201139509678, + "reward_change_min": -0.1345710689201951, + "reward_change_std": 0.05936182173900306, + "reward_std": 0.7271898537874222, + "rewards/cosine_scaled_reward": -0.19170977361500263, + "rewards/format_reward": 0.5000000074505806, + "step": 439 + }, + { + "advantage_max": 0.95999875664711, + "advantage_mean": -2.5766592415266132e-08, + "advantage_min": -0.671626940369606, + "advantage_std": 0.6221942529082298, + "completion_length": 3048.4375762939453, + "epoch": 0.5028571428571429, + "grad_norm": 0.40915608406066895, + "kl": 0.3280029296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0393, + "reward": 0.22823767503723502, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22823767503723502, + "reward_after_std": 0.6221942454576492, + "reward_before_mean": 0.32270977552980185, + "reward_before_std": 0.6170362383127213, + "reward_change_max": 0.0, + "reward_change_mean": -0.09447211399674416, + "reward_change_min": -0.17463115137070417, + "reward_change_std": 0.06761556677520275, + "reward_std": 0.6221942603588104, + "rewards/cosine_scaled_reward": -0.09906180715188384, + "rewards/format_reward": 0.5208333488553762, + "step": 440 + }, + { + "advantage_max": 1.2837908938527107, + "advantage_mean": -8.071462387349015e-09, + "advantage_min": -1.015868429094553, + "advantage_std": 0.8546808660030365, + "completion_length": 3028.9583892822266, + "epoch": 0.504, + "grad_norm": 0.35905030369758606, + "kl": 0.2718505859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0362, + "reward": 0.4694512798450887, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4694512798450887, + "reward_after_std": 0.8546808734536171, + "reward_before_mean": 0.5803429093211889, + "reward_before_std": 0.8632423244416714, + "reward_change_max": 0.00014021247625350952, + "reward_change_mean": -0.11089161830022931, + "reward_change_min": -0.21306519862264395, + "reward_change_std": 0.08292792178690434, + "reward_std": 0.8546808958053589, + "rewards/cosine_scaled_reward": -0.05357855744659901, + "rewards/format_reward": 0.6875000111758709, + "step": 441 + }, + { + "advantage_max": 1.0009881034493446, + "advantage_mean": -2.220446049250313e-16, + "advantage_min": -0.8761501684784889, + "advantage_std": 0.7383112497627735, + "completion_length": 3243.604248046875, + "epoch": 0.5051428571428571, + "grad_norm": 0.4938925504684448, + "kl": 0.2559814453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0458, + "reward": 0.7655965192243457, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7655965192243457, + "reward_after_std": 0.7383112460374832, + "reward_before_mean": 0.9093698719516397, + "reward_before_std": 0.742842298001051, + "reward_change_max": 0.0, + "reward_change_mean": -0.14377336343750358, + "reward_change_min": -0.24906758219003677, + "reward_change_std": 0.10069033224135637, + "reward_std": 0.7383112534880638, + "rewards/cosine_scaled_reward": 0.12135160126490518, + "rewards/format_reward": 0.6666666734963655, + "step": 442 + }, + { + "advantage_max": 1.106526430696249, + "advantage_mean": -6.208816905051151e-09, + "advantage_min": -0.7966680526733398, + "advantage_std": 0.6943358443677425, + "completion_length": 3220.4791870117188, + "epoch": 0.5062857142857143, + "grad_norm": 0.3847130835056305, + "kl": 0.34722900390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0509, + "reward": 0.33592464402318, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33592464402318, + "reward_after_std": 0.6943358294665813, + "reward_before_mean": 0.4375531330006197, + "reward_before_std": 0.6889032647013664, + "reward_change_max": 0.0, + "reward_change_mean": -0.10162849072366953, + "reward_change_min": -0.17682076431810856, + "reward_change_std": 0.0679366267286241, + "reward_std": 0.6943358518183231, + "rewards/cosine_scaled_reward": -0.09372343437280506, + "rewards/format_reward": 0.6250000223517418, + "step": 443 + }, + { + "advantage_max": 1.1278332397341728, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.7738394141197205, + "advantage_std": 0.7633233778178692, + "completion_length": 3012.895896911621, + "epoch": 0.5074285714285715, + "grad_norm": 0.6785799860954285, + "kl": 0.29107666015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0142, + "reward": 0.12987452652305365, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12987452652305365, + "reward_after_std": 0.7633233852684498, + "reward_before_mean": 0.21220868080854416, + "reward_before_std": 0.7764322087168694, + "reward_change_max": 0.0003639534115791321, + "reward_change_mean": -0.0823341638315469, + "reward_change_min": -0.1722795907407999, + "reward_change_std": 0.0688524863217026, + "reward_std": 0.7633234038949013, + "rewards/cosine_scaled_reward": -0.15431233122944832, + "rewards/format_reward": 0.5208333395421505, + "step": 444 + }, + { + "advantage_max": 1.5314294025301933, + "advantage_mean": -2.483526961860605e-08, + "advantage_min": -0.8162381164729595, + "advantage_std": 0.9250198267400265, + "completion_length": 3092.7709045410156, + "epoch": 0.5085714285714286, + "grad_norm": 1.2392834424972534, + "kl": 0.28601837158203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0478, + "reward": 0.495319290086627, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.495319290086627, + "reward_after_std": 0.9250198528170586, + "reward_before_mean": 0.6040422953665257, + "reward_before_std": 0.9202543366700411, + "reward_change_max": 9.849667549133301e-05, + "reward_change_mean": -0.10872298898175359, + "reward_change_min": -0.20648100413382053, + "reward_change_std": 0.08262008614838123, + "reward_std": 0.925019882619381, + "rewards/cosine_scaled_reward": 0.010354459285736084, + "rewards/format_reward": 0.5833333414047956, + "step": 445 + }, + { + "advantage_max": 1.1504263132810593, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -1.0259768851101398, + "advantage_std": 0.782253373414278, + "completion_length": 3035.2708892822266, + "epoch": 0.5097142857142857, + "grad_norm": 0.3403970003128052, + "kl": 0.2048492431640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0314, + "reward": 0.4214175812667236, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4214175812667236, + "reward_after_std": 0.7822533883154392, + "reward_before_mean": 0.529712213203311, + "reward_before_std": 0.7904498800635338, + "reward_change_max": 0.00035788118839263916, + "reward_change_mean": -0.1082946015521884, + "reward_change_min": -0.18838701210916042, + "reward_change_std": 0.07935713930055499, + "reward_std": 0.7822534218430519, + "rewards/cosine_scaled_reward": -0.04764390899799764, + "rewards/format_reward": 0.6250000186264515, + "step": 446 + }, + { + "advantage_max": 0.8362768590450287, + "advantage_mean": -7.76102188071448e-09, + "advantage_min": -1.0166893266141415, + "advantage_std": 0.6835263110697269, + "completion_length": 2488.0625610351562, + "epoch": 0.5108571428571429, + "grad_norm": 0.8194699883460999, + "kl": 0.17340087890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0391, + "reward": 0.7818196527659893, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7818196527659893, + "reward_after_std": 0.6835263259708881, + "reward_before_mean": 0.9298957334831357, + "reward_before_std": 0.6929495614022017, + "reward_change_max": 0.0, + "reward_change_mean": -0.1480760732665658, + "reward_change_min": -0.23649182450026274, + "reward_change_std": 0.09618731448426843, + "reward_std": 0.6835263390094042, + "rewards/cosine_scaled_reward": 0.02744785137474537, + "rewards/format_reward": 0.8750000074505806, + "step": 447 + }, + { + "advantage_max": 1.0442199632525444, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.869889423251152, + "advantage_std": 0.7612986005842686, + "completion_length": 2814.187545776367, + "epoch": 0.512, + "grad_norm": 0.4111681878566742, + "kl": 0.24267578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0191, + "reward": 0.6223300769925117, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6223300769925117, + "reward_after_std": 0.761298593133688, + "reward_before_mean": 0.7515882328152657, + "reward_before_std": 0.7706958763301373, + "reward_change_max": 0.0003437027335166931, + "reward_change_mean": -0.12925810925662518, + "reward_change_min": -0.2418409138917923, + "reward_change_std": 0.0902960505336523, + "reward_std": 0.761298593133688, + "rewards/cosine_scaled_reward": 0.0007940866053104401, + "rewards/format_reward": 0.7500000055879354, + "step": 448 + }, + { + "advantage_max": 0.8848019167780876, + "advantage_mean": -1.1796752796833232e-08, + "advantage_min": -1.0409802421927452, + "advantage_std": 0.7183829769492149, + "completion_length": 2573.0000610351562, + "epoch": 0.5131428571428571, + "grad_norm": 0.21112467348575592, + "kl": 0.1236724853515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0106, + "reward": 0.6152279544621706, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6152279544621706, + "reward_after_std": 0.7183829881250858, + "reward_before_mean": 0.7470788657665253, + "reward_before_std": 0.7360207587480545, + "reward_change_max": 0.0003293454647064209, + "reward_change_mean": -0.13185090059414506, + "reward_change_min": -0.22446555085480213, + "reward_change_std": 0.09343716083094478, + "reward_std": 0.7183830142021179, + "rewards/cosine_scaled_reward": 0.029789404943585396, + "rewards/format_reward": 0.6875000167638063, + "step": 449 + }, + { + "advantage_max": 0.9070663601160049, + "advantage_mean": 6.208816238917336e-10, + "advantage_min": -0.7719485089182854, + "advantage_std": 0.6348018012940884, + "completion_length": 2985.458366394043, + "epoch": 0.5142857142857142, + "grad_norm": 0.2994707524776459, + "kl": 0.1786041259765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.015, + "reward": 0.2393764741718769, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2393764741718769, + "reward_after_std": 0.6348018012940884, + "reward_before_mean": 0.3358287693117745, + "reward_before_std": 0.6404321789741516, + "reward_change_max": 8.175522089004517e-05, + "reward_change_mean": -0.09645229065790772, + "reward_change_min": -0.18162197712808847, + "reward_change_std": 0.06990106124430895, + "reward_std": 0.6348018124699593, + "rewards/cosine_scaled_reward": -0.12375229911413044, + "rewards/format_reward": 0.5833333395421505, + "step": 450 + }, + { + "advantage_max": 1.2291451916098595, + "advantage_mean": -4.656612540010485e-09, + "advantage_min": -0.9268405549228191, + "advantage_std": 0.8121864721179008, + "completion_length": 2530.9375534057617, + "epoch": 0.5154285714285715, + "grad_norm": 0.9159375429153442, + "kl": 0.1414642333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0199, + "reward": 0.6339735409710556, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6339735409710556, + "reward_after_std": 0.8121864832937717, + "reward_before_mean": 0.7607014384120703, + "reward_before_std": 0.8097108751535416, + "reward_change_max": 0.0, + "reward_change_mean": -0.1267279153689742, + "reward_change_min": -0.22259232308715582, + "reward_change_std": 0.0881945351138711, + "reward_std": 0.8121864981949329, + "rewards/cosine_scaled_reward": 0.07826739549636841, + "rewards/format_reward": 0.6041666772216558, + "step": 451 + }, + { + "advantage_max": 1.7310744225978851, + "advantage_mean": -2.6077032533322608e-08, + "advantage_min": -1.0266026742756367, + "advantage_std": 1.0066105760633945, + "completion_length": 3234.8334045410156, + "epoch": 0.5165714285714286, + "grad_norm": 0.441463828086853, + "kl": 0.22369384765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0203, + "reward": 0.507617705501616, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.507617705501616, + "reward_after_std": 1.0066105723381042, + "reward_before_mean": 0.6138361915946007, + "reward_before_std": 0.9988056197762489, + "reward_change_max": 0.00042747706174850464, + "reward_change_mean": -0.10621848376467824, + "reward_change_min": -0.20366192236542702, + "reward_change_std": 0.08123700972646475, + "reward_std": 1.0066106170415878, + "rewards/cosine_scaled_reward": -0.005581922363489866, + "rewards/format_reward": 0.6250000149011612, + "step": 452 + }, + { + "advantage_max": 1.331640511751175, + "advantage_mean": -2.1730860721991263e-08, + "advantage_min": -1.3682594373822212, + "advantage_std": 1.0288936495780945, + "completion_length": 2916.625102996826, + "epoch": 0.5177142857142857, + "grad_norm": 0.6561626195907593, + "kl": 0.218505859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0292, + "reward": 0.7023884258233011, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7023884258233011, + "reward_after_std": 1.0288936868309975, + "reward_before_mean": 0.8339581657201052, + "reward_before_std": 1.0595490783452988, + "reward_change_max": 2.3633241653442383e-05, + "reward_change_mean": -0.13156970776617527, + "reward_change_min": -0.2569648250937462, + "reward_change_std": 0.10804524039849639, + "reward_std": 1.0288937389850616, + "rewards/cosine_scaled_reward": 0.07322905701585114, + "rewards/format_reward": 0.6875000223517418, + "step": 453 + }, + { + "advantage_max": 1.180284183472395, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.8286248743534088, + "advantage_std": 0.7885466255247593, + "completion_length": 2831.9584045410156, + "epoch": 0.5188571428571429, + "grad_norm": 0.9318577647209167, + "kl": 0.230743408203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0306, + "reward": 0.11954196076840162, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11954196076840162, + "reward_after_std": 0.788546621799469, + "reward_before_mean": 0.19991375133395195, + "reward_before_std": 0.8013152182102203, + "reward_change_max": 0.0003981366753578186, + "reward_change_mean": -0.08037177752703428, + "reward_change_min": -0.1756641110405326, + "reward_change_std": 0.07187676522880793, + "reward_std": 0.7885466404259205, + "rewards/cosine_scaled_reward": -0.16045980621129274, + "rewards/format_reward": 0.5208333432674408, + "step": 454 + }, + { + "advantage_max": 1.049359679222107, + "advantage_mean": -4.65661276205509e-09, + "advantage_min": -0.8113865703344345, + "advantage_std": 0.7036336697638035, + "completion_length": 3105.0834045410156, + "epoch": 0.52, + "grad_norm": 0.4618373215198517, + "kl": 0.35552978515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.220245676671809e-07, + "loss": 0.029, + "reward": 0.06979416310787201, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06979416310787201, + "reward_after_std": 0.7036336623132229, + "reward_before_mean": 0.14800235256552696, + "reward_before_std": 0.7142827957868576, + "reward_change_max": 0.0004022940993309021, + "reward_change_mean": -0.07820818712934852, + "reward_change_min": -0.158931165933609, + "reward_change_std": 0.06535158446058631, + "reward_std": 0.7036336846649647, + "rewards/cosine_scaled_reward": -0.20724883582443, + "rewards/format_reward": 0.5625000074505806, + "step": 455 + }, + { + "advantage_max": 1.2895260006189346, + "advantage_mean": 3.104408841103634e-09, + "advantage_min": -0.9913373813033104, + "advantage_std": 0.830539807677269, + "completion_length": 3312.8334045410156, + "epoch": 0.5211428571428571, + "grad_norm": 0.4404752850532532, + "kl": 0.27899169921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0332, + "reward": 0.1983939576894045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1983939576894045, + "reward_after_std": 0.8305398598313332, + "reward_before_mean": 0.28388113901019096, + "reward_before_std": 0.8402737453579903, + "reward_change_max": 0.0003027394413948059, + "reward_change_mean": -0.08548715803772211, + "reward_change_min": -0.18772473465651274, + "reward_change_std": 0.07269950956106186, + "reward_std": 0.8305399157106876, + "rewards/cosine_scaled_reward": -0.170559449121356, + "rewards/format_reward": 0.6250000223517418, + "step": 456 + }, + { + "advantage_max": 0.982379175722599, + "advantage_mean": 5.510325196134147e-09, + "advantage_min": -0.8538262210786343, + "advantage_std": 0.6569894328713417, + "completion_length": 3048.4375610351562, + "epoch": 0.5222857142857142, + "grad_norm": 0.39246371388435364, + "kl": 0.21966552734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0653, + "reward": 0.3919263742864132, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3919263742864132, + "reward_after_std": 0.6569894254207611, + "reward_before_mean": 0.5009766188450158, + "reward_before_std": 0.6586984917521477, + "reward_change_max": 0.00028721243143081665, + "reward_change_mean": -0.10905022826045752, + "reward_change_min": -0.18408588599413633, + "reward_change_std": 0.07368638599291444, + "reward_std": 0.6569894477725029, + "rewards/cosine_scaled_reward": -0.03076169639825821, + "rewards/format_reward": 0.5625000074505806, + "step": 457 + }, + { + "advantage_max": 1.3128767609596252, + "advantage_mean": -2.6077032977411818e-08, + "advantage_min": -0.8374460823833942, + "advantage_std": 0.8251619078218937, + "completion_length": 2803.0000915527344, + "epoch": 0.5234285714285715, + "grad_norm": 0.8610221743583679, + "kl": 0.218505859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0219, + "reward": 0.32338718697428703, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.32338718697428703, + "reward_after_std": 0.8251619152724743, + "reward_before_mean": 0.4198822174221277, + "reward_before_std": 0.8259499967098236, + "reward_change_max": 0.00017363578081130981, + "reward_change_mean": -0.09649505442939699, + "reward_change_min": -0.19915975630283356, + "reward_change_std": 0.07579607097432017, + "reward_std": 0.8251619413495064, + "rewards/cosine_scaled_reward": -0.14422556664794683, + "rewards/format_reward": 0.708333333954215, + "step": 458 + }, + { + "advantage_max": 1.0017457380890846, + "advantage_mean": -3.72529057601767e-09, + "advantage_min": -1.1438434720039368, + "advantage_std": 0.8000783547759056, + "completion_length": 2578.291717529297, + "epoch": 0.5245714285714286, + "grad_norm": 0.5348182916641235, + "kl": 0.27069091796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.054, + "reward": 0.6234663780778646, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6234663780778646, + "reward_after_std": 0.8000783659517765, + "reward_before_mean": 0.7535809008404613, + "reward_before_std": 0.821883074939251, + "reward_change_max": 9.752810001373291e-06, + "reward_change_mean": -0.130114508792758, + "reward_change_min": -0.23948706313967705, + "reward_change_std": 0.09379158820956945, + "reward_std": 0.8000783957540989, + "rewards/cosine_scaled_reward": 0.04345710389316082, + "rewards/format_reward": 0.6666666772216558, + "step": 459 + }, + { + "advantage_max": 1.278119184076786, + "advantage_mean": -1.0244548737103898e-08, + "advantage_min": -0.9226012080907822, + "advantage_std": 0.8290133886039257, + "completion_length": 3373.6250915527344, + "epoch": 0.5257142857142857, + "grad_norm": 0.5007908940315247, + "kl": 0.3909912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0507, + "reward": 0.09350781515240669, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09350781515240669, + "reward_after_std": 0.8290134109556675, + "reward_before_mean": 0.16933209914714098, + "reward_before_std": 0.8431417122483253, + "reward_change_max": 1.5437602996826172e-05, + "reward_change_mean": -0.07582429051399231, + "reward_change_min": -0.16332056745886803, + "reward_change_std": 0.06777232186868787, + "reward_std": 0.8290134258568287, + "rewards/cosine_scaled_reward": -0.17575062531977892, + "rewards/format_reward": 0.5208333488553762, + "step": 460 + }, + { + "advantage_max": 1.0003699101507664, + "advantage_mean": 3.104408563547878e-09, + "advantage_min": -0.7306459732353687, + "advantage_std": 0.6616128720343113, + "completion_length": 3015.0000610351562, + "epoch": 0.5268571428571428, + "grad_norm": 0.7025476694107056, + "kl": 0.32537841796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0168, + "reward": 0.6671255268156528, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6671255268156528, + "reward_after_std": 0.6616128571331501, + "reward_before_mean": 0.8014153416152112, + "reward_before_std": 0.652360200881958, + "reward_change_max": 0.0, + "reward_change_mean": -0.13428980251774192, + "reward_change_min": -0.2290246021002531, + "reward_change_std": 0.08612646535038948, + "reward_std": 0.6616128906607628, + "rewards/cosine_scaled_reward": 0.0882076546549797, + "rewards/format_reward": 0.6250000093132257, + "step": 461 + }, + { + "advantage_max": 0.9321895092725754, + "advantage_mean": 4.346172144398253e-09, + "advantage_min": -0.8385965377092361, + "advantage_std": 0.6647354513406754, + "completion_length": 3085.791763305664, + "epoch": 0.528, + "grad_norm": 0.46087542176246643, + "kl": 0.250518798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0394, + "reward": 0.05673941969871521, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05673941969871521, + "reward_after_std": 0.6647354401648045, + "reward_before_mean": 0.13604146614670753, + "reward_before_std": 0.6804654821753502, + "reward_change_max": 0.00011178851127624512, + "reward_change_mean": -0.0793020457495004, + "reward_change_min": -0.16675184946507215, + "reward_change_std": 0.06660343706607819, + "reward_std": 0.6647354438900948, + "rewards/cosine_scaled_reward": -0.20281260646879673, + "rewards/format_reward": 0.5416666846722364, + "step": 462 + }, + { + "advantage_max": 1.1660524047911167, + "advantage_mean": -1.7384688633104162e-08, + "advantage_min": -0.8106147684156895, + "advantage_std": 0.7458341121673584, + "completion_length": 3201.1459197998047, + "epoch": 0.5291428571428571, + "grad_norm": 0.5006526708602905, + "kl": 0.373992919921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.024, + "reward": 0.18125806841999292, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18125806841999292, + "reward_after_std": 0.7458341140300035, + "reward_before_mean": 0.2674159649759531, + "reward_before_std": 0.7493571043014526, + "reward_change_max": 0.0, + "reward_change_mean": -0.0861579212360084, + "reward_change_min": -0.16297503747045994, + "reward_change_std": 0.0660725818015635, + "reward_std": 0.7458341177552938, + "rewards/cosine_scaled_reward": -0.1162920305505395, + "rewards/format_reward": 0.5000000074505806, + "step": 463 + }, + { + "advantage_max": 1.0869667418301105, + "advantage_mean": -3.2906732894133484e-08, + "advantage_min": -0.6707272604107857, + "advantage_std": 0.6322797127068043, + "completion_length": 2362.604217529297, + "epoch": 0.5302857142857142, + "grad_norm": 0.8603577017784119, + "kl": 0.2738189697265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1413757749211602e-07, + "loss": -0.0015, + "reward": 0.8882392137311399, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8882392137311399, + "reward_after_std": 0.632279708981514, + "reward_before_mean": 1.0414677765220404, + "reward_before_std": 0.5925928438082337, + "reward_change_max": 9.009987115859985e-05, + "reward_change_mean": -0.15322855673730373, + "reward_change_min": -0.22892808262258768, + "reward_change_std": 0.08658680645748973, + "reward_std": 0.6322797238826752, + "rewards/cosine_scaled_reward": 0.11448386963456869, + "rewards/format_reward": 0.8125000111758709, + "step": 464 + }, + { + "advantage_max": 1.2360083982348442, + "advantage_mean": -1.2417636363615259e-09, + "advantage_min": -0.8381345644593239, + "advantage_std": 0.7440574951469898, + "completion_length": 2958.4583740234375, + "epoch": 0.5314285714285715, + "grad_norm": 0.4997173249721527, + "kl": 0.386474609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0389, + "reward": 0.08523056594276568, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08523056594276568, + "reward_after_std": 0.7440575174987316, + "reward_before_mean": 0.1612645024433732, + "reward_before_std": 0.7435659244656563, + "reward_change_max": 0.00045468658208847046, + "reward_change_mean": -0.07603393588215113, + "reward_change_min": -0.14358344301581383, + "reward_change_std": 0.059222247917205095, + "reward_std": 0.744057547301054, + "rewards/cosine_scaled_reward": -0.23186775855720043, + "rewards/format_reward": 0.625000013038516, + "step": 465 + }, + { + "advantage_max": 1.3792641945183277, + "advantage_mean": -3.104408785592483e-09, + "advantage_min": -0.9853333793580532, + "advantage_std": 0.900844220072031, + "completion_length": 3042.2500610351562, + "epoch": 0.5325714285714286, + "grad_norm": 0.44595518708229065, + "kl": 0.2362060546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0166, + "reward": 0.6323232520371675, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6323232520371675, + "reward_after_std": 0.9008442088961601, + "reward_before_mean": 0.7564093824476004, + "reward_before_std": 0.9056645855307579, + "reward_change_max": 0.00016022473573684692, + "reward_change_mean": -0.12408614112064242, + "reward_change_min": -0.23610640596598387, + "reward_change_std": 0.09193866746500134, + "reward_std": 0.9008442126214504, + "rewards/cosine_scaled_reward": 0.06570468074642122, + "rewards/format_reward": 0.6250000074505806, + "step": 466 + }, + { + "advantage_max": 1.192842148244381, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -0.8308476731181145, + "advantage_std": 0.7643921412527561, + "completion_length": 3214.166748046875, + "epoch": 0.5337142857142857, + "grad_norm": 0.5949474573135376, + "kl": 0.2874755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0319, + "reward": 0.46276107244193554, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46276107244193554, + "reward_after_std": 0.7643921487033367, + "reward_before_mean": 0.5747087150812149, + "reward_before_std": 0.7634487450122833, + "reward_change_max": 0.00010569393634796143, + "reward_change_mean": -0.11194764589890838, + "reward_change_min": -0.20960522443056107, + "reward_change_std": 0.08081883913837373, + "reward_std": 0.7643921747803688, + "rewards/cosine_scaled_reward": -0.045978982001543045, + "rewards/format_reward": 0.6666666846722364, + "step": 467 + }, + { + "advantage_max": 1.1976465657353401, + "advantage_mean": -4.3461720111714897e-08, + "advantage_min": -0.9038081467151642, + "advantage_std": 0.8358358480036259, + "completion_length": 3259.104217529297, + "epoch": 0.5348571428571428, + "grad_norm": 0.9248505234718323, + "kl": 0.40380859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0779, + "reward": 0.3013373212888837, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3013373212888837, + "reward_after_std": 0.8358358442783356, + "reward_before_mean": 0.3982266914099455, + "reward_before_std": 0.8553115110844374, + "reward_change_max": 8.344650268554688e-05, + "reward_change_mean": -0.09688938385806978, + "reward_change_min": -0.21377692930400372, + "reward_change_std": 0.08293247455731034, + "reward_std": 0.8358358591794968, + "rewards/cosine_scaled_reward": -0.07171999849379063, + "rewards/format_reward": 0.541666679084301, + "step": 468 + }, + { + "advantage_max": 1.1891687586903572, + "advantage_mean": -1.986821618338297e-08, + "advantage_min": -0.9869108945131302, + "advantage_std": 0.7981296181678772, + "completion_length": 2615.1875915527344, + "epoch": 0.536, + "grad_norm": 0.4499533176422119, + "kl": 0.39539337158203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0624, + "reward": 0.42476166412234306, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.42476166412234306, + "reward_after_std": 0.7981296330690384, + "reward_before_mean": 0.5331416502594948, + "reward_before_std": 0.8063963502645493, + "reward_change_max": 0.0, + "reward_change_mean": -0.10837997868657112, + "reward_change_min": -0.2086690105497837, + "reward_change_std": 0.08071521436795592, + "reward_std": 0.7981296479701996, + "rewards/cosine_scaled_reward": -0.0250958614051342, + "rewards/format_reward": 0.5833333395421505, + "step": 469 + }, + { + "advantage_max": 1.1581247821450233, + "advantage_mean": 1.365939888975376e-08, + "advantage_min": -0.7778391763567924, + "advantage_std": 0.722015731036663, + "completion_length": 3278.6876220703125, + "epoch": 0.5371428571428571, + "grad_norm": 0.613844096660614, + "kl": 0.3515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0299, + "reward": 0.41915637208148837, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41915637208148837, + "reward_after_std": 0.7220157347619534, + "reward_before_mean": 0.5277159176766872, + "reward_before_std": 0.7151578031480312, + "reward_change_max": 0.0, + "reward_change_mean": -0.10855951346457005, + "reward_change_min": -0.1773476181551814, + "reward_change_std": 0.0683783870190382, + "reward_std": 0.7220157794654369, + "rewards/cosine_scaled_reward": -0.07989205606281757, + "rewards/format_reward": 0.6875000186264515, + "step": 470 + }, + { + "advantage_max": 1.2751684002578259, + "advantage_mean": -1.1020650836357504e-08, + "advantage_min": -1.0654745399951935, + "advantage_std": 0.8733825888484716, + "completion_length": 3038.0625610351562, + "epoch": 0.5382857142857143, + "grad_norm": 0.466342031955719, + "kl": 0.26397705078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0334, + "reward": 0.7088228650391102, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7088228650391102, + "reward_after_std": 0.8733825515955687, + "reward_before_mean": 0.841802254319191, + "reward_before_std": 0.879924139007926, + "reward_change_max": 8.447468280792236e-05, + "reward_change_mean": -0.13297936227172613, + "reward_change_min": -0.2324786437675357, + "reward_change_std": 0.09299619169905782, + "reward_std": 0.8733825888484716, + "rewards/cosine_scaled_reward": 0.06673444528132677, + "rewards/format_reward": 0.7083333525806665, + "step": 471 + }, + { + "advantage_max": 1.2374247685074806, + "advantage_mean": -1.6453366308288864e-08, + "advantage_min": -0.881297804415226, + "advantage_std": 0.8313139267265797, + "completion_length": 3053.979248046875, + "epoch": 0.5394285714285715, + "grad_norm": 0.41991597414016724, + "kl": 0.28839111328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.027, + "reward": 0.39327038638293743, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.39327038638293743, + "reward_after_std": 0.8313139192759991, + "reward_before_mean": 0.49783482804195955, + "reward_before_std": 0.8396384790539742, + "reward_change_max": 0.0, + "reward_change_mean": -0.10456445254385471, + "reward_change_min": -0.21990286745131016, + "reward_change_std": 0.0824456731788814, + "reward_std": 0.8313139192759991, + "rewards/cosine_scaled_reward": -0.0948325915960595, + "rewards/format_reward": 0.6875000111758709, + "step": 472 + }, + { + "advantage_max": 1.042616032063961, + "advantage_mean": 9.313226134732844e-09, + "advantage_min": -0.8628039546310902, + "advantage_std": 0.711292676627636, + "completion_length": 3158.979217529297, + "epoch": 0.5405714285714286, + "grad_norm": 0.7266313433647156, + "kl": 0.2745819091796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0557, + "reward": 0.16985632851719856, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16985632851719856, + "reward_after_std": 0.7112926989793777, + "reward_before_mean": 0.25732479616999626, + "reward_before_std": 0.720692828297615, + "reward_change_max": 0.00014837831258773804, + "reward_change_mean": -0.08746843785047531, + "reward_change_min": -0.16882281191647053, + "reward_change_std": 0.06796854501590133, + "reward_std": 0.711292702704668, + "rewards/cosine_scaled_reward": -0.14217094890773296, + "rewards/format_reward": 0.5416666772216558, + "step": 473 + }, + { + "advantage_max": 1.1585516035556793, + "advantage_mean": -2.0799537953086755e-08, + "advantage_min": -0.8155636340379715, + "advantage_std": 0.7681524343788624, + "completion_length": 2514.1458740234375, + "epoch": 0.5417142857142857, + "grad_norm": 0.43213552236557007, + "kl": 0.24603271484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0473, + "reward": 0.7646679431200027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7646679431200027, + "reward_after_std": 0.7681524567306042, + "reward_before_mean": 0.9050909709185362, + "reward_before_std": 0.7603873573243618, + "reward_change_max": 0.0, + "reward_change_mean": -0.1404230184853077, + "reward_change_min": -0.25275282841175795, + "reward_change_std": 0.0934959203004837, + "reward_std": 0.7681524753570557, + "rewards/cosine_scaled_reward": 0.10879547521471977, + "rewards/format_reward": 0.687500013038516, + "step": 474 + }, + { + "advantage_max": 1.455761842429638, + "advantage_mean": -2.4214387495113954e-08, + "advantage_min": -1.21954682841897, + "advantage_std": 0.9812054596841335, + "completion_length": 2689.604263305664, + "epoch": 0.5428571428571428, + "grad_norm": 1.0528361797332764, + "kl": 0.314056396484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0525, + "reward": 0.8001913847401738, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8001913847401738, + "reward_after_std": 0.9812054559588432, + "reward_before_mean": 0.9382050596177578, + "reward_before_std": 0.9881223868578672, + "reward_change_max": 0.00012006610631942749, + "reward_change_mean": -0.13801368651911616, + "reward_change_min": -0.24854467622935772, + "reward_change_std": 0.1031024232506752, + "reward_std": 0.981205478310585, + "rewards/cosine_scaled_reward": 0.14618585677817464, + "rewards/format_reward": 0.6458333469927311, + "step": 475 + }, + { + "advantage_max": 1.6118089109659195, + "advantage_mean": -2.4835262735223296e-09, + "advantage_min": -1.260014183819294, + "advantage_std": 1.1286941394209862, + "completion_length": 2836.4584350585938, + "epoch": 0.544, + "grad_norm": 1.7638888359069824, + "kl": 0.315277099609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.063017833182728e-07, + "loss": 0.1327, + "reward": 0.7785492744296789, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7785492744296789, + "reward_after_std": 1.1286941319704056, + "reward_before_mean": 0.9117186167277396, + "reward_before_std": 1.1516238190233707, + "reward_change_max": 0.00016049295663833618, + "reward_change_mean": -0.13316931016743183, + "reward_change_min": -0.28096507117152214, + "reward_change_std": 0.11068580951541662, + "reward_std": 1.128694150596857, + "rewards/cosine_scaled_reward": 0.09127595031168312, + "rewards/format_reward": 0.7291666828095913, + "step": 476 + }, + { + "advantage_max": 1.0612576007843018, + "advantage_mean": -6.332993651714247e-08, + "advantage_min": -1.2174795642495155, + "advantage_std": 0.8141191489994526, + "completion_length": 2536.416732788086, + "epoch": 0.5451428571428572, + "grad_norm": 1.627563714981079, + "kl": 0.17498779296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0816, + "reward": 1.1968099847435951, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.1968099847435951, + "reward_after_std": 0.8141191452741623, + "reward_before_mean": 1.3791041374206543, + "reward_before_std": 0.8197311982512474, + "reward_change_max": 0.00014181435108184814, + "reward_change_mean": -0.18229419272392988, + "reward_change_min": -0.280188612639904, + "reward_change_std": 0.1127679473720491, + "reward_std": 0.8141191527247429, + "rewards/cosine_scaled_reward": 0.2312187310308218, + "rewards/format_reward": 0.916666679084301, + "step": 477 + }, + { + "advantage_max": 1.0841128677129745, + "advantage_mean": -8.692343900218447e-09, + "advantage_min": -0.8327232263982296, + "advantage_std": 0.7114966996014118, + "completion_length": 2950.1459045410156, + "epoch": 0.5462857142857143, + "grad_norm": 0.3843514323234558, + "kl": 0.3557281494140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0494, + "reward": 0.19972308538854122, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.19972308538854122, + "reward_after_std": 0.7114967443048954, + "reward_before_mean": 0.28897368628531694, + "reward_before_std": 0.7166823334991932, + "reward_change_max": 2.7410686016082764e-05, + "reward_change_mean": -0.08925061579793692, + "reward_change_min": -0.16781168803572655, + "reward_change_std": 0.0676483353599906, + "reward_std": 0.7114967629313469, + "rewards/cosine_scaled_reward": -0.14717982709407806, + "rewards/format_reward": 0.583333345130086, + "step": 478 + }, + { + "advantage_max": 1.284256212413311, + "advantage_mean": -1.4280279625467074e-08, + "advantage_min": -0.9000277370214462, + "advantage_std": 0.8560518994927406, + "completion_length": 3038.6875762939453, + "epoch": 0.5474285714285714, + "grad_norm": 0.36497536301612854, + "kl": 0.350341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0193, + "reward": 0.265819541644305, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.265819541644305, + "reward_after_std": 0.8560518845915794, + "reward_before_mean": 0.35775492154061794, + "reward_before_std": 0.8692752569913864, + "reward_change_max": 0.0, + "reward_change_mean": -0.09193538874387741, + "reward_change_min": -0.21421499364078045, + "reward_change_std": 0.0782300722785294, + "reward_std": 0.856051929295063, + "rewards/cosine_scaled_reward": -0.1440392080694437, + "rewards/format_reward": 0.6458333507180214, + "step": 479 + }, + { + "advantage_max": 1.2477297633886337, + "advantage_mean": -6.208817904251873e-10, + "advantage_min": -0.8636210225522518, + "advantage_std": 0.7761502750217915, + "completion_length": 2664.708366394043, + "epoch": 0.5485714285714286, + "grad_norm": 0.6572485566139221, + "kl": 0.333465576171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0535, + "reward": 0.34925887174904346, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34925887174904346, + "reward_after_std": 0.7761502973735332, + "reward_before_mean": 0.44965414330363274, + "reward_before_std": 0.7742989175021648, + "reward_change_max": 0.0005605295300483704, + "reward_change_mean": -0.10039527481421828, + "reward_change_min": -0.18748685158789158, + "reward_change_std": 0.07196449814364314, + "reward_std": 0.7761503122746944, + "rewards/cosine_scaled_reward": -0.13975626602768898, + "rewards/format_reward": 0.729166679084301, + "step": 480 + }, + { + "advantage_max": 1.130104836076498, + "advantage_mean": -6.519258577419862e-09, + "advantage_min": -0.8260618653148413, + "advantage_std": 0.7510270290076733, + "completion_length": 3182.3959045410156, + "epoch": 0.5497142857142857, + "grad_norm": 0.7979830503463745, + "kl": 0.3316650390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0327, + "reward": 0.225021761842072, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.225021761842072, + "reward_after_std": 0.7510270290076733, + "reward_before_mean": 0.3158915303647518, + "reward_before_std": 0.7570718768984079, + "reward_change_max": 0.0, + "reward_change_mean": -0.09086976759135723, + "reward_change_min": -0.186422617174685, + "reward_change_std": 0.0734526920132339, + "reward_std": 0.7510270439088345, + "rewards/cosine_scaled_reward": -0.19622090552002192, + "rewards/format_reward": 0.7083333507180214, + "step": 481 + }, + { + "advantage_max": 1.2530571520328522, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -1.070625051856041, + "advantage_std": 0.8753399774432182, + "completion_length": 2833.3750762939453, + "epoch": 0.5508571428571428, + "grad_norm": 1.2419958114624023, + "kl": 0.3258514404296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.041, + "reward": 0.8800165746361017, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8800165746361017, + "reward_after_std": 0.8753399886190891, + "reward_before_mean": 1.0293409526348114, + "reward_before_std": 0.8727457858622074, + "reward_change_max": 0.00011719763278961182, + "reward_change_mean": -0.14932430908083916, + "reward_change_min": -0.2527621230110526, + "reward_change_std": 0.10746940225362778, + "reward_std": 0.8753399923443794, + "rewards/cosine_scaled_reward": 0.19175377115607262, + "rewards/format_reward": 0.645833345130086, + "step": 482 + }, + { + "advantage_max": 1.1174155697226524, + "advantage_mean": -2.266218279700638e-08, + "advantage_min": -0.8972061052918434, + "advantage_std": 0.7504922579973936, + "completion_length": 3080.7084350585938, + "epoch": 0.552, + "grad_norm": 0.5229855179786682, + "kl": 0.44921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0455, + "reward": 0.38609249144792557, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.38609249144792557, + "reward_after_std": 0.7504922654479742, + "reward_before_mean": 0.4921985100954771, + "reward_before_std": 0.7534452546387911, + "reward_change_max": 0.0002275332808494568, + "reward_change_mean": -0.10610603354871273, + "reward_change_min": -0.2087285201996565, + "reward_change_std": 0.08030886575579643, + "reward_std": 0.7504922710359097, + "rewards/cosine_scaled_reward": -0.10806741891428828, + "rewards/format_reward": 0.7083333395421505, + "step": 483 + }, + { + "advantage_max": 1.2352019250392914, + "advantage_mean": -2.4835269951672956e-08, + "advantage_min": -1.0424505099654198, + "advantage_std": 0.8791324347257614, + "completion_length": 2653.541763305664, + "epoch": 0.5531428571428572, + "grad_norm": 1.738598346710205, + "kl": 0.27093505859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0924, + "reward": 0.667680477257818, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.667680477257818, + "reward_after_std": 0.8791324757039547, + "reward_before_mean": 0.7975321440026164, + "reward_before_std": 0.8878633677959442, + "reward_change_max": 0.00017753243446350098, + "reward_change_mean": -0.1298516825772822, + "reward_change_min": -0.23955402616411448, + "reward_change_std": 0.09890555776655674, + "reward_std": 0.8791324906051159, + "rewards/cosine_scaled_reward": 0.02376605849713087, + "rewards/format_reward": 0.7500000186264515, + "step": 484 + }, + { + "advantage_max": 1.1654871627688408, + "advantage_mean": -2.3903947460324204e-08, + "advantage_min": -1.0133636444807053, + "advantage_std": 0.8400661423802376, + "completion_length": 2845.104248046875, + "epoch": 0.5542857142857143, + "grad_norm": 0.7887821793556213, + "kl": 0.353485107421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0404, + "reward": 0.620442176819779, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.620442176819779, + "reward_after_std": 0.8400661423802376, + "reward_before_mean": 0.747199842473492, + "reward_before_std": 0.8507244884967804, + "reward_change_max": 0.0, + "reward_change_mean": -0.12675767857581377, + "reward_change_min": -0.24706415832042694, + "reward_change_std": 0.09639016725122929, + "reward_std": 0.8400661535561085, + "rewards/cosine_scaled_reward": 0.01943324040621519, + "rewards/format_reward": 0.7083333507180214, + "step": 485 + }, + { + "advantage_max": 1.2393508404493332, + "advantage_mean": -2.545615163107584e-08, + "advantage_min": -0.7347784452140331, + "advantage_std": 0.740032946690917, + "completion_length": 2606.354248046875, + "epoch": 0.5554285714285714, + "grad_norm": 0.6385639309883118, + "kl": 0.3489837646484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.035, + "reward": 0.5880567478016019, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5880567478016019, + "reward_after_std": 0.7400329541414976, + "reward_before_mean": 0.7102154456079006, + "reward_before_std": 0.7220157235860825, + "reward_change_max": 0.0, + "reward_change_mean": -0.12215872760862112, + "reward_change_min": -0.20704109221696854, + "reward_change_std": 0.07904792111366987, + "reward_std": 0.740032970905304, + "rewards/cosine_scaled_reward": -0.01989229116588831, + "rewards/format_reward": 0.7500000111758709, + "step": 486 + }, + { + "advantage_max": 1.4690705388784409, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -1.2353796288371086, + "advantage_std": 1.0522098690271378, + "completion_length": 2319.312568664551, + "epoch": 0.5565714285714286, + "grad_norm": 0.7796392440795898, + "kl": 0.3911285400390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0243, + "reward": 0.6309237442910671, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6309237442910671, + "reward_after_std": 1.0522098690271378, + "reward_before_mean": 0.7531594757456332, + "reward_before_std": 1.0770602822303772, + "reward_change_max": 0.0, + "reward_change_mean": -0.12223571492359042, + "reward_change_min": -0.26455293595790863, + "reward_change_std": 0.10230514733120799, + "reward_std": 1.0522098764777184, + "rewards/cosine_scaled_reward": 0.0744963875040412, + "rewards/format_reward": 0.6041666772216558, + "step": 487 + }, + { + "advantage_max": 0.8800669386982918, + "advantage_mean": -2.856055891786724e-08, + "advantage_min": -0.706706915050745, + "advantage_std": 0.5929678082466125, + "completion_length": 2522.666732788086, + "epoch": 0.5577142857142857, + "grad_norm": 0.30417007207870483, + "kl": 0.33161163330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.029, + "reward": 0.5513467136770487, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5513467136770487, + "reward_after_std": 0.592967800796032, + "reward_before_mean": 0.6770103182643652, + "reward_before_std": 0.5850005187094212, + "reward_change_max": 2.4974346160888672e-05, + "reward_change_mean": -0.12566362135112286, + "reward_change_min": -0.20945844426751137, + "reward_change_std": 0.07968493644148111, + "reward_std": 0.5929678119719028, + "rewards/cosine_scaled_reward": -0.005244861356914043, + "rewards/format_reward": 0.6875000074505806, + "step": 488 + }, + { + "advantage_max": 0.7658723294734955, + "advantage_mean": 3.725290464995368e-09, + "advantage_min": -0.7253937609493732, + "advantage_std": 0.5821077227592468, + "completion_length": 3303.229217529297, + "epoch": 0.5588571428571428, + "grad_norm": 1.5999469757080078, + "kl": 0.68994140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0506, + "reward": -0.12529479584190995, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.12529479584190995, + "reward_after_std": 0.5821077264845371, + "reward_before_mean": -0.060184720903635025, + "reward_before_std": 0.6017212346196175, + "reward_change_max": 0.00026082247495651245, + "reward_change_mean": -0.06511008506640792, + "reward_change_min": -0.1411078181117773, + "reward_change_std": 0.05900137731805444, + "reward_std": 0.5821077451109886, + "rewards/cosine_scaled_reward": -0.25925903022289276, + "rewards/format_reward": 0.45833334513008595, + "step": 489 + }, + { + "advantage_max": 0.9868172481656075, + "advantage_mean": -1.5211602394371582e-08, + "advantage_min": -0.8524841107428074, + "advantage_std": 0.6771835945546627, + "completion_length": 2362.2500534057617, + "epoch": 0.56, + "grad_norm": 0.821845293045044, + "kl": 0.3515167236328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0115, + "reward": 0.6511377788410755, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6511377788410755, + "reward_after_std": 0.6771836057305336, + "reward_before_mean": 0.784302718937397, + "reward_before_std": 0.6745435670018196, + "reward_change_max": 0.0, + "reward_change_mean": -0.1331649529747665, + "reward_change_min": -0.21483693923801184, + "reward_change_std": 0.0822962406091392, + "reward_std": 0.6771836318075657, + "rewards/cosine_scaled_reward": 0.0067346952855587006, + "rewards/format_reward": 0.7708333488553762, + "step": 490 + }, + { + "advantage_max": 1.1321530863642693, + "advantage_mean": 1.8626452602532595e-09, + "advantage_min": -1.2378709018230438, + "advantage_std": 0.9353579767048359, + "completion_length": 2859.8750762939453, + "epoch": 0.5611428571428572, + "grad_norm": 0.9638422131538391, + "kl": 0.392730712890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0344, + "reward": 0.7370665986090899, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7370665986090899, + "reward_after_std": 0.9353579990565777, + "reward_before_mean": 0.8755270391702652, + "reward_before_std": 0.9644673950970173, + "reward_change_max": 0.0, + "reward_change_mean": -0.13846042612567544, + "reward_change_min": -0.2663510059937835, + "reward_change_std": 0.11175253661349416, + "reward_std": 0.9353580549359322, + "rewards/cosine_scaled_reward": 0.166930191218853, + "rewards/format_reward": 0.541666679084301, + "step": 491 + }, + { + "advantage_max": 1.1642880029976368, + "advantage_mean": -1.7384688244526103e-08, + "advantage_min": -0.9464364871382713, + "advantage_std": 0.8110112994909286, + "completion_length": 2653.62508392334, + "epoch": 0.5622857142857143, + "grad_norm": 0.6525238752365112, + "kl": 0.4062652587890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0486, + "reward": 0.4646597392857075, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4646597392857075, + "reward_after_std": 0.811011303216219, + "reward_before_mean": 0.577180489897728, + "reward_before_std": 0.8244456201791763, + "reward_change_max": 0.00011387467384338379, + "reward_change_mean": -0.11252076458185911, + "reward_change_min": -0.22796605806797743, + "reward_change_std": 0.0855433689430356, + "reward_std": 0.8110113255679607, + "rewards/cosine_scaled_reward": -0.013493089005351067, + "rewards/format_reward": 0.6041666828095913, + "step": 492 + }, + { + "advantage_max": 0.9895866885781288, + "advantage_mean": -3.6632022137883524e-08, + "advantage_min": -0.9520559869706631, + "advantage_std": 0.7569357864558697, + "completion_length": 2731.1875762939453, + "epoch": 0.5634285714285714, + "grad_norm": 0.7446244955062866, + "kl": 0.6183624267578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0883, + "reward": 0.7343036928214133, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7343036928214133, + "reward_after_std": 0.7569357715547085, + "reward_before_mean": 0.8745586993172765, + "reward_before_std": 0.7680754400789738, + "reward_change_max": 0.00016526877880096436, + "reward_change_mean": -0.14025500882416964, + "reward_change_min": -0.24564260337501764, + "reward_change_std": 0.09688371233642101, + "reward_std": 0.7569358013570309, + "rewards/cosine_scaled_reward": 0.12477933615446091, + "rewards/format_reward": 0.6250000018626451, + "step": 493 + }, + { + "advantage_max": 1.3244344219565392, + "advantage_mean": -4.346172310931706e-09, + "advantage_min": -0.9265795089304447, + "advantage_std": 0.8622128590941429, + "completion_length": 2606.4584350585938, + "epoch": 0.5645714285714286, + "grad_norm": 0.9010090231895447, + "kl": 0.408294677734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0822, + "reward": 0.4567707823589444, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4567707823589444, + "reward_after_std": 0.8622128367424011, + "reward_before_mean": 0.5650326677132398, + "reward_before_std": 0.8653600625693798, + "reward_change_max": 0.0009450465440750122, + "reward_change_mean": -0.10826187301427126, + "reward_change_min": -0.20170058961957693, + "reward_change_std": 0.07992936880327761, + "reward_std": 0.862212885171175, + "rewards/cosine_scaled_reward": -0.06123367277905345, + "rewards/format_reward": 0.6875000186264515, + "step": 494 + }, + { + "advantage_max": 1.2726031877100468, + "advantage_mean": -2.1109978431965715e-08, + "advantage_min": -1.070934422314167, + "advantage_std": 0.9266544282436371, + "completion_length": 3046.1459350585938, + "epoch": 0.5657142857142857, + "grad_norm": 0.9610704183578491, + "kl": 0.594482421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0694, + "reward": 0.35878500062972307, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.35878500062972307, + "reward_after_std": 0.9266544803977013, + "reward_before_mean": 0.45935659296810627, + "reward_before_std": 0.9541935361921787, + "reward_change_max": 0.00044248998165130615, + "reward_change_mean": -0.10057160933502018, + "reward_change_min": -0.22374487854540348, + "reward_change_std": 0.09001280879601836, + "reward_std": 0.9266545325517654, + "rewards/cosine_scaled_reward": -0.05157171795144677, + "rewards/format_reward": 0.5625000149011612, + "step": 495 + }, + { + "advantage_max": 0.7787556611001492, + "advantage_mean": -2.918144220709351e-08, + "advantage_min": -0.6704142689704895, + "advantage_std": 0.5368506982922554, + "completion_length": 2543.6458740234375, + "epoch": 0.5668571428571428, + "grad_norm": 0.6127541661262512, + "kl": 0.4067840576171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0377, + "reward": 0.6063330564647913, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6063330564647913, + "reward_after_std": 0.5368506982922554, + "reward_before_mean": 0.7393186707049608, + "reward_before_std": 0.5250267013907433, + "reward_change_max": 0.0, + "reward_change_mean": -0.13298564730212092, + "reward_change_min": -0.20881926268339157, + "reward_change_std": 0.07957718800753355, + "reward_std": 0.536850705742836, + "rewards/cosine_scaled_reward": 0.015492672100663185, + "rewards/format_reward": 0.7083333414047956, + "step": 496 + }, + { + "advantage_max": 0.9398028701543808, + "advantage_mean": 8.381903615628516e-09, + "advantage_min": -0.8651764281094074, + "advantage_std": 0.69316166639328, + "completion_length": 2541.041748046875, + "epoch": 0.568, + "grad_norm": 0.8797805309295654, + "kl": 0.28133392333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0176, + "reward": 1.1073258856777102, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.1073258856777102, + "reward_after_std": 0.6931616589426994, + "reward_before_mean": 1.2837290642783046, + "reward_before_std": 0.680363591760397, + "reward_change_max": 0.0, + "reward_change_mean": -0.17640314577147365, + "reward_change_min": -0.2838462581858039, + "reward_change_std": 0.1093319933861494, + "reward_std": 0.6931616626679897, + "rewards/cosine_scaled_reward": 0.25644785538315773, + "rewards/format_reward": 0.7708333507180214, + "step": 497 + }, + { + "advantage_max": 1.589196316897869, + "advantage_mean": 4.656612789810666e-09, + "advantage_min": -0.8506803512573242, + "advantage_std": 0.9172989800572395, + "completion_length": 3036.6459350585938, + "epoch": 0.5691428571428572, + "grad_norm": 1.152613639831543, + "kl": 0.76190185546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0751, + "reward": 0.21077799936756492, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21077799936756492, + "reward_after_std": 0.9172989800572395, + "reward_before_mean": 0.29192004108335823, + "reward_before_std": 0.9136406257748604, + "reward_change_max": 0.0, + "reward_change_mean": -0.08114203019067645, + "reward_change_min": -0.15874368697404861, + "reward_change_std": 0.06351864710450172, + "reward_std": 0.9172989800572395, + "rewards/cosine_scaled_reward": -0.15612333035096526, + "rewards/format_reward": 0.6041666734963655, + "step": 498 + }, + { + "advantage_max": 1.6222087368369102, + "advantage_mean": 4.967054212379196e-09, + "advantage_min": -1.1398800686001778, + "advantage_std": 1.0871395617723465, + "completion_length": 2844.1250915527344, + "epoch": 0.5702857142857143, + "grad_norm": 1.1812705993652344, + "kl": 0.4490966796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.066, + "reward": 0.5475275591015816, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5475275591015816, + "reward_after_std": 1.087139554321766, + "reward_before_mean": 0.6594566758722067, + "reward_before_std": 1.1065706722438335, + "reward_change_max": 0.00030337274074554443, + "reward_change_mean": -0.11192911583930254, + "reward_change_min": -0.24595525488257408, + "reward_change_std": 0.09533399250358343, + "reward_std": 1.087139569222927, + "rewards/cosine_scaled_reward": -0.0036049976479262114, + "rewards/format_reward": 0.6666666902601719, + "step": 499 + }, + { + "advantage_max": 1.4636687450110912, + "advantage_mean": -2.0489097307674342e-08, + "advantage_min": -1.0613201428204775, + "advantage_std": 0.9504660218954086, + "completion_length": 3248.0833740234375, + "epoch": 0.5714285714285714, + "grad_norm": 0.7001128196716309, + "kl": 0.5921630859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1e-07, + "loss": 0.0719, + "reward": 0.3895249618217349, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3895249618217349, + "reward_after_std": 0.9504660293459892, + "reward_before_mean": 0.48947494849562645, + "reward_before_std": 0.9621049575507641, + "reward_change_max": 0.00021667778491973877, + "reward_change_mean": -0.09994998946785927, + "reward_change_min": -0.20708435587584972, + "reward_change_std": 0.08308770577423275, + "reward_std": 0.9504660815000534, + "rewards/cosine_scaled_reward": -0.04692920472007245, + "rewards/format_reward": 0.583333345130086, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.023616178158205003, + "train_runtime": 58431.9199, + "train_samples_per_second": 0.411, + "train_steps_per_second": 0.009 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}