diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.3236461393535137, + "advantage_mean": -1.6653345369377348e-16, + "advantage_min": -0.8878969363868237, + "advantage_std": 0.7976016215980053, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.12771576642990112, + "kl": 0.0, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2e-08, + "loss": 0.0681, + "reward": 0.1723687592893839, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1723687592893839, + "reward_after_std": 0.7976016290485859, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.00042107701301574707, + "reward_change_mean": -0.31739595998078585, + "reward_change_min": -0.6219300664961338, + "reward_change_std": 0.2523575215600431, + "reward_std": 0.7976016625761986, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 0.7214599922299385, + "advantage_mean": 8.692344399818808e-09, + "advantage_min": -0.49231788888573647, + "advantage_std": 0.4440294001251459, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.06164511293172836, + "kl": 0.0, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4e-08, + "loss": 0.0245, + "reward": -0.018269629566930234, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.018269629566930234, + "reward_after_std": 0.444029388949275, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.0009796768426895142, + "reward_change_mean": -0.29366718512028456, + "reward_change_min": -0.478233277797699, + "reward_change_std": 0.19509424595162272, + "reward_std": 0.44402940198779106, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 0.7810538075864315, + "advantage_mean": 1.8005570256995895e-08, + "advantage_min": -0.5050339177250862, + "advantage_std": 0.47483229637145996, + "completion_length": 3403.375, + "epoch": 0.0034285714285714284, + "grad_norm": 0.09680631011724472, + "kl": 4.32431697845459e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6e-08, + "loss": -0.0069, + "reward": -0.4477355405688286, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4477355405688286, + "reward_after_std": 0.47483232244849205, + "reward_before_mean": -0.3116879053413868, + "reward_before_std": 0.5111051723361015, + "reward_change_max": 0.0012290999293327332, + "reward_change_mean": -0.13604762544855475, + "reward_change_min": -0.3396712355315685, + "reward_change_std": 0.136130525264889, + "reward_std": 0.47483235225081444, + "rewards/cosine_scaled_reward": -0.2183439557757083, + "rewards/format_reward": 0.12500000558793545, + "step": 3 + }, + { + "advantage_max": 1.2861799150705338, + "advantage_mean": 6.208821234920947e-10, + "advantage_min": -0.6494872495532036, + "advantage_std": 0.7340018600225449, + "completion_length": 2357.833366394043, + "epoch": 0.004571428571428572, + "grad_norm": 0.1758621782064438, + "kl": 3.581494092941284e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8e-08, + "loss": 0.0506, + "reward": 0.11002347664907575, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11002347664907575, + "reward_after_std": 0.7340018600225449, + "reward_before_mean": 0.40769497863948345, + "reward_before_std": 0.716522503644228, + "reward_change_max": 0.00016899406909942627, + "reward_change_mean": -0.2976714950054884, + "reward_change_min": -0.5929372683167458, + "reward_change_std": 0.22305172309279442, + "reward_std": 0.734001874923706, + "rewards/cosine_scaled_reward": -0.09823585068807006, + "rewards/format_reward": 0.604166679084301, + "step": 4 + }, + { + "advantage_max": 1.3395042344927788, + "advantage_mean": 5.587935725248627e-09, + "advantage_min": -0.6843537390232086, + "advantage_std": 0.7548569068312645, + "completion_length": 3192.6459045410156, + "epoch": 0.005714285714285714, + "grad_norm": 0.1231139749288559, + "kl": 3.738701343536377e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1e-07, + "loss": 0.0299, + "reward": -0.13100245175883174, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13100245175883174, + "reward_after_std": 0.7548569180071354, + "reward_before_mean": 0.07772888010367751, + "reward_before_std": 0.7701464891433716, + "reward_change_max": 0.00015503168106079102, + "reward_change_mean": -0.20873133465647697, + "reward_change_min": -0.44208815693855286, + "reward_change_std": 0.18292431626468897, + "reward_std": 0.7548569515347481, + "rewards/cosine_scaled_reward": -0.15905223228037357, + "rewards/format_reward": 0.39583334140479565, + "step": 5 + }, + { + "advantage_max": 1.438813678920269, + "advantage_mean": 3.725290076417309e-09, + "advantage_min": -0.5762999951839447, + "advantage_std": 0.7623046673834324, + "completion_length": 3154.604202270508, + "epoch": 0.006857142857142857, + "grad_norm": 0.14749404788017273, + "kl": 3.841519355773926e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2e-07, + "loss": 0.0406, + "reward": -0.23128792829811573, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23128792829811573, + "reward_after_std": 0.7623046673834324, + "reward_before_mean": -0.06624754145741463, + "reward_before_std": 0.7461451888084412, + "reward_change_max": 0.0002317279577255249, + "reward_change_mean": -0.16504040150903165, + "reward_change_min": -0.3834940567612648, + "reward_change_std": 0.14161420124582946, + "reward_std": 0.762304674834013, + "rewards/cosine_scaled_reward": -0.17895710514858365, + "rewards/format_reward": 0.291666679084301, + "step": 6 + }, + { + "advantage_max": 1.3337594084441662, + "advantage_mean": -1.1796752963366686e-08, + "advantage_min": -0.6820095479488373, + "advantage_std": 0.7653882391750813, + "completion_length": 3154.5209045410156, + "epoch": 0.008, + "grad_norm": 0.11123297363519669, + "kl": 2.4020671844482422e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4e-07, + "loss": 0.0161, + "reward": -0.04566465876996517, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.04566465876996517, + "reward_after_std": 0.7653882466256618, + "reward_before_mean": 0.19614734745118767, + "reward_before_std": 0.7696255072951317, + "reward_change_max": 0.000983603298664093, + "reward_change_mean": -0.24181200610473752, + "reward_change_min": -0.5278668366372585, + "reward_change_std": 0.2141664084047079, + "reward_std": 0.7653882782906294, + "rewards/cosine_scaled_reward": -0.1519263405352831, + "rewards/format_reward": 0.5000000093132257, + "step": 7 + }, + { + "advantage_max": 1.153030887246132, + "advantage_mean": -8.071462720415923e-09, + "advantage_min": -0.8050092048943043, + "advantage_std": 0.7189371399581432, + "completion_length": 2714.7291717529297, + "epoch": 0.009142857142857144, + "grad_norm": 0.17044638097286224, + "kl": 2.3853033781051636e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6e-07, + "loss": 0.0227, + "reward": 0.3043976202607155, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3043976202607155, + "reward_after_std": 0.7189371287822723, + "reward_before_mean": 0.6821175068616867, + "reward_before_std": 0.7231598682701588, + "reward_change_max": 0.0009385868906974792, + "reward_change_mean": -0.37771990802139044, + "reward_change_min": -0.6880337987095118, + "reward_change_std": 0.2805122025310993, + "reward_std": 0.7189371511340141, + "rewards/cosine_scaled_reward": 0.111892094835639, + "rewards/format_reward": 0.4583333395421505, + "step": 8 + }, + { + "advantage_max": 1.0858414433896542, + "advantage_mean": 4.967053768289986e-09, + "advantage_min": -0.6400764584541321, + "advantage_std": 0.6383172105997801, + "completion_length": 3332.1458435058594, + "epoch": 0.010285714285714285, + "grad_norm": 0.11889996379613876, + "kl": 4.9740076065063477e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.8e-07, + "loss": 0.0537, + "reward": -0.2352797817438841, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2352797817438841, + "reward_after_std": 0.6383171789348125, + "reward_before_mean": -0.04502727324143052, + "reward_before_std": 0.6668666442856193, + "reward_change_max": 0.0008114203810691833, + "reward_change_mean": -0.19025251083076, + "reward_change_min": -0.4170127771794796, + "reward_change_std": 0.1755459113046527, + "reward_std": 0.6383172050118446, + "rewards/cosine_scaled_reward": -0.16834697453305125, + "rewards/format_reward": 0.2916666716337204, + "step": 9 + }, + { + "advantage_max": 1.48471187800169, + "advantage_mean": 1.2417635808503746e-09, + "advantage_min": -0.7741466164588928, + "advantage_std": 0.8413009904325008, + "completion_length": 2900.958335876465, + "epoch": 0.011428571428571429, + "grad_norm": 0.15922096371650696, + "kl": 3.575533628463745e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2e-07, + "loss": 0.0418, + "reward": -0.09273874759674072, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09273874759674072, + "reward_after_std": 0.8413009904325008, + "reward_before_mean": 0.11865614727139473, + "reward_before_std": 0.8663091957569122, + "reward_change_max": 0.0007829740643501282, + "reward_change_mean": -0.21139488369226456, + "reward_change_min": -0.4577816314995289, + "reward_change_std": 0.1882876893505454, + "reward_std": 0.8413010165095329, + "rewards/cosine_scaled_reward": -0.10733860358595848, + "rewards/format_reward": 0.33333334140479565, + "step": 10 + }, + { + "advantage_max": 1.0778188593685627, + "advantage_mean": 2.1730860888524717e-08, + "advantage_min": -0.5051753893494606, + "advantage_std": 0.5997787564992905, + "completion_length": 3446.5625610351562, + "epoch": 0.012571428571428572, + "grad_norm": 0.09384988248348236, + "kl": 3.8117170333862305e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0257, + "reward": -0.47689105570316315, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.47689105570316315, + "reward_after_std": 0.5997787676751614, + "reward_before_mean": -0.37315269373357296, + "reward_before_std": 0.6239271499216557, + "reward_change_max": 0.0018914267420768738, + "reward_change_mean": -0.1037383598741144, + "reward_change_min": -0.27506355568766594, + "reward_change_std": 0.11384909274056554, + "reward_std": 0.599778788164258, + "rewards/cosine_scaled_reward": -0.2490763533860445, + "rewards/format_reward": 0.12500000186264515, + "step": 11 + }, + { + "advantage_max": 1.2988129183650017, + "advantage_mean": 1.5522045315741195e-09, + "advantage_min": -0.6877567246556282, + "advantage_std": 0.7454183585941792, + "completion_length": 2431.416702270508, + "epoch": 0.013714285714285714, + "grad_norm": 0.12335586547851562, + "kl": 3.510713577270508e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.4e-07, + "loss": 0.0696, + "reward": 0.06804788112640381, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06804788112640381, + "reward_after_std": 0.7454183287918568, + "reward_before_mean": 0.35046464344486594, + "reward_before_std": 0.741436411626637, + "reward_change_max": 0.0008105039596557617, + "reward_change_mean": -0.2824167497456074, + "reward_change_min": -0.5328971222043037, + "reward_change_std": 0.21176872844807804, + "reward_std": 0.7454183623194695, + "rewards/cosine_scaled_reward": -0.11643435899168253, + "rewards/format_reward": 0.583333345130086, + "step": 12 + }, + { + "advantage_max": 0.8294573128223419, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -0.4593823775649071, + "advantage_std": 0.4747231351211667, + "completion_length": 2889.416702270508, + "epoch": 0.014857142857142857, + "grad_norm": 0.06548355519771576, + "kl": 3.244727849960327e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.6e-07, + "loss": 0.022, + "reward": 0.18717875331640244, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18717875331640244, + "reward_after_std": 0.4747231239452958, + "reward_before_mean": 0.544963575899601, + "reward_before_std": 0.4018893027678132, + "reward_change_max": 0.000781618058681488, + "reward_change_mean": -0.35778482200112194, + "reward_change_min": -0.5649526380002499, + "reward_change_std": 0.21307120216079056, + "reward_std": 0.47472314070910215, + "rewards/cosine_scaled_reward": 0.032898444682359695, + "rewards/format_reward": 0.4791666716337204, + "step": 13 + }, + { + "advantage_max": 1.322419997304678, + "advantage_mean": 2.1109978987077227e-08, + "advantage_min": -0.8189655467867851, + "advantage_std": 0.8067995980381966, + "completion_length": 2697.854248046875, + "epoch": 0.016, + "grad_norm": 0.15838854014873505, + "kl": 2.3767352104187012e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.8e-07, + "loss": 0.0594, + "reward": 0.19783409871160984, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19783409871160984, + "reward_after_std": 0.806799590587616, + "reward_before_mean": 0.5256497077643871, + "reward_before_std": 0.839010551571846, + "reward_change_max": 0.0003392919898033142, + "reward_change_mean": -0.32781560346484184, + "reward_change_min": -0.6789125502109528, + "reward_change_std": 0.26764219626784325, + "reward_std": 0.8067996315658092, + "rewards/cosine_scaled_reward": 0.03365818038582802, + "rewards/format_reward": 0.45833333767950535, + "step": 14 + }, + { + "advantage_max": 0.9029887653887272, + "advantage_mean": 1.1796752907855534e-08, + "advantage_min": -0.44812071323394775, + "advantage_std": 0.5208632545545697, + "completion_length": 2737.166702270508, + "epoch": 0.017142857142857144, + "grad_norm": 0.054936591535806656, + "kl": 2.24970281124115e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3e-07, + "loss": 0.0079, + "reward": -0.13882983848452568, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13882983848452568, + "reward_after_std": 0.5208632694557309, + "reward_before_mean": 0.09872580878436565, + "reward_before_std": 0.5114677743986249, + "reward_change_max": 0.0, + "reward_change_mean": -0.2375556300394237, + "reward_change_min": -0.4582338333129883, + "reward_change_std": 0.16593290120363235, + "reward_std": 0.5208632759749889, + "rewards/cosine_scaled_reward": -0.12772043980658054, + "rewards/format_reward": 0.3541666679084301, + "step": 15 + }, + { + "advantage_max": 0.5143468156456947, + "advantage_mean": 2.483526917451684e-08, + "advantage_min": -0.3243384584784508, + "advantage_std": 0.30771737545728683, + "completion_length": 3521.9375, + "epoch": 0.018285714285714287, + "grad_norm": 0.04844345152378082, + "kl": 4.1229650378227234e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.2e-07, + "loss": 0.0078, + "reward": -0.6069029793143272, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.6069029793143272, + "reward_after_std": 0.30771736800670624, + "reward_before_mean": -0.5090135782957077, + "reward_before_std": 0.3248976990580559, + "reward_change_max": 0.0016613304615020752, + "reward_change_mean": -0.09788939589634538, + "reward_change_min": -0.2121806014329195, + "reward_change_std": 0.08965697605162859, + "reward_std": 0.30771737918257713, + "rewards/cosine_scaled_reward": -0.26492345705628395, + "rewards/format_reward": 0.02083333395421505, + "step": 16 + }, + { + "advantage_max": 1.6180179975926876, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.7726177126169205, + "advantage_std": 0.8878979086875916, + "completion_length": 2487.3958892822266, + "epoch": 0.019428571428571427, + "grad_norm": 0.17348401248455048, + "kl": 4.330277442932129e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0565, + "reward": 0.1379982978105545, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1379982978105545, + "reward_after_std": 0.8878978937864304, + "reward_before_mean": 0.4221195343416184, + "reward_before_std": 0.873494204133749, + "reward_change_max": 0.0, + "reward_change_mean": -0.284121235832572, + "reward_change_min": -0.5485293567180634, + "reward_change_std": 0.20937805250287056, + "reward_std": 0.8878979422152042, + "rewards/cosine_scaled_reward": -0.02852356492076069, + "rewards/format_reward": 0.4791666753590107, + "step": 17 + }, + { + "advantage_max": 1.0580488927662373, + "advantage_mean": 5.587935336670569e-09, + "advantage_min": -0.6208096109330654, + "advantage_std": 0.6125941518694162, + "completion_length": 2949.2083740234375, + "epoch": 0.02057142857142857, + "grad_norm": 0.14818914234638214, + "kl": 2.2158026695251465e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.6e-07, + "loss": 0.0716, + "reward": -0.11659494414925575, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11659494414925575, + "reward_after_std": 0.6125941406935453, + "reward_before_mean": 0.118058524094522, + "reward_before_std": 0.6145357359200716, + "reward_change_max": 0.0011220648884773254, + "reward_change_mean": -0.23465345334261656, + "reward_change_min": -0.40788656659424305, + "reward_change_std": 0.17815768904983997, + "reward_std": 0.6125941649079323, + "rewards/cosine_scaled_reward": -0.10763741098344326, + "rewards/format_reward": 0.3333333395421505, + "step": 18 + }, + { + "advantage_max": 1.2318142503499985, + "advantage_mean": 4.346171977864799e-09, + "advantage_min": -0.7438315749168396, + "advantage_std": 0.7457190416753292, + "completion_length": 2978.6875610351562, + "epoch": 0.021714285714285714, + "grad_norm": 0.11854659020900726, + "kl": 2.814456820487976e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0561, + "reward": 0.14310528058558702, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14310528058558702, + "reward_after_std": 0.7457190491259098, + "reward_before_mean": 0.45691246911883354, + "reward_before_std": 0.7543267421424389, + "reward_change_max": 0.0008727908134460449, + "reward_change_mean": -0.3138071997091174, + "reward_change_min": -0.5745680537074804, + "reward_change_std": 0.2437105644494295, + "reward_std": 0.7457190677523613, + "rewards/cosine_scaled_reward": 0.009706247597932816, + "rewards/format_reward": 0.43750000931322575, + "step": 19 + }, + { + "advantage_max": 1.3907008990645409, + "advantage_mean": -1.552204281773939e-09, + "advantage_min": -0.7317355498671532, + "advantage_std": 0.7999358735978603, + "completion_length": 2609.3959045410156, + "epoch": 0.022857142857142857, + "grad_norm": 0.13953657448291779, + "kl": 2.8684735298156738e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4e-07, + "loss": 0.0822, + "reward": 0.31031588884070516, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.31031588884070516, + "reward_after_std": 0.7999358661472797, + "reward_before_mean": 0.6719230581074953, + "reward_before_std": 0.7620627535507083, + "reward_change_max": 3.715604543685913e-05, + "reward_change_mean": -0.36160713620483875, + "reward_change_min": -0.6067077554762363, + "reward_change_std": 0.2561584496870637, + "reward_std": 0.7999358959496021, + "rewards/cosine_scaled_reward": 0.033878179267048836, + "rewards/format_reward": 0.6041666734963655, + "step": 20 + }, + { + "advantage_max": 1.4857213720679283, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.8182055205106735, + "advantage_std": 0.8357969745993614, + "completion_length": 2668.145866394043, + "epoch": 0.024, + "grad_norm": 0.11965252459049225, + "kl": 3.9711594581604004e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0684, + "reward": 0.039832099340856075, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.039832099340856075, + "reward_after_std": 0.8357969783246517, + "reward_before_mean": 0.2982704224996269, + "reward_before_std": 0.8404805772006512, + "reward_change_max": 0.0007499381899833679, + "reward_change_mean": -0.25843835016712546, + "reward_change_min": -0.5017091780900955, + "reward_change_std": 0.20348974969238043, + "reward_std": 0.8357969857752323, + "rewards/cosine_scaled_reward": -0.06961478537414223, + "rewards/format_reward": 0.43750001303851604, + "step": 21 + }, + { + "advantage_max": 0.9170179665088654, + "advantage_mean": -1.2417634753791873e-08, + "advantage_min": -0.6024731658399105, + "advantage_std": 0.5401135012507439, + "completion_length": 1652.4166793823242, + "epoch": 0.025142857142857144, + "grad_norm": 0.06278178095817566, + "kl": 3.166962414979935e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.3999999999999997e-07, + "loss": -0.0109, + "reward": 0.2182811200618744, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2182811200618744, + "reward_after_std": 0.540113490074873, + "reward_before_mean": 0.5821179002523422, + "reward_before_std": 0.5008180625736713, + "reward_change_max": 0.0, + "reward_change_mean": -0.3638368174433708, + "reward_change_min": -0.5668350532650948, + "reward_change_std": 0.22511073760688305, + "reward_std": 0.5401134938001633, + "rewards/cosine_scaled_reward": -0.11519105918705463, + "rewards/format_reward": 0.8125, + "step": 22 + }, + { + "advantage_max": 1.2377678006887436, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.7406592965126038, + "advantage_std": 0.7070612488314509, + "completion_length": 2549.0833740234375, + "epoch": 0.026285714285714287, + "grad_norm": 0.10234551876783371, + "kl": 2.5782734155654907e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.6e-07, + "loss": 0.0368, + "reward": -0.08573945984244347, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08573945984244347, + "reward_after_std": 0.7070612283423543, + "reward_before_mean": 0.14411595277488232, + "reward_before_std": 0.7186607727780938, + "reward_change_max": 0.0006104856729507446, + "reward_change_mean": -0.2298554142471403, + "reward_change_min": -0.48414019867777824, + "reward_change_std": 0.18986564758233726, + "reward_std": 0.7070612665265799, + "rewards/cosine_scaled_reward": -0.146692031994462, + "rewards/format_reward": 0.43750001303851604, + "step": 23 + }, + { + "advantage_max": 1.4403239861130714, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.9893696680665016, + "advantage_std": 0.8664927519857883, + "completion_length": 2878.916702270508, + "epoch": 0.027428571428571427, + "grad_norm": 0.1490224152803421, + "kl": 1.432560384273529e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.8e-07, + "loss": 0.0584, + "reward": 0.3888702280819416, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3888702280819416, + "reward_after_std": 0.8664927519857883, + "reward_before_mean": 0.7756715640425682, + "reward_before_std": 0.8816492557525635, + "reward_change_max": 0.0002819374203681946, + "reward_change_mean": -0.3868013136088848, + "reward_change_min": -0.6882800199091434, + "reward_change_std": 0.28499543759971857, + "reward_std": 0.8664927929639816, + "rewards/cosine_scaled_reward": 0.10658576083369553, + "rewards/format_reward": 0.5625000223517418, + "step": 24 + }, + { + "advantage_max": 0.9528279937803745, + "advantage_mean": 7.761021575403149e-09, + "advantage_min": -0.6989870108664036, + "advantage_std": 0.5915895830839872, + "completion_length": 2845.562515258789, + "epoch": 0.02857142857142857, + "grad_norm": 0.149390310049057, + "kl": 4.055723547935486e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5e-07, + "loss": 0.059, + "reward": -0.12853457941673696, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12853457941673696, + "reward_after_std": 0.5915896017104387, + "reward_before_mean": 0.10707342872046866, + "reward_before_std": 0.6251619644463062, + "reward_change_max": 0.0005048885941505432, + "reward_change_mean": -0.23560800775885582, + "reward_change_min": -0.48452158086001873, + "reward_change_std": 0.19485379848629236, + "reward_std": 0.5915896091610193, + "rewards/cosine_scaled_reward": -0.1235466287471354, + "rewards/format_reward": 0.3541666716337204, + "step": 25 + }, + { + "advantage_max": 0.951295755803585, + "advantage_mean": 4.34617203337595e-09, + "advantage_min": -0.5947780013084412, + "advantage_std": 0.5643060579895973, + "completion_length": 2929.104217529297, + "epoch": 0.029714285714285714, + "grad_norm": 0.09629985690116882, + "kl": 2.4005770683288574e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.2e-07, + "loss": 0.0539, + "reward": 0.045638392213732004, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.045638392213732004, + "reward_after_std": 0.5643060579895973, + "reward_before_mean": 0.3457363471388817, + "reward_before_std": 0.5388211533427238, + "reward_change_max": 0.0, + "reward_change_mean": -0.30009796749800444, + "reward_change_min": -0.5169609375298023, + "reward_change_std": 0.2061476781964302, + "reward_std": 0.5643060654401779, + "rewards/cosine_scaled_reward": -0.04588180594146252, + "rewards/format_reward": 0.43750001303851604, + "step": 26 + }, + { + "advantage_max": 1.1732506714761257, + "advantage_mean": -1.5522043372850902e-08, + "advantage_min": -0.7885406948626041, + "advantage_std": 0.7137688174843788, + "completion_length": 2981.6042098999023, + "epoch": 0.030857142857142857, + "grad_norm": 0.14982038736343384, + "kl": 2.8835609555244446e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.4e-07, + "loss": 0.0481, + "reward": 0.09526193886995316, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09526193886995316, + "reward_after_std": 0.7137688249349594, + "reward_before_mean": 0.3976815016940236, + "reward_before_std": 0.7458325773477554, + "reward_change_max": 0.000577881932258606, + "reward_change_mean": -0.30241959635168314, + "reward_change_min": -0.5662531852722168, + "reward_change_std": 0.22851586807519197, + "reward_std": 0.7137688659131527, + "rewards/cosine_scaled_reward": -0.040742579847574234, + "rewards/format_reward": 0.479166679084301, + "step": 27 + }, + { + "advantage_max": 1.4417070969939232, + "advantage_mean": -1.1175871450497255e-08, + "advantage_min": -0.7128314636647701, + "advantage_std": 0.789174672216177, + "completion_length": 2737.812530517578, + "epoch": 0.032, + "grad_norm": 0.12516100704669952, + "kl": 2.178177237510681e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.6e-07, + "loss": 0.0128, + "reward": 0.24161657877266407, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24161657877266407, + "reward_after_std": 0.7891746796667576, + "reward_before_mean": 0.5766212344169617, + "reward_before_std": 0.741939015686512, + "reward_change_max": 1.2122094631195068e-05, + "reward_change_mean": -0.3350046342238784, + "reward_change_min": -0.5396411195397377, + "reward_change_std": 0.21770456805825233, + "reward_std": 0.78917470946908, + "rewards/cosine_scaled_reward": 0.05914393765851855, + "rewards/format_reward": 0.4583333395421505, + "step": 28 + }, + { + "advantage_max": 0.9136006869375706, + "advantage_mean": 1.8626452213954536e-08, + "advantage_min": -0.46621380001306534, + "advantage_std": 0.5133081059902906, + "completion_length": 3170.187545776367, + "epoch": 0.03314285714285714, + "grad_norm": 0.11526963859796524, + "kl": 1.806020736694336e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.8e-07, + "loss": 0.068, + "reward": -0.2716685086488724, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2716685086488724, + "reward_after_std": 0.5133080948144197, + "reward_before_mean": -0.08273126650601625, + "reward_before_std": 0.5100496802479029, + "reward_change_max": 0.0, + "reward_change_mean": -0.18893724866211414, + "reward_change_min": -0.33541516587138176, + "reward_change_std": 0.13039901200681925, + "reward_std": 0.5133081208914518, + "rewards/cosine_scaled_reward": -0.15594896860420704, + "rewards/format_reward": 0.22916667349636555, + "step": 29 + }, + { + "advantage_max": 1.8409449309110641, + "advantage_mean": -2.204130222782652e-08, + "advantage_min": -0.9385720863938332, + "advantage_std": 1.040012452751398, + "completion_length": 3010.666702270508, + "epoch": 0.03428571428571429, + "grad_norm": 0.210233673453331, + "kl": 2.4873297661542892e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6e-07, + "loss": 0.0371, + "reward": 0.4122390305856243, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4122390305856243, + "reward_after_std": 1.0400124676525593, + "reward_before_mean": 0.7786657903343439, + "reward_before_std": 1.038475975394249, + "reward_change_max": 0.000269085168838501, + "reward_change_mean": -0.3664267407730222, + "reward_change_min": -0.7022055611014366, + "reward_change_std": 0.2809586049988866, + "reward_std": 1.0400125198066235, + "rewards/cosine_scaled_reward": 0.12891620831214823, + "rewards/format_reward": 0.5208333488553762, + "step": 30 + }, + { + "advantage_max": 1.3186752051115036, + "advantage_mean": 2.5456151742098143e-08, + "advantage_min": -0.6435412839055061, + "advantage_std": 0.7557070441544056, + "completion_length": 2892.187545776367, + "epoch": 0.03542857142857143, + "grad_norm": 0.13154418766498566, + "kl": 1.9848346710205078e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.2e-07, + "loss": 0.0475, + "reward": -0.2269106972962618, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2269106972962618, + "reward_after_std": 0.7557070478796959, + "reward_before_mean": -0.051385149359703064, + "reward_before_std": 0.788501251488924, + "reward_change_max": 0.0, + "reward_change_mean": -0.17552555818110704, + "reward_change_min": -0.4460537787526846, + "reward_change_std": 0.18104476854205132, + "reward_std": 0.7557070702314377, + "rewards/cosine_scaled_reward": -0.19235923327505589, + "rewards/format_reward": 0.33333333767950535, + "step": 31 + }, + { + "advantage_max": 1.3948543444275856, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.7191313877701759, + "advantage_std": 0.7711601257324219, + "completion_length": 3234.8333740234375, + "epoch": 0.036571428571428574, + "grad_norm": 0.10185429453849792, + "kl": 2.5451648980379105e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.4e-07, + "loss": 0.0248, + "reward": 0.06370437087025493, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06370437087025493, + "reward_after_std": 0.7711601257324219, + "reward_before_mean": 0.338614858686924, + "reward_before_std": 0.74234314635396, + "reward_change_max": 0.0004075467586517334, + "reward_change_mean": -0.2749105137772858, + "reward_change_min": -0.49326785281300545, + "reward_change_std": 0.20088558923453093, + "reward_std": 0.7711601331830025, + "rewards/cosine_scaled_reward": -0.007775906473398209, + "rewards/format_reward": 0.35416667349636555, + "step": 32 + }, + { + "advantage_max": 1.4922088123857975, + "advantage_mean": 5.587935225648266e-09, + "advantage_min": -0.7815022058784962, + "advantage_std": 0.8810334913432598, + "completion_length": 3333.5834045410156, + "epoch": 0.037714285714285714, + "grad_norm": 0.12162205576896667, + "kl": 3.6016106605529785e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.6e-07, + "loss": 0.0661, + "reward": -0.06509976089000702, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06509976089000702, + "reward_after_std": 0.8810334764420986, + "reward_before_mean": 0.1571635976433754, + "reward_before_std": 0.9383543431758881, + "reward_change_max": 0.00016274303197860718, + "reward_change_mean": -0.2222633557394147, + "reward_change_min": -0.5449208281934261, + "reward_change_std": 0.22707031201571226, + "reward_std": 0.881033506244421, + "rewards/cosine_scaled_reward": -0.056834882125258446, + "rewards/format_reward": 0.2708333395421505, + "step": 33 + }, + { + "advantage_max": 1.4944884777069092, + "advantage_mean": -3.228585032655218e-08, + "advantage_min": -0.9274916350841522, + "advantage_std": 0.9044716916978359, + "completion_length": 2455.6458740234375, + "epoch": 0.038857142857142854, + "grad_norm": 0.14022988080978394, + "kl": 0.00020068883895874023, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0203, + "reward": 0.346778467297554, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.346778467297554, + "reward_after_std": 0.9044716954231262, + "reward_before_mean": 0.71558902785182, + "reward_before_std": 0.9435294065624475, + "reward_change_max": 0.00022082775831222534, + "reward_change_mean": -0.36881057592108846, + "reward_change_min": -0.6944437511265278, + "reward_change_std": 0.28569589368999004, + "reward_std": 0.904471717774868, + "rewards/cosine_scaled_reward": 0.097377834841609, + "rewards/format_reward": 0.5208333432674408, + "step": 34 + }, + { + "advantage_max": 1.3393050953745842, + "advantage_mean": -8.07146305348283e-09, + "advantage_min": -0.7060817331075668, + "advantage_std": 0.7657259926199913, + "completion_length": 3187.5833892822266, + "epoch": 0.04, + "grad_norm": 0.110480897128582, + "kl": 0.00013943761587142944, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7e-07, + "loss": 0.017, + "reward": -0.1551370123634115, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1551370123634115, + "reward_after_std": 0.7657259963452816, + "reward_before_mean": 0.04514682665467262, + "reward_before_std": 0.7847169302403927, + "reward_change_max": 0.0, + "reward_change_mean": -0.20028384402394295, + "reward_change_min": -0.41303151473402977, + "reward_change_std": 0.1785027701407671, + "reward_std": 0.7657260186970234, + "rewards/cosine_scaled_reward": -0.13367659132927656, + "rewards/format_reward": 0.31250000558793545, + "step": 35 + }, + { + "advantage_max": 0.6390106528997421, + "advantage_mean": 1.5522043650406658e-08, + "advantage_min": -0.3384590819478035, + "advantage_std": 0.3662732969969511, + "completion_length": 3560.0416870117188, + "epoch": 0.04114285714285714, + "grad_norm": 0.06496626883745193, + "kl": 7.263757288455963e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.2e-07, + "loss": 0.0044, + "reward": -0.551641970872879, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.551641970872879, + "reward_after_std": 0.3662732820957899, + "reward_before_mean": -0.44342844747006893, + "reward_before_std": 0.3724683914333582, + "reward_change_max": 0.0006786733865737915, + "reward_change_mean": -0.10821351455524564, + "reward_change_min": -0.224327702075243, + "reward_change_std": 0.09183390927501023, + "reward_std": 0.36627329140901566, + "rewards/cosine_scaled_reward": -0.2529642302542925, + "rewards/format_reward": 0.06250000186264515, + "step": 36 + }, + { + "advantage_max": 0.7393645793199539, + "advantage_mean": 1.7384688244526103e-08, + "advantage_min": -0.4619470313191414, + "advantage_std": 0.44865885004401207, + "completion_length": 3252.2708435058594, + "epoch": 0.04228571428571429, + "grad_norm": 0.08295831084251404, + "kl": 5.307118408381939e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.4e-07, + "loss": 0.022, + "reward": -0.35455665923655033, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.35455665923655033, + "reward_after_std": 0.44865884631872177, + "reward_before_mean": -0.18188916146755219, + "reward_before_std": 0.46603875420987606, + "reward_change_max": 0.0007766708731651306, + "reward_change_mean": -0.17266748752444983, + "reward_change_min": -0.36579640582203865, + "reward_change_std": 0.14287791587412357, + "reward_std": 0.44865885376930237, + "rewards/cosine_scaled_reward": -0.1951112560927868, + "rewards/format_reward": 0.2083333358168602, + "step": 37 + }, + { + "advantage_max": 0.6996339820325375, + "advantage_mean": 1.98682153507157e-08, + "advantage_min": -0.484468013048172, + "advantage_std": 0.410785099491477, + "completion_length": 3253.0416870117188, + "epoch": 0.04342857142857143, + "grad_norm": 0.053032856434583664, + "kl": 6.0859136283397675e-05, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0058, + "reward": -0.33535145223140717, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.33535145223140717, + "reward_after_std": 0.41078509762883186, + "reward_before_mean": -0.15529391914606094, + "reward_before_std": 0.41112925857305527, + "reward_change_max": 0.002039514482021332, + "reward_change_mean": -0.1800575191155076, + "reward_change_min": -0.30918743275105953, + "reward_change_std": 0.1306900419294834, + "reward_std": 0.41078510507941246, + "rewards/cosine_scaled_reward": -0.150563626550138, + "rewards/format_reward": 0.14583333395421505, + "step": 38 + }, + { + "advantage_max": 0.9096625335514545, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.46032148599624634, + "advantage_std": 0.5079090781509876, + "completion_length": 2682.062530517578, + "epoch": 0.044571428571428574, + "grad_norm": 0.0740419402718544, + "kl": 0.0001322571188211441, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0176, + "reward": 0.18389996141195297, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18389996141195297, + "reward_after_std": 0.5079090669751167, + "reward_before_mean": 0.5366221237927675, + "reward_before_std": 0.413540075533092, + "reward_change_max": 0.0003514885902404785, + "reward_change_mean": -0.35272214096039534, + "reward_change_min": -0.5291457362473011, + "reward_change_std": 0.20978331100195646, + "reward_std": 0.5079090893268585, + "rewards/cosine_scaled_reward": -0.023355623707175255, + "rewards/format_reward": 0.5833333414047956, + "step": 39 + }, + { + "advantage_max": 1.2809822633862495, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -0.6526738554239273, + "advantage_std": 0.6940833032131195, + "completion_length": 2629.8541870117188, + "epoch": 0.045714285714285714, + "grad_norm": 0.12017546594142914, + "kl": 0.00029501691460609436, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8e-07, + "loss": 0.0788, + "reward": 0.07090002810582519, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07090002810582519, + "reward_after_std": 0.6940833032131195, + "reward_before_mean": 0.3545444831252098, + "reward_before_std": 0.6401285659521818, + "reward_change_max": 0.00034108012914657593, + "reward_change_mean": -0.28364445082843304, + "reward_change_min": -0.5075650922954082, + "reward_change_std": 0.1958700818940997, + "reward_std": 0.6940833181142807, + "rewards/cosine_scaled_reward": -0.07272776251193136, + "rewards/format_reward": 0.5000000074505806, + "step": 40 + }, + { + "advantage_max": 1.0797162391245365, + "advantage_mean": 1.490116185998147e-08, + "advantage_min": -0.6975666042417288, + "advantage_std": 0.6240663919597864, + "completion_length": 3017.2916870117188, + "epoch": 0.046857142857142854, + "grad_norm": 0.09810768067836761, + "kl": 0.00014271587133407593, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0073, + "reward": -0.19936674274504185, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19936674274504185, + "reward_after_std": 0.6240663770586252, + "reward_before_mean": 0.0034020310267806053, + "reward_before_std": 0.6362273693084717, + "reward_change_max": 0.0014721229672431946, + "reward_change_mean": -0.20276877097785473, + "reward_change_min": -0.3929155748337507, + "reward_change_std": 0.16917146416381001, + "reward_std": 0.6240663770586252, + "rewards/cosine_scaled_reward": -0.20663231890648603, + "rewards/format_reward": 0.4166666753590107, + "step": 41 + }, + { + "advantage_max": 0.9113545380532742, + "advantage_mean": -8.692344177774203e-09, + "advantage_min": -0.48987017199397087, + "advantage_std": 0.5287072211503983, + "completion_length": 2871.6250228881836, + "epoch": 0.048, + "grad_norm": 0.0773473009467125, + "kl": 0.00018630176782608032, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0028, + "reward": -0.2969640102237463, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2969640102237463, + "reward_after_std": 0.5287072211503983, + "reward_before_mean": -0.11779316561296582, + "reward_before_std": 0.5369173996150494, + "reward_change_max": 0.0, + "reward_change_mean": -0.17917086835950613, + "reward_change_min": -0.34727344289422035, + "reward_change_std": 0.14140096702612936, + "reward_std": 0.5287072323262691, + "rewards/cosine_scaled_reward": -0.23597991233691573, + "rewards/format_reward": 0.35416666977107525, + "step": 42 + }, + { + "advantage_max": 1.219473421573639, + "advantage_mean": 9.313225579621331e-09, + "advantage_min": -0.641626700758934, + "advantage_std": 0.6846173517405987, + "completion_length": 3060.500030517578, + "epoch": 0.04914285714285714, + "grad_norm": 0.11512342095375061, + "kl": 0.00010600313544273376, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0332, + "reward": -0.21309549175202847, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21309549175202847, + "reward_after_std": 0.6846173368394375, + "reward_before_mean": -0.025469645857810974, + "reward_before_std": 0.6994537971913815, + "reward_change_max": 0.0007215887308120728, + "reward_change_mean": -0.18762585893273354, + "reward_change_min": -0.3715638890862465, + "reward_change_std": 0.15713506587781012, + "reward_std": 0.6846173815429211, + "rewards/cosine_scaled_reward": -0.12731814879225567, + "rewards/format_reward": 0.2291666679084301, + "step": 43 + }, + { + "advantage_max": 1.4568804576992989, + "advantage_mean": -1.3659397612997282e-08, + "advantage_min": -0.6851945444941521, + "advantage_std": 0.8075256794691086, + "completion_length": 2762.770866394043, + "epoch": 0.05028571428571429, + "grad_norm": 0.11844295263290405, + "kl": 0.00044381991028785706, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0426, + "reward": 0.10861534625291824, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10861534625291824, + "reward_after_std": 0.807525709271431, + "reward_before_mean": 0.393554262816906, + "reward_before_std": 0.7881722338497639, + "reward_change_max": 0.0009457021951675415, + "reward_change_mean": -0.2849389025941491, + "reward_change_min": -0.5463197156786919, + "reward_change_std": 0.2175145372748375, + "reward_std": 0.8075257502496243, + "rewards/cosine_scaled_reward": -0.04280621279031038, + "rewards/format_reward": 0.4791666753590107, + "step": 44 + }, + { + "advantage_max": 1.256616048514843, + "advantage_mean": 1.2417631367611648e-09, + "advantage_min": -0.7105851396918297, + "advantage_std": 0.7391474787145853, + "completion_length": 3474.5208435058594, + "epoch": 0.05142857142857143, + "grad_norm": 0.10600760579109192, + "kl": 0.00013585388660430908, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9e-07, + "loss": 0.0036, + "reward": -0.13553864229470491, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13553864229470491, + "reward_after_std": 0.7391474638134241, + "reward_before_mean": 0.07757800817489624, + "reward_before_std": 0.7742020450532436, + "reward_change_max": 0.0005881339311599731, + "reward_change_mean": -0.21311665140092373, + "reward_change_min": -0.4759393446147442, + "reward_change_std": 0.19536291249096394, + "reward_std": 0.7391474694013596, + "rewards/cosine_scaled_reward": -0.08621100289747119, + "rewards/format_reward": 0.25000000931322575, + "step": 45 + }, + { + "advantage_max": 0.9702697545289993, + "advantage_mean": 2.6077032755367213e-08, + "advantage_min": -0.4514354318380356, + "advantage_std": 0.541413675993681, + "completion_length": 3213.500015258789, + "epoch": 0.052571428571428575, + "grad_norm": 0.09703514724969864, + "kl": 0.00024537742137908936, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.2e-07, + "loss": 0.0322, + "reward": -0.3946500001475215, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3946500001475215, + "reward_after_std": 0.5414136610925198, + "reward_before_mean": -0.2541161496192217, + "reward_before_std": 0.5485969968140125, + "reward_change_max": 0.00014875829219818115, + "reward_change_mean": -0.14053383423015475, + "reward_change_min": -0.2998774442821741, + "reward_change_std": 0.12114948220551014, + "reward_std": 0.5414136685431004, + "rewards/cosine_scaled_reward": -0.21039140783250332, + "rewards/format_reward": 0.1666666679084301, + "step": 46 + }, + { + "advantage_max": 1.5182860642671585, + "advantage_mean": 3.725290076417309e-09, + "advantage_min": -0.8598650246858597, + "advantage_std": 0.9061323516070843, + "completion_length": 2918.541702270508, + "epoch": 0.053714285714285714, + "grad_norm": 0.1575622260570526, + "kl": 0.00014133378863334656, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.399999999999999e-07, + "loss": 0.1144, + "reward": 0.02792397327721119, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02792397327721119, + "reward_after_std": 0.906132385134697, + "reward_before_mean": 0.2801897209137678, + "reward_before_std": 0.963929258286953, + "reward_change_max": 0.0010666772723197937, + "reward_change_mean": -0.2522657341323793, + "reward_change_min": -0.5717835687100887, + "reward_change_std": 0.23776433896273375, + "reward_std": 0.906132385134697, + "rewards/cosine_scaled_reward": -0.04740514978766441, + "rewards/format_reward": 0.37500000931322575, + "step": 47 + }, + { + "advantage_max": 1.406430073082447, + "advantage_mean": 3.725290853573426e-09, + "advantage_min": -0.8095233663916588, + "advantage_std": 0.8473255261778831, + "completion_length": 2847.0833587646484, + "epoch": 0.054857142857142854, + "grad_norm": 0.1331583559513092, + "kl": 0.0009892657399177551, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.6e-07, + "loss": 0.0373, + "reward": 0.08239079266786575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08239079266786575, + "reward_after_std": 0.8473255299031734, + "reward_before_mean": 0.36185096576809883, + "reward_before_std": 0.8943199291825294, + "reward_change_max": 0.0010818466544151306, + "reward_change_mean": -0.2794601647183299, + "reward_change_min": -0.6500570997595787, + "reward_change_std": 0.24804373178631067, + "reward_std": 0.847325537353754, + "rewards/cosine_scaled_reward": -0.01699118735268712, + "rewards/format_reward": 0.39583333767950535, + "step": 48 + }, + { + "advantage_max": 1.3203575275838375, + "advantage_mean": -1.1175871339474952e-08, + "advantage_min": -0.8599809035658836, + "advantage_std": 0.8113159202039242, + "completion_length": 2324.7292251586914, + "epoch": 0.056, + "grad_norm": 0.11754000186920166, + "kl": 0.0004737917333841324, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.8e-07, + "loss": 0.0227, + "reward": 0.22125684656202793, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22125684656202793, + "reward_after_std": 0.8113159164786339, + "reward_before_mean": 0.557825468480587, + "reward_before_std": 0.8478972613811493, + "reward_change_max": 0.0, + "reward_change_mean": -0.3365686163306236, + "reward_change_min": -0.6611680220812559, + "reward_change_std": 0.26093919202685356, + "reward_std": 0.8113159202039242, + "rewards/cosine_scaled_reward": -0.023170609027147293, + "rewards/format_reward": 0.6041666753590107, + "step": 49 + }, + { + "advantage_max": 1.2256408035755157, + "advantage_mean": -1.3659398390153399e-08, + "advantage_min": -0.7610447257757187, + "advantage_std": 0.7761923484504223, + "completion_length": 2976.6458740234375, + "epoch": 0.05714285714285714, + "grad_norm": 0.21591882407665253, + "kl": 0.0005423109978437424, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1e-06, + "loss": 0.0274, + "reward": 0.21824848279356956, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21824848279356956, + "reward_after_std": 0.7761923521757126, + "reward_before_mean": 0.5609084158204496, + "reward_before_std": 0.8036927813664079, + "reward_change_max": 0.0, + "reward_change_mean": -0.34265993256121874, + "reward_change_min": -0.6551012843847275, + "reward_change_std": 0.28258705232292414, + "reward_std": 0.7761923968791962, + "rewards/cosine_scaled_reward": 0.0721208662725985, + "rewards/format_reward": 0.41666667349636555, + "step": 50 + }, + { + "advantage_max": 0.9211089834570885, + "advantage_mean": 1.365939800157534e-08, + "advantage_min": -0.5625776872038841, + "advantage_std": 0.5281440187245607, + "completion_length": 2261.437515258789, + "epoch": 0.05828571428571429, + "grad_norm": 0.0686202198266983, + "kl": 0.0023306608200073242, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0083, + "reward": 0.09554161503911018, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09554161503911018, + "reward_after_std": 0.5281440075486898, + "reward_before_mean": 0.41558761009946465, + "reward_before_std": 0.4915418364107609, + "reward_change_max": 0.0001592785120010376, + "reward_change_mean": -0.32004596339538693, + "reward_change_min": -0.5329243093729019, + "reward_change_std": 0.19881984498351812, + "reward_std": 0.5281440112739801, + "rewards/cosine_scaled_reward": -0.07345621287822723, + "rewards/format_reward": 0.5625, + "step": 51 + }, + { + "advantage_max": 1.6391232684254646, + "advantage_mean": -6.2088170160734535e-09, + "advantage_min": -0.9773569256067276, + "advantage_std": 0.9745854027569294, + "completion_length": 3048.729217529297, + "epoch": 0.05942857142857143, + "grad_norm": 0.13857057690620422, + "kl": 0.0015719830989837646, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0581, + "reward": 0.18856710754334927, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18856710754334927, + "reward_after_std": 0.97458541020751, + "reward_before_mean": 0.4898769208230078, + "reward_before_std": 1.0282159596681595, + "reward_change_max": 0.001281455159187317, + "reward_change_mean": -0.30130978557281196, + "reward_change_min": -0.6202972773462534, + "reward_change_std": 0.27393114077858627, + "reward_std": 0.9745854176580906, + "rewards/cosine_scaled_reward": 0.0470217689871788, + "rewards/format_reward": 0.39583333767950535, + "step": 52 + }, + { + "advantage_max": 1.5374806299805641, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.955609530210495, + "advantage_std": 0.8936157710850239, + "completion_length": 2898.937545776367, + "epoch": 0.060571428571428575, + "grad_norm": 0.1304454803466797, + "kl": 0.0008092299103736877, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0303, + "reward": 0.15414141491055489, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15414141491055489, + "reward_after_std": 0.8936157636344433, + "reward_before_mean": 0.451077688485384, + "reward_before_std": 0.9141882732510567, + "reward_change_max": 0.0007978379726409912, + "reward_change_mean": -0.2969362363219261, + "reward_change_min": -0.5617908462882042, + "reward_change_std": 0.236876605078578, + "reward_std": 0.8936157822608948, + "rewards/cosine_scaled_reward": -0.0036278427578508854, + "rewards/format_reward": 0.4583333432674408, + "step": 53 + }, + { + "advantage_max": 1.4440747387707233, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.9045711942017078, + "advantage_std": 0.8898514695465565, + "completion_length": 2969.437530517578, + "epoch": 0.061714285714285715, + "grad_norm": 0.1441929042339325, + "kl": 0.0003513023257255554, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0623, + "reward": 0.4009180925786495, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4009180925786495, + "reward_after_std": 0.8898514620959759, + "reward_before_mean": 0.7924199029803276, + "reward_before_std": 0.9223152846097946, + "reward_change_max": 0.0, + "reward_change_mean": -0.3915017740800977, + "reward_change_min": -0.7978204973042011, + "reward_change_std": 0.31471778359264135, + "reward_std": 0.8898514732718468, + "rewards/cosine_scaled_reward": 0.15662658959627151, + "rewards/format_reward": 0.4791666753590107, + "step": 54 + }, + { + "advantage_max": 1.3430213667452335, + "advantage_mean": -1.0554989660072067e-08, + "advantage_min": -0.6609198525547981, + "advantage_std": 0.7468325290828943, + "completion_length": 3155.041732788086, + "epoch": 0.06285714285714286, + "grad_norm": 0.1293783038854599, + "kl": 0.0011532604694366455, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0199, + "reward": 0.04226991068571806, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04226991068571806, + "reward_after_std": 0.7468325309455395, + "reward_before_mean": 0.3128039035946131, + "reward_before_std": 0.7251314949244261, + "reward_change_max": 0.0, + "reward_change_mean": -0.2705340012907982, + "reward_change_min": -0.4766126349568367, + "reward_change_std": 0.20029573515057564, + "reward_std": 0.746832549571991, + "rewards/cosine_scaled_reward": -0.010264725424349308, + "rewards/format_reward": 0.33333333767950535, + "step": 55 + }, + { + "advantage_max": 1.0376209765672684, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.8000453487038612, + "advantage_std": 0.6578298974782228, + "completion_length": 3013.416679382324, + "epoch": 0.064, + "grad_norm": 0.11045597493648529, + "kl": 0.0005884170532226562, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0461, + "reward": 0.003945750184357166, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.003945750184357166, + "reward_after_std": 0.6578298974782228, + "reward_before_mean": 0.28323000913951546, + "reward_before_std": 0.702507134526968, + "reward_change_max": 0.0010666921734809875, + "reward_change_mean": -0.27928424440324306, + "reward_change_min": -0.5404662974178791, + "reward_change_std": 0.22895370610058308, + "reward_std": 0.6578299012035131, + "rewards/cosine_scaled_reward": -0.056301675736904144, + "rewards/format_reward": 0.3958333395421505, + "step": 56 + }, + { + "advantage_max": 0.8575425706803799, + "advantage_mean": -5.587935281159417e-09, + "advantage_min": -0.6129637286067009, + "advantage_std": 0.5624299887567759, + "completion_length": 3314.312530517578, + "epoch": 0.06514285714285714, + "grad_norm": 0.08359239995479584, + "kl": 0.0003393888473510742, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0254, + "reward": -0.23580889869481325, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23580889869481325, + "reward_after_std": 0.5624299924820662, + "reward_before_mean": -0.028469612821936607, + "reward_before_std": 0.6206865087151527, + "reward_change_max": 0.00027373433113098145, + "reward_change_mean": -0.2073393096216023, + "reward_change_min": -0.45087942108511925, + "reward_change_std": 0.19378007715567946, + "reward_std": 0.5624300055205822, + "rewards/cosine_scaled_reward": -0.14965146128088236, + "rewards/format_reward": 0.27083333767950535, + "step": 57 + }, + { + "advantage_max": 1.5007806494832039, + "advantage_mean": 1.862645426786713e-09, + "advantage_min": -0.9062970317900181, + "advantage_std": 0.8869751691818237, + "completion_length": 2272.770881652832, + "epoch": 0.06628571428571428, + "grad_norm": 0.12403688579797745, + "kl": 0.007107377052307129, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.992983438818915e-07, + "loss": 0.059, + "reward": 0.43791239289566875, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.43791239289566875, + "reward_after_std": 0.8869751468300819, + "reward_before_mean": 0.8376260045915842, + "reward_before_std": 0.8879306688904762, + "reward_change_max": 0.0, + "reward_change_mean": -0.3997136056423187, + "reward_change_min": -0.7445529289543629, + "reward_change_std": 0.29559123795479536, + "reward_std": 0.8869751766324043, + "rewards/cosine_scaled_reward": 0.08547966694459319, + "rewards/format_reward": 0.6666666753590107, + "step": 58 + }, + { + "advantage_max": 0.9958036541938782, + "advantage_mean": -1.5522043483873205e-08, + "advantage_min": -0.6712572649121284, + "advantage_std": 0.6627084948122501, + "completion_length": 2968.375015258789, + "epoch": 0.06742857142857143, + "grad_norm": 0.13538452982902527, + "kl": 0.0009574443101882935, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0752, + "reward": -0.19166237860918045, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19166237860918045, + "reward_after_std": 0.6627085022628307, + "reward_before_mean": 0.018512267619371414, + "reward_before_std": 0.7423705346882343, + "reward_change_max": 0.0015425384044647217, + "reward_change_mean": -0.21017466066405177, + "reward_change_min": -0.5010223798453808, + "reward_change_std": 0.22122562769800425, + "reward_std": 0.6627085246145725, + "rewards/cosine_scaled_reward": -0.13657721132040024, + "rewards/format_reward": 0.29166666977107525, + "step": 59 + }, + { + "advantage_max": 1.114711195230484, + "advantage_mean": 2.421438738409165e-08, + "advantage_min": -0.43552929908037186, + "advantage_std": 0.5868423134088516, + "completion_length": 3090.9583587646484, + "epoch": 0.06857142857142857, + "grad_norm": 0.08353926241397858, + "kl": 0.001408219337463379, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0065, + "reward": -0.34867841517552733, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.34867841517552733, + "reward_after_std": 0.586842292919755, + "reward_before_mean": -0.200962302275002, + "reward_before_std": 0.5666951425373554, + "reward_change_max": 0.001304030418395996, + "reward_change_mean": -0.14771609636954963, + "reward_change_min": -0.28777224756777287, + "reward_change_std": 0.11059607611969113, + "reward_std": 0.5868423096835613, + "rewards/cosine_scaled_reward": -0.24631450232118368, + "rewards/format_reward": 0.2916666679084301, + "step": 60 + }, + { + "advantage_max": 1.2283815741539001, + "advantage_mean": 1.5832484101530042e-08, + "advantage_min": -0.7546460404992104, + "advantage_std": 0.7360720597207546, + "completion_length": 3173.0625610351562, + "epoch": 0.06971428571428571, + "grad_norm": 0.15681175887584686, + "kl": 0.0024908222258090973, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.98673738502114e-07, + "loss": 0.036, + "reward": -0.0034181829541921616, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0034181829541921616, + "reward_after_std": 0.7360720634460449, + "reward_before_mean": 0.2585161756724119, + "reward_before_std": 0.7624808065593243, + "reward_change_max": 0.0004996657371520996, + "reward_change_mean": -0.2619343502447009, + "reward_change_min": -0.5183047540485859, + "reward_change_std": 0.21530343778431416, + "reward_std": 0.7360720820724964, + "rewards/cosine_scaled_reward": -0.06865859217941761, + "rewards/format_reward": 0.3958333469927311, + "step": 61 + }, + { + "advantage_max": 1.6221475005149841, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.7927829623222351, + "advantage_std": 0.9064479470252991, + "completion_length": 2707.541732788086, + "epoch": 0.07085714285714285, + "grad_norm": 0.1787949800491333, + "kl": 0.020225495100021362, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0607, + "reward": 0.13127783383242786, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13127783383242786, + "reward_after_std": 0.9064479619264603, + "reward_before_mean": 0.4136840028950246, + "reward_before_std": 0.9145010150969028, + "reward_change_max": 0.0001274794340133667, + "reward_change_mean": -0.2824061932042241, + "reward_change_min": -0.5440523903816938, + "reward_change_std": 0.21861811820417643, + "reward_std": 0.9064479991793633, + "rewards/cosine_scaled_reward": -0.043158004991710186, + "rewards/format_reward": 0.5000000037252903, + "step": 62 + }, + { + "advantage_max": 1.1011157259345055, + "advantage_mean": 0.0, + "advantage_min": -0.6950971148908138, + "advantage_std": 0.6476048491895199, + "completion_length": 2512.0833892822266, + "epoch": 0.072, + "grad_norm": 0.0958164632320404, + "kl": 0.003099203109741211, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0544, + "reward": 0.3186934031546116, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3186934031546116, + "reward_after_std": 0.6476048715412617, + "reward_before_mean": 0.7054652385413647, + "reward_before_std": 0.6132442802190781, + "reward_change_max": 0.0004886388778686523, + "reward_change_mean": -0.38677185960114, + "reward_change_min": -0.6413705144077539, + "reward_change_std": 0.2497247066348791, + "reward_std": 0.6476048901677132, + "rewards/cosine_scaled_reward": 0.04023261368274689, + "rewards/format_reward": 0.6250000055879354, + "step": 63 + }, + { + "advantage_max": 1.2440772727131844, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.6886808797717094, + "advantage_std": 0.7557894457131624, + "completion_length": 3082.8958435058594, + "epoch": 0.07314285714285715, + "grad_norm": 0.12037858366966248, + "kl": 0.0024671554565429688, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.97852329991824e-07, + "loss": 0.069, + "reward": -0.07378544472157955, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07378544472157955, + "reward_after_std": 0.7557894717901945, + "reward_before_mean": 0.16163264960050583, + "reward_before_std": 0.8081401251256466, + "reward_change_max": 0.0011908113956451416, + "reward_change_mean": -0.2354180756956339, + "reward_change_min": -0.558480516076088, + "reward_change_std": 0.2264844672754407, + "reward_std": 0.7557894978672266, + "rewards/cosine_scaled_reward": -0.07543368389201532, + "rewards/format_reward": 0.3125000037252903, + "step": 64 + }, + { + "advantage_max": 1.0268595106899738, + "advantage_mean": -1.1796752963366686e-08, + "advantage_min": -0.4311934597790241, + "advantage_std": 0.5640022493898869, + "completion_length": 2795.437511444092, + "epoch": 0.07428571428571429, + "grad_norm": 0.10451654344797134, + "kl": 0.0019540786743164062, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0135, + "reward": -0.20017614914104342, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.20017614914104342, + "reward_after_std": 0.5640022493898869, + "reward_before_mean": 0.006564559414982796, + "reward_before_std": 0.5450711958110332, + "reward_change_max": 0.0015406087040901184, + "reward_change_mean": -0.20674069644883275, + "reward_change_min": -0.381564624607563, + "reward_change_std": 0.15082712983712554, + "reward_std": 0.5640022531151772, + "rewards/cosine_scaled_reward": -0.18421773053705692, + "rewards/format_reward": 0.37500000186264515, + "step": 65 + }, + { + "advantage_max": 1.0503377504646778, + "advantage_mean": -1.2417634254191512e-08, + "advantage_min": -0.6915866583585739, + "advantage_std": 0.6261376366019249, + "completion_length": 2167.041679382324, + "epoch": 0.07542857142857143, + "grad_norm": 0.0698748528957367, + "kl": 0.0012662410736083984, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0057, + "reward": 0.2023143582046032, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2023143582046032, + "reward_after_std": 0.6261376515030861, + "reward_before_mean": 0.5499310828745365, + "reward_before_std": 0.604407899081707, + "reward_change_max": 0.0, + "reward_change_mean": -0.3476167330518365, + "reward_change_min": -0.5529674589633942, + "reward_change_std": 0.23018301371484995, + "reward_std": 0.6261376589536667, + "rewards/cosine_scaled_reward": 0.035382192581892014, + "rewards/format_reward": 0.4791666716337204, + "step": 66 + }, + { + "advantage_max": 0.7263390906155109, + "advantage_mean": 3.10440866346795e-08, + "advantage_min": -0.3848998099565506, + "advantage_std": 0.4047544952481985, + "completion_length": 3579.5625, + "epoch": 0.07657142857142857, + "grad_norm": 0.059073422104120255, + "kl": 0.0013927817344665527, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0008, + "reward": -0.5511298915371299, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5511298915371299, + "reward_after_std": 0.40475449338555336, + "reward_before_mean": -0.44834938645362854, + "reward_before_std": 0.4108631107956171, + "reward_change_max": 3.879517316818237e-05, + "reward_change_mean": -0.10278050391934812, + "reward_change_min": -0.2122868075966835, + "reward_change_std": 0.0857694000005722, + "reward_std": 0.40475449711084366, + "rewards/cosine_scaled_reward": -0.25542469043284655, + "rewards/format_reward": 0.06250000186264515, + "step": 67 + }, + { + "advantage_max": 1.2082672864198685, + "advantage_mean": 5.587935614226325e-09, + "advantage_min": -0.665333541110158, + "advantage_std": 0.746182668954134, + "completion_length": 2459.562572479248, + "epoch": 0.07771428571428571, + "grad_norm": 0.11337409168481827, + "kl": 0.005399227142333984, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0337, + "reward": 0.010181301273405552, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.010181301273405552, + "reward_after_std": 0.7461826372891665, + "reward_before_mean": 0.27732523111626506, + "reward_before_std": 0.7844086028635502, + "reward_change_max": 0.00016658753156661987, + "reward_change_mean": -0.26714393263682723, + "reward_change_min": -0.6000061314553022, + "reward_change_std": 0.23759357165545225, + "reward_std": 0.7461826726794243, + "rewards/cosine_scaled_reward": -0.14258738327771425, + "rewards/format_reward": 0.5625000111758709, + "step": 68 + }, + { + "advantage_max": 1.0632721930742264, + "advantage_mean": 1.3659398057086491e-08, + "advantage_min": -0.5626996904611588, + "advantage_std": 0.6013586819171906, + "completion_length": 2767.0416870117188, + "epoch": 0.07885714285714286, + "grad_norm": 0.11885175853967667, + "kl": 0.0038573741912841797, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0527, + "reward": -0.31397517397999763, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.31397517397999763, + "reward_after_std": 0.6013586968183517, + "reward_before_mean": -0.15239904704503715, + "reward_before_std": 0.6141092143952847, + "reward_change_max": 0.0019856542348861694, + "reward_change_mean": -0.16157611832022667, + "reward_change_min": -0.3111561890691519, + "reward_change_std": 0.13281467324122787, + "reward_std": 0.6013587154448032, + "rewards/cosine_scaled_reward": -0.24286619946360588, + "rewards/format_reward": 0.3333333395421505, + "step": 69 + }, + { + "advantage_max": 1.0964757651090622, + "advantage_mean": 1.0554989271494009e-08, + "advantage_min": -0.5003730542957783, + "advantage_std": 0.5867039151489735, + "completion_length": 2997.041702270508, + "epoch": 0.08, + "grad_norm": 0.08089398592710495, + "kl": 0.001575469970703125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0055, + "reward": -0.2361793201416731, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2361793201416731, + "reward_after_std": 0.5867039300501347, + "reward_before_mean": -0.04719802783802152, + "reward_before_std": 0.5565843749791384, + "reward_change_max": 0.0007937699556350708, + "reward_change_mean": -0.18898130021989346, + "reward_change_min": -0.33782116137444973, + "reward_change_std": 0.13402173947542906, + "reward_std": 0.5867039673030376, + "rewards/cosine_scaled_reward": -0.2006823541596532, + "rewards/format_reward": 0.3541666679084301, + "step": 70 + }, + { + "advantage_max": 1.1093778908252716, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.5899154469370842, + "advantage_std": 0.6263005174696445, + "completion_length": 2871.3958435058594, + "epoch": 0.08114285714285714, + "grad_norm": 0.13375209271907806, + "kl": 0.006723284721374512, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0477, + "reward": -0.0509650744497776, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0509650744497776, + "reward_after_std": 0.6263005062937737, + "reward_before_mean": 0.2027706727385521, + "reward_before_std": 0.6161373816430569, + "reward_change_max": 0.0005811154842376709, + "reward_change_mean": -0.253735733916983, + "reward_change_min": -0.46558609418570995, + "reward_change_std": 0.17825811356306076, + "reward_std": 0.6263005137443542, + "rewards/cosine_scaled_reward": -0.06528134597465396, + "rewards/format_reward": 0.33333334140479565, + "step": 71 + }, + { + "advantage_max": 0.974971279501915, + "advantage_mean": 2.6697914323747796e-08, + "advantage_min": -0.6151207871735096, + "advantage_std": 0.5957626178860664, + "completion_length": 3153.9166870117188, + "epoch": 0.08228571428571428, + "grad_norm": 0.09279810637235641, + "kl": 0.005246877670288086, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0394, + "reward": -0.3170163119211793, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3170163119211793, + "reward_after_std": 0.5957626029849052, + "reward_before_mean": -0.14919825922697783, + "reward_before_std": 0.6452187933027744, + "reward_change_max": 0.0013034045696258545, + "reward_change_mean": -0.16781801730394363, + "reward_change_min": -0.41230191849172115, + "reward_change_std": 0.16988197807222605, + "reward_std": 0.5957626290619373, + "rewards/cosine_scaled_reward": -0.21001579985022545, + "rewards/format_reward": 0.27083333767950535, + "step": 72 + }, + { + "advantage_max": 1.1339119151234627, + "advantage_mean": 2.4835269452072595e-08, + "advantage_min": -0.5014988705515862, + "advantage_std": 0.6255671754479408, + "completion_length": 3523.3333740234375, + "epoch": 0.08342857142857144, + "grad_norm": 0.10664436221122742, + "kl": 0.0007270574569702148, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0274, + "reward": -0.43988874554634094, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.43988874554634094, + "reward_after_std": 0.6255671717226505, + "reward_before_mean": -0.3272168878465891, + "reward_before_std": 0.6468932591378689, + "reward_change_max": 0.0018475502729415894, + "reward_change_mean": -0.11267185024917126, + "reward_change_min": -0.30092281103134155, + "reward_change_std": 0.1199548018630594, + "reward_std": 0.6255671866238117, + "rewards/cosine_scaled_reward": -0.2261084453202784, + "rewards/format_reward": 0.12500000186264515, + "step": 73 + }, + { + "advantage_max": 1.3062651008367538, + "advantage_mean": -1.1796752130699417e-08, + "advantage_min": -0.6358704566955566, + "advantage_std": 0.7469595354050398, + "completion_length": 3221.5208587646484, + "epoch": 0.08457142857142858, + "grad_norm": 0.132724791765213, + "kl": 0.0031901895999908447, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0676, + "reward": -0.025578954257071018, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.025578954257071018, + "reward_after_std": 0.7469595540314913, + "reward_before_mean": 0.2248452752828598, + "reward_before_std": 0.7438722159713507, + "reward_change_max": 0.0008772239089012146, + "reward_change_mean": -0.2504242234863341, + "reward_change_min": -0.5414825454354286, + "reward_change_std": 0.21578879188746214, + "reward_std": 0.7469595912843943, + "rewards/cosine_scaled_reward": -0.033410708769224584, + "rewards/format_reward": 0.29166666977107525, + "step": 74 + }, + { + "advantage_max": 1.1306501738727093, + "advantage_mean": 1.7384688633104162e-08, + "advantage_min": -0.5787255503237247, + "advantage_std": 0.6625193282961845, + "completion_length": 3090.604232788086, + "epoch": 0.08571428571428572, + "grad_norm": 0.11262889206409454, + "kl": 0.0034093856811523438, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0406, + "reward": 0.06313235312700272, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06313235312700272, + "reward_after_std": 0.6625193208456039, + "reward_before_mean": 0.3570488765835762, + "reward_before_std": 0.64370296895504, + "reward_change_max": 0.00019219517707824707, + "reward_change_mean": -0.29391652159392834, + "reward_change_min": -0.6067571640014648, + "reward_change_std": 0.22870568884536624, + "reward_std": 0.6625193618237972, + "rewards/cosine_scaled_reward": 0.0014410973526537418, + "rewards/format_reward": 0.3541666716337204, + "step": 75 + }, + { + "advantage_max": 0.8555637449026108, + "advantage_mean": 4.967053768289986e-09, + "advantage_min": -0.5915851294994354, + "advantage_std": 0.5106821767985821, + "completion_length": 2776.2083892822266, + "epoch": 0.08685714285714285, + "grad_norm": 0.08106429129838943, + "kl": 0.0007976293563842773, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0483, + "reward": -0.12464714795351028, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12464714795351028, + "reward_after_std": 0.5106821544468403, + "reward_before_mean": 0.12069362960755825, + "reward_before_std": 0.5108134467154741, + "reward_change_max": 0.0007242336869239807, + "reward_change_mean": -0.2453407747671008, + "reward_change_min": -0.4396594688296318, + "reward_change_std": 0.17849350557662547, + "reward_std": 0.5106821693480015, + "rewards/cosine_scaled_reward": -0.18965318612754345, + "rewards/format_reward": 0.5000000093132257, + "step": 76 + }, + { + "advantage_max": 0.8869626522064209, + "advantage_mean": 3.1044084525255755e-09, + "advantage_min": -0.6437748745083809, + "advantage_std": 0.5324870087206364, + "completion_length": 3121.979217529297, + "epoch": 0.088, + "grad_norm": 0.08320073783397675, + "kl": 0.0010444223880767822, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0294, + "reward": -0.21349376626312733, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21349376626312733, + "reward_after_std": 0.5324870124459267, + "reward_before_mean": -0.0016396627761423588, + "reward_before_std": 0.5460649132728577, + "reward_change_max": 0.0005499720573425293, + "reward_change_mean": -0.21185409231111407, + "reward_change_min": -0.3882265854626894, + "reward_change_std": 0.16535702906548977, + "reward_std": 0.5324870124459267, + "rewards/cosine_scaled_reward": -0.1883198358118534, + "rewards/format_reward": 0.37500000931322575, + "step": 77 + }, + { + "advantage_max": 1.5690804421901703, + "advantage_mean": -3.601114062501409e-08, + "advantage_min": -0.7654212564229965, + "advantage_std": 0.8929519504308701, + "completion_length": 2964.062545776367, + "epoch": 0.08914285714285715, + "grad_norm": 0.1362496167421341, + "kl": 0.0023328065872192383, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0472, + "reward": 0.23330368660390377, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23330368660390377, + "reward_after_std": 0.8929519504308701, + "reward_before_mean": 0.5573525791987777, + "reward_before_std": 0.8918314576148987, + "reward_change_max": 0.0002750083804130554, + "reward_change_mean": -0.3240489256568253, + "reward_change_min": -0.6681363992393017, + "reward_change_std": 0.259742493275553, + "reward_std": 0.8929519802331924, + "rewards/cosine_scaled_reward": 0.07034294621553272, + "rewards/format_reward": 0.41666666977107525, + "step": 78 + }, + { + "advantage_max": 1.1290984824299812, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -0.566096693277359, + "advantage_std": 0.624596331268549, + "completion_length": 2287.541690826416, + "epoch": 0.09028571428571429, + "grad_norm": 0.10173244774341583, + "kl": 0.0024640560150146484, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.908088623197048e-07, + "loss": -0.0065, + "reward": 0.14616259443573654, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14616259443573654, + "reward_after_std": 0.6245963498950005, + "reward_before_mean": 0.46677736937999725, + "reward_before_std": 0.5631000082939863, + "reward_change_max": 0.0004928857088088989, + "reward_change_mean": -0.3206147523596883, + "reward_change_min": -0.5135222245007753, + "reward_change_std": 0.20620128232985735, + "reward_std": 0.6245963498950005, + "rewards/cosine_scaled_reward": -0.03744465671479702, + "rewards/format_reward": 0.5416666679084301, + "step": 79 + }, + { + "advantage_max": 1.4897634759545326, + "advantage_mean": -3.4148496752539614e-08, + "advantage_min": -0.6975793838500977, + "advantage_std": 0.8379081785678864, + "completion_length": 3263.166717529297, + "epoch": 0.09142857142857143, + "grad_norm": 0.15222951769828796, + "kl": 0.0024718046188354492, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0648, + "reward": 0.0717293145135045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0717293145135045, + "reward_after_std": 0.8379081785678864, + "reward_before_mean": 0.34337579587008804, + "reward_before_std": 0.8289961963891983, + "reward_change_max": 0.00044892728328704834, + "reward_change_mean": -0.2716464791446924, + "reward_change_min": -0.4956607408821583, + "reward_change_std": 0.20896095503121614, + "reward_std": 0.837908186018467, + "rewards/cosine_scaled_reward": -0.015812127850949764, + "rewards/format_reward": 0.3750000037252903, + "step": 80 + }, + { + "advantage_max": 1.2281683385372162, + "advantage_mean": 2.483527383745354e-09, + "advantage_min": -0.618021085858345, + "advantage_std": 0.690451554954052, + "completion_length": 3113.541732788086, + "epoch": 0.09257142857142857, + "grad_norm": 0.20810602605342865, + "kl": 0.005570411682128906, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0338, + "reward": -0.2511444576084614, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2511444576084614, + "reward_after_std": 0.6904515661299229, + "reward_before_mean": -0.07739553973078728, + "reward_before_std": 0.7059197537600994, + "reward_change_max": 0.000757955014705658, + "reward_change_mean": -0.1737489178776741, + "reward_change_min": -0.411263357847929, + "reward_change_std": 0.16339826956391335, + "reward_std": 0.6904515847563744, + "rewards/cosine_scaled_reward": -0.1949477707967162, + "rewards/format_reward": 0.31250000558793545, + "step": 81 + }, + { + "advantage_max": 1.31115597859025, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.8089087083935738, + "advantage_std": 0.7565962858498096, + "completion_length": 2834.8958740234375, + "epoch": 0.09371428571428571, + "grad_norm": 0.09568361937999725, + "kl": 0.002544999122619629, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0091, + "reward": 0.1376865222118795, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1376865222118795, + "reward_after_std": 0.756596315652132, + "reward_before_mean": 0.4453375115990639, + "reward_before_std": 0.7576594576239586, + "reward_change_max": 0.0004483461380004883, + "reward_change_mean": -0.3076509768143296, + "reward_change_min": -0.5397645384073257, + "reward_change_std": 0.21682811994105577, + "reward_std": 0.7565963491797447, + "rewards/cosine_scaled_reward": 0.03516875783680007, + "rewards/format_reward": 0.37500000558793545, + "step": 82 + }, + { + "advantage_max": 1.5486390925943851, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.7127926684916019, + "advantage_std": 0.8253224082291126, + "completion_length": 2919.6458435058594, + "epoch": 0.09485714285714286, + "grad_norm": 0.1437751054763794, + "kl": 0.002646923065185547, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.881105062929221e-07, + "loss": 0.004, + "reward": 0.000450813677161932, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.000450813677161932, + "reward_after_std": 0.8253223933279514, + "reward_before_mean": 0.24097184976562858, + "reward_before_std": 0.7998228445649147, + "reward_change_max": 0.0002821981906890869, + "reward_change_mean": -0.2405210305005312, + "reward_change_min": -0.4127188418060541, + "reward_change_std": 0.16908379085361958, + "reward_std": 0.825322400778532, + "rewards/cosine_scaled_reward": -0.025347420232719742, + "rewards/format_reward": 0.2916666679084301, + "step": 83 + }, + { + "advantage_max": 1.3038128688931465, + "advantage_mean": -6.208817904251873e-10, + "advantage_min": -0.6896253600716591, + "advantage_std": 0.7823732793331146, + "completion_length": 3117.812530517578, + "epoch": 0.096, + "grad_norm": 0.12220973521471024, + "kl": 0.001085519790649414, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0568, + "reward": 0.06299414858222008, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06299414858222008, + "reward_after_std": 0.782373296096921, + "reward_before_mean": 0.34435543417930603, + "reward_before_std": 0.811880424618721, + "reward_change_max": 0.0007270798087120056, + "reward_change_mean": -0.28136129723861814, + "reward_change_min": -0.6436299737542868, + "reward_change_std": 0.25078502343967557, + "reward_std": 0.7823732979595661, + "rewards/cosine_scaled_reward": 0.005511032417416573, + "rewards/format_reward": 0.3333333358168602, + "step": 84 + }, + { + "advantage_max": 1.8904551193118095, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.8693808242678642, + "advantage_std": 1.0314135998487473, + "completion_length": 3134.875030517578, + "epoch": 0.09714285714285714, + "grad_norm": 0.1699473261833191, + "kl": 0.0018897056579589844, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0623, + "reward": -0.04264771193265915, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.04264771193265915, + "reward_after_std": 1.0314136110246181, + "reward_before_mean": 0.1589924634899944, + "reward_before_std": 1.0488222055137157, + "reward_change_max": 0.001532226800918579, + "reward_change_mean": -0.20164018403738737, + "reward_change_min": -0.44846535101532936, + "reward_change_std": 0.19201862812042236, + "reward_std": 1.031413622200489, + "rewards/cosine_scaled_reward": -0.08717044070363045, + "rewards/format_reward": 0.33333334140479565, + "step": 85 + }, + { + "advantage_max": 1.0618792101740837, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.7318792194128036, + "advantage_std": 0.6476677916944027, + "completion_length": 2897.2291717529297, + "epoch": 0.09828571428571428, + "grad_norm": 0.08707182109355927, + "kl": 0.0034961700439453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.85862422507884e-07, + "loss": 0.013, + "reward": 0.017172118183225393, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.017172118183225393, + "reward_after_std": 0.6476677916944027, + "reward_before_mean": 0.2989570014178753, + "reward_before_std": 0.6632709354162216, + "reward_change_max": 0.0015084072947502136, + "reward_change_mean": -0.2817848902195692, + "reward_change_min": -0.5358342751860619, + "reward_change_std": 0.21289643086493015, + "reward_std": 0.6476677916944027, + "rewards/cosine_scaled_reward": -0.07968816673383117, + "rewards/format_reward": 0.4583333432674408, + "step": 86 + }, + { + "advantage_max": 1.3855399899184704, + "advantage_mean": -8.692345065952622e-09, + "advantage_min": -0.8480601236224174, + "advantage_std": 0.8235466033220291, + "completion_length": 2927.166717529297, + "epoch": 0.09942857142857142, + "grad_norm": 0.20664729177951813, + "kl": 0.010286808013916016, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0528, + "reward": 0.04761325381696224, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04761325381696224, + "reward_after_std": 0.8235466033220291, + "reward_before_mean": 0.3165686149150133, + "reward_before_std": 0.8553726263344288, + "reward_change_max": 0.0007121041417121887, + "reward_change_mean": -0.2689553592354059, + "reward_change_min": -0.5325675681233406, + "reward_change_std": 0.22574665397405624, + "reward_std": 0.8235466182231903, + "rewards/cosine_scaled_reward": -0.08129904745146632, + "rewards/format_reward": 0.47916668467223644, + "step": 87 + }, + { + "advantage_max": 1.5240605920553207, + "advantage_mean": -6.208818015274176e-09, + "advantage_min": -1.0140509381890297, + "advantage_std": 0.9408680759370327, + "completion_length": 2959.166748046875, + "epoch": 0.10057142857142858, + "grad_norm": 0.16569511592388153, + "kl": 0.007565021514892578, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0452, + "reward": 0.22164431703276932, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22164431703276932, + "reward_after_std": 0.9408680908381939, + "reward_before_mean": 0.5424534603953362, + "reward_before_std": 1.0073655508458614, + "reward_change_max": 0.000623852014541626, + "reward_change_mean": -0.3208091165870428, + "reward_change_min": -0.6903372332453728, + "reward_change_std": 0.2898290455341339, + "reward_std": 0.9408681094646454, + "rewards/cosine_scaled_reward": 0.03164337668567896, + "rewards/format_reward": 0.4791666753590107, + "step": 88 + }, + { + "advantage_max": 1.2696744501590729, + "advantage_mean": 1.8626453157644107e-09, + "advantage_min": -0.5966677069664001, + "advantage_std": 0.7119504939764738, + "completion_length": 3250.8958587646484, + "epoch": 0.10171428571428572, + "grad_norm": 0.1325407475233078, + "kl": 0.0038471221923828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0007, + "reward": -0.111092375125736, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.111092375125736, + "reward_after_std": 0.7119505014270544, + "reward_before_mean": 0.1102348892018199, + "reward_before_std": 0.7136143017560244, + "reward_change_max": 0.0010024309158325195, + "reward_change_mean": -0.22132724151015282, + "reward_change_min": -0.47170834988355637, + "reward_change_std": 0.18059588875621557, + "reward_std": 0.7119505144655704, + "rewards/cosine_scaled_reward": -0.06988256610929966, + "rewards/format_reward": 0.25000000186264515, + "step": 89 + }, + { + "advantage_max": 1.1662665717303753, + "advantage_mean": 0.0, + "advantage_min": -0.5448751635849476, + "advantage_std": 0.6311663277447224, + "completion_length": 2775.437515258789, + "epoch": 0.10285714285714286, + "grad_norm": 0.11538201570510864, + "kl": 0.011361122131347656, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0164, + "reward": -0.16298224218189716, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.16298224218189716, + "reward_after_std": 0.6311663128435612, + "reward_before_mean": 0.04312063241377473, + "reward_before_std": 0.6125261038541794, + "reward_change_max": 0.0005078762769699097, + "reward_change_mean": -0.20610287599265575, + "reward_change_min": -0.38208043575286865, + "reward_change_std": 0.15109846275299788, + "reward_std": 0.6311663277447224, + "rewards/cosine_scaled_reward": -0.1867730226367712, + "rewards/format_reward": 0.4166666716337204, + "step": 90 + }, + { + "advantage_max": 1.474507611244917, + "advantage_mean": 1.4280280180578586e-08, + "advantage_min": -0.9642866589128971, + "advantage_std": 0.9065007567405701, + "completion_length": 3187.0000915527344, + "epoch": 0.104, + "grad_norm": 0.15550005435943604, + "kl": 0.005296945571899414, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.816912885430258e-07, + "loss": 0.038, + "reward": 0.1353749530389905, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1353749530389905, + "reward_after_std": 0.9065007455646992, + "reward_before_mean": 0.42992572486400604, + "reward_before_std": 0.9722138866782188, + "reward_change_max": 0.0, + "reward_change_mean": -0.29455077461898327, + "reward_change_min": -0.6697384584695101, + "reward_change_std": 0.2784268017858267, + "reward_std": 0.9065007641911507, + "rewards/cosine_scaled_reward": 0.017046190798282623, + "rewards/format_reward": 0.3958333432674408, + "step": 91 + }, + { + "advantage_max": 1.0653186030685902, + "advantage_mean": 5.898376342905465e-09, + "advantage_min": -0.8072712197899818, + "advantage_std": 0.6613656990230083, + "completion_length": 2990.500015258789, + "epoch": 0.10514285714285715, + "grad_norm": 0.11002243310213089, + "kl": 0.00823211669921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0423, + "reward": -0.012807987630367279, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.012807987630367279, + "reward_after_std": 0.661365695297718, + "reward_before_mean": 0.25806828774511814, + "reward_before_std": 0.6961026005446911, + "reward_change_max": 0.0003693774342536926, + "reward_change_mean": -0.2708762800320983, + "reward_change_min": -0.492202278226614, + "reward_change_std": 0.21451785834506154, + "reward_std": 0.6613657251000404, + "rewards/cosine_scaled_reward": -0.1001325212419033, + "rewards/format_reward": 0.45833334885537624, + "step": 92 + }, + { + "advantage_max": 0.7933408431708813, + "advantage_mean": 2.1730860666480112e-08, + "advantage_min": -0.46355464309453964, + "advantage_std": 0.4695439264178276, + "completion_length": 3566.375, + "epoch": 0.10628571428571429, + "grad_norm": 0.08727142214775085, + "kl": 0.005443572998046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0051, + "reward": -0.46245191991329193, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.46245191991329193, + "reward_after_std": 0.4695439338684082, + "reward_before_mean": -0.33290886227041483, + "reward_before_std": 0.4970332011580467, + "reward_change_max": 0.0008160322904586792, + "reward_change_mean": -0.1295430501922965, + "reward_change_min": -0.2736487351357937, + "reward_change_std": 0.12085827440023422, + "reward_std": 0.4695439413189888, + "rewards/cosine_scaled_reward": -0.18728776648640633, + "rewards/format_reward": 0.0416666679084301, + "step": 93 + }, + { + "advantage_max": 1.0807769075036049, + "advantage_mean": -1.6142924885720333e-08, + "advantage_min": -0.5711992047727108, + "advantage_std": 0.6211388818919659, + "completion_length": 3111.437530517578, + "epoch": 0.10742857142857143, + "grad_norm": 0.12472743541002274, + "kl": 0.010393619537353516, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0441, + "reward": -0.10525502264499664, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10525502264499664, + "reward_after_std": 0.6211388818919659, + "reward_before_mean": 0.13153162971138954, + "reward_before_std": 0.6166800931096077, + "reward_change_max": 0.00021298229694366455, + "reward_change_mean": -0.23678663885220885, + "reward_change_min": -0.49180080369114876, + "reward_change_std": 0.18674441473558545, + "reward_std": 0.6211389005184174, + "rewards/cosine_scaled_reward": -0.06965086422860622, + "rewards/format_reward": 0.27083333767950535, + "step": 94 + }, + { + "advantage_max": 1.2856793850660324, + "advantage_mean": 8.6923440667519e-09, + "advantage_min": -0.5027649588882923, + "advantage_std": 0.6929790526628494, + "completion_length": 3391.0416870117188, + "epoch": 0.10857142857142857, + "grad_norm": 0.09327510744333267, + "kl": 0.002196788787841797, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0196, + "reward": -0.3284091189270839, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3284091189270839, + "reward_after_std": 0.6929790526628494, + "reward_before_mean": -0.18555639125406742, + "reward_before_std": 0.6929865293204784, + "reward_change_max": 0.0010388195514678955, + "reward_change_mean": -0.14285272872075438, + "reward_change_min": -0.36844565719366074, + "reward_change_std": 0.13806079514324665, + "reward_std": 0.69297906011343, + "rewards/cosine_scaled_reward": -0.20736153866164386, + "rewards/format_reward": 0.22916666977107525, + "step": 95 + }, + { + "advantage_max": 0.9502243474125862, + "advantage_mean": 2.7755575615628914e-16, + "advantage_min": -0.762770913541317, + "advantage_std": 0.6231532171368599, + "completion_length": 3168.2708587646484, + "epoch": 0.10971428571428571, + "grad_norm": 0.11269133538007736, + "kl": 0.008467674255371094, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0454, + "reward": -0.09130434179678559, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09130434179678559, + "reward_after_std": 0.6231532134115696, + "reward_before_mean": 0.15943890064954758, + "reward_before_std": 0.6864537037909031, + "reward_change_max": 0.002087824046611786, + "reward_change_mean": -0.25074325082823634, + "reward_change_min": -0.5374267399311066, + "reward_change_std": 0.21889762580394745, + "reward_std": 0.6231532245874405, + "rewards/cosine_scaled_reward": -0.07653055200353265, + "rewards/format_reward": 0.3125000111758709, + "step": 96 + }, + { + "advantage_max": 0.894398532807827, + "advantage_mean": 9.934107536579972e-09, + "advantage_min": -0.6122366264462471, + "advantage_std": 0.5579078607261181, + "completion_length": 3316.2083740234375, + "epoch": 0.11085714285714286, + "grad_norm": 0.11165490746498108, + "kl": 0.00435638427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0248, + "reward": -0.07307976484298706, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07307976484298706, + "reward_after_std": 0.5579078570008278, + "reward_before_mean": 0.1892811693251133, + "reward_before_std": 0.5676539484411478, + "reward_change_max": 0.00020658224821090698, + "reward_change_mean": -0.26236093137413263, + "reward_change_min": -0.4734621290117502, + "reward_change_std": 0.2033616118133068, + "reward_std": 0.5579078681766987, + "rewards/cosine_scaled_reward": -0.0407760813832283, + "rewards/format_reward": 0.27083333767950535, + "step": 97 + }, + { + "advantage_max": 1.1973653174936771, + "advantage_mean": 1.8626444830971423e-09, + "advantage_min": -0.49922803044319153, + "advantage_std": 0.6366796083748341, + "completion_length": 3146.0833740234375, + "epoch": 0.112, + "grad_norm": 0.1001494824886322, + "kl": 0.003482341766357422, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0621, + "reward": -0.19620523788034916, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19620523788034916, + "reward_after_std": 0.6366796176880598, + "reward_before_mean": -0.00034199096262454987, + "reward_before_std": 0.5961110591888428, + "reward_change_max": 0.0005563125014305115, + "reward_change_mean": -0.19586324412375689, + "reward_change_min": -0.3818340804427862, + "reward_change_std": 0.14894506987184286, + "reward_std": 0.6366796419024467, + "rewards/cosine_scaled_reward": -0.1668376699090004, + "rewards/format_reward": 0.33333333767950535, + "step": 98 + }, + { + "advantage_max": 0.9004656635224819, + "advantage_mean": -3.7252897433504017e-09, + "advantage_min": -0.4743019826710224, + "advantage_std": 0.5211676489561796, + "completion_length": 2839.750015258789, + "epoch": 0.11314285714285714, + "grad_norm": 0.08176909387111664, + "kl": 0.01638031005859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0088, + "reward": -0.25589028745889664, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.25589028745889664, + "reward_after_std": 0.5211676508188248, + "reward_before_mean": -0.05980054661631584, + "reward_before_std": 0.5296378303319216, + "reward_change_max": 0.0010651499032974243, + "reward_change_mean": -0.196089755743742, + "reward_change_min": -0.39398371428251266, + "reward_change_std": 0.15637101605534554, + "reward_std": 0.5211676601320505, + "rewards/cosine_scaled_reward": -0.1861502705141902, + "rewards/format_reward": 0.3125, + "step": 99 + }, + { + "advantage_max": 1.4020006023347378, + "advantage_mean": -1.2417636363615259e-09, + "advantage_min": -0.9309210404753685, + "advantage_std": 0.8519085347652435, + "completion_length": 3042.062530517578, + "epoch": 0.11428571428571428, + "grad_norm": 0.1596578061580658, + "kl": 0.009717941284179688, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0867, + "reward": 0.09608042938634753, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09608042938634753, + "reward_after_std": 0.8519085198640823, + "reward_before_mean": 0.3819417469203472, + "reward_before_std": 0.9046727884560823, + "reward_change_max": 0.0, + "reward_change_mean": -0.2858613282442093, + "reward_change_min": -0.5586092844605446, + "reward_change_std": 0.2462056027725339, + "reward_std": 0.851908553391695, + "rewards/cosine_scaled_reward": -0.006945790722966194, + "rewards/format_reward": 0.39583334885537624, + "step": 100 + }, + { + "advantage_max": 0.637329638004303, + "advantage_mean": 3.4148494920671624e-08, + "advantage_min": -0.44175083562731743, + "advantage_std": 0.37393420562148094, + "completion_length": 2976.416717529297, + "epoch": 0.11542857142857142, + "grad_norm": 0.06858979165554047, + "kl": 0.004735231399536133, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0141, + "reward": -0.12358441762626171, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12358441762626171, + "reward_after_std": 0.37393419072031975, + "reward_before_mean": 0.13824375998228788, + "reward_before_std": 0.3420494869351387, + "reward_change_max": 0.0, + "reward_change_mean": -0.26182815805077553, + "reward_change_min": -0.40689949691295624, + "reward_change_std": 0.16052342765033245, + "reward_std": 0.37393420189619064, + "rewards/cosine_scaled_reward": -0.0871281186118722, + "rewards/format_reward": 0.31250000186264515, + "step": 101 + }, + { + "advantage_max": 1.4978736191987991, + "advantage_mean": 1.614292521878724e-08, + "advantage_min": -0.9803852066397667, + "advantage_std": 0.9344386383891106, + "completion_length": 2877.0000915527344, + "epoch": 0.11657142857142858, + "grad_norm": 0.22138696908950806, + "kl": 0.012332916259765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.706715543782064e-07, + "loss": 0.112, + "reward": 0.25213714223355055, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.25213714223355055, + "reward_after_std": 0.9344386458396912, + "reward_before_mean": 0.5871431287378073, + "reward_before_std": 1.0055402405560017, + "reward_change_max": 0.0006800442934036255, + "reward_change_mean": -0.3350059576332569, + "reward_change_min": -0.7078782953321934, + "reward_change_std": 0.2961591836065054, + "reward_std": 0.9344386756420135, + "rewards/cosine_scaled_reward": 0.033154879696667194, + "rewards/format_reward": 0.5208333395421505, + "step": 102 + }, + { + "advantage_max": 1.4239412993192673, + "advantage_mean": 9.934107647602275e-09, + "advantage_min": -0.7207528688013554, + "advantage_std": 0.8292893841862679, + "completion_length": 3183.3958740234375, + "epoch": 0.11771428571428572, + "grad_norm": 0.14050276577472687, + "kl": 0.009951591491699219, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.695457105469804e-07, + "loss": 0.098, + "reward": -0.10000946186482906, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10000946186482906, + "reward_after_std": 0.8292893879115582, + "reward_before_mean": 0.11191971227526665, + "reward_before_std": 0.8726517036557198, + "reward_change_max": 0.0009425804018974304, + "reward_change_mean": -0.2119291506242007, + "reward_change_min": -0.5215573105961084, + "reward_change_std": 0.21027667145244777, + "reward_std": 0.8292894102632999, + "rewards/cosine_scaled_reward": -0.11070681922137737, + "rewards/format_reward": 0.3333333432674408, + "step": 103 + }, + { + "advantage_max": 1.1159479767084122, + "advantage_mean": 8.071462720415923e-09, + "advantage_min": -0.6675407961010933, + "advantage_std": 0.661103330552578, + "completion_length": 2837.000015258789, + "epoch": 0.11885714285714286, + "grad_norm": 0.23616887629032135, + "kl": 0.006572723388671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0544, + "reward": -0.12476626224815845, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12476626224815845, + "reward_after_std": 0.6611033640801907, + "reward_before_mean": 0.10218615579651669, + "reward_before_std": 0.6853909529745579, + "reward_change_max": 0.0016945451498031616, + "reward_change_mean": -0.2269524084404111, + "reward_change_min": -0.4498551990836859, + "reward_change_std": 0.18958128709346056, + "reward_std": 0.6611033827066422, + "rewards/cosine_scaled_reward": -0.136406933888793, + "rewards/format_reward": 0.37500000186264515, + "step": 104 + }, + { + "advantage_max": 1.7909524142742157, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.9422320500016212, + "advantage_std": 1.0115957744419575, + "completion_length": 2932.2708435058594, + "epoch": 0.12, + "grad_norm": 0.12929129600524902, + "kl": 0.0063686370849609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0374, + "reward": 0.13043325836770236, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13043325836770236, + "reward_after_std": 1.0115957781672478, + "reward_before_mean": 0.4006117479875684, + "reward_before_std": 1.0370574593544006, + "reward_change_max": 0.0004918202757835388, + "reward_change_mean": -0.2701784926466644, + "reward_change_min": -0.5277115534991026, + "reward_change_std": 0.2267649695277214, + "reward_std": 1.0115957856178284, + "rewards/cosine_scaled_reward": 0.002389195084106177, + "rewards/format_reward": 0.3958333432674408, + "step": 105 + }, + { + "advantage_max": 1.4532872326672077, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.8069161958992481, + "advantage_std": 0.8219671659171581, + "completion_length": 2498.3333740234375, + "epoch": 0.12114285714285715, + "grad_norm": 0.12542401254177094, + "kl": 0.013767242431640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0723, + "reward": 0.41410426795482635, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41410426795482635, + "reward_after_std": 0.821967139840126, + "reward_before_mean": 0.8086659302935004, + "reward_before_std": 0.7867364194244146, + "reward_change_max": 0.0, + "reward_change_mean": -0.3945616828277707, + "reward_change_min": -0.6701525822281837, + "reward_change_std": 0.264983544126153, + "reward_std": 0.8219671808183193, + "rewards/cosine_scaled_reward": 0.11266629956662655, + "rewards/format_reward": 0.5833333376795053, + "step": 106 + }, + { + "advantage_max": 0.9903253465890884, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -0.5689501836895943, + "advantage_std": 0.5777853094041348, + "completion_length": 2992.3333435058594, + "epoch": 0.12228571428571429, + "grad_norm": 0.12914782762527466, + "kl": 0.00812530517578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0535, + "reward": -0.03730320557951927, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03730320557951927, + "reward_after_std": 0.577785300090909, + "reward_before_mean": 0.23000611877068877, + "reward_before_std": 0.5657899845391512, + "reward_change_max": 0.0007830634713172913, + "reward_change_mean": -0.26730930525809526, + "reward_change_min": -0.46656811609864235, + "reward_change_std": 0.1908670226112008, + "reward_std": 0.5777853112667799, + "rewards/cosine_scaled_reward": -0.09333029016852379, + "rewards/format_reward": 0.41666666977107525, + "step": 107 + }, + { + "advantage_max": 1.4422883205115795, + "advantage_mean": 4.967053657267684e-09, + "advantage_min": -0.6572414226830006, + "advantage_std": 0.8186516799032688, + "completion_length": 2927.0417098999023, + "epoch": 0.12342857142857143, + "grad_norm": 24.618846893310547, + "kl": 3.3649168014526367, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.636109026648554e-07, + "loss": 0.1124, + "reward": -0.15085413120687008, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15085413120687008, + "reward_after_std": 0.8186516761779785, + "reward_before_mean": 0.04168054834008217, + "reward_before_std": 0.8399753011763096, + "reward_change_max": 0.0003223493695259094, + "reward_change_mean": -0.19253466837108135, + "reward_change_min": -0.4192156232893467, + "reward_change_std": 0.18019374925643206, + "reward_std": 0.8186517059803009, + "rewards/cosine_scaled_reward": -0.1458263983950019, + "rewards/format_reward": 0.3333333358168602, + "step": 108 + }, + { + "advantage_max": 0.8548574820160866, + "advantage_mean": 9.934107536579972e-09, + "advantage_min": -0.5198825635015965, + "advantage_std": 0.49553442001342773, + "completion_length": 3041.3541870117188, + "epoch": 0.12457142857142857, + "grad_norm": 0.07054049521684647, + "kl": 0.0053844451904296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0056, + "reward": -0.21929301880300045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21929301880300045, + "reward_after_std": 0.49553442373871803, + "reward_before_mean": -0.006567927077412605, + "reward_before_std": 0.49612240865826607, + "reward_change_max": 0.0013580769300460815, + "reward_change_mean": -0.21272510197013617, + "reward_change_min": -0.3920324221253395, + "reward_change_std": 0.15321388468146324, + "reward_std": 0.4955344498157501, + "rewards/cosine_scaled_reward": -0.15953395422548056, + "rewards/format_reward": 0.31250000186264515, + "step": 109 + }, + { + "advantage_max": 1.7169987708330154, + "advantage_mean": 1.521160253314946e-08, + "advantage_min": -0.773616373538971, + "advantage_std": 0.9723969623446465, + "completion_length": 2968.9584045410156, + "epoch": 0.12571428571428572, + "grad_norm": 0.16165363788604736, + "kl": 0.007773399353027344, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0213, + "reward": 0.1421629348769784, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1421629348769784, + "reward_after_std": 0.9723969735205173, + "reward_before_mean": 0.4209832027554512, + "reward_before_std": 0.9890514109283686, + "reward_change_max": 0.00023964792490005493, + "reward_change_mean": -0.27882024459540844, + "reward_change_min": -0.6649527996778488, + "reward_change_std": 0.2514312257990241, + "reward_std": 0.9723969958722591, + "rewards/cosine_scaled_reward": -0.02909173769876361, + "rewards/format_reward": 0.479166679084301, + "step": 110 + }, + { + "advantage_max": 1.0488375090062618, + "advantage_mean": 1.4280279181377864e-08, + "advantage_min": -0.6760149672627449, + "advantage_std": 0.6392879411578178, + "completion_length": 3331.0208740234375, + "epoch": 0.12685714285714286, + "grad_norm": 0.1319161206483841, + "kl": 0.008884429931640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0618, + "reward": -0.24013402685523033, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.24013402685523033, + "reward_after_std": 0.6392879635095596, + "reward_before_mean": -0.04797346517443657, + "reward_before_std": 0.6857679709792137, + "reward_change_max": 0.0008327588438987732, + "reward_change_mean": -0.1921605784446001, + "reward_change_min": -0.4485097285360098, + "reward_change_std": 0.18650522828102112, + "reward_std": 0.6392879746854305, + "rewards/cosine_scaled_reward": -0.10732006467878819, + "rewards/format_reward": 0.16666666977107525, + "step": 111 + }, + { + "advantage_max": 1.4520698636770248, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.7237064838409424, + "advantage_std": 0.8273558132350445, + "completion_length": 3388.4584045410156, + "epoch": 0.128, + "grad_norm": 0.13587014377117157, + "kl": 0.004698753356933594, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0292, + "reward": 0.06877225078642368, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06877225078642368, + "reward_after_std": 0.8273558430373669, + "reward_before_mean": 0.34132440108805895, + "reward_before_std": 0.827323455363512, + "reward_change_max": 0.0007152184844017029, + "reward_change_mean": -0.2725521409884095, + "reward_change_min": -0.5733534023165703, + "reward_change_std": 0.22624664986506104, + "reward_std": 0.8273558542132378, + "rewards/cosine_scaled_reward": 0.003995520528405905, + "rewards/format_reward": 0.3333333358168602, + "step": 112 + }, + { + "advantage_max": 1.3849294260144234, + "advantage_mean": -2.4214387051024744e-08, + "advantage_min": -0.9319385662674904, + "advantage_std": 0.8326413743197918, + "completion_length": 2787.5625610351562, + "epoch": 0.12914285714285714, + "grad_norm": 0.17472368478775024, + "kl": 0.0091705322265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0602, + "reward": 0.22794928343500942, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22794928343500942, + "reward_after_std": 0.8326413743197918, + "reward_before_mean": 0.561581514775753, + "reward_before_std": 0.8620108254253864, + "reward_change_max": 0.0, + "reward_change_mean": -0.333632237277925, + "reward_change_min": -0.6458872146904469, + "reward_change_std": 0.2622876074165106, + "reward_std": 0.832641389220953, + "rewards/cosine_scaled_reward": -0.010875914245843887, + "rewards/format_reward": 0.5833333414047956, + "step": 113 + }, + { + "advantage_max": 0.9513976350426674, + "advantage_mean": -1.8626453157644107e-09, + "advantage_min": -0.44018446281552315, + "advantage_std": 0.5162045583128929, + "completion_length": 2600.62508392334, + "epoch": 0.13028571428571428, + "grad_norm": 0.07465193420648575, + "kl": 0.0067729949951171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.55824636882301e-07, + "loss": 0.014, + "reward": -0.1275107857072726, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1275107857072726, + "reward_after_std": 0.5162045545876026, + "reward_before_mean": 0.11073758825659752, + "reward_before_std": 0.46768177300691605, + "reward_change_max": 0.0012068524956703186, + "reward_change_mean": -0.23824838874861598, + "reward_change_min": -0.41094420850276947, + "reward_change_std": 0.15441239904612303, + "reward_std": 0.5162045657634735, + "rewards/cosine_scaled_reward": -0.2571312137879431, + "rewards/format_reward": 0.6250000055879354, + "step": 114 + }, + { + "advantage_max": 1.2352916896343231, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.5806517638266087, + "advantage_std": 0.6685930602252483, + "completion_length": 2888.1250228881836, + "epoch": 0.13142857142857142, + "grad_norm": 0.09872996062040329, + "kl": 0.006771087646484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0075, + "reward": -0.10066608339548111, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10066608339548111, + "reward_after_std": 0.668593030422926, + "reward_before_mean": 0.1253925976343453, + "reward_before_std": 0.6527570895850658, + "reward_change_max": 0.001193806529045105, + "reward_change_mean": -0.22605869453400373, + "reward_change_min": -0.3762983959168196, + "reward_change_std": 0.15092294523492455, + "reward_std": 0.6685930527746677, + "rewards/cosine_scaled_reward": -0.09355370327830315, + "rewards/format_reward": 0.31250000186264515, + "step": 115 + }, + { + "advantage_max": 0.9336372055113316, + "advantage_mean": 1.862645371275562e-09, + "advantage_min": -0.5836285278201103, + "advantage_std": 0.5509970504790545, + "completion_length": 3403.2291870117188, + "epoch": 0.13257142857142856, + "grad_norm": 0.1181810051202774, + "kl": 0.006374359130859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0328, + "reward": -0.2978327311575413, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2978327311575413, + "reward_after_std": 0.5509970467537642, + "reward_before_mean": -0.11921874433755875, + "reward_before_std": 0.577593807131052, + "reward_change_max": 0.000988095998764038, + "reward_change_mean": -0.17861400917172432, + "reward_change_min": -0.3633838780224323, + "reward_change_std": 0.15010416228324175, + "reward_std": 0.5509970523416996, + "rewards/cosine_scaled_reward": -0.14294270798563957, + "rewards/format_reward": 0.16666667349636555, + "step": 116 + }, + { + "advantage_max": 1.1050181835889816, + "advantage_mean": 2.23517425679276e-08, + "advantage_min": -0.5356793627142906, + "advantage_std": 0.6055473424494267, + "completion_length": 3196.375030517578, + "epoch": 0.1337142857142857, + "grad_norm": 0.10387641191482544, + "kl": 0.008967399597167969, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0234, + "reward": -0.341567924246192, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.341567924246192, + "reward_after_std": 0.6055473275482655, + "reward_before_mean": -0.19230719900224358, + "reward_before_std": 0.612238947302103, + "reward_change_max": 0.0010152682662010193, + "reward_change_mean": -0.14926070533692837, + "reward_change_min": -0.3155266009271145, + "reward_change_std": 0.13089507957920432, + "reward_std": 0.6055473312735558, + "rewards/cosine_scaled_reward": -0.2419869415462017, + "rewards/format_reward": 0.29166666977107525, + "step": 117 + }, + { + "advantage_max": 1.641647595912218, + "advantage_mean": -4.967053213178474e-09, + "advantage_min": -0.9000925049185753, + "advantage_std": 0.9510752744972706, + "completion_length": 2933.125030517578, + "epoch": 0.13485714285714287, + "grad_norm": 0.1362229734659195, + "kl": 0.004642486572265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0111, + "reward": 0.3653463274240494, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3653463274240494, + "reward_after_std": 0.9510752744972706, + "reward_before_mean": 0.7301360741257668, + "reward_before_std": 0.9502398185431957, + "reward_change_max": 0.0, + "reward_change_mean": -0.3647897462360561, + "reward_change_min": -0.7364008165895939, + "reward_change_std": 0.28538533207029104, + "reward_std": 0.951075304299593, + "rewards/cosine_scaled_reward": 0.11506801494397223, + "rewards/format_reward": 0.5000000055879354, + "step": 118 + }, + { + "advantage_max": 1.2773814722895622, + "advantage_mean": -4.440892098500626e-16, + "advantage_min": -0.7336373254656792, + "advantage_std": 0.7546707466244698, + "completion_length": 2631.645851135254, + "epoch": 0.136, + "grad_norm": 0.11543148756027222, + "kl": 0.010685920715332031, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0418, + "reward": 0.12168693542480469, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12168693542480469, + "reward_after_std": 0.7546707466244698, + "reward_before_mean": 0.4236811628798023, + "reward_before_std": 0.7664337083697319, + "reward_change_max": 0.0007797032594680786, + "reward_change_mean": -0.301994226872921, + "reward_change_min": -0.5957271978259087, + "reward_change_std": 0.2320833122357726, + "reward_std": 0.7546707689762115, + "rewards/cosine_scaled_reward": -0.06940942443907261, + "rewards/format_reward": 0.5625000037252903, + "step": 119 + }, + { + "advantage_max": 1.3410765826702118, + "advantage_mean": 5.898376453927767e-09, + "advantage_min": -0.6475675106048584, + "advantage_std": 0.7686546426266432, + "completion_length": 2500.8750610351562, + "epoch": 0.13714285714285715, + "grad_norm": 0.1472538560628891, + "kl": 0.009729385375976562, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0945, + "reward": 0.17069168761372566, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17069168761372566, + "reward_after_std": 0.7686546500772238, + "reward_before_mean": 0.48508234234759584, + "reward_before_std": 0.7526429118588567, + "reward_change_max": 0.0, + "reward_change_mean": -0.3143906258046627, + "reward_change_min": -0.6074437368661165, + "reward_change_std": 0.23728215042501688, + "reward_std": 0.7686546761542559, + "rewards/cosine_scaled_reward": -0.007458832420525141, + "rewards/format_reward": 0.500000013038516, + "step": 120 + }, + { + "advantage_max": 1.3169677779078484, + "advantage_mean": -1.8316011485275396e-08, + "advantage_min": -0.5936854109168053, + "advantage_std": 0.7040582299232483, + "completion_length": 2078.000015258789, + "epoch": 0.1382857142857143, + "grad_norm": 0.13888248801231384, + "kl": 0.012456893920898438, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.458418577899774e-07, + "loss": 0.07, + "reward": 0.36981683829799294, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36981683829799294, + "reward_after_std": 0.7040582150220871, + "reward_before_mean": 0.7583018532022834, + "reward_before_std": 0.6163065582513809, + "reward_change_max": 0.000717945396900177, + "reward_change_mean": -0.3884850200265646, + "reward_change_min": -0.6099117025732994, + "reward_change_std": 0.2327408418059349, + "reward_std": 0.7040582410991192, + "rewards/cosine_scaled_reward": 0.014567593112587929, + "rewards/format_reward": 0.7291666679084301, + "step": 121 + }, + { + "advantage_max": 1.396921530365944, + "advantage_mean": 4.967053990334591e-09, + "advantage_min": -0.8277639225125313, + "advantage_std": 0.8302930146455765, + "completion_length": 2936.9584159851074, + "epoch": 0.13942857142857143, + "grad_norm": 0.14620067179203033, + "kl": 0.007358551025390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0048, + "reward": 0.14990826323628426, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14990826323628426, + "reward_after_std": 0.8302930146455765, + "reward_before_mean": 0.45474499464035034, + "reward_before_std": 0.8626224808394909, + "reward_change_max": 6.0871243476867676e-05, + "reward_change_mean": -0.30483673978596926, + "reward_change_min": -0.6525392979383469, + "reward_change_std": 0.2505636941641569, + "reward_std": 0.8302930295467377, + "rewards/cosine_scaled_reward": -0.012210835237056017, + "rewards/format_reward": 0.47916667722165585, + "step": 122 + }, + { + "advantage_max": 1.1955565959215164, + "advantage_mean": 1.3038516488705909e-08, + "advantage_min": -0.7384285181760788, + "advantage_std": 0.7622129395604134, + "completion_length": 3045.6459045410156, + "epoch": 0.14057142857142857, + "grad_norm": 0.1293623447418213, + "kl": 0.00701904296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0924, + "reward": -0.10539140645414591, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10539140645414591, + "reward_after_std": 0.7622129209339619, + "reward_before_mean": 0.12175704911351204, + "reward_before_std": 0.8335296474397182, + "reward_change_max": 0.0007355660200119019, + "reward_change_mean": -0.22714845649898052, + "reward_change_min": -0.5292183440178633, + "reward_change_std": 0.23081361688673496, + "reward_std": 0.7622129283845425, + "rewards/cosine_scaled_reward": -0.13703814148902893, + "rewards/format_reward": 0.39583333767950535, + "step": 123 + }, + { + "advantage_max": 1.2478653825819492, + "advantage_mean": -2.2972624080797033e-08, + "advantage_min": -0.7673164531588554, + "advantage_std": 0.756400678306818, + "completion_length": 2592.687515258789, + "epoch": 0.1417142857142857, + "grad_norm": 0.12547579407691956, + "kl": 0.01032257080078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.412727182773486e-07, + "loss": 0.039, + "reward": 0.17265462409704924, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17265462409704924, + "reward_after_std": 0.7564007006585598, + "reward_before_mean": 0.4965659724548459, + "reward_before_std": 0.7837951183319092, + "reward_change_max": 0.0, + "reward_change_mean": -0.3239113837480545, + "reward_change_min": -0.667121559381485, + "reward_change_std": 0.26153578516095877, + "reward_std": 0.7564007118344307, + "rewards/cosine_scaled_reward": -0.022550346329808235, + "rewards/format_reward": 0.5416666716337204, + "step": 124 + }, + { + "advantage_max": 1.150554358959198, + "advantage_mean": -6.208818126296478e-09, + "advantage_min": -0.5988728702068329, + "advantage_std": 0.6304605938494205, + "completion_length": 2893.791702270508, + "epoch": 0.14285714285714285, + "grad_norm": 0.08245649188756943, + "kl": 0.0050067901611328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0061, + "reward": 0.041422320529818535, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.041422320529818535, + "reward_after_std": 0.6304605901241302, + "reward_before_mean": 0.3251929506659508, + "reward_before_std": 0.5924066100269556, + "reward_change_max": 0.0006644278764724731, + "reward_change_mean": -0.2837706417776644, + "reward_change_min": -0.46433842554688454, + "reward_change_std": 0.1861866288818419, + "reward_std": 0.6304606199264526, + "rewards/cosine_scaled_reward": 0.006346469279378653, + "rewards/format_reward": 0.31250000186264515, + "step": 125 + }, + { + "advantage_max": 1.4216941222548485, + "advantage_mean": -7.450580763457282e-09, + "advantage_min": -0.7550601400434971, + "advantage_std": 0.821855939924717, + "completion_length": 2790.604217529297, + "epoch": 0.144, + "grad_norm": 0.1472250074148178, + "kl": 0.005031585693359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0522, + "reward": 0.052971549332141876, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.052971549332141876, + "reward_after_std": 0.821855939924717, + "reward_before_mean": 0.3223703149706125, + "reward_before_std": 0.8384648263454437, + "reward_change_max": 0.0003665909171104431, + "reward_change_mean": -0.2693987749516964, + "reward_change_min": -0.5547848157584667, + "reward_change_std": 0.21912654396146536, + "reward_std": 0.821855966001749, + "rewards/cosine_scaled_reward": -0.06798151088878512, + "rewards/format_reward": 0.45833333767950535, + "step": 126 + }, + { + "advantage_max": 0.954961534589529, + "advantage_mean": -1.241763414316921e-09, + "advantage_min": -0.6304600276052952, + "advantage_std": 0.608190419152379, + "completion_length": 3243.7500610351562, + "epoch": 0.14514285714285713, + "grad_norm": 0.16301438212394714, + "kl": 0.00958251953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0694, + "reward": -0.1906900741159916, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1906900741159916, + "reward_after_std": 0.608190419152379, + "reward_before_mean": 0.024494127836078405, + "reward_before_std": 0.6589562017470598, + "reward_change_max": 0.001492425799369812, + "reward_change_mean": -0.21518420707434416, + "reward_change_min": -0.5101010613143444, + "reward_change_std": 0.20124634448438883, + "reward_std": 0.6081904359161854, + "rewards/cosine_scaled_reward": -0.154419609811157, + "rewards/format_reward": 0.3333333432674408, + "step": 127 + }, + { + "advantage_max": 1.0362044796347618, + "advantage_mean": -1.4280280069556284e-08, + "advantage_min": -0.8516915030777454, + "advantage_std": 0.6831777542829514, + "completion_length": 2831.104202270508, + "epoch": 0.1462857142857143, + "grad_norm": 0.10564086586236954, + "kl": 0.007222175598144531, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0296, + "reward": 0.30764661356806755, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30764661356806755, + "reward_after_std": 0.6831777645274997, + "reward_before_mean": 0.6954053975641727, + "reward_before_std": 0.7116181813180447, + "reward_change_max": 0.0012954026460647583, + "reward_change_mean": -0.387758809607476, + "reward_change_min": -0.6460206061601639, + "reward_change_std": 0.27978145400993526, + "reward_std": 0.6831777868792415, + "rewards/cosine_scaled_reward": 0.13936935923993587, + "rewards/format_reward": 0.4166666716337204, + "step": 128 + }, + { + "advantage_max": 1.2157826200127602, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.6898716762661934, + "advantage_std": 0.6953651700168848, + "completion_length": 3310.3333740234375, + "epoch": 0.14742857142857144, + "grad_norm": 0.12506195902824402, + "kl": 0.008449554443359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.332771203643714e-07, + "loss": -0.0103, + "reward": -0.12795098591595888, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12795098591595888, + "reward_after_std": 0.6953651700168848, + "reward_before_mean": 0.090589489787817, + "reward_before_std": 0.7126827575266361, + "reward_change_max": 0.0, + "reward_change_mean": -0.2185404673218727, + "reward_change_min": -0.46337801590561867, + "reward_change_std": 0.18281217105686665, + "reward_std": 0.695365184918046, + "rewards/cosine_scaled_reward": -0.05887192999944091, + "rewards/format_reward": 0.2083333358168602, + "step": 129 + }, + { + "advantage_max": 0.8977086283266544, + "advantage_mean": 1.9868215628271457e-08, + "advantage_min": -0.6305883452296257, + "advantage_std": 0.5447062347084284, + "completion_length": 3246.8958740234375, + "epoch": 0.14857142857142858, + "grad_norm": 0.1151081845164299, + "kl": 0.009033203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0062, + "reward": -0.19078312814235687, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19078312814235687, + "reward_after_std": 0.5447062440216541, + "reward_before_mean": 0.02786485105752945, + "reward_before_std": 0.568111153319478, + "reward_change_max": 0.0009800717234611511, + "reward_change_mean": -0.21864797454327345, + "reward_change_min": -0.3908236641436815, + "reward_change_std": 0.16550936549901962, + "reward_std": 0.544706255197525, + "rewards/cosine_scaled_reward": -0.07981756143271923, + "rewards/format_reward": 0.18750000186264515, + "step": 130 + }, + { + "advantage_max": 1.2027404643595219, + "advantage_mean": 9.934108036180334e-09, + "advantage_min": -0.8151110913604498, + "advantage_std": 0.7555824033915997, + "completion_length": 2788.770866394043, + "epoch": 0.14971428571428572, + "grad_norm": 0.1706915944814682, + "kl": 0.009748458862304688, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.299475664759068e-07, + "loss": 0.078, + "reward": 0.23934237146750093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23934237146750093, + "reward_after_std": 0.75558240711689, + "reward_before_mean": 0.5904918815940619, + "reward_before_std": 0.7709046499803662, + "reward_change_max": 0.0, + "reward_change_mean": -0.3511495003476739, + "reward_change_min": -0.64772904291749, + "reward_change_std": 0.27793479710817337, + "reward_std": 0.755582433193922, + "rewards/cosine_scaled_reward": 0.07649594731628895, + "rewards/format_reward": 0.43750000186264515, + "step": 131 + }, + { + "advantage_max": 1.5452817603945732, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.7206988781690598, + "advantage_std": 0.8455226197838783, + "completion_length": 2813.6458435058594, + "epoch": 0.15085714285714286, + "grad_norm": 0.1316002458333969, + "kl": 0.00800323486328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.282549715730579e-07, + "loss": -0.0289, + "reward": 0.009623751044273376, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.009623751044273376, + "reward_after_std": 0.8455226235091686, + "reward_before_mean": 0.25404807389713824, + "reward_before_std": 0.8349966257810593, + "reward_change_max": 0.0010694265365600586, + "reward_change_mean": -0.24442429654300213, + "reward_change_min": -0.5337617993354797, + "reward_change_std": 0.1964772827923298, + "reward_std": 0.8455226384103298, + "rewards/cosine_scaled_reward": -0.08130931667983532, + "rewards/format_reward": 0.41666667349636555, + "step": 132 + }, + { + "advantage_max": 1.050060760229826, + "advantage_mean": 1.862645426786713e-09, + "advantage_min": -0.4991532042622566, + "advantage_std": 0.5961620546877384, + "completion_length": 3129.2916870117188, + "epoch": 0.152, + "grad_norm": 0.12014055252075195, + "kl": 0.009761810302734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0423, + "reward": -0.35092977434396744, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35092977434396744, + "reward_after_std": 0.5961620360612869, + "reward_before_mean": -0.20190617628395557, + "reward_before_std": 0.6122478432953358, + "reward_change_max": 0.00027695298194885254, + "reward_change_mean": -0.14902361016720533, + "reward_change_min": -0.3222372457385063, + "reward_change_std": 0.12988192215561867, + "reward_std": 0.5961620435118675, + "rewards/cosine_scaled_reward": -0.23636975418776274, + "rewards/format_reward": 0.2708333358168602, + "step": 133 + }, + { + "advantage_max": 1.163169089704752, + "advantage_mean": -1.645336678013365e-08, + "advantage_min": -0.5319757275283337, + "advantage_std": 0.6446932945400476, + "completion_length": 2510.104202270508, + "epoch": 0.15314285714285714, + "grad_norm": 0.10554244369268417, + "kl": 0.011915206909179688, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0326, + "reward": 0.05566288158297539, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05566288158297539, + "reward_after_std": 0.6446932852268219, + "reward_before_mean": 0.34459465090185404, + "reward_before_std": 0.6043676100671291, + "reward_change_max": 0.0004299283027648926, + "reward_change_mean": -0.28893175069242716, + "reward_change_min": -0.5042743273079395, + "reward_change_std": 0.19494254142045975, + "reward_std": 0.6446933001279831, + "rewards/cosine_scaled_reward": -0.07770269550383091, + "rewards/format_reward": 0.5000000018626451, + "step": 134 + }, + { + "advantage_max": 1.4917153492569923, + "advantage_mean": 1.0554988993938252e-08, + "advantage_min": -0.7013456299901009, + "advantage_std": 0.8363952152431011, + "completion_length": 1998.5416984558105, + "epoch": 0.15428571428571428, + "grad_norm": 0.17164035141468048, + "kl": 0.009614944458007812, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0149, + "reward": 0.5657868012785912, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5657868012785912, + "reward_after_std": 0.8363952077925205, + "reward_before_mean": 1.0127003118395805, + "reward_before_std": 0.7460980167379603, + "reward_change_max": 0.0, + "reward_change_mean": -0.446913446765393, + "reward_change_min": -0.7371262945234776, + "reward_change_std": 0.2943604183383286, + "reward_std": 0.8363952338695526, + "rewards/cosine_scaled_reward": 0.14176679588854313, + "rewards/format_reward": 0.7291666697710752, + "step": 135 + }, + { + "advantage_max": 1.4442752003669739, + "advantage_mean": -2.918144209607121e-08, + "advantage_min": -0.8376066125929356, + "advantage_std": 0.8569142334163189, + "completion_length": 2878.3958892822266, + "epoch": 0.15542857142857142, + "grad_norm": 0.13732051849365234, + "kl": 0.010951995849609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0786, + "reward": 0.3354130834341049, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3354130834341049, + "reward_after_std": 0.8569142334163189, + "reward_before_mean": 0.7032555975019932, + "reward_before_std": 0.8669465184211731, + "reward_change_max": 0.0, + "reward_change_mean": -0.367842567153275, + "reward_change_min": -0.7203944735229015, + "reward_change_std": 0.2801892305724323, + "reward_std": 0.8569142483174801, + "rewards/cosine_scaled_reward": 0.1432944694533944, + "rewards/format_reward": 0.4166666753590107, + "step": 136 + }, + { + "advantage_max": 1.1755945719778538, + "advantage_mean": 3.885780586188048e-16, + "advantage_min": -0.5631213709712029, + "advantage_std": 0.6682571768760681, + "completion_length": 3051.250030517578, + "epoch": 0.15657142857142858, + "grad_norm": 0.12679171562194824, + "kl": 0.009998321533203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0445, + "reward": -0.17650610394775867, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17650610394775867, + "reward_after_std": 0.6682571917772293, + "reward_before_mean": 0.026825436390936375, + "reward_before_std": 0.6791456611827016, + "reward_change_max": 0.001605108380317688, + "reward_change_mean": -0.20333154685795307, + "reward_change_min": -0.4427060279995203, + "reward_change_std": 0.17323465924710035, + "reward_std": 0.6682572159916162, + "rewards/cosine_scaled_reward": -0.12200394924730062, + "rewards/format_reward": 0.2708333432674408, + "step": 137 + }, + { + "advantage_max": 0.9305066950619221, + "advantage_mean": 1.800557042352935e-08, + "advantage_min": -0.5270604677498341, + "advantage_std": 0.5412895157933235, + "completion_length": 2504.0625381469727, + "epoch": 0.15771428571428572, + "grad_norm": 0.09073272347450256, + "kl": 0.009454727172851562, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0546, + "reward": -0.015904040075838566, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.015904040075838566, + "reward_after_std": 0.5412895157933235, + "reward_before_mean": 0.2626056023873389, + "reward_before_std": 0.5118083693087101, + "reward_change_max": 0.0006034299731254578, + "reward_change_mean": -0.2785096103325486, + "reward_change_min": -0.4878885019570589, + "reward_change_std": 0.19265306554734707, + "reward_std": 0.5412895232439041, + "rewards/cosine_scaled_reward": -0.12911387719213963, + "rewards/format_reward": 0.5208333358168602, + "step": 138 + }, + { + "advantage_max": 1.397744432091713, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -0.6843620836734772, + "advantage_std": 0.7991587594151497, + "completion_length": 3182.5625610351562, + "epoch": 0.15885714285714286, + "grad_norm": 0.17045240104198456, + "kl": 0.012096405029296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0473, + "reward": -0.06700982432812452, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06700982432812452, + "reward_after_std": 0.7991587519645691, + "reward_before_mean": 0.16040014289319515, + "reward_before_std": 0.815875044092536, + "reward_change_max": 0.0009377151727676392, + "reward_change_mean": -0.2274099476635456, + "reward_change_min": -0.5065705124288797, + "reward_change_std": 0.19903714209794998, + "reward_std": 0.79915876314044, + "rewards/cosine_scaled_reward": -0.09688326716423035, + "rewards/format_reward": 0.35416666977107525, + "step": 139 + }, + { + "advantage_max": 1.0005671940743923, + "advantage_mean": 1.6763806842678974e-08, + "advantage_min": -0.4939218834042549, + "advantage_std": 0.5732725989073515, + "completion_length": 3125.104217529297, + "epoch": 0.16, + "grad_norm": 0.40375617146492004, + "kl": 0.018047332763671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0712, + "reward": -0.18270614463835955, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.18270614463835955, + "reward_after_std": 0.5732725989073515, + "reward_before_mean": 0.03130531124770641, + "reward_before_std": 0.5579951740801334, + "reward_change_max": 0.0002490878105163574, + "reward_change_mean": -0.21401146426796913, + "reward_change_min": -0.45000362023711205, + "reward_change_std": 0.178125046659261, + "reward_std": 0.5732726082205772, + "rewards/cosine_scaled_reward": -0.09893068426754326, + "rewards/format_reward": 0.22916666977107525, + "step": 140 + }, + { + "advantage_max": 1.3642967343330383, + "advantage_mean": -1.6653345369377348e-16, + "advantage_min": -0.8970592468976974, + "advantage_std": 0.8663997799158096, + "completion_length": 2833.9375610351562, + "epoch": 0.16114285714285714, + "grad_norm": 0.14736533164978027, + "kl": 0.013208389282226562, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0636, + "reward": 0.19854285567998886, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.19854285567998886, + "reward_after_std": 0.8663997799158096, + "reward_before_mean": 0.5228808233514428, + "reward_before_std": 0.9317158870398998, + "reward_change_max": 0.0006700456142425537, + "reward_change_mean": -0.32433797139674425, + "reward_change_min": -0.6870186366140842, + "reward_change_std": 0.28708665631711483, + "reward_std": 0.8663998134434223, + "rewards/cosine_scaled_reward": -0.04064292460680008, + "rewards/format_reward": 0.6041666734963655, + "step": 141 + }, + { + "advantage_max": 1.1075918152928352, + "advantage_mean": -1.862645243599914e-08, + "advantage_min": -0.9935803860425949, + "advantage_std": 0.7360248751938343, + "completion_length": 2854.979217529297, + "epoch": 0.16228571428571428, + "grad_norm": 0.13701511919498444, + "kl": 0.01279449462890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0439, + "reward": 0.2674298919737339, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2674298919737339, + "reward_after_std": 0.736024871468544, + "reward_before_mean": 0.6353370100259781, + "reward_before_std": 0.799102995544672, + "reward_change_max": 0.000289037823677063, + "reward_change_mean": -0.3679071478545666, + "reward_change_min": -0.6680124215781689, + "reward_change_std": 0.28033728897571564, + "reward_std": 0.7360248789191246, + "rewards/cosine_scaled_reward": 0.015585171058773994, + "rewards/format_reward": 0.604166692122817, + "step": 142 + }, + { + "advantage_max": 0.9835025705397129, + "advantage_mean": 1.0865430083439875e-08, + "advantage_min": -0.7354140728712082, + "advantage_std": 0.618965107947588, + "completion_length": 2793.750030517578, + "epoch": 0.16342857142857142, + "grad_norm": 0.13960637152194977, + "kl": 0.015850067138671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0635, + "reward": -0.17446402180939913, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17446402180939913, + "reward_after_std": 0.6189651004970074, + "reward_before_mean": 0.04228054964914918, + "reward_before_std": 0.6649498995393515, + "reward_change_max": 0.0016652792692184448, + "reward_change_mean": -0.21674457285553217, + "reward_change_min": -0.429037906229496, + "reward_change_std": 0.19230018742382526, + "reward_std": 0.6189651042222977, + "rewards/cosine_scaled_reward": -0.18719306215643883, + "rewards/format_reward": 0.41666668094694614, + "step": 143 + }, + { + "advantage_max": 1.6626508310437202, + "advantage_mean": -2.483526606589237e-09, + "advantage_min": -0.8582720793783665, + "advantage_std": 0.9675541780889034, + "completion_length": 3074.4166679382324, + "epoch": 0.16457142857142856, + "grad_norm": 0.1934293508529663, + "kl": 0.012912750244140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0595, + "reward": -0.01596003444865346, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.01596003444865346, + "reward_after_std": 0.9675541818141937, + "reward_before_mean": 0.21122129168361425, + "reward_before_std": 1.0259087830781937, + "reward_change_max": 0.0013353228569030762, + "reward_change_mean": -0.22718132566660643, + "reward_change_min": -0.5569618083536625, + "reward_change_std": 0.2365485643967986, + "reward_std": 0.9675541929900646, + "rewards/cosine_scaled_reward": -0.050639352295547724, + "rewards/format_reward": 0.31250000186264515, + "step": 144 + }, + { + "advantage_max": 1.6574936211109161, + "advantage_mean": 7.45058065243498e-09, + "advantage_min": -0.8055602125823498, + "advantage_std": 0.9307769909501076, + "completion_length": 2110.416717529297, + "epoch": 0.1657142857142857, + "grad_norm": 0.1797972172498703, + "kl": 0.010667800903320312, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0496, + "reward": 0.3544669998809695, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3544669998809695, + "reward_after_std": 0.9307770021259785, + "reward_before_mean": 0.7125979978591204, + "reward_before_std": 0.909045472741127, + "reward_change_max": 0.0006473585963249207, + "reward_change_mean": -0.3581309705041349, + "reward_change_min": -0.6491826735436916, + "reward_change_std": 0.25680120568722486, + "reward_std": 0.9307770058512688, + "rewards/cosine_scaled_reward": 0.03338230960071087, + "rewards/format_reward": 0.6458333414047956, + "step": 145 + }, + { + "advantage_max": 1.0858964622020721, + "advantage_mean": -1.1796753296433593e-08, + "advantage_min": -0.5913076885044575, + "advantage_std": 0.6359936855733395, + "completion_length": 2598.5000762939453, + "epoch": 0.16685714285714287, + "grad_norm": 0.12754735350608826, + "kl": 0.00946044921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0484, + "reward": -0.09592493623495102, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09592493623495102, + "reward_after_std": 0.6359937191009521, + "reward_before_mean": 0.14266785606741905, + "reward_before_std": 0.6422541700303555, + "reward_change_max": 0.0005887970328330994, + "reward_change_mean": -0.238592809997499, + "reward_change_min": -0.5087453033775091, + "reward_change_std": 0.19102911744266748, + "reward_std": 0.6359937265515327, + "rewards/cosine_scaled_reward": -0.19949940592050552, + "rewards/format_reward": 0.5416666734963655, + "step": 146 + }, + { + "advantage_max": 1.05124119669199, + "advantage_mean": 6.208817238118058e-09, + "advantage_min": -0.6531185433268547, + "advantage_std": 0.6395123526453972, + "completion_length": 2919.750015258789, + "epoch": 0.168, + "grad_norm": 0.1187099739909172, + "kl": 0.016239166259765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0615, + "reward": -0.23719407757744193, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23719407757744193, + "reward_after_std": 0.6395123563706875, + "reward_before_mean": -0.04856202378869057, + "reward_before_std": 0.6837803050875664, + "reward_change_max": 0.001722574234008789, + "reward_change_mean": -0.18863204028457403, + "reward_change_min": -0.48571337200701237, + "reward_change_std": 0.18479610979557037, + "reward_std": 0.6395123600959778, + "rewards/cosine_scaled_reward": -0.18053102178964764, + "rewards/format_reward": 0.3125000074505806, + "step": 147 + }, + { + "advantage_max": 0.9280873015522957, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.6955177783966064, + "advantage_std": 0.5650780126452446, + "completion_length": 2600.5000915527344, + "epoch": 0.16914285714285715, + "grad_norm": 0.07447077333927155, + "kl": 0.014652252197265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0204, + "reward": 0.0697112400084734, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0697112400084734, + "reward_after_std": 0.5650780089199543, + "reward_before_mean": 0.37948842719197273, + "reward_before_std": 0.5654736310243607, + "reward_change_max": 0.00041494518518447876, + "reward_change_mean": -0.30977714341133833, + "reward_change_min": -0.5139572322368622, + "reward_change_std": 0.21031226217746735, + "reward_std": 0.5650780126452446, + "rewards/cosine_scaled_reward": -0.09150580875575542, + "rewards/format_reward": 0.5625000074505806, + "step": 148 + }, + { + "advantage_max": 1.217205923050642, + "advantage_mean": 1.1796752907855534e-08, + "advantage_min": -0.8463896103203297, + "advantage_std": 0.7423972543329, + "completion_length": 2888.5208740234375, + "epoch": 0.1702857142857143, + "grad_norm": 0.13410669565200806, + "kl": 0.0128021240234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0404, + "reward": 0.05353837716393173, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05353837716393173, + "reward_after_std": 0.7423972692340612, + "reward_before_mean": 0.3378589116036892, + "reward_before_std": 0.7749494183808565, + "reward_change_max": 0.0017078742384910583, + "reward_change_mean": -0.2843204974196851, + "reward_change_min": -0.5100828967988491, + "reward_change_std": 0.2250691340304911, + "reward_std": 0.7423973102122545, + "rewards/cosine_scaled_reward": -0.060237223748117685, + "rewards/format_reward": 0.4583333395421505, + "step": 149 + }, + { + "advantage_max": 1.3684921450912952, + "advantage_mean": -3.104408119458668e-09, + "advantage_min": -0.7457129880785942, + "advantage_std": 0.790175162255764, + "completion_length": 2972.166732788086, + "epoch": 0.17142857142857143, + "grad_norm": 0.14953668415546417, + "kl": 0.017360687255859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0354, + "reward": 0.059539347887039185, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.059539347887039185, + "reward_after_std": 0.7901751659810543, + "reward_before_mean": 0.333737276494503, + "reward_before_std": 0.8040711954236031, + "reward_change_max": 0.0, + "reward_change_mean": -0.27419792767614126, + "reward_change_min": -0.5881333574652672, + "reward_change_std": 0.22270974004641175, + "reward_std": 0.7901751957833767, + "rewards/cosine_scaled_reward": -0.031048028729856014, + "rewards/format_reward": 0.39583333767950535, + "step": 150 + }, + { + "advantage_max": 1.38007552921772, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -0.7273972257971764, + "advantage_std": 0.7743493728339672, + "completion_length": 2510.8334045410156, + "epoch": 0.17257142857142857, + "grad_norm": 0.16105739772319794, + "kl": 0.014842987060546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.926922383915315e-07, + "loss": -0.0076, + "reward": 0.3971906192600727, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3971906192600727, + "reward_after_std": 0.7743494212627411, + "reward_before_mean": 0.791034422814846, + "reward_before_std": 0.7206472083926201, + "reward_change_max": 0.0, + "reward_change_mean": -0.39384382497519255, + "reward_change_min": -0.6590692065656185, + "reward_change_std": 0.25266471691429615, + "reward_std": 0.7743494361639023, + "rewards/cosine_scaled_reward": 0.07260053791105747, + "rewards/format_reward": 0.6458333432674408, + "step": 151 + }, + { + "advantage_max": 1.392113920301199, + "advantage_mean": -1.0554989493538613e-08, + "advantage_min": -0.6385779082775116, + "advantage_std": 0.7757796794176102, + "completion_length": 2763.18754196167, + "epoch": 0.1737142857142857, + "grad_norm": 0.1782752126455307, + "kl": 0.01556396484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.906477750432903e-07, + "loss": 0.044, + "reward": -0.10477269627153873, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10477269627153873, + "reward_after_std": 0.7757796980440617, + "reward_before_mean": 0.10918856167700142, + "reward_before_std": 0.7816188298165798, + "reward_change_max": 0.0007291659712791443, + "reward_change_mean": -0.2139612501487136, + "reward_change_min": -0.44862732477486134, + "reward_change_std": 0.180716834962368, + "reward_std": 0.7757797166705132, + "rewards/cosine_scaled_reward": -0.15373907564207911, + "rewards/format_reward": 0.4166666716337204, + "step": 152 + }, + { + "advantage_max": 0.8868526294827461, + "advantage_mean": 1.4280279847511679e-08, + "advantage_min": -0.4589373283088207, + "advantage_std": 0.49290744960308075, + "completion_length": 3050.5833740234375, + "epoch": 0.17485714285714285, + "grad_norm": 0.0999632179737091, + "kl": 0.02754974365234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0486, + "reward": -0.3145775627344847, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3145775627344847, + "reward_after_std": 0.49290746822953224, + "reward_before_mean": -0.13981377240270376, + "reward_before_std": 0.48043977096676826, + "reward_change_max": 7.906556129455566e-05, + "reward_change_mean": -0.17476379964500666, + "reward_change_min": -0.3179873824119568, + "reward_change_std": 0.1258330475538969, + "reward_std": 0.49290747195482254, + "rewards/cosine_scaled_reward": -0.19490689039230347, + "rewards/format_reward": 0.25000000186264515, + "step": 153 + }, + { + "advantage_max": 1.290296759456396, + "advantage_mean": 5.5879357807597785e-09, + "advantage_min": -0.9250775575637817, + "advantage_std": 0.8055738545954227, + "completion_length": 3379.666717529297, + "epoch": 0.176, + "grad_norm": 0.17429296672344208, + "kl": 0.013561248779296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.865091407243394e-07, + "loss": 0.063, + "reward": 0.11301964987069368, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11301964987069368, + "reward_after_std": 0.8055738434195518, + "reward_before_mean": 0.4128780122846365, + "reward_before_std": 0.8634231984615326, + "reward_change_max": 0.0006842613220214844, + "reward_change_mean": -0.29985836148262024, + "reward_change_min": -0.6293492093682289, + "reward_change_std": 0.25532870925962925, + "reward_std": 0.8055738694965839, + "rewards/cosine_scaled_reward": 0.050189003348350525, + "rewards/format_reward": 0.3125000111758709, + "step": 154 + }, + { + "advantage_max": 1.3279439583420753, + "advantage_mean": -2.4835264955669345e-09, + "advantage_min": -0.7597385421395302, + "advantage_std": 0.7712014801800251, + "completion_length": 2549.1250228881836, + "epoch": 0.17714285714285713, + "grad_norm": 0.12317630648612976, + "kl": 0.013763427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0098, + "reward": 0.42278579249978065, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.42278579249978065, + "reward_after_std": 0.7712015192955732, + "reward_before_mean": 0.8301597703248262, + "reward_before_std": 0.7407869715243578, + "reward_change_max": 0.0004476085305213928, + "reward_change_mean": -0.40737398341298103, + "reward_change_min": -0.6836552545428276, + "reward_change_std": 0.2730773724615574, + "reward_std": 0.7712015546858311, + "rewards/cosine_scaled_reward": 0.15466322377324104, + "rewards/format_reward": 0.5208333376795053, + "step": 155 + }, + { + "advantage_max": 1.1669469252228737, + "advantage_mean": 1.9247333726823967e-08, + "advantage_min": -0.7012041360139847, + "advantage_std": 0.6874648444354534, + "completion_length": 3192.8541870117188, + "epoch": 0.1782857142857143, + "grad_norm": 0.13070867955684662, + "kl": 0.017330169677734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0628, + "reward": -0.21437102183699608, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.21437102183699608, + "reward_after_std": 0.6874648630619049, + "reward_before_mean": -0.023766040802001953, + "reward_before_std": 0.7222958765923977, + "reward_change_max": 0.000732123851776123, + "reward_change_mean": -0.1906049638055265, + "reward_change_min": -0.4271918907761574, + "reward_change_std": 0.1812627101317048, + "reward_std": 0.6874649003148079, + "rewards/cosine_scaled_reward": -0.12646635621786118, + "rewards/format_reward": 0.2291666716337204, + "step": 156 + }, + { + "advantage_max": 1.20693701505661, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.6609192118048668, + "advantage_std": 0.6912636980414391, + "completion_length": 3175.041717529297, + "epoch": 0.17942857142857144, + "grad_norm": 0.13659845292568207, + "kl": 0.018890380859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0286, + "reward": -0.19832509686239064, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19832509686239064, + "reward_after_std": 0.6912636831402779, + "reward_before_mean": -0.0033038491383194923, + "reward_before_std": 0.7137451581656933, + "reward_change_max": 0.0005949512124061584, + "reward_change_mean": -0.19502126099541783, + "reward_change_min": -0.3924486022442579, + "reward_change_std": 0.16552246548235416, + "reward_std": 0.6912636868655682, + "rewards/cosine_scaled_reward": -0.17873526364564896, + "rewards/format_reward": 0.3541666753590107, + "step": 157 + }, + { + "advantage_max": 1.2760753110051155, + "advantage_mean": -6.208819014474898e-10, + "advantage_min": -0.9892221018671989, + "advantage_std": 0.8185785189270973, + "completion_length": 2982.0834045410156, + "epoch": 0.18057142857142858, + "grad_norm": 0.18541646003723145, + "kl": 0.01678466796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0881, + "reward": 0.3491497424838599, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3491497424838599, + "reward_after_std": 0.818578477948904, + "reward_before_mean": 0.7326403986662626, + "reward_before_std": 0.8680536933243275, + "reward_change_max": 0.003411300480365753, + "reward_change_mean": -0.38349065091460943, + "reward_change_min": -0.695528618991375, + "reward_change_std": 0.30123837385326624, + "reward_std": 0.8185785189270973, + "rewards/cosine_scaled_reward": 0.0850701853632927, + "rewards/format_reward": 0.5625000149011612, + "step": 158 + }, + { + "advantage_max": 0.9803209267556667, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.5312448479235172, + "advantage_std": 0.5489882118999958, + "completion_length": 2971.875045776367, + "epoch": 0.18171428571428572, + "grad_norm": 0.08510956913232803, + "kl": 0.01891326904296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.758773376468604e-07, + "loss": -0.0023, + "reward": -0.18099602963775396, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18099602963775396, + "reward_after_std": 0.5489882081747055, + "reward_before_mean": 0.03407071530818939, + "reward_before_std": 0.5351154431700706, + "reward_change_max": 0.0009119212627410889, + "reward_change_mean": -0.21506676077842712, + "reward_change_min": -0.3951657433062792, + "reward_change_std": 0.1554846577346325, + "reward_std": 0.5489882193505764, + "rewards/cosine_scaled_reward": -0.18088130932301283, + "rewards/format_reward": 0.3958333395421505, + "step": 159 + }, + { + "advantage_max": 0.962229996919632, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.7357146218419075, + "advantage_std": 0.6014006249606609, + "completion_length": 2864.4166870117188, + "epoch": 0.18285714285714286, + "grad_norm": 0.13316203653812408, + "kl": 0.0220184326171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.737029101523929e-07, + "loss": 0.027, + "reward": 0.043143775314092636, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.043143775314092636, + "reward_after_std": 0.6014006175100803, + "reward_before_mean": 0.3398471586406231, + "reward_before_std": 0.6180408224463463, + "reward_change_max": 0.0007723942399024963, + "reward_change_mean": -0.2967033665627241, + "reward_change_min": -0.5417621470987797, + "reward_change_std": 0.22475436236709356, + "reward_std": 0.6014006324112415, + "rewards/cosine_scaled_reward": -0.007159763015806675, + "rewards/format_reward": 0.3541666753590107, + "step": 160 + }, + { + "advantage_max": 1.1514604538679123, + "advantage_mean": -1.800557086761856e-08, + "advantage_min": -0.8958503156900406, + "advantage_std": 0.729047141969204, + "completion_length": 2882.041702270508, + "epoch": 0.184, + "grad_norm": 0.13892494142055511, + "kl": 0.02344512939453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0456, + "reward": 0.13343903236091137, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13343903236091137, + "reward_after_std": 0.7290471494197845, + "reward_before_mean": 0.4490719512104988, + "reward_before_std": 0.7747690826654434, + "reward_change_max": 0.001975014805793762, + "reward_change_mean": -0.3156329430639744, + "reward_change_min": -0.5890417471528053, + "reward_change_std": 0.250003345310688, + "reward_std": 0.7290471717715263, + "rewards/cosine_scaled_reward": -0.004630686715245247, + "rewards/format_reward": 0.45833334513008595, + "step": 161 + }, + { + "advantage_max": 1.21609266102314, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.6409214287996292, + "advantage_std": 0.7266101613640785, + "completion_length": 3296.0833740234375, + "epoch": 0.18514285714285714, + "grad_norm": 0.14078128337860107, + "kl": 0.02608489990234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0246, + "reward": -0.14511827565729618, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14511827565729618, + "reward_after_std": 0.7266101650893688, + "reward_before_mean": 0.06751438044011593, + "reward_before_std": 0.767331724986434, + "reward_change_max": 0.0001521781086921692, + "reward_change_mean": -0.21263265097513795, + "reward_change_min": -0.48333017714321613, + "reward_change_std": 0.19664390292018652, + "reward_std": 0.7266101948916912, + "rewards/cosine_scaled_reward": -0.09124281164258718, + "rewards/format_reward": 0.2500000074505806, + "step": 162 + }, + { + "advantage_max": 1.1875976473093033, + "advantage_mean": -5.587935225648266e-09, + "advantage_min": -0.6885306388139725, + "advantage_std": 0.680039256811142, + "completion_length": 2470.9167098999023, + "epoch": 0.18628571428571428, + "grad_norm": 0.12650761008262634, + "kl": 0.02099609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0351, + "reward": 0.3515504002571106, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3515504002571106, + "reward_after_std": 0.6800392419099808, + "reward_before_mean": 0.7434308677911758, + "reward_before_std": 0.6333009023219347, + "reward_change_max": 0.0009451508522033691, + "reward_change_mean": -0.3918804544955492, + "reward_change_min": -0.6430389769375324, + "reward_change_std": 0.2503257617354393, + "reward_std": 0.6800392717123032, + "rewards/cosine_scaled_reward": 0.06963208317756653, + "rewards/format_reward": 0.6041666697710752, + "step": 163 + }, + { + "advantage_max": 1.4622886776924133, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.7638626024127007, + "advantage_std": 0.8638998009264469, + "completion_length": 2608.416748046875, + "epoch": 0.18742857142857142, + "grad_norm": 0.15157678723335266, + "kl": 0.020416259765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0322, + "reward": 0.284273668192327, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.284273668192327, + "reward_after_std": 0.8638998009264469, + "reward_before_mean": 0.6316687539219856, + "reward_before_std": 0.8603082597255707, + "reward_change_max": 0.001119077205657959, + "reward_change_mean": -0.3473950671032071, + "reward_change_min": -0.7313497699797153, + "reward_change_std": 0.28116426430642605, + "reward_std": 0.863899827003479, + "rewards/cosine_scaled_reward": 0.04500104021281004, + "rewards/format_reward": 0.5416666753590107, + "step": 164 + }, + { + "advantage_max": 0.951996460556984, + "advantage_mean": 7.450581041013038e-09, + "advantage_min": -0.650677315890789, + "advantage_std": 0.5838065594434738, + "completion_length": 3245.687530517578, + "epoch": 0.18857142857142858, + "grad_norm": 0.13272669911384583, + "kl": 0.0304107666015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0513, + "reward": -0.19890925288200378, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19890925288200378, + "reward_after_std": 0.583806574344635, + "reward_before_mean": 0.013676023110747337, + "reward_before_std": 0.6172264814376831, + "reward_change_max": 0.0006950944662094116, + "reward_change_mean": -0.2125852620229125, + "reward_change_min": -0.4224776439368725, + "reward_change_std": 0.17862440133467317, + "reward_std": 0.5838065780699253, + "rewards/cosine_scaled_reward": -0.1494120005518198, + "rewards/format_reward": 0.31250001303851604, + "step": 165 + }, + { + "advantage_max": 1.505092702805996, + "advantage_mean": -2.793967746050896e-08, + "advantage_min": -1.0393745079636574, + "advantage_std": 0.8885571658611298, + "completion_length": 3017.354217529297, + "epoch": 0.18971428571428572, + "grad_norm": 0.16296008229255676, + "kl": 0.0185394287109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0341, + "reward": 0.20471064187586308, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20471064187586308, + "reward_after_std": 0.8885571695864201, + "reward_before_mean": 0.5197281390428543, + "reward_before_std": 0.9152417033910751, + "reward_change_max": 0.0004853755235671997, + "reward_change_mean": -0.3150175176560879, + "reward_change_min": -0.5962655283510685, + "reward_change_std": 0.2500441991724074, + "reward_std": 0.8885571919381618, + "rewards/cosine_scaled_reward": 0.030697400448843837, + "rewards/format_reward": 0.45833334885537624, + "step": 166 + }, + { + "advantage_max": 1.3313522264361382, + "advantage_mean": -1.0554989660072067e-08, + "advantage_min": -0.8351566269993782, + "advantage_std": 0.7960289977490902, + "completion_length": 2318.3959197998047, + "epoch": 0.19085714285714286, + "grad_norm": 0.17425104975700378, + "kl": 0.01657867431640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0612, + "reward": 0.26994994410779327, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26994994410779327, + "reward_after_std": 0.7960290051996708, + "reward_before_mean": 0.621154960244894, + "reward_before_std": 0.8035038635134697, + "reward_change_max": 0.0005308240652084351, + "reward_change_mean": -0.3512050053104758, + "reward_change_min": -0.643861211836338, + "reward_change_std": 0.2611974119208753, + "reward_std": 0.7960290424525738, + "rewards/cosine_scaled_reward": -0.06442254222929478, + "rewards/format_reward": 0.750000013038516, + "step": 167 + }, + { + "advantage_max": 1.5461714044213295, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.952774353325367, + "advantage_std": 0.9442404918372631, + "completion_length": 3180.6875610351562, + "epoch": 0.192, + "grad_norm": 0.17925553023815155, + "kl": 0.02230072021484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0163, + "reward": 0.18643908202648163, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.18643908202648163, + "reward_after_std": 0.9442404955625534, + "reward_before_mean": 0.4941616393625736, + "reward_before_std": 1.0038710720837116, + "reward_change_max": 0.0011507794260978699, + "reward_change_mean": -0.30772253684699535, + "reward_change_min": -0.6709698811173439, + "reward_change_std": 0.27951822336763144, + "reward_std": 0.9442404992878437, + "rewards/cosine_scaled_reward": -0.0029191900976002216, + "rewards/format_reward": 0.5000000167638063, + "step": 168 + }, + { + "advantage_max": 1.2211386039853096, + "advantage_mean": 8.69234451084111e-09, + "advantage_min": -0.7467610165476799, + "advantage_std": 0.7033423036336899, + "completion_length": 2438.000015258789, + "epoch": 0.19314285714285714, + "grad_norm": 0.14280058443546295, + "kl": 0.0272674560546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0521, + "reward": 0.6013626717031002, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6013626717031002, + "reward_after_std": 0.7033422887325287, + "reward_before_mean": 1.0799045194871724, + "reward_before_std": 0.6214714664965868, + "reward_change_max": 0.0003162398934364319, + "reward_change_mean": -0.47854181937873363, + "reward_change_min": -0.7317886389791965, + "reward_change_std": 0.29094003047794104, + "reward_std": 0.7033423185348511, + "rewards/cosine_scaled_reward": 0.22745224926620722, + "rewards/format_reward": 0.6250000055879354, + "step": 169 + }, + { + "advantage_max": 1.2461431175470352, + "advantage_mean": 3.414849292227018e-09, + "advantage_min": -0.7611747309565544, + "advantage_std": 0.7149195112287998, + "completion_length": 2473.687557220459, + "epoch": 0.19428571428571428, + "grad_norm": 0.13153916597366333, + "kl": 0.019500732421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0595, + "reward": 0.1332508558407426, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1332508558407426, + "reward_after_std": 0.7149194926023483, + "reward_before_mean": 0.44204724254086614, + "reward_before_std": 0.7041205205023289, + "reward_change_max": 0.00042998790740966797, + "reward_change_mean": -0.30879637552425265, + "reward_change_min": -0.5489484928548336, + "reward_change_std": 0.22512452583760023, + "reward_std": 0.7149195000529289, + "rewards/cosine_scaled_reward": -0.028976373374462128, + "rewards/format_reward": 0.5000000055879354, + "step": 170 + }, + { + "advantage_max": 1.4106125310063362, + "advantage_mean": -3.72529057601767e-09, + "advantage_min": -0.6953155249357224, + "advantage_std": 0.795871002599597, + "completion_length": 2869.562530517578, + "epoch": 0.19542857142857142, + "grad_norm": 0.18414172530174255, + "kl": 0.02278900146484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0576, + "reward": 0.26462606340646744, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26462606340646744, + "reward_after_std": 0.7958709988743067, + "reward_before_mean": 0.6090709455311298, + "reward_before_std": 0.7663719952106476, + "reward_change_max": 0.0002579614520072937, + "reward_change_mean": -0.34444486489519477, + "reward_change_min": -0.5901921018958092, + "reward_change_std": 0.2330690361559391, + "reward_std": 0.7958710249513388, + "rewards/cosine_scaled_reward": 0.05453545227646828, + "rewards/format_reward": 0.5000000037252903, + "step": 171 + }, + { + "advantage_max": 1.0363451838493347, + "advantage_mean": 3.1044083970144243e-09, + "advantage_min": -0.6588342115283012, + "advantage_std": 0.6231314353644848, + "completion_length": 2926.729202270508, + "epoch": 0.19657142857142856, + "grad_norm": 0.10170703381299973, + "kl": 0.030120849609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0193, + "reward": 0.10751838982105255, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.10751838982105255, + "reward_after_std": 0.6231314353644848, + "reward_before_mean": 0.42348906956613064, + "reward_before_std": 0.6140627078711987, + "reward_change_max": 0.00191400945186615, + "reward_change_mean": -0.31597068533301353, + "reward_change_min": -0.5915517024695873, + "reward_change_std": 0.23089101910591125, + "reward_std": 0.6231314614415169, + "rewards/cosine_scaled_reward": 0.03466118685901165, + "rewards/format_reward": 0.3541666679084301, + "step": 172 + }, + { + "advantage_max": 1.223799116909504, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.7045940980315208, + "advantage_std": 0.7283288538455963, + "completion_length": 2046.000057220459, + "epoch": 0.1977142857142857, + "grad_norm": 0.20791788399219513, + "kl": 0.026142120361328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0616, + "reward": 0.05291812680661678, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05291812680661678, + "reward_after_std": 0.728328850120306, + "reward_before_mean": 0.3344998322427273, + "reward_before_std": 0.735675803385675, + "reward_change_max": 0.0, + "reward_change_mean": -0.28158169984817505, + "reward_change_min": -0.5169616155326366, + "reward_change_std": 0.22174399625509977, + "reward_std": 0.7283288538455963, + "rewards/cosine_scaled_reward": -0.13483343180269003, + "rewards/format_reward": 0.6041666772216558, + "step": 173 + }, + { + "advantage_max": 1.531661294400692, + "advantage_mean": -5.5879355587151736e-09, + "advantage_min": -0.8348989337682724, + "advantage_std": 0.8691375777125359, + "completion_length": 2526.479248046875, + "epoch": 0.19885714285714284, + "grad_norm": 0.1942531168460846, + "kl": 0.03050994873046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0638, + "reward": 0.21084141172468662, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21084141172468662, + "reward_after_std": 0.8691376075148582, + "reward_before_mean": 0.52743915701285, + "reward_before_std": 0.8649193346500397, + "reward_change_max": 0.0003266632556915283, + "reward_change_mean": -0.3165977615863085, + "reward_change_min": -0.617426909506321, + "reward_change_std": 0.23885107226669788, + "reward_std": 0.8691376447677612, + "rewards/cosine_scaled_reward": -0.06961375381797552, + "rewards/format_reward": 0.6666666753590107, + "step": 174 + }, + { + "advantage_max": 1.0626054927706718, + "advantage_mean": 1.862645193639878e-08, + "advantage_min": -0.7459142580628395, + "advantage_std": 0.6619545966386795, + "completion_length": 2799.4792098999023, + "epoch": 0.2, + "grad_norm": 0.11736467480659485, + "kl": 0.0283355712890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0262, + "reward": 0.09813978523015976, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09813978523015976, + "reward_after_std": 0.6619545966386795, + "reward_before_mean": 0.4087598901242018, + "reward_before_std": 0.6921274587512016, + "reward_change_max": 0.0004105418920516968, + "reward_change_mean": -0.31062008207663894, + "reward_change_min": -0.5638699308037758, + "reward_change_std": 0.22923887381330132, + "reward_std": 0.6619546003639698, + "rewards/cosine_scaled_reward": -0.024786731228232384, + "rewards/format_reward": 0.4583333395421505, + "step": 175 + }, + { + "advantage_max": 1.7890079766511917, + "advantage_mean": 2.483526384544632e-09, + "advantage_min": -0.8193216100335121, + "advantage_std": 1.009437695145607, + "completion_length": 2720.041702270508, + "epoch": 0.20114285714285715, + "grad_norm": 0.18651027977466583, + "kl": 0.03166961669921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0233, + "reward": 0.14203171245753765, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14203171245753765, + "reward_after_std": 1.0094376914203167, + "reward_before_mean": 0.4155056234449148, + "reward_before_std": 1.0160624533891678, + "reward_change_max": 0.001289263367652893, + "reward_change_mean": -0.27347393333911896, + "reward_change_min": -0.5660295896232128, + "reward_change_std": 0.22895468026399612, + "reward_std": 1.0094376988708973, + "rewards/cosine_scaled_reward": -0.010997178498655558, + "rewards/format_reward": 0.43750000558793545, + "step": 176 + }, + { + "advantage_max": 1.4360157921910286, + "advantage_mean": -6.208816794028849e-10, + "advantage_min": -1.009121149778366, + "advantage_std": 0.9150974787771702, + "completion_length": 2988.041763305664, + "epoch": 0.2022857142857143, + "grad_norm": 0.3150843679904938, + "kl": 0.0362396240234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0791, + "reward": 0.12510948814451694, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12510948814451694, + "reward_after_std": 0.9150974527001381, + "reward_before_mean": 0.418378489674069, + "reward_before_std": 1.0042357444763184, + "reward_change_max": 0.0010838136076927185, + "reward_change_mean": -0.2932689841836691, + "reward_change_min": -0.664687767624855, + "reward_change_std": 0.28929333481937647, + "reward_std": 0.9150974601507187, + "rewards/cosine_scaled_reward": -0.061644104309380054, + "rewards/format_reward": 0.5416666828095913, + "step": 177 + }, + { + "advantage_max": 1.5168294608592987, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.7548965439200401, + "advantage_std": 0.8664248380810022, + "completion_length": 2798.666702270508, + "epoch": 0.20342857142857143, + "grad_norm": 0.24158771336078644, + "kl": 0.0391998291015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0326, + "reward": -0.05778668820858002, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.05778668820858002, + "reward_after_std": 0.8664248511195183, + "reward_before_mean": 0.1630261167883873, + "reward_before_std": 0.8943422082811594, + "reward_change_max": 0.005850210785865784, + "reward_change_mean": -0.22081283433362842, + "reward_change_min": -0.5684525799006224, + "reward_change_std": 0.22119095316156745, + "reward_std": 0.8664248883724213, + "rewards/cosine_scaled_reward": -0.13723693694919348, + "rewards/format_reward": 0.43750000931322575, + "step": 178 + }, + { + "advantage_max": 0.8692804127931595, + "advantage_mean": 9.002784906453343e-09, + "advantage_min": -0.47451937943696976, + "advantage_std": 0.49049021303653717, + "completion_length": 2842.000015258789, + "epoch": 0.20457142857142857, + "grad_norm": 0.08588196337223053, + "kl": 0.0345916748046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0121, + "reward": -0.29055724292993546, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.29055724292993546, + "reward_after_std": 0.49049021676182747, + "reward_before_mean": -0.10510284267365932, + "reward_before_std": 0.4801676608622074, + "reward_change_max": 0.0010400563478469849, + "reward_change_mean": -0.18545439094305038, + "reward_change_min": -0.3516765534877777, + "reward_change_std": 0.14079185109585524, + "reward_std": 0.49049022793769836, + "rewards/cosine_scaled_reward": -0.24005142599344254, + "rewards/format_reward": 0.37500000558793545, + "step": 179 + }, + { + "advantage_max": 1.318357888609171, + "advantage_mean": -1.6763806898190126e-08, + "advantage_min": -0.7594656124711037, + "advantage_std": 0.7751675732433796, + "completion_length": 2358.104190826416, + "epoch": 0.2057142857142857, + "grad_norm": 0.1223163828253746, + "kl": 0.03548431396484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.270476638965461e-07, + "loss": 0.002, + "reward": 0.38155335932970047, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38155335932970047, + "reward_after_std": 0.7751675825566053, + "reward_before_mean": 0.773796696215868, + "reward_before_std": 0.7584128007292747, + "reward_change_max": 0.0, + "reward_change_mean": -0.3922433443367481, + "reward_change_min": -0.7204612493515015, + "reward_change_std": 0.273406270891428, + "reward_std": 0.7751675937324762, + "rewards/cosine_scaled_reward": 0.10564833006355911, + "rewards/format_reward": 0.5625, + "step": 180 + }, + { + "advantage_max": 1.700867984443903, + "advantage_mean": 6.829699084054397e-09, + "advantage_min": -0.7429678663611412, + "advantage_std": 0.9190853256732225, + "completion_length": 3069.479202270508, + "epoch": 0.20685714285714285, + "grad_norm": 0.23546984791755676, + "kl": 0.0411376953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0573, + "reward": -0.0617211596108973, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0617211596108973, + "reward_after_std": 0.9190853126347065, + "reward_before_mean": 0.1456823544576764, + "reward_before_std": 0.9196916986256838, + "reward_change_max": 0.0004614144563674927, + "reward_change_mean": -0.20740348938852549, + "reward_change_min": -0.4607136957347393, + "reward_change_std": 0.18180597410537302, + "reward_std": 0.9190853200852871, + "rewards/cosine_scaled_reward": -0.09382550918962806, + "rewards/format_reward": 0.3333333395421505, + "step": 181 + }, + { + "advantage_max": 1.8870112970471382, + "advantage_mean": -1.6763806398589765e-08, + "advantage_min": -0.9500059187412262, + "advantage_std": 1.0961102209985256, + "completion_length": 2501.4792251586914, + "epoch": 0.208, + "grad_norm": 0.21964824199676514, + "kl": 0.02997589111328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0521, + "reward": 0.3104388937354088, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3104388937354088, + "reward_after_std": 1.096110176295042, + "reward_before_mean": 0.6356113087385893, + "reward_before_std": 1.1282791681587696, + "reward_change_max": 0.0012559443712234497, + "reward_change_mean": -0.325172433629632, + "reward_change_min": -0.7234682310372591, + "reward_change_std": 0.28116371016949415, + "reward_std": 1.0961102060973644, + "rewards/cosine_scaled_reward": 0.015722323209047318, + "rewards/format_reward": 0.6041666772216558, + "step": 182 + }, + { + "advantage_max": 1.426853645592928, + "advantage_mean": -4.0357314046168824e-08, + "advantage_min": -1.0862329080700874, + "advantage_std": 0.9064907692372799, + "completion_length": 2791.854217529297, + "epoch": 0.20914285714285713, + "grad_norm": 0.17481961846351624, + "kl": 0.052337646484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0374, + "reward": 0.28753895533736795, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.28753895533736795, + "reward_after_std": 0.9064907692372799, + "reward_before_mean": 0.6385138165205717, + "reward_before_std": 0.9792434312403202, + "reward_change_max": 0.0001685991883277893, + "reward_change_mean": -0.35097489645704627, + "reward_change_min": -0.6677650213241577, + "reward_change_std": 0.2933631045743823, + "reward_std": 0.9064908064901829, + "rewards/cosine_scaled_reward": 0.027590231969952583, + "rewards/format_reward": 0.5833333469927311, + "step": 183 + }, + { + "advantage_max": 1.0320013873279095, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.4724693186581135, + "advantage_std": 0.5576813668012619, + "completion_length": 2872.541679382324, + "epoch": 0.2102857142857143, + "grad_norm": 0.10614390671253204, + "kl": 0.0408172607421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0062, + "reward": -0.20734488288871944, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.20734488288871944, + "reward_after_std": 0.5576813891530037, + "reward_before_mean": -0.006254892796278, + "reward_before_std": 0.5329896248877048, + "reward_change_max": 0.0017844811081886292, + "reward_change_mean": -0.20108998264186084, + "reward_change_min": -0.34110090509057045, + "reward_change_std": 0.13719749893061817, + "reward_std": 0.5576814040541649, + "rewards/cosine_scaled_reward": -0.21146078407764435, + "rewards/format_reward": 0.4166666716337204, + "step": 184 + }, + { + "advantage_max": 1.0225131250917912, + "advantage_mean": 5.587935614226325e-09, + "advantage_min": -0.5672240667045116, + "advantage_std": 0.5753951445221901, + "completion_length": 2476.8125228881836, + "epoch": 0.21142857142857144, + "grad_norm": 0.11501786857843399, + "kl": 0.0401763916015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0207, + "reward": -0.16281145935499808, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.16281145935499808, + "reward_after_std": 0.575395142659545, + "reward_before_mean": 0.05530272121541202, + "reward_before_std": 0.5677776131778955, + "reward_change_max": 0.00036994367837905884, + "reward_change_mean": -0.21811419213190675, + "reward_change_min": -0.407501645386219, + "reward_change_std": 0.15843796357512474, + "reward_std": 0.5753951575607061, + "rewards/cosine_scaled_reward": -0.21193197183310986, + "rewards/format_reward": 0.47916667722165585, + "step": 185 + }, + { + "advantage_max": 0.8590374700725079, + "advantage_mean": -1.3659398168108794e-08, + "advantage_min": -0.6779530048370361, + "advantage_std": 0.5576737355440855, + "completion_length": 3155.5833587646484, + "epoch": 0.21257142857142858, + "grad_norm": 0.12613937258720398, + "kl": 0.048248291015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0319, + "reward": 0.11866102367639542, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11866102367639542, + "reward_after_std": 0.5576737429946661, + "reward_before_mean": 0.4515787363052368, + "reward_before_std": 0.5622416902333498, + "reward_change_max": 0.0005166977643966675, + "reward_change_mean": -0.332917720079422, + "reward_change_min": -0.5469591151922941, + "reward_change_std": 0.23157448787242174, + "reward_std": 0.5576737560331821, + "rewards/cosine_scaled_reward": 0.04870602488517761, + "rewards/format_reward": 0.3541666716337204, + "step": 186 + }, + { + "advantage_max": 1.1893529891967773, + "advantage_mean": -1.1486312567754453e-08, + "advantage_min": -0.7816543951630592, + "advantage_std": 0.7066598571836948, + "completion_length": 2754.4583892822266, + "epoch": 0.21371428571428572, + "grad_norm": 0.147071972489357, + "kl": 0.0525665283203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0109, + "reward": 0.006238499656319618, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.006238499656319618, + "reward_after_std": 0.7066598758101463, + "reward_before_mean": 0.2741194274276495, + "reward_before_std": 0.7243945486843586, + "reward_change_max": 0.0003826245665550232, + "reward_change_mean": -0.2678809203207493, + "reward_change_min": -0.5089087933301926, + "reward_change_std": 0.20938009303063154, + "reward_std": 0.7066599018871784, + "rewards/cosine_scaled_reward": -0.11294030770659447, + "rewards/format_reward": 0.500000013038516, + "step": 187 + }, + { + "advantage_max": 0.9972609803080559, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.5697911977767944, + "advantage_std": 0.5747736543416977, + "completion_length": 3509.3958740234375, + "epoch": 0.21485714285714286, + "grad_norm": 0.15090584754943848, + "kl": 0.058258056640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0266, + "reward": -0.3646040167659521, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3646040167659521, + "reward_after_std": 0.5747736543416977, + "reward_before_mean": -0.2149992468766868, + "reward_before_std": 0.5986794494092464, + "reward_change_max": 0.0015026628971099854, + "reward_change_mean": -0.14960476756095886, + "reward_change_min": -0.3564014993607998, + "reward_change_std": 0.14348152186721563, + "reward_std": 0.5747736543416977, + "rewards/cosine_scaled_reward": -0.1699996292591095, + "rewards/format_reward": 0.1250000037252903, + "step": 188 + }, + { + "advantage_max": 0.8467955514788628, + "advantage_mean": -1.4280279181377864e-08, + "advantage_min": -0.6261894814670086, + "advantage_std": 0.5325388088822365, + "completion_length": 2987.8333740234375, + "epoch": 0.216, + "grad_norm": 0.146543949842453, + "kl": 0.058349609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0441, + "reward": -0.036177659407258034, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.036177659407258034, + "reward_after_std": 0.5325388237833977, + "reward_before_mean": 0.24245697120204568, + "reward_before_std": 0.5513169299811125, + "reward_change_max": 0.0002681910991668701, + "reward_change_mean": -0.27863461058586836, + "reward_change_min": -0.5000833366066217, + "reward_change_std": 0.20214266702532768, + "reward_std": 0.5325388498604298, + "rewards/cosine_scaled_reward": -0.08710487745702267, + "rewards/format_reward": 0.41666667349636555, + "step": 189 + }, + { + "advantage_max": 1.4574809968471527, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.7873755618929863, + "advantage_std": 0.8542032577097416, + "completion_length": 2575.3333740234375, + "epoch": 0.21714285714285714, + "grad_norm": 0.1981331706047058, + "kl": 0.0491943359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0212, + "reward": 0.0914838039316237, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0914838039316237, + "reward_after_std": 0.8542032763361931, + "reward_before_mean": 0.3697546496987343, + "reward_before_std": 0.8832090497016907, + "reward_change_max": 0.0066404566168785095, + "reward_change_mean": -0.27827086206525564, + "reward_change_min": -0.6325997523963451, + "reward_change_std": 0.25706900004297495, + "reward_std": 0.8542033135890961, + "rewards/cosine_scaled_reward": -0.044289345387369394, + "rewards/format_reward": 0.45833334513008595, + "step": 190 + }, + { + "advantage_max": 1.5159916803240776, + "advantage_mean": -6.208814573582799e-10, + "advantage_min": -0.6861558184027672, + "advantage_std": 0.8327911645174026, + "completion_length": 2696.5625534057617, + "epoch": 0.21828571428571428, + "grad_norm": 0.20235764980316162, + "kl": 0.05841064453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0478, + "reward": 0.15532669192180037, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15532669192180037, + "reward_after_std": 0.832791157066822, + "reward_before_mean": 0.4526620793621987, + "reward_before_std": 0.8022483550012112, + "reward_change_max": 0.001082099974155426, + "reward_change_mean": -0.2973353751003742, + "reward_change_min": -0.5558175276964903, + "reward_change_std": 0.21507473941892385, + "reward_std": 0.8327912017703056, + "rewards/cosine_scaled_reward": -0.013252315111458302, + "rewards/format_reward": 0.47916667349636555, + "step": 191 + }, + { + "advantage_max": 1.1179408133029938, + "advantage_mean": 2.048909675256283e-08, + "advantage_min": -0.8805400393903255, + "advantage_std": 0.757763747125864, + "completion_length": 3161.0625610351562, + "epoch": 0.21942857142857142, + "grad_norm": 0.3063293993473053, + "kl": 0.057159423828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0728, + "reward": -0.053129157051444054, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.053129157051444054, + "reward_after_std": 0.7577637508511543, + "reward_before_mean": 0.19890626333653927, + "reward_before_std": 0.856065109372139, + "reward_change_max": 0.0003703683614730835, + "reward_change_mean": -0.252035410143435, + "reward_change_min": -0.5377420820295811, + "reward_change_std": 0.25206225644797087, + "reward_std": 0.7577637806534767, + "rewards/cosine_scaled_reward": -0.07763020880520344, + "rewards/format_reward": 0.35416667722165585, + "step": 192 + }, + { + "advantage_max": 1.110087014734745, + "advantage_mean": 2.1420419549222913e-08, + "advantage_min": -0.5900781787931919, + "advantage_std": 0.634924691170454, + "completion_length": 3080.9583587646484, + "epoch": 0.22057142857142858, + "grad_norm": 0.16752228140830994, + "kl": 0.057342529296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0294, + "reward": -0.13575546815991402, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13575546815991402, + "reward_after_std": 0.6349246874451637, + "reward_before_mean": 0.08614366129040718, + "reward_before_std": 0.6371559798717499, + "reward_change_max": 0.0006859153509140015, + "reward_change_mean": -0.2218991070985794, + "reward_change_min": -0.45419899746775627, + "reward_change_std": 0.1739786909893155, + "reward_std": 0.634924691170454, + "rewards/cosine_scaled_reward": -0.11317817401140928, + "rewards/format_reward": 0.3125000074505806, + "step": 193 + }, + { + "advantage_max": 1.5593114458024502, + "advantage_mean": -1.490116136038111e-08, + "advantage_min": -0.960111491382122, + "advantage_std": 0.9485115651041269, + "completion_length": 3000.916748046875, + "epoch": 0.22171428571428572, + "grad_norm": 0.26778778433799744, + "kl": 0.059417724609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0141, + "reward": 0.29285904578864574, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.29285904578864574, + "reward_after_std": 0.9485115520656109, + "reward_before_mean": 0.6367957415059209, + "reward_before_std": 1.0004472993314266, + "reward_change_max": 0.001190371811389923, + "reward_change_mean": -0.34393669478595257, + "reward_change_min": -0.7738593406975269, + "reward_change_std": 0.2969193672761321, + "reward_std": 0.9485115893185139, + "rewards/cosine_scaled_reward": 0.07881453260779381, + "rewards/format_reward": 0.479166679084301, + "step": 194 + }, + { + "advantage_max": 1.4225033074617386, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.8249565176665783, + "advantage_std": 0.8413193933665752, + "completion_length": 2926.625045776367, + "epoch": 0.22285714285714286, + "grad_norm": 0.36274176836013794, + "kl": 0.064544677734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0554, + "reward": 0.007098935544490814, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.007098935544490814, + "reward_after_std": 0.841319415718317, + "reward_before_mean": 0.25825855135917664, + "reward_before_std": 0.8795899786055088, + "reward_change_max": 0.00033611059188842773, + "reward_change_mean": -0.2511596102267504, + "reward_change_min": -0.5563398413360119, + "reward_change_std": 0.22679788246750832, + "reward_std": 0.8413194417953491, + "rewards/cosine_scaled_reward": -0.058370741084218025, + "rewards/format_reward": 0.3750000037252903, + "step": 195 + }, + { + "advantage_max": 0.9352176859974861, + "advantage_mean": -1.8626448716752009e-09, + "advantage_min": -0.5650050267577171, + "advantage_std": 0.5551619492471218, + "completion_length": 3204.750030517578, + "epoch": 0.224, + "grad_norm": 0.14659136533737183, + "kl": 0.0740966796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0226, + "reward": -0.08506974019110203, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08506974019110203, + "reward_after_std": 0.5551619343459606, + "reward_before_mean": 0.1700500212609768, + "reward_before_std": 0.5611773282289505, + "reward_change_max": 0.00023402273654937744, + "reward_change_mean": -0.25511975586414337, + "reward_change_min": -0.49135322496294975, + "reward_change_std": 0.18783819722011685, + "reward_std": 0.5551619455218315, + "rewards/cosine_scaled_reward": -0.07122499728575349, + "rewards/format_reward": 0.3125000037252903, + "step": 196 + }, + { + "advantage_max": 1.9159402027726173, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -0.9467073902487755, + "advantage_std": 1.1410527899861336, + "completion_length": 2238.041702270508, + "epoch": 0.22514285714285714, + "grad_norm": 0.42249488830566406, + "kl": 0.0753173828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0565, + "reward": 0.2458451751153916, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2458451751153916, + "reward_after_std": 1.141052782535553, + "reward_before_mean": 0.5444424021989107, + "reward_before_std": 1.2092568203806877, + "reward_change_max": 0.0008358433842658997, + "reward_change_mean": -0.2985972370952368, + "reward_change_min": -0.7867999076843262, + "reward_change_std": 0.30800035782158375, + "reward_std": 1.1410528048872948, + "rewards/cosine_scaled_reward": -0.009028811939060688, + "rewards/format_reward": 0.5625000093132257, + "step": 197 + }, + { + "advantage_max": 1.5482355654239655, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.7490741685032845, + "advantage_std": 0.8406277522444725, + "completion_length": 2701.500045776367, + "epoch": 0.22628571428571428, + "grad_norm": 0.2042098492383957, + "kl": 0.091156005859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.804192891917571e-07, + "loss": 0.024, + "reward": -0.07515976205468178, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07515976205468178, + "reward_after_std": 0.8406277596950531, + "reward_before_mean": 0.13804511167109013, + "reward_before_std": 0.8344271630048752, + "reward_change_max": 0.0003175213932991028, + "reward_change_mean": -0.21320487465709448, + "reward_change_min": -0.4269953351467848, + "reward_change_std": 0.17606920842081308, + "reward_std": 0.8406277745962143, + "rewards/cosine_scaled_reward": -0.07681078463792801, + "rewards/format_reward": 0.2916666716337204, + "step": 198 + }, + { + "advantage_max": 1.396604772657156, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.7665940597653389, + "advantage_std": 0.8094251714646816, + "completion_length": 2494.6875381469727, + "epoch": 0.22742857142857142, + "grad_norm": 0.2061101496219635, + "kl": 0.09295654296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0141, + "reward": 0.0012609101831912994, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0012609101831912994, + "reward_after_std": 0.8094251602888107, + "reward_before_mean": 0.2516523525118828, + "reward_before_std": 0.8308322429656982, + "reward_change_max": 0.0001317635178565979, + "reward_change_mean": -0.25039147958159447, + "reward_change_min": -0.5647806152701378, + "reward_change_std": 0.22136891726404428, + "reward_std": 0.8094251789152622, + "rewards/cosine_scaled_reward": -0.04084047582000494, + "rewards/format_reward": 0.33333333767950535, + "step": 199 + }, + { + "advantage_max": 1.399455588310957, + "advantage_mean": -7.450580929990736e-09, + "advantage_min": -0.6911447197198868, + "advantage_std": 0.7602925859391689, + "completion_length": 2492.104232788086, + "epoch": 0.22857142857142856, + "grad_norm": 0.23204493522644043, + "kl": 0.094390869140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.75e-07, + "loss": 0.0048, + "reward": 0.3572434112429619, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3572434112429619, + "reward_after_std": 0.7602925859391689, + "reward_before_mean": 0.7317504649981856, + "reward_before_std": 0.6808374896645546, + "reward_change_max": 0.00037054717540740967, + "reward_change_mean": -0.3745070155709982, + "reward_change_min": -0.5897736884653568, + "reward_change_std": 0.23156773671507835, + "reward_std": 0.760292612016201, + "rewards/cosine_scaled_reward": 0.07420855306554586, + "rewards/format_reward": 0.583333345130086, + "step": 200 + }, + { + "advantage_max": 1.662718866020441, + "advantage_mean": -1.7384688910659918e-08, + "advantage_min": -0.8902830928564072, + "advantage_std": 0.9579394981265068, + "completion_length": 2042.0625305175781, + "epoch": 0.2297142857142857, + "grad_norm": 0.18706873059272766, + "kl": 0.0697021484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0017, + "reward": 0.5458131283521652, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5458131283521652, + "reward_after_std": 0.9579394981265068, + "reward_before_mean": 0.973633773624897, + "reward_before_std": 0.9209330566227436, + "reward_change_max": 0.0014783218502998352, + "reward_change_mean": -0.42782066529616714, + "reward_change_min": -0.7845392525196075, + "reward_change_std": 0.3085333174094558, + "reward_std": 0.9579395055770874, + "rewards/cosine_scaled_reward": 0.25765021913684905, + "rewards/format_reward": 0.4583333432674408, + "step": 201 + }, + { + "advantage_max": 1.3534116931259632, + "advantage_mean": 3.1044083970144243e-09, + "advantage_min": -0.6152248904109001, + "advantage_std": 0.7239951826632023, + "completion_length": 2311.1458587646484, + "epoch": 0.23085714285714284, + "grad_norm": 0.18341876566410065, + "kl": 0.10516357421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.695368466124296e-07, + "loss": -0.0024, + "reward": 0.5883260769769549, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5883260769769549, + "reward_after_std": 0.7239951677620411, + "reward_before_mean": 1.052895911037922, + "reward_before_std": 0.5883917305618525, + "reward_change_max": 0.0004690214991569519, + "reward_change_mean": -0.46456983499228954, + "reward_change_min": -0.7253349907696247, + "reward_change_std": 0.27589836064726114, + "reward_std": 0.7239951826632023, + "rewards/cosine_scaled_reward": 0.22436463087797165, + "rewards/format_reward": 0.6041666679084301, + "step": 202 + }, + { + "advantage_max": 1.304409470409155, + "advantage_mean": -1.6763806509612067e-08, + "advantage_min": -0.656681727617979, + "advantage_std": 0.7154108583927155, + "completion_length": 2858.5833892822266, + "epoch": 0.232, + "grad_norm": 0.318823903799057, + "kl": 0.112030029296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0137, + "reward": -0.048699749168008566, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.048699749168008566, + "reward_after_std": 0.7154108472168446, + "reward_before_mean": 0.19068025797605515, + "reward_before_std": 0.6872757263481617, + "reward_change_max": 0.0018011406064033508, + "reward_change_mean": -0.23938002390787005, + "reward_change_min": -0.47005924582481384, + "reward_change_std": 0.1845587631687522, + "reward_std": 0.7154108621180058, + "rewards/cosine_scaled_reward": -0.040076554752886295, + "rewards/format_reward": 0.2708333395421505, + "step": 203 + }, + { + "advantage_max": 1.1338126733899117, + "advantage_mean": -1.0865429111994729e-09, + "advantage_min": -0.7272412367165089, + "advantage_std": 0.6677736900746822, + "completion_length": 2282.937545776367, + "epoch": 0.23314285714285715, + "grad_norm": 0.22631201148033142, + "kl": 0.110076904296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0228, + "reward": 0.19898322504013777, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19898322504013777, + "reward_after_std": 0.6677736863493919, + "reward_before_mean": 0.5331610734574497, + "reward_before_std": 0.6481306254863739, + "reward_change_max": 0.0, + "reward_change_mean": -0.3341778600588441, + "reward_change_min": -0.5623057503253222, + "reward_change_std": 0.21866612136363983, + "reward_std": 0.6677737049758434, + "rewards/cosine_scaled_reward": -0.07716945745050907, + "rewards/format_reward": 0.6875000149011612, + "step": 204 + }, + { + "advantage_max": 1.262697447091341, + "advantage_mean": -3.290673217248852e-08, + "advantage_min": -0.8431374989449978, + "advantage_std": 0.7658955343067646, + "completion_length": 2390.2083740234375, + "epoch": 0.2342857142857143, + "grad_norm": 0.26258933544158936, + "kl": 0.10296630859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0023, + "reward": 0.35523027554154396, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.35523027554154396, + "reward_after_std": 0.7658955492079258, + "reward_before_mean": 0.742044972255826, + "reward_before_std": 0.7611411139369011, + "reward_change_max": 0.0, + "reward_change_mean": -0.38681469298899174, + "reward_change_min": -0.6838057711720467, + "reward_change_std": 0.27394791319966316, + "reward_std": 0.765895564109087, + "rewards/cosine_scaled_reward": 0.12102247402071953, + "rewards/format_reward": 0.5000000111758709, + "step": 205 + }, + { + "advantage_max": 1.3321489915251732, + "advantage_mean": -1.8626450382086546e-09, + "advantage_min": -0.794492594897747, + "advantage_std": 0.8248475287109613, + "completion_length": 3009.8542098999023, + "epoch": 0.23542857142857143, + "grad_norm": 0.3896113932132721, + "kl": 0.12060546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0297, + "reward": -0.004749797284603119, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.004749797284603119, + "reward_after_std": 0.8248475547879934, + "reward_before_mean": 0.24687214195728302, + "reward_before_std": 0.88743188790977, + "reward_change_max": 0.0006934329867362976, + "reward_change_mean": -0.2516219327226281, + "reward_change_min": -0.6339543778449297, + "reward_change_std": 0.2461548363789916, + "reward_std": 0.8248475603759289, + "rewards/cosine_scaled_reward": -0.05364727135747671, + "rewards/format_reward": 0.35416666977107525, + "step": 206 + }, + { + "advantage_max": 1.607744850218296, + "advantage_mean": 2.483527050678447e-09, + "advantage_min": -0.7184270024299622, + "advantage_std": 0.8869195282459259, + "completion_length": 3170.291717529297, + "epoch": 0.23657142857142857, + "grad_norm": 0.366899698972702, + "kl": 0.16107177734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0036, + "reward": -0.1947600757703185, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1947600757703185, + "reward_after_std": 0.8869195394217968, + "reward_before_mean": -0.028473446145653725, + "reward_before_std": 0.915056474506855, + "reward_change_max": 0.0021965429186820984, + "reward_change_mean": -0.16628664545714855, + "reward_change_min": -0.46696799620985985, + "reward_change_std": 0.18509722780436277, + "reward_std": 0.8869195394217968, + "rewards/cosine_scaled_reward": -0.170486721675843, + "rewards/format_reward": 0.3125000037252903, + "step": 207 + }, + { + "advantage_max": 1.1644534580409527, + "advantage_mean": -1.73846881335038e-08, + "advantage_min": -0.7889424115419388, + "advantage_std": 0.7400659993290901, + "completion_length": 2515.979232788086, + "epoch": 0.2377142857142857, + "grad_norm": 0.6642101407051086, + "kl": 0.11297607421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0714, + "reward": 0.046982649713754654, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.046982649713754654, + "reward_after_std": 0.7400659881532192, + "reward_before_mean": 0.3288039155304432, + "reward_before_std": 0.7989055551588535, + "reward_change_max": 0.0010460317134857178, + "reward_change_mean": -0.2818212900310755, + "reward_change_min": -0.5972126051783562, + "reward_change_std": 0.2474859021604061, + "reward_std": 0.7400660440325737, + "rewards/cosine_scaled_reward": -0.06476471698260866, + "rewards/format_reward": 0.4583333469927311, + "step": 208 + }, + { + "advantage_max": 1.2763051986694336, + "advantage_mean": 1.1175870950896893e-08, + "advantage_min": -0.7047868482768536, + "advantage_std": 0.7354258019477129, + "completion_length": 2538.6041946411133, + "epoch": 0.23885714285714285, + "grad_norm": 0.25784793496131897, + "kl": 0.1322021484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.500858306332172e-07, + "loss": 0.002, + "reward": 0.23811393603682518, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23811393603682518, + "reward_after_std": 0.7354257944971323, + "reward_before_mean": 0.5821687076240778, + "reward_before_std": 0.7137798797339201, + "reward_change_max": 0.0, + "reward_change_mean": -0.3440546961501241, + "reward_change_min": -0.6435936130583286, + "reward_change_std": 0.24290089262649417, + "reward_std": 0.735425828024745, + "rewards/cosine_scaled_reward": 0.06191765144467354, + "rewards/format_reward": 0.4583333432674408, + "step": 209 + }, + { + "advantage_max": 1.183098427951336, + "advantage_mean": -4.967053435223079e-09, + "advantage_min": -0.6418071016669273, + "advantage_std": 0.7025355864316225, + "completion_length": 2545.8333740234375, + "epoch": 0.24, + "grad_norm": 0.40859097242355347, + "kl": 0.1146240234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0745, + "reward": 0.05246494244784117, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05246494244784117, + "reward_after_std": 0.7025356087833643, + "reward_before_mean": 0.3357812147587538, + "reward_before_std": 0.7116394340991974, + "reward_change_max": 0.0008187666535377502, + "reward_change_mean": -0.28331627510488033, + "reward_change_min": -0.5772730372846127, + "reward_change_std": 0.23112040758132935, + "reward_std": 0.7025356367230415, + "rewards/cosine_scaled_reward": -0.019609388895332813, + "rewards/format_reward": 0.37500000186264515, + "step": 210 + }, + { + "advantage_max": 1.1921544596552849, + "advantage_mean": -8.692344122263052e-09, + "advantage_min": -0.6380018964409828, + "advantage_std": 0.6864985190331936, + "completion_length": 2406.062545776367, + "epoch": 0.24114285714285713, + "grad_norm": 0.25473645329475403, + "kl": 0.1444091796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0218, + "reward": 0.10756439715623856, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10756439715623856, + "reward_after_std": 0.6864985190331936, + "reward_before_mean": 0.4096587970852852, + "reward_before_std": 0.6774689964950085, + "reward_change_max": 0.00046136975288391113, + "reward_change_mean": -0.30209438502788544, + "reward_change_min": -0.5448434054851532, + "reward_change_std": 0.21998351905494928, + "reward_std": 0.6864985190331936, + "rewards/cosine_scaled_reward": -0.03475394658744335, + "rewards/format_reward": 0.47916666977107525, + "step": 211 + }, + { + "advantage_max": 1.1086622849106789, + "advantage_mean": -3.414849569782774e-09, + "advantage_min": -0.5913215838372707, + "advantage_std": 0.6221080049872398, + "completion_length": 2567.5833587646484, + "epoch": 0.2422857142857143, + "grad_norm": 0.18780724704265594, + "kl": 0.151611328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0246, + "reward": 0.13817980530438945, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13817980530438945, + "reward_after_std": 0.6221080236136913, + "reward_before_mean": 0.4593674587085843, + "reward_before_std": 0.5720679853111506, + "reward_change_max": 0.0013311505317687988, + "reward_change_mean": -0.32118767499923706, + "reward_change_min": -0.5473824627697468, + "reward_change_std": 0.21084042405709624, + "reward_std": 0.6221080496907234, + "rewards/cosine_scaled_reward": -0.009899599011987448, + "rewards/format_reward": 0.47916666977107525, + "step": 212 + }, + { + "advantage_max": 1.4852821864187717, + "advantage_mean": -8.071462331837864e-09, + "advantage_min": -0.7190396524965763, + "advantage_std": 0.8180091008543968, + "completion_length": 2564.9375610351562, + "epoch": 0.24342857142857144, + "grad_norm": 0.9091688990592957, + "kl": 0.189697265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.387534371007797e-07, + "loss": -0.0705, + "reward": 0.2521779127418995, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2521779127418995, + "reward_after_std": 0.8180091008543968, + "reward_before_mean": 0.5849970206618309, + "reward_before_std": 0.7709748446941376, + "reward_change_max": 0.0, + "reward_change_mean": -0.3328191004693508, + "reward_change_min": -0.5791736077517271, + "reward_change_std": 0.22523777186870575, + "reward_std": 0.8180091418325901, + "rewards/cosine_scaled_reward": 0.01124850008636713, + "rewards/format_reward": 0.5625000074505806, + "step": 213 + }, + { + "advantage_max": 1.2928205132484436, + "advantage_mean": -1.7384688688615313e-08, + "advantage_min": -0.85975431650877, + "advantage_std": 0.782087666913867, + "completion_length": 2792.7500534057617, + "epoch": 0.24457142857142858, + "grad_norm": 0.2622942328453064, + "kl": 0.156494140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0108, + "reward": 0.09292547777295113, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09292547777295113, + "reward_after_std": 0.7820876855403185, + "reward_before_mean": 0.38339292258024216, + "reward_before_std": 0.8171257805079222, + "reward_change_max": 0.00044471025466918945, + "reward_change_mean": -0.2904674874152988, + "reward_change_min": -0.5845286399126053, + "reward_change_std": 0.2391545344144106, + "reward_std": 0.7820877321064472, + "rewards/cosine_scaled_reward": -0.04788686567917466, + "rewards/format_reward": 0.47916667722165585, + "step": 214 + }, + { + "advantage_max": 1.0478114522993565, + "advantage_mean": 1.3659397835041887e-08, + "advantage_min": -0.7259840816259384, + "advantage_std": 0.6186531595885754, + "completion_length": 2528.7500381469727, + "epoch": 0.24571428571428572, + "grad_norm": 0.20343047380447388, + "kl": 0.15179443359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0208, + "reward": -0.10817716736346483, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10817716736346483, + "reward_after_std": 0.6186531558632851, + "reward_before_mean": 0.12496000435203314, + "reward_before_std": 0.632936142385006, + "reward_change_max": 0.0008758828043937683, + "reward_change_mean": -0.23313714284449816, + "reward_change_min": -0.43077932484447956, + "reward_change_std": 0.17999738734215498, + "reward_std": 0.6186531707644463, + "rewards/cosine_scaled_reward": -0.17710335180163383, + "rewards/format_reward": 0.47916667722165585, + "step": 215 + }, + { + "advantage_max": 1.538545936346054, + "advantage_mean": -3.10440865236572e-08, + "advantage_min": -0.8211144357919693, + "advantage_std": 0.88631546869874, + "completion_length": 2307.1250534057617, + "epoch": 0.24685714285714286, + "grad_norm": 0.3738247752189636, + "kl": 0.1524658203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0444, + "reward": 0.2975668590515852, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2975668590515852, + "reward_after_std": 0.8863154835999012, + "reward_before_mean": 0.6430559878936037, + "reward_before_std": 0.883563507348299, + "reward_change_max": 0.000818595290184021, + "reward_change_mean": -0.3454891378059983, + "reward_change_min": -0.7173335365951061, + "reward_change_std": 0.2694939011707902, + "reward_std": 0.8863155096769333, + "rewards/cosine_scaled_reward": -0.011805359274148941, + "rewards/format_reward": 0.6666666697710752, + "step": 216 + }, + { + "advantage_max": 1.8591451607644558, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.8041123300790787, + "advantage_std": 1.0541826523840427, + "completion_length": 2986.7500610351562, + "epoch": 0.248, + "grad_norm": 0.3932034969329834, + "kl": 0.18426513671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0544, + "reward": 0.04206418804824352, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04206418804824352, + "reward_after_std": 1.054182630032301, + "reward_before_mean": 0.275969285517931, + "reward_before_std": 1.1014858186244965, + "reward_change_max": 0.00034668296575546265, + "reward_change_mean": -0.2339051030576229, + "reward_change_min": -0.6030337251722813, + "reward_change_std": 0.23021453525871038, + "reward_std": 1.0541826710104942, + "rewards/cosine_scaled_reward": -0.05993202514946461, + "rewards/format_reward": 0.3958333358168602, + "step": 217 + }, + { + "advantage_max": 1.7612306363880634, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.782971628010273, + "advantage_std": 0.9692483730614185, + "completion_length": 2740.041732788086, + "epoch": 0.24914285714285714, + "grad_norm": 0.3887059688568115, + "kl": 0.14617919921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.243820139034464e-07, + "loss": 0.034, + "reward": -0.06596058112336323, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06596058112336323, + "reward_after_std": 0.9692483730614185, + "reward_before_mean": 0.13316884357482195, + "reward_before_std": 0.9875393584370613, + "reward_change_max": 0.0006215497851371765, + "reward_change_mean": -0.19912943989038467, + "reward_change_min": -0.5118247009813786, + "reward_change_std": 0.19447639770805836, + "reward_std": 0.9692483954131603, + "rewards/cosine_scaled_reward": -0.12091558671090752, + "rewards/format_reward": 0.37500000931322575, + "step": 218 + }, + { + "advantage_max": 1.399564553052187, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.8181595951318741, + "advantage_std": 0.8158378414809704, + "completion_length": 2502.354217529297, + "epoch": 0.2502857142857143, + "grad_norm": 0.30666327476501465, + "kl": 0.157745361328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0188, + "reward": 0.2214650847017765, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2214650847017765, + "reward_after_std": 0.8158378321677446, + "reward_before_mean": 0.550739474594593, + "reward_before_std": 0.8254678416997194, + "reward_change_max": 0.0006307139992713928, + "reward_change_mean": -0.3292743950150907, + "reward_change_min": -0.6367195174098015, + "reward_change_std": 0.25280233519151807, + "reward_std": 0.8158378712832928, + "rewards/cosine_scaled_reward": -0.01629692828282714, + "rewards/format_reward": 0.5833333414047956, + "step": 219 + }, + { + "advantage_max": 0.5982950031757355, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -0.33232397958636284, + "advantage_std": 0.3459637016057968, + "completion_length": 2903.270866394043, + "epoch": 0.25142857142857145, + "grad_norm": 0.18245455622673035, + "kl": 0.2322998046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0284, + "reward": -0.3950345846824348, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3950345846824348, + "reward_after_std": 0.34596370719373226, + "reward_before_mean": -0.23216606117784977, + "reward_before_std": 0.3361614188179374, + "reward_change_max": 0.0, + "reward_change_mean": -0.16286852350458503, + "reward_change_min": -0.3166360780596733, + "reward_change_std": 0.11497970344498754, + "reward_std": 0.34596371836960316, + "rewards/cosine_scaled_reward": -0.26191636361181736, + "rewards/format_reward": 0.291666679084301, + "step": 220 + }, + { + "advantage_max": 1.3345818668603897, + "advantage_mean": -1.73846881335038e-08, + "advantage_min": -0.7682487592101097, + "advantage_std": 0.7680236846208572, + "completion_length": 2026.5000267028809, + "epoch": 0.25257142857142856, + "grad_norm": 0.3813193142414093, + "kl": 0.136322021484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0055, + "reward": 0.2768420181237161, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2768420181237161, + "reward_after_std": 0.7680236846208572, + "reward_before_mean": 0.6301960237324238, + "reward_before_std": 0.7425558120012283, + "reward_change_max": 0.0014693215489387512, + "reward_change_mean": -0.353354025632143, + "reward_change_min": -0.6284851208329201, + "reward_change_std": 0.24855860322713852, + "reward_std": 0.7680237218737602, + "rewards/cosine_scaled_reward": 0.023431332781910896, + "rewards/format_reward": 0.5833333488553762, + "step": 221 + }, + { + "advantage_max": 1.325666181743145, + "advantage_mean": 6.829699250587851e-09, + "advantage_min": -0.7518243491649628, + "advantage_std": 0.7725033760070801, + "completion_length": 2567.8958740234375, + "epoch": 0.2537142857142857, + "grad_norm": 0.3199946880340576, + "kl": 0.1875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.127310565369415e-07, + "loss": 0.022, + "reward": 0.22519738972187042, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22519738972187042, + "reward_after_std": 0.7725033611059189, + "reward_before_mean": 0.5621007736772299, + "reward_before_std": 0.7645175494253635, + "reward_change_max": 0.0003986433148384094, + "reward_change_mean": -0.33690338023006916, + "reward_change_min": -0.6237695887684822, + "reward_change_std": 0.24479289446026087, + "reward_std": 0.7725033946335316, + "rewards/cosine_scaled_reward": -0.03144961781799793, + "rewards/format_reward": 0.6250000074505806, + "step": 222 + }, + { + "advantage_max": 1.14262056350708, + "advantage_mean": 9.93410742555767e-09, + "advantage_min": -0.8323545679450035, + "advantage_std": 0.731819149106741, + "completion_length": 2596.187545776367, + "epoch": 0.25485714285714284, + "grad_norm": 0.33422207832336426, + "kl": 0.175048828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0487, + "reward": 0.05562268290668726, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05562268290668726, + "reward_after_std": 0.7318191528320312, + "reward_before_mean": 0.34569063875824213, + "reward_before_std": 0.793747067451477, + "reward_change_max": 0.0011824890971183777, + "reward_change_mean": -0.29006795305758715, + "reward_change_min": -0.6069894656538963, + "reward_change_std": 0.24760264065116644, + "reward_std": 0.7318191714584827, + "rewards/cosine_scaled_reward": -0.0771546857431531, + "rewards/format_reward": 0.5000000111758709, + "step": 223 + }, + { + "advantage_max": 1.6740315780043602, + "advantage_mean": 4.346172532976311e-09, + "advantage_min": -0.9070464447140694, + "advantage_std": 0.9623745940625668, + "completion_length": 2658.2084045410156, + "epoch": 0.256, + "grad_norm": 0.6894913911819458, + "kl": 0.1593170166015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0743, + "reward": 0.15881489496678114, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15881489496678114, + "reward_after_std": 0.9623745791614056, + "reward_before_mean": 0.44474709406495094, + "reward_before_std": 0.9890395030379295, + "reward_change_max": 0.0003013685345649719, + "reward_change_mean": -0.28593218978494406, + "reward_change_min": -0.645022090524435, + "reward_change_std": 0.25701585691422224, + "reward_std": 0.9623746164143085, + "rewards/cosine_scaled_reward": -0.006793119246140122, + "rewards/format_reward": 0.4583333469927311, + "step": 224 + }, + { + "advantage_max": 1.655724935233593, + "advantage_mean": -1.1175871006408045e-08, + "advantage_min": -1.0246229209005833, + "advantage_std": 1.0299497470259666, + "completion_length": 2939.6459197998047, + "epoch": 0.2571428571428571, + "grad_norm": 0.5554704666137695, + "kl": 0.2054443359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0487, + "reward": 0.27755059860646725, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.27755059860646725, + "reward_after_std": 1.0299497283995152, + "reward_before_mean": 0.6106989241670817, + "reward_before_std": 1.107281219214201, + "reward_change_max": 0.00046384334564208984, + "reward_change_mean": -0.3331483481451869, + "reward_change_min": -0.7214470133185387, + "reward_change_std": 0.3054943010210991, + "reward_std": 1.0299497619271278, + "rewards/cosine_scaled_reward": 0.03451612964272499, + "rewards/format_reward": 0.5416666772216558, + "step": 225 + }, + { + "advantage_max": 1.139162603765726, + "advantage_mean": -6.829699306099002e-09, + "advantage_min": -0.6521196067333221, + "advantage_std": 0.6465449631214142, + "completion_length": 2743.2084045410156, + "epoch": 0.2582857142857143, + "grad_norm": 0.24690324068069458, + "kl": 0.2142333984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0231, + "reward": 0.33282162994146347, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.33282162994146347, + "reward_after_std": 0.6465449705719948, + "reward_before_mean": 0.7200730703771114, + "reward_before_std": 0.5950763653963804, + "reward_change_max": 0.00016529858112335205, + "reward_change_mean": -0.3872514390386641, + "reward_change_min": -0.6319355145096779, + "reward_change_std": 0.23841121140867472, + "reward_std": 0.6465449780225754, + "rewards/cosine_scaled_reward": 0.07878652843646705, + "rewards/format_reward": 0.5625000074505806, + "step": 226 + }, + { + "advantage_max": 1.3981711119413376, + "advantage_mean": 7.450580874479584e-09, + "advantage_min": -0.8407816961407661, + "advantage_std": 0.8285619355738163, + "completion_length": 2509.7500762939453, + "epoch": 0.25942857142857145, + "grad_norm": 0.4142477810382843, + "kl": 0.22259521484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.979899910323624e-07, + "loss": 0.015, + "reward": 0.16833718493580818, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16833718493580818, + "reward_after_std": 0.8285619467496872, + "reward_before_mean": 0.4775360394269228, + "reward_before_std": 0.8386373966932297, + "reward_change_max": 0.0, + "reward_change_mean": -0.30919882375746965, + "reward_change_min": -0.5947811640799046, + "reward_change_std": 0.2422353681176901, + "reward_std": 0.8285619504749775, + "rewards/cosine_scaled_reward": -0.0633153374074027, + "rewards/format_reward": 0.6041666772216558, + "step": 227 + }, + { + "advantage_max": 1.0023160874843597, + "advantage_mean": -4.967053435223079e-09, + "advantage_min": -0.7809347063302994, + "advantage_std": 0.6344969868659973, + "completion_length": 2511.3750228881836, + "epoch": 0.26057142857142856, + "grad_norm": 0.21875105798244476, + "kl": 0.20574951171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.950195628537299e-07, + "loss": 0.026, + "reward": 0.17744406033307314, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17744406033307314, + "reward_after_std": 0.6344969756901264, + "reward_before_mean": 0.519596628844738, + "reward_before_std": 0.6598436906933784, + "reward_change_max": 0.0009124875068664551, + "reward_change_mean": -0.34215256478637457, + "reward_change_min": -0.6125946547836065, + "reward_change_std": 0.2433307566680014, + "reward_std": 0.6344969943165779, + "rewards/cosine_scaled_reward": 0.030631639063358307, + "rewards/format_reward": 0.4583333358168602, + "step": 228 + }, + { + "advantage_max": 1.1860961690545082, + "advantage_mean": -3.725290242950763e-09, + "advantage_min": -0.5950976237654686, + "advantage_std": 0.666605468839407, + "completion_length": 3048.1458587646484, + "epoch": 0.26171428571428573, + "grad_norm": 0.3069765865802765, + "kl": 0.293701171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0238, + "reward": -0.034969646483659744, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.034969646483659744, + "reward_after_std": 0.6666054800152779, + "reward_before_mean": 0.21758911339566112, + "reward_before_std": 0.6485214531421661, + "reward_change_max": 0.005425959825515747, + "reward_change_mean": -0.25255876034498215, + "reward_change_min": -0.5039389729499817, + "reward_change_std": 0.19493055483326316, + "reward_std": 0.6666054986417294, + "rewards/cosine_scaled_reward": -0.057872116565704346, + "rewards/format_reward": 0.33333333767950535, + "step": 229 + }, + { + "advantage_max": 1.0551139377057552, + "advantage_mean": 1.738468857759301e-08, + "advantage_min": -0.4135409705340862, + "advantage_std": 0.560747466981411, + "completion_length": 3287.9166870117188, + "epoch": 0.26285714285714284, + "grad_norm": 0.41716912388801575, + "kl": 0.2872314453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0396, + "reward": -0.41289188899099827, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.41289188899099827, + "reward_after_std": 0.5607474707067013, + "reward_before_mean": -0.2849421767750755, + "reward_before_std": 0.5487396996468306, + "reward_change_max": 0.0005631819367408752, + "reward_change_mean": -0.12794970721006393, + "reward_change_min": -0.2580363266170025, + "reward_change_std": 0.10446212626993656, + "reward_std": 0.5607474967837334, + "rewards/cosine_scaled_reward": -0.26747109182178974, + "rewards/format_reward": 0.2500000037252903, + "step": 230 + }, + { + "advantage_max": 1.5714677423238754, + "advantage_mean": -2.235174301201681e-08, + "advantage_min": -0.7718853428959846, + "advantage_std": 0.8607656769454479, + "completion_length": 2893.854263305664, + "epoch": 0.264, + "grad_norm": 0.342227578163147, + "kl": 0.253875732421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0199, + "reward": 0.2559966053813696, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2559966053813696, + "reward_after_std": 0.8607656769454479, + "reward_before_mean": 0.5868953629396856, + "reward_before_std": 0.8197899498045444, + "reward_change_max": 0.000645756721496582, + "reward_change_mean": -0.33089875616133213, + "reward_change_min": -0.5897320918738842, + "reward_change_std": 0.23832240141928196, + "reward_std": 0.86076570302248, + "rewards/cosine_scaled_reward": 0.04344766750000417, + "rewards/format_reward": 0.5000000037252903, + "step": 231 + }, + { + "advantage_max": 0.703536681830883, + "advantage_mean": 1.4901161693448017e-08, + "advantage_min": -0.48478899523615837, + "advantage_std": 0.4345742128789425, + "completion_length": 2969.7500610351562, + "epoch": 0.2651428571428571, + "grad_norm": 0.23666274547576904, + "kl": 0.29571533203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0313, + "reward": -0.41810051421634853, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.41810051421634853, + "reward_after_std": 0.4345742128789425, + "reward_before_mean": -0.26686476916074753, + "reward_before_std": 0.4644641876220703, + "reward_change_max": 0.0005011186003684998, + "reward_change_mean": -0.15123574994504452, + "reward_change_min": -0.35539793223142624, + "reward_change_std": 0.13576048891991377, + "reward_std": 0.4345742352306843, + "rewards/cosine_scaled_reward": -0.23759905248880386, + "rewards/format_reward": 0.20833334140479565, + "step": 232 + }, + { + "advantage_max": 1.0524715818464756, + "advantage_mean": 8.692344621863413e-09, + "advantage_min": -0.5536077432334423, + "advantage_std": 0.6050297953188419, + "completion_length": 2742.125045776367, + "epoch": 0.2662857142857143, + "grad_norm": 0.2027333676815033, + "kl": 0.2685546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0358, + "reward": -0.13591570034623146, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13591570034623146, + "reward_after_std": 0.6050298027694225, + "reward_before_mean": 0.08999292412772775, + "reward_before_std": 0.6030003279447556, + "reward_change_max": 0.00041041523218154907, + "reward_change_mean": -0.22590863425284624, + "reward_change_min": -0.4129418469965458, + "reward_change_std": 0.164816755335778, + "reward_std": 0.605029821395874, + "rewards/cosine_scaled_reward": -0.19458688236773014, + "rewards/format_reward": 0.4791666679084301, + "step": 233 + }, + { + "advantage_max": 1.3114805594086647, + "advantage_mean": -8.071462387349015e-09, + "advantage_min": -0.7555878981947899, + "advantage_std": 0.7706392370164394, + "completion_length": 2707.812515258789, + "epoch": 0.2674285714285714, + "grad_norm": 0.25337696075439453, + "kl": 0.24365234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0202, + "reward": -0.028612198890186846, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.028612198890186846, + "reward_after_std": 0.7706392221152782, + "reward_before_mean": 0.2160261981189251, + "reward_before_std": 0.7999156005680561, + "reward_change_max": 0.0006139576435089111, + "reward_change_mean": -0.24463840294629335, + "reward_change_min": -0.5106790214776993, + "reward_change_std": 0.2085583619773388, + "reward_std": 0.7706392407417297, + "rewards/cosine_scaled_reward": -0.08990356823778711, + "rewards/format_reward": 0.39583334513008595, + "step": 234 + }, + { + "advantage_max": 1.4646566659212112, + "advantage_mean": 3.1044083970144243e-09, + "advantage_min": -0.7245773002505302, + "advantage_std": 0.827287781983614, + "completion_length": 2410.520881652832, + "epoch": 0.26857142857142857, + "grad_norm": 0.3164808750152588, + "kl": 0.244384765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0445, + "reward": 0.20270962733775377, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20270962733775377, + "reward_after_std": 0.8272877987474203, + "reward_before_mean": 0.520314646884799, + "reward_before_std": 0.820151200518012, + "reward_change_max": 0.0004795342683792114, + "reward_change_mean": -0.3176050102338195, + "reward_change_min": -0.5893700663000345, + "reward_change_std": 0.225506953895092, + "reward_std": 0.8272878266870975, + "rewards/cosine_scaled_reward": 0.02057398436591029, + "rewards/format_reward": 0.4791666753590107, + "step": 235 + }, + { + "advantage_max": 1.7150937244296074, + "advantage_mean": 5.551115123125783e-17, + "advantage_min": -0.914762269705534, + "advantage_std": 0.9817547611892223, + "completion_length": 2980.5209350585938, + "epoch": 0.26971428571428574, + "grad_norm": 0.6971549391746521, + "kl": 0.2442626953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0655, + "reward": 0.23412334313616157, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23412334313616157, + "reward_after_std": 0.9817547835409641, + "reward_before_mean": 0.5466544292867184, + "reward_before_std": 0.9983992204070091, + "reward_change_max": 0.0008015632629394531, + "reward_change_mean": -0.31253109592944384, + "reward_change_min": -0.6879443116486073, + "reward_change_std": 0.2631764831021428, + "reward_std": 0.9817548058927059, + "rewards/cosine_scaled_reward": 0.012910543940961361, + "rewards/format_reward": 0.520833345130086, + "step": 236 + }, + { + "advantage_max": 1.2556101009249687, + "advantage_mean": -6.829698639965187e-09, + "advantage_min": -0.9055002331733704, + "advantage_std": 0.7706798426806927, + "completion_length": 2733.2083740234375, + "epoch": 0.27085714285714285, + "grad_norm": 0.4280366897583008, + "kl": 0.22900390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0486, + "reward": 0.17084035277366638, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17084035277366638, + "reward_after_std": 0.7706798352301121, + "reward_before_mean": 0.49289674311876297, + "reward_before_std": 0.8058627881109715, + "reward_change_max": 0.0, + "reward_change_mean": -0.3220563712529838, + "reward_change_min": -0.6181009113788605, + "reward_change_std": 0.2525101900100708, + "reward_std": 0.7706798538565636, + "rewards/cosine_scaled_reward": 0.006865017116069794, + "rewards/format_reward": 0.4791666753590107, + "step": 237 + }, + { + "advantage_max": 1.6488749012351036, + "advantage_mean": -6.208817415753742e-08, + "advantage_min": -1.1664773039519787, + "advantage_std": 1.0465109311044216, + "completion_length": 2192.604232788086, + "epoch": 0.272, + "grad_norm": 0.7376325726509094, + "kl": 0.192626953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0487, + "reward": 0.5787554197013378, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5787554197013378, + "reward_after_std": 1.046510897576809, + "reward_before_mean": 1.0178330019116402, + "reward_before_std": 1.119078617542982, + "reward_change_max": 0.0002253800630569458, + "reward_change_mean": -0.43907763762399554, + "reward_change_min": -0.8594516478478909, + "reward_change_std": 0.35902342572808266, + "reward_std": 1.0465109124779701, + "rewards/cosine_scaled_reward": 0.1859998283907771, + "rewards/format_reward": 0.6458333432674408, + "step": 238 + }, + { + "advantage_max": 1.5666873008012772, + "advantage_mean": -1.9868215073159945e-08, + "advantage_min": -0.9591661542654037, + "advantage_std": 0.8998047038912773, + "completion_length": 2221.7917404174805, + "epoch": 0.27314285714285713, + "grad_norm": 0.33203524351119995, + "kl": 0.21246337890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.619104492241847e-07, + "loss": 0.02, + "reward": 0.46643321961164474, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46643321961164474, + "reward_after_std": 0.8998047187924385, + "reward_before_mean": 0.8711623698472977, + "reward_before_std": 0.8713781274855137, + "reward_change_max": 0.0010540187358856201, + "reward_change_mean": -0.40472911577671766, + "reward_change_min": -0.7147786617279053, + "reward_change_std": 0.2880152091383934, + "reward_std": 0.8998047187924385, + "rewards/cosine_scaled_reward": 0.1230811607092619, + "rewards/format_reward": 0.6250000149011612, + "step": 239 + }, + { + "advantage_max": 1.298158198595047, + "advantage_mean": -6.208817349140361e-10, + "advantage_min": -0.5822702124714851, + "advantage_std": 0.6943303383886814, + "completion_length": 2911.166717529297, + "epoch": 0.2742857142857143, + "grad_norm": 0.4261091649532318, + "kl": 0.427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.588648530198504e-07, + "loss": 0.045, + "reward": -0.2253067083656788, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2253067083656788, + "reward_after_std": 0.6943303607404232, + "reward_before_mean": -0.048595423810184, + "reward_before_std": 0.6824948564171791, + "reward_change_max": 0.0004196241497993469, + "reward_change_mean": -0.17671129759401083, + "reward_change_min": -0.3555619493126869, + "reward_change_std": 0.14292089035734534, + "reward_std": 0.6943303756415844, + "rewards/cosine_scaled_reward": -0.26388105377554893, + "rewards/format_reward": 0.47916667722165585, + "step": 240 + }, + { + "advantage_max": 0.8420538194477558, + "advantage_mean": 1.3659397890553038e-08, + "advantage_min": -0.6000218987464905, + "advantage_std": 0.5334564503282309, + "completion_length": 2765.5208435058594, + "epoch": 0.2754285714285714, + "grad_norm": 0.2724221348762512, + "kl": 0.3409423828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.558139508961654e-07, + "loss": 0.038, + "reward": -0.24559512361884117, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.24559512361884117, + "reward_after_std": 0.5334564689546824, + "reward_before_mean": -0.04277325049042702, + "reward_before_std": 0.5738527663052082, + "reward_change_max": 0.0010135173797607422, + "reward_change_mean": -0.20282186102122068, + "reward_change_min": -0.44659484922885895, + "reward_change_std": 0.17772664222866297, + "reward_std": 0.5334564968943596, + "rewards/cosine_scaled_reward": -0.17763663083314896, + "rewards/format_reward": 0.3125000111758709, + "step": 241 + }, + { + "advantage_max": 0.946055244654417, + "advantage_mean": 9.313226301266297e-09, + "advantage_min": -0.5404593013226986, + "advantage_std": 0.5464118495583534, + "completion_length": 2345.8959045410156, + "epoch": 0.2765714285714286, + "grad_norm": 0.35438135266304016, + "kl": 0.35821533203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.527578915497951e-07, + "loss": 0.045, + "reward": 0.14251868287101388, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14251868287101388, + "reward_after_std": 0.5464118458330631, + "reward_before_mean": 0.4747390653938055, + "reward_before_std": 0.5084672495722771, + "reward_change_max": 0.0004891380667686462, + "reward_change_mean": -0.3322203680872917, + "reward_change_min": -0.5551994703710079, + "reward_change_std": 0.211034232750535, + "reward_std": 0.5464118607342243, + "rewards/cosine_scaled_reward": -0.08554715011268854, + "rewards/format_reward": 0.6458333395421505, + "step": 242 + }, + { + "advantage_max": 1.200172282755375, + "advantage_mean": -3.725290353973065e-09, + "advantage_min": -0.8043633177876472, + "advantage_std": 0.7464568391442299, + "completion_length": 2693.500030517578, + "epoch": 0.2777142857142857, + "grad_norm": 0.45109739899635315, + "kl": 0.3643798828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0355, + "reward": 0.038709891960024834, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.038709891960024834, + "reward_after_std": 0.7464568465948105, + "reward_before_mean": 0.31772999092936516, + "reward_before_std": 0.7977652475237846, + "reward_change_max": 0.0006454810500144958, + "reward_change_mean": -0.2790201008319855, + "reward_change_min": -0.589969240128994, + "reward_change_std": 0.23987862188369036, + "reward_std": 0.7464568465948105, + "rewards/cosine_scaled_reward": -0.03905167616903782, + "rewards/format_reward": 0.3958333432674408, + "step": 243 + }, + { + "advantage_max": 1.4144585989415646, + "advantage_mean": -6.661338147750939e-16, + "advantage_min": -0.8154066577553749, + "advantage_std": 0.8018203265964985, + "completion_length": 2837.6875534057617, + "epoch": 0.27885714285714286, + "grad_norm": 0.43070149421691895, + "kl": 0.37689208984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0446, + "reward": 0.29334662668406963, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.29334662668406963, + "reward_after_std": 0.8018203116953373, + "reward_before_mean": 0.6482694167643785, + "reward_before_std": 0.7809974029660225, + "reward_change_max": 0.0, + "reward_change_mean": -0.35492274537682533, + "reward_change_min": -0.638619052246213, + "reward_change_std": 0.24733527563512325, + "reward_std": 0.8018203228712082, + "rewards/cosine_scaled_reward": 0.08455136185511947, + "rewards/format_reward": 0.47916666977107525, + "step": 244 + }, + { + "advantage_max": 1.202423632144928, + "advantage_mean": 4.967053712778835e-09, + "advantage_min": -0.6274031549692154, + "advantage_std": 0.6885917708277702, + "completion_length": 3277.7916717529297, + "epoch": 0.28, + "grad_norm": 0.5922569632530212, + "kl": 0.4952392578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0422, + "reward": -0.23441375326365232, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23441375326365232, + "reward_after_std": 0.6885917671024799, + "reward_before_mean": -0.0529680053004995, + "reward_before_std": 0.714995913207531, + "reward_change_max": 5.5089592933654785e-05, + "reward_change_mean": -0.1814457457512617, + "reward_change_min": -0.43721815571188927, + "reward_change_std": 0.17085466999560595, + "reward_std": 0.6885917857289314, + "rewards/cosine_scaled_reward": -0.16190067771822214, + "rewards/format_reward": 0.2708333395421505, + "step": 245 + }, + { + "advantage_max": 1.3673798479139805, + "advantage_mean": 1.5522043483873205e-08, + "advantage_min": -0.8780043236911297, + "advantage_std": 0.8121840171515942, + "completion_length": 3056.416717529297, + "epoch": 0.28114285714285714, + "grad_norm": 0.5122177600860596, + "kl": 0.4609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.404850645156841e-07, + "loss": 0.051, + "reward": 0.12062996253371239, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12062996253371239, + "reward_after_std": 0.8121840003877878, + "reward_before_mean": 0.41690291836857796, + "reward_before_std": 0.8465407397598028, + "reward_change_max": 0.001095414161682129, + "reward_change_mean": -0.29627293813973665, + "reward_change_min": -0.6192711889743805, + "reward_change_std": 0.24561070930212736, + "reward_std": 0.8121840059757233, + "rewards/cosine_scaled_reward": -0.01029854454100132, + "rewards/format_reward": 0.4375000074505806, + "step": 246 + }, + { + "advantage_max": 0.9564785324037075, + "advantage_mean": 1.1796753074388988e-08, + "advantage_min": -0.6387578025460243, + "advantage_std": 0.5899263545870781, + "completion_length": 3254.8958435058594, + "epoch": 0.2822857142857143, + "grad_norm": 0.3940400183200836, + "kl": 0.456787109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0462, + "reward": -0.2427238319069147, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2427238319069147, + "reward_after_std": 0.5899263694882393, + "reward_before_mean": -0.047593068331480026, + "reward_before_std": 0.631349828094244, + "reward_change_max": 0.002109043300151825, + "reward_change_mean": -0.1951307598501444, + "reward_change_min": -0.4547439571470022, + "reward_change_std": 0.18082292843610048, + "reward_std": 0.5899263992905617, + "rewards/cosine_scaled_reward": -0.20087987463921309, + "rewards/format_reward": 0.3541666753590107, + "step": 247 + }, + { + "advantage_max": 1.0900807715952396, + "advantage_mean": -2.545615063187512e-08, + "advantage_min": -0.7387166060507298, + "advantage_std": 0.6745978407561779, + "completion_length": 2681.250045776367, + "epoch": 0.2834285714285714, + "grad_norm": 0.39817264676094055, + "kl": 0.3626708984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0595, + "reward": 0.27292491123080254, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.27292491123080254, + "reward_after_std": 0.6745978370308876, + "reward_before_mean": 0.6433320241048932, + "reward_before_std": 0.6638566926121712, + "reward_change_max": 0.001688249409198761, + "reward_change_mean": -0.3704071491956711, + "reward_change_min": -0.6514604948461056, + "reward_change_std": 0.26492215413600206, + "reward_std": 0.6745978556573391, + "rewards/cosine_scaled_reward": 0.08208268322050571, + "rewards/format_reward": 0.4791666716337204, + "step": 248 + }, + { + "advantage_max": 1.3351382836699486, + "advantage_mean": -1.4901161971003773e-08, + "advantage_min": -0.7336929887533188, + "advantage_std": 0.7524409592151642, + "completion_length": 2210.5417098999023, + "epoch": 0.2845714285714286, + "grad_norm": 0.7154747247695923, + "kl": 0.27446746826171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0584, + "reward": 0.347092317417264, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.347092317417264, + "reward_after_std": 0.7524409592151642, + "reward_before_mean": 0.7257094047963619, + "reward_before_std": 0.6893659690394998, + "reward_change_max": 0.0010166019201278687, + "reward_change_mean": -0.3786170780658722, + "reward_change_min": -0.6137403771281242, + "reward_change_std": 0.2522727893665433, + "reward_std": 0.752440981566906, + "rewards/cosine_scaled_reward": 0.03993801912292838, + "rewards/format_reward": 0.645833333954215, + "step": 249 + }, + { + "advantage_max": 1.1932175531983376, + "advantage_mean": -1.8626452047421083e-09, + "advantage_min": -0.6423701345920563, + "advantage_std": 0.6697065159678459, + "completion_length": 2465.70841217041, + "epoch": 0.2857142857142857, + "grad_norm": 0.9017249345779419, + "kl": 0.38275146484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0728, + "reward": 0.025156520307064056, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.025156520307064056, + "reward_after_std": 0.6697065122425556, + "reward_before_mean": 0.29914787295274436, + "reward_before_std": 0.6508230045437813, + "reward_change_max": 0.0013307556509971619, + "reward_change_mean": -0.2739913575351238, + "reward_change_min": -0.5167406238615513, + "reward_change_std": 0.19862729497253895, + "reward_std": 0.6697065494954586, + "rewards/cosine_scaled_reward": -0.1733427420258522, + "rewards/format_reward": 0.6458333358168602, + "step": 250 + }, + { + "advantage_max": 1.6214554160833359, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -0.8437067344784737, + "advantage_std": 0.8904179111123085, + "completion_length": 1860.6250610351562, + "epoch": 0.28685714285714287, + "grad_norm": 0.37185877561569214, + "kl": 0.2660369873046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0244, + "reward": 0.2744586355984211, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2744586355984211, + "reward_after_std": 0.8904179111123085, + "reward_before_mean": 0.6066321942489594, + "reward_before_std": 0.858960397541523, + "reward_change_max": 0.0, + "reward_change_mean": -0.33217359334230423, + "reward_change_min": -0.5966192334890366, + "reward_change_std": 0.2286313408985734, + "reward_std": 0.8904179409146309, + "rewards/cosine_scaled_reward": -0.0508505729958415, + "rewards/format_reward": 0.7083333414047956, + "step": 251 + }, + { + "advantage_max": 1.134083978831768, + "advantage_mean": 6.208817349140361e-10, + "advantage_min": -0.6665167361497879, + "advantage_std": 0.6361800581216812, + "completion_length": 2859.3958587646484, + "epoch": 0.288, + "grad_norm": 0.4567761719226837, + "kl": 0.44671630859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0348, + "reward": -0.03838689235271886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03838689235271886, + "reward_after_std": 0.6361800506711006, + "reward_before_mean": 0.21587416902184486, + "reward_before_std": 0.6247138157486916, + "reward_change_max": 0.0002838447690010071, + "reward_change_mean": -0.2542610792443156, + "reward_change_min": -0.4498459994792938, + "reward_change_std": 0.1773118730634451, + "reward_std": 0.6361800730228424, + "rewards/cosine_scaled_reward": -0.11081291688606143, + "rewards/format_reward": 0.4375000074505806, + "step": 252 + }, + { + "advantage_max": 1.4208215326070786, + "advantage_mean": -3.4148494476582414e-08, + "advantage_min": -0.8858304098248482, + "advantage_std": 0.8445864953100681, + "completion_length": 2464.0834045410156, + "epoch": 0.28914285714285715, + "grad_norm": 0.8337281346321106, + "kl": 0.365966796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0728, + "reward": 0.28498442959971726, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.28498442959971726, + "reward_after_std": 0.8445864543318748, + "reward_before_mean": 0.6343209322076291, + "reward_before_std": 0.858670748770237, + "reward_change_max": 0.0, + "reward_change_mean": -0.3493365282192826, + "reward_change_min": -0.6889376491308212, + "reward_change_std": 0.2688952349126339, + "reward_std": 0.844586469233036, + "rewards/cosine_scaled_reward": -0.026589547283947468, + "rewards/format_reward": 0.6875000167638063, + "step": 253 + }, + { + "advantage_max": 1.2321031764149666, + "advantage_mean": 4.346172310931706e-09, + "advantage_min": -0.680670976638794, + "advantage_std": 0.725123506039381, + "completion_length": 3151.375030517578, + "epoch": 0.29028571428571426, + "grad_norm": 0.609215497970581, + "kl": 0.606689453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0443, + "reward": -0.049922436475753784, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.049922436475753784, + "reward_after_std": 0.7251235023140907, + "reward_before_mean": 0.19461863487958908, + "reward_before_std": 0.7533079758286476, + "reward_change_max": 0.002223499119281769, + "reward_change_mean": -0.2445410778746009, + "reward_change_min": -0.5359072387218475, + "reward_change_std": 0.2124933786690235, + "reward_std": 0.7251235097646713, + "rewards/cosine_scaled_reward": -0.10060735675506294, + "rewards/format_reward": 0.39583333767950535, + "step": 254 + }, + { + "advantage_max": 1.402302596718073, + "advantage_mean": 1.4280279847511679e-08, + "advantage_min": -0.7114418521523476, + "advantage_std": 0.8122603800147772, + "completion_length": 2964.687530517578, + "epoch": 0.2914285714285714, + "grad_norm": 0.6498525142669678, + "kl": 0.58319091796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0347, + "reward": -0.07798312418162823, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07798312418162823, + "reward_after_std": 0.8122603800147772, + "reward_before_mean": 0.14547542482614517, + "reward_before_std": 0.8470509238541126, + "reward_change_max": 0.0012149512767791748, + "reward_change_mean": -0.22345853736624122, + "reward_change_min": -0.5182074159383774, + "reward_change_std": 0.20545397838577628, + "reward_std": 0.8122603949159384, + "rewards/cosine_scaled_reward": -0.15642895735800266, + "rewards/format_reward": 0.45833333767950535, + "step": 255 + }, + { + "advantage_max": 1.3165737465023994, + "advantage_mean": 6.519258216597379e-09, + "advantage_min": -0.674428217113018, + "advantage_std": 0.7454143799841404, + "completion_length": 2872.625030517578, + "epoch": 0.2925714285714286, + "grad_norm": 0.48637011647224426, + "kl": 0.48291015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.095153756157051e-07, + "loss": 0.056, + "reward": -0.01483570970594883, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.01483570970594883, + "reward_after_std": 0.7454143762588501, + "reward_before_mean": 0.23627811670303345, + "reward_before_std": 0.7435798235237598, + "reward_change_max": 0.0004970431327819824, + "reward_change_mean": -0.25111383642069995, + "reward_change_min": -0.5052115879952908, + "reward_change_std": 0.1974979422520846, + "reward_std": 0.7454143948853016, + "rewards/cosine_scaled_reward": -0.15269426861777902, + "rewards/format_reward": 0.5416666734963655, + "step": 256 + }, + { + "advantage_max": 1.5429691597819328, + "advantage_mean": -6.208817127095756e-09, + "advantage_min": -0.9041449502110481, + "advantage_std": 0.9234105013310909, + "completion_length": 3000.854232788086, + "epoch": 0.2937142857142857, + "grad_norm": 0.707891583442688, + "kl": 0.54736328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0864, + "reward": 0.23718170449137688, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23718170449137688, + "reward_after_std": 0.9234105162322521, + "reward_before_mean": 0.5629798602312803, + "reward_before_std": 0.9633310399949551, + "reward_change_max": 0.0, + "reward_change_mean": -0.32579815573990345, + "reward_change_min": -0.6817917302250862, + "reward_change_std": 0.27525400929152966, + "reward_std": 0.9234105236828327, + "rewards/cosine_scaled_reward": 0.041906584054231644, + "rewards/format_reward": 0.47916667722165585, + "step": 257 + }, + { + "advantage_max": 1.7362965643405914, + "advantage_mean": -1.8626452602532595e-08, + "advantage_min": -1.136706404387951, + "advantage_std": 1.0333962552249432, + "completion_length": 2941.5209045410156, + "epoch": 0.2948571428571429, + "grad_norm": 1.655138373374939, + "kl": 0.515380859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.032817857379256e-07, + "loss": 0.1134, + "reward": 0.23384379362687469, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23384379362687469, + "reward_after_std": 1.0333962105214596, + "reward_before_mean": 0.5438088413793594, + "reward_before_std": 1.09098768979311, + "reward_change_max": 0.0008084475994110107, + "reward_change_mean": -0.3099650777876377, + "reward_change_min": -0.6670037191361189, + "reward_change_std": 0.27815736550837755, + "reward_std": 1.0333962254226208, + "rewards/cosine_scaled_reward": 0.011487742187455297, + "rewards/format_reward": 0.5208333525806665, + "step": 258 + }, + { + "advantage_max": 1.3132545426487923, + "advantage_mean": -4.967053546245381e-09, + "advantage_min": -0.6945758275687695, + "advantage_std": 0.7458698004484177, + "completion_length": 2559.854217529297, + "epoch": 0.296, + "grad_norm": 0.5919146537780762, + "kl": 0.54998779296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 6.001610194928464e-07, + "loss": 0.03, + "reward": 0.3503173356875777, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3503173356875777, + "reward_after_std": 0.7458698116242886, + "reward_before_mean": 0.7317201669793576, + "reward_before_std": 0.7002310231328011, + "reward_change_max": 4.976987838745117e-05, + "reward_change_mean": -0.38140283338725567, + "reward_change_min": -0.6384340189397335, + "reward_change_std": 0.24794460413977504, + "reward_std": 0.7458698600530624, + "rewards/cosine_scaled_reward": 0.063776751208934, + "rewards/format_reward": 0.6041666716337204, + "step": 259 + }, + { + "advantage_max": 1.2585435509681702, + "advantage_mean": -1.2417635142369932e-08, + "advantage_min": -0.7303697317838669, + "advantage_std": 0.7169954068958759, + "completion_length": 2164.166679382324, + "epoch": 0.29714285714285715, + "grad_norm": 0.7633321285247803, + "kl": 0.45684814453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0247, + "reward": 0.5059811770915985, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5059811770915985, + "reward_after_std": 0.716995369642973, + "reward_before_mean": 0.947710856795311, + "reward_before_std": 0.6634920183569193, + "reward_change_max": 0.0, + "reward_change_mean": -0.4417296778410673, + "reward_change_min": -0.7061281353235245, + "reward_change_std": 0.2711289385333657, + "reward_std": 0.7169954031705856, + "rewards/cosine_scaled_reward": 0.1613554283976555, + "rewards/format_reward": 0.6250000037252903, + "step": 260 + }, + { + "advantage_max": 0.6871362961828709, + "advantage_mean": 1.0865430277728905e-08, + "advantage_min": -0.5955430008471012, + "advantage_std": 0.44926629960536957, + "completion_length": 3026.4375610351562, + "epoch": 0.29828571428571427, + "grad_norm": 0.6654581427574158, + "kl": 0.636474609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0423, + "reward": -0.23231990821659565, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23231990821659565, + "reward_after_std": 0.44926629960536957, + "reward_before_mean": -0.013448119163513184, + "reward_before_std": 0.4752881024032831, + "reward_change_max": 0.001972891390323639, + "reward_change_mean": -0.21887180488556623, + "reward_change_min": -0.3933011367917061, + "reward_change_std": 0.16985276667401195, + "reward_std": 0.44926630333065987, + "rewards/cosine_scaled_reward": -0.2254740484058857, + "rewards/format_reward": 0.4375000111758709, + "step": 261 + }, + { + "advantage_max": 0.953107200562954, + "advantage_mean": 6.519257911286047e-09, + "advantage_min": -0.5478541739284992, + "advantage_std": 0.5378349423408508, + "completion_length": 2944.166763305664, + "epoch": 0.29942857142857143, + "grad_norm": 0.6682643294334412, + "kl": 0.56689453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0425, + "reward": -0.11213955376297235, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11213955376297235, + "reward_after_std": 0.5378349535167217, + "reward_before_mean": 0.12946981191635132, + "reward_before_std": 0.5200149789452553, + "reward_change_max": 0.000765681266784668, + "reward_change_mean": -0.24160936230327934, + "reward_change_min": -0.44526655226945877, + "reward_change_std": 0.16583187272772193, + "reward_std": 0.5378349870443344, + "rewards/cosine_scaled_reward": -0.20609844103455544, + "rewards/format_reward": 0.5416666846722364, + "step": 262 + }, + { + "advantage_max": 1.180128302425146, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.6777870059013367, + "advantage_std": 0.6769901290535927, + "completion_length": 2966.1875610351562, + "epoch": 0.30057142857142854, + "grad_norm": 0.8927240967750549, + "kl": 0.61474609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0346, + "reward": -0.19135123770684004, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19135123770684004, + "reward_after_std": 0.6769901514053345, + "reward_before_mean": 0.005868307780474424, + "reward_before_std": 0.6968195661902428, + "reward_change_max": 0.001687467098236084, + "reward_change_mean": -0.1972195482812822, + "reward_change_min": -0.4410099871456623, + "reward_change_std": 0.18090852163732052, + "reward_std": 0.6769901663064957, + "rewards/cosine_scaled_reward": -0.23664918635040522, + "rewards/format_reward": 0.4791666716337204, + "step": 263 + }, + { + "advantage_max": 1.276852548122406, + "advantage_mean": -5.551115123125783e-17, + "advantage_min": -0.8147817477583885, + "advantage_std": 0.7658328823745251, + "completion_length": 2917.0000915527344, + "epoch": 0.3017142857142857, + "grad_norm": 0.7383478283882141, + "kl": 0.48486328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.845235626570683e-07, + "loss": 0.062, + "reward": 0.05471212463453412, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.05471212463453412, + "reward_after_std": 0.7658328823745251, + "reward_before_mean": 0.3321605040691793, + "reward_before_std": 0.7957486398518085, + "reward_change_max": 0.0, + "reward_change_mean": -0.27744837664067745, + "reward_change_min": -0.5727364979684353, + "reward_change_std": 0.23048890847712755, + "reward_std": 0.7658329159021378, + "rewards/cosine_scaled_reward": -0.13600308820605278, + "rewards/format_reward": 0.6041666809469461, + "step": 264 + }, + { + "advantage_max": 1.0988817438483238, + "advantage_mean": 1.3659398501175701e-08, + "advantage_min": -0.7619898840785027, + "advantage_std": 0.6942568942904472, + "completion_length": 2722.8333892822266, + "epoch": 0.3028571428571429, + "grad_norm": 0.3977333605289459, + "kl": 0.427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0308, + "reward": 0.09951317869126797, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09951317869126797, + "reward_after_std": 0.6942568644881248, + "reward_before_mean": 0.40570926177315414, + "reward_before_std": 0.7324245609343052, + "reward_change_max": 0.0006006136536598206, + "reward_change_mean": -0.30619606003165245, + "reward_change_min": -0.6168924458324909, + "reward_change_std": 0.24154386390000582, + "reward_std": 0.6942568868398666, + "rewards/cosine_scaled_reward": -0.10964538622647524, + "rewards/format_reward": 0.625000013038516, + "step": 265 + }, + { + "advantage_max": 1.1252152770757675, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.7847601100802422, + "advantage_std": 0.6905211433768272, + "completion_length": 3088.0625915527344, + "epoch": 0.304, + "grad_norm": 0.3700275421142578, + "kl": 0.4598388671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.78255733788191e-07, + "loss": 0.035, + "reward": -0.05057033384218812, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05057033384218812, + "reward_after_std": 0.6905211508274078, + "reward_before_mean": 0.20164042292162776, + "reward_before_std": 0.7296537682414055, + "reward_change_max": 0.0005407780408859253, + "reward_change_mean": -0.25221075117588043, + "reward_change_min": -0.5302932150661945, + "reward_change_std": 0.2140982821583748, + "reward_std": 0.6905211806297302, + "rewards/cosine_scaled_reward": -0.18042979948222637, + "rewards/format_reward": 0.5625000111758709, + "step": 266 + }, + { + "advantage_max": 1.2583488374948502, + "advantage_mean": 5.551115123125783e-17, + "advantage_min": -0.6300838924944401, + "advantage_std": 0.7009607516229153, + "completion_length": 3292.500030517578, + "epoch": 0.30514285714285716, + "grad_norm": 0.42236313223838806, + "kl": 0.476318359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0466, + "reward": -0.255409243516624, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.255409243516624, + "reward_after_std": 0.7009607441723347, + "reward_before_mean": -0.08686720486730337, + "reward_before_std": 0.7180141918361187, + "reward_change_max": 0.0011328905820846558, + "reward_change_mean": -0.16854204889386892, + "reward_change_min": -0.38842369988560677, + "reward_change_std": 0.15719072706997395, + "reward_std": 0.7009607516229153, + "rewards/cosine_scaled_reward": -0.18926694057881832, + "rewards/format_reward": 0.29166667722165585, + "step": 267 + }, + { + "advantage_max": 1.2112119421362877, + "advantage_mean": -3.104407841902912e-10, + "advantage_min": -0.6363185346126556, + "advantage_std": 0.6808359436690807, + "completion_length": 2588.7084197998047, + "epoch": 0.3062857142857143, + "grad_norm": 0.6039485335350037, + "kl": 0.3270263671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0321, + "reward": -0.030109137838735478, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.030109137838735478, + "reward_after_std": 0.6808359213173389, + "reward_before_mean": 0.22252458706498146, + "reward_before_std": 0.665228683501482, + "reward_change_max": 0.00047681480646133423, + "reward_change_mean": -0.2526337383314967, + "reward_change_min": -0.5076783336699009, + "reward_change_std": 0.1878043320029974, + "reward_std": 0.6808359511196613, + "rewards/cosine_scaled_reward": -0.15957104857079685, + "rewards/format_reward": 0.541666679084301, + "step": 268 + }, + { + "advantage_max": 1.5567923858761787, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.8225489631295204, + "advantage_std": 0.8803973942995071, + "completion_length": 2870.8959197998047, + "epoch": 0.30742857142857144, + "grad_norm": 0.6920576095581055, + "kl": 0.34759521484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0488, + "reward": 0.11579827091190964, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11579827091190964, + "reward_after_std": 0.8803974017500877, + "reward_before_mean": 0.39466875046491623, + "reward_before_std": 0.8872611932456493, + "reward_change_max": 0.00016210228204727173, + "reward_change_mean": -0.2788704950362444, + "reward_change_min": -0.5925246551632881, + "reward_change_std": 0.22718221321702003, + "reward_std": 0.8803974017500877, + "rewards/cosine_scaled_reward": -0.08391562500037253, + "rewards/format_reward": 0.5625000074505806, + "step": 269 + }, + { + "advantage_max": 1.813984364271164, + "advantage_mean": -1.8005569923928988e-08, + "advantage_min": -0.8069485202431679, + "advantage_std": 0.9654024578630924, + "completion_length": 2783.166748046875, + "epoch": 0.30857142857142855, + "grad_norm": 0.8999072909355164, + "kl": 0.38671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0483, + "reward": 0.20181133039295673, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20181133039295673, + "reward_after_std": 0.9654024355113506, + "reward_before_mean": 0.4937496539205313, + "reward_before_std": 0.9233413189649582, + "reward_change_max": 0.0008331462740898132, + "reward_change_mean": -0.29193832352757454, + "reward_change_min": -0.5729988738894463, + "reward_change_std": 0.21221650764346123, + "reward_std": 0.9654024727642536, + "rewards/cosine_scaled_reward": -0.05520852329209447, + "rewards/format_reward": 0.6041666753590107, + "step": 270 + }, + { + "advantage_max": 1.2739427499473095, + "advantage_mean": 1.6142925274298392e-08, + "advantage_min": -0.7408364042639732, + "advantage_std": 0.770323583856225, + "completion_length": 2661.1458892822266, + "epoch": 0.3097142857142857, + "grad_norm": 0.38857465982437134, + "kl": 0.31219482421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0231, + "reward": 0.43555654399096966, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.43555654399096966, + "reward_after_std": 0.7703235819935799, + "reward_before_mean": 0.8508926667273045, + "reward_before_std": 0.7559794168919325, + "reward_change_max": 0.0006112903356552124, + "reward_change_mean": -0.41533607384189963, + "reward_change_min": -0.7344745993614197, + "reward_change_std": 0.29105067485943437, + "reward_std": 0.7703235857188702, + "rewards/cosine_scaled_reward": 0.11294632405042648, + "rewards/format_reward": 0.6250000018626451, + "step": 271 + }, + { + "advantage_max": 1.1698375567793846, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.7527714669704437, + "advantage_std": 0.6810107119381428, + "completion_length": 3082.666702270508, + "epoch": 0.31085714285714283, + "grad_norm": 0.6503217220306396, + "kl": 0.442138671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0602, + "reward": -0.02925921604037285, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.02925921604037285, + "reward_after_std": 0.6810106933116913, + "reward_before_mean": 0.22632784768939018, + "reward_before_std": 0.6936670579016209, + "reward_change_max": 0.0006357654929161072, + "reward_change_mean": -0.2555870823562145, + "reward_change_min": -0.484931293874979, + "reward_change_std": 0.20067433547228575, + "reward_std": 0.6810107082128525, + "rewards/cosine_scaled_reward": -0.07433607243001461, + "rewards/format_reward": 0.3750000074505806, + "step": 272 + }, + { + "advantage_max": 1.5366176590323448, + "advantage_mean": 1.862645426786713e-09, + "advantage_min": -0.8639146089553833, + "advantage_std": 0.8814141787588596, + "completion_length": 3012.2709045410156, + "epoch": 0.312, + "grad_norm": 0.5377759337425232, + "kl": 0.4683837890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0601, + "reward": 0.15391826815903187, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15391826815903187, + "reward_after_std": 0.8814141787588596, + "reward_before_mean": 0.45080176065675914, + "reward_before_std": 0.903516910970211, + "reward_change_max": 0.0003979429602622986, + "reward_change_mean": -0.29688349924981594, + "reward_change_min": -0.5701575763523579, + "reward_change_std": 0.2267187712714076, + "reward_std": 0.8814142197370529, + "rewards/cosine_scaled_reward": -0.003765794448554516, + "rewards/format_reward": 0.4583333395421505, + "step": 273 + }, + { + "advantage_max": 1.475723922252655, + "advantage_mean": -3.663202252646158e-08, + "advantage_min": -1.0043025985360146, + "advantage_std": 0.8919141329824924, + "completion_length": 1951.1667404174805, + "epoch": 0.31314285714285717, + "grad_norm": 0.43542540073394775, + "kl": 0.33087158203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0124, + "reward": 0.6142655089497566, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6142655089497566, + "reward_after_std": 0.891914140433073, + "reward_before_mean": 1.0786607032641768, + "reward_before_std": 0.8922014944255352, + "reward_change_max": 0.0006687715649604797, + "reward_change_mean": -0.4643952287733555, + "reward_change_min": -0.8292928412556648, + "reward_change_std": 0.32155177742242813, + "reward_std": 0.8919141665101051, + "rewards/cosine_scaled_reward": 0.13308035396039486, + "rewards/format_reward": 0.8125000111758709, + "step": 274 + }, + { + "advantage_max": 1.2398691214621067, + "advantage_mean": -2.1730860777502414e-08, + "advantage_min": -0.8742958381772041, + "advantage_std": 0.7562834247946739, + "completion_length": 2218.437545776367, + "epoch": 0.3142857142857143, + "grad_norm": 0.5172128081321716, + "kl": 0.32861328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.5e-07, + "loss": 0.0019, + "reward": 0.3955942359752953, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3955942359752953, + "reward_after_std": 0.7562834359705448, + "reward_before_mean": 0.7990798335522413, + "reward_before_std": 0.7605790868401527, + "reward_change_max": 0.0, + "reward_change_mean": -0.4034856390208006, + "reward_change_min": -0.6919118501245975, + "reward_change_std": 0.2679525539278984, + "reward_std": 0.7562834545969963, + "rewards/cosine_scaled_reward": 0.05578993167728186, + "rewards/format_reward": 0.6875000074505806, + "step": 275 + }, + { + "advantage_max": 1.663666844367981, + "advantage_mean": -1.490116141589226e-08, + "advantage_min": -0.941422164440155, + "advantage_std": 0.9734741970896721, + "completion_length": 2530.645896911621, + "epoch": 0.31542857142857145, + "grad_norm": 1.7971059083938599, + "kl": 0.385009765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0951, + "reward": 0.30931809917092323, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.30931809917092323, + "reward_after_std": 0.9734741821885109, + "reward_before_mean": 0.6490100165829062, + "reward_before_std": 1.0003498420119286, + "reward_change_max": 0.008046261966228485, + "reward_change_mean": -0.33969192765653133, + "reward_change_min": -0.7254288531839848, + "reward_change_std": 0.28267188742756844, + "reward_std": 0.9734742194414139, + "rewards/cosine_scaled_reward": 0.03283834829926491, + "rewards/format_reward": 0.5833333563059568, + "step": 276 + }, + { + "advantage_max": 1.3660626783967018, + "advantage_mean": -5.277494746769307e-09, + "advantage_min": -0.8298850581049919, + "advantage_std": 0.8108585998415947, + "completion_length": 2307.479232788086, + "epoch": 0.31657142857142856, + "grad_norm": 0.5035321116447449, + "kl": 0.3431396484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0175, + "reward": 0.30645604338496923, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.30645604338496923, + "reward_after_std": 0.8108585998415947, + "reward_before_mean": 0.6676031211391091, + "reward_before_std": 0.8139922991394997, + "reward_change_max": 0.0, + "reward_change_mean": -0.36114706844091415, + "reward_change_min": -0.6979051753878593, + "reward_change_std": 0.259663138538599, + "reward_std": 0.8108586072921753, + "rewards/cosine_scaled_reward": -0.009948452236130834, + "rewards/format_reward": 0.6875000111758709, + "step": 277 + }, + { + "advantage_max": 0.9520070180296898, + "advantage_mean": -1.2417635031347629e-08, + "advantage_min": -0.44027720391750336, + "advantage_std": 0.5101836659014225, + "completion_length": 2111.125045776367, + "epoch": 0.3177142857142857, + "grad_norm": 0.3610879182815552, + "kl": 0.4071044921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0474, + "reward": 0.31509879417717457, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.31509879417717457, + "reward_after_std": 0.5101836659014225, + "reward_before_mean": 0.707968354850891, + "reward_before_std": 0.3866867758333683, + "reward_change_max": 0.0011505857110023499, + "reward_change_mean": -0.39286957401782274, + "reward_change_min": -0.5869279652833939, + "reward_change_std": 0.223956735804677, + "reward_std": 0.5101836733520031, + "rewards/cosine_scaled_reward": 0.010234175249934196, + "rewards/format_reward": 0.6875000055879354, + "step": 278 + }, + { + "advantage_max": 1.2670928463339806, + "advantage_mean": 1.1175871339474952e-08, + "advantage_min": -0.832107700407505, + "advantage_std": 0.7472830601036549, + "completion_length": 3059.0834350585938, + "epoch": 0.31885714285714284, + "grad_norm": 0.7010309100151062, + "kl": 0.6044921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0514, + "reward": 0.1004374697804451, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1004374697804451, + "reward_after_std": 0.7472830601036549, + "reward_before_mean": 0.39622872904874384, + "reward_before_std": 0.7557075619697571, + "reward_change_max": 0.00023264437913894653, + "reward_change_mean": -0.2957912692800164, + "reward_change_min": -0.5839440189301968, + "reward_change_std": 0.23290821257978678, + "reward_std": 0.7472830787301064, + "rewards/cosine_scaled_reward": -0.12480231374502182, + "rewards/format_reward": 0.6458333507180214, + "step": 279 + }, + { + "advantage_max": 1.7708331495523453, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.936590850353241, + "advantage_std": 1.0220554992556572, + "completion_length": 2438.4166870117188, + "epoch": 0.32, + "grad_norm": 0.9222289323806763, + "kl": 0.440582275390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0523, + "reward": 0.6123909717425704, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6123909717425704, + "reward_after_std": 1.0220554918050766, + "reward_before_mean": 1.0539636346511543, + "reward_before_std": 1.0054830946028233, + "reward_change_max": 0.0, + "reward_change_mean": -0.44157261587679386, + "reward_change_min": -0.8539315573871136, + "reward_change_std": 0.32234959304332733, + "reward_std": 1.0220555067062378, + "rewards/cosine_scaled_reward": 0.16239846497774124, + "rewards/format_reward": 0.7291666716337204, + "step": 280 + }, + { + "advantage_max": 1.0117218270897865, + "advantage_mean": 1.4280279680978225e-08, + "advantage_min": -0.5438734255731106, + "advantage_std": 0.580887034535408, + "completion_length": 3199.937530517578, + "epoch": 0.3211428571428571, + "grad_norm": 0.811485767364502, + "kl": 0.572509765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0406, + "reward": -0.2502650732640177, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2502650732640177, + "reward_after_std": 0.5808870159089565, + "reward_before_mean": -0.06142948009073734, + "reward_before_std": 0.589481795206666, + "reward_change_max": 0.0005460679531097412, + "reward_change_mean": -0.18883559666574, + "reward_change_min": -0.42559648118913174, + "reward_change_std": 0.1617231946438551, + "reward_std": 0.5808870419859886, + "rewards/cosine_scaled_reward": -0.2494647353887558, + "rewards/format_reward": 0.43750000931322575, + "step": 281 + }, + { + "advantage_max": 1.3895000964403152, + "advantage_mean": -3.1044086745701804e-08, + "advantage_min": -0.9264363273978233, + "advantage_std": 0.8309589326381683, + "completion_length": 2574.9166870117188, + "epoch": 0.3222857142857143, + "grad_norm": 0.532403290271759, + "kl": 0.4725341796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0415, + "reward": 0.49094755947589874, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.49094755947589874, + "reward_after_std": 0.8309589587152004, + "reward_before_mean": 0.9181583793833852, + "reward_before_std": 0.8120024017989635, + "reward_change_max": 0.00045821070671081543, + "reward_change_mean": -0.42721083760261536, + "reward_change_min": -0.7255665622651577, + "reward_change_std": 0.29684863798320293, + "reward_std": 0.8309589698910713, + "rewards/cosine_scaled_reward": 0.08407919853925705, + "rewards/format_reward": 0.7500000186264515, + "step": 282 + }, + { + "advantage_max": 1.415271319448948, + "advantage_mean": -1.2417634254191512e-08, + "advantage_min": -0.9697859063744545, + "advantage_std": 0.8754711970686913, + "completion_length": 2855.1875915527344, + "epoch": 0.32342857142857145, + "grad_norm": 1.2471925020217896, + "kl": 0.5540771484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0888, + "reward": 0.20010646618902683, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20010646618902683, + "reward_after_std": 0.8754712045192719, + "reward_before_mean": 0.519549798220396, + "reward_before_std": 0.9254002720117569, + "reward_change_max": 0.0010265707969665527, + "reward_change_mean": -0.31944333389401436, + "reward_change_min": -0.6503037363290787, + "reward_change_std": 0.2723896815441549, + "reward_std": 0.8754712231457233, + "rewards/cosine_scaled_reward": -0.0006417706608772278, + "rewards/format_reward": 0.5208333469927311, + "step": 283 + }, + { + "advantage_max": 1.316909946501255, + "advantage_mean": -1.7384689243726825e-08, + "advantage_min": -0.8938746899366379, + "advantage_std": 0.7911999821662903, + "completion_length": 2411.3333740234375, + "epoch": 0.32457142857142857, + "grad_norm": 1.0863850116729736, + "kl": 0.50048828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0111, + "reward": 0.4276206409558654, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4276206409558654, + "reward_after_std": 0.7911999821662903, + "reward_before_mean": 0.8353298846632242, + "reward_before_std": 0.7929439060389996, + "reward_change_max": 0.0, + "reward_change_mean": -0.4077092222869396, + "reward_change_min": -0.7041095197200775, + "reward_change_std": 0.27274756878614426, + "reward_std": 0.7911999858915806, + "rewards/cosine_scaled_reward": 0.011414924636483192, + "rewards/format_reward": 0.8125000111758709, + "step": 284 + }, + { + "advantage_max": 1.259010173380375, + "advantage_mean": -3.1044088966147854e-09, + "advantage_min": -0.6755791082978249, + "advantage_std": 0.6990512013435364, + "completion_length": 2026.4375686645508, + "epoch": 0.32571428571428573, + "grad_norm": 0.7733309864997864, + "kl": 0.29052734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.186095868151436e-07, + "loss": -0.0132, + "reward": 0.11948005040176213, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11948005040176213, + "reward_after_std": 0.6990512199699879, + "reward_before_mean": 0.42276617558673024, + "reward_before_std": 0.6689119711518288, + "reward_change_max": 5.587935447692871e-05, + "reward_change_mean": -0.3032861463725567, + "reward_change_min": -0.5256478674709797, + "reward_change_std": 0.20443684607744217, + "reward_std": 0.6990512236952782, + "rewards/cosine_scaled_reward": -0.19486692734062672, + "rewards/format_reward": 0.8125000111758709, + "step": 285 + }, + { + "advantage_max": 1.4266880974173546, + "advantage_mean": -8.071462442860167e-09, + "advantage_min": -0.838430143892765, + "advantage_std": 0.7950329594314098, + "completion_length": 2675.8125610351562, + "epoch": 0.32685714285714285, + "grad_norm": 0.8457327485084534, + "kl": 0.55706787109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0748, + "reward": 0.14559801947325468, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14559801947325468, + "reward_after_std": 0.7950329519808292, + "reward_before_mean": 0.44482777640223503, + "reward_before_std": 0.7759469635784626, + "reward_change_max": 0.0, + "reward_change_mean": -0.2992297522723675, + "reward_change_min": -0.5666345283389091, + "reward_change_std": 0.21813244745135307, + "reward_std": 0.795032974332571, + "rewards/cosine_scaled_reward": -0.16300279181450605, + "rewards/format_reward": 0.7708333618938923, + "step": 286 + }, + { + "advantage_max": 1.0565427094697952, + "advantage_mean": -2.452482850134885e-08, + "advantage_min": -0.7084263153374195, + "advantage_std": 0.6352221257984638, + "completion_length": 2057.166732788086, + "epoch": 0.328, + "grad_norm": 0.8031435012817383, + "kl": 0.353179931640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0017, + "reward": 0.29547856375575066, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.29547856375575066, + "reward_after_std": 0.6352221146225929, + "reward_before_mean": 0.6729789730161428, + "reward_before_std": 0.622281976044178, + "reward_change_max": 0.0, + "reward_change_mean": -0.37750041857361794, + "reward_change_min": -0.6289332434535027, + "reward_change_std": 0.24394258856773376, + "reward_std": 0.635222140699625, + "rewards/cosine_scaled_reward": -0.048927186988294125, + "rewards/format_reward": 0.7708333358168602, + "step": 287 + }, + { + "advantage_max": 1.450631108134985, + "advantage_mean": -2.1730858223989458e-09, + "advantage_min": -0.7151788100600243, + "advantage_std": 0.8018929846584797, + "completion_length": 2653.562545776367, + "epoch": 0.3291428571428571, + "grad_norm": 0.7178438305854797, + "kl": 0.41448974609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0133, + "reward": 0.25840965658426285, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.25840965658426285, + "reward_after_std": 0.8018929846584797, + "reward_before_mean": 0.598530687391758, + "reward_before_std": 0.7543132728897035, + "reward_change_max": 0.00046640634536743164, + "reward_change_mean": -0.3401210466399789, + "reward_change_min": -0.6340159177780151, + "reward_change_std": 0.2446063496172428, + "reward_std": 0.8018930144608021, + "rewards/cosine_scaled_reward": -0.07573466026224196, + "rewards/format_reward": 0.7500000111758709, + "step": 288 + }, + { + "advantage_max": 1.0173869207501411, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.6042489297688007, + "advantage_std": 0.5885357595980167, + "completion_length": 2031.6042175292969, + "epoch": 0.3302857142857143, + "grad_norm": 0.49764305353164673, + "kl": 0.3008270263671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0369, + "reward": 0.2047883691266179, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2047883691266179, + "reward_after_std": 0.5885357577353716, + "reward_before_mean": 0.5541504016146064, + "reward_before_std": 0.5410357071086764, + "reward_change_max": 0.0004688054323196411, + "reward_change_mean": -0.34936204878613353, + "reward_change_min": -0.6108824927359819, + "reward_change_std": 0.2264309749007225, + "reward_std": 0.5885357577353716, + "rewards/cosine_scaled_reward": -0.10834147967398167, + "rewards/format_reward": 0.7708333414047956, + "step": 289 + }, + { + "advantage_max": 1.5400925129652023, + "advantage_mean": -2.23517425679276e-08, + "advantage_min": -0.8571957536041737, + "advantage_std": 0.902538351714611, + "completion_length": 2571.666778564453, + "epoch": 0.3314285714285714, + "grad_norm": 0.6938238143920898, + "kl": 0.488525390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0609, + "reward": 0.4056507070781663, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4056507070781663, + "reward_after_std": 0.9025383479893208, + "reward_before_mean": 0.7911003306508064, + "reward_before_std": 0.8982041031122208, + "reward_change_max": 0.0, + "reward_change_mean": -0.38544964604079723, + "reward_change_min": -0.7154050804674625, + "reward_change_std": 0.2800214570015669, + "reward_std": 0.9025383703410625, + "rewards/cosine_scaled_reward": -0.0002831774763762951, + "rewards/format_reward": 0.7916666716337204, + "step": 290 + }, + { + "advantage_max": 1.4336735494434834, + "advantage_mean": -2.60770322002557e-08, + "advantage_min": -0.819994043558836, + "advantage_std": 0.8136847987771034, + "completion_length": 2753.0625610351562, + "epoch": 0.3325714285714286, + "grad_norm": 0.5715336203575134, + "kl": 0.43505859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0546, + "reward": 0.29718087799847126, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.29718087799847126, + "reward_after_std": 0.8136848174035549, + "reward_before_mean": 0.6506573930382729, + "reward_before_std": 0.7926754802465439, + "reward_change_max": 0.0006195604801177979, + "reward_change_mean": -0.3534764889627695, + "reward_change_min": -0.6486410312354565, + "reward_change_std": 0.24405055306851864, + "reward_std": 0.8136848285794258, + "rewards/cosine_scaled_reward": -0.07050466444343328, + "rewards/format_reward": 0.7916666716337204, + "step": 291 + }, + { + "advantage_max": 1.1349475383758545, + "advantage_mean": -4.190951474747351e-09, + "advantage_min": -0.7262782752513885, + "advantage_std": 0.6686569154262543, + "completion_length": 2857.5001068115234, + "epoch": 0.33371428571428574, + "grad_norm": 0.3354578912258148, + "kl": 0.37042236328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0119, + "reward": 0.16059950197814032, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16059950197814032, + "reward_after_std": 0.6686569005250931, + "reward_before_mean": 0.487422039732337, + "reward_before_std": 0.6643938161432743, + "reward_change_max": 0.0005034282803535461, + "reward_change_mean": -0.3268225295469165, + "reward_change_min": -0.5673139244318008, + "reward_change_std": 0.22015255317091942, + "reward_std": 0.668656948953867, + "rewards/cosine_scaled_reward": -0.1104556662030518, + "rewards/format_reward": 0.7083333432674408, + "step": 292 + }, + { + "advantage_max": 1.0261386930942535, + "advantage_mean": -9.313226079221693e-09, + "advantage_min": -0.7676491439342499, + "advantage_std": 0.625335369259119, + "completion_length": 2113.916732788086, + "epoch": 0.33485714285714285, + "grad_norm": 0.466907262802124, + "kl": 0.21929931640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.93600044896063e-07, + "loss": 0.02, + "reward": 0.3123467434197664, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3123467434197664, + "reward_after_std": 0.6253353767096996, + "reward_before_mean": 0.6998563334345818, + "reward_before_std": 0.6170645132660866, + "reward_change_max": 0.00037410855293273926, + "reward_change_mean": -0.38750957138836384, + "reward_change_min": -0.6227229908108711, + "reward_change_std": 0.2519225236028433, + "reward_std": 0.6253353990614414, + "rewards/cosine_scaled_reward": -0.06673851422965527, + "rewards/format_reward": 0.833333358168602, + "step": 293 + }, + { + "advantage_max": 1.1145121417939663, + "advantage_mean": 0.0, + "advantage_min": -0.7262555472552776, + "advantage_std": 0.6630832739174366, + "completion_length": 2759.1875610351562, + "epoch": 0.336, + "grad_norm": 1.2123439311981201, + "kl": 0.4388427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0641, + "reward": -0.08887020824477077, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08887020824477077, + "reward_after_std": 0.6630832552909851, + "reward_before_mean": 0.14825415145605803, + "reward_before_std": 0.6850478500127792, + "reward_change_max": 0.0, + "reward_change_mean": -0.23712435737252235, + "reward_change_min": -0.481619268655777, + "reward_change_std": 0.19604469276964664, + "reward_std": 0.663083266466856, + "rewards/cosine_scaled_reward": -0.15503959357738495, + "rewards/format_reward": 0.45833334885537624, + "step": 294 + }, + { + "advantage_max": 1.190081775188446, + "advantage_mean": -1.552204376142896e-08, + "advantage_min": -0.9659006744623184, + "advantage_std": 0.7599161565303802, + "completion_length": 2853.8959197998047, + "epoch": 0.33714285714285713, + "grad_norm": 0.5325373411178589, + "kl": 0.446533203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0451, + "reward": 0.37088565714657307, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37088565714657307, + "reward_after_std": 0.759916141629219, + "reward_before_mean": 0.7698443452827632, + "reward_before_std": 0.797116793692112, + "reward_change_max": 0.0, + "reward_change_mean": -0.3989586550742388, + "reward_change_min": -0.7092138826847076, + "reward_change_std": 0.289244526065886, + "reward_std": 0.7599161565303802, + "rewards/cosine_scaled_reward": 0.05158880911767483, + "rewards/format_reward": 0.6666666753590107, + "step": 295 + }, + { + "advantage_max": 1.3938785642385483, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -0.9425633475184441, + "advantage_std": 0.8566170409321785, + "completion_length": 2891.2084045410156, + "epoch": 0.3382857142857143, + "grad_norm": 0.82956862449646, + "kl": 0.4888916015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0317, + "reward": 0.2034913629759103, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2034913629759103, + "reward_after_std": 0.8566170409321785, + "reward_before_mean": 0.5244719043839723, + "reward_before_std": 0.903838749974966, + "reward_change_max": 0.0012940466403961182, + "reward_change_mean": -0.320980554446578, + "reward_change_min": -0.6148804239928722, + "reward_change_std": 0.25917847361415625, + "reward_std": 0.8566170632839203, + "rewards/cosine_scaled_reward": -0.039847382344305515, + "rewards/format_reward": 0.6041666865348816, + "step": 296 + }, + { + "advantage_max": 1.4810718297958374, + "advantage_mean": 8.071462442860167e-09, + "advantage_min": -0.689074344933033, + "advantage_std": 0.7899037785828114, + "completion_length": 3057.8125610351562, + "epoch": 0.3394285714285714, + "grad_norm": 1.0994075536727905, + "kl": 0.466552734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0888, + "reward": -0.15424572816118598, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15424572816118598, + "reward_after_std": 0.7899037972092628, + "reward_before_mean": 0.03237947775050998, + "reward_before_std": 0.7759405970573425, + "reward_change_max": 0.0008534565567970276, + "reward_change_mean": -0.18662519752979279, + "reward_change_min": -0.35279146023094654, + "reward_change_std": 0.14328347519040108, + "reward_std": 0.7899038307368755, + "rewards/cosine_scaled_reward": -0.1921435995027423, + "rewards/format_reward": 0.4166666753590107, + "step": 297 + }, + { + "advantage_max": 1.3388825692236423, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.6641395017504692, + "advantage_std": 0.7280402891337872, + "completion_length": 2240.27091217041, + "epoch": 0.3405714285714286, + "grad_norm": 0.6047587990760803, + "kl": 0.290863037109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0508, + "reward": 0.3340705500449985, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3340705500449985, + "reward_after_std": 0.728040311485529, + "reward_before_mean": 0.7073127664625645, + "reward_before_std": 0.6394331082701683, + "reward_change_max": 0.0, + "reward_change_mean": -0.3732422087341547, + "reward_change_min": -0.6003867406398058, + "reward_change_std": 0.22959734685719013, + "reward_std": 0.728040311485529, + "rewards/cosine_scaled_reward": -0.010926967253908515, + "rewards/format_reward": 0.729166679084301, + "step": 298 + }, + { + "advantage_max": 1.0907358229160309, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.7735178507864475, + "advantage_std": 0.6482625380158424, + "completion_length": 2626.7500915527344, + "epoch": 0.3417142857142857, + "grad_norm": 0.729369580745697, + "kl": 0.315673828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.749540639777539e-07, + "loss": -0.0082, + "reward": 0.482487186556682, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.482487186556682, + "reward_after_std": 0.6482625491917133, + "reward_before_mean": 0.9276544786989689, + "reward_before_std": 0.6089337766170502, + "reward_change_max": 0.0, + "reward_change_mean": -0.44516725465655327, + "reward_change_min": -0.714932031929493, + "reward_change_std": 0.2732243649661541, + "reward_std": 0.6482625640928745, + "rewards/cosine_scaled_reward": 0.0784105621278286, + "rewards/format_reward": 0.7708333507180214, + "step": 299 + }, + { + "advantage_max": 1.2775380536913872, + "advantage_mean": -1.769512955607233e-08, + "advantage_min": -0.8028685301542282, + "advantage_std": 0.7613370381295681, + "completion_length": 2808.166717529297, + "epoch": 0.34285714285714286, + "grad_norm": 0.932621419429779, + "kl": 0.3758544921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.7185832004988133e-07, + "loss": -0.0015, + "reward": 0.24657689794548787, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24657689794548787, + "reward_after_std": 0.7613370008766651, + "reward_before_mean": 0.591511320322752, + "reward_before_std": 0.7680493295192719, + "reward_change_max": 0.0, + "reward_change_mean": -0.3449344504624605, + "reward_change_min": -0.6845588479191065, + "reward_change_std": 0.2547345831990242, + "reward_std": 0.7613370381295681, + "rewards/cosine_scaled_reward": -0.0792443419340998, + "rewards/format_reward": 0.7500000204890966, + "step": 300 + }, + { + "advantage_max": 1.2761215642094612, + "advantage_mean": -3.1044084525255755e-09, + "advantage_min": -0.8102873601019382, + "advantage_std": 0.7457788214087486, + "completion_length": 2302.5625762939453, + "epoch": 0.344, + "grad_norm": 0.4439832270145416, + "kl": 0.370361328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0372, + "reward": 0.20716924034059048, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20716924034059048, + "reward_after_std": 0.7457788102328777, + "reward_before_mean": 0.539110149256885, + "reward_before_std": 0.7404397539794445, + "reward_change_max": 0.0004123970866203308, + "reward_change_mean": -0.33194091357290745, + "reward_change_min": -0.6133960336446762, + "reward_change_std": 0.2363328319042921, + "reward_std": 0.7457788325846195, + "rewards/cosine_scaled_reward": -0.053361592814326286, + "rewards/format_reward": 0.6458333414047956, + "step": 301 + }, + { + "advantage_max": 1.0984256006777287, + "advantage_mean": -1.4280279292400166e-08, + "advantage_min": -0.6192377582192421, + "advantage_std": 0.6156471632421017, + "completion_length": 2580.812568664551, + "epoch": 0.34514285714285714, + "grad_norm": 0.5197194218635559, + "kl": 0.38067626953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0277, + "reward": 0.3671446368098259, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3671446368098259, + "reward_after_std": 0.6156471632421017, + "reward_before_mean": 0.7704743854701519, + "reward_before_std": 0.5410328283905983, + "reward_change_max": 0.0, + "reward_change_mean": -0.4033297412097454, + "reward_change_min": -0.6593691110610962, + "reward_change_std": 0.24221038818359375, + "reward_std": 0.6156471930444241, + "rewards/cosine_scaled_reward": -0.02101281937211752, + "rewards/format_reward": 0.8125000149011612, + "step": 302 + }, + { + "advantage_max": 1.365809440612793, + "advantage_mean": 1.241763414316921e-09, + "advantage_min": -0.6896493546664715, + "advantage_std": 0.7584427632391453, + "completion_length": 2604.4584197998047, + "epoch": 0.3462857142857143, + "grad_norm": 1.0810075998306274, + "kl": 0.54681396484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0158, + "reward": 0.0789760680636391, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0789760680636391, + "reward_after_std": 0.758442759513855, + "reward_before_mean": 0.3580004023388028, + "reward_before_std": 0.7291050627827644, + "reward_change_max": 0.0002533271908760071, + "reward_change_mean": -0.27902431692928076, + "reward_change_min": -0.536742877215147, + "reward_change_std": 0.20501808635890484, + "reward_std": 0.7584427706897259, + "rewards/cosine_scaled_reward": -0.14391647558659315, + "rewards/format_reward": 0.6458333414047956, + "step": 303 + }, + { + "advantage_max": 1.1893558949232101, + "advantage_mean": -1.4280279292400166e-08, + "advantage_min": -0.6290902234613895, + "advantage_std": 0.6690522991120815, + "completion_length": 2497.666717529297, + "epoch": 0.3474285714285714, + "grad_norm": 0.39485907554626465, + "kl": 0.443359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0437, + "reward": 0.2333060341188684, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2333060341188684, + "reward_after_std": 0.6690522991120815, + "reward_before_mean": 0.5804811692796648, + "reward_before_std": 0.6259120032191277, + "reward_change_max": 0.00022764503955841064, + "reward_change_mean": -0.34717513993382454, + "reward_change_min": -0.5779447555541992, + "reward_change_std": 0.21656623855233192, + "reward_std": 0.6690523251891136, + "rewards/cosine_scaled_reward": -0.12642608489841223, + "rewards/format_reward": 0.8333333507180214, + "step": 304 + }, + { + "advantage_max": 1.1147175133228302, + "advantage_mean": 6.20881956958641e-10, + "advantage_min": -0.6409763470292091, + "advantage_std": 0.6258622892200947, + "completion_length": 2925.6250610351562, + "epoch": 0.3485714285714286, + "grad_norm": 0.4802895784378052, + "kl": 0.4498291015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0614, + "reward": 0.07837151922285557, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07837151922285557, + "reward_after_std": 0.6258622966706753, + "reward_before_mean": 0.37614856101572514, + "reward_before_std": 0.5923885479569435, + "reward_change_max": 0.0028682053089141846, + "reward_change_mean": -0.29777705390006304, + "reward_change_min": -0.5343084167689085, + "reward_change_std": 0.20146138314157724, + "reward_std": 0.6258623190224171, + "rewards/cosine_scaled_reward": -0.12442572601139545, + "rewards/format_reward": 0.6250000093132257, + "step": 305 + }, + { + "advantage_max": 1.3190920874476433, + "advantage_mean": -1.2417634476236117e-08, + "advantage_min": -0.7708496078848839, + "advantage_std": 0.7822548560798168, + "completion_length": 2487.1458892822266, + "epoch": 0.3497142857142857, + "grad_norm": 0.4971684217453003, + "kl": 0.333984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0417, + "reward": 0.5616477080620825, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5616477080620825, + "reward_after_std": 0.7822548523545265, + "reward_before_mean": 1.0177391674369574, + "reward_before_std": 0.7489114031195641, + "reward_change_max": 0.0, + "reward_change_mean": -0.4560914523899555, + "reward_change_min": -0.7948772348463535, + "reward_change_std": 0.29982269182801247, + "reward_std": 0.7822548858821392, + "rewards/cosine_scaled_reward": 0.13386957813054323, + "rewards/format_reward": 0.7500000223517418, + "step": 306 + }, + { + "advantage_max": 1.3895053565502167, + "advantage_mean": -1.1796753296433593e-08, + "advantage_min": -0.8578041307628155, + "advantage_std": 0.8069791980087757, + "completion_length": 2686.4375610351562, + "epoch": 0.35085714285714287, + "grad_norm": 1.477962613105774, + "kl": 0.44000244140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0864, + "reward": 0.186143385944888, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.186143385944888, + "reward_after_std": 0.8069791980087757, + "reward_before_mean": 0.5036964304745197, + "reward_before_std": 0.8120075799524784, + "reward_change_max": 0.0, + "reward_change_mean": -0.3175530396401882, + "reward_change_min": -0.6073685400187969, + "reward_change_std": 0.23993523512035608, + "reward_std": 0.8069792166352272, + "rewards/cosine_scaled_reward": -0.07106846524402499, + "rewards/format_reward": 0.6458333414047956, + "step": 307 + }, + { + "advantage_max": 1.0783714689314365, + "advantage_mean": 1.3659398112597643e-08, + "advantage_min": -0.8203104771673679, + "advantage_std": 0.6680172383785248, + "completion_length": 2831.5000610351562, + "epoch": 0.352, + "grad_norm": 0.9542275667190552, + "kl": 0.50390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0664, + "reward": -0.11247891874518245, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11247891874518245, + "reward_after_std": 0.6680172309279442, + "reward_before_mean": 0.1211111843585968, + "reward_before_std": 0.7149896770715714, + "reward_change_max": 7.31348991394043e-05, + "reward_change_mean": -0.23359010089188814, + "reward_change_min": -0.4645574577152729, + "reward_change_std": 0.20687062293291092, + "reward_std": 0.6680172458291054, + "rewards/cosine_scaled_reward": -0.21027773432433605, + "rewards/format_reward": 0.5416666809469461, + "step": 308 + }, + { + "advantage_max": 1.5179293677210808, + "advantage_mean": -2.048909714114089e-08, + "advantage_min": -0.8129667229950428, + "advantage_std": 0.8496960774064064, + "completion_length": 2704.0209350585938, + "epoch": 0.35314285714285715, + "grad_norm": 0.5584425330162048, + "kl": 0.4093017578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0375, + "reward": 0.24515102710574865, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24515102710574865, + "reward_after_std": 0.8496960625052452, + "reward_before_mean": 0.5750349033623934, + "reward_before_std": 0.8240993618965149, + "reward_change_max": 0.0006732270121574402, + "reward_change_mean": -0.3298839027993381, + "reward_change_min": -0.6384319141507149, + "reward_change_std": 0.239481333643198, + "reward_std": 0.8496960885822773, + "rewards/cosine_scaled_reward": -0.06664922530762851, + "rewards/format_reward": 0.7083333414047956, + "step": 309 + }, + { + "advantage_max": 1.1005538403987885, + "advantage_mean": -8.381903476850638e-09, + "advantage_min": -0.5918949320912361, + "advantage_std": 0.6197176352143288, + "completion_length": 2167.2708892822266, + "epoch": 0.35428571428571426, + "grad_norm": 0.3327484726905823, + "kl": 0.37042236328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0355, + "reward": 0.08579268056200817, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08579268056200817, + "reward_after_std": 0.6197176389396191, + "reward_before_mean": 0.38630370143800974, + "reward_before_std": 0.5802172459661961, + "reward_change_max": 0.0003741607069969177, + "reward_change_mean": -0.3005110053345561, + "reward_change_min": -0.5397593379020691, + "reward_change_std": 0.1986795188859105, + "reward_std": 0.6197176463901997, + "rewards/cosine_scaled_reward": -0.18184817489236593, + "rewards/format_reward": 0.7500000111758709, + "step": 310 + }, + { + "advantage_max": 1.617615930736065, + "advantage_mean": -2.4835269396561444e-08, + "advantage_min": -1.0272089317440987, + "advantage_std": 0.9101916253566742, + "completion_length": 2331.1459045410156, + "epoch": 0.3554285714285714, + "grad_norm": 0.9486208558082581, + "kl": 0.318389892578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0806, + "reward": 0.6239680799189955, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6239680799189955, + "reward_after_std": 0.9101915881037712, + "reward_before_mean": 1.0810477957129478, + "reward_before_std": 0.8660503253340721, + "reward_change_max": 0.0, + "reward_change_mean": -0.45707971416413784, + "reward_change_min": -0.7482115887105465, + "reward_change_std": 0.2896172106266022, + "reward_std": 0.9101916253566742, + "rewards/cosine_scaled_reward": 0.1238572234287858, + "rewards/format_reward": 0.8333333507180214, + "step": 311 + }, + { + "advantage_max": 1.2817089334130287, + "advantage_mean": -2.1730860721991263e-08, + "advantage_min": -0.692761953920126, + "advantage_std": 0.7200740836560726, + "completion_length": 2084.145896911621, + "epoch": 0.3565714285714286, + "grad_norm": 0.6255805492401123, + "kl": 0.417327880859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.350494089288943e-07, + "loss": 0.037, + "reward": 0.521587735041976, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.521587735041976, + "reward_after_std": 0.7200741060078144, + "reward_before_mean": 0.9654922960326076, + "reward_before_std": 0.6313992403447628, + "reward_change_max": 0.00043123960494995117, + "reward_change_mean": -0.4439045786857605, + "reward_change_min": -0.6904230825603008, + "reward_change_std": 0.26942718401551247, + "reward_std": 0.720074113458395, + "rewards/cosine_scaled_reward": 0.05566280521452427, + "rewards/format_reward": 0.8541666716337204, + "step": 312 + }, + { + "advantage_max": 1.5071654841303825, + "advantage_mean": -4.6566128730773926e-09, + "advantage_min": -0.8955875560641289, + "advantage_std": 0.8493525125086308, + "completion_length": 2596.81258392334, + "epoch": 0.3577142857142857, + "grad_norm": 1.359320044517517, + "kl": 0.553924560546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0083, + "reward": 0.5998956672847271, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5998956672847271, + "reward_after_std": 0.8493525348603725, + "reward_before_mean": 1.054509874433279, + "reward_before_std": 0.7884119637310505, + "reward_change_max": 0.00043067336082458496, + "reward_change_mean": -0.4546141605824232, + "reward_change_min": -0.7211263254284859, + "reward_change_std": 0.2862904816865921, + "reward_std": 0.849352553486824, + "rewards/cosine_scaled_reward": 0.1105882371775806, + "rewards/format_reward": 0.8333333507180214, + "step": 313 + }, + { + "advantage_max": 1.2207506373524666, + "advantage_mean": -2.17308601113686e-08, + "advantage_min": -0.8295302875339985, + "advantage_std": 0.718177042901516, + "completion_length": 2326.1459045410156, + "epoch": 0.3588571428571429, + "grad_norm": 0.5819482207298279, + "kl": 0.472625732421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0495, + "reward": 0.5721183009445667, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5721183009445667, + "reward_after_std": 0.7181770466268063, + "reward_before_mean": 1.0381627357564867, + "reward_before_std": 0.6686963923275471, + "reward_change_max": 0.0, + "reward_change_mean": -0.4660444292239845, + "reward_change_min": -0.7502425312995911, + "reward_change_std": 0.29036774951964617, + "reward_std": 0.7181770764291286, + "rewards/cosine_scaled_reward": 0.1024146843701601, + "rewards/format_reward": 0.833333333954215, + "step": 314 + }, + { + "advantage_max": 1.541440226137638, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.7856865674257278, + "advantage_std": 0.8436717689037323, + "completion_length": 2587.687545776367, + "epoch": 0.36, + "grad_norm": 0.8492938280105591, + "kl": 0.5152587890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0423, + "reward": 0.2977000498212874, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2977000498212874, + "reward_after_std": 0.8436717763543129, + "reward_before_mean": 0.6422043442726135, + "reward_before_std": 0.7983835525810719, + "reward_change_max": 0.00015465915203094482, + "reward_change_mean": -0.3445042800158262, + "reward_change_min": -0.579395305365324, + "reward_change_std": 0.23542877286672592, + "reward_std": 0.8436717912554741, + "rewards/cosine_scaled_reward": -0.04348117241170257, + "rewards/format_reward": 0.7291666902601719, + "step": 315 + }, + { + "advantage_max": 1.1138798967003822, + "advantage_mean": 1.1102230246251565e-16, + "advantage_min": -0.6890594810247421, + "advantage_std": 0.6519434526562691, + "completion_length": 3087.9376220703125, + "epoch": 0.36114285714285715, + "grad_norm": 0.9313336610794067, + "kl": 0.517822265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0346, + "reward": 0.045592143665999174, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.045592143665999174, + "reward_after_std": 0.6519434675574303, + "reward_before_mean": 0.33301879093050957, + "reward_before_std": 0.649714894592762, + "reward_change_max": 0.0, + "reward_change_mean": -0.28742665797472, + "reward_change_min": -0.5346714928746223, + "reward_change_std": 0.2103767693042755, + "reward_std": 0.6519434861838818, + "rewards/cosine_scaled_reward": -0.198073947802186, + "rewards/format_reward": 0.7291666902601719, + "step": 316 + }, + { + "advantage_max": 1.530574381351471, + "advantage_mean": -8.07146216530441e-09, + "advantage_min": -1.0632232949137688, + "advantage_std": 0.9659229815006256, + "completion_length": 2734.979232788086, + "epoch": 0.36228571428571427, + "grad_norm": 0.9145365953445435, + "kl": 0.66119384765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0646, + "reward": 0.2181464321911335, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2181464321911335, + "reward_after_std": 0.9659229628741741, + "reward_before_mean": 0.5356616548378952, + "reward_before_std": 1.0483863092958927, + "reward_change_max": 0.00044549256563186646, + "reward_change_mean": -0.31751522794365883, + "reward_change_min": -0.7150661610066891, + "reward_change_std": 0.3014825861901045, + "reward_std": 0.9659229926764965, + "rewards/cosine_scaled_reward": -0.05508585087954998, + "rewards/format_reward": 0.6458333469927311, + "step": 317 + }, + { + "advantage_max": 1.8403086960315704, + "advantage_mean": -3.042320562141043e-08, + "advantage_min": -0.8629412576556206, + "advantage_std": 0.9772805720567703, + "completion_length": 2202.8334045410156, + "epoch": 0.36342857142857143, + "grad_norm": 0.6073242425918579, + "kl": 0.37530517578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0146, + "reward": 0.46245671808719635, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.46245671808719635, + "reward_after_std": 0.9772805646061897, + "reward_before_mean": 0.8472766764461994, + "reward_before_std": 0.8918176591396332, + "reward_change_max": 0.0, + "reward_change_mean": -0.38481999561190605, + "reward_change_min": -0.6207333505153656, + "reward_change_std": 0.23993216827511787, + "reward_std": 0.9772805869579315, + "rewards/cosine_scaled_reward": -0.01386166550219059, + "rewards/format_reward": 0.8750000149011612, + "step": 318 + }, + { + "advantage_max": 1.1768467128276825, + "advantage_mean": -9.313225912688239e-09, + "advantage_min": -0.6424081809818745, + "advantage_std": 0.6727948747575283, + "completion_length": 2646.0625915527344, + "epoch": 0.36457142857142855, + "grad_norm": 0.823601245880127, + "kl": 0.4171142578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0132, + "reward": 0.19451459869742393, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19451459869742393, + "reward_after_std": 0.6727948673069477, + "reward_before_mean": 0.5286380238831043, + "reward_before_std": 0.6407083608210087, + "reward_change_max": 0.0, + "reward_change_mean": -0.33412344940006733, + "reward_change_min": -0.583837378770113, + "reward_change_std": 0.2194485031068325, + "reward_std": 0.6727948747575283, + "rewards/cosine_scaled_reward": -0.14193099546901067, + "rewards/format_reward": 0.8125000149011612, + "step": 319 + }, + { + "advantage_max": 1.5140546262264252, + "advantage_mean": -1.179675274132208e-08, + "advantage_min": -0.8545507602393627, + "advantage_std": 0.8494656905531883, + "completion_length": 1993.4791946411133, + "epoch": 0.3657142857142857, + "grad_norm": 0.4081292748451233, + "kl": 0.28387451171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0282, + "reward": 0.45085313729941845, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.45085313729941845, + "reward_after_std": 0.8494656905531883, + "reward_before_mean": 0.8511345311999321, + "reward_before_std": 0.7997904568910599, + "reward_change_max": 0.00048110634088516235, + "reward_change_mean": -0.4002814181149006, + "reward_change_min": -0.687841709703207, + "reward_change_std": 0.25973583571612835, + "reward_std": 0.8494657054543495, + "rewards/cosine_scaled_reward": -0.02234938833862543, + "rewards/format_reward": 0.8958333507180214, + "step": 320 + }, + { + "advantage_max": 1.6693382859230042, + "advantage_mean": -8.071462331837864e-09, + "advantage_min": -0.9148355275392532, + "advantage_std": 0.9250027239322662, + "completion_length": 2250.4583892822266, + "epoch": 0.3668571428571429, + "grad_norm": 0.8360119462013245, + "kl": 0.370880126953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.079579333738039e-07, + "loss": 0.054, + "reward": 0.5935531364229973, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5935531364229973, + "reward_after_std": 0.9250027164816856, + "reward_before_mean": 1.0354133360087872, + "reward_before_std": 0.8597222454845905, + "reward_change_max": 0.0005154833197593689, + "reward_change_mean": -0.4418601803481579, + "reward_change_min": -0.750252865254879, + "reward_change_std": 0.28662803769111633, + "reward_std": 0.9250027611851692, + "rewards/cosine_scaled_reward": 0.0697899884544313, + "rewards/format_reward": 0.895833358168602, + "step": 321 + }, + { + "advantage_max": 1.4949319586157799, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -0.6884929984807968, + "advantage_std": 0.8232560381293297, + "completion_length": 2184.0417404174805, + "epoch": 0.368, + "grad_norm": 1.4889612197875977, + "kl": 0.39019775390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0637, + "reward": 0.5103768724948168, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5103768724948168, + "reward_after_std": 0.8232560381293297, + "reward_before_mean": 0.9297751300036907, + "reward_before_std": 0.7539035603404045, + "reward_change_max": 0.0, + "reward_change_mean": -0.4193982556462288, + "reward_change_min": -0.7438863329589367, + "reward_change_std": 0.26376455649733543, + "reward_std": 0.8232560753822327, + "rewards/cosine_scaled_reward": 0.027387551497668028, + "rewards/format_reward": 0.8750000074505806, + "step": 322 + }, + { + "advantage_max": 1.387131430208683, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.9888906627893448, + "advantage_std": 0.8229181803762913, + "completion_length": 2645.2709197998047, + "epoch": 0.36914285714285716, + "grad_norm": 1.0869429111480713, + "kl": 0.33551025390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0078, + "reward": 0.4324415000155568, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4324415000155568, + "reward_after_std": 0.8229181692004204, + "reward_before_mean": 0.8359145727008581, + "reward_before_std": 0.8219218775629997, + "reward_change_max": 0.00012808293104171753, + "reward_change_mean": -0.4034730438143015, + "reward_change_min": -0.695304848253727, + "reward_change_std": 0.27555028162896633, + "reward_std": 0.8229181692004204, + "rewards/cosine_scaled_reward": -0.009126080200076103, + "rewards/format_reward": 0.8541666865348816, + "step": 323 + }, + { + "advantage_max": 1.194041632115841, + "advantage_mean": -6.208817071584605e-09, + "advantage_min": -0.6673163548111916, + "advantage_std": 0.6746069677174091, + "completion_length": 2896.729263305664, + "epoch": 0.3702857142857143, + "grad_norm": 2.350710868835449, + "kl": 0.572998046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0048, + "reward": -0.01826178189367056, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.01826178189367056, + "reward_after_std": 0.6746069677174091, + "reward_before_mean": 0.23842260986566544, + "reward_before_std": 0.6667893007397652, + "reward_change_max": 0.0, + "reward_change_mean": -0.2566843908280134, + "reward_change_min": -0.5075510442256927, + "reward_change_std": 0.19278783723711967, + "reward_std": 0.67460697889328, + "rewards/cosine_scaled_reward": -0.21412204019725323, + "rewards/format_reward": 0.6666666828095913, + "step": 324 + }, + { + "advantage_max": 1.063421942293644, + "advantage_mean": 8.692344399818808e-09, + "advantage_min": -0.5553975813090801, + "advantage_std": 0.600383810698986, + "completion_length": 2946.7500534057617, + "epoch": 0.37142857142857144, + "grad_norm": 1.726870059967041, + "kl": 0.43701171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0116, + "reward": 0.06526811327785254, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06526811327785254, + "reward_after_std": 0.6003838032484055, + "reward_before_mean": 0.36415105359628797, + "reward_before_std": 0.5600848067551851, + "reward_change_max": 0.0, + "reward_change_mean": -0.29888293612748384, + "reward_change_min": -0.5316140241920948, + "reward_change_std": 0.19572586566209793, + "reward_std": 0.600383810698986, + "rewards/cosine_scaled_reward": -0.2033411506563425, + "rewards/format_reward": 0.7708333414047956, + "step": 325 + }, + { + "advantage_max": 1.0879326313734055, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.7306830547749996, + "advantage_std": 0.6596686951816082, + "completion_length": 2455.104217529297, + "epoch": 0.37257142857142855, + "grad_norm": 1.0197948217391968, + "kl": 0.259765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0394, + "reward": 0.2782456114073284, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2782456114073284, + "reward_after_std": 0.6596686840057373, + "reward_before_mean": 0.6505864365026355, + "reward_before_std": 0.6487043891102076, + "reward_change_max": 0.0, + "reward_change_mean": -0.3723407946527004, + "reward_change_min": -0.6594884321093559, + "reward_change_std": 0.25413103960454464, + "reward_std": 0.6596687138080597, + "rewards/cosine_scaled_reward": -0.08095679804682732, + "rewards/format_reward": 0.8125000149011612, + "step": 326 + }, + { + "advantage_max": 1.2405484095215797, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.6985969394445419, + "advantage_std": 0.695731807500124, + "completion_length": 2579.791732788086, + "epoch": 0.3737142857142857, + "grad_norm": 0.5163988471031189, + "kl": 0.351715087890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0303, + "reward": 0.534046346321702, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.534046346321702, + "reward_after_std": 0.6957318224012852, + "reward_before_mean": 0.9854510715231299, + "reward_before_std": 0.6119641549885273, + "reward_change_max": 0.0004111677408218384, + "reward_change_mean": -0.4514046907424927, + "reward_change_min": -0.7134763710200787, + "reward_change_std": 0.27318305894732475, + "reward_std": 0.6957318410277367, + "rewards/cosine_scaled_reward": 0.07605885528028011, + "rewards/format_reward": 0.8333333432674408, + "step": 327 + }, + { + "advantage_max": 1.5700553804636002, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7542428597807884, + "advantage_std": 0.8464733026921749, + "completion_length": 3031.8958435058594, + "epoch": 0.37485714285714283, + "grad_norm": 0.5918129682540894, + "kl": 0.3701171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0157, + "reward": 0.1579922076780349, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1579922076780349, + "reward_after_std": 0.846473291516304, + "reward_before_mean": 0.4518141821026802, + "reward_before_std": 0.8057435862720013, + "reward_change_max": 0.00017704814672470093, + "reward_change_mean": -0.29382197093218565, + "reward_change_min": -0.5395024605095387, + "reward_change_std": 0.2040925696492195, + "reward_std": 0.8464733026921749, + "rewards/cosine_scaled_reward": -0.06575959082692862, + "rewards/format_reward": 0.5833333469927311, + "step": 328 + }, + { + "advantage_max": 1.4207844957709312, + "advantage_mean": 6.8296994726324556e-09, + "advantage_min": -0.958678312599659, + "advantage_std": 0.8764909096062183, + "completion_length": 2077.437572479248, + "epoch": 0.376, + "grad_norm": 0.4301084578037262, + "kl": 0.289764404296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0393, + "reward": 0.46948680374771357, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.46948680374771357, + "reward_after_std": 0.8764909207820892, + "reward_before_mean": 0.8864495474845171, + "reward_before_std": 0.9092329628765583, + "reward_change_max": 0.0, + "reward_change_mean": -0.416962718591094, + "reward_change_min": -0.8044983074069023, + "reward_change_std": 0.30942643620073795, + "reward_std": 0.8764909394085407, + "rewards/cosine_scaled_reward": 0.02655809542920906, + "rewards/format_reward": 0.8333333432674408, + "step": 329 + }, + { + "advantage_max": 1.0345421582460403, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -0.5925486199557781, + "advantage_std": 0.5856764316558838, + "completion_length": 2163.291717529297, + "epoch": 0.37714285714285717, + "grad_norm": 0.9790158867835999, + "kl": 0.21964263916015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0266, + "reward": 0.061649966053664684, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.061649966053664684, + "reward_after_std": 0.5856764391064644, + "reward_before_mean": 0.3572140075266361, + "reward_before_std": 0.5511214323341846, + "reward_change_max": 0.0014399513602256775, + "reward_change_mean": -0.2955640386790037, + "reward_change_min": -0.5271896719932556, + "reward_change_std": 0.19697471428662539, + "reward_std": 0.5856764689087868, + "rewards/cosine_scaled_reward": -0.22764301113784313, + "rewards/format_reward": 0.8125000055879354, + "step": 330 + }, + { + "advantage_max": 1.266398135572672, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.6501773856580257, + "advantage_std": 0.7007499448955059, + "completion_length": 2759.958381652832, + "epoch": 0.3782857142857143, + "grad_norm": 0.4184553623199463, + "kl": 0.295257568359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0313, + "reward": 0.031125844456255436, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.031125844456255436, + "reward_after_std": 0.7007499560713768, + "reward_before_mean": 0.30181864835321903, + "reward_before_std": 0.6772023364901543, + "reward_change_max": 0.00038214027881622314, + "reward_change_mean": -0.2706927992403507, + "reward_change_min": -0.5030109845101833, + "reward_change_std": 0.18806026875972748, + "reward_std": 0.7007499635219574, + "rewards/cosine_scaled_reward": -0.15117402002215385, + "rewards/format_reward": 0.6041666716337204, + "step": 331 + }, + { + "advantage_max": 1.2278646379709244, + "advantage_mean": -1.2107193636534674e-08, + "advantage_min": -0.9190050959587097, + "advantage_std": 0.7645938657224178, + "completion_length": 2374.4583892822266, + "epoch": 0.37942857142857145, + "grad_norm": 1.026884913444519, + "kl": 0.20819091796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0254, + "reward": 0.26205848407698795, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26205848407698795, + "reward_after_std": 0.764593880623579, + "reward_before_mean": 0.6165533754974604, + "reward_before_std": 0.7967246845364571, + "reward_change_max": 0.00045099854469299316, + "reward_change_mean": -0.3544948771595955, + "reward_change_min": -0.6746104620397091, + "reward_change_std": 0.27001148648560047, + "reward_std": 0.7645938880741596, + "rewards/cosine_scaled_reward": -0.056306662037968636, + "rewards/format_reward": 0.7291666828095913, + "step": 332 + }, + { + "advantage_max": 1.4431308880448341, + "advantage_mean": -8.692344177774203e-09, + "advantage_min": -0.721727728843689, + "advantage_std": 0.7957188636064529, + "completion_length": 2322.1458892822266, + "epoch": 0.38057142857142856, + "grad_norm": 0.2893451452255249, + "kl": 0.182647705078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0226, + "reward": 0.37836203817278147, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37836203817278147, + "reward_after_std": 0.7957188822329044, + "reward_before_mean": 0.7605113498866558, + "reward_before_std": 0.7327791415154934, + "reward_change_max": 0.0002195313572883606, + "reward_change_mean": -0.3821493051946163, + "reward_change_min": -0.6533030718564987, + "reward_change_std": 0.2390086129307747, + "reward_std": 0.7957189120352268, + "rewards/cosine_scaled_reward": -0.08849434833973646, + "rewards/format_reward": 0.9375000149011612, + "step": 333 + }, + { + "advantage_max": 1.0318748727440834, + "advantage_mean": -1.1796752574788627e-08, + "advantage_min": -0.7106717079877853, + "advantage_std": 0.6053128764033318, + "completion_length": 3011.041748046875, + "epoch": 0.38171428571428573, + "grad_norm": 0.4309418201446533, + "kl": 0.30096435546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0272, + "reward": 0.042655323166400194, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.042655323166400194, + "reward_after_std": 0.6053128689527512, + "reward_before_mean": 0.33556643780320883, + "reward_before_std": 0.6005287803709507, + "reward_change_max": 0.0002907887101173401, + "reward_change_mean": -0.29291114024817944, + "reward_change_min": -0.5190308559685946, + "reward_change_std": 0.20768418349325657, + "reward_std": 0.6053128838539124, + "rewards/cosine_scaled_reward": -0.17596679739654064, + "rewards/format_reward": 0.687500013038516, + "step": 334 + }, + { + "advantage_max": 1.6594262346625328, + "advantage_mean": 1.241763464276957e-08, + "advantage_min": -0.8757152184844017, + "advantage_std": 0.9582944251596928, + "completion_length": 2611.4584197998047, + "epoch": 0.38285714285714284, + "grad_norm": 0.9186263680458069, + "kl": 0.2675628662109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0586, + "reward": 0.32363066729158163, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.32363066729158163, + "reward_after_std": 0.9582944102585316, + "reward_before_mean": 0.6715507498010993, + "reward_before_std": 0.9660878330469131, + "reward_change_max": 0.0006835907697677612, + "reward_change_mean": -0.3479200517758727, + "reward_change_min": -0.7065860964357853, + "reward_change_std": 0.27568814530968666, + "reward_std": 0.9582944326102734, + "rewards/cosine_scaled_reward": 0.002442028373479843, + "rewards/format_reward": 0.6666666753590107, + "step": 335 + }, + { + "advantage_max": 1.6442746073007584, + "advantage_mean": 3.7252906315288215e-09, + "advantage_min": -0.803277987986803, + "advantage_std": 0.9076324515044689, + "completion_length": 2992.979217529297, + "epoch": 0.384, + "grad_norm": 0.8076133728027344, + "kl": 0.2337646484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0422, + "reward": 0.41128079127520323, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41128079127520323, + "reward_after_std": 0.907632440328598, + "reward_before_mean": 0.7877331438940018, + "reward_before_std": 0.861430436372757, + "reward_change_max": 0.0009226053953170776, + "reward_change_mean": -0.3764523593708873, + "reward_change_min": -0.683083888143301, + "reward_change_std": 0.2502218373119831, + "reward_std": 0.9076324477791786, + "rewards/cosine_scaled_reward": 0.008449911139905453, + "rewards/format_reward": 0.7708333544433117, + "step": 336 + }, + { + "advantage_max": 1.3428704738616943, + "advantage_mean": -1.2417637473838283e-09, + "advantage_min": -0.8602436855435371, + "advantage_std": 0.7921734526753426, + "completion_length": 2696.7709045410156, + "epoch": 0.3851428571428571, + "grad_norm": 0.777777373790741, + "kl": 0.247222900390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.612465628992203e-07, + "loss": -0.0119, + "reward": 0.3240007753483951, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3240007753483951, + "reward_after_std": 0.7921734377741814, + "reward_before_mean": 0.6951221290510148, + "reward_before_std": 0.7921620719134808, + "reward_change_max": 0.0005899891257286072, + "reward_change_mean": -0.3711213394999504, + "reward_change_min": -0.6649631895124912, + "reward_change_std": 0.26434677839279175, + "reward_std": 0.7921734377741814, + "rewards/cosine_scaled_reward": -0.0691056028008461, + "rewards/format_reward": 0.8333333358168602, + "step": 337 + }, + { + "advantage_max": 1.409317284822464, + "advantage_mean": 6.208820124697922e-10, + "advantage_min": -0.6859129592776299, + "advantage_std": 0.7859718762338161, + "completion_length": 2161.1250610351562, + "epoch": 0.3862857142857143, + "grad_norm": 0.6067075133323669, + "kl": 0.23406982421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0089, + "reward": 0.4811843913048506, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4811843913048506, + "reward_after_std": 0.7859718687832355, + "reward_before_mean": 0.9017840125598013, + "reward_before_std": 0.7260171696543694, + "reward_change_max": 0.0, + "reward_change_mean": -0.42059963941574097, + "reward_change_min": -0.7111620083451271, + "reward_change_std": 0.26399547420442104, + "reward_std": 0.7859718948602676, + "rewards/cosine_scaled_reward": 0.0029753390699625015, + "rewards/format_reward": 0.8958333358168602, + "step": 338 + }, + { + "advantage_max": 0.8729583323001862, + "advantage_mean": 1.2417630257388623e-09, + "advantage_min": -0.6809690743684769, + "advantage_std": 0.538725059479475, + "completion_length": 3008.0625610351562, + "epoch": 0.38742857142857146, + "grad_norm": 0.35310637950897217, + "kl": 0.2740478515625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0412, + "reward": -0.032884322106838226, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.032884322106838226, + "reward_after_std": 0.5387250632047653, + "reward_before_mean": 0.24240444134920835, + "reward_before_std": 0.5509735830128193, + "reward_change_max": 0.0, + "reward_change_mean": -0.27528876066207886, + "reward_change_min": -0.4679707083851099, + "reward_change_std": 0.1936870813369751, + "reward_std": 0.5387250930070877, + "rewards/cosine_scaled_reward": -0.17046445421874523, + "rewards/format_reward": 0.5833333525806665, + "step": 339 + }, + { + "advantage_max": 1.3509134501218796, + "advantage_mean": -6.829699139565548e-09, + "advantage_min": -0.9202584028244019, + "advantage_std": 0.8368659019470215, + "completion_length": 2576.7708587646484, + "epoch": 0.38857142857142857, + "grad_norm": 0.6532253623008728, + "kl": 0.2698974609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0479, + "reward": 0.22542057232931256, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22542057232931256, + "reward_after_std": 0.8368658944964409, + "reward_before_mean": 0.5590503867715597, + "reward_before_std": 0.8876641169190407, + "reward_change_max": 0.00047085434198379517, + "reward_change_mean": -0.3336298204958439, + "reward_change_min": -0.6656833551824093, + "reward_change_std": 0.2725123818963766, + "reward_std": 0.8368659354746342, + "rewards/cosine_scaled_reward": -0.10589147731661797, + "rewards/format_reward": 0.7708333507180214, + "step": 340 + }, + { + "advantage_max": 1.4389492645859718, + "advantage_mean": -5.587936058315535e-09, + "advantage_min": -0.8931010887026787, + "advantage_std": 0.8441877365112305, + "completion_length": 2549.479202270508, + "epoch": 0.38971428571428574, + "grad_norm": 0.7518829703330994, + "kl": 0.2005615234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0284, + "reward": 0.6108754873275757, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6108754873275757, + "reward_after_std": 0.8441877439618111, + "reward_before_mean": 1.074859730899334, + "reward_before_std": 0.790581751614809, + "reward_change_max": 0.00037673860788345337, + "reward_change_mean": -0.4639842379838228, + "reward_change_min": -0.7642018236219883, + "reward_change_std": 0.31235019117593765, + "reward_std": 0.844187755137682, + "rewards/cosine_scaled_reward": 0.17284652451053262, + "rewards/format_reward": 0.7291666828095913, + "step": 341 + }, + { + "advantage_max": 1.3784963935613632, + "advantage_mean": -1.55220432618286e-08, + "advantage_min": -0.8081448301672935, + "advantage_std": 0.7640027962625027, + "completion_length": 2857.12508392334, + "epoch": 0.39085714285714285, + "grad_norm": 1.3091081380844116, + "kl": 0.37322998046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0775, + "reward": 0.2577698500826955, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2577698500826955, + "reward_after_std": 0.7640027962625027, + "reward_before_mean": 0.600545659661293, + "reward_before_std": 0.7238595262169838, + "reward_change_max": 0.00037366151809692383, + "reward_change_mean": -0.3427758179605007, + "reward_change_min": -0.6013978645205498, + "reward_change_std": 0.22922171279788017, + "reward_std": 0.7640028148889542, + "rewards/cosine_scaled_reward": -0.13722718448843807, + "rewards/format_reward": 0.8750000223517418, + "step": 342 + }, + { + "advantage_max": 1.09914218634367, + "advantage_mean": -4.6566095424083187e-10, + "advantage_min": -0.7929297350347042, + "advantage_std": 0.6592183811590075, + "completion_length": 2881.229248046875, + "epoch": 0.392, + "grad_norm": 0.8948997259140015, + "kl": 0.30389404296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0518, + "reward": 0.15604414325207472, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15604414325207472, + "reward_after_std": 0.6592183588072658, + "reward_before_mean": 0.4839049391448498, + "reward_before_std": 0.6639570314437151, + "reward_change_max": 0.0, + "reward_change_mean": -0.3278607833199203, + "reward_change_min": -0.5611877627670765, + "reward_change_std": 0.22848687833175063, + "reward_std": 0.6592183625325561, + "rewards/cosine_scaled_reward": -0.07054753974080086, + "rewards/format_reward": 0.6250000149011612, + "step": 343 + }, + { + "advantage_max": 1.1419403105974197, + "advantage_mean": -1.9247334503980085e-08, + "advantage_min": -0.8298167437314987, + "advantage_std": 0.7062124721705914, + "completion_length": 2314.75008392334, + "epoch": 0.3931428571428571, + "grad_norm": 0.3817563056945801, + "kl": 0.208465576171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0339, + "reward": 0.5916862864978611, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5916862864978611, + "reward_after_std": 0.7062124647200108, + "reward_before_mean": 1.0728607330238447, + "reward_before_std": 0.6833640523254871, + "reward_change_max": 0.0007588937878608704, + "reward_change_mean": -0.48117440938949585, + "reward_change_min": -0.7631802037358284, + "reward_change_std": 0.30804815515875816, + "reward_std": 0.7062124907970428, + "rewards/cosine_scaled_reward": 0.1197636779397726, + "rewards/format_reward": 0.8333333358168602, + "step": 344 + }, + { + "advantage_max": 1.2474537640810013, + "advantage_mean": -2.7939678071131624e-08, + "advantage_min": -0.850233644247055, + "advantage_std": 0.7378453128039837, + "completion_length": 2768.6250610351562, + "epoch": 0.3942857142857143, + "grad_norm": 0.7111788392066956, + "kl": 0.31512451171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0513, + "reward": 0.7365547483786941, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7365547483786941, + "reward_after_std": 0.7378453128039837, + "reward_before_mean": 1.2583894599229097, + "reward_before_std": 0.6701792404055595, + "reward_change_max": 0.0, + "reward_change_mean": -0.5218347907066345, + "reward_change_min": -0.8273529559373856, + "reward_change_std": 0.31641891226172447, + "reward_std": 0.7378453314304352, + "rewards/cosine_scaled_reward": 0.19169476255774498, + "rewards/format_reward": 0.8750000149011612, + "step": 345 + }, + { + "advantage_max": 1.2420702129602432, + "advantage_mean": -8.692343955729598e-09, + "advantage_min": -0.8061654381453991, + "advantage_std": 0.7235353961586952, + "completion_length": 2846.0209197998047, + "epoch": 0.3954285714285714, + "grad_norm": 1.0861871242523193, + "kl": 0.507080078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0173, + "reward": 0.304742424399592, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.304742424399592, + "reward_after_std": 0.7235354036092758, + "reward_before_mean": 0.6732838414609432, + "reward_before_std": 0.7006465289741755, + "reward_change_max": 0.0003463476896286011, + "reward_change_mean": -0.36854144744575024, + "reward_change_min": -0.6262233816087246, + "reward_change_std": 0.2468447219580412, + "reward_std": 0.723535418510437, + "rewards/cosine_scaled_reward": -0.059191410429775715, + "rewards/format_reward": 0.791666679084301, + "step": 346 + }, + { + "advantage_max": 1.2352950349450111, + "advantage_mean": 4.346172310931706e-09, + "advantage_min": -0.6556868124753237, + "advantage_std": 0.6889407336711884, + "completion_length": 2864.6250915527344, + "epoch": 0.3965714285714286, + "grad_norm": 0.6275542378425598, + "kl": 0.4534912109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.055, + "reward": 0.13871129555627704, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13871129555627704, + "reward_after_std": 0.6889407597482204, + "reward_before_mean": 0.4508918561041355, + "reward_before_std": 0.6534276064485312, + "reward_change_max": 0.0008821189403533936, + "reward_change_mean": -0.312180545181036, + "reward_change_min": -0.5465703941881657, + "reward_change_std": 0.21107300743460655, + "reward_std": 0.6889407970011234, + "rewards/cosine_scaled_reward": -0.18080408312380314, + "rewards/format_reward": 0.8125000149011612, + "step": 347 + }, + { + "advantage_max": 1.0376718565821648, + "advantage_mean": -1.6142925329809543e-08, + "advantage_min": -0.5380270965397358, + "advantage_std": 0.5884160585701466, + "completion_length": 2268.916732788086, + "epoch": 0.3977142857142857, + "grad_norm": 0.8303443193435669, + "kl": 0.32196044921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0055, + "reward": 0.2341640405356884, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2341640405356884, + "reward_after_std": 0.5884160622954369, + "reward_before_mean": 0.5881709158420563, + "reward_before_std": 0.537275068461895, + "reward_change_max": 0.0, + "reward_change_mean": -0.35400689020752907, + "reward_change_min": -0.5834939926862717, + "reward_change_std": 0.21537455171346664, + "reward_std": 0.588416077196598, + "rewards/cosine_scaled_reward": -0.1746645476669073, + "rewards/format_reward": 0.9375000149011612, + "step": 348 + }, + { + "advantage_max": 1.6154710426926613, + "advantage_mean": -1.8626452047421083e-09, + "advantage_min": -0.7679126597940922, + "advantage_std": 0.8989286422729492, + "completion_length": 2812.8125915527344, + "epoch": 0.39885714285714285, + "grad_norm": 1.0645766258239746, + "kl": 0.5093994140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.092, + "reward": 0.19876206829212606, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19876206829212606, + "reward_after_std": 0.8989286720752716, + "reward_before_mean": 0.5034147594124079, + "reward_before_std": 0.8784813545644283, + "reward_change_max": 0.0, + "reward_change_mean": -0.3046526936814189, + "reward_change_min": -0.5808026678860188, + "reward_change_std": 0.22330620884895325, + "reward_std": 0.8989286981523037, + "rewards/cosine_scaled_reward": -0.08162595890462399, + "rewards/format_reward": 0.6666666809469461, + "step": 349 + }, + { + "advantage_max": 1.4590798914432526, + "advantage_mean": 3.104408619059029e-09, + "advantage_min": -0.8784847110509872, + "advantage_std": 0.8449890464544296, + "completion_length": 2441.9792556762695, + "epoch": 0.4, + "grad_norm": 1.5493332147598267, + "kl": 0.555908203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.250000000000001e-07, + "loss": 0.015, + "reward": 0.3524491051211953, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3524491051211953, + "reward_after_std": 0.8449890315532684, + "reward_before_mean": 0.7244518399238586, + "reward_before_std": 0.8319392949342728, + "reward_change_max": 0.0, + "reward_change_mean": -0.3720027543604374, + "reward_change_min": -0.701644878834486, + "reward_change_std": 0.2610305938869715, + "reward_std": 0.8449890613555908, + "rewards/cosine_scaled_reward": -0.07527408562600613, + "rewards/format_reward": 0.8750000223517418, + "step": 350 + }, + { + "advantage_max": 1.1394439861178398, + "advantage_mean": -3.7252904983020585e-08, + "advantage_min": -0.6002353355288506, + "advantage_std": 0.6371245309710503, + "completion_length": 2274.8125610351562, + "epoch": 0.40114285714285713, + "grad_norm": 0.8637124300003052, + "kl": 0.306060791015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.222848061454764e-07, + "loss": -0.0094, + "reward": 0.4534956933930516, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4534956933930516, + "reward_after_std": 0.63712452724576, + "reward_before_mean": 0.8838562555611134, + "reward_before_std": 0.5522509068250656, + "reward_change_max": 0.0, + "reward_change_mean": -0.4303605705499649, + "reward_change_min": -0.6738534830510616, + "reward_change_std": 0.2491367096081376, + "reward_std": 0.6371245495975018, + "rewards/cosine_scaled_reward": 0.0044281138107180595, + "rewards/format_reward": 0.875, + "step": 351 + }, + { + "advantage_max": 1.2284162268042564, + "advantage_mean": -1.1796752907855534e-08, + "advantage_min": -0.6765665486454964, + "advantage_std": 0.6997440755367279, + "completion_length": 2598.0000762939453, + "epoch": 0.4022857142857143, + "grad_norm": 0.720954418182373, + "kl": 0.5006103515625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0359, + "reward": 0.33301494736224413, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33301494736224413, + "reward_after_std": 0.6997440531849861, + "reward_before_mean": 0.7135494388639927, + "reward_before_std": 0.6609387919306755, + "reward_change_max": 0.0, + "reward_change_mean": -0.38053448498249054, + "reward_change_min": -0.6261219903826714, + "reward_change_std": 0.23826817143708467, + "reward_std": 0.6997440606355667, + "rewards/cosine_scaled_reward": -0.007808626629412174, + "rewards/format_reward": 0.7291666697710752, + "step": 352 + }, + { + "advantage_max": 1.3336387276649475, + "advantage_mean": 1.11758712839638e-08, + "advantage_min": -0.8034698478877544, + "advantage_std": 0.7565931305289268, + "completion_length": 2164.6875610351562, + "epoch": 0.4034285714285714, + "grad_norm": 0.525687575340271, + "kl": 0.34417724609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.168878457820915e-07, + "loss": 0.027, + "reward": 0.5460577132180333, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5460577132180333, + "reward_after_std": 0.756593145430088, + "reward_before_mean": 0.9952808357775211, + "reward_before_std": 0.7036684639751911, + "reward_change_max": 8.256733417510986e-05, + "reward_change_mean": -0.4492231123149395, + "reward_change_min": -0.7174819149076939, + "reward_change_std": 0.2743480708450079, + "reward_std": 0.7565931528806686, + "rewards/cosine_scaled_reward": 0.028890418354421854, + "rewards/format_reward": 0.9375000074505806, + "step": 353 + }, + { + "advantage_max": 1.1443421319127083, + "advantage_mean": -2.3903946488879058e-08, + "advantage_min": -0.8811666816473007, + "advantage_std": 0.6901189722120762, + "completion_length": 1969.1042098999023, + "epoch": 0.4045714285714286, + "grad_norm": 0.5808809399604797, + "kl": 0.221710205078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0014, + "reward": 0.5879236805485561, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5879236805485561, + "reward_after_std": 0.6901189647614956, + "reward_before_mean": 1.0658465139567852, + "reward_before_std": 0.6480529569089413, + "reward_change_max": 0.0016156435012817383, + "reward_change_mean": -0.47792287170886993, + "reward_change_min": -0.7460371851921082, + "reward_change_std": 0.2975119221955538, + "reward_std": 0.6901189796626568, + "rewards/cosine_scaled_reward": 0.09542325511574745, + "rewards/format_reward": 0.8750000149011612, + "step": 354 + }, + { + "advantage_max": 1.299523502588272, + "advantage_mean": -2.048909669705168e-08, + "advantage_min": -0.6467021554708481, + "advantage_std": 0.723317988216877, + "completion_length": 2410.6458740234375, + "epoch": 0.4057142857142857, + "grad_norm": 0.9375362992286682, + "kl": 0.46929931640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0214, + "reward": 0.2581765688955784, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2581765688955784, + "reward_after_std": 0.7233179956674576, + "reward_before_mean": 0.6042394880205393, + "reward_before_std": 0.6638563796877861, + "reward_change_max": 0.0006398856639862061, + "reward_change_mean": -0.34606292354874313, + "reward_change_min": -0.5550568662583828, + "reward_change_std": 0.22587225958704948, + "reward_std": 0.7233180180191994, + "rewards/cosine_scaled_reward": -0.07288026809692383, + "rewards/format_reward": 0.750000013038516, + "step": 355 + }, + { + "advantage_max": 1.4495657980442047, + "advantage_mean": -6.829699028543246e-09, + "advantage_min": -0.7594706527888775, + "advantage_std": 0.7941495589911938, + "completion_length": 2560.479248046875, + "epoch": 0.40685714285714286, + "grad_norm": 1.1204156875610352, + "kl": 0.340087890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0194, + "reward": 0.4108897661790252, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4108897661790252, + "reward_after_std": 0.7941495440900326, + "reward_before_mean": 0.8035768382251263, + "reward_before_std": 0.7295331358909607, + "reward_change_max": 0.0, + "reward_change_mean": -0.3926870562136173, + "reward_change_min": -0.645715419203043, + "reward_change_std": 0.23967733141034842, + "reward_std": 0.7941495478153229, + "rewards/cosine_scaled_reward": -0.0044615985825657845, + "rewards/format_reward": 0.812500013038516, + "step": 356 + }, + { + "advantage_max": 1.1339119970798492, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.8648699447512627, + "advantage_std": 0.7139856666326523, + "completion_length": 2902.604248046875, + "epoch": 0.408, + "grad_norm": 0.5644457340240479, + "kl": 0.3636474609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0355, + "reward": 0.18295936728827655, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18295936728827655, + "reward_after_std": 0.7139856740832329, + "reward_before_mean": 0.5178206749260426, + "reward_before_std": 0.7472909539937973, + "reward_change_max": 0.00011270493268966675, + "reward_change_mean": -0.3348613269627094, + "reward_change_min": -0.6464793048799038, + "reward_change_std": 0.25172287225723267, + "reward_std": 0.7139856964349747, + "rewards/cosine_scaled_reward": -0.1577563351020217, + "rewards/format_reward": 0.8333333656191826, + "step": 357 + }, + { + "advantage_max": 1.388573870062828, + "advantage_mean": -2.033387658251229e-08, + "advantage_min": -0.8736642077565193, + "advantage_std": 0.7979660034179688, + "completion_length": 2549.666748046875, + "epoch": 0.40914285714285714, + "grad_norm": 1.1542965173721313, + "kl": 0.3417816162109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.0359654942835247e-07, + "loss": -0.0082, + "reward": 0.5488296616822481, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5488296616822481, + "reward_after_std": 0.7979660108685493, + "reward_before_mean": 0.9944317154586315, + "reward_before_std": 0.7499481812119484, + "reward_change_max": 0.0, + "reward_change_mean": -0.4456020426005125, + "reward_change_min": -0.7461185529828072, + "reward_change_std": 0.2852071709930897, + "reward_std": 0.7979660257697105, + "rewards/cosine_scaled_reward": 0.07013251329772174, + "rewards/format_reward": 0.854166679084301, + "step": 358 + }, + { + "advantage_max": 1.25496444106102, + "advantage_mean": -1.9247333726823967e-08, + "advantage_min": -0.6898075491189957, + "advantage_std": 0.7037457078695297, + "completion_length": 2656.104248046875, + "epoch": 0.4102857142857143, + "grad_norm": 0.7680111527442932, + "kl": 0.3133544921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0469, + "reward": 0.3507824167609215, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3507824167609215, + "reward_after_std": 0.70374571159482, + "reward_before_mean": 0.7362727681174874, + "reward_before_std": 0.6479121707379818, + "reward_change_max": 0.00012759119272232056, + "reward_change_mean": -0.3854903504252434, + "reward_change_min": -0.6395407691597939, + "reward_change_std": 0.23867321945726871, + "reward_std": 0.7037457376718521, + "rewards/cosine_scaled_reward": -0.0901969678234309, + "rewards/format_reward": 0.9166666865348816, + "step": 359 + }, + { + "advantage_max": 1.4321295246481895, + "advantage_mean": -2.235174156872688e-08, + "advantage_min": -0.8640732020139694, + "advantage_std": 0.8487196229398251, + "completion_length": 2889.8959350585938, + "epoch": 0.4114285714285714, + "grad_norm": 0.6298926472663879, + "kl": 0.30224609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0255, + "reward": 0.5272009279578924, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5272009279578924, + "reward_after_std": 0.848719634115696, + "reward_before_mean": 0.964709036052227, + "reward_before_std": 0.8328141756355762, + "reward_change_max": 0.0, + "reward_change_mean": -0.43750810623168945, + "reward_change_min": -0.7766466028988361, + "reward_change_std": 0.29725635796785355, + "reward_std": 0.8487196452915668, + "rewards/cosine_scaled_reward": 0.03443782590329647, + "rewards/format_reward": 0.8958333507180214, + "step": 360 + }, + { + "advantage_max": 1.3873552680015564, + "advantage_mean": -4.594524749546025e-08, + "advantage_min": -1.0418336614966393, + "advantage_std": 0.8450100310146809, + "completion_length": 2874.3959350585938, + "epoch": 0.4125714285714286, + "grad_norm": 0.6404920816421509, + "kl": 0.262939453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0174, + "reward": 0.457249105675146, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.457249105675146, + "reward_after_std": 0.8450100161135197, + "reward_before_mean": 0.8721631693188101, + "reward_before_std": 0.8606941141188145, + "reward_change_max": 0.0, + "reward_change_mean": -0.4149140939116478, + "reward_change_min": -0.7242123819887638, + "reward_change_std": 0.29377906769514084, + "reward_std": 0.8450100235641003, + "rewards/cosine_scaled_reward": 0.02983156335540116, + "rewards/format_reward": 0.8125000149011612, + "step": 361 + }, + { + "advantage_max": 0.968044251203537, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.5936462879180908, + "advantage_std": 0.5590002462267876, + "completion_length": 1745.2708435058594, + "epoch": 0.4137142857142857, + "grad_norm": 0.1821906417608261, + "kl": 0.17156982421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0169, + "reward": 0.6431191973388195, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6431191973388195, + "reward_after_std": 0.5590002536773682, + "reward_before_mean": 1.1544144650688395, + "reward_before_std": 0.46106959506869316, + "reward_change_max": 0.0, + "reward_change_mean": -0.511295210570097, + "reward_change_min": -0.7569701746106148, + "reward_change_std": 0.28887542709708214, + "reward_std": 0.5590002611279488, + "rewards/cosine_scaled_reward": 0.11887386068701744, + "rewards/format_reward": 0.9166666716337204, + "step": 362 + }, + { + "advantage_max": 0.97291599214077, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.6005025487393141, + "advantage_std": 0.5606646221131086, + "completion_length": 1856.6875457763672, + "epoch": 0.41485714285714287, + "grad_norm": 0.6652201414108276, + "kl": 0.1973419189453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.031, + "reward": 0.18876660615205765, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.18876660615205765, + "reward_after_std": 0.5606646202504635, + "reward_before_mean": 0.5343197894544574, + "reward_before_std": 0.5127148274332285, + "reward_change_max": 0.0, + "reward_change_mean": -0.3455531857907772, + "reward_change_min": -0.5572856552898884, + "reward_change_std": 0.2125458586961031, + "reward_std": 0.5606646370142698, + "rewards/cosine_scaled_reward": -0.04534011334180832, + "rewards/format_reward": 0.625000013038516, + "step": 363 + }, + { + "advantage_max": 1.0218002647161484, + "advantage_mean": -2.173086099954702e-09, + "advantage_min": -0.6517962850630283, + "advantage_std": 0.6047668792307377, + "completion_length": 2830.687545776367, + "epoch": 0.416, + "grad_norm": 0.4466303884983063, + "kl": 0.231109619140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0254, + "reward": -0.07767466246150434, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07767466246150434, + "reward_after_std": 0.6047668866813183, + "reward_before_mean": 0.1724892733618617, + "reward_before_std": 0.6115720048546791, + "reward_change_max": 0.0, + "reward_change_mean": -0.25016394164413214, + "reward_change_min": -0.48623254150152206, + "reward_change_std": 0.1896633468568325, + "reward_std": 0.6047668941318989, + "rewards/cosine_scaled_reward": -0.23667203076183796, + "rewards/format_reward": 0.6458333544433117, + "step": 364 + }, + { + "advantage_max": 1.1228408366441727, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.9194622822105885, + "advantage_std": 0.6852251142263412, + "completion_length": 2639.562545776367, + "epoch": 0.41714285714285715, + "grad_norm": 0.3083858788013458, + "kl": 0.18316650390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0081, + "reward": 0.39128240814898163, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.39128240814898163, + "reward_after_std": 0.6852251067757607, + "reward_before_mean": 0.7988421376794577, + "reward_before_std": 0.6811210848391056, + "reward_change_max": 0.0, + "reward_change_mean": -0.40755973756313324, + "reward_change_min": -0.6659704782068729, + "reward_change_std": 0.2675940692424774, + "reward_std": 0.6852251254022121, + "rewards/cosine_scaled_reward": -0.027662288397550583, + "rewards/format_reward": 0.8541666865348816, + "step": 365 + }, + { + "advantage_max": 1.1428400948643684, + "advantage_mean": -2.4524828834415757e-08, + "advantage_min": -0.571011945605278, + "advantage_std": 0.6341691315174103, + "completion_length": 1841.208408355713, + "epoch": 0.41828571428571426, + "grad_norm": 1.0585683584213257, + "kl": 0.09136962890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0348, + "reward": 0.40346649289131165, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.40346649289131165, + "reward_after_std": 0.6341691240668297, + "reward_before_mean": 0.8120505083352327, + "reward_before_std": 0.5371465170755982, + "reward_change_max": 0.0002114623785018921, + "reward_change_mean": -0.40858402848243713, + "reward_change_min": -0.6271373182535172, + "reward_change_std": 0.2456482443958521, + "reward_std": 0.6341691352427006, + "rewards/cosine_scaled_reward": 0.010191927663981915, + "rewards/format_reward": 0.7916666772216558, + "step": 366 + }, + { + "advantage_max": 1.2152687087655067, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.7091002613306046, + "advantage_std": 0.685837309807539, + "completion_length": 3005.5209045410156, + "epoch": 0.41942857142857143, + "grad_norm": 0.7061448693275452, + "kl": 0.249267578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0427, + "reward": 0.2152902279049158, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2152902279049158, + "reward_after_std": 0.6858373135328293, + "reward_before_mean": 0.5551963672041893, + "reward_before_std": 0.652896411716938, + "reward_change_max": 0.000535130500793457, + "reward_change_mean": -0.3399061169475317, + "reward_change_min": -0.5558051727712154, + "reward_change_std": 0.21386231295764446, + "reward_std": 0.6858373135328293, + "rewards/cosine_scaled_reward": -0.03490182477980852, + "rewards/format_reward": 0.625000013038516, + "step": 367 + }, + { + "advantage_max": 1.115385714918375, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.7492459192872047, + "advantage_std": 0.6838843766599894, + "completion_length": 3076.7083740234375, + "epoch": 0.4205714285714286, + "grad_norm": 0.4976518154144287, + "kl": 0.3193359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.0366, + "reward": 0.13641435164026916, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13641435164026916, + "reward_after_std": 0.6838843803852797, + "reward_before_mean": 0.456346322549507, + "reward_before_std": 0.7006695438176394, + "reward_change_max": 0.0004066452383995056, + "reward_change_mean": -0.3199319802224636, + "reward_change_min": -0.607593409717083, + "reward_change_std": 0.2362601924687624, + "reward_std": 0.6838844045996666, + "rewards/cosine_scaled_reward": -0.0739101842045784, + "rewards/format_reward": 0.6041666865348816, + "step": 368 + }, + { + "advantage_max": 1.613924890756607, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -0.7823650315403938, + "advantage_std": 0.9184759818017483, + "completion_length": 2996.0000610351562, + "epoch": 0.4217142857142857, + "grad_norm": 1.507102131843567, + "kl": 0.25439453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0562, + "reward": 0.19157198071479797, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19157198071479797, + "reward_after_std": 0.9184759818017483, + "reward_before_mean": 0.495089840143919, + "reward_before_std": 0.9159770458936691, + "reward_change_max": 0.0006600469350814819, + "reward_change_mean": -0.3035178631544113, + "reward_change_min": -0.6244982462376356, + "reward_change_std": 0.2401027213782072, + "reward_std": 0.9184760265052319, + "rewards/cosine_scaled_reward": -0.09620509948581457, + "rewards/format_reward": 0.6875000111758709, + "step": 369 + }, + { + "advantage_max": 0.9570782333612442, + "advantage_mean": -4.346172144398253e-09, + "advantage_min": -0.45735423266887665, + "advantage_std": 0.5198530592024326, + "completion_length": 2871.125045776367, + "epoch": 0.4228571428571429, + "grad_norm": 5.859915733337402, + "kl": 0.2988128662109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.729523361034538e-07, + "loss": -0.0005, + "reward": 0.15831564646214247, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15831564646214247, + "reward_after_std": 0.5198530443012714, + "reward_before_mean": 0.49796394538134336, + "reward_before_std": 0.42710920982062817, + "reward_change_max": 0.0, + "reward_change_mean": -0.3396483026444912, + "reward_change_min": -0.5230873040854931, + "reward_change_std": 0.20043556857854128, + "reward_std": 0.5198530666530132, + "rewards/cosine_scaled_reward": -0.09476803429424763, + "rewards/format_reward": 0.6875000093132257, + "step": 370 + }, + { + "advantage_max": 1.4651240780949593, + "advantage_mean": -3.78737858297562e-08, + "advantage_min": -0.8076458275318146, + "advantage_std": 0.8199820630252361, + "completion_length": 2042.9792175292969, + "epoch": 0.424, + "grad_norm": 0.8236666917800903, + "kl": 0.2234954833984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0448, + "reward": 0.6258745496161282, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6258745496161282, + "reward_after_std": 0.8199820779263973, + "reward_before_mean": 1.0955132944509387, + "reward_before_std": 0.7609365023672581, + "reward_change_max": 0.00015053898096084595, + "reward_change_mean": -0.46963876485824585, + "reward_change_min": -0.7598228193819523, + "reward_change_std": 0.28789537586271763, + "reward_std": 0.8199820891022682, + "rewards/cosine_scaled_reward": 0.15192330069839954, + "rewards/format_reward": 0.7916666734963655, + "step": 371 + }, + { + "advantage_max": 1.5770609304308891, + "advantage_mean": 1.862645371275562e-09, + "advantage_min": -0.9838331863284111, + "advantage_std": 0.9379903674125671, + "completion_length": 3058.2709197998047, + "epoch": 0.42514285714285716, + "grad_norm": 0.6419177651405334, + "kl": 0.270416259765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0274, + "reward": 0.2897559218108654, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2897559218108654, + "reward_after_std": 0.9379903674125671, + "reward_before_mean": 0.6321213558912859, + "reward_before_std": 0.976532481610775, + "reward_change_max": 0.0, + "reward_change_mean": -0.34236546233296394, + "reward_change_min": -0.69129853323102, + "reward_change_std": 0.2746661426499486, + "reward_std": 0.9379903674125671, + "rewards/cosine_scaled_reward": -0.01727266050875187, + "rewards/format_reward": 0.6666666753590107, + "step": 372 + }, + { + "advantage_max": 1.2619002610445023, + "advantage_mean": -6.519258216597379e-09, + "advantage_min": -0.6246639899909496, + "advantage_std": 0.7037790045142174, + "completion_length": 1801.416732788086, + "epoch": 0.42628571428571427, + "grad_norm": 1.0489673614501953, + "kl": 0.2279205322265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0008, + "reward": 0.25850481167435646, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.25850481167435646, + "reward_after_std": 0.7037790045142174, + "reward_before_mean": 0.6074935798533261, + "reward_before_std": 0.651880044490099, + "reward_change_max": 0.0, + "reward_change_mean": -0.3489887546747923, + "reward_change_min": -0.634777944535017, + "reward_change_std": 0.22416657023131847, + "reward_std": 0.7037790305912495, + "rewards/cosine_scaled_reward": -0.1441698971320875, + "rewards/format_reward": 0.8958333395421505, + "step": 373 + }, + { + "advantage_max": 1.2929741963744164, + "advantage_mean": -7.761021825203329e-09, + "advantage_min": -0.8992895781993866, + "advantage_std": 0.7896677516400814, + "completion_length": 2458.354232788086, + "epoch": 0.42742857142857144, + "grad_norm": 1.0882432460784912, + "kl": 0.290618896484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0673, + "reward": 0.441376032307744, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.441376032307744, + "reward_after_std": 0.7896677367389202, + "reward_before_mean": 0.8569105174392462, + "reward_before_std": 0.7956783920526505, + "reward_change_max": 0.00048802047967910767, + "reward_change_mean": -0.41553447395563126, + "reward_change_min": -0.7457053437829018, + "reward_change_std": 0.28711700066924095, + "reward_std": 0.7896677367389202, + "rewards/cosine_scaled_reward": 0.011788577772676945, + "rewards/format_reward": 0.8333333432674408, + "step": 374 + }, + { + "advantage_max": 1.397294044494629, + "advantage_mean": -1.4280280125067435e-08, + "advantage_min": -0.7589370906352997, + "advantage_std": 0.7676047421991825, + "completion_length": 2798.5209045410156, + "epoch": 0.42857142857142855, + "grad_norm": 0.46163445711135864, + "kl": 0.3203582763671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0239, + "reward": 0.3821914023719728, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3821914023719728, + "reward_after_std": 0.7676047645509243, + "reward_before_mean": 0.7702061347663403, + "reward_before_std": 0.689431369304657, + "reward_change_max": 0.0, + "reward_change_mean": -0.38801475055515766, + "reward_change_min": -0.643918763846159, + "reward_change_std": 0.2488219765946269, + "reward_std": 0.767604798078537, + "rewards/cosine_scaled_reward": 0.01010306237731129, + "rewards/format_reward": 0.750000013038516, + "step": 375 + }, + { + "advantage_max": 1.0296322032809258, + "advantage_mean": 4.656613011855271e-09, + "advantage_min": -0.5118473805487156, + "advantage_std": 0.5575856603682041, + "completion_length": 2436.625015258789, + "epoch": 0.4297142857142857, + "grad_norm": 1.7255357503890991, + "kl": 0.3665771484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.583460445215911e-07, + "loss": 0.005, + "reward": 0.1960609758971259, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1960609758971259, + "reward_after_std": 0.5575856678187847, + "reward_before_mean": 0.5448548486456275, + "reward_before_std": 0.47852717712521553, + "reward_change_max": 0.0, + "reward_change_mean": -0.3487938679754734, + "reward_change_min": -0.5355156362056732, + "reward_change_std": 0.19799629971385002, + "reward_std": 0.5575856789946556, + "rewards/cosine_scaled_reward": -0.11298925429582596, + "rewards/format_reward": 0.7708333376795053, + "step": 376 + }, + { + "advantage_max": 1.1096007898449898, + "advantage_mean": -6.208817127095756e-09, + "advantage_min": -0.5029036141932011, + "advantage_std": 0.5966975335031748, + "completion_length": 3197.8958740234375, + "epoch": 0.4308571428571429, + "grad_norm": 0.8108214139938354, + "kl": 0.3704833984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0234, + "reward": 0.28803440602496266, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.28803440602496266, + "reward_after_std": 0.5966975409537554, + "reward_before_mean": 0.6619824189692736, + "reward_before_std": 0.5019467137753963, + "reward_change_max": 0.0002845451235771179, + "reward_change_mean": -0.37394802644848824, + "reward_change_min": -0.5833319462835789, + "reward_change_std": 0.2154426146298647, + "reward_std": 0.5966975726187229, + "rewards/cosine_scaled_reward": -0.06484212819486856, + "rewards/format_reward": 0.7916666716337204, + "step": 377 + }, + { + "advantage_max": 1.4894996359944344, + "advantage_mean": -3.0733645128844245e-08, + "advantage_min": -0.9004512503743172, + "advantage_std": 0.8546633832156658, + "completion_length": 2478.37504196167, + "epoch": 0.432, + "grad_norm": 0.7999647259712219, + "kl": 0.309326171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0116, + "reward": 0.4863022118806839, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4863022118806839, + "reward_after_std": 0.8546633496880531, + "reward_before_mean": 0.9011543802917004, + "reward_before_std": 0.8215220980346203, + "reward_change_max": 0.0, + "reward_change_mean": -0.41485217586159706, + "reward_change_min": -0.7082854770123959, + "reward_change_std": 0.27995297499001026, + "reward_std": 0.8546633832156658, + "rewards/cosine_scaled_reward": 0.06516051013022661, + "rewards/format_reward": 0.7708333563059568, + "step": 378 + }, + { + "advantage_max": 1.4217093512415886, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.9056989178061485, + "advantage_std": 0.8449318259954453, + "completion_length": 3112.5000610351562, + "epoch": 0.43314285714285716, + "grad_norm": 0.805682361125946, + "kl": 0.54296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0326, + "reward": 0.18402667669579387, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18402667669579387, + "reward_after_std": 0.8449318036437035, + "reward_before_mean": 0.49959153961390257, + "reward_before_std": 0.8714813143014908, + "reward_change_max": 0.0, + "reward_change_mean": -0.3155648708343506, + "reward_change_min": -0.7030841335654259, + "reward_change_std": 0.25640485249459743, + "reward_std": 0.8449318334460258, + "rewards/cosine_scaled_reward": -0.09395423531532288, + "rewards/format_reward": 0.6875000186264515, + "step": 379 + }, + { + "advantage_max": 1.1825830340385437, + "advantage_mean": -4.9670543234014986e-09, + "advantage_min": -0.6122145131230354, + "advantage_std": 0.6642481423914433, + "completion_length": 2171.0208587646484, + "epoch": 0.4342857142857143, + "grad_norm": 0.3546510934829712, + "kl": 0.1728668212890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0106, + "reward": 0.3683098815381527, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3683098815381527, + "reward_after_std": 0.6642481498420238, + "reward_before_mean": 0.7669425271451473, + "reward_before_std": 0.5984037779271603, + "reward_change_max": 0.00131244957447052, + "reward_change_mean": -0.3986326586455107, + "reward_change_min": -0.6843216829001904, + "reward_change_std": 0.25640712305903435, + "reward_std": 0.6642481610178947, + "rewards/cosine_scaled_reward": -0.022778733167797327, + "rewards/format_reward": 0.8125000037252903, + "step": 380 + }, + { + "advantage_max": 1.3350362554192543, + "advantage_mean": -1.179675274132208e-08, + "advantage_min": -0.7753116935491562, + "advantage_std": 0.7826059982180595, + "completion_length": 2961.8750610351562, + "epoch": 0.43542857142857144, + "grad_norm": 0.44929471611976624, + "kl": 0.3763427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0373, + "reward": 0.16396947141038254, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16396947141038254, + "reward_after_std": 0.7826059982180595, + "reward_before_mean": 0.47848290018737316, + "reward_before_std": 0.7885253168642521, + "reward_change_max": 0.0, + "reward_change_mean": -0.3145134476944804, + "reward_change_min": -0.6155954711139202, + "reward_change_std": 0.24165144469588995, + "reward_std": 0.7826060205698013, + "rewards/cosine_scaled_reward": -0.09409188944846392, + "rewards/format_reward": 0.6666666865348816, + "step": 381 + }, + { + "advantage_max": 0.8830334767699242, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.5198586732149124, + "advantage_std": 0.5014585591852665, + "completion_length": 2477.7500610351562, + "epoch": 0.43657142857142855, + "grad_norm": 0.9291367530822754, + "kl": 0.3695831298828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0188, + "reward": -0.028684318996965885, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.028684318996965885, + "reward_after_std": 0.5014585554599762, + "reward_before_mean": 0.24919481086544693, + "reward_before_std": 0.4676024615764618, + "reward_change_max": 0.0007557347416877747, + "reward_change_mean": -0.27787913754582405, + "reward_change_min": -0.47073232382535934, + "reward_change_std": 0.17706115450710058, + "reward_std": 0.5014585703611374, + "rewards/cosine_scaled_reward": -0.23998594097793102, + "rewards/format_reward": 0.7291666716337204, + "step": 382 + }, + { + "advantage_max": 1.5601003393530846, + "advantage_mean": -2.1109979209121832e-08, + "advantage_min": -0.9728534445166588, + "advantage_std": 0.9514635317027569, + "completion_length": 3067.5834045410156, + "epoch": 0.4377142857142857, + "grad_norm": 0.6326491832733154, + "kl": 0.30438232421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0372, + "reward": 0.42157111782580614, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.42157111782580614, + "reward_after_std": 0.9514635168015957, + "reward_before_mean": 0.8118895404040813, + "reward_before_std": 0.980067502707243, + "reward_change_max": 0.0, + "reward_change_mean": -0.3903184309601784, + "reward_change_min": -0.7474622651934624, + "reward_change_std": 0.29729065112769604, + "reward_std": 0.951463520526886, + "rewards/cosine_scaled_reward": 0.030944768339395523, + "rewards/format_reward": 0.7500000149011612, + "step": 383 + }, + { + "advantage_max": 1.471701368689537, + "advantage_mean": -2.7939678515220834e-08, + "advantage_min": -0.8781173340976238, + "advantage_std": 0.8436664417386055, + "completion_length": 1930.3750305175781, + "epoch": 0.43885714285714283, + "grad_norm": 0.5232311487197876, + "kl": 0.1711578369140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0307, + "reward": 0.804866848513484, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.804866848513484, + "reward_after_std": 0.8436664193868637, + "reward_before_mean": 1.334816126152873, + "reward_before_std": 0.76854282990098, + "reward_change_max": 0.0, + "reward_change_mean": -0.529949240386486, + "reward_change_min": -0.8661737963557243, + "reward_change_std": 0.3276502024382353, + "reward_std": 0.8436664417386055, + "rewards/cosine_scaled_reward": 0.28199137654155493, + "rewards/format_reward": 0.7708333507180214, + "step": 384 + }, + { + "advantage_max": 1.4069309905171394, + "advantage_mean": 0.0, + "advantage_min": -0.9303930997848511, + "advantage_std": 0.855097584426403, + "completion_length": 2643.875045776367, + "epoch": 0.44, + "grad_norm": 1.061144471168518, + "kl": 0.23193359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0402, + "reward": 0.3949592959834263, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3949592959834263, + "reward_after_std": 0.8550975695252419, + "reward_before_mean": 0.78522406402044, + "reward_before_std": 0.868790153414011, + "reward_change_max": 0.0, + "reward_change_mean": -0.39026473090052605, + "reward_change_min": -0.7018118984997272, + "reward_change_std": 0.28549752198159695, + "reward_std": 0.8550975993275642, + "rewards/cosine_scaled_reward": -0.0032213227823376656, + "rewards/format_reward": 0.7916666865348816, + "step": 385 + }, + { + "advantage_max": 1.5323580503463745, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.6946651190519333, + "advantage_std": 0.813526090234518, + "completion_length": 3014.1458892822266, + "epoch": 0.44114285714285717, + "grad_norm": 0.5251702666282654, + "kl": 0.3226318359375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.035, + "reward": 0.5693798456341028, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5693798456341028, + "reward_after_std": 0.8135260716080666, + "reward_before_mean": 1.0148430932313204, + "reward_before_std": 0.6952022239565849, + "reward_change_max": 0.0, + "reward_change_mean": -0.44546326249837875, + "reward_change_min": -0.6919324658811092, + "reward_change_std": 0.2599577587097883, + "reward_std": 0.8135261088609695, + "rewards/cosine_scaled_reward": 0.09075488056987524, + "rewards/format_reward": 0.833333358168602, + "step": 386 + }, + { + "advantage_max": 1.114190198481083, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.7234060317277908, + "advantage_std": 0.6508563421666622, + "completion_length": 2963.3334045410156, + "epoch": 0.4422857142857143, + "grad_norm": 0.4398019015789032, + "kl": 0.31781005859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0356, + "reward": 0.19776187278330326, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19776187278330326, + "reward_after_std": 0.6508563347160816, + "reward_before_mean": 0.5393041241914034, + "reward_before_std": 0.6317312866449356, + "reward_change_max": 0.0, + "reward_change_mean": -0.34154226537793875, + "reward_change_min": -0.5750756375491619, + "reward_change_std": 0.22550523653626442, + "reward_std": 0.6508563458919525, + "rewards/cosine_scaled_reward": -0.09493127651512623, + "rewards/format_reward": 0.7291666716337204, + "step": 387 + }, + { + "advantage_max": 1.4288556650280952, + "advantage_mean": -2.6697915267437367e-08, + "advantage_min": -0.7844956144690514, + "advantage_std": 0.7918211258947849, + "completion_length": 2707.7709045410156, + "epoch": 0.44342857142857145, + "grad_norm": 0.48590895533561707, + "kl": 0.3404541015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0593, + "reward": 0.569500157609582, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.569500157609582, + "reward_after_std": 0.7918211333453655, + "reward_before_mean": 1.0220210487022996, + "reward_before_std": 0.7196073681116104, + "reward_change_max": 0.0, + "reward_change_mean": -0.4525208752602339, + "reward_change_min": -0.7153974436223507, + "reward_change_std": 0.2721158731728792, + "reward_std": 0.7918211333453655, + "rewards/cosine_scaled_reward": 0.12559382850304246, + "rewards/format_reward": 0.7708333358168602, + "step": 388 + }, + { + "advantage_max": 1.5086243450641632, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.8356839530169964, + "advantage_std": 0.8504573740065098, + "completion_length": 2847.8125915527344, + "epoch": 0.44457142857142856, + "grad_norm": 0.5012574791908264, + "kl": 0.409698486328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0629, + "reward": 0.3996206484735012, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3996206484735012, + "reward_after_std": 0.8504573740065098, + "reward_before_mean": 0.7850042954087257, + "reward_before_std": 0.8103694636374712, + "reward_change_max": 0.0005656033754348755, + "reward_change_mean": -0.38538364320993423, + "reward_change_min": -0.6802755519747734, + "reward_change_std": 0.26358477119356394, + "reward_std": 0.8504573963582516, + "rewards/cosine_scaled_reward": 0.007085463672410697, + "rewards/format_reward": 0.770833345130086, + "step": 389 + }, + { + "advantage_max": 1.4127248227596283, + "advantage_mean": -2.235174201281609e-08, + "advantage_min": -0.7147350683808327, + "advantage_std": 0.7942103296518326, + "completion_length": 2742.000068664551, + "epoch": 0.44571428571428573, + "grad_norm": 0.8033288717269897, + "kl": 0.3431396484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0074, + "reward": 0.2416452246834524, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2416452246834524, + "reward_after_std": 0.7942103520035744, + "reward_before_mean": 0.5779923680238426, + "reward_before_std": 0.7717950865626335, + "reward_change_max": 0.0, + "reward_change_mean": -0.33634717389941216, + "reward_change_min": -0.6063726805150509, + "reward_change_std": 0.2278979104012251, + "reward_std": 0.794210359454155, + "rewards/cosine_scaled_reward": -0.06517048925161362, + "rewards/format_reward": 0.7083333395421505, + "step": 390 + }, + { + "advantage_max": 1.413565844297409, + "advantage_mean": -4.346172144398253e-09, + "advantage_min": -0.6925171725451946, + "advantage_std": 0.7850869931280613, + "completion_length": 2662.6042251586914, + "epoch": 0.44685714285714284, + "grad_norm": 0.9450730681419373, + "kl": 0.368194580078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.2412266235313973e-07, + "loss": -0.005, + "reward": 0.48758782213553786, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.48758782213553786, + "reward_after_std": 0.7850869931280613, + "reward_before_mean": 0.9113508481532335, + "reward_before_std": 0.7137027382850647, + "reward_change_max": 0.0, + "reward_change_mean": -0.42376305535435677, + "reward_change_min": -0.7457866631448269, + "reward_change_std": 0.2688668370246887, + "reward_std": 0.7850870080292225, + "rewards/cosine_scaled_reward": 0.05984208732843399, + "rewards/format_reward": 0.7916666828095913, + "step": 391 + }, + { + "advantage_max": 1.374567873775959, + "advantage_mean": -3.72529057601767e-09, + "advantage_min": -0.8890761323273182, + "advantage_std": 0.8275346420705318, + "completion_length": 2413.895881652832, + "epoch": 0.448, + "grad_norm": 1.1826577186584473, + "kl": 0.32952880859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0639, + "reward": 0.22747798508498818, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.22747798508498818, + "reward_after_std": 0.827534630894661, + "reward_before_mean": 0.5611926540732384, + "reward_before_std": 0.8485117331147194, + "reward_change_max": 0.0010716915130615234, + "reward_change_mean": -0.3337146546691656, + "reward_change_min": -0.670333344489336, + "reward_change_std": 0.26952179335057735, + "reward_std": 0.8275346383452415, + "rewards/cosine_scaled_reward": -0.115237015648745, + "rewards/format_reward": 0.791666679084301, + "step": 392 + }, + { + "advantage_max": 1.9456132277846336, + "advantage_mean": -8.07146260939362e-09, + "advantage_min": -1.0318707302212715, + "advantage_std": 1.0898070186376572, + "completion_length": 2754.8750915527344, + "epoch": 0.4491428571428571, + "grad_norm": 0.5890648365020752, + "kl": 0.322509765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0249, + "reward": 0.5918875364586711, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5918875364586711, + "reward_after_std": 1.089807003736496, + "reward_before_mean": 1.0142197906970978, + "reward_before_std": 1.0668868571519852, + "reward_change_max": 0.0, + "reward_change_mean": -0.42233227752149105, + "reward_change_min": -0.8279923424124718, + "reward_change_std": 0.2987551037222147, + "reward_std": 1.0898070484399796, + "rewards/cosine_scaled_reward": 0.09044323640409857, + "rewards/format_reward": 0.8333333488553762, + "step": 393 + }, + { + "advantage_max": 1.3842891454696655, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.7427752874791622, + "advantage_std": 0.7568569779396057, + "completion_length": 3218.3959045410156, + "epoch": 0.4502857142857143, + "grad_norm": 1.3154881000518799, + "kl": 0.46435546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0223, + "reward": 0.07046335702762008, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07046335702762008, + "reward_after_std": 0.7568569853901863, + "reward_before_mean": 0.3476231265813112, + "reward_before_std": 0.7233721874654293, + "reward_change_max": 0.00036607682704925537, + "reward_change_mean": -0.2771597858518362, + "reward_change_min": -0.5230727456510067, + "reward_change_std": 0.19481442868709564, + "reward_std": 0.7568569853901863, + "rewards/cosine_scaled_reward": -0.16993844415992498, + "rewards/format_reward": 0.6875000074505806, + "step": 394 + }, + { + "advantage_max": 1.5892572775483131, + "advantage_mean": -2.4214388327781222e-08, + "advantage_min": -0.9472315311431885, + "advantage_std": 0.9160324931144714, + "completion_length": 2425.645896911621, + "epoch": 0.4514285714285714, + "grad_norm": 0.4146610200405121, + "kl": 0.232391357421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.023, + "reward": 0.6786849615164101, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6786849615164101, + "reward_after_std": 0.9160324931144714, + "reward_before_mean": 1.1544897370040417, + "reward_before_std": 0.8708195760846138, + "reward_change_max": 3.794580698013306e-05, + "reward_change_mean": -0.47580479457974434, + "reward_change_min": -0.8236850872635841, + "reward_change_std": 0.31363353319466114, + "reward_std": 0.9160325489938259, + "rewards/cosine_scaled_reward": 0.1501615282613784, + "rewards/format_reward": 0.8541666865348816, + "step": 395 + }, + { + "advantage_max": 1.145592711865902, + "advantage_mean": -4.2219958085176756e-08, + "advantage_min": -0.6616269759833813, + "advantage_std": 0.64500567689538, + "completion_length": 2860.7083740234375, + "epoch": 0.45257142857142857, + "grad_norm": 0.9409205317497253, + "kl": 0.27783203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.134908592756607e-07, + "loss": -0.0031, + "reward": 0.48085956354043446, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.48085956354043446, + "reward_after_std": 0.6450056806206703, + "reward_before_mean": 0.9201758950948715, + "reward_before_std": 0.5669548735022545, + "reward_change_max": 0.00025184452533721924, + "reward_change_mean": -0.4393163565546274, + "reward_change_min": -0.6854121945798397, + "reward_change_std": 0.2582757193595171, + "reward_std": 0.6450057104229927, + "rewards/cosine_scaled_reward": 0.012171266600489616, + "rewards/format_reward": 0.8958333432674408, + "step": 396 + }, + { + "advantage_max": 1.4064282700419426, + "advantage_mean": 1.5522043095295146e-09, + "advantage_min": -0.8187252506613731, + "advantage_std": 0.7925236187875271, + "completion_length": 2702.3333892822266, + "epoch": 0.45371428571428574, + "grad_norm": 0.35432639718055725, + "kl": 0.3092041015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0292, + "reward": 0.27529994398355484, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.27529994398355484, + "reward_after_std": 0.7925235889852047, + "reward_before_mean": 0.6220556901535019, + "reward_before_std": 0.7582901753485203, + "reward_change_max": 0.0, + "reward_change_mean": -0.3467557244002819, + "reward_change_min": -0.5839594602584839, + "reward_change_std": 0.23202148266136646, + "reward_std": 0.7925236150622368, + "rewards/cosine_scaled_reward": -0.10563884908333421, + "rewards/format_reward": 0.8333333432674408, + "step": 397 + }, + { + "advantage_max": 1.1642621606588364, + "advantage_mean": 5.5879355587151736e-09, + "advantage_min": -0.6164032593369484, + "advantage_std": 0.6434980258345604, + "completion_length": 2782.166717529297, + "epoch": 0.45485714285714285, + "grad_norm": 0.5750638842582703, + "kl": 0.3607177734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0161, + "reward": 0.20992313139140606, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20992313139140606, + "reward_after_std": 0.6434980146586895, + "reward_before_mean": 0.5533581459894776, + "reward_before_std": 0.5759002082049847, + "reward_change_max": 0.0002307295799255371, + "reward_change_mean": -0.343435013666749, + "reward_change_min": -0.5538477338850498, + "reward_change_std": 0.21595249138772488, + "reward_std": 0.6434980481863022, + "rewards/cosine_scaled_reward": -0.10873759724199772, + "rewards/format_reward": 0.7708333488553762, + "step": 398 + }, + { + "advantage_max": 1.4001071378588676, + "advantage_mean": -2.2972624635908545e-08, + "advantage_min": -0.8582115992903709, + "advantage_std": 0.8050716407597065, + "completion_length": 2384.500045776367, + "epoch": 0.456, + "grad_norm": 0.9607195258140564, + "kl": 0.248504638671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0479, + "reward": 0.5995660796470474, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5995660796470474, + "reward_after_std": 0.8050716295838356, + "reward_before_mean": 1.0644009187817574, + "reward_before_std": 0.7492502890527248, + "reward_change_max": 0.0006214603781700134, + "reward_change_mean": -0.46483487263321877, + "reward_change_min": -0.728652473539114, + "reward_change_std": 0.291376868262887, + "reward_std": 0.8050716482102871, + "rewards/cosine_scaled_reward": 0.07386713265441358, + "rewards/format_reward": 0.9166666865348816, + "step": 399 + }, + { + "advantage_max": 1.5142693221569061, + "advantage_mean": -2.7318796558262193e-08, + "advantage_min": -0.7139600031077862, + "advantage_std": 0.8169923797249794, + "completion_length": 1973.4167175292969, + "epoch": 0.45714285714285713, + "grad_norm": 0.6028825044631958, + "kl": 0.140106201171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.0066, + "reward": 0.8798388665309176, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8798388665309176, + "reward_after_std": 0.8169923909008503, + "reward_before_mean": 1.4358258470892906, + "reward_before_std": 0.6879842728376389, + "reward_change_max": 0.0, + "reward_change_mean": -0.5559869892895222, + "reward_change_min": -0.8480464890599251, + "reward_change_std": 0.3101024003699422, + "reward_std": 0.8169924318790436, + "rewards/cosine_scaled_reward": 0.26999625749886036, + "rewards/format_reward": 0.8958333395421505, + "step": 400 + }, + { + "advantage_max": 1.2209898084402084, + "advantage_mean": -1.055498977109437e-08, + "advantage_min": -0.622258760035038, + "advantage_std": 0.6894869990646839, + "completion_length": 2769.64591217041, + "epoch": 0.4582857142857143, + "grad_norm": 0.736003041267395, + "kl": 0.2721405029296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0017, + "reward": 0.48054402810521424, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.48054402810521424, + "reward_after_std": 0.6894869841635227, + "reward_before_mean": 0.9125964008271694, + "reward_before_std": 0.6117764301598072, + "reward_change_max": 0.0, + "reward_change_mean": -0.43205239437520504, + "reward_change_min": -0.7335581555962563, + "reward_change_std": 0.2669758554548025, + "reward_std": 0.6894870065152645, + "rewards/cosine_scaled_reward": 0.10213153855875134, + "rewards/format_reward": 0.7083333432674408, + "step": 401 + }, + { + "advantage_max": 1.2742392867803574, + "advantage_mean": 5.587935669737476e-09, + "advantage_min": -0.7246151715517044, + "advantage_std": 0.744117021560669, + "completion_length": 2256.979217529297, + "epoch": 0.4594285714285714, + "grad_norm": 1.0698230266571045, + "kl": 0.2095489501953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0454, + "reward": 0.16344716679304838, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16344716679304838, + "reward_after_std": 0.7441170252859592, + "reward_before_mean": 0.4758140491321683, + "reward_before_std": 0.7317548841238022, + "reward_change_max": 0.00041125714778900146, + "reward_change_mean": -0.31236686930060387, + "reward_change_min": -0.6093603521585464, + "reward_change_std": 0.24216584488749504, + "reward_std": 0.7441170550882816, + "rewards/cosine_scaled_reward": -0.08500966196879745, + "rewards/format_reward": 0.6458333525806665, + "step": 402 + }, + { + "advantage_max": 1.4515544027090073, + "advantage_mean": 9.934108091691485e-09, + "advantage_min": -0.7735986150801182, + "advantage_std": 0.8350824005901814, + "completion_length": 2176.916736602783, + "epoch": 0.4605714285714286, + "grad_norm": 0.7707470655441284, + "kl": 0.15277099609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0315, + "reward": 0.4309818516485393, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4309818516485393, + "reward_after_std": 0.835082408040762, + "reward_before_mean": 0.8304437976330519, + "reward_before_std": 0.8025292456150055, + "reward_change_max": 0.0, + "reward_change_mean": -0.3994619268923998, + "reward_change_min": -0.7102017849683762, + "reward_change_std": 0.27272626385092735, + "reward_std": 0.8350824564695358, + "rewards/cosine_scaled_reward": 0.019388556480407715, + "rewards/format_reward": 0.7916666753590107, + "step": 403 + }, + { + "advantage_max": 0.9113276973366737, + "advantage_mean": -1.241763458725842e-08, + "advantage_min": -0.5583123937249184, + "advantage_std": 0.5214662775397301, + "completion_length": 2384.7500534057617, + "epoch": 0.4617142857142857, + "grad_norm": 0.2345946878194809, + "kl": 0.2875099182128906, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0314, + "reward": 0.3233966355910525, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3233966355910525, + "reward_after_std": 0.521466288715601, + "reward_before_mean": 0.7204841803759336, + "reward_before_std": 0.4442944601178169, + "reward_change_max": 0.00019099563360214233, + "reward_change_mean": -0.3970875274389982, + "reward_change_min": -0.6074875667691231, + "reward_change_std": 0.23336594179272652, + "reward_std": 0.5214662961661816, + "rewards/cosine_scaled_reward": -0.025174589827656746, + "rewards/format_reward": 0.7708333544433117, + "step": 404 + }, + { + "advantage_max": 1.4360693022608757, + "advantage_mean": -2.7318797002351403e-08, + "advantage_min": -0.9703242368996143, + "advantage_std": 0.8638960532844067, + "completion_length": 2237.750030517578, + "epoch": 0.46285714285714286, + "grad_norm": 0.9567005038261414, + "kl": 0.2099609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0385, + "reward": 0.5817769723944366, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5817769723944366, + "reward_after_std": 0.8638960495591164, + "reward_before_mean": 1.034262259490788, + "reward_before_std": 0.8446610942482948, + "reward_change_max": 0.0016998574137687683, + "reward_change_mean": -0.4524852652102709, + "reward_change_min": -0.7840995192527771, + "reward_change_std": 0.3182655703276396, + "reward_std": 0.8638960756361485, + "rewards/cosine_scaled_reward": 0.10046443715691566, + "rewards/format_reward": 0.8333333507180214, + "step": 405 + }, + { + "advantage_max": 1.579980731010437, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -0.9753882475197315, + "advantage_std": 0.9244318529963493, + "completion_length": 2248.2500610351562, + "epoch": 0.464, + "grad_norm": 1.3583718538284302, + "kl": 0.151763916015625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0449, + "reward": 0.6996534131467342, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6996534131467342, + "reward_after_std": 0.9244318082928658, + "reward_before_mean": 1.1870945319533348, + "reward_before_std": 0.8938433974981308, + "reward_change_max": 0.0, + "reward_change_mean": -0.48744115233421326, + "reward_change_min": -0.8586244881153107, + "reward_change_std": 0.33273613080382347, + "reward_std": 0.924431823194027, + "rewards/cosine_scaled_reward": 0.1352139227092266, + "rewards/format_reward": 0.916666679084301, + "step": 406 + }, + { + "advantage_max": 1.241002269089222, + "advantage_mean": -1.1796752352744022e-08, + "advantage_min": -0.8528676331043243, + "advantage_std": 0.7451582886278629, + "completion_length": 2460.3334045410156, + "epoch": 0.46514285714285714, + "grad_norm": 0.8853124380111694, + "kl": 0.1806640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0284, + "reward": 0.5823654560372233, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5823654560372233, + "reward_after_std": 0.745158277451992, + "reward_before_mean": 1.0513971992768347, + "reward_before_std": 0.7118194662034512, + "reward_change_max": 0.0, + "reward_change_mean": -0.46903173439204693, + "reward_change_min": -0.7383575513958931, + "reward_change_std": 0.2971672974526882, + "reward_std": 0.745158277451992, + "rewards/cosine_scaled_reward": 0.12986523960717022, + "rewards/format_reward": 0.7916666828095913, + "step": 407 + }, + { + "advantage_max": 1.7559831738471985, + "advantage_mean": -3.973643192267673e-08, + "advantage_min": -0.9686634726822376, + "advantage_std": 0.9539515525102615, + "completion_length": 2662.1250610351562, + "epoch": 0.4662857142857143, + "grad_norm": 1.44785737991333, + "kl": 0.2345733642578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0491, + "reward": 0.7052465332672, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7052465332672, + "reward_after_std": 0.9539515525102615, + "reward_before_mean": 1.1805474273860455, + "reward_before_std": 0.8772687762975693, + "reward_change_max": 0.0, + "reward_change_mean": -0.4753008782863617, + "reward_change_min": -0.7417329847812653, + "reward_change_std": 0.2891210447996855, + "reward_std": 0.9539515823125839, + "rewards/cosine_scaled_reward": 0.18402369134128094, + "rewards/format_reward": 0.8125000204890966, + "step": 408 + }, + { + "advantage_max": 1.1871024146676064, + "advantage_mean": -9.31322596819939e-09, + "advantage_min": -0.7041791677474976, + "advantage_std": 0.719702310860157, + "completion_length": 3173.3959350585938, + "epoch": 0.4674285714285714, + "grad_norm": 0.584197998046875, + "kl": 0.3837890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0472, + "reward": 0.2377706104889512, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2377706104889512, + "reward_after_std": 0.7197023220360279, + "reward_before_mean": 0.5875239875167608, + "reward_before_std": 0.721150603145361, + "reward_change_max": 0.00040981173515319824, + "reward_change_mean": -0.34975339006632566, + "reward_change_min": -0.6922573670744896, + "reward_change_std": 0.2620809990912676, + "reward_std": 0.7197023443877697, + "rewards/cosine_scaled_reward": 0.002095327712595463, + "rewards/format_reward": 0.5833333395421505, + "step": 409 + }, + { + "advantage_max": 1.271196611225605, + "advantage_mean": -6.208817571184966e-09, + "advantage_min": -0.6538401357829571, + "advantage_std": 0.6966192908585072, + "completion_length": 2520.9792098999023, + "epoch": 0.4685714285714286, + "grad_norm": 0.32509198784828186, + "kl": 0.3772735595703125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0199, + "reward": 0.250648136716336, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.250648136716336, + "reward_after_std": 0.6966192834079266, + "reward_before_mean": 0.5946154878474772, + "reward_before_std": 0.6319909431040287, + "reward_change_max": 0.0, + "reward_change_mean": -0.343967342749238, + "reward_change_min": -0.5719360187649727, + "reward_change_std": 0.21776084788143635, + "reward_std": 0.6966192871332169, + "rewards/cosine_scaled_reward": -0.03602561820298433, + "rewards/format_reward": 0.6666666679084301, + "step": 410 + }, + { + "advantage_max": 1.2601307108998299, + "advantage_mean": -2.7939678626243136e-09, + "advantage_min": -0.8391876332461834, + "advantage_std": 0.7550375498831272, + "completion_length": 2947.6875915527344, + "epoch": 0.4697142857142857, + "grad_norm": 0.7180549502372742, + "kl": 0.292205810546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0225, + "reward": 0.5333940163254738, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5333940163254738, + "reward_after_std": 0.7550375796854496, + "reward_before_mean": 0.9847032781690359, + "reward_before_std": 0.7263107262551785, + "reward_change_max": 0.0003239363431930542, + "reward_change_mean": -0.4513092339038849, + "reward_change_min": -0.7858996503055096, + "reward_change_std": 0.2997463811188936, + "reward_std": 0.7550375871360302, + "rewards/cosine_scaled_reward": 0.07568495441228151, + "rewards/format_reward": 0.8333333432674408, + "step": 411 + }, + { + "advantage_max": 1.1648833081126213, + "advantage_mean": -1.8626452047421083e-08, + "advantage_min": -0.8400361612439156, + "advantage_std": 0.707458071410656, + "completion_length": 2883.291717529297, + "epoch": 0.47085714285714286, + "grad_norm": 0.45658764243125916, + "kl": 0.254974365234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0437, + "reward": 0.44985311944037676, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.44985311944037676, + "reward_after_std": 0.707458071410656, + "reward_before_mean": 0.8784957565367222, + "reward_before_std": 0.6848769150674343, + "reward_change_max": 4.801899194717407e-05, + "reward_change_mean": -0.42864260636270046, + "reward_change_min": -0.6926585100591183, + "reward_change_std": 0.28065695613622665, + "reward_std": 0.7074581012129784, + "rewards/cosine_scaled_reward": 0.05383117590099573, + "rewards/format_reward": 0.7708333469927311, + "step": 412 + }, + { + "advantage_max": 1.3362039625644684, + "advantage_mean": -2.1109979431166437e-08, + "advantage_min": -0.8293716721236706, + "advantage_std": 0.7513575069606304, + "completion_length": 2932.416748046875, + "epoch": 0.472, + "grad_norm": 0.4243045747280121, + "kl": 0.34423828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0409, + "reward": 0.46855833008885384, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46855833008885384, + "reward_after_std": 0.751357514411211, + "reward_before_mean": 0.8912516199052334, + "reward_before_std": 0.6947352215647697, + "reward_change_max": 0.0, + "reward_change_mean": -0.42269331961870193, + "reward_change_min": -0.6819902062416077, + "reward_change_std": 0.2646718043833971, + "reward_std": 0.7513575218617916, + "rewards/cosine_scaled_reward": 0.11229247087612748, + "rewards/format_reward": 0.6666666734963655, + "step": 413 + }, + { + "advantage_max": 1.4965841621160507, + "advantage_mean": -7.45058115203534e-09, + "advantage_min": -0.9178449511528015, + "advantage_std": 0.8657373040914536, + "completion_length": 3184.791717529297, + "epoch": 0.47314285714285714, + "grad_norm": 0.5575371384620667, + "kl": 0.42626953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.04, + "reward": 0.11716062761843204, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11716062761843204, + "reward_after_std": 0.8657373264431953, + "reward_before_mean": 0.401735796011053, + "reward_before_std": 0.8855462893843651, + "reward_change_max": 0.0011830329895019531, + "reward_change_mean": -0.2845751619897783, + "reward_change_min": -0.5535517930984497, + "reward_change_std": 0.2295767618343234, + "reward_std": 0.8657373674213886, + "rewards/cosine_scaled_reward": -0.10121545614674687, + "rewards/format_reward": 0.6041666883975267, + "step": 414 + }, + { + "advantage_max": 1.6141176372766495, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -1.0077509805560112, + "advantage_std": 0.9576461650431156, + "completion_length": 3334.0000610351562, + "epoch": 0.4742857142857143, + "grad_norm": 0.752418577671051, + "kl": 0.502197265625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0569, + "reward": 0.3126736783888191, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3126736783888191, + "reward_after_std": 0.9576461650431156, + "reward_before_mean": 0.6598517000675201, + "reward_before_std": 0.989838071167469, + "reward_change_max": 0.0, + "reward_change_mean": -0.3471780326217413, + "reward_change_min": -0.6480702608823776, + "reward_change_std": 0.27285377867519855, + "reward_std": 0.9576461873948574, + "rewards/cosine_scaled_reward": -0.03465749090537429, + "rewards/format_reward": 0.7291666865348816, + "step": 415 + }, + { + "advantage_max": 1.2872228100895882, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -0.7412748411297798, + "advantage_std": 0.7350778169929981, + "completion_length": 2092.854232788086, + "epoch": 0.4754285714285714, + "grad_norm": 0.7298946976661682, + "kl": 0.1840667724609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0363, + "reward": 0.5364300422370434, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5364300422370434, + "reward_after_std": 0.7350778095424175, + "reward_before_mean": 0.9861254207789898, + "reward_before_std": 0.6707355920225382, + "reward_change_max": 0.0, + "reward_change_mean": -0.44969535805284977, + "reward_change_min": -0.7272941693663597, + "reward_change_std": 0.27874294482171535, + "reward_std": 0.7350778356194496, + "rewards/cosine_scaled_reward": 0.07639601826667786, + "rewards/format_reward": 0.8333333414047956, + "step": 416 + }, + { + "advantage_max": 1.5041983649134636, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.8219343274831772, + "advantage_std": 0.8873535022139549, + "completion_length": 3249.375030517578, + "epoch": 0.4765714285714286, + "grad_norm": 0.6985993981361389, + "kl": 0.469482421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0623, + "reward": 0.0966934897005558, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0966934897005558, + "reward_after_std": 0.887353528290987, + "reward_before_mean": 0.3739356682635844, + "reward_before_std": 0.9190301541239023, + "reward_change_max": 0.0009882226586341858, + "reward_change_mean": -0.27724218368530273, + "reward_change_min": -0.6620569825172424, + "reward_change_std": 0.260542631149292, + "reward_std": 0.8873535580933094, + "rewards/cosine_scaled_reward": -0.12553217657841742, + "rewards/format_reward": 0.6250000074505806, + "step": 417 + }, + { + "advantage_max": 1.4096960350871086, + "advantage_mean": -3.725290376177526e-08, + "advantage_min": -0.8126649931073189, + "advantage_std": 0.8450727388262749, + "completion_length": 2094.7292098999023, + "epoch": 0.4777142857142857, + "grad_norm": 1.2689417600631714, + "kl": 0.41217803955078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.7174502842694212e-07, + "loss": -0.0456, + "reward": 0.4172011539340019, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4172011539340019, + "reward_after_std": 0.8450727574527264, + "reward_before_mean": 0.8055177573114634, + "reward_before_std": 0.8436265364289284, + "reward_change_max": 0.0004687011241912842, + "reward_change_mean": -0.3883166164159775, + "reward_change_min": -0.7676271609961987, + "reward_change_std": 0.29594606440514326, + "reward_std": 0.8450727611780167, + "rewards/cosine_scaled_reward": 0.05900887493044138, + "rewards/format_reward": 0.6875000149011612, + "step": 418 + }, + { + "advantage_max": 1.4702820256352425, + "advantage_mean": 3.725290298461914e-09, + "advantage_min": -0.8695723973214626, + "advantage_std": 0.8322137109935284, + "completion_length": 2882.604278564453, + "epoch": 0.47885714285714287, + "grad_norm": 1.4612587690353394, + "kl": 0.684814453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0799, + "reward": 0.5663878936320543, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5663878936320543, + "reward_after_std": 0.832213718444109, + "reward_before_mean": 1.0104223068337888, + "reward_before_std": 0.7705448046326637, + "reward_change_max": 0.000252746045589447, + "reward_change_mean": -0.4440344120375812, + "reward_change_min": -0.7140844948589802, + "reward_change_std": 0.28817459661513567, + "reward_std": 0.8322137407958508, + "rewards/cosine_scaled_reward": 0.14062782609835267, + "rewards/format_reward": 0.7291666716337204, + "step": 419 + }, + { + "advantage_max": 1.3699936755001545, + "advantage_mean": -2.048909719665204e-08, + "advantage_min": -0.8499936163425446, + "advantage_std": 0.8008501157164574, + "completion_length": 2282.854217529297, + "epoch": 0.48, + "grad_norm": 0.8160320520401001, + "kl": 0.307464599609375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0613, + "reward": 0.33678290247917175, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33678290247917175, + "reward_after_std": 0.8008501008152962, + "reward_before_mean": 0.707935786806047, + "reward_before_std": 0.7854114696383476, + "reward_change_max": 0.0, + "reward_change_mean": -0.37115289457142353, + "reward_change_min": -0.6540162637829781, + "reward_change_std": 0.261508384719491, + "reward_std": 0.8008501194417477, + "rewards/cosine_scaled_reward": -0.031448788940906525, + "rewards/format_reward": 0.7708333544433117, + "step": 420 + }, + { + "advantage_max": 1.5776860639452934, + "advantage_mean": -1.3038516433194758e-08, + "advantage_min": -0.8760270141065121, + "advantage_std": 0.8757519386708736, + "completion_length": 3130.416748046875, + "epoch": 0.48114285714285715, + "grad_norm": 1.04563570022583, + "kl": 0.467529296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0216, + "reward": 0.24572166753932834, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24572166753932834, + "reward_after_std": 0.8757519423961639, + "reward_before_mean": 0.5705269044265151, + "reward_before_std": 0.8527628295123577, + "reward_change_max": 0.0, + "reward_change_mean": -0.3248052569106221, + "reward_change_min": -0.5897286981344223, + "reward_change_std": 0.23443656880408525, + "reward_std": 0.8757519759237766, + "rewards/cosine_scaled_reward": -0.06890320917591453, + "rewards/format_reward": 0.7083333488553762, + "step": 421 + }, + { + "advantage_max": 1.361400119960308, + "advantage_mean": -1.2417634420724966e-08, + "advantage_min": -0.6548620238900185, + "advantage_std": 0.7440179772675037, + "completion_length": 2844.229248046875, + "epoch": 0.48228571428571426, + "grad_norm": 0.553774893283844, + "kl": 0.36322021484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0398, + "reward": 0.4359323289245367, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4359323289245367, + "reward_after_std": 0.7440179847180843, + "reward_before_mean": 0.8442143835127354, + "reward_before_std": 0.6683123558759689, + "reward_change_max": 0.0007221251726150513, + "reward_change_mean": -0.40828206948935986, + "reward_change_min": -0.6951416172087193, + "reward_change_std": 0.2510274276137352, + "reward_std": 0.7440180070698261, + "rewards/cosine_scaled_reward": -0.015392808709293604, + "rewards/format_reward": 0.8750000111758709, + "step": 422 + }, + { + "advantage_max": 1.1223453134298325, + "advantage_mean": 9.62366689116756e-09, + "advantage_min": -0.7540298514068127, + "advantage_std": 0.6678426675498486, + "completion_length": 2842.916793823242, + "epoch": 0.48342857142857143, + "grad_norm": 0.4749550223350525, + "kl": 0.377349853515625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0351, + "reward": 0.1393016508081928, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1393016508081928, + "reward_after_std": 0.6678426824510098, + "reward_before_mean": 0.45855964068323374, + "reward_before_std": 0.6641668677330017, + "reward_change_max": 0.0, + "reward_change_mean": -0.31925799883902073, + "reward_change_min": -0.5939181633293629, + "reward_change_std": 0.23012700304389, + "reward_std": 0.6678427010774612, + "rewards/cosine_scaled_reward": -0.14572018571197987, + "rewards/format_reward": 0.7500000149011612, + "step": 423 + }, + { + "advantage_max": 1.4285841286182404, + "advantage_mean": -4.346172144398253e-09, + "advantage_min": -0.8253460563719273, + "advantage_std": 0.8169199749827385, + "completion_length": 3067.5000610351562, + "epoch": 0.4845714285714286, + "grad_norm": 0.528998613357544, + "kl": 0.48583984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0324, + "reward": 0.16846600966528058, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16846600966528058, + "reward_after_std": 0.8169199749827385, + "reward_before_mean": 0.476221552118659, + "reward_before_std": 0.8139840699732304, + "reward_change_max": 0.0, + "reward_change_mean": -0.3077555485069752, + "reward_change_min": -0.5694200806319714, + "reward_change_std": 0.22507017478346825, + "reward_std": 0.81691999360919, + "rewards/cosine_scaled_reward": -0.10563923278823495, + "rewards/format_reward": 0.6875000260770321, + "step": 424 + }, + { + "advantage_max": 1.5528990626335144, + "advantage_mean": -4.221995819619906e-08, + "advantage_min": -1.2289166450500488, + "advantage_std": 0.9778655990958214, + "completion_length": 1963.1250610351562, + "epoch": 0.4857142857142857, + "grad_norm": 1.2701576948165894, + "kl": 0.180633544921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0464, + "reward": 0.9164818078279495, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9164818078279495, + "reward_after_std": 0.9778655841946602, + "reward_before_mean": 1.4825816750526428, + "reward_before_std": 0.9932750500738621, + "reward_change_max": 0.0, + "reward_change_mean": -0.5660999063402414, + "reward_change_min": -0.950649194419384, + "reward_change_std": 0.3851541206240654, + "reward_std": 0.9778655916452408, + "rewards/cosine_scaled_reward": 0.3142075026407838, + "rewards/format_reward": 0.8541666828095913, + "step": 425 + }, + { + "advantage_max": 1.4850023314356804, + "advantage_mean": -1.6142924885720333e-08, + "advantage_min": -0.8387530986219645, + "advantage_std": 0.8622367791831493, + "completion_length": 2385.000045776367, + "epoch": 0.4868571428571429, + "grad_norm": 0.5399412512779236, + "kl": 0.41119384765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0495, + "reward": 0.41559531493112445, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41559531493112445, + "reward_after_std": 0.8622367866337299, + "reward_before_mean": 0.8041009623557329, + "reward_before_std": 0.8447239138185978, + "reward_change_max": 9.606033563613892e-05, + "reward_change_mean": -0.3885056748986244, + "reward_change_min": -0.6781072355806828, + "reward_change_std": 0.2709171436727047, + "reward_std": 0.8622367903590202, + "rewards/cosine_scaled_reward": -0.00419950857758522, + "rewards/format_reward": 0.8125000111758709, + "step": 426 + }, + { + "advantage_max": 1.783732920885086, + "advantage_mean": -2.220446049250313e-16, + "advantage_min": -1.004781313240528, + "advantage_std": 1.019487425684929, + "completion_length": 2844.291717529297, + "epoch": 0.488, + "grad_norm": 1.7400785684585571, + "kl": 0.4984130859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.102, + "reward": 0.2693352377973497, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2693352377973497, + "reward_after_std": 1.0194874107837677, + "reward_before_mean": 0.5815716478973627, + "reward_before_std": 1.039123497903347, + "reward_change_max": 0.0, + "reward_change_mean": -0.3122364245355129, + "reward_change_min": -0.6392947360873222, + "reward_change_std": 0.2599599938839674, + "reward_std": 1.0194874852895737, + "rewards/cosine_scaled_reward": -0.04254750721156597, + "rewards/format_reward": 0.6666666865348816, + "step": 427 + }, + { + "advantage_max": 1.3194306641817093, + "advantage_mean": -6.829699084054397e-09, + "advantage_min": -0.857621468603611, + "advantage_std": 0.7734849788248539, + "completion_length": 2835.6667098999023, + "epoch": 0.48914285714285716, + "grad_norm": 0.7393080592155457, + "kl": 0.417510986328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0293, + "reward": 0.28968586708651856, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.28968586708651856, + "reward_after_std": 0.7734849452972412, + "reward_before_mean": 0.6473546512424946, + "reward_before_std": 0.7677021622657776, + "reward_change_max": 0.0, + "reward_change_mean": -0.35766879096627235, + "reward_change_min": -0.6340950168669224, + "reward_change_std": 0.24785144068300724, + "reward_std": 0.7734849527478218, + "rewards/cosine_scaled_reward": -0.11382270202739164, + "rewards/format_reward": 0.8750000223517418, + "step": 428 + }, + { + "advantage_max": 0.9747552573680878, + "advantage_mean": 9.62366689116756e-09, + "advantage_min": -0.5368304066359997, + "advantage_std": 0.5519351437687874, + "completion_length": 2275.416763305664, + "epoch": 0.49028571428571427, + "grad_norm": 1.0370701551437378, + "kl": 0.2674560546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.5415814221002265e-07, + "loss": -0.0058, + "reward": 0.38617168786004186, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.38617168786004186, + "reward_after_std": 0.5519351437687874, + "reward_before_mean": 0.8017744198441505, + "reward_before_std": 0.4633374772965908, + "reward_change_max": 0.0, + "reward_change_mean": -0.4156027212738991, + "reward_change_min": -0.6280790641903877, + "reward_change_std": 0.24106368236243725, + "reward_std": 0.5519351549446583, + "rewards/cosine_scaled_reward": -0.057446133345365524, + "rewards/format_reward": 0.9166666679084301, + "step": 429 + }, + { + "advantage_max": 1.1260406970977783, + "advantage_mean": -1.4901161971003773e-08, + "advantage_min": -0.7276361249387264, + "advantage_std": 0.6451876908540726, + "completion_length": 2481.750030517578, + "epoch": 0.49142857142857144, + "grad_norm": 0.5541008710861206, + "kl": 0.3700408935546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0265, + "reward": 0.3534410297870636, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3534410297870636, + "reward_after_std": 0.6451876983046532, + "reward_before_mean": 0.7495078779757023, + "reward_before_std": 0.5930134803056717, + "reward_change_max": 0.0, + "reward_change_mean": -0.3960668696090579, + "reward_change_min": -0.6170521266758442, + "reward_change_std": 0.2415135744959116, + "reward_std": 0.645187720656395, + "rewards/cosine_scaled_reward": -0.00024606427177786827, + "rewards/format_reward": 0.7500000111758709, + "step": 430 + }, + { + "advantage_max": 1.158790297806263, + "advantage_mean": -2.7939677238464355e-09, + "advantage_min": -0.6757703498005867, + "advantage_std": 0.6554910391569138, + "completion_length": 2506.8334045410156, + "epoch": 0.49257142857142855, + "grad_norm": 0.8864076137542725, + "kl": 0.35559844970703125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0084, + "reward": 0.20965594646986574, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20965594646986574, + "reward_after_std": 0.6554910317063332, + "reward_before_mean": 0.5490434896200895, + "reward_before_std": 0.6157159730792046, + "reward_change_max": 0.0006238818168640137, + "reward_change_mean": -0.33938753232359886, + "reward_change_min": -0.5494464412331581, + "reward_change_std": 0.2182679483667016, + "reward_std": 0.6554910577833652, + "rewards/cosine_scaled_reward": -0.15256160404533148, + "rewards/format_reward": 0.8541666716337204, + "step": 431 + }, + { + "advantage_max": 1.2552975341677666, + "advantage_mean": -8.071462331837864e-09, + "advantage_min": -0.7928625233471394, + "advantage_std": 0.7409033365547657, + "completion_length": 2835.7709045410156, + "epoch": 0.4937142857142857, + "grad_norm": 1.0404794216156006, + "kl": 0.420684814453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0706, + "reward": 0.45932803535833955, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.45932803535833955, + "reward_after_std": 0.7409033589065075, + "reward_before_mean": 0.8826699033379555, + "reward_before_std": 0.7020781114697456, + "reward_change_max": 0.003414548933506012, + "reward_change_mean": -0.42334189265966415, + "reward_change_min": -0.6823976673185825, + "reward_change_std": 0.2694571502506733, + "reward_std": 0.7409033626317978, + "rewards/cosine_scaled_reward": -0.01699838414788246, + "rewards/format_reward": 0.9166666865348816, + "step": 432 + }, + { + "advantage_max": 1.3465048968791962, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.6381904669106007, + "advantage_std": 0.7317006289958954, + "completion_length": 2914.12508392334, + "epoch": 0.4948571428571429, + "grad_norm": 0.7161730527877808, + "kl": 0.39739990234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.483363816965435e-07, + "loss": 0.051, + "reward": 0.5220309607684612, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5220309607684612, + "reward_after_std": 0.731700599193573, + "reward_before_mean": 0.9621444530785084, + "reward_before_std": 0.6160903349518776, + "reward_change_max": 0.0003250539302825928, + "reward_change_mean": -0.44011346995830536, + "reward_change_min": -0.6610860973596573, + "reward_change_std": 0.25879207253456116, + "reward_std": 0.7317006103694439, + "rewards/cosine_scaled_reward": 0.07482220698148012, + "rewards/format_reward": 0.8125000149011612, + "step": 433 + }, + { + "advantage_max": 1.2062464877963066, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.6216416116803885, + "advantage_std": 0.6547112353146076, + "completion_length": 3110.6459045410156, + "epoch": 0.496, + "grad_norm": 0.389948308467865, + "kl": 0.447509765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0442, + "reward": -0.06265595648437738, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06265595648437738, + "reward_after_std": 0.6547112427651882, + "reward_before_mean": 0.17960346583276987, + "reward_before_std": 0.6161167174577713, + "reward_change_max": 0.0, + "reward_change_mean": -0.2422594241797924, + "reward_change_min": -0.4483148455619812, + "reward_change_std": 0.17038149200379848, + "reward_std": 0.6547112688422203, + "rewards/cosine_scaled_reward": -0.28519828617572784, + "rewards/format_reward": 0.7500000149011612, + "step": 434 + }, + { + "advantage_max": 1.1669103428721428, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -0.6414779610931873, + "advantage_std": 0.6753768660128117, + "completion_length": 2001.2292442321777, + "epoch": 0.49714285714285716, + "grad_norm": 0.7122639417648315, + "kl": 0.27149200439453125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4554267916537495e-07, + "loss": -0.0088, + "reward": 0.3204499026760459, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3204499026760459, + "reward_after_std": 0.675376869738102, + "reward_before_mean": 0.6983707807958126, + "reward_before_std": 0.6387381218373775, + "reward_change_max": 0.0, + "reward_change_mean": -0.3779208790510893, + "reward_change_min": -0.6663752608001232, + "reward_change_std": 0.2425840962678194, + "reward_std": 0.6753768734633923, + "rewards/cosine_scaled_reward": -0.08831463009119034, + "rewards/format_reward": 0.8750000074505806, + "step": 435 + }, + { + "advantage_max": 1.3380301296710968, + "advantage_mean": -2.7318795670083773e-08, + "advantage_min": -0.9807571396231651, + "advantage_std": 0.8031731098890305, + "completion_length": 2268.5208587646484, + "epoch": 0.4982857142857143, + "grad_norm": 0.5405041575431824, + "kl": 0.2992401123046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.017, + "reward": 0.6386113851331174, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6386113851331174, + "reward_after_std": 0.803173117339611, + "reward_before_mean": 1.1200417447835207, + "reward_before_std": 0.7725371960550547, + "reward_change_max": 0.0, + "reward_change_mean": -0.48143038526177406, + "reward_change_min": -0.7670682519674301, + "reward_change_std": 0.3101059626787901, + "reward_std": 0.8031731359660625, + "rewards/cosine_scaled_reward": 0.13293753401376307, + "rewards/format_reward": 0.8541666865348816, + "step": 436 + }, + { + "advantage_max": 1.1815454438328743, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.8695116825401783, + "advantage_std": 0.7322442196309566, + "completion_length": 3261.0625915527344, + "epoch": 0.49942857142857144, + "grad_norm": 0.8688717484474182, + "kl": 0.55712890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0415, + "reward": 0.25387762673199177, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.25387762673199177, + "reward_after_std": 0.7322442270815372, + "reward_before_mean": 0.6103640319779515, + "reward_before_std": 0.7458185590803623, + "reward_change_max": 0.0002872347831726074, + "reward_change_mean": -0.35648639500141144, + "reward_change_min": -0.6514874063432217, + "reward_change_std": 0.2629950176924467, + "reward_std": 0.732244249433279, + "rewards/cosine_scaled_reward": -0.08023467287421227, + "rewards/format_reward": 0.7708333507180214, + "step": 437 + }, + { + "advantage_max": 1.1966863423585892, + "advantage_mean": -1.8316011179964065e-08, + "advantage_min": -0.8341338858008385, + "advantage_std": 0.7336725704371929, + "completion_length": 2907.979217529297, + "epoch": 0.5005714285714286, + "grad_norm": 0.4294912815093994, + "kl": 0.367462158203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0373, + "reward": 0.30324064660817385, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30324064660817385, + "reward_after_std": 0.7336725741624832, + "reward_before_mean": 0.6767560187727213, + "reward_before_std": 0.7442520670592785, + "reward_change_max": 0.0, + "reward_change_mean": -0.37351538613438606, + "reward_change_min": -0.688855767250061, + "reward_change_std": 0.26820408646017313, + "reward_std": 0.7336725778877735, + "rewards/cosine_scaled_reward": -0.026205329224467278, + "rewards/format_reward": 0.7291666734963655, + "step": 438 + }, + { + "advantage_max": 1.2665520757436752, + "advantage_mean": -1.8626452047421083e-09, + "advantage_min": -0.6952036470174789, + "advantage_std": 0.702237457036972, + "completion_length": 2692.5833892822266, + "epoch": 0.5017142857142857, + "grad_norm": 0.6560401320457458, + "kl": 0.3876953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0416, + "reward": 0.2241203337907791, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2241203337907791, + "reward_after_std": 0.7022374682128429, + "reward_before_mean": 0.5633864104747772, + "reward_before_std": 0.6478680744767189, + "reward_change_max": 0.0018536224961280823, + "reward_change_mean": -0.3392660841345787, + "reward_change_min": -0.567062571644783, + "reward_change_std": 0.22123288549482822, + "reward_std": 0.7022374868392944, + "rewards/cosine_scaled_reward": -0.11414013616740704, + "rewards/format_reward": 0.791666679084301, + "step": 439 + }, + { + "advantage_max": 1.1453857421875, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6617013551294804, + "advantage_std": 0.6645463220775127, + "completion_length": 2964.7083740234375, + "epoch": 0.5028571428571429, + "grad_norm": 1.013630986213684, + "kl": 0.516143798828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0305, + "reward": -0.036612953059375286, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.036612953059375286, + "reward_after_std": 0.664546325802803, + "reward_before_mean": 0.218028850533301, + "reward_before_std": 0.6687482669949532, + "reward_change_max": 0.0, + "reward_change_mean": -0.25464182160794735, + "reward_change_min": -0.5325037129223347, + "reward_change_std": 0.20117001980543137, + "reward_std": 0.6645463369786739, + "rewards/cosine_scaled_reward": -0.19306890480220318, + "rewards/format_reward": 0.604166679084301, + "step": 440 + }, + { + "advantage_max": 1.3998192101716995, + "advantage_mean": -1.800557042352935e-08, + "advantage_min": -0.9868221804499626, + "advantage_std": 0.8746149949729443, + "completion_length": 2815.854217529297, + "epoch": 0.504, + "grad_norm": 1.0281097888946533, + "kl": 0.436187744140625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0168, + "reward": 0.45351985446177423, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.45351985446177423, + "reward_after_std": 0.8746149949729443, + "reward_before_mean": 0.8660605433396995, + "reward_before_std": 0.9094558022916317, + "reward_change_max": 0.000750415027141571, + "reward_change_mean": -0.4125407300889492, + "reward_change_min": -0.7904599867761135, + "reward_change_std": 0.31091453321278095, + "reward_std": 0.874615054577589, + "rewards/cosine_scaled_reward": 0.005946941673755646, + "rewards/format_reward": 0.8541666865348816, + "step": 441 + }, + { + "advantage_max": 1.6837698444724083, + "advantage_mean": -1.8626451825376478e-08, + "advantage_min": -1.013794220983982, + "advantage_std": 0.9842873066663742, + "completion_length": 2870.7709045410156, + "epoch": 0.5051428571428571, + "grad_norm": 0.6417372822761536, + "kl": 0.419921875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0562, + "reward": 0.5628248087596148, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5628248087596148, + "reward_after_std": 0.9842873364686966, + "reward_before_mean": 0.9925806345418096, + "reward_before_std": 0.9769185334444046, + "reward_change_max": 0.0, + "reward_change_mean": -0.4297558609396219, + "reward_change_min": -0.7348082773387432, + "reward_change_std": 0.30168131552636623, + "reward_std": 0.9842873699963093, + "rewards/cosine_scaled_reward": 0.10045698285102844, + "rewards/format_reward": 0.7916666753590107, + "step": 442 + }, + { + "advantage_max": 1.337314061820507, + "advantage_mean": -4.035731110407781e-09, + "advantage_min": -0.8306575156748295, + "advantage_std": 0.7948845028877258, + "completion_length": 2973.6251068115234, + "epoch": 0.5062857142857143, + "grad_norm": 0.7379650473594666, + "kl": 0.41070556640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0476, + "reward": 0.22690303064882755, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22690303064882755, + "reward_after_std": 0.7948845438659191, + "reward_before_mean": 0.5625253785401583, + "reward_before_std": 0.8026487492024899, + "reward_change_max": 0.0, + "reward_change_mean": -0.33562235068529844, + "reward_change_min": -0.6595030464231968, + "reward_change_std": 0.2517847139388323, + "reward_std": 0.7948845475912094, + "rewards/cosine_scaled_reward": -0.08332065586000681, + "rewards/format_reward": 0.7291666772216558, + "step": 443 + }, + { + "advantage_max": 1.0303258448839188, + "advantage_mean": -5.898376453927767e-09, + "advantage_min": -0.70342618227005, + "advantage_std": 0.6188901476562023, + "completion_length": 2797.3125762939453, + "epoch": 0.5074285714285715, + "grad_norm": 0.4941686987876892, + "kl": 0.3975830078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0183, + "reward": 0.3297221283428371, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3297221283428371, + "reward_after_std": 0.6188901402056217, + "reward_before_mean": 0.7205270808190107, + "reward_before_std": 0.5975214540958405, + "reward_change_max": 0.0, + "reward_change_mean": -0.3908049762248993, + "reward_change_min": -0.6559524200856686, + "reward_change_std": 0.24912833236157894, + "reward_std": 0.6188901737332344, + "rewards/cosine_scaled_reward": -0.098069803789258, + "rewards/format_reward": 0.9166666865348816, + "step": 444 + }, + { + "advantage_max": 1.4425413608551025, + "advantage_mean": -9.934107980669182e-09, + "advantage_min": -0.8248635195195675, + "advantage_std": 0.8054063022136688, + "completion_length": 2815.104248046875, + "epoch": 0.5085714285714286, + "grad_norm": 0.6476776599884033, + "kl": 0.385772705078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0453, + "reward": 0.23617349471896887, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23617349471896887, + "reward_after_std": 0.8054062686860561, + "reward_before_mean": 0.5678501327056438, + "reward_before_std": 0.779238685965538, + "reward_change_max": 0.0006808564066886902, + "reward_change_mean": -0.3316766396164894, + "reward_change_min": -0.6133453659713268, + "reward_change_std": 0.22692137584090233, + "reward_std": 0.8054062835872173, + "rewards/cosine_scaled_reward": -0.1014916084241122, + "rewards/format_reward": 0.7708333544433117, + "step": 445 + }, + { + "advantage_max": 1.4127313196659088, + "advantage_mean": -4.967054045845742e-09, + "advantage_min": -0.9514566399157047, + "advantage_std": 0.841310903429985, + "completion_length": 2577.4167404174805, + "epoch": 0.5097142857142857, + "grad_norm": 0.7943739891052246, + "kl": 0.3366851806640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0403, + "reward": 0.5418746005743742, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5418746005743742, + "reward_after_std": 0.8413108885288239, + "reward_before_mean": 0.9842005173268262, + "reward_before_std": 0.8326334059238434, + "reward_change_max": 0.000830821692943573, + "reward_change_mean": -0.4423258863389492, + "reward_change_min": -0.7536700703203678, + "reward_change_std": 0.30366277135908604, + "reward_std": 0.8413109183311462, + "rewards/cosine_scaled_reward": 0.1066835792735219, + "rewards/format_reward": 0.7708333432674408, + "step": 446 + }, + { + "advantage_max": 1.3711147084832191, + "advantage_mean": -2.359350548264416e-08, + "advantage_min": -0.7326249293982983, + "advantage_std": 0.7835522890090942, + "completion_length": 2426.2500610351562, + "epoch": 0.5108571428571429, + "grad_norm": 0.4948989748954773, + "kl": 0.27752685546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0314, + "reward": 0.3651602268218994, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3651602268218994, + "reward_after_std": 0.7835522815585136, + "reward_before_mean": 0.7487014681100845, + "reward_before_std": 0.7524013314396143, + "reward_change_max": 0.0001742541790008545, + "reward_change_mean": -0.3835412599146366, + "reward_change_min": -0.7031693980097771, + "reward_change_std": 0.25586988404393196, + "reward_std": 0.7835522890090942, + "rewards/cosine_scaled_reward": -0.08398262108676136, + "rewards/format_reward": 0.916666679084301, + "step": 447 + }, + { + "advantage_max": 1.3957402855157852, + "advantage_mean": -3.10440864126349e-08, + "advantage_min": -0.863853208720684, + "advantage_std": 0.8466850370168686, + "completion_length": 2403.354217529297, + "epoch": 0.512, + "grad_norm": 0.5587709546089172, + "kl": 0.33380126953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0388, + "reward": 0.6120043303817511, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6120043303817511, + "reward_after_std": 0.846685029566288, + "reward_before_mean": 1.0778508316725492, + "reward_before_std": 0.8333419039845467, + "reward_change_max": 0.0, + "reward_change_mean": -0.4658465310931206, + "reward_change_min": -0.8283277899026871, + "reward_change_std": 0.3207920826971531, + "reward_std": 0.8466850444674492, + "rewards/cosine_scaled_reward": 0.14309207256883383, + "rewards/format_reward": 0.7916666716337204, + "step": 448 + }, + { + "advantage_max": 1.3688802272081375, + "advantage_mean": -2.1109978987077227e-08, + "advantage_min": -0.874987531453371, + "advantage_std": 0.8195151649415493, + "completion_length": 2257.6875610351562, + "epoch": 0.5131428571428571, + "grad_norm": 0.8245042562484741, + "kl": 0.3472442626953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0005, + "reward": 0.45789824426174164, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.45789824426174164, + "reward_after_std": 0.8195151649415493, + "reward_before_mean": 0.8723139688372612, + "reward_before_std": 0.8171190805733204, + "reward_change_max": 0.0, + "reward_change_mean": -0.4144157078117132, + "reward_change_min": -0.7483086436986923, + "reward_change_std": 0.2881466280668974, + "reward_std": 0.8195151947438717, + "rewards/cosine_scaled_reward": -0.011759708635509014, + "rewards/format_reward": 0.8958333358168602, + "step": 449 + }, + { + "advantage_max": 1.13484638184309, + "advantage_mean": -1.8316011374253094e-08, + "advantage_min": -0.6597550548613071, + "advantage_std": 0.6347230449318886, + "completion_length": 2701.2917404174805, + "epoch": 0.5142857142857142, + "grad_norm": 1.3705062866210938, + "kl": 0.351776123046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2713832064634125e-07, + "loss": -0.003, + "reward": 0.4212948093190789, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4212948093190789, + "reward_after_std": 0.6347230412065983, + "reward_before_mean": 0.8418345140962629, + "reward_before_std": 0.5590377673506737, + "reward_change_max": 0.0, + "reward_change_mean": -0.4205396883189678, + "reward_change_min": -0.6680282019078732, + "reward_change_std": 0.24988417513668537, + "reward_std": 0.6347230449318886, + "rewards/cosine_scaled_reward": -0.0061661116778850555, + "rewards/format_reward": 0.854166679084301, + "step": 450 + }, + { + "advantage_max": 1.2935975268483162, + "advantage_mean": -2.4835271617007493e-09, + "advantage_min": -0.9024942293763161, + "advantage_std": 0.769032035022974, + "completion_length": 2333.854232788086, + "epoch": 0.5154285714285715, + "grad_norm": 0.4481179416179657, + "kl": 0.3267059326171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0365, + "reward": 0.5793941374868155, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5793941374868155, + "reward_after_std": 0.769032035022974, + "reward_before_mean": 1.0413884930312634, + "reward_before_std": 0.7353867180645466, + "reward_change_max": 0.00039821118116378784, + "reward_change_mean": -0.46199433878064156, + "reward_change_min": -0.7502287328243256, + "reward_change_std": 0.29962888173758984, + "reward_std": 0.7690320760011673, + "rewards/cosine_scaled_reward": 0.09361090138554573, + "rewards/format_reward": 0.854166679084301, + "step": 451 + }, + { + "advantage_max": 1.2705382034182549, + "advantage_mean": -1.1796752463766325e-08, + "advantage_min": -0.9169280380010605, + "advantage_std": 0.780293669551611, + "completion_length": 3047.479217529297, + "epoch": 0.5165714285714286, + "grad_norm": 0.4624117314815521, + "kl": 0.39569091796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0313, + "reward": 0.299836840480566, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.299836840480566, + "reward_after_std": 0.7802936770021915, + "reward_before_mean": 0.66663564927876, + "reward_before_std": 0.7914271615445614, + "reward_change_max": 0.0013251379132270813, + "reward_change_mean": -0.3667988320812583, + "reward_change_min": -0.6514892093837261, + "reward_change_std": 0.27619097754359245, + "reward_std": 0.7802936919033527, + "rewards/cosine_scaled_reward": 0.020817823708057404, + "rewards/format_reward": 0.6250000149011612, + "step": 452 + }, + { + "advantage_max": 1.450104333460331, + "advantage_mean": 1.1102230246251565e-16, + "advantage_min": -0.8548537157475948, + "advantage_std": 0.8533807098865509, + "completion_length": 2728.0834197998047, + "epoch": 0.5177142857142857, + "grad_norm": 0.5218294858932495, + "kl": 0.4306640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0286, + "reward": 0.4595272596925497, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4595272596925497, + "reward_after_std": 0.8533807396888733, + "reward_before_mean": 0.868656799197197, + "reward_before_std": 0.8462558500468731, + "reward_change_max": 0.0, + "reward_change_mean": -0.4091295227408409, + "reward_change_min": -0.7668772041797638, + "reward_change_std": 0.28244134970009327, + "reward_std": 0.8533807471394539, + "rewards/cosine_scaled_reward": 0.007245063781738281, + "rewards/format_reward": 0.854166679084301, + "step": 453 + }, + { + "advantage_max": 1.288001649081707, + "advantage_mean": -4.035731554496991e-09, + "advantage_min": -0.6806519478559494, + "advantage_std": 0.7259433791041374, + "completion_length": 2560.291717529297, + "epoch": 0.5188571428571429, + "grad_norm": 0.8109725117683411, + "kl": 0.325469970703125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0179, + "reward": 0.2342965486459434, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2342965486459434, + "reward_after_std": 0.7259433902800083, + "reward_before_mean": 0.5756424032151699, + "reward_before_std": 0.6870302464812994, + "reward_change_max": 0.000321120023727417, + "reward_change_mean": -0.3413458652794361, + "reward_change_min": -0.5898742564022541, + "reward_change_std": 0.2277445401996374, + "reward_std": 0.7259434051811695, + "rewards/cosine_scaled_reward": -0.0975954644382, + "rewards/format_reward": 0.7708333376795053, + "step": 454 + }, + { + "advantage_max": 1.7102677822113037, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.7576536983251572, + "advantage_std": 0.9212291911244392, + "completion_length": 3278.5834350585938, + "epoch": 0.52, + "grad_norm": 0.6358473300933838, + "kl": 0.513427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0445, + "reward": 0.1089541104156524, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1089541104156524, + "reward_after_std": 0.921229176223278, + "reward_before_mean": 0.3775138114579022, + "reward_before_std": 0.8881769068539143, + "reward_change_max": 0.00048607587814331055, + "reward_change_mean": -0.26855968311429024, + "reward_change_min": -0.5115904286503792, + "reward_change_std": 0.20105735212564468, + "reward_std": 0.921229176223278, + "rewards/cosine_scaled_reward": -0.16540978103876114, + "rewards/format_reward": 0.7083333507180214, + "step": 455 + }, + { + "advantage_max": 1.6177105605602264, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.9958631098270416, + "advantage_std": 0.9880196638405323, + "completion_length": 2972.5625610351562, + "epoch": 0.5211428571428571, + "grad_norm": 0.6450056433677673, + "kl": 0.3536376953125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0434, + "reward": 0.42031298764050007, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.42031298764050007, + "reward_after_std": 0.9880196675658226, + "reward_before_mean": 0.8057602029293776, + "reward_before_std": 1.0287685617804527, + "reward_change_max": 0.0007295906543731689, + "reward_change_mean": -0.3854472152888775, + "reward_change_min": -0.78760626912117, + "reward_change_std": 0.3133588805794716, + "reward_std": 0.9880196936428547, + "rewards/cosine_scaled_reward": 0.017463432624936104, + "rewards/format_reward": 0.7708333432674408, + "step": 456 + }, + { + "advantage_max": 0.9447000622749329, + "advantage_mean": 7.761021519891997e-09, + "advantage_min": -0.4586385563015938, + "advantage_std": 0.5160973146557808, + "completion_length": 2758.812545776367, + "epoch": 0.5222857142857142, + "grad_norm": 0.8224807977676392, + "kl": 0.35028076171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.007, + "reward": 0.26081686979159713, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.26081686979159713, + "reward_after_std": 0.5160973146557808, + "reward_before_mean": 0.6370738223195076, + "reward_before_std": 0.4221321437507868, + "reward_change_max": 0.00038155168294906616, + "reward_change_mean": -0.3762569138780236, + "reward_change_min": -0.6008391063660383, + "reward_change_std": 0.21958772093057632, + "reward_std": 0.5160973332822323, + "rewards/cosine_scaled_reward": -0.03562977910041809, + "rewards/format_reward": 0.7083333469927311, + "step": 457 + }, + { + "advantage_max": 1.000264324247837, + "advantage_mean": 3.4148496808050766e-09, + "advantage_min": -0.4859766326844692, + "advantage_std": 0.5605919919908047, + "completion_length": 2269.2708740234375, + "epoch": 0.5234285714285715, + "grad_norm": 0.8835322856903076, + "kl": 0.3087158203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1920622611056974e-07, + "loss": -0.0088, + "reward": -0.009334953036159277, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.009334953036159277, + "reward_after_std": 0.5605919994413853, + "reward_before_mean": 0.25875352788716555, + "reward_before_std": 0.5173221491277218, + "reward_change_max": 0.0, + "reward_change_mean": -0.2680884934961796, + "reward_change_min": -0.4858828894793987, + "reward_change_std": 0.17110472451895475, + "reward_std": 0.5605920068919659, + "rewards/cosine_scaled_reward": -0.23520657513290644, + "rewards/format_reward": 0.7291666679084301, + "step": 458 + }, + { + "advantage_max": 1.5267757251858711, + "advantage_mean": -1.2417634920325327e-08, + "advantage_min": -0.8131808936595917, + "advantage_std": 0.844078216701746, + "completion_length": 2138.666732788086, + "epoch": 0.5245714285714286, + "grad_norm": 0.5719569325447083, + "kl": 0.26104736328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0098, + "reward": 0.483590893127257, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.483590893127257, + "reward_after_std": 0.8440782129764557, + "reward_before_mean": 0.894639240577817, + "reward_before_std": 0.7770277038216591, + "reward_change_max": 0.0004179328680038452, + "reward_change_mean": -0.4110483396798372, + "reward_change_min": -0.682177871465683, + "reward_change_std": 0.2591621521860361, + "reward_std": 0.8440782576799393, + "rewards/cosine_scaled_reward": 0.020236277021467686, + "rewards/format_reward": 0.854166679084301, + "step": 459 + }, + { + "advantage_max": 1.5692550614476204, + "advantage_mean": -8.692344150018627e-09, + "advantage_min": -0.9461584500968456, + "advantage_std": 0.9103639535605907, + "completion_length": 2925.6876220703125, + "epoch": 0.5257142857142857, + "grad_norm": 0.9705744385719299, + "kl": 0.40936279296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.058, + "reward": 0.281633076723665, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.281633076723665, + "reward_after_std": 0.9103639498353004, + "reward_before_mean": 0.6173821464180946, + "reward_before_std": 0.9208094067871571, + "reward_change_max": 0.0, + "reward_change_mean": -0.3357490822672844, + "reward_change_min": -0.6764678545296192, + "reward_change_std": 0.2633221186697483, + "reward_std": 0.9103639684617519, + "rewards/cosine_scaled_reward": -0.06630893185501918, + "rewards/format_reward": 0.7500000260770321, + "step": 460 + }, + { + "advantage_max": 1.1545908525586128, + "advantage_mean": 7.450581318568794e-09, + "advantage_min": -0.699158314615488, + "advantage_std": 0.6645155474543571, + "completion_length": 2859.291732788086, + "epoch": 0.5268571428571428, + "grad_norm": 1.0532013177871704, + "kl": 0.40289306640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1657684494105386e-07, + "loss": -0.0062, + "reward": 0.392983645782806, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.392983645782806, + "reward_after_std": 0.6645155362784863, + "reward_before_mean": 0.7981845624744892, + "reward_before_std": 0.6197777800261974, + "reward_change_max": 0.00014375895261764526, + "reward_change_mean": -0.40520089864730835, + "reward_change_min": -0.6614694185554981, + "reward_change_std": 0.2514055222272873, + "reward_std": 0.6645155511796474, + "rewards/cosine_scaled_reward": 0.034508924931287766, + "rewards/format_reward": 0.729166679084301, + "step": 461 + }, + { + "advantage_max": 1.0376139730215073, + "advantage_mean": 8.071462498371318e-09, + "advantage_min": -0.48623234406113625, + "advantage_std": 0.5613770298659801, + "completion_length": 2777.041763305664, + "epoch": 0.528, + "grad_norm": 0.3270750045776367, + "kl": 0.333648681640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0547, + "reward": -0.12515896558761597, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.12515896558761597, + "reward_after_std": 0.5613770335912704, + "reward_before_mean": 0.10399177204817533, + "reward_before_std": 0.5172784253954887, + "reward_change_max": 0.0003483295440673828, + "reward_change_mean": -0.22915074229240417, + "reward_change_min": -0.3955531381070614, + "reward_change_std": 0.14966793870553374, + "reward_std": 0.5613770447671413, + "rewards/cosine_scaled_reward": -0.28133745677769184, + "rewards/format_reward": 0.6666666753590107, + "step": 462 + }, + { + "advantage_max": 1.6833451092243195, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.8412944078445435, + "advantage_std": 0.9715849310159683, + "completion_length": 3012.3334350585938, + "epoch": 0.5291428571428571, + "grad_norm": 0.6381751298904419, + "kl": 0.445709228515625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0375, + "reward": 0.26589803770184517, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.26589803770184517, + "reward_after_std": 0.9715849310159683, + "reward_before_mean": 0.588882073876448, + "reward_before_std": 0.9903637580573559, + "reward_change_max": 0.0003296881914138794, + "reward_change_mean": -0.32298404537141323, + "reward_change_min": -0.715866107493639, + "reward_change_std": 0.26778385415673256, + "reward_std": 0.9715849570930004, + "rewards/cosine_scaled_reward": -0.028475646511651576, + "rewards/format_reward": 0.6458333469927311, + "step": 463 + }, + { + "advantage_max": 1.3124027475714684, + "advantage_mean": -1.0554989715583218e-08, + "advantage_min": -0.788254126906395, + "advantage_std": 0.7402537241578102, + "completion_length": 2271.979217529297, + "epoch": 0.5302857142857142, + "grad_norm": 1.9013720750808716, + "kl": 0.29579925537109375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1413757749211602e-07, + "loss": -0.0279, + "reward": 0.3903045654296875, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3903045654296875, + "reward_after_std": 0.7402537241578102, + "reward_before_mean": 0.7856322703883052, + "reward_before_std": 0.6768908370286226, + "reward_change_max": 0.0, + "reward_change_mean": -0.39532770961523056, + "reward_change_min": -0.6510465815663338, + "reward_change_std": 0.2512638717889786, + "reward_std": 0.740253746509552, + "rewards/cosine_scaled_reward": -0.01343385933432728, + "rewards/format_reward": 0.8125000223517418, + "step": 464 + }, + { + "advantage_max": 1.3665404319763184, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.8858865313231945, + "advantage_std": 0.8127275332808495, + "completion_length": 2940.9375915527344, + "epoch": 0.5314285714285715, + "grad_norm": 1.5533735752105713, + "kl": 0.3443603515625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0716, + "reward": 0.25389866065233946, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.25389866065233946, + "reward_after_std": 0.81272754073143, + "reward_before_mean": 0.5967785839457065, + "reward_before_std": 0.8133799955248833, + "reward_change_max": 0.000377558171749115, + "reward_change_mean": -0.342879943549633, + "reward_change_min": -0.5939531847834587, + "reward_change_std": 0.253477456048131, + "reward_std": 0.8127275817096233, + "rewards/cosine_scaled_reward": -0.09744403883814812, + "rewards/format_reward": 0.7916666865348816, + "step": 465 + }, + { + "advantage_max": 1.525937169790268, + "advantage_mean": -1.1796752796833232e-08, + "advantage_min": -1.1690080612897873, + "advantage_std": 0.9380715265870094, + "completion_length": 2885.4375762939453, + "epoch": 0.5325714285714286, + "grad_norm": 0.5980718731880188, + "kl": 0.3075103759765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0252, + "reward": 0.4429134102538228, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4429134102538228, + "reward_after_std": 0.9380715265870094, + "reward_before_mean": 0.842274374968838, + "reward_before_std": 0.9796821102499962, + "reward_change_max": 0.0005110055208206177, + "reward_change_mean": -0.39936098270118237, + "reward_change_min": -0.7218204885721207, + "reward_change_std": 0.3039932679384947, + "reward_std": 0.9380715452134609, + "rewards/cosine_scaled_reward": 0.09822050668299198, + "rewards/format_reward": 0.6458333525806665, + "step": 466 + }, + { + "advantage_max": 1.35072410851717, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.7130578570067883, + "advantage_std": 0.7507331855595112, + "completion_length": 2964.8125610351562, + "epoch": 0.5337142857142857, + "grad_norm": 0.589440643787384, + "kl": 0.342010498046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.014, + "reward": 0.38083328772336245, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.38083328772336245, + "reward_after_std": 0.7507331930100918, + "reward_before_mean": 0.771749480314611, + "reward_before_std": 0.6903218924999237, + "reward_change_max": 0.00035600364208221436, + "reward_change_mean": -0.39091616682708263, + "reward_change_min": -0.6508742086589336, + "reward_change_std": 0.24533732421696186, + "reward_std": 0.7507332153618336, + "rewards/cosine_scaled_reward": -0.041208596900105476, + "rewards/format_reward": 0.854166679084301, + "step": 467 + }, + { + "advantage_max": 1.5255406275391579, + "advantage_mean": -1.1175871006408045e-08, + "advantage_min": -1.018823392689228, + "advantage_std": 0.896932028234005, + "completion_length": 2998.6250610351562, + "epoch": 0.5348571428571428, + "grad_norm": 0.43171975016593933, + "kl": 0.34735107421875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0287, + "reward": 0.31930156861199066, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.31930156861199066, + "reward_after_std": 0.8969319984316826, + "reward_before_mean": 0.675044497475028, + "reward_before_std": 0.9102982468903065, + "reward_change_max": 0.0, + "reward_change_mean": -0.35574290715157986, + "reward_change_min": -0.6775732263922691, + "reward_change_std": 0.26923401467502117, + "reward_std": 0.8969319984316826, + "rewards/cosine_scaled_reward": -0.02706110430881381, + "rewards/format_reward": 0.7291666865348816, + "step": 468 + }, + { + "advantage_max": 1.3851486891508102, + "advantage_mean": 6.208820124697922e-10, + "advantage_min": -0.8695004656910896, + "advantage_std": 0.8268131166696548, + "completion_length": 2835.854217529297, + "epoch": 0.536, + "grad_norm": 0.6295157670974731, + "kl": 0.373138427734375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0329, + "reward": 0.3110335245728493, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3110335245728493, + "reward_after_std": 0.8268131241202354, + "reward_before_mean": 0.674715653527528, + "reward_before_std": 0.8407481797039509, + "reward_change_max": 0.0008903965353965759, + "reward_change_mean": -0.3636821284890175, + "reward_change_min": -0.7130404487252235, + "reward_change_std": 0.28268345445394516, + "reward_std": 0.8268131501972675, + "rewards/cosine_scaled_reward": 0.004024487920105457, + "rewards/format_reward": 0.6666666753590107, + "step": 469 + }, + { + "advantage_max": 0.9884959533810616, + "advantage_mean": -1.5522042540183634e-09, + "advantage_min": -0.8061573393642902, + "advantage_std": 0.6135408133268356, + "completion_length": 3324.604217529297, + "epoch": 0.5371428571428571, + "grad_norm": 0.3777313232421875, + "kl": 0.4073486328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0369, + "reward": -0.020457723177969456, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.020457723177969456, + "reward_after_std": 0.613540805876255, + "reward_before_mean": 0.25258609745651484, + "reward_before_std": 0.6398132536560297, + "reward_change_max": 0.0014644190669059753, + "reward_change_mean": -0.2730438318103552, + "reward_change_min": -0.49486764147877693, + "reward_change_std": 0.21054570376873016, + "reward_std": 0.6135408133268356, + "rewards/cosine_scaled_reward": -0.19662363454699516, + "rewards/format_reward": 0.6458333563059568, + "step": 470 + }, + { + "advantage_max": 1.4752235859632492, + "advantage_mean": 1.8626451714354175e-08, + "advantage_min": -0.7741179168224335, + "advantage_std": 0.833162184804678, + "completion_length": 3127.3333892822266, + "epoch": 0.5382857142857143, + "grad_norm": 0.8081554174423218, + "kl": 0.361053466796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0207, + "reward": 0.24363027699291706, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.24363027699291706, + "reward_after_std": 0.8331621624529362, + "reward_before_mean": 0.5768796000629663, + "reward_before_std": 0.8027565106749535, + "reward_change_max": 0.0004920586943626404, + "reward_change_mean": -0.3332492858171463, + "reward_change_min": -0.6620456390082836, + "reward_change_std": 0.2511415099725127, + "reward_std": 0.8331621661782265, + "rewards/cosine_scaled_reward": -0.04489355348050594, + "rewards/format_reward": 0.6666666809469461, + "step": 471 + }, + { + "advantage_max": 1.4293014854192734, + "advantage_mean": 1.428027990302283e-08, + "advantage_min": -0.6859598383307457, + "advantage_std": 0.7660287953913212, + "completion_length": 2807.2708587646484, + "epoch": 0.5394285714285715, + "grad_norm": 0.43928098678588867, + "kl": 0.34246826171875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0105, + "reward": 0.15427241090219468, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15427241090219468, + "reward_after_std": 0.7660288400948048, + "reward_before_mean": 0.45309646893292665, + "reward_before_std": 0.7070276569575071, + "reward_change_max": 0.0, + "reward_change_mean": -0.2988240495324135, + "reward_change_min": -0.5285344235599041, + "reward_change_std": 0.19260118901729584, + "reward_std": 0.7660288661718369, + "rewards/cosine_scaled_reward": -0.14845177298411727, + "rewards/format_reward": 0.7500000149011612, + "step": 472 + }, + { + "advantage_max": 1.2765984535217285, + "advantage_mean": -3.104407841902912e-10, + "advantage_min": -0.575638035312295, + "advantage_std": 0.6808191984891891, + "completion_length": 2983.8333892822266, + "epoch": 0.5405714285714286, + "grad_norm": 0.332226037979126, + "kl": 0.30999755859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0312, + "reward": 0.12082979548722506, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12082979548722506, + "reward_after_std": 0.6808191984891891, + "reward_before_mean": 0.4231628682464361, + "reward_before_std": 0.614310803823173, + "reward_change_max": 0.0002312883734703064, + "reward_change_mean": -0.3023330457508564, + "reward_change_min": -0.4855576269328594, + "reward_change_std": 0.1949680121615529, + "reward_std": 0.6808192264288664, + "rewards/cosine_scaled_reward": -0.13216857006773353, + "rewards/format_reward": 0.6875000149011612, + "step": 473 + }, + { + "advantage_max": 1.580621987581253, + "advantage_mean": -2.1109978876054925e-08, + "advantage_min": -0.785576019436121, + "advantage_std": 0.8718631789088249, + "completion_length": 2244.5625610351562, + "epoch": 0.5417142857142857, + "grad_norm": 0.7054795622825623, + "kl": 0.220245361328125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0127, + "reward": 0.6519424570724368, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6519424570724368, + "reward_after_std": 0.8718631565570831, + "reward_before_mean": 1.1209867387078702, + "reward_before_std": 0.7825071476399899, + "reward_change_max": 0.0, + "reward_change_mean": -0.469044242054224, + "reward_change_min": -0.7807438485324383, + "reward_change_std": 0.28917416650801897, + "reward_std": 0.8718631789088249, + "rewards/cosine_scaled_reward": 0.15424335189163685, + "rewards/format_reward": 0.812500013038516, + "step": 474 + }, + { + "advantage_max": 1.4571957886219025, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -1.061225775629282, + "advantage_std": 0.897326685488224, + "completion_length": 2246.208381652832, + "epoch": 0.5428571428571428, + "grad_norm": 1.8836877346038818, + "kl": 0.2695465087890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0519, + "reward": 0.4812218938022852, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4812218938022852, + "reward_after_std": 0.8973266705870628, + "reward_before_mean": 0.8961574863642454, + "reward_before_std": 0.9338822811841965, + "reward_change_max": 0.001050010323524475, + "reward_change_mean": -0.414935564622283, + "reward_change_min": -0.7467651851475239, + "reward_change_std": 0.3112996993586421, + "reward_std": 0.8973267190158367, + "rewards/cosine_scaled_reward": 0.08349537872709334, + "rewards/format_reward": 0.7291666772216558, + "step": 475 + }, + { + "advantage_max": 1.702410340309143, + "advantage_mean": -2.235174290099451e-08, + "advantage_min": -1.2538457065820694, + "advantage_std": 1.0458894148468971, + "completion_length": 2918.7500915527344, + "epoch": 0.544, + "grad_norm": 2.251671075820923, + "kl": 0.2965087890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0935, + "reward": 0.7583688944578171, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7583688944578171, + "reward_after_std": 1.0458894148468971, + "reward_before_mean": 1.2590843848884106, + "reward_before_std": 1.0712070390582085, + "reward_change_max": 0.0, + "reward_change_mean": -0.5007154755294323, + "reward_change_min": -0.853624165058136, + "reward_change_std": 0.3549340758472681, + "reward_std": 1.0458894520998, + "rewards/cosine_scaled_reward": 0.21287550101988018, + "rewards/format_reward": 0.833333358168602, + "step": 476 + }, + { + "advantage_max": 1.2137524485588074, + "advantage_mean": -9.313225801665936e-09, + "advantage_min": -0.5264289863407612, + "advantage_std": 0.6505212225019932, + "completion_length": 1786.9167137145996, + "epoch": 0.5451428571428572, + "grad_norm": 0.17899537086486816, + "kl": 0.1691131591796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0155, + "reward": 0.9699084199965, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9699084199965, + "reward_after_std": 0.6505212299525738, + "reward_before_mean": 1.573659099638462, + "reward_before_std": 0.4683607667684555, + "reward_change_max": 0.0, + "reward_change_mean": -0.6037506610155106, + "reward_change_min": -0.8868011832237244, + "reward_change_std": 0.32529093883931637, + "reward_std": 0.6505212485790253, + "rewards/cosine_scaled_reward": 0.31807953119277954, + "rewards/format_reward": 0.9375000074505806, + "step": 477 + }, + { + "advantage_max": 1.3267375081777573, + "advantage_mean": -2.607703308843412e-08, + "advantage_min": -0.6465108655393124, + "advantage_std": 0.724626038223505, + "completion_length": 2695.3958587646484, + "epoch": 0.5462857142857143, + "grad_norm": 0.6639411449432373, + "kl": 0.230682373046875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0217, + "reward": 0.21232910081744194, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21232910081744194, + "reward_after_std": 0.724626038223505, + "reward_before_mean": 0.5422618109732866, + "reward_before_std": 0.6659068446606398, + "reward_change_max": 0.0008394867181777954, + "reward_change_mean": -0.3299326840788126, + "reward_change_min": -0.5855097156018019, + "reward_change_std": 0.2148620719090104, + "reward_std": 0.7246260643005371, + "rewards/cosine_scaled_reward": -0.03095244988799095, + "rewards/format_reward": 0.6041666828095913, + "step": 478 + }, + { + "advantage_max": 1.374097228050232, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -0.7297849971801043, + "advantage_std": 0.7507626861333847, + "completion_length": 3014.8125610351562, + "epoch": 0.5474285714285714, + "grad_norm": 0.46662241220474243, + "kl": 0.305755615234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0175, + "reward": 0.2535403287038207, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2535403287038207, + "reward_after_std": 0.7507626786828041, + "reward_before_mean": 0.5968595510348678, + "reward_before_std": 0.6907813530415297, + "reward_change_max": 0.0, + "reward_change_mean": -0.34331923350691795, + "reward_change_min": -0.6154851168394089, + "reward_change_std": 0.2200963729992509, + "reward_std": 0.7507626973092556, + "rewards/cosine_scaled_reward": -0.11823690216988325, + "rewards/format_reward": 0.833333358168602, + "step": 479 + }, + { + "advantage_max": 1.3359937444329262, + "advantage_mean": -2.0489097418696645e-08, + "advantage_min": -0.7955343127250671, + "advantage_std": 0.7900065630674362, + "completion_length": 2454.7708740234375, + "epoch": 0.5485714285714286, + "grad_norm": 0.6866908669471741, + "kl": 0.47723388671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0103, + "reward": 0.34717184118926525, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.34717184118926525, + "reward_after_std": 0.7900065779685974, + "reward_before_mean": 0.7200711611658335, + "reward_before_std": 0.7834494672715664, + "reward_change_max": 0.00013305246829986572, + "reward_change_mean": -0.3728993311524391, + "reward_change_min": -0.6839794237166643, + "reward_change_std": 0.26153867691755295, + "reward_std": 0.7900065779685974, + "rewards/cosine_scaled_reward": 0.026702251750975847, + "rewards/format_reward": 0.6666666734963655, + "step": 480 + }, + { + "advantage_max": 1.258769951760769, + "advantage_mean": 9.934107703113426e-09, + "advantage_min": -0.8331233933568001, + "advantage_std": 0.7410633154213428, + "completion_length": 3171.854217529297, + "epoch": 0.5497142857142857, + "grad_norm": 0.6480934619903564, + "kl": 0.3162841796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.046, + "reward": 0.2729811486788094, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2729811486788094, + "reward_after_std": 0.7410633079707623, + "reward_before_mean": 0.6314380564726889, + "reward_before_std": 0.738004770129919, + "reward_change_max": 0.0006511285901069641, + "reward_change_mean": -0.3584568817168474, + "reward_change_min": -0.62668626755476, + "reward_change_std": 0.24528701975941658, + "reward_std": 0.7410633154213428, + "rewards/cosine_scaled_reward": -0.038447652012109756, + "rewards/format_reward": 0.7083333507180214, + "step": 481 + }, + { + "advantage_max": 1.3237205818295479, + "advantage_mean": 2.856055936195645e-08, + "advantage_min": -0.6684618592262268, + "advantage_std": 0.7145509906113148, + "completion_length": 2819.520896911621, + "epoch": 0.5508571428571428, + "grad_norm": 0.4941161274909973, + "kl": 0.3163299560546875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0221, + "reward": 0.6409013960510492, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6409013960510492, + "reward_after_std": 0.7145509831607342, + "reward_before_mean": 1.1262214393354952, + "reward_before_std": 0.5935944020748138, + "reward_change_max": 0.00171564519405365, + "reward_change_mean": -0.48531997948884964, + "reward_change_min": -0.730036336928606, + "reward_change_std": 0.2870408296585083, + "reward_std": 0.7145509868860245, + "rewards/cosine_scaled_reward": 0.25061069428920746, + "rewards/format_reward": 0.6250000018626451, + "step": 482 + }, + { + "advantage_max": 1.198041632771492, + "advantage_mean": -6.208817349140361e-10, + "advantage_min": -0.46576736494898796, + "advantage_std": 0.6329626999795437, + "completion_length": 2924.0833740234375, + "epoch": 0.552, + "grad_norm": 0.48551997542381287, + "kl": 0.335968017578125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.012, + "reward": 0.1743651172146201, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1743651172146201, + "reward_after_std": 0.6329627186059952, + "reward_before_mean": 0.5006456337869167, + "reward_before_std": 0.5492782052606344, + "reward_change_max": 0.0, + "reward_change_mean": -0.326280502602458, + "reward_change_min": -0.5173515044152737, + "reward_change_std": 0.18664393853396177, + "reward_std": 0.6329627446830273, + "rewards/cosine_scaled_reward": -0.14551052823662758, + "rewards/format_reward": 0.791666679084301, + "step": 483 + }, + { + "advantage_max": 1.2706203386187553, + "advantage_mean": -4.718701185346674e-08, + "advantage_min": -0.780962735414505, + "advantage_std": 0.7774913385510445, + "completion_length": 2575.895896911621, + "epoch": 0.5531428571428572, + "grad_norm": 0.9153090119361877, + "kl": 0.226165771484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0402, + "reward": 0.5697587521281093, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5697587521281093, + "reward_after_std": 0.7774913720786572, + "reward_before_mean": 1.0327607281506062, + "reward_before_std": 0.7620697543025017, + "reward_change_max": 0.0, + "reward_change_mean": -0.46300200559198856, + "reward_change_min": -0.813766747713089, + "reward_change_std": 0.3095168676227331, + "reward_std": 0.7774913795292377, + "rewards/cosine_scaled_reward": 0.12054700963199139, + "rewards/format_reward": 0.7916666828095913, + "step": 484 + }, + { + "advantage_max": 1.4597226828336716, + "advantage_mean": -2.793967884828774e-08, + "advantage_min": -0.8125118277966976, + "advantage_std": 0.8193517737090588, + "completion_length": 2764.0625762939453, + "epoch": 0.5542857142857143, + "grad_norm": 0.40035754442214966, + "kl": 0.3151092529296875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0403, + "reward": 0.3696631761267781, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3696631761267781, + "reward_after_std": 0.8193517588078976, + "reward_before_mean": 0.7487029042094946, + "reward_before_std": 0.783175889402628, + "reward_change_max": 0.0, + "reward_change_mean": -0.3790397383272648, + "reward_change_min": -0.6763305589556694, + "reward_change_std": 0.24684625305235386, + "reward_std": 0.8193517737090588, + "rewards/cosine_scaled_reward": -0.08398190187290311, + "rewards/format_reward": 0.9166666865348816, + "step": 485 + }, + { + "advantage_max": 1.3625748306512833, + "advantage_mean": -1.8316011041186187e-08, + "advantage_min": -0.6952449381351471, + "advantage_std": 0.7565643563866615, + "completion_length": 2407.000072479248, + "epoch": 0.5554285714285714, + "grad_norm": 0.5384291410446167, + "kl": 0.4549407958984375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0137, + "reward": 0.41442851535975933, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41442851535975933, + "reward_after_std": 0.7565643414855003, + "reward_before_mean": 0.812147680670023, + "reward_before_std": 0.6902235746383667, + "reward_change_max": 7.747858762741089e-05, + "reward_change_mean": -0.3977191895246506, + "reward_change_min": -0.6768068000674248, + "reward_change_std": 0.26034667529165745, + "reward_std": 0.7565643452107906, + "rewards/cosine_scaled_reward": -0.0001761619932949543, + "rewards/format_reward": 0.8125000074505806, + "step": 486 + }, + { + "advantage_max": 1.2316907346248627, + "advantage_mean": -4.656613034059731e-08, + "advantage_min": -0.8126912750303745, + "advantage_std": 0.7212287411093712, + "completion_length": 2273.229217529297, + "epoch": 0.5565714285714286, + "grad_norm": 0.3203495442867279, + "kl": 0.17083740234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0127, + "reward": 0.7203271514736116, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7203271514736116, + "reward_after_std": 0.7212287411093712, + "reward_before_mean": 1.2395121343433857, + "reward_before_std": 0.6601413935422897, + "reward_change_max": 0.0, + "reward_change_mean": -0.5191849693655968, + "reward_change_min": -0.8080818802118301, + "reward_change_std": 0.3102181311696768, + "reward_std": 0.7212287522852421, + "rewards/cosine_scaled_reward": 0.1510060466825962, + "rewards/format_reward": 0.9375000149011612, + "step": 487 + }, + { + "advantage_max": 1.3180788084864616, + "advantage_mean": -1.2417634975836478e-08, + "advantage_min": -0.6756866686046124, + "advantage_std": 0.7199700437486172, + "completion_length": 2206.312530517578, + "epoch": 0.5577142857142857, + "grad_norm": 1.3325201272964478, + "kl": 0.20296478271484375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0157821333772304e-07, + "loss": -0.0222, + "reward": 0.2248132168315351, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2248132168315351, + "reward_after_std": 0.7199700064957142, + "reward_before_mean": 0.5571361780166626, + "reward_before_std": 0.6680998187512159, + "reward_change_max": 0.0004176497459411621, + "reward_change_mean": -0.33232299983501434, + "reward_change_min": -0.5621999390423298, + "reward_change_std": 0.21261890977621078, + "reward_std": 0.7199700102210045, + "rewards/cosine_scaled_reward": -0.11726525146514177, + "rewards/format_reward": 0.7916666828095913, + "step": 488 + }, + { + "advantage_max": 1.1426765695214272, + "advantage_mean": 9.313226301266297e-10, + "advantage_min": -0.537463366985321, + "advantage_std": 0.6189225316047668, + "completion_length": 3095.541717529297, + "epoch": 0.5588571428571428, + "grad_norm": 0.9554708003997803, + "kl": 0.415283203125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.013262614978859e-07, + "loss": -0.0094, + "reward": -0.1730261892080307, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1730261892080307, + "reward_after_std": 0.6189225241541862, + "reward_before_mean": 0.030127177014946938, + "reward_before_std": 0.5958229564130306, + "reward_change_max": 0.0011682584881782532, + "reward_change_mean": -0.2031533746048808, + "reward_change_min": -0.4085076302289963, + "reward_change_std": 0.15374962240457535, + "reward_std": 0.6189225278794765, + "rewards/cosine_scaled_reward": -0.23493642359972, + "rewards/format_reward": 0.5000000111758709, + "step": 489 + }, + { + "advantage_max": 1.2859749644994736, + "advantage_mean": -1.3038516433194758e-08, + "advantage_min": -0.6581551507115364, + "advantage_std": 0.7169382348656654, + "completion_length": 2327.3333740234375, + "epoch": 0.56, + "grad_norm": 0.44198766350746155, + "kl": 0.2081146240234375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0099, + "reward": 0.37883203383535147, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37883203383535147, + "reward_after_std": 0.7169382683932781, + "reward_before_mean": 0.7729843532433733, + "reward_before_std": 0.6514980234205723, + "reward_change_max": 0.0, + "reward_change_mean": -0.3941523414105177, + "reward_change_min": -0.6781666129827499, + "reward_change_std": 0.2485183533281088, + "reward_std": 0.7169382870197296, + "rewards/cosine_scaled_reward": -0.030174492858350277, + "rewards/format_reward": 0.8333333432674408, + "step": 490 + }, + { + "advantage_max": 1.3834428116679192, + "advantage_mean": -2.7318796169684134e-08, + "advantage_min": -0.9954207092523575, + "advantage_std": 0.8297108337283134, + "completion_length": 2634.62508392334, + "epoch": 0.5611428571428572, + "grad_norm": 1.119695782661438, + "kl": 0.3593902587890625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0477, + "reward": 0.46813568845391273, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46813568845391273, + "reward_after_std": 0.8297108709812164, + "reward_before_mean": 0.883242666721344, + "reward_before_std": 0.8290642537176609, + "reward_change_max": 0.000342443585395813, + "reward_change_mean": -0.415106987580657, + "reward_change_min": -0.7718001753091812, + "reward_change_std": 0.29161699395626783, + "reward_std": 0.8297109119594097, + "rewards/cosine_scaled_reward": 0.09787133475765586, + "rewards/format_reward": 0.6875000093132257, + "step": 491 + }, + { + "advantage_max": 1.1452015675604343, + "advantage_mean": -6.829698862009792e-09, + "advantage_min": -0.7990109957754612, + "advantage_std": 0.7219687141478062, + "completion_length": 2643.562568664551, + "epoch": 0.5622857142857143, + "grad_norm": 0.5378697514533997, + "kl": 0.319671630859375, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0246, + "reward": 0.2805432486347854, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2805432486347854, + "reward_after_std": 0.7219687290489674, + "reward_before_mean": 0.646415838971734, + "reward_before_std": 0.745473101735115, + "reward_change_max": 0.0002370402216911316, + "reward_change_mean": -0.3658725507557392, + "reward_change_min": -0.6684302501380444, + "reward_change_std": 0.2682249080389738, + "reward_std": 0.72196876257658, + "rewards/cosine_scaled_reward": -0.020542113110423088, + "rewards/format_reward": 0.6875000111758709, + "step": 492 + }, + { + "advantage_max": 1.3004106357693672, + "advantage_mean": -2.1420419660245216e-08, + "advantage_min": -0.918564785271883, + "advantage_std": 0.8005609177052975, + "completion_length": 2551.6459045410156, + "epoch": 0.5634285714285714, + "grad_norm": 0.9306224584579468, + "kl": 0.24664306640625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0584, + "reward": 0.5436392567353323, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5436392567353323, + "reward_after_std": 0.8005609251558781, + "reward_before_mean": 0.995521822333103, + "reward_before_std": 0.7887520510703325, + "reward_change_max": 0.0, + "reward_change_mean": -0.451882591471076, + "reward_change_min": -0.7738821282982826, + "reward_change_std": 0.3092910312116146, + "reward_std": 0.8005609400570393, + "rewards/cosine_scaled_reward": 0.07067757099866867, + "rewards/format_reward": 0.8541666865348816, + "step": 493 + }, + { + "advantage_max": 1.4173968508839607, + "advantage_mean": -2.8560559528489904e-08, + "advantage_min": -0.6377351954579353, + "advantage_std": 0.7632331699132919, + "completion_length": 1996.0625457763672, + "epoch": 0.5645714285714286, + "grad_norm": 0.3634704053401947, + "kl": 0.2783660888671875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0039472645551372e-07, + "loss": -0.0001, + "reward": 0.677189095877111, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.677189095877111, + "reward_after_std": 0.7632331699132919, + "reward_before_mean": 1.163249596953392, + "reward_before_std": 0.6305623799562454, + "reward_change_max": 0.0, + "reward_change_mean": -0.4860605113208294, + "reward_change_min": -0.7516691125929356, + "reward_change_std": 0.27895483560860157, + "reward_std": 0.7632332071661949, + "rewards/cosine_scaled_reward": 0.1128747807815671, + "rewards/format_reward": 0.9375000074505806, + "step": 494 + }, + { + "advantage_max": 1.2059417739510536, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -0.9517855867743492, + "advantage_std": 0.7393425740301609, + "completion_length": 2863.8958892822266, + "epoch": 0.5657142857142857, + "grad_norm": 0.5228192806243896, + "kl": 0.309600830078125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0412, + "reward": 0.40756674110889435, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40756674110889435, + "reward_after_std": 0.7393425889313221, + "reward_before_mean": 0.8172311699017882, + "reward_before_std": 0.7446677684783936, + "reward_change_max": 0.0, + "reward_change_mean": -0.4096644464880228, + "reward_change_min": -0.6881908997893333, + "reward_change_std": 0.2763666333630681, + "reward_std": 0.7393426224589348, + "rewards/cosine_scaled_reward": 0.07528225053101778, + "rewards/format_reward": 0.6666666753590107, + "step": 495 + }, + { + "advantage_max": 1.2969953119754791, + "advantage_mean": -2.421438738409165e-08, + "advantage_min": -0.8419771865010262, + "advantage_std": 0.7619081437587738, + "completion_length": 1974.2708892822266, + "epoch": 0.5668571428571428, + "grad_norm": 0.2679189145565033, + "kl": 0.5582504272460938, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0025, + "reward": 0.564401363953948, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.564401363953948, + "reward_after_std": 0.7619081437587738, + "reward_before_mean": 1.0211270181462169, + "reward_before_std": 0.7228041291236877, + "reward_change_max": 0.0, + "reward_change_mean": -0.45672566443681717, + "reward_change_min": -0.793279368430376, + "reward_change_std": 0.2969939485192299, + "reward_std": 0.7619081512093544, + "rewards/cosine_scaled_reward": 0.11473017372190952, + "rewards/format_reward": 0.7916666753590107, + "step": 496 + }, + { + "advantage_max": 1.4672126322984695, + "advantage_mean": -3.166496814754893e-08, + "advantage_min": -0.7556325867772102, + "advantage_std": 0.8255305737257004, + "completion_length": 2442.0000610351562, + "epoch": 0.568, + "grad_norm": 0.3775382339954376, + "kl": 0.1978759765625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0009869243631952e-07, + "loss": -0.0023, + "reward": 0.7381462557241321, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7381462557241321, + "reward_after_std": 0.8255305588245392, + "reward_before_mean": 1.2490927260369062, + "reward_before_std": 0.7295142617076635, + "reward_change_max": 0.0, + "reward_change_mean": -0.5109464433044195, + "reward_change_min": -0.7986562699079514, + "reward_change_std": 0.308968473225832, + "reward_std": 0.8255305588245392, + "rewards/cosine_scaled_reward": 0.20787964761257172, + "rewards/format_reward": 0.8333333432674408, + "step": 497 + }, + { + "advantage_max": 1.5589833036065102, + "advantage_mean": 6.208816238917336e-10, + "advantage_min": -0.9517237991094589, + "advantage_std": 0.9105304293334484, + "completion_length": 2816.0209350585938, + "epoch": 0.5691428571428572, + "grad_norm": 0.9435493350028992, + "kl": 0.3084716796875, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0553, + "reward": 0.4708832767792046, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4708832767792046, + "reward_after_std": 0.9105304293334484, + "reward_before_mean": 0.8774845395237207, + "reward_before_std": 0.9078313298523426, + "reward_change_max": 0.0, + "reward_change_mean": -0.4066012669354677, + "reward_change_min": -0.7389591410756111, + "reward_change_std": 0.28658714331686497, + "reward_std": 0.9105304405093193, + "rewards/cosine_scaled_reward": 0.0741589218378067, + "rewards/format_reward": 0.7291666902601719, + "step": 498 + }, + { + "advantage_max": 1.5296600610017776, + "advantage_mean": -1.2417634254191512e-08, + "advantage_min": -1.004964530467987, + "advantage_std": 0.9031927324831486, + "completion_length": 2717.0000610351562, + "epoch": 0.5702857142857143, + "grad_norm": 0.9648796319961548, + "kl": 0.1967926025390625, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0385, + "reward": 0.6673821806907654, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6673821806907654, + "reward_after_std": 0.9031927324831486, + "reward_before_mean": 1.146652415394783, + "reward_before_std": 0.884892612695694, + "reward_change_max": 0.0, + "reward_change_mean": -0.47927025333046913, + "reward_change_min": -0.8178643807768822, + "reward_change_std": 0.31639137864112854, + "reward_std": 0.9031927511096001, + "rewards/cosine_scaled_reward": 0.10457620583474636, + "rewards/format_reward": 0.9375000074505806, + "step": 499 + }, + { + "advantage_max": 1.4438907951116562, + "advantage_mean": -4.9049656836164246e-08, + "advantage_min": -0.9703863263130188, + "advantage_std": 0.8778162263333797, + "completion_length": 3062.916748046875, + "epoch": 0.5714285714285714, + "grad_norm": 2.0350399017333984, + "kl": 0.3018798828125, + "lambda_div_used": 0.7000000000000001, + "learning_rate": 1e-07, + "loss": 0.0634, + "reward": 0.5093031972646713, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5093031972646713, + "reward_after_std": 0.8778162263333797, + "reward_before_mean": 0.9389307349920273, + "reward_before_std": 0.8889699075371027, + "reward_change_max": 0.0, + "reward_change_mean": -0.4296275693923235, + "reward_change_min": -0.7938886098563671, + "reward_change_std": 0.31151862256228924, + "reward_std": 0.8778162784874439, + "rewards/cosine_scaled_reward": 0.13613202422857285, + "rewards/format_reward": 0.6666666809469461, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.006108085031155497, + "train_runtime": 18366.0905, + "train_samples_per_second": 1.307, + "train_steps_per_second": 0.027 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}