{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.3236461393535137, "advantage_mean": -1.6653345369377348e-16, "advantage_min": -0.8878969363868237, "advantage_std": 0.7976016215980053, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.12771576642990112, "kl": 0.0, "lambda_div_used": 0.7000000000000001, "learning_rate": 2e-08, "loss": 0.0681, "reward": 0.1723687592893839, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1723687592893839, "reward_after_std": 0.7976016290485859, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.00042107701301574707, "reward_change_mean": -0.31739595998078585, "reward_change_min": -0.6219300664961338, "reward_change_std": 0.2523575215600431, "reward_std": 0.7976016625761986, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 0.7214599922299385, "advantage_mean": 8.692344399818808e-09, "advantage_min": -0.49231788888573647, "advantage_std": 0.4440294001251459, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.06164511293172836, "kl": 0.0, "lambda_div_used": 0.7000000000000001, "learning_rate": 4e-08, "loss": 0.0245, "reward": -0.018269629566930234, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.018269629566930234, "reward_after_std": 0.444029388949275, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0009796768426895142, "reward_change_mean": -0.29366718512028456, "reward_change_min": -0.478233277797699, "reward_change_std": 0.19509424595162272, "reward_std": 0.44402940198779106, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 0.7810538075864315, "advantage_mean": 1.8005570256995895e-08, "advantage_min": -0.5050339177250862, "advantage_std": 0.47483229637145996, "completion_length": 3403.375, "epoch": 0.0034285714285714284, "grad_norm": 0.09680631011724472, "kl": 4.32431697845459e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6e-08, "loss": -0.0069, "reward": -0.4477355405688286, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4477355405688286, "reward_after_std": 0.47483232244849205, "reward_before_mean": -0.3116879053413868, "reward_before_std": 0.5111051723361015, "reward_change_max": 0.0012290999293327332, "reward_change_mean": -0.13604762544855475, "reward_change_min": -0.3396712355315685, "reward_change_std": 0.136130525264889, "reward_std": 0.47483235225081444, "rewards/cosine_scaled_reward": -0.2183439557757083, "rewards/format_reward": 0.12500000558793545, "step": 3 }, { "advantage_max": 1.2861799150705338, "advantage_mean": 6.208821234920947e-10, "advantage_min": -0.6494872495532036, "advantage_std": 0.7340018600225449, "completion_length": 2357.833366394043, "epoch": 0.004571428571428572, "grad_norm": 0.1758621782064438, "kl": 3.581494092941284e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8e-08, "loss": 0.0506, "reward": 0.11002347664907575, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11002347664907575, "reward_after_std": 0.7340018600225449, "reward_before_mean": 0.40769497863948345, "reward_before_std": 0.716522503644228, "reward_change_max": 0.00016899406909942627, "reward_change_mean": -0.2976714950054884, "reward_change_min": -0.5929372683167458, "reward_change_std": 0.22305172309279442, "reward_std": 0.734001874923706, "rewards/cosine_scaled_reward": -0.09823585068807006, "rewards/format_reward": 0.604166679084301, "step": 4 }, { "advantage_max": 1.3395042344927788, "advantage_mean": 5.587935725248627e-09, "advantage_min": -0.6843537390232086, "advantage_std": 0.7548569068312645, "completion_length": 3192.6459045410156, "epoch": 0.005714285714285714, "grad_norm": 0.1231139749288559, "kl": 3.738701343536377e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-07, "loss": 0.0299, "reward": -0.13100245175883174, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13100245175883174, "reward_after_std": 0.7548569180071354, "reward_before_mean": 0.07772888010367751, "reward_before_std": 0.7701464891433716, "reward_change_max": 0.00015503168106079102, "reward_change_mean": -0.20873133465647697, "reward_change_min": -0.44208815693855286, "reward_change_std": 0.18292431626468897, "reward_std": 0.7548569515347481, "rewards/cosine_scaled_reward": -0.15905223228037357, "rewards/format_reward": 0.39583334140479565, "step": 5 }, { "advantage_max": 1.438813678920269, "advantage_mean": 3.725290076417309e-09, "advantage_min": -0.5762999951839447, "advantage_std": 0.7623046673834324, "completion_length": 3154.604202270508, "epoch": 0.006857142857142857, "grad_norm": 0.14749404788017273, "kl": 3.841519355773926e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2e-07, "loss": 0.0406, "reward": -0.23128792829811573, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23128792829811573, "reward_after_std": 0.7623046673834324, "reward_before_mean": -0.06624754145741463, "reward_before_std": 0.7461451888084412, "reward_change_max": 0.0002317279577255249, "reward_change_mean": -0.16504040150903165, "reward_change_min": -0.3834940567612648, "reward_change_std": 0.14161420124582946, "reward_std": 0.762304674834013, "rewards/cosine_scaled_reward": -0.17895710514858365, "rewards/format_reward": 0.291666679084301, "step": 6 }, { "advantage_max": 1.3337594084441662, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -0.6820095479488373, "advantage_std": 0.7653882391750813, "completion_length": 3154.5209045410156, "epoch": 0.008, "grad_norm": 0.11123297363519669, "kl": 2.4020671844482422e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4e-07, "loss": 0.0161, "reward": -0.04566465876996517, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04566465876996517, "reward_after_std": 0.7653882466256618, "reward_before_mean": 0.19614734745118767, "reward_before_std": 0.7696255072951317, "reward_change_max": 0.000983603298664093, "reward_change_mean": -0.24181200610473752, "reward_change_min": -0.5278668366372585, "reward_change_std": 0.2141664084047079, "reward_std": 0.7653882782906294, "rewards/cosine_scaled_reward": -0.1519263405352831, "rewards/format_reward": 0.5000000093132257, "step": 7 }, { "advantage_max": 1.153030887246132, "advantage_mean": -8.071462720415923e-09, "advantage_min": -0.8050092048943043, "advantage_std": 0.7189371399581432, "completion_length": 2714.7291717529297, "epoch": 0.009142857142857144, "grad_norm": 0.17044638097286224, "kl": 2.3853033781051636e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6e-07, "loss": 0.0227, "reward": 0.3043976202607155, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3043976202607155, "reward_after_std": 0.7189371287822723, "reward_before_mean": 0.6821175068616867, "reward_before_std": 0.7231598682701588, "reward_change_max": 0.0009385868906974792, "reward_change_mean": -0.37771990802139044, "reward_change_min": -0.6880337987095118, "reward_change_std": 0.2805122025310993, "reward_std": 0.7189371511340141, "rewards/cosine_scaled_reward": 0.111892094835639, "rewards/format_reward": 0.4583333395421505, "step": 8 }, { "advantage_max": 1.0858414433896542, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.6400764584541321, "advantage_std": 0.6383172105997801, "completion_length": 3332.1458435058594, "epoch": 0.010285714285714285, "grad_norm": 0.11889996379613876, "kl": 4.9740076065063477e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8e-07, "loss": 0.0537, "reward": -0.2352797817438841, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2352797817438841, "reward_after_std": 0.6383171789348125, "reward_before_mean": -0.04502727324143052, "reward_before_std": 0.6668666442856193, "reward_change_max": 0.0008114203810691833, "reward_change_mean": -0.19025251083076, "reward_change_min": -0.4170127771794796, "reward_change_std": 0.1755459113046527, "reward_std": 0.6383172050118446, "rewards/cosine_scaled_reward": -0.16834697453305125, "rewards/format_reward": 0.2916666716337204, "step": 9 }, { "advantage_max": 1.48471187800169, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.7741466164588928, "advantage_std": 0.8413009904325008, "completion_length": 2900.958335876465, "epoch": 0.011428571428571429, "grad_norm": 0.15922096371650696, "kl": 3.575533628463745e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2e-07, "loss": 0.0418, "reward": -0.09273874759674072, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09273874759674072, "reward_after_std": 0.8413009904325008, "reward_before_mean": 0.11865614727139473, "reward_before_std": 0.8663091957569122, "reward_change_max": 0.0007829740643501282, "reward_change_mean": -0.21139488369226456, "reward_change_min": -0.4577816314995289, "reward_change_std": 0.1882876893505454, "reward_std": 0.8413010165095329, "rewards/cosine_scaled_reward": -0.10733860358595848, "rewards/format_reward": 0.33333334140479565, "step": 10 }, { "advantage_max": 1.0778188593685627, "advantage_mean": 2.1730860888524717e-08, "advantage_min": -0.5051753893494606, "advantage_std": 0.5997787564992905, "completion_length": 3446.5625610351562, "epoch": 0.012571428571428572, "grad_norm": 0.09384988248348236, "kl": 3.8117170333862305e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1999999999999998e-07, "loss": 0.0257, "reward": -0.47689105570316315, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.47689105570316315, "reward_after_std": 0.5997787676751614, "reward_before_mean": -0.37315269373357296, "reward_before_std": 0.6239271499216557, "reward_change_max": 0.0018914267420768738, "reward_change_mean": -0.1037383598741144, "reward_change_min": -0.27506355568766594, "reward_change_std": 0.11384909274056554, "reward_std": 0.599778788164258, "rewards/cosine_scaled_reward": -0.2490763533860445, "rewards/format_reward": 0.12500000186264515, "step": 11 }, { "advantage_max": 1.2988129183650017, "advantage_mean": 1.5522045315741195e-09, "advantage_min": -0.6877567246556282, "advantage_std": 0.7454183585941792, "completion_length": 2431.416702270508, "epoch": 0.013714285714285714, "grad_norm": 0.12335586547851562, "kl": 3.510713577270508e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4e-07, "loss": 0.0696, "reward": 0.06804788112640381, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06804788112640381, "reward_after_std": 0.7454183287918568, "reward_before_mean": 0.35046464344486594, "reward_before_std": 0.741436411626637, "reward_change_max": 0.0008105039596557617, "reward_change_mean": -0.2824167497456074, "reward_change_min": -0.5328971222043037, "reward_change_std": 0.21176872844807804, "reward_std": 0.7454183623194695, "rewards/cosine_scaled_reward": -0.11643435899168253, "rewards/format_reward": 0.583333345130086, "step": 12 }, { "advantage_max": 0.8294573128223419, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.4593823775649071, "advantage_std": 0.4747231351211667, "completion_length": 2889.416702270508, "epoch": 0.014857142857142857, "grad_norm": 0.06548355519771576, "kl": 3.244727849960327e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6e-07, "loss": 0.022, "reward": 0.18717875331640244, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18717875331640244, "reward_after_std": 0.4747231239452958, "reward_before_mean": 0.544963575899601, "reward_before_std": 0.4018893027678132, "reward_change_max": 0.000781618058681488, "reward_change_mean": -0.35778482200112194, "reward_change_min": -0.5649526380002499, "reward_change_std": 0.21307120216079056, "reward_std": 0.47472314070910215, "rewards/cosine_scaled_reward": 0.032898444682359695, "rewards/format_reward": 0.4791666716337204, "step": 13 }, { "advantage_max": 1.322419997304678, "advantage_mean": 2.1109978987077227e-08, "advantage_min": -0.8189655467867851, "advantage_std": 0.8067995980381966, "completion_length": 2697.854248046875, "epoch": 0.016, "grad_norm": 0.15838854014873505, "kl": 2.3767352104187012e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8e-07, "loss": 0.0594, "reward": 0.19783409871160984, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19783409871160984, "reward_after_std": 0.806799590587616, "reward_before_mean": 0.5256497077643871, "reward_before_std": 0.839010551571846, "reward_change_max": 0.0003392919898033142, "reward_change_mean": -0.32781560346484184, "reward_change_min": -0.6789125502109528, "reward_change_std": 0.26764219626784325, "reward_std": 0.8067996315658092, "rewards/cosine_scaled_reward": 0.03365818038582802, "rewards/format_reward": 0.45833333767950535, "step": 14 }, { "advantage_max": 0.9029887653887272, "advantage_mean": 1.1796752907855534e-08, "advantage_min": -0.44812071323394775, "advantage_std": 0.5208632545545697, "completion_length": 2737.166702270508, "epoch": 0.017142857142857144, "grad_norm": 0.054936591535806656, "kl": 2.24970281124115e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3e-07, "loss": 0.0079, "reward": -0.13882983848452568, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13882983848452568, "reward_after_std": 0.5208632694557309, "reward_before_mean": 0.09872580878436565, "reward_before_std": 0.5114677743986249, "reward_change_max": 0.0, "reward_change_mean": -0.2375556300394237, "reward_change_min": -0.4582338333129883, "reward_change_std": 0.16593290120363235, "reward_std": 0.5208632759749889, "rewards/cosine_scaled_reward": -0.12772043980658054, "rewards/format_reward": 0.3541666679084301, "step": 15 }, { "advantage_max": 0.5143468156456947, "advantage_mean": 2.483526917451684e-08, "advantage_min": -0.3243384584784508, "advantage_std": 0.30771737545728683, "completion_length": 3521.9375, "epoch": 0.018285714285714287, "grad_norm": 0.04844345152378082, "kl": 4.1229650378227234e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.2e-07, "loss": 0.0078, "reward": -0.6069029793143272, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.6069029793143272, "reward_after_std": 0.30771736800670624, "reward_before_mean": -0.5090135782957077, "reward_before_std": 0.3248976990580559, "reward_change_max": 0.0016613304615020752, "reward_change_mean": -0.09788939589634538, "reward_change_min": -0.2121806014329195, "reward_change_std": 0.08965697605162859, "reward_std": 0.30771737918257713, "rewards/cosine_scaled_reward": -0.26492345705628395, "rewards/format_reward": 0.02083333395421505, "step": 16 }, { "advantage_max": 1.6180179975926876, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.7726177126169205, "advantage_std": 0.8878979086875916, "completion_length": 2487.3958892822266, "epoch": 0.019428571428571427, "grad_norm": 0.17348401248455048, "kl": 4.330277442932129e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4000000000000003e-07, "loss": 0.0565, "reward": 0.1379982978105545, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1379982978105545, "reward_after_std": 0.8878978937864304, "reward_before_mean": 0.4221195343416184, "reward_before_std": 0.873494204133749, "reward_change_max": 0.0, "reward_change_mean": -0.284121235832572, "reward_change_min": -0.5485293567180634, "reward_change_std": 0.20937805250287056, "reward_std": 0.8878979422152042, "rewards/cosine_scaled_reward": -0.02852356492076069, "rewards/format_reward": 0.4791666753590107, "step": 17 }, { "advantage_max": 1.0580488927662373, "advantage_mean": 5.587935336670569e-09, "advantage_min": -0.6208096109330654, "advantage_std": 0.6125941518694162, "completion_length": 2949.2083740234375, "epoch": 0.02057142857142857, "grad_norm": 0.14818914234638214, "kl": 2.2158026695251465e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6e-07, "loss": 0.0716, "reward": -0.11659494414925575, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11659494414925575, "reward_after_std": 0.6125941406935453, "reward_before_mean": 0.118058524094522, "reward_before_std": 0.6145357359200716, "reward_change_max": 0.0011220648884773254, "reward_change_mean": -0.23465345334261656, "reward_change_min": -0.40788656659424305, "reward_change_std": 0.17815768904983997, "reward_std": 0.6125941649079323, "rewards/cosine_scaled_reward": -0.10763741098344326, "rewards/format_reward": 0.3333333395421505, "step": 18 }, { "advantage_max": 1.2318142503499985, "advantage_mean": 4.346171977864799e-09, "advantage_min": -0.7438315749168396, "advantage_std": 0.7457190416753292, "completion_length": 2978.6875610351562, "epoch": 0.021714285714285714, "grad_norm": 0.11854659020900726, "kl": 2.814456820487976e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.7999999999999996e-07, "loss": 0.0561, "reward": 0.14310528058558702, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14310528058558702, "reward_after_std": 0.7457190491259098, "reward_before_mean": 0.45691246911883354, "reward_before_std": 0.7543267421424389, "reward_change_max": 0.0008727908134460449, "reward_change_mean": -0.3138071997091174, "reward_change_min": -0.5745680537074804, "reward_change_std": 0.2437105644494295, "reward_std": 0.7457190677523613, "rewards/cosine_scaled_reward": 0.009706247597932816, "rewards/format_reward": 0.43750000931322575, "step": 19 }, { "advantage_max": 1.3907008990645409, "advantage_mean": -1.552204281773939e-09, "advantage_min": -0.7317355498671532, "advantage_std": 0.7999358735978603, "completion_length": 2609.3959045410156, "epoch": 0.022857142857142857, "grad_norm": 0.13953657448291779, "kl": 2.8684735298156738e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4e-07, "loss": 0.0822, "reward": 0.31031588884070516, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.31031588884070516, "reward_after_std": 0.7999358661472797, "reward_before_mean": 0.6719230581074953, "reward_before_std": 0.7620627535507083, "reward_change_max": 3.715604543685913e-05, "reward_change_mean": -0.36160713620483875, "reward_change_min": -0.6067077554762363, "reward_change_std": 0.2561584496870637, "reward_std": 0.7999358959496021, "rewards/cosine_scaled_reward": 0.033878179267048836, "rewards/format_reward": 0.6041666734963655, "step": 20 }, { "advantage_max": 1.4857213720679283, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.8182055205106735, "advantage_std": 0.8357969745993614, "completion_length": 2668.145866394043, "epoch": 0.024, "grad_norm": 0.11965252459049225, "kl": 3.9711594581604004e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1999999999999995e-07, "loss": 0.0684, "reward": 0.039832099340856075, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.039832099340856075, "reward_after_std": 0.8357969783246517, "reward_before_mean": 0.2982704224996269, "reward_before_std": 0.8404805772006512, "reward_change_max": 0.0007499381899833679, "reward_change_mean": -0.25843835016712546, "reward_change_min": -0.5017091780900955, "reward_change_std": 0.20348974969238043, "reward_std": 0.8357969857752323, "rewards/cosine_scaled_reward": -0.06961478537414223, "rewards/format_reward": 0.43750001303851604, "step": 21 }, { "advantage_max": 0.9170179665088654, "advantage_mean": -1.2417634753791873e-08, "advantage_min": -0.6024731658399105, "advantage_std": 0.5401135012507439, "completion_length": 1652.4166793823242, "epoch": 0.025142857142857144, "grad_norm": 0.06278178095817566, "kl": 3.166962414979935e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3999999999999997e-07, "loss": -0.0109, "reward": 0.2182811200618744, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2182811200618744, "reward_after_std": 0.540113490074873, "reward_before_mean": 0.5821179002523422, "reward_before_std": 0.5008180625736713, "reward_change_max": 0.0, "reward_change_mean": -0.3638368174433708, "reward_change_min": -0.5668350532650948, "reward_change_std": 0.22511073760688305, "reward_std": 0.5401134938001633, "rewards/cosine_scaled_reward": -0.11519105918705463, "rewards/format_reward": 0.8125, "step": 22 }, { "advantage_max": 1.2377678006887436, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.7406592965126038, "advantage_std": 0.7070612488314509, "completion_length": 2549.0833740234375, "epoch": 0.026285714285714287, "grad_norm": 0.10234551876783371, "kl": 2.5782734155654907e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.6e-07, "loss": 0.0368, "reward": -0.08573945984244347, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08573945984244347, "reward_after_std": 0.7070612283423543, "reward_before_mean": 0.14411595277488232, "reward_before_std": 0.7186607727780938, "reward_change_max": 0.0006104856729507446, "reward_change_mean": -0.2298554142471403, "reward_change_min": -0.48414019867777824, "reward_change_std": 0.18986564758233726, "reward_std": 0.7070612665265799, "rewards/cosine_scaled_reward": -0.146692031994462, "rewards/format_reward": 0.43750001303851604, "step": 23 }, { "advantage_max": 1.4403239861130714, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.9893696680665016, "advantage_std": 0.8664927519857883, "completion_length": 2878.916702270508, "epoch": 0.027428571428571427, "grad_norm": 0.1490224152803421, "kl": 1.432560384273529e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.8e-07, "loss": 0.0584, "reward": 0.3888702280819416, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3888702280819416, "reward_after_std": 0.8664927519857883, "reward_before_mean": 0.7756715640425682, "reward_before_std": 0.8816492557525635, "reward_change_max": 0.0002819374203681946, "reward_change_mean": -0.3868013136088848, "reward_change_min": -0.6882800199091434, "reward_change_std": 0.28499543759971857, "reward_std": 0.8664927929639816, "rewards/cosine_scaled_reward": 0.10658576083369553, "rewards/format_reward": 0.5625000223517418, "step": 24 }, { "advantage_max": 0.9528279937803745, "advantage_mean": 7.761021575403149e-09, "advantage_min": -0.6989870108664036, "advantage_std": 0.5915895830839872, "completion_length": 2845.562515258789, "epoch": 0.02857142857142857, "grad_norm": 0.149390310049057, "kl": 4.055723547935486e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5e-07, "loss": 0.059, "reward": -0.12853457941673696, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12853457941673696, "reward_after_std": 0.5915896017104387, "reward_before_mean": 0.10707342872046866, "reward_before_std": 0.6251619644463062, "reward_change_max": 0.0005048885941505432, "reward_change_mean": -0.23560800775885582, "reward_change_min": -0.48452158086001873, "reward_change_std": 0.19485379848629236, "reward_std": 0.5915896091610193, "rewards/cosine_scaled_reward": -0.1235466287471354, "rewards/format_reward": 0.3541666716337204, "step": 25 }, { "advantage_max": 0.951295755803585, "advantage_mean": 4.34617203337595e-09, "advantage_min": -0.5947780013084412, "advantage_std": 0.5643060579895973, "completion_length": 2929.104217529297, "epoch": 0.029714285714285714, "grad_norm": 0.09629985690116882, "kl": 2.4005770683288574e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.2e-07, "loss": 0.0539, "reward": 0.045638392213732004, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.045638392213732004, "reward_after_std": 0.5643060579895973, "reward_before_mean": 0.3457363471388817, "reward_before_std": 0.5388211533427238, "reward_change_max": 0.0, "reward_change_mean": -0.30009796749800444, "reward_change_min": -0.5169609375298023, "reward_change_std": 0.2061476781964302, "reward_std": 0.5643060654401779, "rewards/cosine_scaled_reward": -0.04588180594146252, "rewards/format_reward": 0.43750001303851604, "step": 26 }, { "advantage_max": 1.1732506714761257, "advantage_mean": -1.5522043372850902e-08, "advantage_min": -0.7885406948626041, "advantage_std": 0.7137688174843788, "completion_length": 2981.6042098999023, "epoch": 0.030857142857142857, "grad_norm": 0.14982038736343384, "kl": 2.8835609555244446e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.4e-07, "loss": 0.0481, "reward": 0.09526193886995316, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09526193886995316, "reward_after_std": 0.7137688249349594, "reward_before_mean": 0.3976815016940236, "reward_before_std": 0.7458325773477554, "reward_change_max": 0.000577881932258606, "reward_change_mean": -0.30241959635168314, "reward_change_min": -0.5662531852722168, "reward_change_std": 0.22851586807519197, "reward_std": 0.7137688659131527, "rewards/cosine_scaled_reward": -0.040742579847574234, "rewards/format_reward": 0.479166679084301, "step": 27 }, { "advantage_max": 1.4417070969939232, "advantage_mean": -1.1175871450497255e-08, "advantage_min": -0.7128314636647701, "advantage_std": 0.789174672216177, "completion_length": 2737.812530517578, "epoch": 0.032, "grad_norm": 0.12516100704669952, "kl": 2.178177237510681e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.6e-07, "loss": 0.0128, "reward": 0.24161657877266407, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24161657877266407, "reward_after_std": 0.7891746796667576, "reward_before_mean": 0.5766212344169617, "reward_before_std": 0.741939015686512, "reward_change_max": 1.2122094631195068e-05, "reward_change_mean": -0.3350046342238784, "reward_change_min": -0.5396411195397377, "reward_change_std": 0.21770456805825233, "reward_std": 0.78917470946908, "rewards/cosine_scaled_reward": 0.05914393765851855, "rewards/format_reward": 0.4583333395421505, "step": 28 }, { "advantage_max": 0.9136006869375706, "advantage_mean": 1.8626452213954536e-08, "advantage_min": -0.46621380001306534, "advantage_std": 0.5133081059902906, "completion_length": 3170.187545776367, "epoch": 0.03314285714285714, "grad_norm": 0.11526963859796524, "kl": 1.806020736694336e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.8e-07, "loss": 0.068, "reward": -0.2716685086488724, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2716685086488724, "reward_after_std": 0.5133080948144197, "reward_before_mean": -0.08273126650601625, "reward_before_std": 0.5100496802479029, "reward_change_max": 0.0, "reward_change_mean": -0.18893724866211414, "reward_change_min": -0.33541516587138176, "reward_change_std": 0.13039901200681925, "reward_std": 0.5133081208914518, "rewards/cosine_scaled_reward": -0.15594896860420704, "rewards/format_reward": 0.22916667349636555, "step": 29 }, { "advantage_max": 1.8409449309110641, "advantage_mean": -2.204130222782652e-08, "advantage_min": -0.9385720863938332, "advantage_std": 1.040012452751398, "completion_length": 3010.666702270508, "epoch": 0.03428571428571429, "grad_norm": 0.210233673453331, "kl": 2.4873297661542892e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6e-07, "loss": 0.0371, "reward": 0.4122390305856243, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4122390305856243, "reward_after_std": 1.0400124676525593, "reward_before_mean": 0.7786657903343439, "reward_before_std": 1.038475975394249, "reward_change_max": 0.000269085168838501, "reward_change_mean": -0.3664267407730222, "reward_change_min": -0.7022055611014366, "reward_change_std": 0.2809586049988866, "reward_std": 1.0400125198066235, "rewards/cosine_scaled_reward": 0.12891620831214823, "rewards/format_reward": 0.5208333488553762, "step": 30 }, { "advantage_max": 1.3186752051115036, "advantage_mean": 2.5456151742098143e-08, "advantage_min": -0.6435412839055061, "advantage_std": 0.7557070441544056, "completion_length": 2892.187545776367, "epoch": 0.03542857142857143, "grad_norm": 0.13154418766498566, "kl": 1.9848346710205078e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.2e-07, "loss": 0.0475, "reward": -0.2269106972962618, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2269106972962618, "reward_after_std": 0.7557070478796959, "reward_before_mean": -0.051385149359703064, "reward_before_std": 0.788501251488924, "reward_change_max": 0.0, "reward_change_mean": -0.17552555818110704, "reward_change_min": -0.4460537787526846, "reward_change_std": 0.18104476854205132, "reward_std": 0.7557070702314377, "rewards/cosine_scaled_reward": -0.19235923327505589, "rewards/format_reward": 0.33333333767950535, "step": 31 }, { "advantage_max": 1.3948543444275856, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.7191313877701759, "advantage_std": 0.7711601257324219, "completion_length": 3234.8333740234375, "epoch": 0.036571428571428574, "grad_norm": 0.10185429453849792, "kl": 2.5451648980379105e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.4e-07, "loss": 0.0248, "reward": 0.06370437087025493, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06370437087025493, "reward_after_std": 0.7711601257324219, "reward_before_mean": 0.338614858686924, "reward_before_std": 0.74234314635396, "reward_change_max": 0.0004075467586517334, "reward_change_mean": -0.2749105137772858, "reward_change_min": -0.49326785281300545, "reward_change_std": 0.20088558923453093, "reward_std": 0.7711601331830025, "rewards/cosine_scaled_reward": -0.007775906473398209, "rewards/format_reward": 0.35416667349636555, "step": 32 }, { "advantage_max": 1.4922088123857975, "advantage_mean": 5.587935225648266e-09, "advantage_min": -0.7815022058784962, "advantage_std": 0.8810334913432598, "completion_length": 3333.5834045410156, "epoch": 0.037714285714285714, "grad_norm": 0.12162205576896667, "kl": 3.6016106605529785e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.6e-07, "loss": 0.0661, "reward": -0.06509976089000702, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06509976089000702, "reward_after_std": 0.8810334764420986, "reward_before_mean": 0.1571635976433754, "reward_before_std": 0.9383543431758881, "reward_change_max": 0.00016274303197860718, "reward_change_mean": -0.2222633557394147, "reward_change_min": -0.5449208281934261, "reward_change_std": 0.22707031201571226, "reward_std": 0.881033506244421, "rewards/cosine_scaled_reward": -0.056834882125258446, "rewards/format_reward": 0.2708333395421505, "step": 33 }, { "advantage_max": 1.4944884777069092, "advantage_mean": -3.228585032655218e-08, "advantage_min": -0.9274916350841522, "advantage_std": 0.9044716916978359, "completion_length": 2455.6458740234375, "epoch": 0.038857142857142854, "grad_norm": 0.14022988080978394, "kl": 0.00020068883895874023, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.800000000000001e-07, "loss": 0.0203, "reward": 0.346778467297554, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.346778467297554, "reward_after_std": 0.9044716954231262, "reward_before_mean": 0.71558902785182, "reward_before_std": 0.9435294065624475, "reward_change_max": 0.00022082775831222534, "reward_change_mean": -0.36881057592108846, "reward_change_min": -0.6944437511265278, "reward_change_std": 0.28569589368999004, "reward_std": 0.904471717774868, "rewards/cosine_scaled_reward": 0.097377834841609, "rewards/format_reward": 0.5208333432674408, "step": 34 }, { "advantage_max": 1.3393050953745842, "advantage_mean": -8.07146305348283e-09, "advantage_min": -0.7060817331075668, "advantage_std": 0.7657259926199913, "completion_length": 3187.5833892822266, "epoch": 0.04, "grad_norm": 0.110480897128582, "kl": 0.00013943761587142944, "lambda_div_used": 0.7000000000000001, "learning_rate": 7e-07, "loss": 0.017, "reward": -0.1551370123634115, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1551370123634115, "reward_after_std": 0.7657259963452816, "reward_before_mean": 0.04514682665467262, "reward_before_std": 0.7847169302403927, "reward_change_max": 0.0, "reward_change_mean": -0.20028384402394295, "reward_change_min": -0.41303151473402977, "reward_change_std": 0.1785027701407671, "reward_std": 0.7657260186970234, "rewards/cosine_scaled_reward": -0.13367659132927656, "rewards/format_reward": 0.31250000558793545, "step": 35 }, { "advantage_max": 0.6390106528997421, "advantage_mean": 1.5522043650406658e-08, "advantage_min": -0.3384590819478035, "advantage_std": 0.3662732969969511, "completion_length": 3560.0416870117188, "epoch": 0.04114285714285714, "grad_norm": 0.06496626883745193, "kl": 7.263757288455963e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.2e-07, "loss": 0.0044, "reward": -0.551641970872879, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.551641970872879, "reward_after_std": 0.3662732820957899, "reward_before_mean": -0.44342844747006893, "reward_before_std": 0.3724683914333582, "reward_change_max": 0.0006786733865737915, "reward_change_mean": -0.10821351455524564, "reward_change_min": -0.224327702075243, "reward_change_std": 0.09183390927501023, "reward_std": 0.36627329140901566, "rewards/cosine_scaled_reward": -0.2529642302542925, "rewards/format_reward": 0.06250000186264515, "step": 36 }, { "advantage_max": 0.7393645793199539, "advantage_mean": 1.7384688244526103e-08, "advantage_min": -0.4619470313191414, "advantage_std": 0.44865885004401207, "completion_length": 3252.2708435058594, "epoch": 0.04228571428571429, "grad_norm": 0.08295831084251404, "kl": 5.307118408381939e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.4e-07, "loss": 0.022, "reward": -0.35455665923655033, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.35455665923655033, "reward_after_std": 0.44865884631872177, "reward_before_mean": -0.18188916146755219, "reward_before_std": 0.46603875420987606, "reward_change_max": 0.0007766708731651306, "reward_change_mean": -0.17266748752444983, "reward_change_min": -0.36579640582203865, "reward_change_std": 0.14287791587412357, "reward_std": 0.44865885376930237, "rewards/cosine_scaled_reward": -0.1951112560927868, "rewards/format_reward": 0.2083333358168602, "step": 37 }, { "advantage_max": 0.6996339820325375, "advantage_mean": 1.98682153507157e-08, "advantage_min": -0.484468013048172, "advantage_std": 0.410785099491477, "completion_length": 3253.0416870117188, "epoch": 0.04342857142857143, "grad_norm": 0.053032856434583664, "kl": 6.0859136283397675e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.599999999999999e-07, "loss": 0.0058, "reward": -0.33535145223140717, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.33535145223140717, "reward_after_std": 0.41078509762883186, "reward_before_mean": -0.15529391914606094, "reward_before_std": 0.41112925857305527, "reward_change_max": 0.002039514482021332, "reward_change_mean": -0.1800575191155076, "reward_change_min": -0.30918743275105953, "reward_change_std": 0.1306900419294834, "reward_std": 0.41078510507941246, "rewards/cosine_scaled_reward": -0.150563626550138, "rewards/format_reward": 0.14583333395421505, "step": 38 }, { "advantage_max": 0.9096625335514545, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.46032148599624634, "advantage_std": 0.5079090781509876, "completion_length": 2682.062530517578, "epoch": 0.044571428571428574, "grad_norm": 0.0740419402718544, "kl": 0.0001322571188211441, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.799999999999999e-07, "loss": 0.0176, "reward": 0.18389996141195297, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18389996141195297, "reward_after_std": 0.5079090669751167, "reward_before_mean": 0.5366221237927675, "reward_before_std": 0.413540075533092, "reward_change_max": 0.0003514885902404785, "reward_change_mean": -0.35272214096039534, "reward_change_min": -0.5291457362473011, "reward_change_std": 0.20978331100195646, "reward_std": 0.5079090893268585, "rewards/cosine_scaled_reward": -0.023355623707175255, "rewards/format_reward": 0.5833333414047956, "step": 39 }, { "advantage_max": 1.2809822633862495, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.6526738554239273, "advantage_std": 0.6940833032131195, "completion_length": 2629.8541870117188, "epoch": 0.045714285714285714, "grad_norm": 0.12017546594142914, "kl": 0.00029501691460609436, "lambda_div_used": 0.7000000000000001, "learning_rate": 8e-07, "loss": 0.0788, "reward": 0.07090002810582519, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07090002810582519, "reward_after_std": 0.6940833032131195, "reward_before_mean": 0.3545444831252098, "reward_before_std": 0.6401285659521818, "reward_change_max": 0.00034108012914657593, "reward_change_mean": -0.28364445082843304, "reward_change_min": -0.5075650922954082, "reward_change_std": 0.1958700818940997, "reward_std": 0.6940833181142807, "rewards/cosine_scaled_reward": -0.07272776251193136, "rewards/format_reward": 0.5000000074505806, "step": 40 }, { "advantage_max": 1.0797162391245365, "advantage_mean": 1.490116185998147e-08, "advantage_min": -0.6975666042417288, "advantage_std": 0.6240663919597864, "completion_length": 3017.2916870117188, "epoch": 0.046857142857142854, "grad_norm": 0.09810768067836761, "kl": 0.00014271587133407593, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.199999999999999e-07, "loss": 0.0073, "reward": -0.19936674274504185, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19936674274504185, "reward_after_std": 0.6240663770586252, "reward_before_mean": 0.0034020310267806053, "reward_before_std": 0.6362273693084717, "reward_change_max": 0.0014721229672431946, "reward_change_mean": -0.20276877097785473, "reward_change_min": -0.3929155748337507, "reward_change_std": 0.16917146416381001, "reward_std": 0.6240663770586252, "rewards/cosine_scaled_reward": -0.20663231890648603, "rewards/format_reward": 0.4166666753590107, "step": 41 }, { "advantage_max": 0.9113545380532742, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.48987017199397087, "advantage_std": 0.5287072211503983, "completion_length": 2871.6250228881836, "epoch": 0.048, "grad_norm": 0.0773473009467125, "kl": 0.00018630176782608032, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.399999999999999e-07, "loss": -0.0028, "reward": -0.2969640102237463, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2969640102237463, "reward_after_std": 0.5287072211503983, "reward_before_mean": -0.11779316561296582, "reward_before_std": 0.5369173996150494, "reward_change_max": 0.0, "reward_change_mean": -0.17917086835950613, "reward_change_min": -0.34727344289422035, "reward_change_std": 0.14140096702612936, "reward_std": 0.5287072323262691, "rewards/cosine_scaled_reward": -0.23597991233691573, "rewards/format_reward": 0.35416666977107525, "step": 42 }, { "advantage_max": 1.219473421573639, "advantage_mean": 9.313225579621331e-09, "advantage_min": -0.641626700758934, "advantage_std": 0.6846173517405987, "completion_length": 3060.500030517578, "epoch": 0.04914285714285714, "grad_norm": 0.11512342095375061, "kl": 0.00010600313544273376, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.599999999999999e-07, "loss": 0.0332, "reward": -0.21309549175202847, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21309549175202847, "reward_after_std": 0.6846173368394375, "reward_before_mean": -0.025469645857810974, "reward_before_std": 0.6994537971913815, "reward_change_max": 0.0007215887308120728, "reward_change_mean": -0.18762585893273354, "reward_change_min": -0.3715638890862465, "reward_change_std": 0.15713506587781012, "reward_std": 0.6846173815429211, "rewards/cosine_scaled_reward": -0.12731814879225567, "rewards/format_reward": 0.2291666679084301, "step": 43 }, { "advantage_max": 1.4568804576992989, "advantage_mean": -1.3659397612997282e-08, "advantage_min": -0.6851945444941521, "advantage_std": 0.8075256794691086, "completion_length": 2762.770866394043, "epoch": 0.05028571428571429, "grad_norm": 0.11844295263290405, "kl": 0.00044381991028785706, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.799999999999999e-07, "loss": 0.0426, "reward": 0.10861534625291824, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10861534625291824, "reward_after_std": 0.807525709271431, "reward_before_mean": 0.393554262816906, "reward_before_std": 0.7881722338497639, "reward_change_max": 0.0009457021951675415, "reward_change_mean": -0.2849389025941491, "reward_change_min": -0.5463197156786919, "reward_change_std": 0.2175145372748375, "reward_std": 0.8075257502496243, "rewards/cosine_scaled_reward": -0.04280621279031038, "rewards/format_reward": 0.4791666753590107, "step": 44 }, { "advantage_max": 1.256616048514843, "advantage_mean": 1.2417631367611648e-09, "advantage_min": -0.7105851396918297, "advantage_std": 0.7391474787145853, "completion_length": 3474.5208435058594, "epoch": 0.05142857142857143, "grad_norm": 0.10600760579109192, "kl": 0.00013585388660430908, "lambda_div_used": 0.7000000000000001, "learning_rate": 9e-07, "loss": 0.0036, "reward": -0.13553864229470491, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13553864229470491, "reward_after_std": 0.7391474638134241, "reward_before_mean": 0.07757800817489624, "reward_before_std": 0.7742020450532436, "reward_change_max": 0.0005881339311599731, "reward_change_mean": -0.21311665140092373, "reward_change_min": -0.4759393446147442, "reward_change_std": 0.19536291249096394, "reward_std": 0.7391474694013596, "rewards/cosine_scaled_reward": -0.08621100289747119, "rewards/format_reward": 0.25000000931322575, "step": 45 }, { "advantage_max": 0.9702697545289993, "advantage_mean": 2.6077032755367213e-08, "advantage_min": -0.4514354318380356, "advantage_std": 0.541413675993681, "completion_length": 3213.500015258789, "epoch": 0.052571428571428575, "grad_norm": 0.09703514724969864, "kl": 0.00024537742137908936, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.2e-07, "loss": 0.0322, "reward": -0.3946500001475215, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3946500001475215, "reward_after_std": 0.5414136610925198, "reward_before_mean": -0.2541161496192217, "reward_before_std": 0.5485969968140125, "reward_change_max": 0.00014875829219818115, "reward_change_mean": -0.14053383423015475, "reward_change_min": -0.2998774442821741, "reward_change_std": 0.12114948220551014, "reward_std": 0.5414136685431004, "rewards/cosine_scaled_reward": -0.21039140783250332, "rewards/format_reward": 0.1666666679084301, "step": 46 }, { "advantage_max": 1.5182860642671585, "advantage_mean": 3.725290076417309e-09, "advantage_min": -0.8598650246858597, "advantage_std": 0.9061323516070843, "completion_length": 2918.541702270508, "epoch": 0.053714285714285714, "grad_norm": 0.1575622260570526, "kl": 0.00014133378863334656, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.399999999999999e-07, "loss": 0.1144, "reward": 0.02792397327721119, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02792397327721119, "reward_after_std": 0.906132385134697, "reward_before_mean": 0.2801897209137678, "reward_before_std": 0.963929258286953, "reward_change_max": 0.0010666772723197937, "reward_change_mean": -0.2522657341323793, "reward_change_min": -0.5717835687100887, "reward_change_std": 0.23776433896273375, "reward_std": 0.906132385134697, "rewards/cosine_scaled_reward": -0.04740514978766441, "rewards/format_reward": 0.37500000931322575, "step": 47 }, { "advantage_max": 1.406430073082447, "advantage_mean": 3.725290853573426e-09, "advantage_min": -0.8095233663916588, "advantage_std": 0.8473255261778831, "completion_length": 2847.0833587646484, "epoch": 0.054857142857142854, "grad_norm": 0.1331583559513092, "kl": 0.0009892657399177551, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.6e-07, "loss": 0.0373, "reward": 0.08239079266786575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08239079266786575, "reward_after_std": 0.8473255299031734, "reward_before_mean": 0.36185096576809883, "reward_before_std": 0.8943199291825294, "reward_change_max": 0.0010818466544151306, "reward_change_mean": -0.2794601647183299, "reward_change_min": -0.6500570997595787, "reward_change_std": 0.24804373178631067, "reward_std": 0.847325537353754, "rewards/cosine_scaled_reward": -0.01699118735268712, "rewards/format_reward": 0.39583333767950535, "step": 48 }, { "advantage_max": 1.3203575275838375, "advantage_mean": -1.1175871339474952e-08, "advantage_min": -0.8599809035658836, "advantage_std": 0.8113159202039242, "completion_length": 2324.7292251586914, "epoch": 0.056, "grad_norm": 0.11754000186920166, "kl": 0.0004737917333841324, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.8e-07, "loss": 0.0227, "reward": 0.22125684656202793, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22125684656202793, "reward_after_std": 0.8113159164786339, "reward_before_mean": 0.557825468480587, "reward_before_std": 0.8478972613811493, "reward_change_max": 0.0, "reward_change_mean": -0.3365686163306236, "reward_change_min": -0.6611680220812559, "reward_change_std": 0.26093919202685356, "reward_std": 0.8113159202039242, "rewards/cosine_scaled_reward": -0.023170609027147293, "rewards/format_reward": 0.6041666753590107, "step": 49 }, { "advantage_max": 1.2256408035755157, "advantage_mean": -1.3659398390153399e-08, "advantage_min": -0.7610447257757187, "advantage_std": 0.7761923484504223, "completion_length": 2976.6458740234375, "epoch": 0.05714285714285714, "grad_norm": 0.21591882407665253, "kl": 0.0005423109978437424, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-06, "loss": 0.0274, "reward": 0.21824848279356956, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21824848279356956, "reward_after_std": 0.7761923521757126, "reward_before_mean": 0.5609084158204496, "reward_before_std": 0.8036927813664079, "reward_change_max": 0.0, "reward_change_mean": -0.34265993256121874, "reward_change_min": -0.6551012843847275, "reward_change_std": 0.28258705232292414, "reward_std": 0.7761923968791962, "rewards/cosine_scaled_reward": 0.0721208662725985, "rewards/format_reward": 0.41666667349636555, "step": 50 }, { "advantage_max": 0.9211089834570885, "advantage_mean": 1.365939800157534e-08, "advantage_min": -0.5625776872038841, "advantage_std": 0.5281440187245607, "completion_length": 2261.437515258789, "epoch": 0.05828571428571429, "grad_norm": 0.0686202198266983, "kl": 0.0023306608200073242, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999890338174275e-07, "loss": 0.0083, "reward": 0.09554161503911018, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09554161503911018, "reward_after_std": 0.5281440075486898, "reward_before_mean": 0.41558761009946465, "reward_before_std": 0.4915418364107609, "reward_change_max": 0.0001592785120010376, "reward_change_mean": -0.32004596339538693, "reward_change_min": -0.5329243093729019, "reward_change_std": 0.19881984498351812, "reward_std": 0.5281440112739801, "rewards/cosine_scaled_reward": -0.07345621287822723, "rewards/format_reward": 0.5625, "step": 51 }, { "advantage_max": 1.6391232684254646, "advantage_mean": -6.2088170160734535e-09, "advantage_min": -0.9773569256067276, "advantage_std": 0.9745854027569294, "completion_length": 3048.729217529297, "epoch": 0.05942857142857143, "grad_norm": 0.13857057690620422, "kl": 0.0015719830989837646, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999561358041868e-07, "loss": 0.0581, "reward": 0.18856710754334927, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18856710754334927, "reward_after_std": 0.97458541020751, "reward_before_mean": 0.4898769208230078, "reward_before_std": 1.0282159596681595, "reward_change_max": 0.001281455159187317, "reward_change_mean": -0.30130978557281196, "reward_change_min": -0.6202972773462534, "reward_change_std": 0.27393114077858627, "reward_std": 0.9745854176580906, "rewards/cosine_scaled_reward": 0.0470217689871788, "rewards/format_reward": 0.39583333767950535, "step": 52 }, { "advantage_max": 1.5374806299805641, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.955609530210495, "advantage_std": 0.8936157710850239, "completion_length": 2898.937545776367, "epoch": 0.060571428571428575, "grad_norm": 0.1304454803466797, "kl": 0.0008092299103736877, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999013075636804e-07, "loss": 0.0303, "reward": 0.15414141491055489, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15414141491055489, "reward_after_std": 0.8936157636344433, "reward_before_mean": 0.451077688485384, "reward_before_std": 0.9141882732510567, "reward_change_max": 0.0007978379726409912, "reward_change_mean": -0.2969362363219261, "reward_change_min": -0.5617908462882042, "reward_change_std": 0.236876605078578, "reward_std": 0.8936157822608948, "rewards/cosine_scaled_reward": -0.0036278427578508854, "rewards/format_reward": 0.4583333432674408, "step": 53 }, { "advantage_max": 1.4440747387707233, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.9045711942017078, "advantage_std": 0.8898514695465565, "completion_length": 2969.437530517578, "epoch": 0.061714285714285715, "grad_norm": 0.1441929042339325, "kl": 0.0003513023257255554, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.998245517681593e-07, "loss": 0.0623, "reward": 0.4009180925786495, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4009180925786495, "reward_after_std": 0.8898514620959759, "reward_before_mean": 0.7924199029803276, "reward_before_std": 0.9223152846097946, "reward_change_max": 0.0, "reward_change_mean": -0.3915017740800977, "reward_change_min": -0.7978204973042011, "reward_change_std": 0.31471778359264135, "reward_std": 0.8898514732718468, "rewards/cosine_scaled_reward": 0.15662658959627151, "rewards/format_reward": 0.4791666753590107, "step": 54 }, { "advantage_max": 1.3430213667452335, "advantage_mean": -1.0554989660072067e-08, "advantage_min": -0.6609198525547981, "advantage_std": 0.7468325290828943, "completion_length": 3155.041732788086, "epoch": 0.06285714285714286, "grad_norm": 0.1293783038854599, "kl": 0.0011532604694366455, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.997258721585931e-07, "loss": 0.0199, "reward": 0.04226991068571806, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04226991068571806, "reward_after_std": 0.7468325309455395, "reward_before_mean": 0.3128039035946131, "reward_before_std": 0.7251314949244261, "reward_change_max": 0.0, "reward_change_mean": -0.2705340012907982, "reward_change_min": -0.4766126349568367, "reward_change_std": 0.20029573515057564, "reward_std": 0.746832549571991, "rewards/cosine_scaled_reward": -0.010264725424349308, "rewards/format_reward": 0.33333333767950535, "step": 55 }, { "advantage_max": 1.0376209765672684, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.8000453487038612, "advantage_std": 0.6578298974782228, "completion_length": 3013.416679382324, "epoch": 0.064, "grad_norm": 0.11045597493648529, "kl": 0.0005884170532226562, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.996052735444862e-07, "loss": 0.0461, "reward": 0.003945750184357166, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.003945750184357166, "reward_after_std": 0.6578298974782228, "reward_before_mean": 0.28323000913951546, "reward_before_std": 0.702507134526968, "reward_change_max": 0.0010666921734809875, "reward_change_mean": -0.27928424440324306, "reward_change_min": -0.5404662974178791, "reward_change_std": 0.22895370610058308, "reward_std": 0.6578299012035131, "rewards/cosine_scaled_reward": -0.056301675736904144, "rewards/format_reward": 0.3958333395421505, "step": 56 }, { "advantage_max": 0.8575425706803799, "advantage_mean": -5.587935281159417e-09, "advantage_min": -0.6129637286067009, "advantage_std": 0.5624299887567759, "completion_length": 3314.312530517578, "epoch": 0.06514285714285714, "grad_norm": 0.08359239995479584, "kl": 0.0003393888473510742, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.994627618036452e-07, "loss": 0.0254, "reward": -0.23580889869481325, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23580889869481325, "reward_after_std": 0.5624299924820662, "reward_before_mean": -0.028469612821936607, "reward_before_std": 0.6206865087151527, "reward_change_max": 0.00027373433113098145, "reward_change_mean": -0.2073393096216023, "reward_change_min": -0.45087942108511925, "reward_change_std": 0.19378007715567946, "reward_std": 0.5624300055205822, "rewards/cosine_scaled_reward": -0.14965146128088236, "rewards/format_reward": 0.27083333767950535, "step": 57 }, { "advantage_max": 1.5007806494832039, "advantage_mean": 1.862645426786713e-09, "advantage_min": -0.9062970317900181, "advantage_std": 0.8869751691818237, "completion_length": 2272.770881652832, "epoch": 0.06628571428571428, "grad_norm": 0.12403688579797745, "kl": 0.007107377052307129, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.992983438818915e-07, "loss": 0.059, "reward": 0.43791239289566875, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43791239289566875, "reward_after_std": 0.8869751468300819, "reward_before_mean": 0.8376260045915842, "reward_before_std": 0.8879306688904762, "reward_change_max": 0.0, "reward_change_mean": -0.3997136056423187, "reward_change_min": -0.7445529289543629, "reward_change_std": 0.29559123795479536, "reward_std": 0.8869751766324043, "rewards/cosine_scaled_reward": 0.08547966694459319, "rewards/format_reward": 0.6666666753590107, "step": 58 }, { "advantage_max": 0.9958036541938782, "advantage_mean": -1.5522043483873205e-08, "advantage_min": -0.6712572649121284, "advantage_std": 0.6627084948122501, "completion_length": 2968.375015258789, "epoch": 0.06742857142857143, "grad_norm": 0.13538452982902527, "kl": 0.0009574443101882935, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.991120277927223e-07, "loss": 0.0752, "reward": -0.19166237860918045, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19166237860918045, "reward_after_std": 0.6627085022628307, "reward_before_mean": 0.018512267619371414, "reward_before_std": 0.7423705346882343, "reward_change_max": 0.0015425384044647217, "reward_change_mean": -0.21017466066405177, "reward_change_min": -0.5010223798453808, "reward_change_std": 0.22122562769800425, "reward_std": 0.6627085246145725, "rewards/cosine_scaled_reward": -0.13657721132040024, "rewards/format_reward": 0.29166666977107525, "step": 59 }, { "advantage_max": 1.114711195230484, "advantage_mean": 2.421438738409165e-08, "advantage_min": -0.43552929908037186, "advantage_std": 0.5868423134088516, "completion_length": 3090.9583587646484, "epoch": 0.06857142857142857, "grad_norm": 0.08353926241397858, "kl": 0.001408219337463379, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.989038226169207e-07, "loss": 0.0065, "reward": -0.34867841517552733, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34867841517552733, "reward_after_std": 0.586842292919755, "reward_before_mean": -0.200962302275002, "reward_before_std": 0.5666951425373554, "reward_change_max": 0.001304030418395996, "reward_change_mean": -0.14771609636954963, "reward_change_min": -0.28777224756777287, "reward_change_std": 0.11059607611969113, "reward_std": 0.5868423096835613, "rewards/cosine_scaled_reward": -0.24631450232118368, "rewards/format_reward": 0.2916666679084301, "step": 60 }, { "advantage_max": 1.2283815741539001, "advantage_mean": 1.5832484101530042e-08, "advantage_min": -0.7546460404992104, "advantage_std": 0.7360720597207546, "completion_length": 3173.0625610351562, "epoch": 0.06971428571428571, "grad_norm": 0.15681175887584686, "kl": 0.0024908222258090973, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.98673738502114e-07, "loss": 0.036, "reward": -0.0034181829541921616, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0034181829541921616, "reward_after_std": 0.7360720634460449, "reward_before_mean": 0.2585161756724119, "reward_before_std": 0.7624808065593243, "reward_change_max": 0.0004996657371520996, "reward_change_mean": -0.2619343502447009, "reward_change_min": -0.5183047540485859, "reward_change_std": 0.21530343778431416, "reward_std": 0.7360720820724964, "rewards/cosine_scaled_reward": -0.06865859217941761, "rewards/format_reward": 0.3958333469927311, "step": 61 }, { "advantage_max": 1.6221475005149841, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.7927829623222351, "advantage_std": 0.9064479470252991, "completion_length": 2707.541732788086, "epoch": 0.07085714285714285, "grad_norm": 0.1787949800491333, "kl": 0.020225495100021362, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.98421786662277e-07, "loss": 0.0607, "reward": 0.13127783383242786, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13127783383242786, "reward_after_std": 0.9064479619264603, "reward_before_mean": 0.4136840028950246, "reward_before_std": 0.9145010150969028, "reward_change_max": 0.0001274794340133667, "reward_change_mean": -0.2824061932042241, "reward_change_min": -0.5440523903816938, "reward_change_std": 0.21861811820417643, "reward_std": 0.9064479991793633, "rewards/cosine_scaled_reward": -0.043158004991710186, "rewards/format_reward": 0.5000000037252903, "step": 62 }, { "advantage_max": 1.1011157259345055, "advantage_mean": 0.0, "advantage_min": -0.6950971148908138, "advantage_std": 0.6476048491895199, "completion_length": 2512.0833892822266, "epoch": 0.072, "grad_norm": 0.0958164632320404, "kl": 0.003099203109741211, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.981479793771866e-07, "loss": 0.0544, "reward": 0.3186934031546116, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3186934031546116, "reward_after_std": 0.6476048715412617, "reward_before_mean": 0.7054652385413647, "reward_before_std": 0.6132442802190781, "reward_change_max": 0.0004886388778686523, "reward_change_mean": -0.38677185960114, "reward_change_min": -0.6413705144077539, "reward_change_std": 0.2497247066348791, "reward_std": 0.6476048901677132, "rewards/cosine_scaled_reward": 0.04023261368274689, "rewards/format_reward": 0.6250000055879354, "step": 63 }, { "advantage_max": 1.2440772727131844, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.6886808797717094, "advantage_std": 0.7557894457131624, "completion_length": 3082.8958435058594, "epoch": 0.07314285714285715, "grad_norm": 0.12037858366966248, "kl": 0.0024671554565429688, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.97852329991824e-07, "loss": 0.069, "reward": -0.07378544472157955, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07378544472157955, "reward_after_std": 0.7557894717901945, "reward_before_mean": 0.16163264960050583, "reward_before_std": 0.8081401251256466, "reward_change_max": 0.0011908113956451416, "reward_change_mean": -0.2354180756956339, "reward_change_min": -0.558480516076088, "reward_change_std": 0.2264844672754407, "reward_std": 0.7557894978672266, "rewards/cosine_scaled_reward": -0.07543368389201532, "rewards/format_reward": 0.3125000037252903, "step": 64 }, { "advantage_max": 1.0268595106899738, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -0.4311934597790241, "advantage_std": 0.5640022493898869, "completion_length": 2795.437511444092, "epoch": 0.07428571428571429, "grad_norm": 0.10451654344797134, "kl": 0.0019540786743164062, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.975348529157229e-07, "loss": 0.0135, "reward": -0.20017614914104342, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20017614914104342, "reward_after_std": 0.5640022493898869, "reward_before_mean": 0.006564559414982796, "reward_before_std": 0.5450711958110332, "reward_change_max": 0.0015406087040901184, "reward_change_mean": -0.20674069644883275, "reward_change_min": -0.381564624607563, "reward_change_std": 0.15082712983712554, "reward_std": 0.5640022531151772, "rewards/cosine_scaled_reward": -0.18421773053705692, "rewards/format_reward": 0.37500000186264515, "step": 65 }, { "advantage_max": 1.0503377504646778, "advantage_mean": -1.2417634254191512e-08, "advantage_min": -0.6915866583585739, "advantage_std": 0.6261376366019249, "completion_length": 2167.041679382324, "epoch": 0.07542857142857143, "grad_norm": 0.0698748528957367, "kl": 0.0012662410736083984, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.971955636222684e-07, "loss": 0.0057, "reward": 0.2023143582046032, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2023143582046032, "reward_after_std": 0.6261376515030861, "reward_before_mean": 0.5499310828745365, "reward_before_std": 0.604407899081707, "reward_change_max": 0.0, "reward_change_mean": -0.3476167330518365, "reward_change_min": -0.5529674589633942, "reward_change_std": 0.23018301371484995, "reward_std": 0.6261376589536667, "rewards/cosine_scaled_reward": 0.035382192581892014, "rewards/format_reward": 0.4791666716337204, "step": 66 }, { "advantage_max": 0.7263390906155109, "advantage_mean": 3.10440866346795e-08, "advantage_min": -0.3848998099565506, "advantage_std": 0.4047544952481985, "completion_length": 3579.5625, "epoch": 0.07657142857142857, "grad_norm": 0.059073422104120255, "kl": 0.0013927817344665527, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.968344786479415e-07, "loss": 0.0008, "reward": -0.5511298915371299, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5511298915371299, "reward_after_std": 0.40475449338555336, "reward_before_mean": -0.44834938645362854, "reward_before_std": 0.4108631107956171, "reward_change_max": 3.879517316818237e-05, "reward_change_mean": -0.10278050391934812, "reward_change_min": -0.2122868075966835, "reward_change_std": 0.0857694000005722, "reward_std": 0.40475449711084366, "rewards/cosine_scaled_reward": -0.25542469043284655, "rewards/format_reward": 0.06250000186264515, "step": 67 }, { "advantage_max": 1.2082672864198685, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.665333541110158, "advantage_std": 0.746182668954134, "completion_length": 2459.562572479248, "epoch": 0.07771428571428571, "grad_norm": 0.11337409168481827, "kl": 0.005399227142333984, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.964516155915151e-07, "loss": 0.0337, "reward": 0.010181301273405552, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.010181301273405552, "reward_after_std": 0.7461826372891665, "reward_before_mean": 0.27732523111626506, "reward_before_std": 0.7844086028635502, "reward_change_max": 0.00016658753156661987, "reward_change_mean": -0.26714393263682723, "reward_change_min": -0.6000061314553022, "reward_change_std": 0.23759357165545225, "reward_std": 0.7461826726794243, "rewards/cosine_scaled_reward": -0.14258738327771425, "rewards/format_reward": 0.5625000111758709, "step": 68 }, { "advantage_max": 1.0632721930742264, "advantage_mean": 1.3659398057086491e-08, "advantage_min": -0.5626996904611588, "advantage_std": 0.6013586819171906, "completion_length": 2767.0416870117188, "epoch": 0.07885714285714286, "grad_norm": 0.11885175853967667, "kl": 0.0038573741912841797, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.960469931131936e-07, "loss": 0.0527, "reward": -0.31397517397999763, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31397517397999763, "reward_after_std": 0.6013586968183517, "reward_before_mean": -0.15239904704503715, "reward_before_std": 0.6141092143952847, "reward_change_max": 0.0019856542348861694, "reward_change_mean": -0.16157611832022667, "reward_change_min": -0.3111561890691519, "reward_change_std": 0.13281467324122787, "reward_std": 0.6013587154448032, "rewards/cosine_scaled_reward": -0.24286619946360588, "rewards/format_reward": 0.3333333395421505, "step": 69 }, { "advantage_max": 1.0964757651090622, "advantage_mean": 1.0554989271494009e-08, "advantage_min": -0.5003730542957783, "advantage_std": 0.5867039151489735, "completion_length": 2997.041702270508, "epoch": 0.08, "grad_norm": 0.08089398592710495, "kl": 0.001575469970703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.956206309337066e-07, "loss": 0.0055, "reward": -0.2361793201416731, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2361793201416731, "reward_after_std": 0.5867039300501347, "reward_before_mean": -0.04719802783802152, "reward_before_std": 0.5565843749791384, "reward_change_max": 0.0007937699556350708, "reward_change_mean": -0.18898130021989346, "reward_change_min": -0.33782116137444973, "reward_change_std": 0.13402173947542906, "reward_std": 0.5867039673030376, "rewards/cosine_scaled_reward": -0.2006823541596532, "rewards/format_reward": 0.3541666679084301, "step": 70 }, { "advantage_max": 1.1093778908252716, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.5899154469370842, "advantage_std": 0.6263005174696445, "completion_length": 2871.3958435058594, "epoch": 0.08114285714285714, "grad_norm": 0.13375209271907806, "kl": 0.006723284721374512, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.951725498333448e-07, "loss": 0.0477, "reward": -0.0509650744497776, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0509650744497776, "reward_after_std": 0.6263005062937737, "reward_before_mean": 0.2027706727385521, "reward_before_std": 0.6161373816430569, "reward_change_max": 0.0005811154842376709, "reward_change_mean": -0.253735733916983, "reward_change_min": -0.46558609418570995, "reward_change_std": 0.17825811356306076, "reward_std": 0.6263005137443542, "rewards/cosine_scaled_reward": -0.06528134597465396, "rewards/format_reward": 0.33333334140479565, "step": 71 }, { "advantage_max": 0.974971279501915, "advantage_mean": 2.6697914323747796e-08, "advantage_min": -0.6151207871735096, "advantage_std": 0.5957626178860664, "completion_length": 3153.9166870117188, "epoch": 0.08228571428571428, "grad_norm": 0.09279810637235641, "kl": 0.005246877670288086, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.947027716509488e-07, "loss": 0.0394, "reward": -0.3170163119211793, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3170163119211793, "reward_after_std": 0.5957626029849052, "reward_before_mean": -0.14919825922697783, "reward_before_std": 0.6452187933027744, "reward_change_max": 0.0013034045696258545, "reward_change_mean": -0.16781801730394363, "reward_change_min": -0.41230191849172115, "reward_change_std": 0.16988197807222605, "reward_std": 0.5957626290619373, "rewards/cosine_scaled_reward": -0.21001579985022545, "rewards/format_reward": 0.27083333767950535, "step": 72 }, { "advantage_max": 1.1339119151234627, "advantage_mean": 2.4835269452072595e-08, "advantage_min": -0.5014988705515862, "advantage_std": 0.6255671754479408, "completion_length": 3523.3333740234375, "epoch": 0.08342857142857144, "grad_norm": 0.10664436221122742, "kl": 0.0007270574569702148, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.942113192828444e-07, "loss": 0.0274, "reward": -0.43988874554634094, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.43988874554634094, "reward_after_std": 0.6255671717226505, "reward_before_mean": -0.3272168878465891, "reward_before_std": 0.6468932591378689, "reward_change_max": 0.0018475502729415894, "reward_change_mean": -0.11267185024917126, "reward_change_min": -0.30092281103134155, "reward_change_std": 0.1199548018630594, "reward_std": 0.6255671866238117, "rewards/cosine_scaled_reward": -0.2261084453202784, "rewards/format_reward": 0.12500000186264515, "step": 73 }, { "advantage_max": 1.3062651008367538, "advantage_mean": -1.1796752130699417e-08, "advantage_min": -0.6358704566955566, "advantage_std": 0.7469595354050398, "completion_length": 3221.5208587646484, "epoch": 0.08457142857142858, "grad_norm": 0.132724791765213, "kl": 0.0031901895999908447, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.93698216681727e-07, "loss": 0.0676, "reward": -0.025578954257071018, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.025578954257071018, "reward_after_std": 0.7469595540314913, "reward_before_mean": 0.2248452752828598, "reward_before_std": 0.7438722159713507, "reward_change_max": 0.0008772239089012146, "reward_change_mean": -0.2504242234863341, "reward_change_min": -0.5414825454354286, "reward_change_std": 0.21578879188746214, "reward_std": 0.7469595912843943, "rewards/cosine_scaled_reward": -0.033410708769224584, "rewards/format_reward": 0.29166666977107525, "step": 74 }, { "advantage_max": 1.1306501738727093, "advantage_mean": 1.7384688633104162e-08, "advantage_min": -0.5787255503237247, "advantage_std": 0.6625193282961845, "completion_length": 3090.604232788086, "epoch": 0.08571428571428572, "grad_norm": 0.11262889206409454, "kl": 0.0034093856811523438, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.931634888554935e-07, "loss": 0.0406, "reward": 0.06313235312700272, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06313235312700272, "reward_after_std": 0.6625193208456039, "reward_before_mean": 0.3570488765835762, "reward_before_std": 0.64370296895504, "reward_change_max": 0.00019219517707824707, "reward_change_mean": -0.29391652159392834, "reward_change_min": -0.6067571640014648, "reward_change_std": 0.22870568884536624, "reward_std": 0.6625193618237972, "rewards/cosine_scaled_reward": 0.0014410973526537418, "rewards/format_reward": 0.3541666716337204, "step": 75 }, { "advantage_max": 0.8555637449026108, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.5915851294994354, "advantage_std": 0.5106821767985821, "completion_length": 2776.2083892822266, "epoch": 0.08685714285714285, "grad_norm": 0.08106429129838943, "kl": 0.0007976293563842773, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.926071618660237e-07, "loss": 0.0483, "reward": -0.12464714795351028, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12464714795351028, "reward_after_std": 0.5106821544468403, "reward_before_mean": 0.12069362960755825, "reward_before_std": 0.5108134467154741, "reward_change_max": 0.0007242336869239807, "reward_change_mean": -0.2453407747671008, "reward_change_min": -0.4396594688296318, "reward_change_std": 0.17849350557662547, "reward_std": 0.5106821693480015, "rewards/cosine_scaled_reward": -0.18965318612754345, "rewards/format_reward": 0.5000000093132257, "step": 76 }, { "advantage_max": 0.8869626522064209, "advantage_mean": 3.1044084525255755e-09, "advantage_min": -0.6437748745083809, "advantage_std": 0.5324870087206364, "completion_length": 3121.979217529297, "epoch": 0.088, "grad_norm": 0.08320073783397675, "kl": 0.0010444223880767822, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.9202926282791e-07, "loss": 0.0294, "reward": -0.21349376626312733, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21349376626312733, "reward_after_std": 0.5324870124459267, "reward_before_mean": -0.0016396627761423588, "reward_before_std": 0.5460649132728577, "reward_change_max": 0.0005499720573425293, "reward_change_mean": -0.21185409231111407, "reward_change_min": -0.3882265854626894, "reward_change_std": 0.16535702906548977, "reward_std": 0.5324870124459267, "rewards/cosine_scaled_reward": -0.1883198358118534, "rewards/format_reward": 0.37500000931322575, "step": 77 }, { "advantage_max": 1.5690804421901703, "advantage_mean": -3.601114062501409e-08, "advantage_min": -0.7654212564229965, "advantage_std": 0.8929519504308701, "completion_length": 2964.062545776367, "epoch": 0.08914285714285715, "grad_norm": 0.1362496167421341, "kl": 0.0023328065872192383, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.91429819907136e-07, "loss": 0.0472, "reward": 0.23330368660390377, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23330368660390377, "reward_after_std": 0.8929519504308701, "reward_before_mean": 0.5573525791987777, "reward_before_std": 0.8918314576148987, "reward_change_max": 0.0002750083804130554, "reward_change_mean": -0.3240489256568253, "reward_change_min": -0.6681363992393017, "reward_change_std": 0.259742493275553, "reward_std": 0.8929519802331924, "rewards/cosine_scaled_reward": 0.07034294621553272, "rewards/format_reward": 0.41666666977107525, "step": 78 }, { "advantage_max": 1.1290984824299812, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.566096693277359, "advantage_std": 0.624596331268549, "completion_length": 2287.541690826416, "epoch": 0.09028571428571429, "grad_norm": 0.10173244774341583, "kl": 0.0024640560150146484, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.908088623197048e-07, "loss": -0.0065, "reward": 0.14616259443573654, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14616259443573654, "reward_after_std": 0.6245963498950005, "reward_before_mean": 0.46677736937999725, "reward_before_std": 0.5631000082939863, "reward_change_max": 0.0004928857088088989, "reward_change_mean": -0.3206147523596883, "reward_change_min": -0.5135222245007753, "reward_change_std": 0.20620128232985735, "reward_std": 0.6245963498950005, "rewards/cosine_scaled_reward": -0.03744465671479702, "rewards/format_reward": 0.5416666679084301, "step": 79 }, { "advantage_max": 1.4897634759545326, "advantage_mean": -3.4148496752539614e-08, "advantage_min": -0.6975793838500977, "advantage_std": 0.8379081785678864, "completion_length": 3263.166717529297, "epoch": 0.09142857142857143, "grad_norm": 0.15222951769828796, "kl": 0.0024718046188354492, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.901664203302124e-07, "loss": 0.0648, "reward": 0.0717293145135045, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0717293145135045, "reward_after_std": 0.8379081785678864, "reward_before_mean": 0.34337579587008804, "reward_before_std": 0.8289961963891983, "reward_change_max": 0.00044892728328704834, "reward_change_mean": -0.2716464791446924, "reward_change_min": -0.4956607408821583, "reward_change_std": 0.20896095503121614, "reward_std": 0.837908186018467, "rewards/cosine_scaled_reward": -0.015812127850949764, "rewards/format_reward": 0.3750000037252903, "step": 80 }, { "advantage_max": 1.2281683385372162, "advantage_mean": 2.483527383745354e-09, "advantage_min": -0.618021085858345, "advantage_std": 0.690451554954052, "completion_length": 3113.541732788086, "epoch": 0.09257142857142857, "grad_norm": 0.20810602605342865, "kl": 0.005570411682128906, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.895025252503755e-07, "loss": 0.0338, "reward": -0.2511444576084614, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2511444576084614, "reward_after_std": 0.6904515661299229, "reward_before_mean": -0.07739553973078728, "reward_before_std": 0.7059197537600994, "reward_change_max": 0.000757955014705658, "reward_change_mean": -0.1737489178776741, "reward_change_min": -0.411263357847929, "reward_change_std": 0.16339826956391335, "reward_std": 0.6904515847563744, "rewards/cosine_scaled_reward": -0.1949477707967162, "rewards/format_reward": 0.31250000558793545, "step": 81 }, { "advantage_max": 1.31115597859025, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.8089087083935738, "advantage_std": 0.7565962858498096, "completion_length": 2834.8958740234375, "epoch": 0.09371428571428571, "grad_norm": 0.09568361937999725, "kl": 0.002544999122619629, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.888172094375033e-07, "loss": 0.0091, "reward": 0.1376865222118795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1376865222118795, "reward_after_std": 0.756596315652132, "reward_before_mean": 0.4453375115990639, "reward_before_std": 0.7576594576239586, "reward_change_max": 0.0004483461380004883, "reward_change_mean": -0.3076509768143296, "reward_change_min": -0.5397645384073257, "reward_change_std": 0.21682811994105577, "reward_std": 0.7565963491797447, "rewards/cosine_scaled_reward": 0.03516875783680007, "rewards/format_reward": 0.37500000558793545, "step": 82 }, { "advantage_max": 1.5486390925943851, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.7127926684916019, "advantage_std": 0.8253224082291126, "completion_length": 2919.6458435058594, "epoch": 0.09485714285714286, "grad_norm": 0.1437751054763794, "kl": 0.002646923065185547, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.881105062929221e-07, "loss": 0.004, "reward": 0.000450813677161932, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.000450813677161932, "reward_after_std": 0.8253223933279514, "reward_before_mean": 0.24097184976562858, "reward_before_std": 0.7998228445649147, "reward_change_max": 0.0002821981906890869, "reward_change_mean": -0.2405210305005312, "reward_change_min": -0.4127188418060541, "reward_change_std": 0.16908379085361958, "reward_std": 0.825322400778532, "rewards/cosine_scaled_reward": -0.025347420232719742, "rewards/format_reward": 0.2916666679084301, "step": 83 }, { "advantage_max": 1.3038128688931465, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.6896253600716591, "advantage_std": 0.7823732793331146, "completion_length": 3117.812530517578, "epoch": 0.096, "grad_norm": 0.12220973521471024, "kl": 0.001085519790649414, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.873824502603459e-07, "loss": 0.0568, "reward": 0.06299414858222008, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06299414858222008, "reward_after_std": 0.782373296096921, "reward_before_mean": 0.34435543417930603, "reward_before_std": 0.811880424618721, "reward_change_max": 0.0007270798087120056, "reward_change_mean": -0.28136129723861814, "reward_change_min": -0.6436299737542868, "reward_change_std": 0.25078502343967557, "reward_std": 0.7823732979595661, "rewards/cosine_scaled_reward": 0.005511032417416573, "rewards/format_reward": 0.3333333358168602, "step": 84 }, { "advantage_max": 1.8904551193118095, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.8693808242678642, "advantage_std": 1.0314135998487473, "completion_length": 3134.875030517578, "epoch": 0.09714285714285714, "grad_norm": 0.1699473261833191, "kl": 0.0018897056579589844, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.866330768241983e-07, "loss": 0.0623, "reward": -0.04264771193265915, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04264771193265915, "reward_after_std": 1.0314136110246181, "reward_before_mean": 0.1589924634899944, "reward_before_std": 1.0488222055137157, "reward_change_max": 0.001532226800918579, "reward_change_mean": -0.20164018403738737, "reward_change_min": -0.44846535101532936, "reward_change_std": 0.19201862812042236, "reward_std": 1.031413622200489, "rewards/cosine_scaled_reward": -0.08717044070363045, "rewards/format_reward": 0.33333334140479565, "step": 85 }, { "advantage_max": 1.0618792101740837, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.7318792194128036, "advantage_std": 0.6476677916944027, "completion_length": 2897.2291717529297, "epoch": 0.09828571428571428, "grad_norm": 0.08707182109355927, "kl": 0.0034961700439453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.85862422507884e-07, "loss": 0.013, "reward": 0.017172118183225393, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.017172118183225393, "reward_after_std": 0.6476677916944027, "reward_before_mean": 0.2989570014178753, "reward_before_std": 0.6632709354162216, "reward_change_max": 0.0015084072947502136, "reward_change_mean": -0.2817848902195692, "reward_change_min": -0.5358342751860619, "reward_change_std": 0.21289643086493015, "reward_std": 0.6476677916944027, "rewards/cosine_scaled_reward": -0.07968816673383117, "rewards/format_reward": 0.4583333432674408, "step": 86 }, { "advantage_max": 1.3855399899184704, "advantage_mean": -8.692345065952622e-09, "advantage_min": -0.8480601236224174, "advantage_std": 0.8235466033220291, "completion_length": 2927.166717529297, "epoch": 0.09942857142857142, "grad_norm": 0.20664729177951813, "kl": 0.010286808013916016, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.850705248720068e-07, "loss": 0.0528, "reward": 0.04761325381696224, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04761325381696224, "reward_after_std": 0.8235466033220291, "reward_before_mean": 0.3165686149150133, "reward_before_std": 0.8553726263344288, "reward_change_max": 0.0007121041417121887, "reward_change_mean": -0.2689553592354059, "reward_change_min": -0.5325675681233406, "reward_change_std": 0.22574665397405624, "reward_std": 0.8235466182231903, "rewards/cosine_scaled_reward": -0.08129904745146632, "rewards/format_reward": 0.47916668467223644, "step": 87 }, { "advantage_max": 1.5240605920553207, "advantage_mean": -6.208818015274176e-09, "advantage_min": -1.0140509381890297, "advantage_std": 0.9408680759370327, "completion_length": 2959.166748046875, "epoch": 0.10057142857142858, "grad_norm": 0.16569511592388153, "kl": 0.007565021514892578, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.8425742251254e-07, "loss": 0.0452, "reward": 0.22164431703276932, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22164431703276932, "reward_after_std": 0.9408680908381939, "reward_before_mean": 0.5424534603953362, "reward_before_std": 1.0073655508458614, "reward_change_max": 0.000623852014541626, "reward_change_mean": -0.3208091165870428, "reward_change_min": -0.6903372332453728, "reward_change_std": 0.2898290455341339, "reward_std": 0.9408681094646454, "rewards/cosine_scaled_reward": 0.03164337668567896, "rewards/format_reward": 0.4791666753590107, "step": 88 }, { "advantage_max": 1.2696744501590729, "advantage_mean": 1.8626453157644107e-09, "advantage_min": -0.5966677069664001, "advantage_std": 0.7119504939764738, "completion_length": 3250.8958587646484, "epoch": 0.10171428571428572, "grad_norm": 0.1325407475233078, "kl": 0.0038471221923828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.83423155058946e-07, "loss": 0.0007, "reward": -0.111092375125736, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.111092375125736, "reward_after_std": 0.7119505014270544, "reward_before_mean": 0.1102348892018199, "reward_before_std": 0.7136143017560244, "reward_change_max": 0.0010024309158325195, "reward_change_mean": -0.22132724151015282, "reward_change_min": -0.47170834988355637, "reward_change_std": 0.18059588875621557, "reward_std": 0.7119505144655704, "rewards/cosine_scaled_reward": -0.06988256610929966, "rewards/format_reward": 0.25000000186264515, "step": 89 }, { "advantage_max": 1.1662665717303753, "advantage_mean": 0.0, "advantage_min": -0.5448751635849476, "advantage_std": 0.6311663277447224, "completion_length": 2775.437515258789, "epoch": 0.10285714285714286, "grad_norm": 0.11538201570510864, "kl": 0.011361122131347656, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.825677631722435e-07, "loss": 0.0164, "reward": -0.16298224218189716, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16298224218189716, "reward_after_std": 0.6311663128435612, "reward_before_mean": 0.04312063241377473, "reward_before_std": 0.6125261038541794, "reward_change_max": 0.0005078762769699097, "reward_change_mean": -0.20610287599265575, "reward_change_min": -0.38208043575286865, "reward_change_std": 0.15109846275299788, "reward_std": 0.6311663277447224, "rewards/cosine_scaled_reward": -0.1867730226367712, "rewards/format_reward": 0.4166666716337204, "step": 90 }, { "advantage_max": 1.474507611244917, "advantage_mean": 1.4280280180578586e-08, "advantage_min": -0.9642866589128971, "advantage_std": 0.9065007567405701, "completion_length": 3187.0000915527344, "epoch": 0.104, "grad_norm": 0.15550005435943604, "kl": 0.005296945571899414, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.816912885430258e-07, "loss": 0.038, "reward": 0.1353749530389905, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1353749530389905, "reward_after_std": 0.9065007455646992, "reward_before_mean": 0.42992572486400604, "reward_before_std": 0.9722138866782188, "reward_change_max": 0.0, "reward_change_mean": -0.29455077461898327, "reward_change_min": -0.6697384584695101, "reward_change_std": 0.2784268017858267, "reward_std": 0.9065007641911507, "rewards/cosine_scaled_reward": 0.017046190798282623, "rewards/format_reward": 0.3958333432674408, "step": 91 }, { "advantage_max": 1.0653186030685902, "advantage_mean": 5.898376342905465e-09, "advantage_min": -0.8072712197899818, "advantage_std": 0.6613656990230083, "completion_length": 2990.500015258789, "epoch": 0.10514285714285715, "grad_norm": 0.11002243310213089, "kl": 0.00823211669921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.807937738894303e-07, "loss": 0.0423, "reward": -0.012807987630367279, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.012807987630367279, "reward_after_std": 0.661365695297718, "reward_before_mean": 0.25806828774511814, "reward_before_std": 0.6961026005446911, "reward_change_max": 0.0003693774342536926, "reward_change_mean": -0.2708762800320983, "reward_change_min": -0.492202278226614, "reward_change_std": 0.21451785834506154, "reward_std": 0.6613657251000404, "rewards/cosine_scaled_reward": -0.1001325212419033, "rewards/format_reward": 0.45833334885537624, "step": 92 }, { "advantage_max": 0.7933408431708813, "advantage_mean": 2.1730860666480112e-08, "advantage_min": -0.46355464309453964, "advantage_std": 0.4695439264178276, "completion_length": 3566.375, "epoch": 0.10628571428571429, "grad_norm": 0.08727142214775085, "kl": 0.005443572998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.798752629550546e-07, "loss": 0.0051, "reward": -0.46245191991329193, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.46245191991329193, "reward_after_std": 0.4695439338684082, "reward_before_mean": -0.33290886227041483, "reward_before_std": 0.4970332011580467, "reward_change_max": 0.0008160322904586792, "reward_change_mean": -0.1295430501922965, "reward_change_min": -0.2736487351357937, "reward_change_std": 0.12085827440023422, "reward_std": 0.4695439413189888, "rewards/cosine_scaled_reward": -0.18728776648640633, "rewards/format_reward": 0.0416666679084301, "step": 93 }, { "advantage_max": 1.0807769075036049, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -0.5711992047727108, "advantage_std": 0.6211388818919659, "completion_length": 3111.437530517578, "epoch": 0.10742857142857143, "grad_norm": 0.12472743541002274, "kl": 0.010393619537353516, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.78935800506826e-07, "loss": 0.0441, "reward": -0.10525502264499664, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10525502264499664, "reward_after_std": 0.6211388818919659, "reward_before_mean": 0.13153162971138954, "reward_before_std": 0.6166800931096077, "reward_change_max": 0.00021298229694366455, "reward_change_mean": -0.23678663885220885, "reward_change_min": -0.49180080369114876, "reward_change_std": 0.18674441473558545, "reward_std": 0.6211389005184174, "rewards/cosine_scaled_reward": -0.06965086422860622, "rewards/format_reward": 0.27083333767950535, "step": 94 }, { "advantage_max": 1.2856793850660324, "advantage_mean": 8.6923440667519e-09, "advantage_min": -0.5027649588882923, "advantage_std": 0.6929790526628494, "completion_length": 3391.0416870117188, "epoch": 0.10857142857142857, "grad_norm": 0.09327510744333267, "kl": 0.002196788787841797, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.779754323328192e-07, "loss": 0.0196, "reward": -0.3284091189270839, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3284091189270839, "reward_after_std": 0.6929790526628494, "reward_before_mean": -0.18555639125406742, "reward_before_std": 0.6929865293204784, "reward_change_max": 0.0010388195514678955, "reward_change_mean": -0.14285272872075438, "reward_change_min": -0.36844565719366074, "reward_change_std": 0.13806079514324665, "reward_std": 0.69297906011343, "rewards/cosine_scaled_reward": -0.20736153866164386, "rewards/format_reward": 0.22916666977107525, "step": 95 }, { "advantage_max": 0.9502243474125862, "advantage_mean": 2.7755575615628914e-16, "advantage_min": -0.762770913541317, "advantage_std": 0.6231532171368599, "completion_length": 3168.2708587646484, "epoch": 0.10971428571428571, "grad_norm": 0.11269133538007736, "kl": 0.008467674255371094, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.769942052400235e-07, "loss": 0.0454, "reward": -0.09130434179678559, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09130434179678559, "reward_after_std": 0.6231532134115696, "reward_before_mean": 0.15943890064954758, "reward_before_std": 0.6864537037909031, "reward_change_max": 0.002087824046611786, "reward_change_mean": -0.25074325082823634, "reward_change_min": -0.5374267399311066, "reward_change_std": 0.21889762580394745, "reward_std": 0.6231532245874405, "rewards/cosine_scaled_reward": -0.07653055200353265, "rewards/format_reward": 0.3125000111758709, "step": 96 }, { "advantage_max": 0.894398532807827, "advantage_mean": 9.934107536579972e-09, "advantage_min": -0.6122366264462471, "advantage_std": 0.5579078607261181, "completion_length": 3316.2083740234375, "epoch": 0.11085714285714286, "grad_norm": 0.11165490746498108, "kl": 0.00435638427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.759921670520634e-07, "loss": 0.0248, "reward": -0.07307976484298706, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07307976484298706, "reward_after_std": 0.5579078570008278, "reward_before_mean": 0.1892811693251133, "reward_before_std": 0.5676539484411478, "reward_change_max": 0.00020658224821090698, "reward_change_mean": -0.26236093137413263, "reward_change_min": -0.4734621290117502, "reward_change_std": 0.2033616118133068, "reward_std": 0.5579078681766987, "rewards/cosine_scaled_reward": -0.0407760813832283, "rewards/format_reward": 0.27083333767950535, "step": 97 }, { "advantage_max": 1.1973653174936771, "advantage_mean": 1.8626444830971423e-09, "advantage_min": -0.49922803044319153, "advantage_std": 0.6366796083748341, "completion_length": 3146.0833740234375, "epoch": 0.112, "grad_norm": 0.1001494824886322, "kl": 0.003482341766357422, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.749693666068663e-07, "loss": 0.0621, "reward": -0.19620523788034916, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19620523788034916, "reward_after_std": 0.6366796176880598, "reward_before_mean": -0.00034199096262454987, "reward_before_std": 0.5961110591888428, "reward_change_max": 0.0005563125014305115, "reward_change_mean": -0.19586324412375689, "reward_change_min": -0.3818340804427862, "reward_change_std": 0.14894506987184286, "reward_std": 0.6366796419024467, "rewards/cosine_scaled_reward": -0.1668376699090004, "rewards/format_reward": 0.33333333767950535, "step": 98 }, { "advantage_max": 0.9004656635224819, "advantage_mean": -3.7252897433504017e-09, "advantage_min": -0.4743019826710224, "advantage_std": 0.5211676489561796, "completion_length": 2839.750015258789, "epoch": 0.11314285714285714, "grad_norm": 0.08176909387111664, "kl": 0.01638031005859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.739258537542835e-07, "loss": 0.0088, "reward": -0.25589028745889664, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.25589028745889664, "reward_after_std": 0.5211676508188248, "reward_before_mean": -0.05980054661631584, "reward_before_std": 0.5296378303319216, "reward_change_max": 0.0010651499032974243, "reward_change_mean": -0.196089755743742, "reward_change_min": -0.39398371428251266, "reward_change_std": 0.15637101605534554, "reward_std": 0.5211676601320505, "rewards/cosine_scaled_reward": -0.1861502705141902, "rewards/format_reward": 0.3125, "step": 99 }, { "advantage_max": 1.4020006023347378, "advantage_mean": -1.2417636363615259e-09, "advantage_min": -0.9309210404753685, "advantage_std": 0.8519085347652435, "completion_length": 3042.062530517578, "epoch": 0.11428571428571428, "grad_norm": 0.1596578061580658, "kl": 0.009717941284179688, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.728616793536587e-07, "loss": 0.0867, "reward": 0.09608042938634753, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09608042938634753, "reward_after_std": 0.8519085198640823, "reward_before_mean": 0.3819417469203472, "reward_before_std": 0.9046727884560823, "reward_change_max": 0.0, "reward_change_mean": -0.2858613282442093, "reward_change_min": -0.5586092844605446, "reward_change_std": 0.2462056027725339, "reward_std": 0.851908553391695, "rewards/cosine_scaled_reward": -0.006945790722966194, "rewards/format_reward": 0.39583334885537624, "step": 100 }, { "advantage_max": 0.637329638004303, "advantage_mean": 3.4148494920671624e-08, "advantage_min": -0.44175083562731743, "advantage_std": 0.37393420562148094, "completion_length": 2976.416717529297, "epoch": 0.11542857142857142, "grad_norm": 0.06858979165554047, "kl": 0.004735231399536133, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.717768952713511e-07, "loss": 0.0141, "reward": -0.12358441762626171, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12358441762626171, "reward_after_std": 0.37393419072031975, "reward_before_mean": 0.13824375998228788, "reward_before_std": 0.3420494869351387, "reward_change_max": 0.0, "reward_change_mean": -0.26182815805077553, "reward_change_min": -0.40689949691295624, "reward_change_std": 0.16052342765033245, "reward_std": 0.37393420189619064, "rewards/cosine_scaled_reward": -0.0871281186118722, "rewards/format_reward": 0.31250000186264515, "step": 101 }, { "advantage_max": 1.4978736191987991, "advantage_mean": 1.614292521878724e-08, "advantage_min": -0.9803852066397667, "advantage_std": 0.9344386383891106, "completion_length": 2877.0000915527344, "epoch": 0.11657142857142858, "grad_norm": 0.22138696908950806, "kl": 0.012332916259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.706715543782064e-07, "loss": 0.112, "reward": 0.25213714223355055, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.25213714223355055, "reward_after_std": 0.9344386458396912, "reward_before_mean": 0.5871431287378073, "reward_before_std": 1.0055402405560017, "reward_change_max": 0.0006800442934036255, "reward_change_mean": -0.3350059576332569, "reward_change_min": -0.7078782953321934, "reward_change_std": 0.2961591836065054, "reward_std": 0.9344386756420135, "rewards/cosine_scaled_reward": 0.033154879696667194, "rewards/format_reward": 0.5208333395421505, "step": 102 }, { "advantage_max": 1.4239412993192673, "advantage_mean": 9.934107647602275e-09, "advantage_min": -0.7207528688013554, "advantage_std": 0.8292893841862679, "completion_length": 3183.3958740234375, "epoch": 0.11771428571428572, "grad_norm": 0.14050276577472687, "kl": 0.009951591491699219, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.695457105469804e-07, "loss": 0.098, "reward": -0.10000946186482906, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10000946186482906, "reward_after_std": 0.8292893879115582, "reward_before_mean": 0.11191971227526665, "reward_before_std": 0.8726517036557198, "reward_change_max": 0.0009425804018974304, "reward_change_mean": -0.2119291506242007, "reward_change_min": -0.5215573105961084, "reward_change_std": 0.21027667145244777, "reward_std": 0.8292894102632999, "rewards/cosine_scaled_reward": -0.11070681922137737, "rewards/format_reward": 0.3333333432674408, "step": 103 }, { "advantage_max": 1.1159479767084122, "advantage_mean": 8.071462720415923e-09, "advantage_min": -0.6675407961010933, "advantage_std": 0.661103330552578, "completion_length": 2837.000015258789, "epoch": 0.11885714285714286, "grad_norm": 0.23616887629032135, "kl": 0.006572723388671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.683994186497132e-07, "loss": 0.0544, "reward": -0.12476626224815845, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12476626224815845, "reward_after_std": 0.6611033640801907, "reward_before_mean": 0.10218615579651669, "reward_before_std": 0.6853909529745579, "reward_change_max": 0.0016945451498031616, "reward_change_mean": -0.2269524084404111, "reward_change_min": -0.4498551990836859, "reward_change_std": 0.18958128709346056, "reward_std": 0.6611033827066422, "rewards/cosine_scaled_reward": -0.136406933888793, "rewards/format_reward": 0.37500000186264515, "step": 104 }, { "advantage_max": 1.7909524142742157, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.9422320500016212, "advantage_std": 1.0115957744419575, "completion_length": 2932.2708435058594, "epoch": 0.12, "grad_norm": 0.12929129600524902, "kl": 0.0063686370849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.672327345550543e-07, "loss": 0.0374, "reward": 0.13043325836770236, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13043325836770236, "reward_after_std": 1.0115957781672478, "reward_before_mean": 0.4006117479875684, "reward_before_std": 1.0370574593544006, "reward_change_max": 0.0004918202757835388, "reward_change_mean": -0.2701784926466644, "reward_change_min": -0.5277115534991026, "reward_change_std": 0.2267649695277214, "reward_std": 1.0115957856178284, "rewards/cosine_scaled_reward": 0.002389195084106177, "rewards/format_reward": 0.3958333432674408, "step": 105 }, { "advantage_max": 1.4532872326672077, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.8069161958992481, "advantage_std": 0.8219671659171581, "completion_length": 2498.3333740234375, "epoch": 0.12114285714285715, "grad_norm": 0.12542401254177094, "kl": 0.013767242431640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.66045715125541e-07, "loss": 0.0723, "reward": 0.41410426795482635, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41410426795482635, "reward_after_std": 0.821967139840126, "reward_before_mean": 0.8086659302935004, "reward_before_std": 0.7867364194244146, "reward_change_max": 0.0, "reward_change_mean": -0.3945616828277707, "reward_change_min": -0.6701525822281837, "reward_change_std": 0.264983544126153, "reward_std": 0.8219671808183193, "rewards/cosine_scaled_reward": 0.11266629956662655, "rewards/format_reward": 0.5833333376795053, "step": 106 }, { "advantage_max": 0.9903253465890884, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -0.5689501836895943, "advantage_std": 0.5777853094041348, "completion_length": 2992.3333435058594, "epoch": 0.12228571428571429, "grad_norm": 0.12914782762527466, "kl": 0.00812530517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.648384182148252e-07, "loss": 0.0535, "reward": -0.03730320557951927, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03730320557951927, "reward_after_std": 0.577785300090909, "reward_before_mean": 0.23000611877068877, "reward_before_std": 0.5657899845391512, "reward_change_max": 0.0007830634713172913, "reward_change_mean": -0.26730930525809526, "reward_change_min": -0.46656811609864235, "reward_change_std": 0.1908670226112008, "reward_std": 0.5777853112667799, "rewards/cosine_scaled_reward": -0.09333029016852379, "rewards/format_reward": 0.41666666977107525, "step": 107 }, { "advantage_max": 1.4422883205115795, "advantage_mean": 4.967053657267684e-09, "advantage_min": -0.6572414226830006, "advantage_std": 0.8186516799032688, "completion_length": 2927.0417098999023, "epoch": 0.12342857142857143, "grad_norm": 24.618846893310547, "kl": 3.3649168014526367, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.636109026648554e-07, "loss": 0.1124, "reward": -0.15085413120687008, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15085413120687008, "reward_after_std": 0.8186516761779785, "reward_before_mean": 0.04168054834008217, "reward_before_std": 0.8399753011763096, "reward_change_max": 0.0003223493695259094, "reward_change_mean": -0.19253466837108135, "reward_change_min": -0.4192156232893467, "reward_change_std": 0.18019374925643206, "reward_std": 0.8186517059803009, "rewards/cosine_scaled_reward": -0.1458263983950019, "rewards/format_reward": 0.3333333358168602, "step": 108 }, { "advantage_max": 0.8548574820160866, "advantage_mean": 9.934107536579972e-09, "advantage_min": -0.5198825635015965, "advantage_std": 0.49553442001342773, "completion_length": 3041.3541870117188, "epoch": 0.12457142857142857, "grad_norm": 0.07054049521684647, "kl": 0.0053844451904296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.623632283030077e-07, "loss": 0.0056, "reward": -0.21929301880300045, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21929301880300045, "reward_after_std": 0.49553442373871803, "reward_before_mean": -0.006567927077412605, "reward_before_std": 0.49612240865826607, "reward_change_max": 0.0013580769300460815, "reward_change_mean": -0.21272510197013617, "reward_change_min": -0.3920324221253395, "reward_change_std": 0.15321388468146324, "reward_std": 0.4955344498157501, "rewards/cosine_scaled_reward": -0.15953395422548056, "rewards/format_reward": 0.31250000186264515, "step": 109 }, { "advantage_max": 1.7169987708330154, "advantage_mean": 1.521160253314946e-08, "advantage_min": -0.773616373538971, "advantage_std": 0.9723969623446465, "completion_length": 2968.9584045410156, "epoch": 0.12571428571428572, "grad_norm": 0.16165363788604736, "kl": 0.007773399353027344, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.610954559391704e-07, "loss": 0.0213, "reward": 0.1421629348769784, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1421629348769784, "reward_after_std": 0.9723969735205173, "reward_before_mean": 0.4209832027554512, "reward_before_std": 0.9890514109283686, "reward_change_max": 0.00023964792490005493, "reward_change_mean": -0.27882024459540844, "reward_change_min": -0.6649527996778488, "reward_change_std": 0.2514312257990241, "reward_std": 0.9723969958722591, "rewards/cosine_scaled_reward": -0.02909173769876361, "rewards/format_reward": 0.479166679084301, "step": 110 }, { "advantage_max": 1.0488375090062618, "advantage_mean": 1.4280279181377864e-08, "advantage_min": -0.6760149672627449, "advantage_std": 0.6392879411578178, "completion_length": 3331.0208740234375, "epoch": 0.12685714285714286, "grad_norm": 0.1319161206483841, "kl": 0.008884429931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.598076473627796e-07, "loss": 0.0618, "reward": -0.24013402685523033, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.24013402685523033, "reward_after_std": 0.6392879635095596, "reward_before_mean": -0.04797346517443657, "reward_before_std": 0.6857679709792137, "reward_change_max": 0.0008327588438987732, "reward_change_mean": -0.1921605784446001, "reward_change_min": -0.4485097285360098, "reward_change_std": 0.18650522828102112, "reward_std": 0.6392879746854305, "rewards/cosine_scaled_reward": -0.10732006467878819, "rewards/format_reward": 0.16666666977107525, "step": 111 }, { "advantage_max": 1.4520698636770248, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.7237064838409424, "advantage_std": 0.8273558132350445, "completion_length": 3388.4584045410156, "epoch": 0.128, "grad_norm": 0.13587014377117157, "kl": 0.004698753356933594, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.58499865339809e-07, "loss": 0.0292, "reward": 0.06877225078642368, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06877225078642368, "reward_after_std": 0.8273558430373669, "reward_before_mean": 0.34132440108805895, "reward_before_std": 0.827323455363512, "reward_change_max": 0.0007152184844017029, "reward_change_mean": -0.2725521409884095, "reward_change_min": -0.5733534023165703, "reward_change_std": 0.22624664986506104, "reward_std": 0.8273558542132378, "rewards/cosine_scaled_reward": 0.003995520528405905, "rewards/format_reward": 0.3333333358168602, "step": 112 }, { "advantage_max": 1.3849294260144234, "advantage_mean": -2.4214387051024744e-08, "advantage_min": -0.9319385662674904, "advantage_std": 0.8326413743197918, "completion_length": 2787.5625610351562, "epoch": 0.12914285714285714, "grad_norm": 0.17472368478775024, "kl": 0.0091705322265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.571721736097088e-07, "loss": 0.0602, "reward": 0.22794928343500942, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22794928343500942, "reward_after_std": 0.8326413743197918, "reward_before_mean": 0.561581514775753, "reward_before_std": 0.8620108254253864, "reward_change_max": 0.0, "reward_change_mean": -0.333632237277925, "reward_change_min": -0.6458872146904469, "reward_change_std": 0.2622876074165106, "reward_std": 0.832641389220953, "rewards/cosine_scaled_reward": -0.010875914245843887, "rewards/format_reward": 0.5833333414047956, "step": 113 }, { "advantage_max": 0.9513976350426674, "advantage_mean": -1.8626453157644107e-09, "advantage_min": -0.44018446281552315, "advantage_std": 0.5162045583128929, "completion_length": 2600.62508392334, "epoch": 0.13028571428571428, "grad_norm": 0.07465193420648575, "kl": 0.0067729949951171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.55824636882301e-07, "loss": 0.014, "reward": -0.1275107857072726, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1275107857072726, "reward_after_std": 0.5162045545876026, "reward_before_mean": 0.11073758825659752, "reward_before_std": 0.46768177300691605, "reward_change_max": 0.0012068524956703186, "reward_change_mean": -0.23824838874861598, "reward_change_min": -0.41094420850276947, "reward_change_std": 0.15441239904612303, "reward_std": 0.5162045657634735, "rewards/cosine_scaled_reward": -0.2571312137879431, "rewards/format_reward": 0.6250000055879354, "step": 114 }, { "advantage_max": 1.2352916896343231, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.5806517638266087, "advantage_std": 0.6685930602252483, "completion_length": 2888.1250228881836, "epoch": 0.13142857142857142, "grad_norm": 0.09872996062040329, "kl": 0.006771087646484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.54457320834625e-07, "loss": 0.0075, "reward": -0.10066608339548111, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10066608339548111, "reward_after_std": 0.668593030422926, "reward_before_mean": 0.1253925976343453, "reward_before_std": 0.6527570895850658, "reward_change_max": 0.001193806529045105, "reward_change_mean": -0.22605869453400373, "reward_change_min": -0.3762983959168196, "reward_change_std": 0.15092294523492455, "reward_std": 0.6685930527746677, "rewards/cosine_scaled_reward": -0.09355370327830315, "rewards/format_reward": 0.31250000186264515, "step": 115 }, { "advantage_max": 0.9336372055113316, "advantage_mean": 1.862645371275562e-09, "advantage_min": -0.5836285278201103, "advantage_std": 0.5509970504790545, "completion_length": 3403.2291870117188, "epoch": 0.13257142857142856, "grad_norm": 0.1181810051202774, "kl": 0.006374359130859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.530702921077358e-07, "loss": 0.0328, "reward": -0.2978327311575413, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2978327311575413, "reward_after_std": 0.5509970467537642, "reward_before_mean": -0.11921874433755875, "reward_before_std": 0.577593807131052, "reward_change_max": 0.000988095998764038, "reward_change_mean": -0.17861400917172432, "reward_change_min": -0.3633838780224323, "reward_change_std": 0.15010416228324175, "reward_std": 0.5509970523416996, "rewards/cosine_scaled_reward": -0.14294270798563957, "rewards/format_reward": 0.16666667349636555, "step": 116 }, { "advantage_max": 1.1050181835889816, "advantage_mean": 2.23517425679276e-08, "advantage_min": -0.5356793627142906, "advantage_std": 0.6055473424494267, "completion_length": 3196.375030517578, "epoch": 0.1337142857142857, "grad_norm": 0.10387641191482544, "kl": 0.008967399597167969, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.516636183034564e-07, "loss": 0.0234, "reward": -0.341567924246192, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.341567924246192, "reward_after_std": 0.6055473275482655, "reward_before_mean": -0.19230719900224358, "reward_before_std": 0.612238947302103, "reward_change_max": 0.0010152682662010193, "reward_change_mean": -0.14926070533692837, "reward_change_min": -0.3155266009271145, "reward_change_std": 0.13089507957920432, "reward_std": 0.6055473312735558, "rewards/cosine_scaled_reward": -0.2419869415462017, "rewards/format_reward": 0.29166666977107525, "step": 117 }, { "advantage_max": 1.641647595912218, "advantage_mean": -4.967053213178474e-09, "advantage_min": -0.9000925049185753, "advantage_std": 0.9510752744972706, "completion_length": 2933.125030517578, "epoch": 0.13485714285714287, "grad_norm": 0.1362229734659195, "kl": 0.004642486572265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.502373679810839e-07, "loss": 0.0111, "reward": 0.3653463274240494, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3653463274240494, "reward_after_std": 0.9510752744972706, "reward_before_mean": 0.7301360741257668, "reward_before_std": 0.9502398185431957, "reward_change_max": 0.0, "reward_change_mean": -0.3647897462360561, "reward_change_min": -0.7364008165895939, "reward_change_std": 0.28538533207029104, "reward_std": 0.951075304299593, "rewards/cosine_scaled_reward": 0.11506801494397223, "rewards/format_reward": 0.5000000055879354, "step": 118 }, { "advantage_max": 1.2773814722895622, "advantage_mean": -4.440892098500626e-16, "advantage_min": -0.7336373254656792, "advantage_std": 0.7546707466244698, "completion_length": 2631.645851135254, "epoch": 0.136, "grad_norm": 0.11543148756027222, "kl": 0.010685920715332031, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.487916106540465e-07, "loss": 0.0418, "reward": 0.12168693542480469, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12168693542480469, "reward_after_std": 0.7546707466244698, "reward_before_mean": 0.4236811628798023, "reward_before_std": 0.7664337083697319, "reward_change_max": 0.0007797032594680786, "reward_change_mean": -0.301994226872921, "reward_change_min": -0.5957271978259087, "reward_change_std": 0.2320833122357726, "reward_std": 0.7546707689762115, "rewards/cosine_scaled_reward": -0.06940942443907261, "rewards/format_reward": 0.5625000037252903, "step": 119 }, { "advantage_max": 1.3410765826702118, "advantage_mean": 5.898376453927767e-09, "advantage_min": -0.6475675106048584, "advantage_std": 0.7686546426266432, "completion_length": 2500.8750610351562, "epoch": 0.13714285714285715, "grad_norm": 0.1472538560628891, "kl": 0.009729385375976562, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.473264167865171e-07, "loss": 0.0945, "reward": 0.17069168761372566, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17069168761372566, "reward_after_std": 0.7686546500772238, "reward_before_mean": 0.48508234234759584, "reward_before_std": 0.7526429118588567, "reward_change_max": 0.0, "reward_change_mean": -0.3143906258046627, "reward_change_min": -0.6074437368661165, "reward_change_std": 0.23728215042501688, "reward_std": 0.7686546761542559, "rewards/cosine_scaled_reward": -0.007458832420525141, "rewards/format_reward": 0.500000013038516, "step": 120 }, { "advantage_max": 1.3169677779078484, "advantage_mean": -1.8316011485275396e-08, "advantage_min": -0.5936854109168053, "advantage_std": 0.7040582299232483, "completion_length": 2078.000015258789, "epoch": 0.1382857142857143, "grad_norm": 0.13888248801231384, "kl": 0.012456893920898438, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.458418577899774e-07, "loss": 0.07, "reward": 0.36981683829799294, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36981683829799294, "reward_after_std": 0.7040582150220871, "reward_before_mean": 0.7583018532022834, "reward_before_std": 0.6163065582513809, "reward_change_max": 0.000717945396900177, "reward_change_mean": -0.3884850200265646, "reward_change_min": -0.6099117025732994, "reward_change_std": 0.2327408418059349, "reward_std": 0.7040582410991192, "rewards/cosine_scaled_reward": 0.014567593112587929, "rewards/format_reward": 0.7291666679084301, "step": 121 }, { "advantage_max": 1.396921530365944, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.8277639225125313, "advantage_std": 0.8302930146455765, "completion_length": 2936.9584159851074, "epoch": 0.13942857142857143, "grad_norm": 0.14620067179203033, "kl": 0.007358551025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.443380060197385e-07, "loss": 0.0048, "reward": 0.14990826323628426, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14990826323628426, "reward_after_std": 0.8302930146455765, "reward_before_mean": 0.45474499464035034, "reward_before_std": 0.8626224808394909, "reward_change_max": 6.0871243476867676e-05, "reward_change_mean": -0.30483673978596926, "reward_change_min": -0.6525392979383469, "reward_change_std": 0.2505636941641569, "reward_std": 0.8302930295467377, "rewards/cosine_scaled_reward": -0.012210835237056017, "rewards/format_reward": 0.47916667722165585, "step": 122 }, { "advantage_max": 1.1955565959215164, "advantage_mean": 1.3038516488705909e-08, "advantage_min": -0.7384285181760788, "advantage_std": 0.7622129395604134, "completion_length": 3045.6459045410156, "epoch": 0.14057142857142857, "grad_norm": 0.1293623447418213, "kl": 0.00701904296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.428149347714143e-07, "loss": 0.0924, "reward": -0.10539140645414591, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10539140645414591, "reward_after_std": 0.7622129209339619, "reward_before_mean": 0.12175704911351204, "reward_before_std": 0.8335296474397182, "reward_change_max": 0.0007355660200119019, "reward_change_mean": -0.22714845649898052, "reward_change_min": -0.5292183440178633, "reward_change_std": 0.23081361688673496, "reward_std": 0.7622129283845425, "rewards/cosine_scaled_reward": -0.13703814148902893, "rewards/format_reward": 0.39583333767950535, "step": 123 }, { "advantage_max": 1.2478653825819492, "advantage_mean": -2.2972624080797033e-08, "advantage_min": -0.7673164531588554, "advantage_std": 0.756400678306818, "completion_length": 2592.687515258789, "epoch": 0.1417142857142857, "grad_norm": 0.12547579407691956, "kl": 0.01032257080078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.412727182773486e-07, "loss": 0.039, "reward": 0.17265462409704924, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17265462409704924, "reward_after_std": 0.7564007006585598, "reward_before_mean": 0.4965659724548459, "reward_before_std": 0.7837951183319092, "reward_change_max": 0.0, "reward_change_mean": -0.3239113837480545, "reward_change_min": -0.667121559381485, "reward_change_std": 0.26153578516095877, "reward_std": 0.7564007118344307, "rewards/cosine_scaled_reward": -0.022550346329808235, "rewards/format_reward": 0.5416666716337204, "step": 124 }, { "advantage_max": 1.150554358959198, "advantage_mean": -6.208818126296478e-09, "advantage_min": -0.5988728702068329, "advantage_std": 0.6304605938494205, "completion_length": 2893.791702270508, "epoch": 0.14285714285714285, "grad_norm": 0.08245649188756943, "kl": 0.0050067901611328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.397114317029974e-07, "loss": 0.0061, "reward": 0.041422320529818535, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.041422320529818535, "reward_after_std": 0.6304605901241302, "reward_before_mean": 0.3251929506659508, "reward_before_std": 0.5924066100269556, "reward_change_max": 0.0006644278764724731, "reward_change_mean": -0.2837706417776644, "reward_change_min": -0.46433842554688454, "reward_change_std": 0.1861866288818419, "reward_std": 0.6304606199264526, "rewards/cosine_scaled_reward": 0.006346469279378653, "rewards/format_reward": 0.31250000186264515, "step": 125 }, { "advantage_max": 1.4216941222548485, "advantage_mean": -7.450580763457282e-09, "advantage_min": -0.7550601400434971, "advantage_std": 0.821855939924717, "completion_length": 2790.604217529297, "epoch": 0.144, "grad_norm": 0.1472250074148178, "kl": 0.005031585693359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.381311511432658e-07, "loss": 0.0522, "reward": 0.052971549332141876, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.052971549332141876, "reward_after_std": 0.821855939924717, "reward_before_mean": 0.3223703149706125, "reward_before_std": 0.8384648263454437, "reward_change_max": 0.0003665909171104431, "reward_change_mean": -0.2693987749516964, "reward_change_min": -0.5547848157584667, "reward_change_std": 0.21912654396146536, "reward_std": 0.821855966001749, "rewards/cosine_scaled_reward": -0.06798151088878512, "rewards/format_reward": 0.45833333767950535, "step": 126 }, { "advantage_max": 0.954961534589529, "advantage_mean": -1.241763414316921e-09, "advantage_min": -0.6304600276052952, "advantage_std": 0.608190419152379, "completion_length": 3243.7500610351562, "epoch": 0.14514285714285713, "grad_norm": 0.16301438212394714, "kl": 0.00958251953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.36531953618799e-07, "loss": 0.0694, "reward": -0.1906900741159916, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1906900741159916, "reward_after_std": 0.608190419152379, "reward_before_mean": 0.024494127836078405, "reward_before_std": 0.6589562017470598, "reward_change_max": 0.001492425799369812, "reward_change_mean": -0.21518420707434416, "reward_change_min": -0.5101010613143444, "reward_change_std": 0.20124634448438883, "reward_std": 0.6081904359161854, "rewards/cosine_scaled_reward": -0.154419609811157, "rewards/format_reward": 0.3333333432674408, "step": 127 }, { "advantage_max": 1.0362044796347618, "advantage_mean": -1.4280280069556284e-08, "advantage_min": -0.8516915030777454, "advantage_std": 0.6831777542829514, "completion_length": 2831.104202270508, "epoch": 0.1462857142857143, "grad_norm": 0.10564086586236954, "kl": 0.007222175598144531, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.34913917072228e-07, "loss": 0.0296, "reward": 0.30764661356806755, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30764661356806755, "reward_after_std": 0.6831777645274997, "reward_before_mean": 0.6954053975641727, "reward_before_std": 0.7116181813180447, "reward_change_max": 0.0012954026460647583, "reward_change_mean": -0.387758809607476, "reward_change_min": -0.6460206061601639, "reward_change_std": 0.27978145400993526, "reward_std": 0.6831777868792415, "rewards/cosine_scaled_reward": 0.13936935923993587, "rewards/format_reward": 0.4166666716337204, "step": 128 }, { "advantage_max": 1.2157826200127602, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.6898716762661934, "advantage_std": 0.6953651700168848, "completion_length": 3310.3333740234375, "epoch": 0.14742857142857144, "grad_norm": 0.12506195902824402, "kl": 0.008449554443359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.332771203643714e-07, "loss": -0.0103, "reward": -0.12795098591595888, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12795098591595888, "reward_after_std": 0.6953651700168848, "reward_before_mean": 0.090589489787817, "reward_before_std": 0.7126827575266361, "reward_change_max": 0.0, "reward_change_mean": -0.2185404673218727, "reward_change_min": -0.46337801590561867, "reward_change_std": 0.18281217105686665, "reward_std": 0.695365184918046, "rewards/cosine_scaled_reward": -0.05887192999944091, "rewards/format_reward": 0.2083333358168602, "step": 129 }, { "advantage_max": 0.8977086283266544, "advantage_mean": 1.9868215628271457e-08, "advantage_min": -0.6305883452296257, "advantage_std": 0.5447062347084284, "completion_length": 3246.8958740234375, "epoch": 0.14857142857142858, "grad_norm": 0.1151081845164299, "kl": 0.009033203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.316216432703916e-07, "loss": 0.0062, "reward": -0.19078312814235687, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19078312814235687, "reward_after_std": 0.5447062440216541, "reward_before_mean": 0.02786485105752945, "reward_before_std": 0.568111153319478, "reward_change_max": 0.0009800717234611511, "reward_change_mean": -0.21864797454327345, "reward_change_min": -0.3908236641436815, "reward_change_std": 0.16550936549901962, "reward_std": 0.544706255197525, "rewards/cosine_scaled_reward": -0.07981756143271923, "rewards/format_reward": 0.18750000186264515, "step": 130 }, { "advantage_max": 1.2027404643595219, "advantage_mean": 9.934108036180334e-09, "advantage_min": -0.8151110913604498, "advantage_std": 0.7555824033915997, "completion_length": 2788.770866394043, "epoch": 0.14971428571428572, "grad_norm": 0.1706915944814682, "kl": 0.009748458862304688, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.299475664759068e-07, "loss": 0.078, "reward": 0.23934237146750093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23934237146750093, "reward_after_std": 0.75558240711689, "reward_before_mean": 0.5904918815940619, "reward_before_std": 0.7709046499803662, "reward_change_max": 0.0, "reward_change_mean": -0.3511495003476739, "reward_change_min": -0.64772904291749, "reward_change_std": 0.27793479710817337, "reward_std": 0.755582433193922, "rewards/cosine_scaled_reward": 0.07649594731628895, "rewards/format_reward": 0.43750000186264515, "step": 131 }, { "advantage_max": 1.5452817603945732, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.7206988781690598, "advantage_std": 0.8455226197838783, "completion_length": 2813.6458435058594, "epoch": 0.15085714285714286, "grad_norm": 0.1316002458333969, "kl": 0.00800323486328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.282549715730579e-07, "loss": -0.0289, "reward": 0.009623751044273376, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.009623751044273376, "reward_after_std": 0.8455226235091686, "reward_before_mean": 0.25404807389713824, "reward_before_std": 0.8349966257810593, "reward_change_max": 0.0010694265365600586, "reward_change_mean": -0.24442429654300213, "reward_change_min": -0.5337617993354797, "reward_change_std": 0.1964772827923298, "reward_std": 0.8455226384103298, "rewards/cosine_scaled_reward": -0.08130931667983532, "rewards/format_reward": 0.41666667349636555, "step": 132 }, { "advantage_max": 1.050060760229826, "advantage_mean": 1.862645426786713e-09, "advantage_min": -0.4991532042622566, "advantage_std": 0.5961620546877384, "completion_length": 3129.2916870117188, "epoch": 0.152, "grad_norm": 0.12014055252075195, "kl": 0.009761810302734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.265439410565328e-07, "loss": 0.0423, "reward": -0.35092977434396744, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35092977434396744, "reward_after_std": 0.5961620360612869, "reward_before_mean": -0.20190617628395557, "reward_before_std": 0.6122478432953358, "reward_change_max": 0.00027695298194885254, "reward_change_mean": -0.14902361016720533, "reward_change_min": -0.3222372457385063, "reward_change_std": 0.12988192215561867, "reward_std": 0.5961620435118675, "rewards/cosine_scaled_reward": -0.23636975418776274, "rewards/format_reward": 0.2708333358168602, "step": 133 }, { "advantage_max": 1.163169089704752, "advantage_mean": -1.645336678013365e-08, "advantage_min": -0.5319757275283337, "advantage_std": 0.6446932945400476, "completion_length": 2510.104202270508, "epoch": 0.15314285714285714, "grad_norm": 0.10554244369268417, "kl": 0.011915206909179688, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.248145583195447e-07, "loss": 0.0326, "reward": 0.05566288158297539, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05566288158297539, "reward_after_std": 0.6446932852268219, "reward_before_mean": 0.34459465090185404, "reward_before_std": 0.6043676100671291, "reward_change_max": 0.0004299283027648926, "reward_change_mean": -0.28893175069242716, "reward_change_min": -0.5042743273079395, "reward_change_std": 0.19494254142045975, "reward_std": 0.6446933001279831, "rewards/cosine_scaled_reward": -0.07770269550383091, "rewards/format_reward": 0.5000000018626451, "step": 134 }, { "advantage_max": 1.4917153492569923, "advantage_mean": 1.0554988993938252e-08, "advantage_min": -0.7013456299901009, "advantage_std": 0.8363952152431011, "completion_length": 1998.5416984558105, "epoch": 0.15428571428571428, "grad_norm": 0.17164035141468048, "kl": 0.009614944458007812, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.230669076497687e-07, "loss": -0.0149, "reward": 0.5657868012785912, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5657868012785912, "reward_after_std": 0.8363952077925205, "reward_before_mean": 1.0127003118395805, "reward_before_std": 0.7460980167379603, "reward_change_max": 0.0, "reward_change_mean": -0.446913446765393, "reward_change_min": -0.7371262945234776, "reward_change_std": 0.2943604183383286, "reward_std": 0.8363952338695526, "rewards/cosine_scaled_reward": 0.14176679588854313, "rewards/format_reward": 0.7291666697710752, "step": 135 }, { "advantage_max": 1.4442752003669739, "advantage_mean": -2.918144209607121e-08, "advantage_min": -0.8376066125929356, "advantage_std": 0.8569142334163189, "completion_length": 2878.3958892822266, "epoch": 0.15542857142857142, "grad_norm": 0.13732051849365234, "kl": 0.010951995849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.213010742252327e-07, "loss": 0.0786, "reward": 0.3354130834341049, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3354130834341049, "reward_after_std": 0.8569142334163189, "reward_before_mean": 0.7032555975019932, "reward_before_std": 0.8669465184211731, "reward_change_max": 0.0, "reward_change_mean": -0.367842567153275, "reward_change_min": -0.7203944735229015, "reward_change_std": 0.2801892305724323, "reward_std": 0.8569142483174801, "rewards/cosine_scaled_reward": 0.1432944694533944, "rewards/format_reward": 0.4166666753590107, "step": 136 }, { "advantage_max": 1.1755945719778538, "advantage_mean": 3.885780586188048e-16, "advantage_min": -0.5631213709712029, "advantage_std": 0.6682571768760681, "completion_length": 3051.250030517578, "epoch": 0.15657142857142858, "grad_norm": 0.12679171562194824, "kl": 0.009998321533203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.195171441101668e-07, "loss": 0.0445, "reward": -0.17650610394775867, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17650610394775867, "reward_after_std": 0.6682571917772293, "reward_before_mean": 0.026825436390936375, "reward_before_std": 0.6791456611827016, "reward_change_max": 0.001605108380317688, "reward_change_mean": -0.20333154685795307, "reward_change_min": -0.4427060279995203, "reward_change_std": 0.17323465924710035, "reward_std": 0.6682572159916162, "rewards/cosine_scaled_reward": -0.12200394924730062, "rewards/format_reward": 0.2708333432674408, "step": 137 }, { "advantage_max": 0.9305066950619221, "advantage_mean": 1.800557042352935e-08, "advantage_min": -0.5270604677498341, "advantage_std": 0.5412895157933235, "completion_length": 2504.0625381469727, "epoch": 0.15771428571428572, "grad_norm": 0.09073272347450256, "kl": 0.009454727172851562, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.177152042508077e-07, "loss": 0.0546, "reward": -0.015904040075838566, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.015904040075838566, "reward_after_std": 0.5412895157933235, "reward_before_mean": 0.2626056023873389, "reward_before_std": 0.5118083693087101, "reward_change_max": 0.0006034299731254578, "reward_change_mean": -0.2785096103325486, "reward_change_min": -0.4878885019570589, "reward_change_std": 0.19265306554734707, "reward_std": 0.5412895232439041, "rewards/cosine_scaled_reward": -0.12911387719213963, "rewards/format_reward": 0.5208333358168602, "step": 138 }, { "advantage_max": 1.397744432091713, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.6843620836734772, "advantage_std": 0.7991587594151497, "completion_length": 3182.5625610351562, "epoch": 0.15885714285714286, "grad_norm": 0.17045240104198456, "kl": 0.012096405029296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.158953424711624e-07, "loss": 0.0473, "reward": -0.06700982432812452, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06700982432812452, "reward_after_std": 0.7991587519645691, "reward_before_mean": 0.16040014289319515, "reward_before_std": 0.815875044092536, "reward_change_max": 0.0009377151727676392, "reward_change_mean": -0.2274099476635456, "reward_change_min": -0.5065705124288797, "reward_change_std": 0.19903714209794998, "reward_std": 0.79915876314044, "rewards/cosine_scaled_reward": -0.09688326716423035, "rewards/format_reward": 0.35416666977107525, "step": 139 }, { "advantage_max": 1.0005671940743923, "advantage_mean": 1.6763806842678974e-08, "advantage_min": -0.4939218834042549, "advantage_std": 0.5732725989073515, "completion_length": 3125.104217529297, "epoch": 0.16, "grad_norm": 0.40375617146492004, "kl": 0.018047332763671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.140576474687263e-07, "loss": 0.0712, "reward": -0.18270614463835955, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.18270614463835955, "reward_after_std": 0.5732725989073515, "reward_before_mean": 0.03130531124770641, "reward_before_std": 0.5579951740801334, "reward_change_max": 0.0002490878105163574, "reward_change_mean": -0.21401146426796913, "reward_change_min": -0.45000362023711205, "reward_change_std": 0.178125046659261, "reward_std": 0.5732726082205772, "rewards/cosine_scaled_reward": -0.09893068426754326, "rewards/format_reward": 0.22916666977107525, "step": 140 }, { "advantage_max": 1.3642967343330383, "advantage_mean": -1.6653345369377348e-16, "advantage_min": -0.8970592468976974, "advantage_std": 0.8663997799158096, "completion_length": 2833.9375610351562, "epoch": 0.16114285714285714, "grad_norm": 0.14736533164978027, "kl": 0.013208389282226562, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.122022088101613e-07, "loss": 0.0636, "reward": 0.19854285567998886, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19854285567998886, "reward_after_std": 0.8663997799158096, "reward_before_mean": 0.5228808233514428, "reward_before_std": 0.9317158870398998, "reward_change_max": 0.0006700456142425537, "reward_change_mean": -0.32433797139674425, "reward_change_min": -0.6870186366140842, "reward_change_std": 0.28708665631711483, "reward_std": 0.8663998134434223, "rewards/cosine_scaled_reward": -0.04064292460680008, "rewards/format_reward": 0.6041666734963655, "step": 141 }, { "advantage_max": 1.1075918152928352, "advantage_mean": -1.862645243599914e-08, "advantage_min": -0.9935803860425949, "advantage_std": 0.7360248751938343, "completion_length": 2854.979217529297, "epoch": 0.16228571428571428, "grad_norm": 0.13701511919498444, "kl": 0.01279449462890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.103291169269299e-07, "loss": 0.0439, "reward": 0.2674298919737339, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2674298919737339, "reward_after_std": 0.736024871468544, "reward_before_mean": 0.6353370100259781, "reward_before_std": 0.799102995544672, "reward_change_max": 0.000289037823677063, "reward_change_mean": -0.3679071478545666, "reward_change_min": -0.6680124215781689, "reward_change_std": 0.28033728897571564, "reward_std": 0.7360248789191246, "rewards/cosine_scaled_reward": 0.015585171058773994, "rewards/format_reward": 0.604166692122817, "step": 142 }, { "advantage_max": 0.9835025705397129, "advantage_mean": 1.0865430083439875e-08, "advantage_min": -0.7354140728712082, "advantage_std": 0.618965107947588, "completion_length": 2793.750030517578, "epoch": 0.16342857142857142, "grad_norm": 0.13960637152194977, "kl": 0.015850067138671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.084384631108882e-07, "loss": 0.0635, "reward": -0.17446402180939913, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17446402180939913, "reward_after_std": 0.6189651004970074, "reward_before_mean": 0.04228054964914918, "reward_before_std": 0.6649498995393515, "reward_change_max": 0.0016652792692184448, "reward_change_mean": -0.21674457285553217, "reward_change_min": -0.429037906229496, "reward_change_std": 0.19230018742382526, "reward_std": 0.6189651042222977, "rewards/cosine_scaled_reward": -0.18719306215643883, "rewards/format_reward": 0.41666668094694614, "step": 143 }, { "advantage_max": 1.6626508310437202, "advantage_mean": -2.483526606589237e-09, "advantage_min": -0.8582720793783665, "advantage_std": 0.9675541780889034, "completion_length": 3074.4166679382324, "epoch": 0.16457142857142856, "grad_norm": 0.1934293508529663, "kl": 0.012912750244140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.065303395098358e-07, "loss": 0.0595, "reward": -0.01596003444865346, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.01596003444865346, "reward_after_std": 0.9675541818141937, "reward_before_mean": 0.21122129168361425, "reward_before_std": 1.0259087830781937, "reward_change_max": 0.0013353228569030762, "reward_change_mean": -0.22718132566660643, "reward_change_min": -0.5569618083536625, "reward_change_std": 0.2365485643967986, "reward_std": 0.9675541929900646, "rewards/cosine_scaled_reward": -0.050639352295547724, "rewards/format_reward": 0.31250000186264515, "step": 144 }, { "advantage_max": 1.6574936211109161, "advantage_mean": 7.45058065243498e-09, "advantage_min": -0.8055602125823498, "advantage_std": 0.9307769909501076, "completion_length": 2110.416717529297, "epoch": 0.1657142857142857, "grad_norm": 0.1797972172498703, "kl": 0.010667800903320312, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.046048391230247e-07, "loss": 0.0496, "reward": 0.3544669998809695, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3544669998809695, "reward_after_std": 0.9307770021259785, "reward_before_mean": 0.7125979978591204, "reward_before_std": 0.909045472741127, "reward_change_max": 0.0006473585963249207, "reward_change_mean": -0.3581309705041349, "reward_change_min": -0.6491826735436916, "reward_change_std": 0.25680120568722486, "reward_std": 0.9307770058512688, "rewards/cosine_scaled_reward": 0.03338230960071087, "rewards/format_reward": 0.6458333414047956, "step": 145 }, { "advantage_max": 1.0858964622020721, "advantage_mean": -1.1796753296433593e-08, "advantage_min": -0.5913076885044575, "advantage_std": 0.6359936855733395, "completion_length": 2598.5000762939453, "epoch": 0.16685714285714287, "grad_norm": 0.12754735350608826, "kl": 0.00946044921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.026620557966279e-07, "loss": 0.0484, "reward": -0.09592493623495102, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09592493623495102, "reward_after_std": 0.6359937191009521, "reward_before_mean": 0.14266785606741905, "reward_before_std": 0.6422541700303555, "reward_change_max": 0.0005887970328330994, "reward_change_mean": -0.238592809997499, "reward_change_min": -0.5087453033775091, "reward_change_std": 0.19102911744266748, "reward_std": 0.6359937265515327, "rewards/cosine_scaled_reward": -0.19949940592050552, "rewards/format_reward": 0.5416666734963655, "step": 146 }, { "advantage_max": 1.05124119669199, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.6531185433268547, "advantage_std": 0.6395123526453972, "completion_length": 2919.750015258789, "epoch": 0.168, "grad_norm": 0.1187099739909172, "kl": 0.016239166259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.007020842191634e-07, "loss": 0.0615, "reward": -0.23719407757744193, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23719407757744193, "reward_after_std": 0.6395123563706875, "reward_before_mean": -0.04856202378869057, "reward_before_std": 0.6837803050875664, "reward_change_max": 0.001722574234008789, "reward_change_mean": -0.18863204028457403, "reward_change_min": -0.48571337200701237, "reward_change_std": 0.18479610979557037, "reward_std": 0.6395123600959778, "rewards/cosine_scaled_reward": -0.18053102178964764, "rewards/format_reward": 0.3125000074505806, "step": 147 }, { "advantage_max": 0.9280873015522957, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.6955177783966064, "advantage_std": 0.5650780126452446, "completion_length": 2600.5000915527344, "epoch": 0.16914285714285715, "grad_norm": 0.07447077333927155, "kl": 0.014652252197265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.987250199168808e-07, "loss": 0.0204, "reward": 0.0697112400084734, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0697112400084734, "reward_after_std": 0.5650780089199543, "reward_before_mean": 0.37948842719197273, "reward_before_std": 0.5654736310243607, "reward_change_max": 0.00041494518518447876, "reward_change_mean": -0.30977714341133833, "reward_change_min": -0.5139572322368622, "reward_change_std": 0.21031226217746735, "reward_std": 0.5650780126452446, "rewards/cosine_scaled_reward": -0.09150580875575542, "rewards/format_reward": 0.5625000074505806, "step": 148 }, { "advantage_max": 1.217205923050642, "advantage_mean": 1.1796752907855534e-08, "advantage_min": -0.8463896103203297, "advantage_std": 0.7423972543329, "completion_length": 2888.5208740234375, "epoch": 0.1702857142857143, "grad_norm": 0.13410669565200806, "kl": 0.0128021240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.967309592491052e-07, "loss": 0.0404, "reward": 0.05353837716393173, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05353837716393173, "reward_after_std": 0.7423972692340612, "reward_before_mean": 0.3378589116036892, "reward_before_std": 0.7749494183808565, "reward_change_max": 0.0017078742384910583, "reward_change_mean": -0.2843204974196851, "reward_change_min": -0.5100828967988491, "reward_change_std": 0.2250691340304911, "reward_std": 0.7423973102122545, "rewards/cosine_scaled_reward": -0.060237223748117685, "rewards/format_reward": 0.4583333395421505, "step": 149 }, { "advantage_max": 1.3684921450912952, "advantage_mean": -3.104408119458668e-09, "advantage_min": -0.7457129880785942, "advantage_std": 0.790175162255764, "completion_length": 2972.166732788086, "epoch": 0.17142857142857143, "grad_norm": 0.14953668415546417, "kl": 0.017360687255859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.9471999940354e-07, "loss": 0.0354, "reward": 0.059539347887039185, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.059539347887039185, "reward_after_std": 0.7901751659810543, "reward_before_mean": 0.333737276494503, "reward_before_std": 0.8040711954236031, "reward_change_max": 0.0, "reward_change_mean": -0.27419792767614126, "reward_change_min": -0.5881333574652672, "reward_change_std": 0.22270974004641175, "reward_std": 0.7901751957833767, "rewards/cosine_scaled_reward": -0.031048028729856014, "rewards/format_reward": 0.39583333767950535, "step": 150 }, { "advantage_max": 1.38007552921772, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.7273972257971764, "advantage_std": 0.7743493728339672, "completion_length": 2510.8334045410156, "epoch": 0.17257142857142857, "grad_norm": 0.16105739772319794, "kl": 0.014842987060546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.926922383915315e-07, "loss": -0.0076, "reward": 0.3971906192600727, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3971906192600727, "reward_after_std": 0.7743494212627411, "reward_before_mean": 0.791034422814846, "reward_before_std": 0.7206472083926201, "reward_change_max": 0.0, "reward_change_mean": -0.39384382497519255, "reward_change_min": -0.6590692065656185, "reward_change_std": 0.25266471691429615, "reward_std": 0.7743494361639023, "rewards/cosine_scaled_reward": 0.07260053791105747, "rewards/format_reward": 0.6458333432674408, "step": 151 }, { "advantage_max": 1.392113920301199, "advantage_mean": -1.0554989493538613e-08, "advantage_min": -0.6385779082775116, "advantage_std": 0.7757796794176102, "completion_length": 2763.18754196167, "epoch": 0.1737142857142857, "grad_norm": 0.1782752126455307, "kl": 0.01556396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.906477750432903e-07, "loss": 0.044, "reward": -0.10477269627153873, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10477269627153873, "reward_after_std": 0.7757796980440617, "reward_before_mean": 0.10918856167700142, "reward_before_std": 0.7816188298165798, "reward_change_max": 0.0007291659712791443, "reward_change_mean": -0.2139612501487136, "reward_change_min": -0.44862732477486134, "reward_change_std": 0.180716834962368, "reward_std": 0.7757797166705132, "rewards/cosine_scaled_reward": -0.15373907564207911, "rewards/format_reward": 0.4166666716337204, "step": 152 }, { "advantage_max": 0.8868526294827461, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.4589373283088207, "advantage_std": 0.49290744960308075, "completion_length": 3050.5833740234375, "epoch": 0.17485714285714285, "grad_norm": 0.0999632179737091, "kl": 0.02754974365234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.88586709003076e-07, "loss": 0.0486, "reward": -0.3145775627344847, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3145775627344847, "reward_after_std": 0.49290746822953224, "reward_before_mean": -0.13981377240270376, "reward_before_std": 0.48043977096676826, "reward_change_max": 7.906556129455566e-05, "reward_change_mean": -0.17476379964500666, "reward_change_min": -0.3179873824119568, "reward_change_std": 0.1258330475538969, "reward_std": 0.49290747195482254, "rewards/cosine_scaled_reward": -0.19490689039230347, "rewards/format_reward": 0.25000000186264515, "step": 153 }, { "advantage_max": 1.290296759456396, "advantage_mean": 5.5879357807597785e-09, "advantage_min": -0.9250775575637817, "advantage_std": 0.8055738545954227, "completion_length": 3379.666717529297, "epoch": 0.176, "grad_norm": 0.17429296672344208, "kl": 0.013561248779296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.865091407243394e-07, "loss": 0.063, "reward": 0.11301964987069368, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11301964987069368, "reward_after_std": 0.8055738434195518, "reward_before_mean": 0.4128780122846365, "reward_before_std": 0.8634231984615326, "reward_change_max": 0.0006842613220214844, "reward_change_mean": -0.29985836148262024, "reward_change_min": -0.6293492093682289, "reward_change_std": 0.25532870925962925, "reward_std": 0.8055738694965839, "rewards/cosine_scaled_reward": 0.050189003348350525, "rewards/format_reward": 0.3125000111758709, "step": 154 }, { "advantage_max": 1.3279439583420753, "advantage_mean": -2.4835264955669345e-09, "advantage_min": -0.7597385421395302, "advantage_std": 0.7712014801800251, "completion_length": 2549.1250228881836, "epoch": 0.17714285714285713, "grad_norm": 0.12317630648612976, "kl": 0.013763427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.844151714648274e-07, "loss": 0.0098, "reward": 0.42278579249978065, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42278579249978065, "reward_after_std": 0.7712015192955732, "reward_before_mean": 0.8301597703248262, "reward_before_std": 0.7407869715243578, "reward_change_max": 0.0004476085305213928, "reward_change_mean": -0.40737398341298103, "reward_change_min": -0.6836552545428276, "reward_change_std": 0.2730773724615574, "reward_std": 0.7712015546858311, "rewards/cosine_scaled_reward": 0.15466322377324104, "rewards/format_reward": 0.5208333376795053, "step": 155 }, { "advantage_max": 1.1669469252228737, "advantage_mean": 1.9247333726823967e-08, "advantage_min": -0.7012041360139847, "advantage_std": 0.6874648444354534, "completion_length": 3192.8541870117188, "epoch": 0.1782857142857143, "grad_norm": 0.13070867955684662, "kl": 0.017330169677734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.823049032816478e-07, "loss": 0.0628, "reward": -0.21437102183699608, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21437102183699608, "reward_after_std": 0.6874648630619049, "reward_before_mean": -0.023766040802001953, "reward_before_std": 0.7222958765923977, "reward_change_max": 0.000732123851776123, "reward_change_mean": -0.1906049638055265, "reward_change_min": -0.4271918907761574, "reward_change_std": 0.1812627101317048, "reward_std": 0.6874649003148079, "rewards/cosine_scaled_reward": -0.12646635621786118, "rewards/format_reward": 0.2291666716337204, "step": 156 }, { "advantage_max": 1.20693701505661, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.6609192118048668, "advantage_std": 0.6912636980414391, "completion_length": 3175.041717529297, "epoch": 0.17942857142857144, "grad_norm": 0.13659845292568207, "kl": 0.018890380859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.801784390262943e-07, "loss": 0.0286, "reward": -0.19832509686239064, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19832509686239064, "reward_after_std": 0.6912636831402779, "reward_before_mean": -0.0033038491383194923, "reward_before_std": 0.7137451581656933, "reward_change_max": 0.0005949512124061584, "reward_change_mean": -0.19502126099541783, "reward_change_min": -0.3924486022442579, "reward_change_std": 0.16552246548235416, "reward_std": 0.6912636868655682, "rewards/cosine_scaled_reward": -0.17873526364564896, "rewards/format_reward": 0.3541666753590107, "step": 157 }, { "advantage_max": 1.2760753110051155, "advantage_mean": -6.208819014474898e-10, "advantage_min": -0.9892221018671989, "advantage_std": 0.8185785189270973, "completion_length": 2982.0834045410156, "epoch": 0.18057142857142858, "grad_norm": 0.18541646003723145, "kl": 0.01678466796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.780358823396352e-07, "loss": 0.0881, "reward": 0.3491497424838599, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3491497424838599, "reward_after_std": 0.818578477948904, "reward_before_mean": 0.7326403986662626, "reward_before_std": 0.8680536933243275, "reward_change_max": 0.003411300480365753, "reward_change_mean": -0.38349065091460943, "reward_change_min": -0.695528618991375, "reward_change_std": 0.30123837385326624, "reward_std": 0.8185785189270973, "rewards/cosine_scaled_reward": 0.0850701853632927, "rewards/format_reward": 0.5625000149011612, "step": 158 }, { "advantage_max": 0.9803209267556667, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.5312448479235172, "advantage_std": 0.5489882118999958, "completion_length": 2971.875045776367, "epoch": 0.18171428571428572, "grad_norm": 0.08510956913232803, "kl": 0.01891326904296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.758773376468604e-07, "loss": -0.0023, "reward": -0.18099602963775396, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18099602963775396, "reward_after_std": 0.5489882081747055, "reward_before_mean": 0.03407071530818939, "reward_before_std": 0.5351154431700706, "reward_change_max": 0.0009119212627410889, "reward_change_mean": -0.21506676077842712, "reward_change_min": -0.3951657433062792, "reward_change_std": 0.1554846577346325, "reward_std": 0.5489882193505764, "rewards/cosine_scaled_reward": -0.18088130932301283, "rewards/format_reward": 0.3958333395421505, "step": 159 }, { "advantage_max": 0.962229996919632, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.7357146218419075, "advantage_std": 0.6014006249606609, "completion_length": 2864.4166870117188, "epoch": 0.18285714285714286, "grad_norm": 0.13316203653812408, "kl": 0.0220184326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.737029101523929e-07, "loss": 0.027, "reward": 0.043143775314092636, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.043143775314092636, "reward_after_std": 0.6014006175100803, "reward_before_mean": 0.3398471586406231, "reward_before_std": 0.6180408224463463, "reward_change_max": 0.0007723942399024963, "reward_change_mean": -0.2967033665627241, "reward_change_min": -0.5417621470987797, "reward_change_std": 0.22475436236709356, "reward_std": 0.6014006324112415, "rewards/cosine_scaled_reward": -0.007159763015806675, "rewards/format_reward": 0.3541666753590107, "step": 160 }, { "advantage_max": 1.1514604538679123, "advantage_mean": -1.800557086761856e-08, "advantage_min": -0.8958503156900406, "advantage_std": 0.729047141969204, "completion_length": 2882.041702270508, "epoch": 0.184, "grad_norm": 0.13892494142055511, "kl": 0.02344512939453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.715127058347614e-07, "loss": 0.0456, "reward": 0.13343903236091137, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13343903236091137, "reward_after_std": 0.7290471494197845, "reward_before_mean": 0.4490719512104988, "reward_before_std": 0.7747690826654434, "reward_change_max": 0.001975014805793762, "reward_change_mean": -0.3156329430639744, "reward_change_min": -0.5890417471528053, "reward_change_std": 0.250003345310688, "reward_std": 0.7290471717715263, "rewards/cosine_scaled_reward": -0.004630686715245247, "rewards/format_reward": 0.45833334513008595, "step": 161 }, { "advantage_max": 1.21609266102314, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.6409214287996292, "advantage_std": 0.7266101613640785, "completion_length": 3296.0833740234375, "epoch": 0.18514285714285714, "grad_norm": 0.14078128337860107, "kl": 0.02608489990234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.693068314414344e-07, "loss": 0.0246, "reward": -0.14511827565729618, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14511827565729618, "reward_after_std": 0.7266101650893688, "reward_before_mean": 0.06751438044011593, "reward_before_std": 0.767331724986434, "reward_change_max": 0.0001521781086921692, "reward_change_mean": -0.21263265097513795, "reward_change_min": -0.48333017714321613, "reward_change_std": 0.19664390292018652, "reward_std": 0.7266101948916912, "rewards/cosine_scaled_reward": -0.09124281164258718, "rewards/format_reward": 0.2500000074505806, "step": 162 }, { "advantage_max": 1.1875976473093033, "advantage_mean": -5.587935225648266e-09, "advantage_min": -0.6885306388139725, "advantage_std": 0.680039256811142, "completion_length": 2470.9167098999023, "epoch": 0.18628571428571428, "grad_norm": 0.12650761008262634, "kl": 0.02099609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.670853944836176e-07, "loss": 0.0351, "reward": 0.3515504002571106, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3515504002571106, "reward_after_std": 0.6800392419099808, "reward_before_mean": 0.7434308677911758, "reward_before_std": 0.6333009023219347, "reward_change_max": 0.0009451508522033691, "reward_change_mean": -0.3918804544955492, "reward_change_min": -0.6430389769375324, "reward_change_std": 0.2503257617354393, "reward_std": 0.6800392717123032, "rewards/cosine_scaled_reward": 0.06963208317756653, "rewards/format_reward": 0.6041666697710752, "step": 163 }, { "advantage_max": 1.4622886776924133, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.7638626024127007, "advantage_std": 0.8638998009264469, "completion_length": 2608.416748046875, "epoch": 0.18742857142857142, "grad_norm": 0.15157678723335266, "kl": 0.020416259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.648485032310144e-07, "loss": 0.0322, "reward": 0.284273668192327, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.284273668192327, "reward_after_std": 0.8638998009264469, "reward_before_mean": 0.6316687539219856, "reward_before_std": 0.8603082597255707, "reward_change_max": 0.001119077205657959, "reward_change_mean": -0.3473950671032071, "reward_change_min": -0.7313497699797153, "reward_change_std": 0.28116426430642605, "reward_std": 0.863899827003479, "rewards/cosine_scaled_reward": 0.04500104021281004, "rewards/format_reward": 0.5416666753590107, "step": 164 }, { "advantage_max": 0.951996460556984, "advantage_mean": 7.450581041013038e-09, "advantage_min": -0.650677315890789, "advantage_std": 0.5838065594434738, "completion_length": 3245.687530517578, "epoch": 0.18857142857142858, "grad_norm": 0.13272669911384583, "kl": 0.0304107666015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.625962667065487e-07, "loss": 0.0513, "reward": -0.19890925288200378, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19890925288200378, "reward_after_std": 0.583806574344635, "reward_before_mean": 0.013676023110747337, "reward_before_std": 0.6172264814376831, "reward_change_max": 0.0006950944662094116, "reward_change_mean": -0.2125852620229125, "reward_change_min": -0.4224776439368725, "reward_change_std": 0.17862440133467317, "reward_std": 0.5838065780699253, "rewards/cosine_scaled_reward": -0.1494120005518198, "rewards/format_reward": 0.31250001303851604, "step": 165 }, { "advantage_max": 1.505092702805996, "advantage_mean": -2.793967746050896e-08, "advantage_min": -1.0393745079636574, "advantage_std": 0.8885571658611298, "completion_length": 3017.354217529297, "epoch": 0.18971428571428572, "grad_norm": 0.16296008229255676, "kl": 0.0185394287109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.603287946810513e-07, "loss": 0.0341, "reward": 0.20471064187586308, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20471064187586308, "reward_after_std": 0.8885571695864201, "reward_before_mean": 0.5197281390428543, "reward_before_std": 0.9152417033910751, "reward_change_max": 0.0004853755235671997, "reward_change_mean": -0.3150175176560879, "reward_change_min": -0.5962655283510685, "reward_change_std": 0.2500441991724074, "reward_std": 0.8885571919381618, "rewards/cosine_scaled_reward": 0.030697400448843837, "rewards/format_reward": 0.45833334885537624, "step": 166 }, { "advantage_max": 1.3313522264361382, "advantage_mean": -1.0554989660072067e-08, "advantage_min": -0.8351566269993782, "advantage_std": 0.7960289977490902, "completion_length": 2318.3959197998047, "epoch": 0.19085714285714286, "grad_norm": 0.17425104975700378, "kl": 0.01657867431640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.580461976679099e-07, "loss": 0.0612, "reward": 0.26994994410779327, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26994994410779327, "reward_after_std": 0.7960290051996708, "reward_before_mean": 0.621154960244894, "reward_before_std": 0.8035038635134697, "reward_change_max": 0.0005308240652084351, "reward_change_mean": -0.3512050053104758, "reward_change_min": -0.643861211836338, "reward_change_std": 0.2611974119208753, "reward_std": 0.7960290424525738, "rewards/cosine_scaled_reward": -0.06442254222929478, "rewards/format_reward": 0.750000013038516, "step": 167 }, { "advantage_max": 1.5461714044213295, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.952774353325367, "advantage_std": 0.9442404918372631, "completion_length": 3180.6875610351562, "epoch": 0.192, "grad_norm": 0.17925553023815155, "kl": 0.02230072021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.557485869176825e-07, "loss": 0.0163, "reward": 0.18643908202648163, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18643908202648163, "reward_after_std": 0.9442404955625534, "reward_before_mean": 0.4941616393625736, "reward_before_std": 1.0038710720837116, "reward_change_max": 0.0011507794260978699, "reward_change_mean": -0.30772253684699535, "reward_change_min": -0.6709698811173439, "reward_change_std": 0.27951822336763144, "reward_std": 0.9442404992878437, "rewards/cosine_scaled_reward": -0.0029191900976002216, "rewards/format_reward": 0.5000000167638063, "step": 168 }, { "advantage_max": 1.2211386039853096, "advantage_mean": 8.69234451084111e-09, "advantage_min": -0.7467610165476799, "advantage_std": 0.7033423036336899, "completion_length": 2438.000015258789, "epoch": 0.19314285714285714, "grad_norm": 0.14280058443546295, "kl": 0.0272674560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.534360744126753e-07, "loss": 0.0521, "reward": 0.6013626717031002, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6013626717031002, "reward_after_std": 0.7033422887325287, "reward_before_mean": 1.0799045194871724, "reward_before_std": 0.6214714664965868, "reward_change_max": 0.0003162398934364319, "reward_change_mean": -0.47854181937873363, "reward_change_min": -0.7317886389791965, "reward_change_std": 0.29094003047794104, "reward_std": 0.7033423185348511, "rewards/cosine_scaled_reward": 0.22745224926620722, "rewards/format_reward": 0.6250000055879354, "step": 169 }, { "advantage_max": 1.2461431175470352, "advantage_mean": 3.414849292227018e-09, "advantage_min": -0.7611747309565544, "advantage_std": 0.7149195112287998, "completion_length": 2473.687557220459, "epoch": 0.19428571428571428, "grad_norm": 0.13153916597366333, "kl": 0.019500732421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.511087728614862e-07, "loss": 0.0595, "reward": 0.1332508558407426, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1332508558407426, "reward_after_std": 0.7149194926023483, "reward_before_mean": 0.44204724254086614, "reward_before_std": 0.7041205205023289, "reward_change_max": 0.00042998790740966797, "reward_change_mean": -0.30879637552425265, "reward_change_min": -0.5489484928548336, "reward_change_std": 0.22512452583760023, "reward_std": 0.7149195000529289, "rewards/cosine_scaled_reward": -0.028976373374462128, "rewards/format_reward": 0.5000000055879354, "step": 170 }, { "advantage_max": 1.4106125310063362, "advantage_mean": -3.72529057601767e-09, "advantage_min": -0.6953155249357224, "advantage_std": 0.795871002599597, "completion_length": 2869.562530517578, "epoch": 0.19542857142857142, "grad_norm": 0.18414172530174255, "kl": 0.02278900146484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.487667956935087e-07, "loss": 0.0576, "reward": 0.26462606340646744, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26462606340646744, "reward_after_std": 0.7958709988743067, "reward_before_mean": 0.6090709455311298, "reward_before_std": 0.7663719952106476, "reward_change_max": 0.0002579614520072937, "reward_change_mean": -0.34444486489519477, "reward_change_min": -0.5901921018958092, "reward_change_std": 0.2330690361559391, "reward_std": 0.7958710249513388, "rewards/cosine_scaled_reward": 0.05453545227646828, "rewards/format_reward": 0.5000000037252903, "step": 171 }, { "advantage_max": 1.0363451838493347, "advantage_mean": 3.1044083970144243e-09, "advantage_min": -0.6588342115283012, "advantage_std": 0.6231314353644848, "completion_length": 2926.729202270508, "epoch": 0.19657142857142856, "grad_norm": 0.10170703381299973, "kl": 0.030120849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.464102570534061e-07, "loss": 0.0193, "reward": 0.10751838982105255, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10751838982105255, "reward_after_std": 0.6231314353644848, "reward_before_mean": 0.42348906956613064, "reward_before_std": 0.6140627078711987, "reward_change_max": 0.00191400945186615, "reward_change_mean": -0.31597068533301353, "reward_change_min": -0.5915517024695873, "reward_change_std": 0.23089101910591125, "reward_std": 0.6231314614415169, "rewards/cosine_scaled_reward": 0.03466118685901165, "rewards/format_reward": 0.3541666679084301, "step": 172 }, { "advantage_max": 1.223799116909504, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.7045940980315208, "advantage_std": 0.7283288538455963, "completion_length": 2046.000057220459, "epoch": 0.1977142857142857, "grad_norm": 0.20791788399219513, "kl": 0.026142120361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.440392717955475e-07, "loss": 0.0616, "reward": 0.05291812680661678, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05291812680661678, "reward_after_std": 0.728328850120306, "reward_before_mean": 0.3344998322427273, "reward_before_std": 0.735675803385675, "reward_change_max": 0.0, "reward_change_mean": -0.28158169984817505, "reward_change_min": -0.5169616155326366, "reward_change_std": 0.22174399625509977, "reward_std": 0.7283288538455963, "rewards/cosine_scaled_reward": -0.13483343180269003, "rewards/format_reward": 0.6041666772216558, "step": 173 }, { "advantage_max": 1.531661294400692, "advantage_mean": -5.5879355587151736e-09, "advantage_min": -0.8348989337682724, "advantage_std": 0.8691375777125359, "completion_length": 2526.479248046875, "epoch": 0.19885714285714284, "grad_norm": 0.1942531168460846, "kl": 0.03050994873046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.416539554784089e-07, "loss": 0.0638, "reward": 0.21084141172468662, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21084141172468662, "reward_after_std": 0.8691376075148582, "reward_before_mean": 0.52743915701285, "reward_before_std": 0.8649193346500397, "reward_change_max": 0.0003266632556915283, "reward_change_mean": -0.3165977615863085, "reward_change_min": -0.617426909506321, "reward_change_std": 0.23885107226669788, "reward_std": 0.8691376447677612, "rewards/cosine_scaled_reward": -0.06961375381797552, "rewards/format_reward": 0.6666666753590107, "step": 174 }, { "advantage_max": 1.0626054927706718, "advantage_mean": 1.862645193639878e-08, "advantage_min": -0.7459142580628395, "advantage_std": 0.6619545966386795, "completion_length": 2799.4792098999023, "epoch": 0.2, "grad_norm": 0.11736467480659485, "kl": 0.0283355712890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.392544243589427e-07, "loss": 0.0262, "reward": 0.09813978523015976, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09813978523015976, "reward_after_std": 0.6619545966386795, "reward_before_mean": 0.4087598901242018, "reward_before_std": 0.6921274587512016, "reward_change_max": 0.0004105418920516968, "reward_change_mean": -0.31062008207663894, "reward_change_min": -0.5638699308037758, "reward_change_std": 0.22923887381330132, "reward_std": 0.6619546003639698, "rewards/cosine_scaled_reward": -0.024786731228232384, "rewards/format_reward": 0.4583333395421505, "step": 175 }, { "advantage_max": 1.7890079766511917, "advantage_mean": 2.483526384544632e-09, "advantage_min": -0.8193216100335121, "advantage_std": 1.009437695145607, "completion_length": 2720.041702270508, "epoch": 0.20114285714285715, "grad_norm": 0.18651027977466583, "kl": 0.03166961669921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.368407953869103e-07, "loss": 0.0233, "reward": 0.14203171245753765, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14203171245753765, "reward_after_std": 1.0094376914203167, "reward_before_mean": 0.4155056234449148, "reward_before_std": 1.0160624533891678, "reward_change_max": 0.001289263367652893, "reward_change_mean": -0.27347393333911896, "reward_change_min": -0.5660295896232128, "reward_change_std": 0.22895468026399612, "reward_std": 1.0094376988708973, "rewards/cosine_scaled_reward": -0.010997178498655558, "rewards/format_reward": 0.43750000558793545, "step": 176 }, { "advantage_max": 1.4360157921910286, "advantage_mean": -6.208816794028849e-10, "advantage_min": -1.009121149778366, "advantage_std": 0.9150974787771702, "completion_length": 2988.041763305664, "epoch": 0.2022857142857143, "grad_norm": 0.3150843679904938, "kl": 0.0362396240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.344131861991828e-07, "loss": 0.0791, "reward": 0.12510948814451694, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12510948814451694, "reward_after_std": 0.9150974527001381, "reward_before_mean": 0.418378489674069, "reward_before_std": 1.0042357444763184, "reward_change_max": 0.0010838136076927185, "reward_change_mean": -0.2932689841836691, "reward_change_min": -0.664687767624855, "reward_change_std": 0.28929333481937647, "reward_std": 0.9150974601507187, "rewards/cosine_scaled_reward": -0.061644104309380054, "rewards/format_reward": 0.5416666828095913, "step": 177 }, { "advantage_max": 1.5168294608592987, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.7548965439200401, "advantage_std": 0.8664248380810022, "completion_length": 2798.666702270508, "epoch": 0.20342857142857143, "grad_norm": 0.24158771336078644, "kl": 0.0391998291015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.319717151140072e-07, "loss": 0.0326, "reward": -0.05778668820858002, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.05778668820858002, "reward_after_std": 0.8664248511195183, "reward_before_mean": 0.1630261167883873, "reward_before_std": 0.8943422082811594, "reward_change_max": 0.005850210785865784, "reward_change_mean": -0.22081283433362842, "reward_change_min": -0.5684525799006224, "reward_change_std": 0.22119095316156745, "reward_std": 0.8664248883724213, "rewards/cosine_scaled_reward": -0.13723693694919348, "rewards/format_reward": 0.43750000931322575, "step": 178 }, { "advantage_max": 0.8692804127931595, "advantage_mean": 9.002784906453343e-09, "advantage_min": -0.47451937943696976, "advantage_std": 0.49049021303653717, "completion_length": 2842.000015258789, "epoch": 0.20457142857142857, "grad_norm": 0.08588196337223053, "kl": 0.0345916748046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.295165011252396e-07, "loss": 0.0121, "reward": -0.29055724292993546, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29055724292993546, "reward_after_std": 0.49049021676182747, "reward_before_mean": -0.10510284267365932, "reward_before_std": 0.4801676608622074, "reward_change_max": 0.0010400563478469849, "reward_change_mean": -0.18545439094305038, "reward_change_min": -0.3516765534877777, "reward_change_std": 0.14079185109585524, "reward_std": 0.49049022793769836, "rewards/cosine_scaled_reward": -0.24005142599344254, "rewards/format_reward": 0.37500000558793545, "step": 179 }, { "advantage_max": 1.318357888609171, "advantage_mean": -1.6763806898190126e-08, "advantage_min": -0.7594656124711037, "advantage_std": 0.7751675732433796, "completion_length": 2358.104190826416, "epoch": 0.2057142857142857, "grad_norm": 0.1223163828253746, "kl": 0.03548431396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.270476638965461e-07, "loss": 0.002, "reward": 0.38155335932970047, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38155335932970047, "reward_after_std": 0.7751675825566053, "reward_before_mean": 0.773796696215868, "reward_before_std": 0.7584128007292747, "reward_change_max": 0.0, "reward_change_mean": -0.3922433443367481, "reward_change_min": -0.7204612493515015, "reward_change_std": 0.273406270891428, "reward_std": 0.7751675937324762, "rewards/cosine_scaled_reward": 0.10564833006355911, "rewards/format_reward": 0.5625, "step": 180 }, { "advantage_max": 1.700867984443903, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.7429678663611412, "advantage_std": 0.9190853256732225, "completion_length": 3069.479202270508, "epoch": 0.20685714285714285, "grad_norm": 0.23546984791755676, "kl": 0.0411376953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.245653237555705e-07, "loss": 0.0573, "reward": -0.0617211596108973, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0617211596108973, "reward_after_std": 0.9190853126347065, "reward_before_mean": 0.1456823544576764, "reward_before_std": 0.9196916986256838, "reward_change_max": 0.0004614144563674927, "reward_change_mean": -0.20740348938852549, "reward_change_min": -0.4607136957347393, "reward_change_std": 0.18180597410537302, "reward_std": 0.9190853200852871, "rewards/cosine_scaled_reward": -0.09382550918962806, "rewards/format_reward": 0.3333333395421505, "step": 181 }, { "advantage_max": 1.8870112970471382, "advantage_mean": -1.6763806398589765e-08, "advantage_min": -0.9500059187412262, "advantage_std": 1.0961102209985256, "completion_length": 2501.4792251586914, "epoch": 0.208, "grad_norm": 0.21964824199676514, "kl": 0.02997589111328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.220696016880687e-07, "loss": 0.0521, "reward": 0.3104388937354088, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3104388937354088, "reward_after_std": 1.096110176295042, "reward_before_mean": 0.6356113087385893, "reward_before_std": 1.1282791681587696, "reward_change_max": 0.0012559443712234497, "reward_change_mean": -0.325172433629632, "reward_change_min": -0.7234682310372591, "reward_change_std": 0.28116371016949415, "reward_std": 1.0961102060973644, "rewards/cosine_scaled_reward": 0.015722323209047318, "rewards/format_reward": 0.6041666772216558, "step": 182 }, { "advantage_max": 1.426853645592928, "advantage_mean": -4.0357314046168824e-08, "advantage_min": -1.0862329080700874, "advantage_std": 0.9064907692372799, "completion_length": 2791.854217529297, "epoch": 0.20914285714285713, "grad_norm": 0.17481961846351624, "kl": 0.052337646484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.195606193320136e-07, "loss": 0.0374, "reward": 0.28753895533736795, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28753895533736795, "reward_after_std": 0.9064907692372799, "reward_before_mean": 0.6385138165205717, "reward_before_std": 0.9792434312403202, "reward_change_max": 0.0001685991883277893, "reward_change_mean": -0.35097489645704627, "reward_change_min": -0.6677650213241577, "reward_change_std": 0.2933631045743823, "reward_std": 0.9064908064901829, "rewards/cosine_scaled_reward": 0.027590231969952583, "rewards/format_reward": 0.5833333469927311, "step": 183 }, { "advantage_max": 1.0320013873279095, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.4724693186581135, "advantage_std": 0.5576813668012619, "completion_length": 2872.541679382324, "epoch": 0.2102857142857143, "grad_norm": 0.10614390671253204, "kl": 0.0408172607421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.170384989716657e-07, "loss": 0.0062, "reward": -0.20734488288871944, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20734488288871944, "reward_after_std": 0.5576813891530037, "reward_before_mean": -0.006254892796278, "reward_before_std": 0.5329896248877048, "reward_change_max": 0.0017844811081886292, "reward_change_mean": -0.20108998264186084, "reward_change_min": -0.34110090509057045, "reward_change_std": 0.13719749893061817, "reward_std": 0.5576814040541649, "rewards/cosine_scaled_reward": -0.21146078407764435, "rewards/format_reward": 0.4166666716337204, "step": 184 }, { "advantage_max": 1.0225131250917912, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.5672240667045116, "advantage_std": 0.5753951445221901, "completion_length": 2476.8125228881836, "epoch": 0.21142857142857144, "grad_norm": 0.11501786857843399, "kl": 0.0401763916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.145033635316128e-07, "loss": 0.0207, "reward": -0.16281145935499808, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16281145935499808, "reward_after_std": 0.575395142659545, "reward_before_mean": 0.05530272121541202, "reward_before_std": 0.5677776131778955, "reward_change_max": 0.00036994367837905884, "reward_change_mean": -0.21811419213190675, "reward_change_min": -0.407501645386219, "reward_change_std": 0.15843796357512474, "reward_std": 0.5753951575607061, "rewards/cosine_scaled_reward": -0.21193197183310986, "rewards/format_reward": 0.47916667722165585, "step": 185 }, { "advantage_max": 0.8590374700725079, "advantage_mean": -1.3659398168108794e-08, "advantage_min": -0.6779530048370361, "advantage_std": 0.5576737355440855, "completion_length": 3155.5833587646484, "epoch": 0.21257142857142858, "grad_norm": 0.12613937258720398, "kl": 0.048248291015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.119553365707802e-07, "loss": 0.0319, "reward": 0.11866102367639542, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11866102367639542, "reward_after_std": 0.5576737429946661, "reward_before_mean": 0.4515787363052368, "reward_before_std": 0.5622416902333498, "reward_change_max": 0.0005166977643966675, "reward_change_mean": -0.332917720079422, "reward_change_min": -0.5469591151922941, "reward_change_std": 0.23157448787242174, "reward_std": 0.5576737560331821, "rewards/cosine_scaled_reward": 0.04870602488517761, "rewards/format_reward": 0.3541666716337204, "step": 186 }, { "advantage_max": 1.1893529891967773, "advantage_mean": -1.1486312567754453e-08, "advantage_min": -0.7816543951630592, "advantage_std": 0.7066598571836948, "completion_length": 2754.4583892822266, "epoch": 0.21371428571428572, "grad_norm": 0.147071972489357, "kl": 0.0525665283203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.093945422764069e-07, "loss": 0.0109, "reward": 0.006238499656319618, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.006238499656319618, "reward_after_std": 0.7066598758101463, "reward_before_mean": 0.2741194274276495, "reward_before_std": 0.7243945486843586, "reward_change_max": 0.0003826245665550232, "reward_change_mean": -0.2678809203207493, "reward_change_min": -0.5089087933301926, "reward_change_std": 0.20938009303063154, "reward_std": 0.7066599018871784, "rewards/cosine_scaled_reward": -0.11294030770659447, "rewards/format_reward": 0.500000013038516, "step": 187 }, { "advantage_max": 0.9972609803080559, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.5697911977767944, "advantage_std": 0.5747736543416977, "completion_length": 3509.3958740234375, "epoch": 0.21485714285714286, "grad_norm": 0.15090584754943848, "kl": 0.058258056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.068211054579943e-07, "loss": 0.0266, "reward": -0.3646040167659521, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3646040167659521, "reward_after_std": 0.5747736543416977, "reward_before_mean": -0.2149992468766868, "reward_before_std": 0.5986794494092464, "reward_change_max": 0.0015026628971099854, "reward_change_mean": -0.14960476756095886, "reward_change_min": -0.3564014993607998, "reward_change_std": 0.14348152186721563, "reward_std": 0.5747736543416977, "rewards/cosine_scaled_reward": -0.1699996292591095, "rewards/format_reward": 0.1250000037252903, "step": 188 }, { "advantage_max": 0.8467955514788628, "advantage_mean": -1.4280279181377864e-08, "advantage_min": -0.6261894814670086, "advantage_std": 0.5325388088822365, "completion_length": 2987.8333740234375, "epoch": 0.216, "grad_norm": 0.146543949842453, "kl": 0.058349609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.04235151541222e-07, "loss": 0.0441, "reward": -0.036177659407258034, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.036177659407258034, "reward_after_std": 0.5325388237833977, "reward_before_mean": 0.24245697120204568, "reward_before_std": 0.5513169299811125, "reward_change_max": 0.0002681910991668701, "reward_change_mean": -0.27863461058586836, "reward_change_min": -0.5000833366066217, "reward_change_std": 0.20214266702532768, "reward_std": 0.5325388498604298, "rewards/cosine_scaled_reward": -0.08710487745702267, "rewards/format_reward": 0.41666667349636555, "step": 189 }, { "advantage_max": 1.4574809968471527, "advantage_mean": -3.104408619059029e-09, "advantage_min": -0.7873755618929863, "advantage_std": 0.8542032577097416, "completion_length": 2575.3333740234375, "epoch": 0.21714285714285714, "grad_norm": 0.1981331706047058, "kl": 0.0491943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.01636806561836e-07, "loss": 0.0212, "reward": 0.0914838039316237, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0914838039316237, "reward_after_std": 0.8542032763361931, "reward_before_mean": 0.3697546496987343, "reward_before_std": 0.8832090497016907, "reward_change_max": 0.0066404566168785095, "reward_change_mean": -0.27827086206525564, "reward_change_min": -0.6325997523963451, "reward_change_std": 0.25706900004297495, "reward_std": 0.8542033135890961, "rewards/cosine_scaled_reward": -0.044289345387369394, "rewards/format_reward": 0.45833334513008595, "step": 190 }, { "advantage_max": 1.5159916803240776, "advantage_mean": -6.208814573582799e-10, "advantage_min": -0.6861558184027672, "advantage_std": 0.8327911645174026, "completion_length": 2696.5625534057617, "epoch": 0.21828571428571428, "grad_norm": 0.20235764980316162, "kl": 0.05841064453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.990261971595048e-07, "loss": 0.0478, "reward": 0.15532669192180037, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15532669192180037, "reward_after_std": 0.832791157066822, "reward_before_mean": 0.4526620793621987, "reward_before_std": 0.8022483550012112, "reward_change_max": 0.001082099974155426, "reward_change_mean": -0.2973353751003742, "reward_change_min": -0.5558175276964903, "reward_change_std": 0.21507473941892385, "reward_std": 0.8327912017703056, "rewards/cosine_scaled_reward": -0.013252315111458302, "rewards/format_reward": 0.47916667349636555, "step": 191 }, { "advantage_max": 1.1179408133029938, "advantage_mean": 2.048909675256283e-08, "advantage_min": -0.8805400393903255, "advantage_std": 0.757763747125864, "completion_length": 3161.0625610351562, "epoch": 0.21942857142857142, "grad_norm": 0.3063293993473053, "kl": 0.057159423828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.964034505716476e-07, "loss": 0.0728, "reward": -0.053129157051444054, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.053129157051444054, "reward_after_std": 0.7577637508511543, "reward_before_mean": 0.19890626333653927, "reward_before_std": 0.856065109372139, "reward_change_max": 0.0003703683614730835, "reward_change_mean": -0.252035410143435, "reward_change_min": -0.5377420820295811, "reward_change_std": 0.25206225644797087, "reward_std": 0.7577637806534767, "rewards/cosine_scaled_reward": -0.07763020880520344, "rewards/format_reward": 0.35416667722165585, "step": 192 }, { "advantage_max": 1.110087014734745, "advantage_mean": 2.1420419549222913e-08, "advantage_min": -0.5900781787931919, "advantage_std": 0.634924691170454, "completion_length": 3080.9583587646484, "epoch": 0.22057142857142858, "grad_norm": 0.16752228140830994, "kl": 0.057342529296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.93768694627233e-07, "loss": 0.0294, "reward": -0.13575546815991402, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13575546815991402, "reward_after_std": 0.6349246874451637, "reward_before_mean": 0.08614366129040718, "reward_before_std": 0.6371559798717499, "reward_change_max": 0.0006859153509140015, "reward_change_mean": -0.2218991070985794, "reward_change_min": -0.45419899746775627, "reward_change_std": 0.1739786909893155, "reward_std": 0.634924691170454, "rewards/cosine_scaled_reward": -0.11317817401140928, "rewards/format_reward": 0.3125000074505806, "step": 193 }, { "advantage_max": 1.5593114458024502, "advantage_mean": -1.490116136038111e-08, "advantage_min": -0.960111491382122, "advantage_std": 0.9485115651041269, "completion_length": 3000.916748046875, "epoch": 0.22171428571428572, "grad_norm": 0.26778778433799744, "kl": 0.059417724609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.911220577405484e-07, "loss": 0.0141, "reward": 0.29285904578864574, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29285904578864574, "reward_after_std": 0.9485115520656109, "reward_before_mean": 0.6367957415059209, "reward_before_std": 1.0004472993314266, "reward_change_max": 0.001190371811389923, "reward_change_mean": -0.34393669478595257, "reward_change_min": -0.7738593406975269, "reward_change_std": 0.2969193672761321, "reward_std": 0.9485115893185139, "rewards/cosine_scaled_reward": 0.07881453260779381, "rewards/format_reward": 0.479166679084301, "step": 194 }, { "advantage_max": 1.4225033074617386, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.8249565176665783, "advantage_std": 0.8413193933665752, "completion_length": 2926.625045776367, "epoch": 0.22285714285714286, "grad_norm": 0.36274176836013794, "kl": 0.064544677734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.884636689049422e-07, "loss": 0.0554, "reward": 0.007098935544490814, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.007098935544490814, "reward_after_std": 0.841319415718317, "reward_before_mean": 0.25825855135917664, "reward_before_std": 0.8795899786055088, "reward_change_max": 0.00033611059188842773, "reward_change_mean": -0.2511596102267504, "reward_change_min": -0.5563398413360119, "reward_change_std": 0.22679788246750832, "reward_std": 0.8413194417953491, "rewards/cosine_scaled_reward": -0.058370741084218025, "rewards/format_reward": 0.3750000037252903, "step": 195 }, { "advantage_max": 0.9352176859974861, "advantage_mean": -1.8626448716752009e-09, "advantage_min": -0.5650050267577171, "advantage_std": 0.5551619492471218, "completion_length": 3204.750030517578, "epoch": 0.224, "grad_norm": 0.14659136533737183, "kl": 0.0740966796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.857936576865356e-07, "loss": 0.0226, "reward": -0.08506974019110203, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08506974019110203, "reward_after_std": 0.5551619343459606, "reward_before_mean": 0.1700500212609768, "reward_before_std": 0.5611773282289505, "reward_change_max": 0.00023402273654937744, "reward_change_mean": -0.25511975586414337, "reward_change_min": -0.49135322496294975, "reward_change_std": 0.18783819722011685, "reward_std": 0.5551619455218315, "rewards/cosine_scaled_reward": -0.07122499728575349, "rewards/format_reward": 0.3125000037252903, "step": 196 }, { "advantage_max": 1.9159402027726173, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.9467073902487755, "advantage_std": 1.1410527899861336, "completion_length": 2238.041702270508, "epoch": 0.22514285714285714, "grad_norm": 0.42249488830566406, "kl": 0.0753173828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.831121542179086e-07, "loss": 0.0565, "reward": 0.2458451751153916, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2458451751153916, "reward_after_std": 1.141052782535553, "reward_before_mean": 0.5444424021989107, "reward_before_std": 1.2092568203806877, "reward_change_max": 0.0008358433842658997, "reward_change_mean": -0.2985972370952368, "reward_change_min": -0.7867999076843262, "reward_change_std": 0.30800035782158375, "reward_std": 1.1410528048872948, "rewards/cosine_scaled_reward": -0.009028811939060688, "rewards/format_reward": 0.5625000093132257, "step": 197 }, { "advantage_max": 1.5482355654239655, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.7490741685032845, "advantage_std": 0.8406277522444725, "completion_length": 2701.500045776367, "epoch": 0.22628571428571428, "grad_norm": 0.2042098492383957, "kl": 0.091156005859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.804192891917571e-07, "loss": 0.024, "reward": -0.07515976205468178, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07515976205468178, "reward_after_std": 0.8406277596950531, "reward_before_mean": 0.13804511167109013, "reward_before_std": 0.8344271630048752, "reward_change_max": 0.0003175213932991028, "reward_change_mean": -0.21320487465709448, "reward_change_min": -0.4269953351467848, "reward_change_std": 0.17606920842081308, "reward_std": 0.8406277745962143, "rewards/cosine_scaled_reward": -0.07681078463792801, "rewards/format_reward": 0.2916666716337204, "step": 198 }, { "advantage_max": 1.396604772657156, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.7665940597653389, "advantage_std": 0.8094251714646816, "completion_length": 2494.6875381469727, "epoch": 0.22742857142857142, "grad_norm": 0.2061101496219635, "kl": 0.09295654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.777151938545235e-07, "loss": 0.0141, "reward": 0.0012609101831912994, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0012609101831912994, "reward_after_std": 0.8094251602888107, "reward_before_mean": 0.2516523525118828, "reward_before_std": 0.8308322429656982, "reward_change_max": 0.0001317635178565979, "reward_change_mean": -0.25039147958159447, "reward_change_min": -0.5647806152701378, "reward_change_std": 0.22136891726404428, "reward_std": 0.8094251789152622, "rewards/cosine_scaled_reward": -0.04084047582000494, "rewards/format_reward": 0.33333333767950535, "step": 199 }, { "advantage_max": 1.399455588310957, "advantage_mean": -7.450580929990736e-09, "advantage_min": -0.6911447197198868, "advantage_std": 0.7602925859391689, "completion_length": 2492.104232788086, "epoch": 0.22857142857142856, "grad_norm": 0.23204493522644043, "kl": 0.094390869140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.75e-07, "loss": 0.0048, "reward": 0.3572434112429619, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3572434112429619, "reward_after_std": 0.7602925859391689, "reward_before_mean": 0.7317504649981856, "reward_before_std": 0.6808374896645546, "reward_change_max": 0.00037054717540740967, "reward_change_mean": -0.3745070155709982, "reward_change_min": -0.5897736884653568, "reward_change_std": 0.23156773671507835, "reward_std": 0.760292612016201, "rewards/cosine_scaled_reward": 0.07420855306554586, "rewards/format_reward": 0.583333345130086, "step": 200 }, { "advantage_max": 1.662718866020441, "advantage_mean": -1.7384688910659918e-08, "advantage_min": -0.8902830928564072, "advantage_std": 0.9579394981265068, "completion_length": 2042.0625305175781, "epoch": 0.2297142857142857, "grad_norm": 0.18706873059272766, "kl": 0.0697021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.72273839962904e-07, "loss": 0.0017, "reward": 0.5458131283521652, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5458131283521652, "reward_after_std": 0.9579394981265068, "reward_before_mean": 0.973633773624897, "reward_before_std": 0.9209330566227436, "reward_change_max": 0.0014783218502998352, "reward_change_mean": -0.42782066529616714, "reward_change_min": -0.7845392525196075, "reward_change_std": 0.3085333174094558, "reward_std": 0.9579395055770874, "rewards/cosine_scaled_reward": 0.25765021913684905, "rewards/format_reward": 0.4583333432674408, "step": 201 }, { "advantage_max": 1.3534116931259632, "advantage_mean": 3.1044083970144243e-09, "advantage_min": -0.6152248904109001, "advantage_std": 0.7239951826632023, "completion_length": 2311.1458587646484, "epoch": 0.23085714285714284, "grad_norm": 0.18341876566410065, "kl": 0.10516357421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.695368466124296e-07, "loss": -0.0024, "reward": 0.5883260769769549, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5883260769769549, "reward_after_std": 0.7239951677620411, "reward_before_mean": 1.052895911037922, "reward_before_std": 0.5883917305618525, "reward_change_max": 0.0004690214991569519, "reward_change_mean": -0.46456983499228954, "reward_change_min": -0.7253349907696247, "reward_change_std": 0.27589836064726114, "reward_std": 0.7239951826632023, "rewards/cosine_scaled_reward": 0.22436463087797165, "rewards/format_reward": 0.6041666679084301, "step": 202 }, { "advantage_max": 1.304409470409155, "advantage_mean": -1.6763806509612067e-08, "advantage_min": -0.656681727617979, "advantage_std": 0.7154108583927155, "completion_length": 2858.5833892822266, "epoch": 0.232, "grad_norm": 0.318823903799057, "kl": 0.112030029296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.667891533457718e-07, "loss": 0.0137, "reward": -0.048699749168008566, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.048699749168008566, "reward_after_std": 0.7154108472168446, "reward_before_mean": 0.19068025797605515, "reward_before_std": 0.6872757263481617, "reward_change_max": 0.0018011406064033508, "reward_change_mean": -0.23938002390787005, "reward_change_min": -0.47005924582481384, "reward_change_std": 0.1845587631687522, "reward_std": 0.7154108621180058, "rewards/cosine_scaled_reward": -0.040076554752886295, "rewards/format_reward": 0.2708333395421505, "step": 203 }, { "advantage_max": 1.1338126733899117, "advantage_mean": -1.0865429111994729e-09, "advantage_min": -0.7272412367165089, "advantage_std": 0.6677736900746822, "completion_length": 2282.937545776367, "epoch": 0.23314285714285715, "grad_norm": 0.22631201148033142, "kl": 0.110076904296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.640308940816239e-07, "loss": 0.0228, "reward": 0.19898322504013777, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19898322504013777, "reward_after_std": 0.6677736863493919, "reward_before_mean": 0.5331610734574497, "reward_before_std": 0.6481306254863739, "reward_change_max": 0.0, "reward_change_mean": -0.3341778600588441, "reward_change_min": -0.5623057503253222, "reward_change_std": 0.21866612136363983, "reward_std": 0.6677737049758434, "rewards/cosine_scaled_reward": -0.07716945745050907, "rewards/format_reward": 0.6875000149011612, "step": 204 }, { "advantage_max": 1.262697447091341, "advantage_mean": -3.290673217248852e-08, "advantage_min": -0.8431374989449978, "advantage_std": 0.7658955343067646, "completion_length": 2390.2083740234375, "epoch": 0.2342857142857143, "grad_norm": 0.26258933544158936, "kl": 0.10296630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.612622032536507e-07, "loss": 0.0023, "reward": 0.35523027554154396, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.35523027554154396, "reward_after_std": 0.7658955492079258, "reward_before_mean": 0.742044972255826, "reward_before_std": 0.7611411139369011, "reward_change_max": 0.0, "reward_change_mean": -0.38681469298899174, "reward_change_min": -0.6838057711720467, "reward_change_std": 0.27394791319966316, "reward_std": 0.765895564109087, "rewards/cosine_scaled_reward": 0.12102247402071953, "rewards/format_reward": 0.5000000111758709, "step": 205 }, { "advantage_max": 1.3321489915251732, "advantage_mean": -1.8626450382086546e-09, "advantage_min": -0.794492594897747, "advantage_std": 0.8248475287109613, "completion_length": 3009.8542098999023, "epoch": 0.23542857142857143, "grad_norm": 0.3896113932132721, "kl": 0.12060546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.584832158039378e-07, "loss": 0.0297, "reward": -0.004749797284603119, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.004749797284603119, "reward_after_std": 0.8248475547879934, "reward_before_mean": 0.24687214195728302, "reward_before_std": 0.88743188790977, "reward_change_max": 0.0006934329867362976, "reward_change_mean": -0.2516219327226281, "reward_change_min": -0.6339543778449297, "reward_change_std": 0.2461548363789916, "reward_std": 0.8248475603759289, "rewards/cosine_scaled_reward": -0.05364727135747671, "rewards/format_reward": 0.35416666977107525, "step": 206 }, { "advantage_max": 1.607744850218296, "advantage_mean": 2.483527050678447e-09, "advantage_min": -0.7184270024299622, "advantage_std": 0.8869195282459259, "completion_length": 3170.291717529297, "epoch": 0.23657142857142857, "grad_norm": 0.366899698972702, "kl": 0.16107177734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.556940671764124e-07, "loss": 0.0036, "reward": -0.1947600757703185, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1947600757703185, "reward_after_std": 0.8869195394217968, "reward_before_mean": -0.028473446145653725, "reward_before_std": 0.915056474506855, "reward_change_max": 0.0021965429186820984, "reward_change_mean": -0.16628664545714855, "reward_change_min": -0.46696799620985985, "reward_change_std": 0.18509722780436277, "reward_std": 0.8869195394217968, "rewards/cosine_scaled_reward": -0.170486721675843, "rewards/format_reward": 0.3125000037252903, "step": 207 }, { "advantage_max": 1.1644534580409527, "advantage_mean": -1.73846881335038e-08, "advantage_min": -0.7889424115419388, "advantage_std": 0.7400659993290901, "completion_length": 2515.979232788086, "epoch": 0.2377142857142857, "grad_norm": 0.6642101407051086, "kl": 0.11297607421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.528948933102438e-07, "loss": 0.0714, "reward": 0.046982649713754654, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.046982649713754654, "reward_after_std": 0.7400659881532192, "reward_before_mean": 0.3288039155304432, "reward_before_std": 0.7989055551588535, "reward_change_max": 0.0010460317134857178, "reward_change_mean": -0.2818212900310755, "reward_change_min": -0.5972126051783562, "reward_change_std": 0.2474859021604061, "reward_std": 0.7400660440325737, "rewards/cosine_scaled_reward": -0.06476471698260866, "rewards/format_reward": 0.4583333469927311, "step": 208 }, { "advantage_max": 1.2763051986694336, "advantage_mean": 1.1175870950896893e-08, "advantage_min": -0.7047868482768536, "advantage_std": 0.7354258019477129, "completion_length": 2538.6041946411133, "epoch": 0.23885714285714285, "grad_norm": 0.25784793496131897, "kl": 0.1322021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.500858306332172e-07, "loss": 0.002, "reward": 0.23811393603682518, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23811393603682518, "reward_after_std": 0.7354257944971323, "reward_before_mean": 0.5821687076240778, "reward_before_std": 0.7137798797339201, "reward_change_max": 0.0, "reward_change_mean": -0.3440546961501241, "reward_change_min": -0.6435936130583286, "reward_change_std": 0.24290089262649417, "reward_std": 0.735425828024745, "rewards/cosine_scaled_reward": 0.06191765144467354, "rewards/format_reward": 0.4583333432674408, "step": 209 }, { "advantage_max": 1.183098427951336, "advantage_mean": -4.967053435223079e-09, "advantage_min": -0.6418071016669273, "advantage_std": 0.7025355864316225, "completion_length": 2545.8333740234375, "epoch": 0.24, "grad_norm": 0.40859097242355347, "kl": 0.1146240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.472670160550848e-07, "loss": 0.0745, "reward": 0.05246494244784117, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05246494244784117, "reward_after_std": 0.7025356087833643, "reward_before_mean": 0.3357812147587538, "reward_before_std": 0.7116394340991974, "reward_change_max": 0.0008187666535377502, "reward_change_mean": -0.28331627510488033, "reward_change_min": -0.5772730372846127, "reward_change_std": 0.23112040758132935, "reward_std": 0.7025356367230415, "rewards/cosine_scaled_reward": -0.019609388895332813, "rewards/format_reward": 0.37500000186264515, "step": 210 }, { "advantage_max": 1.1921544596552849, "advantage_mean": -8.692344122263052e-09, "advantage_min": -0.6380018964409828, "advantage_std": 0.6864985190331936, "completion_length": 2406.062545776367, "epoch": 0.24114285714285713, "grad_norm": 0.25473645329475403, "kl": 0.1444091796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.444385869608921e-07, "loss": 0.0218, "reward": 0.10756439715623856, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10756439715623856, "reward_after_std": 0.6864985190331936, "reward_before_mean": 0.4096587970852852, "reward_before_std": 0.6774689964950085, "reward_change_max": 0.00046136975288391113, "reward_change_mean": -0.30209438502788544, "reward_change_min": -0.5448434054851532, "reward_change_std": 0.21998351905494928, "reward_std": 0.6864985190331936, "rewards/cosine_scaled_reward": -0.03475394658744335, "rewards/format_reward": 0.47916666977107525, "step": 211 }, { "advantage_max": 1.1086622849106789, "advantage_mean": -3.414849569782774e-09, "advantage_min": -0.5913215838372707, "advantage_std": 0.6221080049872398, "completion_length": 2567.5833587646484, "epoch": 0.2422857142857143, "grad_norm": 0.18780724704265594, "kl": 0.151611328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.416006812042827e-07, "loss": 0.0246, "reward": 0.13817980530438945, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13817980530438945, "reward_after_std": 0.6221080236136913, "reward_before_mean": 0.4593674587085843, "reward_before_std": 0.5720679853111506, "reward_change_max": 0.0013311505317687988, "reward_change_mean": -0.32118767499923706, "reward_change_min": -0.5473824627697468, "reward_change_std": 0.21084042405709624, "reward_std": 0.6221080496907234, "rewards/cosine_scaled_reward": -0.009899599011987448, "rewards/format_reward": 0.47916666977107525, "step": 212 }, { "advantage_max": 1.4852821864187717, "advantage_mean": -8.071462331837864e-09, "advantage_min": -0.7190396524965763, "advantage_std": 0.8180091008543968, "completion_length": 2564.9375610351562, "epoch": 0.24342857142857144, "grad_norm": 0.9091688990592957, "kl": 0.189697265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.387534371007797e-07, "loss": -0.0705, "reward": 0.2521779127418995, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2521779127418995, "reward_after_std": 0.8180091008543968, "reward_before_mean": 0.5849970206618309, "reward_before_std": 0.7709748446941376, "reward_change_max": 0.0, "reward_change_mean": -0.3328191004693508, "reward_change_min": -0.5791736077517271, "reward_change_std": 0.22523777186870575, "reward_std": 0.8180091418325901, "rewards/cosine_scaled_reward": 0.01124850008636713, "rewards/format_reward": 0.5625000074505806, "step": 213 }, { "advantage_max": 1.2928205132484436, "advantage_mean": -1.7384688688615313e-08, "advantage_min": -0.85975431650877, "advantage_std": 0.782087666913867, "completion_length": 2792.7500534057617, "epoch": 0.24457142857142858, "grad_norm": 0.2622942328453064, "kl": 0.156494140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.358969934210438e-07, "loss": 0.0108, "reward": 0.09292547777295113, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09292547777295113, "reward_after_std": 0.7820876855403185, "reward_before_mean": 0.38339292258024216, "reward_before_std": 0.8171257805079222, "reward_change_max": 0.00044471025466918945, "reward_change_mean": -0.2904674874152988, "reward_change_min": -0.5845286399126053, "reward_change_std": 0.2391545344144106, "reward_std": 0.7820877321064472, "rewards/cosine_scaled_reward": -0.04788686567917466, "rewards/format_reward": 0.47916667722165585, "step": 214 }, { "advantage_max": 1.0478114522993565, "advantage_mean": 1.3659397835041887e-08, "advantage_min": -0.7259840816259384, "advantage_std": 0.6186531595885754, "completion_length": 2528.7500381469727, "epoch": 0.24571428571428572, "grad_norm": 0.20343047380447388, "kl": 0.15179443359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.330314893841101e-07, "loss": 0.0208, "reward": -0.10817716736346483, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10817716736346483, "reward_after_std": 0.6186531558632851, "reward_before_mean": 0.12496000435203314, "reward_before_std": 0.632936142385006, "reward_change_max": 0.0008758828043937683, "reward_change_mean": -0.23313714284449816, "reward_change_min": -0.43077932484447956, "reward_change_std": 0.17999738734215498, "reward_std": 0.6186531707644463, "rewards/cosine_scaled_reward": -0.17710335180163383, "rewards/format_reward": 0.47916667722165585, "step": 215 }, { "advantage_max": 1.538545936346054, "advantage_mean": -3.10440865236572e-08, "advantage_min": -0.8211144357919693, "advantage_std": 0.88631546869874, "completion_length": 2307.1250534057617, "epoch": 0.24685714285714286, "grad_norm": 0.3738247752189636, "kl": 0.1524658203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.301570646506027e-07, "loss": 0.0444, "reward": 0.2975668590515852, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2975668590515852, "reward_after_std": 0.8863154835999012, "reward_before_mean": 0.6430559878936037, "reward_before_std": 0.883563507348299, "reward_change_max": 0.000818595290184021, "reward_change_mean": -0.3454891378059983, "reward_change_min": -0.7173335365951061, "reward_change_std": 0.2694939011707902, "reward_std": 0.8863155096769333, "rewards/cosine_scaled_reward": -0.011805359274148941, "rewards/format_reward": 0.6666666697710752, "step": 216 }, { "advantage_max": 1.8591451607644558, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.8041123300790787, "advantage_std": 1.0541826523840427, "completion_length": 2986.7500610351562, "epoch": 0.248, "grad_norm": 0.3932034969329834, "kl": 0.18426513671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.27273859315928e-07, "loss": 0.0544, "reward": 0.04206418804824352, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04206418804824352, "reward_after_std": 1.054182630032301, "reward_before_mean": 0.275969285517931, "reward_before_std": 1.1014858186244965, "reward_change_max": 0.00034668296575546265, "reward_change_mean": -0.2339051030576229, "reward_change_min": -0.6030337251722813, "reward_change_std": 0.23021453525871038, "reward_std": 1.0541826710104942, "rewards/cosine_scaled_reward": -0.05993202514946461, "rewards/format_reward": 0.3958333358168602, "step": 217 }, { "advantage_max": 1.7612306363880634, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.782971628010273, "advantage_std": 0.9692483730614185, "completion_length": 2740.041732788086, "epoch": 0.24914285714285714, "grad_norm": 0.3887059688568115, "kl": 0.14617919921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.243820139034464e-07, "loss": 0.034, "reward": -0.06596058112336323, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06596058112336323, "reward_after_std": 0.9692483730614185, "reward_before_mean": 0.13316884357482195, "reward_before_std": 0.9875393584370613, "reward_change_max": 0.0006215497851371765, "reward_change_mean": -0.19912943989038467, "reward_change_min": -0.5118247009813786, "reward_change_std": 0.19447639770805836, "reward_std": 0.9692483954131603, "rewards/cosine_scaled_reward": -0.12091558671090752, "rewards/format_reward": 0.37500000931322575, "step": 218 }, { "advantage_max": 1.399564553052187, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.8181595951318741, "advantage_std": 0.8158378414809704, "completion_length": 2502.354217529297, "epoch": 0.2502857142857143, "grad_norm": 0.30666327476501465, "kl": 0.157745361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.214816693576234e-07, "loss": 0.0188, "reward": 0.2214650847017765, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2214650847017765, "reward_after_std": 0.8158378321677446, "reward_before_mean": 0.550739474594593, "reward_before_std": 0.8254678416997194, "reward_change_max": 0.0006307139992713928, "reward_change_mean": -0.3292743950150907, "reward_change_min": -0.6367195174098015, "reward_change_std": 0.25280233519151807, "reward_std": 0.8158378712832928, "rewards/cosine_scaled_reward": -0.01629692828282714, "rewards/format_reward": 0.5833333414047956, "step": 219 }, { "advantage_max": 0.5982950031757355, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.33232397958636284, "advantage_std": 0.3459637016057968, "completion_length": 2903.270866394043, "epoch": 0.25142857142857145, "grad_norm": 0.18245455622673035, "kl": 0.2322998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.185729670371604e-07, "loss": 0.0284, "reward": -0.3950345846824348, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3950345846824348, "reward_after_std": 0.34596370719373226, "reward_before_mean": -0.23216606117784977, "reward_before_std": 0.3361614188179374, "reward_change_max": 0.0, "reward_change_mean": -0.16286852350458503, "reward_change_min": -0.3166360780596733, "reward_change_std": 0.11497970344498754, "reward_std": 0.34596371836960316, "rewards/cosine_scaled_reward": -0.26191636361181736, "rewards/format_reward": 0.291666679084301, "step": 220 }, { "advantage_max": 1.3345818668603897, "advantage_mean": -1.73846881335038e-08, "advantage_min": -0.7682487592101097, "advantage_std": 0.7680236846208572, "completion_length": 2026.5000267028809, "epoch": 0.25257142857142856, "grad_norm": 0.3813193142414093, "kl": 0.136322021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.156560487081051e-07, "loss": 0.0055, "reward": 0.2768420181237161, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2768420181237161, "reward_after_std": 0.7680236846208572, "reward_before_mean": 0.6301960237324238, "reward_before_std": 0.7425558120012283, "reward_change_max": 0.0014693215489387512, "reward_change_mean": -0.353354025632143, "reward_change_min": -0.6284851208329201, "reward_change_std": 0.24855860322713852, "reward_std": 0.7680237218737602, "rewards/cosine_scaled_reward": 0.023431332781910896, "rewards/format_reward": 0.5833333488553762, "step": 221 }, { "advantage_max": 1.325666181743145, "advantage_mean": 6.829699250587851e-09, "advantage_min": -0.7518243491649628, "advantage_std": 0.7725033760070801, "completion_length": 2567.8958740234375, "epoch": 0.2537142857142857, "grad_norm": 0.3199946880340576, "kl": 0.1875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.127310565369415e-07, "loss": 0.022, "reward": 0.22519738972187042, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22519738972187042, "reward_after_std": 0.7725033611059189, "reward_before_mean": 0.5621007736772299, "reward_before_std": 0.7645175494253635, "reward_change_max": 0.0003986433148384094, "reward_change_mean": -0.33690338023006916, "reward_change_min": -0.6237695887684822, "reward_change_std": 0.24479289446026087, "reward_std": 0.7725033946335316, "rewards/cosine_scaled_reward": -0.03144961781799793, "rewards/format_reward": 0.6250000074505806, "step": 222 }, { "advantage_max": 1.14262056350708, "advantage_mean": 9.93410742555767e-09, "advantage_min": -0.8323545679450035, "advantage_std": 0.731819149106741, "completion_length": 2596.187545776367, "epoch": 0.25485714285714284, "grad_norm": 0.33422207832336426, "kl": 0.175048828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.097981330836616e-07, "loss": 0.0487, "reward": 0.05562268290668726, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05562268290668726, "reward_after_std": 0.7318191528320312, "reward_before_mean": 0.34569063875824213, "reward_before_std": 0.793747067451477, "reward_change_max": 0.0011824890971183777, "reward_change_mean": -0.29006795305758715, "reward_change_min": -0.6069894656538963, "reward_change_std": 0.24760264065116644, "reward_std": 0.7318191714584827, "rewards/cosine_scaled_reward": -0.0771546857431531, "rewards/format_reward": 0.5000000111758709, "step": 223 }, { "advantage_max": 1.6740315780043602, "advantage_mean": 4.346172532976311e-09, "advantage_min": -0.9070464447140694, "advantage_std": 0.9623745940625668, "completion_length": 2658.2084045410156, "epoch": 0.256, "grad_norm": 0.6894913911819458, "kl": 0.1593170166015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.068574212948169e-07, "loss": 0.0743, "reward": 0.15881489496678114, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15881489496678114, "reward_after_std": 0.9623745791614056, "reward_before_mean": 0.44474709406495094, "reward_before_std": 0.9890395030379295, "reward_change_max": 0.0003013685345649719, "reward_change_mean": -0.28593218978494406, "reward_change_min": -0.645022090524435, "reward_change_std": 0.25701585691422224, "reward_std": 0.9623746164143085, "rewards/cosine_scaled_reward": -0.006793119246140122, "rewards/format_reward": 0.4583333469927311, "step": 224 }, { "advantage_max": 1.655724935233593, "advantage_mean": -1.1175871006408045e-08, "advantage_min": -1.0246229209005833, "advantage_std": 1.0299497470259666, "completion_length": 2939.6459197998047, "epoch": 0.2571428571428571, "grad_norm": 0.5554704666137695, "kl": 0.2054443359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.039090644965509e-07, "loss": 0.0487, "reward": 0.27755059860646725, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27755059860646725, "reward_after_std": 1.0299497283995152, "reward_before_mean": 0.6106989241670817, "reward_before_std": 1.107281219214201, "reward_change_max": 0.00046384334564208984, "reward_change_mean": -0.3331483481451869, "reward_change_min": -0.7214470133185387, "reward_change_std": 0.3054943010210991, "reward_std": 1.0299497619271278, "rewards/cosine_scaled_reward": 0.03451612964272499, "rewards/format_reward": 0.5416666772216558, "step": 225 }, { "advantage_max": 1.139162603765726, "advantage_mean": -6.829699306099002e-09, "advantage_min": -0.6521196067333221, "advantage_std": 0.6465449631214142, "completion_length": 2743.2084045410156, "epoch": 0.2582857142857143, "grad_norm": 0.24690324068069458, "kl": 0.2142333984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.009532063876148e-07, "loss": 0.0231, "reward": 0.33282162994146347, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.33282162994146347, "reward_after_std": 0.6465449705719948, "reward_before_mean": 0.7200730703771114, "reward_before_std": 0.5950763653963804, "reward_change_max": 0.00016529858112335205, "reward_change_mean": -0.3872514390386641, "reward_change_min": -0.6319355145096779, "reward_change_std": 0.23841121140867472, "reward_std": 0.6465449780225754, "rewards/cosine_scaled_reward": 0.07878652843646705, "rewards/format_reward": 0.5625000074505806, "step": 226 }, { "advantage_max": 1.3981711119413376, "advantage_mean": 7.450580874479584e-09, "advantage_min": -0.8407816961407661, "advantage_std": 0.8285619355738163, "completion_length": 2509.7500762939453, "epoch": 0.25942857142857145, "grad_norm": 0.4142477810382843, "kl": 0.22259521484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.979899910323624e-07, "loss": 0.015, "reward": 0.16833718493580818, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16833718493580818, "reward_after_std": 0.8285619467496872, "reward_before_mean": 0.4775360394269228, "reward_before_std": 0.8386373966932297, "reward_change_max": 0.0, "reward_change_mean": -0.30919882375746965, "reward_change_min": -0.5947811640799046, "reward_change_std": 0.2422353681176901, "reward_std": 0.8285619504749775, "rewards/cosine_scaled_reward": -0.0633153374074027, "rewards/format_reward": 0.6041666772216558, "step": 227 }, { "advantage_max": 1.0023160874843597, "advantage_mean": -4.967053435223079e-09, "advantage_min": -0.7809347063302994, "advantage_std": 0.6344969868659973, "completion_length": 2511.3750228881836, "epoch": 0.26057142857142856, "grad_norm": 0.21875105798244476, "kl": 0.20574951171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.950195628537299e-07, "loss": 0.026, "reward": 0.17744406033307314, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17744406033307314, "reward_after_std": 0.6344969756901264, "reward_before_mean": 0.519596628844738, "reward_before_std": 0.6598436906933784, "reward_change_max": 0.0009124875068664551, "reward_change_mean": -0.34215256478637457, "reward_change_min": -0.6125946547836065, "reward_change_std": 0.2433307566680014, "reward_std": 0.6344969943165779, "rewards/cosine_scaled_reward": 0.030631639063358307, "rewards/format_reward": 0.4583333358168602, "step": 228 }, { "advantage_max": 1.1860961690545082, "advantage_mean": -3.725290242950763e-09, "advantage_min": -0.5950976237654686, "advantage_std": 0.666605468839407, "completion_length": 3048.1458587646484, "epoch": 0.26171428571428573, "grad_norm": 0.3069765865802765, "kl": 0.293701171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.920420666261961e-07, "loss": 0.0238, "reward": -0.034969646483659744, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.034969646483659744, "reward_after_std": 0.6666054800152779, "reward_before_mean": 0.21758911339566112, "reward_before_std": 0.6485214531421661, "reward_change_max": 0.005425959825515747, "reward_change_mean": -0.25255876034498215, "reward_change_min": -0.5039389729499817, "reward_change_std": 0.19493055483326316, "reward_std": 0.6666054986417294, "rewards/cosine_scaled_reward": -0.057872116565704346, "rewards/format_reward": 0.33333333767950535, "step": 229 }, { "advantage_max": 1.0551139377057552, "advantage_mean": 1.738468857759301e-08, "advantage_min": -0.4135409705340862, "advantage_std": 0.560747466981411, "completion_length": 3287.9166870117188, "epoch": 0.26285714285714284, "grad_norm": 0.41716912388801575, "kl": 0.2872314453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.890576474687263e-07, "loss": 0.0396, "reward": -0.41289188899099827, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41289188899099827, "reward_after_std": 0.5607474707067013, "reward_before_mean": -0.2849421767750755, "reward_before_std": 0.5487396996468306, "reward_change_max": 0.0005631819367408752, "reward_change_mean": -0.12794970721006393, "reward_change_min": -0.2580363266170025, "reward_change_std": 0.10446212626993656, "reward_std": 0.5607474967837334, "rewards/cosine_scaled_reward": -0.26747109182178974, "rewards/format_reward": 0.2500000037252903, "step": 230 }, { "advantage_max": 1.5714677423238754, "advantage_mean": -2.235174301201681e-08, "advantage_min": -0.7718853428959846, "advantage_std": 0.8607656769454479, "completion_length": 2893.854263305664, "epoch": 0.264, "grad_norm": 0.342227578163147, "kl": 0.253875732421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.860664508377001e-07, "loss": 0.0199, "reward": 0.2559966053813696, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2559966053813696, "reward_after_std": 0.8607656769454479, "reward_before_mean": 0.5868953629396856, "reward_before_std": 0.8197899498045444, "reward_change_max": 0.000645756721496582, "reward_change_mean": -0.33089875616133213, "reward_change_min": -0.5897320918738842, "reward_change_std": 0.23832240141928196, "reward_std": 0.86076570302248, "rewards/cosine_scaled_reward": 0.04344766750000417, "rewards/format_reward": 0.5000000037252903, "step": 231 }, { "advantage_max": 0.703536681830883, "advantage_mean": 1.4901161693448017e-08, "advantage_min": -0.48478899523615837, "advantage_std": 0.4345742128789425, "completion_length": 2969.7500610351562, "epoch": 0.2651428571428571, "grad_norm": 0.23666274547576904, "kl": 0.29571533203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.83068622519821e-07, "loss": 0.0313, "reward": -0.41810051421634853, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41810051421634853, "reward_after_std": 0.4345742128789425, "reward_before_mean": -0.26686476916074753, "reward_before_std": 0.4644641876220703, "reward_change_max": 0.0005011186003684998, "reward_change_mean": -0.15123574994504452, "reward_change_min": -0.35539793223142624, "reward_change_std": 0.13576048891991377, "reward_std": 0.4345742352306843, "rewards/cosine_scaled_reward": -0.23759905248880386, "rewards/format_reward": 0.20833334140479565, "step": 232 }, { "advantage_max": 1.0524715818464756, "advantage_mean": 8.692344621863413e-09, "advantage_min": -0.5536077432334423, "advantage_std": 0.6050297953188419, "completion_length": 2742.125045776367, "epoch": 0.2662857142857143, "grad_norm": 0.2027333676815033, "kl": 0.2685546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.800643086250121e-07, "loss": 0.0358, "reward": -0.13591570034623146, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13591570034623146, "reward_after_std": 0.6050298027694225, "reward_before_mean": 0.08999292412772775, "reward_before_std": 0.6030003279447556, "reward_change_max": 0.00041041523218154907, "reward_change_mean": -0.22590863425284624, "reward_change_min": -0.4129418469965458, "reward_change_std": 0.164816755335778, "reward_std": 0.605029821395874, "rewards/cosine_scaled_reward": -0.19458688236773014, "rewards/format_reward": 0.4791666679084301, "step": 233 }, { "advantage_max": 1.3114805594086647, "advantage_mean": -8.071462387349015e-09, "advantage_min": -0.7555878981947899, "advantage_std": 0.7706392370164394, "completion_length": 2707.812515258789, "epoch": 0.2674285714285714, "grad_norm": 0.25337696075439453, "kl": 0.24365234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.770536555792944e-07, "loss": 0.0202, "reward": -0.028612198890186846, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.028612198890186846, "reward_after_std": 0.7706392221152782, "reward_before_mean": 0.2160261981189251, "reward_before_std": 0.7999156005680561, "reward_change_max": 0.0006139576435089111, "reward_change_mean": -0.24463840294629335, "reward_change_min": -0.5106790214776993, "reward_change_std": 0.2085583619773388, "reward_std": 0.7706392407417297, "rewards/cosine_scaled_reward": -0.08990356823778711, "rewards/format_reward": 0.39583334513008595, "step": 234 }, { "advantage_max": 1.4646566659212112, "advantage_mean": 3.1044083970144243e-09, "advantage_min": -0.7245773002505302, "advantage_std": 0.827287781983614, "completion_length": 2410.520881652832, "epoch": 0.26857142857142857, "grad_norm": 0.3164808750152588, "kl": 0.244384765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.740368101176495e-07, "loss": 0.0445, "reward": 0.20270962733775377, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20270962733775377, "reward_after_std": 0.8272877987474203, "reward_before_mean": 0.520314646884799, "reward_before_std": 0.820151200518012, "reward_change_max": 0.0004795342683792114, "reward_change_mean": -0.3176050102338195, "reward_change_min": -0.5893700663000345, "reward_change_std": 0.225506953895092, "reward_std": 0.8272878266870975, "rewards/cosine_scaled_reward": 0.02057398436591029, "rewards/format_reward": 0.4791666753590107, "step": 235 }, { "advantage_max": 1.7150937244296074, "advantage_mean": 5.551115123125783e-17, "advantage_min": -0.914762269705534, "advantage_std": 0.9817547611892223, "completion_length": 2980.5209350585938, "epoch": 0.26971428571428574, "grad_norm": 0.6971549391746521, "kl": 0.2442626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.710139192768694e-07, "loss": 0.0655, "reward": 0.23412334313616157, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23412334313616157, "reward_after_std": 0.9817547835409641, "reward_before_mean": 0.5466544292867184, "reward_before_std": 0.9983992204070091, "reward_change_max": 0.0008015632629394531, "reward_change_mean": -0.31253109592944384, "reward_change_min": -0.6879443116486073, "reward_change_std": 0.2631764831021428, "reward_std": 0.9817548058927059, "rewards/cosine_scaled_reward": 0.012910543940961361, "rewards/format_reward": 0.520833345130086, "step": 236 }, { "advantage_max": 1.2556101009249687, "advantage_mean": -6.829698639965187e-09, "advantage_min": -0.9055002331733704, "advantage_std": 0.7706798426806927, "completion_length": 2733.2083740234375, "epoch": 0.27085714285714285, "grad_norm": 0.4280366897583008, "kl": 0.22900390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.679851303883891e-07, "loss": 0.0486, "reward": 0.17084035277366638, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17084035277366638, "reward_after_std": 0.7706798352301121, "reward_before_mean": 0.49289674311876297, "reward_before_std": 0.8058627881109715, "reward_change_max": 0.0, "reward_change_mean": -0.3220563712529838, "reward_change_min": -0.6181009113788605, "reward_change_std": 0.2525101900100708, "reward_std": 0.7706798538565636, "rewards/cosine_scaled_reward": 0.006865017116069794, "rewards/format_reward": 0.4791666753590107, "step": 237 }, { "advantage_max": 1.6488749012351036, "advantage_mean": -6.208817415753742e-08, "advantage_min": -1.1664773039519787, "advantage_std": 1.0465109311044216, "completion_length": 2192.604232788086, "epoch": 0.272, "grad_norm": 0.7376325726509094, "kl": 0.192626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.649505910711058e-07, "loss": 0.0487, "reward": 0.5787554197013378, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5787554197013378, "reward_after_std": 1.046510897576809, "reward_before_mean": 1.0178330019116402, "reward_before_std": 1.119078617542982, "reward_change_max": 0.0002253800630569458, "reward_change_mean": -0.43907763762399554, "reward_change_min": -0.8594516478478909, "reward_change_std": 0.35902342572808266, "reward_std": 1.0465109124779701, "rewards/cosine_scaled_reward": 0.1859998283907771, "rewards/format_reward": 0.6458333432674408, "step": 238 }, { "advantage_max": 1.5666873008012772, "advantage_mean": -1.9868215073159945e-08, "advantage_min": -0.9591661542654037, "advantage_std": 0.8998047038912773, "completion_length": 2221.7917404174805, "epoch": 0.27314285714285713, "grad_norm": 0.33203524351119995, "kl": 0.21246337890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.619104492241847e-07, "loss": 0.02, "reward": 0.46643321961164474, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46643321961164474, "reward_after_std": 0.8998047187924385, "reward_before_mean": 0.8711623698472977, "reward_before_std": 0.8713781274855137, "reward_change_max": 0.0010540187358856201, "reward_change_mean": -0.40472911577671766, "reward_change_min": -0.7147786617279053, "reward_change_std": 0.2880152091383934, "reward_std": 0.8998047187924385, "rewards/cosine_scaled_reward": 0.1230811607092619, "rewards/format_reward": 0.6250000149011612, "step": 239 }, { "advantage_max": 1.298158198595047, "advantage_mean": -6.208817349140361e-10, "advantage_min": -0.5822702124714851, "advantage_std": 0.6943303383886814, "completion_length": 2911.166717529297, "epoch": 0.2742857142857143, "grad_norm": 0.4261091649532318, "kl": 0.427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.588648530198504e-07, "loss": 0.045, "reward": -0.2253067083656788, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2253067083656788, "reward_after_std": 0.6943303607404232, "reward_before_mean": -0.048595423810184, "reward_before_std": 0.6824948564171791, "reward_change_max": 0.0004196241497993469, "reward_change_mean": -0.17671129759401083, "reward_change_min": -0.3555619493126869, "reward_change_std": 0.14292089035734534, "reward_std": 0.6943303756415844, "rewards/cosine_scaled_reward": -0.26388105377554893, "rewards/format_reward": 0.47916667722165585, "step": 240 }, { "advantage_max": 0.8420538194477558, "advantage_mean": 1.3659397890553038e-08, "advantage_min": -0.6000218987464905, "advantage_std": 0.5334564503282309, "completion_length": 2765.5208435058594, "epoch": 0.2754285714285714, "grad_norm": 0.2724221348762512, "kl": 0.3409423828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.558139508961654e-07, "loss": 0.038, "reward": -0.24559512361884117, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24559512361884117, "reward_after_std": 0.5334564689546824, "reward_before_mean": -0.04277325049042702, "reward_before_std": 0.5738527663052082, "reward_change_max": 0.0010135173797607422, "reward_change_mean": -0.20282186102122068, "reward_change_min": -0.44659484922885895, "reward_change_std": 0.17772664222866297, "reward_std": 0.5334564968943596, "rewards/cosine_scaled_reward": -0.17763663083314896, "rewards/format_reward": 0.3125000111758709, "step": 241 }, { "advantage_max": 0.946055244654417, "advantage_mean": 9.313226301266297e-09, "advantage_min": -0.5404593013226986, "advantage_std": 0.5464118495583534, "completion_length": 2345.8959045410156, "epoch": 0.2765714285714286, "grad_norm": 0.35438135266304016, "kl": 0.35821533203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.527578915497951e-07, "loss": 0.045, "reward": 0.14251868287101388, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14251868287101388, "reward_after_std": 0.5464118458330631, "reward_before_mean": 0.4747390653938055, "reward_before_std": 0.5084672495722771, "reward_change_max": 0.0004891380667686462, "reward_change_mean": -0.3322203680872917, "reward_change_min": -0.5551994703710079, "reward_change_std": 0.211034232750535, "reward_std": 0.5464118607342243, "rewards/cosine_scaled_reward": -0.08554715011268854, "rewards/format_reward": 0.6458333395421505, "step": 242 }, { "advantage_max": 1.200172282755375, "advantage_mean": -3.725290353973065e-09, "advantage_min": -0.8043633177876472, "advantage_std": 0.7464568391442299, "completion_length": 2693.500030517578, "epoch": 0.2777142857142857, "grad_norm": 0.45109739899635315, "kl": 0.3643798828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.496968239287603e-07, "loss": 0.0355, "reward": 0.038709891960024834, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.038709891960024834, "reward_after_std": 0.7464568465948105, "reward_before_mean": 0.31772999092936516, "reward_before_std": 0.7977652475237846, "reward_change_max": 0.0006454810500144958, "reward_change_mean": -0.2790201008319855, "reward_change_min": -0.589969240128994, "reward_change_std": 0.23987862188369036, "reward_std": 0.7464568465948105, "rewards/cosine_scaled_reward": -0.03905167616903782, "rewards/format_reward": 0.3958333432674408, "step": 243 }, { "advantage_max": 1.4144585989415646, "advantage_mean": -6.661338147750939e-16, "advantage_min": -0.8154066577553749, "advantage_std": 0.8018203265964985, "completion_length": 2837.6875534057617, "epoch": 0.27885714285714286, "grad_norm": 0.43070149421691895, "kl": 0.37689208984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.466308972251785e-07, "loss": 0.0446, "reward": 0.29334662668406963, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29334662668406963, "reward_after_std": 0.8018203116953373, "reward_before_mean": 0.6482694167643785, "reward_before_std": 0.7809974029660225, "reward_change_max": 0.0, "reward_change_mean": -0.35492274537682533, "reward_change_min": -0.638619052246213, "reward_change_std": 0.24733527563512325, "reward_std": 0.8018203228712082, "rewards/cosine_scaled_reward": 0.08455136185511947, "rewards/format_reward": 0.47916666977107525, "step": 244 }, { "advantage_max": 1.202423632144928, "advantage_mean": 4.967053712778835e-09, "advantage_min": -0.6274031549692154, "advantage_std": 0.6885917708277702, "completion_length": 3277.7916717529297, "epoch": 0.28, "grad_norm": 0.5922569632530212, "kl": 0.4952392578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.435602608679916e-07, "loss": 0.0422, "reward": -0.23441375326365232, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23441375326365232, "reward_after_std": 0.6885917671024799, "reward_before_mean": -0.0529680053004995, "reward_before_std": 0.714995913207531, "reward_change_max": 5.5089592933654785e-05, "reward_change_mean": -0.1814457457512617, "reward_change_min": -0.43721815571188927, "reward_change_std": 0.17085466999560595, "reward_std": 0.6885917857289314, "rewards/cosine_scaled_reward": -0.16190067771822214, "rewards/format_reward": 0.2708333395421505, "step": 245 }, { "advantage_max": 1.3673798479139805, "advantage_mean": 1.5522043483873205e-08, "advantage_min": -0.8780043236911297, "advantage_std": 0.8121840171515942, "completion_length": 3056.416717529297, "epoch": 0.28114285714285714, "grad_norm": 0.5122177600860596, "kl": 0.4609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.404850645156841e-07, "loss": 0.051, "reward": 0.12062996253371239, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12062996253371239, "reward_after_std": 0.8121840003877878, "reward_before_mean": 0.41690291836857796, "reward_before_std": 0.8465407397598028, "reward_change_max": 0.001095414161682129, "reward_change_mean": -0.29627293813973665, "reward_change_min": -0.6192711889743805, "reward_change_std": 0.24561070930212736, "reward_std": 0.8121840059757233, "rewards/cosine_scaled_reward": -0.01029854454100132, "rewards/format_reward": 0.4375000074505806, "step": 246 }, { "advantage_max": 0.9564785324037075, "advantage_mean": 1.1796753074388988e-08, "advantage_min": -0.6387578025460243, "advantage_std": 0.5899263545870781, "completion_length": 3254.8958435058594, "epoch": 0.2822857142857143, "grad_norm": 0.3940400183200836, "kl": 0.456787109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.374054580489873e-07, "loss": 0.0462, "reward": -0.2427238319069147, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2427238319069147, "reward_after_std": 0.5899263694882393, "reward_before_mean": -0.047593068331480026, "reward_before_std": 0.631349828094244, "reward_change_max": 0.002109043300151825, "reward_change_mean": -0.1951307598501444, "reward_change_min": -0.4547439571470022, "reward_change_std": 0.18082292843610048, "reward_std": 0.5899263992905617, "rewards/cosine_scaled_reward": -0.20087987463921309, "rewards/format_reward": 0.3541666753590107, "step": 247 }, { "advantage_max": 1.0900807715952396, "advantage_mean": -2.545615063187512e-08, "advantage_min": -0.7387166060507298, "advantage_std": 0.6745978407561779, "completion_length": 2681.250045776367, "epoch": 0.2834285714285714, "grad_norm": 0.39817264676094055, "kl": 0.3626708984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.343215915635761e-07, "loss": 0.0595, "reward": 0.27292491123080254, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27292491123080254, "reward_after_std": 0.6745978370308876, "reward_before_mean": 0.6433320241048932, "reward_before_std": 0.6638566926121712, "reward_change_max": 0.001688249409198761, "reward_change_mean": -0.3704071491956711, "reward_change_min": -0.6514604948461056, "reward_change_std": 0.26492215413600206, "reward_std": 0.6745978556573391, "rewards/cosine_scaled_reward": 0.08208268322050571, "rewards/format_reward": 0.4791666716337204, "step": 248 }, { "advantage_max": 1.3351382836699486, "advantage_mean": -1.4901161971003773e-08, "advantage_min": -0.7336929887533188, "advantage_std": 0.7524409592151642, "completion_length": 2210.5417098999023, "epoch": 0.2845714285714286, "grad_norm": 0.7154747247695923, "kl": 0.27446746826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.31233615362752e-07, "loss": 0.0584, "reward": 0.347092317417264, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.347092317417264, "reward_after_std": 0.7524409592151642, "reward_before_mean": 0.7257094047963619, "reward_before_std": 0.6893659690394998, "reward_change_max": 0.0010166019201278687, "reward_change_mean": -0.3786170780658722, "reward_change_min": -0.6137403771281242, "reward_change_std": 0.2522727893665433, "reward_std": 0.752440981566906, "rewards/cosine_scaled_reward": 0.03993801912292838, "rewards/format_reward": 0.645833333954215, "step": 249 }, { "advantage_max": 1.1932175531983376, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.6423701345920563, "advantage_std": 0.6697065159678459, "completion_length": 2465.70841217041, "epoch": 0.2857142857142857, "grad_norm": 0.9017249345779419, "kl": 0.38275146484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.281416799501187e-07, "loss": 0.0728, "reward": 0.025156520307064056, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.025156520307064056, "reward_after_std": 0.6697065122425556, "reward_before_mean": 0.29914787295274436, "reward_before_std": 0.6508230045437813, "reward_change_max": 0.0013307556509971619, "reward_change_mean": -0.2739913575351238, "reward_change_min": -0.5167406238615513, "reward_change_std": 0.19862729497253895, "reward_std": 0.6697065494954586, "rewards/cosine_scaled_reward": -0.1733427420258522, "rewards/format_reward": 0.6458333358168602, "step": 250 }, { "advantage_max": 1.6214554160833359, "advantage_mean": -7.450581041013038e-09, "advantage_min": -0.8437067344784737, "advantage_std": 0.8904179111123085, "completion_length": 1860.6250610351562, "epoch": 0.28685714285714287, "grad_norm": 0.37185877561569214, "kl": 0.2660369873046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.25045936022246e-07, "loss": 0.0244, "reward": 0.2744586355984211, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2744586355984211, "reward_after_std": 0.8904179111123085, "reward_before_mean": 0.6066321942489594, "reward_before_std": 0.858960397541523, "reward_change_max": 0.0, "reward_change_mean": -0.33217359334230423, "reward_change_min": -0.5966192334890366, "reward_change_std": 0.2286313408985734, "reward_std": 0.8904179409146309, "rewards/cosine_scaled_reward": -0.0508505729958415, "rewards/format_reward": 0.7083333414047956, "step": 251 }, { "advantage_max": 1.134083978831768, "advantage_mean": 6.208817349140361e-10, "advantage_min": -0.6665167361497879, "advantage_std": 0.6361800581216812, "completion_length": 2859.3958587646484, "epoch": 0.288, "grad_norm": 0.4567761719226837, "kl": 0.44671630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.219465344613258e-07, "loss": 0.0348, "reward": -0.03838689235271886, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03838689235271886, "reward_after_std": 0.6361800506711006, "reward_before_mean": 0.21587416902184486, "reward_before_std": 0.6247138157486916, "reward_change_max": 0.0002838447690010071, "reward_change_mean": -0.2542610792443156, "reward_change_min": -0.4498459994792938, "reward_change_std": 0.1773118730634451, "reward_std": 0.6361800730228424, "rewards/cosine_scaled_reward": -0.11081291688606143, "rewards/format_reward": 0.4375000074505806, "step": 252 }, { "advantage_max": 1.4208215326070786, "advantage_mean": -3.4148494476582414e-08, "advantage_min": -0.8858304098248482, "advantage_std": 0.8445864953100681, "completion_length": 2464.0834045410156, "epoch": 0.28914285714285715, "grad_norm": 0.8337281346321106, "kl": 0.365966796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.188436263278172e-07, "loss": 0.0728, "reward": 0.28498442959971726, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28498442959971726, "reward_after_std": 0.8445864543318748, "reward_before_mean": 0.6343209322076291, "reward_before_std": 0.858670748770237, "reward_change_max": 0.0, "reward_change_mean": -0.3493365282192826, "reward_change_min": -0.6889376491308212, "reward_change_std": 0.2688952349126339, "reward_std": 0.844586469233036, "rewards/cosine_scaled_reward": -0.026589547283947468, "rewards/format_reward": 0.6875000167638063, "step": 253 }, { "advantage_max": 1.2321031764149666, "advantage_mean": 4.346172310931706e-09, "advantage_min": -0.680670976638794, "advantage_std": 0.725123506039381, "completion_length": 3151.375030517578, "epoch": 0.29028571428571426, "grad_norm": 0.609215497970581, "kl": 0.606689453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.157373628530852e-07, "loss": 0.0443, "reward": -0.049922436475753784, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.049922436475753784, "reward_after_std": 0.7251235023140907, "reward_before_mean": 0.19461863487958908, "reward_before_std": 0.7533079758286476, "reward_change_max": 0.002223499119281769, "reward_change_mean": -0.2445410778746009, "reward_change_min": -0.5359072387218475, "reward_change_std": 0.2124933786690235, "reward_std": 0.7251235097646713, "rewards/cosine_scaled_reward": -0.10060735675506294, "rewards/format_reward": 0.39583333767950535, "step": 254 }, { "advantage_max": 1.402302596718073, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.7114418521523476, "advantage_std": 0.8122603800147772, "completion_length": 2964.687530517578, "epoch": 0.2914285714285714, "grad_norm": 0.6498525142669678, "kl": 0.58319091796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.126278954320294e-07, "loss": 0.0347, "reward": -0.07798312418162823, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07798312418162823, "reward_after_std": 0.8122603800147772, "reward_before_mean": 0.14547542482614517, "reward_before_std": 0.8470509238541126, "reward_change_max": 0.0012149512767791748, "reward_change_mean": -0.22345853736624122, "reward_change_min": -0.5182074159383774, "reward_change_std": 0.20545397838577628, "reward_std": 0.8122603949159384, "rewards/cosine_scaled_reward": -0.15642895735800266, "rewards/format_reward": 0.45833333767950535, "step": 255 }, { "advantage_max": 1.3165737465023994, "advantage_mean": 6.519258216597379e-09, "advantage_min": -0.674428217113018, "advantage_std": 0.7454143799841404, "completion_length": 2872.625030517578, "epoch": 0.2925714285714286, "grad_norm": 0.48637011647224426, "kl": 0.48291015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.095153756157051e-07, "loss": 0.056, "reward": -0.01483570970594883, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.01483570970594883, "reward_after_std": 0.7454143762588501, "reward_before_mean": 0.23627811670303345, "reward_before_std": 0.7435798235237598, "reward_change_max": 0.0004970431327819824, "reward_change_mean": -0.25111383642069995, "reward_change_min": -0.5052115879952908, "reward_change_std": 0.1974979422520846, "reward_std": 0.7454143948853016, "rewards/cosine_scaled_reward": -0.15269426861777902, "rewards/format_reward": 0.5416666734963655, "step": 256 }, { "advantage_max": 1.5429691597819328, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.9041449502110481, "advantage_std": 0.9234105013310909, "completion_length": 3000.854232788086, "epoch": 0.2937142857142857, "grad_norm": 0.707891583442688, "kl": 0.54736328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.06399955103937e-07, "loss": 0.0864, "reward": 0.23718170449137688, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23718170449137688, "reward_after_std": 0.9234105162322521, "reward_before_mean": 0.5629798602312803, "reward_before_std": 0.9633310399949551, "reward_change_max": 0.0, "reward_change_mean": -0.32579815573990345, "reward_change_min": -0.6817917302250862, "reward_change_std": 0.27525400929152966, "reward_std": 0.9234105236828327, "rewards/cosine_scaled_reward": 0.041906584054231644, "rewards/format_reward": 0.47916667722165585, "step": 257 }, { "advantage_max": 1.7362965643405914, "advantage_mean": -1.8626452602532595e-08, "advantage_min": -1.136706404387951, "advantage_std": 1.0333962552249432, "completion_length": 2941.5209045410156, "epoch": 0.2948571428571429, "grad_norm": 1.655138373374939, "kl": 0.515380859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.032817857379256e-07, "loss": 0.1134, "reward": 0.23384379362687469, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23384379362687469, "reward_after_std": 1.0333962105214596, "reward_before_mean": 0.5438088413793594, "reward_before_std": 1.09098768979311, "reward_change_max": 0.0008084475994110107, "reward_change_mean": -0.3099650777876377, "reward_change_min": -0.6670037191361189, "reward_change_std": 0.27815736550837755, "reward_std": 1.0333962254226208, "rewards/cosine_scaled_reward": 0.011487742187455297, "rewards/format_reward": 0.5208333525806665, "step": 258 }, { "advantage_max": 1.3132545426487923, "advantage_mean": -4.967053546245381e-09, "advantage_min": -0.6945758275687695, "advantage_std": 0.7458698004484177, "completion_length": 2559.854217529297, "epoch": 0.296, "grad_norm": 0.5919146537780762, "kl": 0.54998779296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.001610194928464e-07, "loss": 0.03, "reward": 0.3503173356875777, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3503173356875777, "reward_after_std": 0.7458698116242886, "reward_before_mean": 0.7317201669793576, "reward_before_std": 0.7002310231328011, "reward_change_max": 4.976987838745117e-05, "reward_change_mean": -0.38140283338725567, "reward_change_min": -0.6384340189397335, "reward_change_std": 0.24794460413977504, "reward_std": 0.7458698600530624, "rewards/cosine_scaled_reward": 0.063776751208934, "rewards/format_reward": 0.6041666716337204, "step": 259 }, { "advantage_max": 1.2585435509681702, "advantage_mean": -1.2417635142369932e-08, "advantage_min": -0.7303697317838669, "advantage_std": 0.7169954068958759, "completion_length": 2164.166679382324, "epoch": 0.29714285714285715, "grad_norm": 0.7633321285247803, "kl": 0.45684814453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.97037808470444e-07, "loss": 0.0247, "reward": 0.5059811770915985, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5059811770915985, "reward_after_std": 0.716995369642973, "reward_before_mean": 0.947710856795311, "reward_before_std": 0.6634920183569193, "reward_change_max": 0.0, "reward_change_mean": -0.4417296778410673, "reward_change_min": -0.7061281353235245, "reward_change_std": 0.2711289385333657, "reward_std": 0.7169954031705856, "rewards/cosine_scaled_reward": 0.1613554283976555, "rewards/format_reward": 0.6250000037252903, "step": 260 }, { "advantage_max": 0.6871362961828709, "advantage_mean": 1.0865430277728905e-08, "advantage_min": -0.5955430008471012, "advantage_std": 0.44926629960536957, "completion_length": 3026.4375610351562, "epoch": 0.29828571428571427, "grad_norm": 0.6654581427574158, "kl": 0.636474609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.939123048916173e-07, "loss": 0.0423, "reward": -0.23231990821659565, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23231990821659565, "reward_after_std": 0.44926629960536957, "reward_before_mean": -0.013448119163513184, "reward_before_std": 0.4752881024032831, "reward_change_max": 0.001972891390323639, "reward_change_mean": -0.21887180488556623, "reward_change_min": -0.3933011367917061, "reward_change_std": 0.16985276667401195, "reward_std": 0.44926630333065987, "rewards/cosine_scaled_reward": -0.2254740484058857, "rewards/format_reward": 0.4375000111758709, "step": 261 }, { "advantage_max": 0.953107200562954, "advantage_mean": 6.519257911286047e-09, "advantage_min": -0.5478541739284992, "advantage_std": 0.5378349423408508, "completion_length": 2944.166763305664, "epoch": 0.29942857142857143, "grad_norm": 0.6682643294334412, "kl": 0.56689453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.907846610890011e-07, "loss": 0.0425, "reward": -0.11213955376297235, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11213955376297235, "reward_after_std": 0.5378349535167217, "reward_before_mean": 0.12946981191635132, "reward_before_std": 0.5200149789452553, "reward_change_max": 0.000765681266784668, "reward_change_mean": -0.24160936230327934, "reward_change_min": -0.44526655226945877, "reward_change_std": 0.16583187272772193, "reward_std": 0.5378349870443344, "rewards/cosine_scaled_reward": -0.20609844103455544, "rewards/format_reward": 0.5416666846722364, "step": 262 }, { "advantage_max": 1.180128302425146, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.6777870059013367, "advantage_std": 0.6769901290535927, "completion_length": 2966.1875610351562, "epoch": 0.30057142857142854, "grad_norm": 0.8927240967750549, "kl": 0.61474609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.87655029499542e-07, "loss": 0.0346, "reward": -0.19135123770684004, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19135123770684004, "reward_after_std": 0.6769901514053345, "reward_before_mean": 0.005868307780474424, "reward_before_std": 0.6968195661902428, "reward_change_max": 0.001687467098236084, "reward_change_mean": -0.1972195482812822, "reward_change_min": -0.4410099871456623, "reward_change_std": 0.18090852163732052, "reward_std": 0.6769901663064957, "rewards/cosine_scaled_reward": -0.23664918635040522, "rewards/format_reward": 0.4791666716337204, "step": 263 }, { "advantage_max": 1.276852548122406, "advantage_mean": -5.551115123125783e-17, "advantage_min": -0.8147817477583885, "advantage_std": 0.7658328823745251, "completion_length": 2917.0000915527344, "epoch": 0.3017142857142857, "grad_norm": 0.7383478283882141, "kl": 0.48486328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.845235626570683e-07, "loss": 0.062, "reward": 0.05471212463453412, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.05471212463453412, "reward_after_std": 0.7658328823745251, "reward_before_mean": 0.3321605040691793, "reward_before_std": 0.7957486398518085, "reward_change_max": 0.0, "reward_change_mean": -0.27744837664067745, "reward_change_min": -0.5727364979684353, "reward_change_std": 0.23048890847712755, "reward_std": 0.7658329159021378, "rewards/cosine_scaled_reward": -0.13600308820605278, "rewards/format_reward": 0.6041666809469461, "step": 264 }, { "advantage_max": 1.0988817438483238, "advantage_mean": 1.3659398501175701e-08, "advantage_min": -0.7619898840785027, "advantage_std": 0.6942568942904472, "completion_length": 2722.8333892822266, "epoch": 0.3028571428571429, "grad_norm": 0.3977333605289459, "kl": 0.427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.813904131848564e-07, "loss": 0.0308, "reward": 0.09951317869126797, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09951317869126797, "reward_after_std": 0.6942568644881248, "reward_before_mean": 0.40570926177315414, "reward_before_std": 0.7324245609343052, "reward_change_max": 0.0006006136536598206, "reward_change_mean": -0.30619606003165245, "reward_change_min": -0.6168924458324909, "reward_change_std": 0.24154386390000582, "reward_std": 0.6942568868398666, "rewards/cosine_scaled_reward": -0.10964538622647524, "rewards/format_reward": 0.625000013038516, "step": 265 }, { "advantage_max": 1.1252152770757675, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.7847601100802422, "advantage_std": 0.6905211433768272, "completion_length": 3088.0625915527344, "epoch": 0.304, "grad_norm": 0.3700275421142578, "kl": 0.4598388671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.78255733788191e-07, "loss": 0.035, "reward": -0.05057033384218812, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05057033384218812, "reward_after_std": 0.6905211508274078, "reward_before_mean": 0.20164042292162776, "reward_before_std": 0.7296537682414055, "reward_change_max": 0.0005407780408859253, "reward_change_mean": -0.25221075117588043, "reward_change_min": -0.5302932150661945, "reward_change_std": 0.2140982821583748, "reward_std": 0.6905211806297302, "rewards/cosine_scaled_reward": -0.18042979948222637, "rewards/format_reward": 0.5625000111758709, "step": 266 }, { "advantage_max": 1.2583488374948502, "advantage_mean": 5.551115123125783e-17, "advantage_min": -0.6300838924944401, "advantage_std": 0.7009607516229153, "completion_length": 3292.500030517578, "epoch": 0.30514285714285716, "grad_norm": 0.42236313223838806, "kl": 0.476318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.751196772469237e-07, "loss": 0.0466, "reward": -0.255409243516624, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.255409243516624, "reward_after_std": 0.7009607441723347, "reward_before_mean": -0.08686720486730337, "reward_before_std": 0.7180141918361187, "reward_change_max": 0.0011328905820846558, "reward_change_mean": -0.16854204889386892, "reward_change_min": -0.38842369988560677, "reward_change_std": 0.15719072706997395, "reward_std": 0.7009607516229153, "rewards/cosine_scaled_reward": -0.18926694057881832, "rewards/format_reward": 0.29166667722165585, "step": 267 }, { "advantage_max": 1.2112119421362877, "advantage_mean": -3.104407841902912e-10, "advantage_min": -0.6363185346126556, "advantage_std": 0.6808359436690807, "completion_length": 2588.7084197998047, "epoch": 0.3062857142857143, "grad_norm": 0.6039485335350037, "kl": 0.3270263671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.71982396408026e-07, "loss": 0.0321, "reward": -0.030109137838735478, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.030109137838735478, "reward_after_std": 0.6808359213173389, "reward_before_mean": 0.22252458706498146, "reward_before_std": 0.665228683501482, "reward_change_max": 0.00047681480646133423, "reward_change_mean": -0.2526337383314967, "reward_change_min": -0.5076783336699009, "reward_change_std": 0.1878043320029974, "reward_std": 0.6808359511196613, "rewards/cosine_scaled_reward": -0.15957104857079685, "rewards/format_reward": 0.541666679084301, "step": 268 }, { "advantage_max": 1.5567923858761787, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.8225489631295204, "advantage_std": 0.8803973942995071, "completion_length": 2870.8959197998047, "epoch": 0.30742857142857144, "grad_norm": 0.6920576095581055, "kl": 0.34759521484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.688440441781398e-07, "loss": 0.0488, "reward": 0.11579827091190964, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11579827091190964, "reward_after_std": 0.8803974017500877, "reward_before_mean": 0.39466875046491623, "reward_before_std": 0.8872611932456493, "reward_change_max": 0.00016210228204727173, "reward_change_mean": -0.2788704950362444, "reward_change_min": -0.5925246551632881, "reward_change_std": 0.22718221321702003, "reward_std": 0.8803974017500877, "rewards/cosine_scaled_reward": -0.08391562500037253, "rewards/format_reward": 0.5625000074505806, "step": 269 }, { "advantage_max": 1.813984364271164, "advantage_mean": -1.8005569923928988e-08, "advantage_min": -0.8069485202431679, "advantage_std": 0.9654024578630924, "completion_length": 2783.166748046875, "epoch": 0.30857142857142855, "grad_norm": 0.8999072909355164, "kl": 0.38671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.657047735161255e-07, "loss": 0.0483, "reward": 0.20181133039295673, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20181133039295673, "reward_after_std": 0.9654024355113506, "reward_before_mean": 0.4937496539205313, "reward_before_std": 0.9233413189649582, "reward_change_max": 0.0008331462740898132, "reward_change_mean": -0.29193832352757454, "reward_change_min": -0.5729988738894463, "reward_change_std": 0.21221650764346123, "reward_std": 0.9654024727642536, "rewards/cosine_scaled_reward": -0.05520852329209447, "rewards/format_reward": 0.6041666753590107, "step": 270 }, { "advantage_max": 1.2739427499473095, "advantage_mean": 1.6142925274298392e-08, "advantage_min": -0.7408364042639732, "advantage_std": 0.770323583856225, "completion_length": 2661.1458892822266, "epoch": 0.3097142857142857, "grad_norm": 0.38857465982437134, "kl": 0.31219482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.625647374256061e-07, "loss": 0.0231, "reward": 0.43555654399096966, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43555654399096966, "reward_after_std": 0.7703235819935799, "reward_before_mean": 0.8508926667273045, "reward_before_std": 0.7559794168919325, "reward_change_max": 0.0006112903356552124, "reward_change_mean": -0.41533607384189963, "reward_change_min": -0.7344745993614197, "reward_change_std": 0.29105067485943437, "reward_std": 0.7703235857188702, "rewards/cosine_scaled_reward": 0.11294632405042648, "rewards/format_reward": 0.6250000018626451, "step": 271 }, { "advantage_max": 1.1698375567793846, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.7527714669704437, "advantage_std": 0.6810107119381428, "completion_length": 3082.666702270508, "epoch": 0.31085714285714283, "grad_norm": 0.6503217220306396, "kl": 0.442138671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.594240889475106e-07, "loss": 0.0602, "reward": -0.02925921604037285, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02925921604037285, "reward_after_std": 0.6810106933116913, "reward_before_mean": 0.22632784768939018, "reward_before_std": 0.6936670579016209, "reward_change_max": 0.0006357654929161072, "reward_change_mean": -0.2555870823562145, "reward_change_min": -0.484931293874979, "reward_change_std": 0.20067433547228575, "reward_std": 0.6810107082128525, "rewards/cosine_scaled_reward": -0.07433607243001461, "rewards/format_reward": 0.3750000074505806, "step": 272 }, { "advantage_max": 1.5366176590323448, "advantage_mean": 1.862645426786713e-09, "advantage_min": -0.8639146089553833, "advantage_std": 0.8814141787588596, "completion_length": 3012.2709045410156, "epoch": 0.312, "grad_norm": 0.5377759337425232, "kl": 0.4683837890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.562829811526154e-07, "loss": 0.0601, "reward": 0.15391826815903187, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15391826815903187, "reward_after_std": 0.8814141787588596, "reward_before_mean": 0.45080176065675914, "reward_before_std": 0.903516910970211, "reward_change_max": 0.0003979429602622986, "reward_change_mean": -0.29688349924981594, "reward_change_min": -0.5701575763523579, "reward_change_std": 0.2267187712714076, "reward_std": 0.8814142197370529, "rewards/cosine_scaled_reward": -0.003765794448554516, "rewards/format_reward": 0.4583333395421505, "step": 273 }, { "advantage_max": 1.475723922252655, "advantage_mean": -3.663202252646158e-08, "advantage_min": -1.0043025985360146, "advantage_std": 0.8919141329824924, "completion_length": 1951.1667404174805, "epoch": 0.31314285714285717, "grad_norm": 0.43542540073394775, "kl": 0.33087158203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.531415671340826e-07, "loss": 0.0124, "reward": 0.6142655089497566, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6142655089497566, "reward_after_std": 0.891914140433073, "reward_before_mean": 1.0786607032641768, "reward_before_std": 0.8922014944255352, "reward_change_max": 0.0006687715649604797, "reward_change_mean": -0.4643952287733555, "reward_change_min": -0.8292928412556648, "reward_change_std": 0.32155177742242813, "reward_std": 0.8919141665101051, "rewards/cosine_scaled_reward": 0.13308035396039486, "rewards/format_reward": 0.8125000111758709, "step": 274 }, { "advantage_max": 1.2398691214621067, "advantage_mean": -2.1730860777502414e-08, "advantage_min": -0.8742958381772041, "advantage_std": 0.7562834247946739, "completion_length": 2218.437545776367, "epoch": 0.3142857142857143, "grad_norm": 0.5172128081321716, "kl": 0.32861328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.5e-07, "loss": 0.0019, "reward": 0.3955942359752953, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3955942359752953, "reward_after_std": 0.7562834359705448, "reward_before_mean": 0.7990798335522413, "reward_before_std": 0.7605790868401527, "reward_change_max": 0.0, "reward_change_mean": -0.4034856390208006, "reward_change_min": -0.6919118501245975, "reward_change_std": 0.2679525539278984, "reward_std": 0.7562834545969963, "rewards/cosine_scaled_reward": 0.05578993167728186, "rewards/format_reward": 0.6875000074505806, "step": 275 }, { "advantage_max": 1.663666844367981, "advantage_mean": -1.490116141589226e-08, "advantage_min": -0.941422164440155, "advantage_std": 0.9734741970896721, "completion_length": 2530.645896911621, "epoch": 0.31542857142857145, "grad_norm": 1.7971059083938599, "kl": 0.385009765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.468584328659172e-07, "loss": 0.0951, "reward": 0.30931809917092323, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30931809917092323, "reward_after_std": 0.9734741821885109, "reward_before_mean": 0.6490100165829062, "reward_before_std": 1.0003498420119286, "reward_change_max": 0.008046261966228485, "reward_change_mean": -0.33969192765653133, "reward_change_min": -0.7254288531839848, "reward_change_std": 0.28267188742756844, "reward_std": 0.9734742194414139, "rewards/cosine_scaled_reward": 0.03283834829926491, "rewards/format_reward": 0.5833333563059568, "step": 276 }, { "advantage_max": 1.3660626783967018, "advantage_mean": -5.277494746769307e-09, "advantage_min": -0.8298850581049919, "advantage_std": 0.8108585998415947, "completion_length": 2307.479232788086, "epoch": 0.31657142857142856, "grad_norm": 0.5035321116447449, "kl": 0.3431396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.437170188473847e-07, "loss": 0.0175, "reward": 0.30645604338496923, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30645604338496923, "reward_after_std": 0.8108585998415947, "reward_before_mean": 0.6676031211391091, "reward_before_std": 0.8139922991394997, "reward_change_max": 0.0, "reward_change_mean": -0.36114706844091415, "reward_change_min": -0.6979051753878593, "reward_change_std": 0.259663138538599, "reward_std": 0.8108586072921753, "rewards/cosine_scaled_reward": -0.009948452236130834, "rewards/format_reward": 0.6875000111758709, "step": 277 }, { "advantage_max": 0.9520070180296898, "advantage_mean": -1.2417635031347629e-08, "advantage_min": -0.44027720391750336, "advantage_std": 0.5101836659014225, "completion_length": 2111.125045776367, "epoch": 0.3177142857142857, "grad_norm": 0.3610879182815552, "kl": 0.4071044921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.405759110524894e-07, "loss": 0.0474, "reward": 0.31509879417717457, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.31509879417717457, "reward_after_std": 0.5101836659014225, "reward_before_mean": 0.707968354850891, "reward_before_std": 0.3866867758333683, "reward_change_max": 0.0011505857110023499, "reward_change_mean": -0.39286957401782274, "reward_change_min": -0.5869279652833939, "reward_change_std": 0.223956735804677, "reward_std": 0.5101836733520031, "rewards/cosine_scaled_reward": 0.010234175249934196, "rewards/format_reward": 0.6875000055879354, "step": 278 }, { "advantage_max": 1.2670928463339806, "advantage_mean": 1.1175871339474952e-08, "advantage_min": -0.832107700407505, "advantage_std": 0.7472830601036549, "completion_length": 3059.0834350585938, "epoch": 0.31885714285714284, "grad_norm": 0.7010309100151062, "kl": 0.6044921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.37435262574394e-07, "loss": 0.0514, "reward": 0.1004374697804451, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1004374697804451, "reward_after_std": 0.7472830601036549, "reward_before_mean": 0.39622872904874384, "reward_before_std": 0.7557075619697571, "reward_change_max": 0.00023264437913894653, "reward_change_mean": -0.2957912692800164, "reward_change_min": -0.5839440189301968, "reward_change_std": 0.23290821257978678, "reward_std": 0.7472830787301064, "rewards/cosine_scaled_reward": -0.12480231374502182, "rewards/format_reward": 0.6458333507180214, "step": 279 }, { "advantage_max": 1.7708331495523453, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.936590850353241, "advantage_std": 1.0220554992556572, "completion_length": 2438.4166870117188, "epoch": 0.32, "grad_norm": 0.9222289323806763, "kl": 0.440582275390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.342952264838747e-07, "loss": 0.0523, "reward": 0.6123909717425704, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6123909717425704, "reward_after_std": 1.0220554918050766, "reward_before_mean": 1.0539636346511543, "reward_before_std": 1.0054830946028233, "reward_change_max": 0.0, "reward_change_mean": -0.44157261587679386, "reward_change_min": -0.8539315573871136, "reward_change_std": 0.32234959304332733, "reward_std": 1.0220555067062378, "rewards/cosine_scaled_reward": 0.16239846497774124, "rewards/format_reward": 0.7291666716337204, "step": 280 }, { "advantage_max": 1.0117218270897865, "advantage_mean": 1.4280279680978225e-08, "advantage_min": -0.5438734255731106, "advantage_std": 0.580887034535408, "completion_length": 3199.937530517578, "epoch": 0.3211428571428571, "grad_norm": 0.811485767364502, "kl": 0.572509765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.311559558218603e-07, "loss": 0.0406, "reward": -0.2502650732640177, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2502650732640177, "reward_after_std": 0.5808870159089565, "reward_before_mean": -0.06142948009073734, "reward_before_std": 0.589481795206666, "reward_change_max": 0.0005460679531097412, "reward_change_mean": -0.18883559666574, "reward_change_min": -0.42559648118913174, "reward_change_std": 0.1617231946438551, "reward_std": 0.5808870419859886, "rewards/cosine_scaled_reward": -0.2494647353887558, "rewards/format_reward": 0.43750000931322575, "step": 281 }, { "advantage_max": 1.3895000964403152, "advantage_mean": -3.1044086745701804e-08, "advantage_min": -0.9264363273978233, "advantage_std": 0.8309589326381683, "completion_length": 2574.9166870117188, "epoch": 0.3222857142857143, "grad_norm": 0.532403290271759, "kl": 0.4725341796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.28017603591974e-07, "loss": 0.0415, "reward": 0.49094755947589874, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.49094755947589874, "reward_after_std": 0.8309589587152004, "reward_before_mean": 0.9181583793833852, "reward_before_std": 0.8120024017989635, "reward_change_max": 0.00045821070671081543, "reward_change_mean": -0.42721083760261536, "reward_change_min": -0.7255665622651577, "reward_change_std": 0.29684863798320293, "reward_std": 0.8309589698910713, "rewards/cosine_scaled_reward": 0.08407919853925705, "rewards/format_reward": 0.7500000186264515, "step": 282 }, { "advantage_max": 1.415271319448948, "advantage_mean": -1.2417634254191512e-08, "advantage_min": -0.9697859063744545, "advantage_std": 0.8754711970686913, "completion_length": 2855.1875915527344, "epoch": 0.32342857142857145, "grad_norm": 1.2471925020217896, "kl": 0.5540771484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.248803227530763e-07, "loss": 0.0888, "reward": 0.20010646618902683, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20010646618902683, "reward_after_std": 0.8754712045192719, "reward_before_mean": 0.519549798220396, "reward_before_std": 0.9254002720117569, "reward_change_max": 0.0010265707969665527, "reward_change_mean": -0.31944333389401436, "reward_change_min": -0.6503037363290787, "reward_change_std": 0.2723896815441549, "reward_std": 0.8754712231457233, "rewards/cosine_scaled_reward": -0.0006417706608772278, "rewards/format_reward": 0.5208333469927311, "step": 283 }, { "advantage_max": 1.316909946501255, "advantage_mean": -1.7384689243726825e-08, "advantage_min": -0.8938746899366379, "advantage_std": 0.7911999821662903, "completion_length": 2411.3333740234375, "epoch": 0.32457142857142857, "grad_norm": 1.0863850116729736, "kl": 0.50048828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.21744266211809e-07, "loss": 0.0111, "reward": 0.4276206409558654, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4276206409558654, "reward_after_std": 0.7911999821662903, "reward_before_mean": 0.8353298846632242, "reward_before_std": 0.7929439060389996, "reward_change_max": 0.0, "reward_change_mean": -0.4077092222869396, "reward_change_min": -0.7041095197200775, "reward_change_std": 0.27274756878614426, "reward_std": 0.7911999858915806, "rewards/cosine_scaled_reward": 0.011414924636483192, "rewards/format_reward": 0.8125000111758709, "step": 284 }, { "advantage_max": 1.259010173380375, "advantage_mean": -3.1044088966147854e-09, "advantage_min": -0.6755791082978249, "advantage_std": 0.6990512013435364, "completion_length": 2026.4375686645508, "epoch": 0.32571428571428573, "grad_norm": 0.7733309864997864, "kl": 0.29052734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.186095868151436e-07, "loss": -0.0132, "reward": 0.11948005040176213, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11948005040176213, "reward_after_std": 0.6990512199699879, "reward_before_mean": 0.42276617558673024, "reward_before_std": 0.6689119711518288, "reward_change_max": 5.587935447692871e-05, "reward_change_mean": -0.3032861463725567, "reward_change_min": -0.5256478674709797, "reward_change_std": 0.20443684607744217, "reward_std": 0.6990512236952782, "rewards/cosine_scaled_reward": -0.19486692734062672, "rewards/format_reward": 0.8125000111758709, "step": 285 }, { "advantage_max": 1.4266880974173546, "advantage_mean": -8.071462442860167e-09, "advantage_min": -0.838430143892765, "advantage_std": 0.7950329594314098, "completion_length": 2675.8125610351562, "epoch": 0.32685714285714285, "grad_norm": 0.8457327485084534, "kl": 0.55706787109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.154764373429315e-07, "loss": 0.0748, "reward": 0.14559801947325468, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14559801947325468, "reward_after_std": 0.7950329519808292, "reward_before_mean": 0.44482777640223503, "reward_before_std": 0.7759469635784626, "reward_change_max": 0.0, "reward_change_mean": -0.2992297522723675, "reward_change_min": -0.5666345283389091, "reward_change_std": 0.21813244745135307, "reward_std": 0.795032974332571, "rewards/cosine_scaled_reward": -0.16300279181450605, "rewards/format_reward": 0.7708333618938923, "step": 286 }, { "advantage_max": 1.0565427094697952, "advantage_mean": -2.452482850134885e-08, "advantage_min": -0.7084263153374195, "advantage_std": 0.6352221257984638, "completion_length": 2057.166732788086, "epoch": 0.328, "grad_norm": 0.8031435012817383, "kl": 0.353179931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.123449705004581e-07, "loss": 0.0017, "reward": 0.29547856375575066, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29547856375575066, "reward_after_std": 0.6352221146225929, "reward_before_mean": 0.6729789730161428, "reward_before_std": 0.622281976044178, "reward_change_max": 0.0, "reward_change_mean": -0.37750041857361794, "reward_change_min": -0.6289332434535027, "reward_change_std": 0.24394258856773376, "reward_std": 0.635222140699625, "rewards/cosine_scaled_reward": -0.048927186988294125, "rewards/format_reward": 0.7708333358168602, "step": 287 }, { "advantage_max": 1.450631108134985, "advantage_mean": -2.1730858223989458e-09, "advantage_min": -0.7151788100600243, "advantage_std": 0.8018929846584797, "completion_length": 2653.562545776367, "epoch": 0.3291428571428571, "grad_norm": 0.7178438305854797, "kl": 0.41448974609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.09215338910999e-07, "loss": 0.0133, "reward": 0.25840965658426285, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.25840965658426285, "reward_after_std": 0.8018929846584797, "reward_before_mean": 0.598530687391758, "reward_before_std": 0.7543132728897035, "reward_change_max": 0.00046640634536743164, "reward_change_mean": -0.3401210466399789, "reward_change_min": -0.6340159177780151, "reward_change_std": 0.2446063496172428, "reward_std": 0.8018930144608021, "rewards/cosine_scaled_reward": -0.07573466026224196, "rewards/format_reward": 0.7500000111758709, "step": 288 }, { "advantage_max": 1.0173869207501411, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.6042489297688007, "advantage_std": 0.5885357595980167, "completion_length": 2031.6042175292969, "epoch": 0.3302857142857143, "grad_norm": 0.49764305353164673, "kl": 0.3008270263671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.060876951083828e-07, "loss": 0.0369, "reward": 0.2047883691266179, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2047883691266179, "reward_after_std": 0.5885357577353716, "reward_before_mean": 0.5541504016146064, "reward_before_std": 0.5410357071086764, "reward_change_max": 0.0004688054323196411, "reward_change_mean": -0.34936204878613353, "reward_change_min": -0.6108824927359819, "reward_change_std": 0.2264309749007225, "reward_std": 0.5885357577353716, "rewards/cosine_scaled_reward": -0.10834147967398167, "rewards/format_reward": 0.7708333414047956, "step": 289 }, { "advantage_max": 1.5400925129652023, "advantage_mean": -2.23517425679276e-08, "advantage_min": -0.8571957536041737, "advantage_std": 0.902538351714611, "completion_length": 2571.666778564453, "epoch": 0.3314285714285714, "grad_norm": 0.6938238143920898, "kl": 0.488525390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.02962191529556e-07, "loss": 0.0609, "reward": 0.4056507070781663, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4056507070781663, "reward_after_std": 0.9025383479893208, "reward_before_mean": 0.7911003306508064, "reward_before_std": 0.8982041031122208, "reward_change_max": 0.0, "reward_change_mean": -0.38544964604079723, "reward_change_min": -0.7154050804674625, "reward_change_std": 0.2800214570015669, "reward_std": 0.9025383703410625, "rewards/cosine_scaled_reward": -0.0002831774763762951, "rewards/format_reward": 0.7916666716337204, "step": 290 }, { "advantage_max": 1.4336735494434834, "advantage_mean": -2.60770322002557e-08, "advantage_min": -0.819994043558836, "advantage_std": 0.8136847987771034, "completion_length": 2753.0625610351562, "epoch": 0.3325714285714286, "grad_norm": 0.5715336203575134, "kl": 0.43505859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.998389805071536e-07, "loss": 0.0546, "reward": 0.29718087799847126, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29718087799847126, "reward_after_std": 0.8136848174035549, "reward_before_mean": 0.6506573930382729, "reward_before_std": 0.7926754802465439, "reward_change_max": 0.0006195604801177979, "reward_change_mean": -0.3534764889627695, "reward_change_min": -0.6486410312354565, "reward_change_std": 0.24405055306851864, "reward_std": 0.8136848285794258, "rewards/cosine_scaled_reward": -0.07050466444343328, "rewards/format_reward": 0.7916666716337204, "step": 291 }, { "advantage_max": 1.1349475383758545, "advantage_mean": -4.190951474747351e-09, "advantage_min": -0.7262782752513885, "advantage_std": 0.6686569154262543, "completion_length": 2857.5001068115234, "epoch": 0.33371428571428574, "grad_norm": 0.3354578912258148, "kl": 0.37042236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.967182142620745e-07, "loss": 0.0119, "reward": 0.16059950197814032, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16059950197814032, "reward_after_std": 0.6686569005250931, "reward_before_mean": 0.487422039732337, "reward_before_std": 0.6643938161432743, "reward_change_max": 0.0005034282803535461, "reward_change_mean": -0.3268225295469165, "reward_change_min": -0.5673139244318008, "reward_change_std": 0.22015255317091942, "reward_std": 0.668656948953867, "rewards/cosine_scaled_reward": -0.1104556662030518, "rewards/format_reward": 0.7083333432674408, "step": 292 }, { "advantage_max": 1.0261386930942535, "advantage_mean": -9.313226079221693e-09, "advantage_min": -0.7676491439342499, "advantage_std": 0.625335369259119, "completion_length": 2113.916732788086, "epoch": 0.33485714285714285, "grad_norm": 0.466907262802124, "kl": 0.21929931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.93600044896063e-07, "loss": 0.02, "reward": 0.3123467434197664, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3123467434197664, "reward_after_std": 0.6253353767096996, "reward_before_mean": 0.6998563334345818, "reward_before_std": 0.6170645132660866, "reward_change_max": 0.00037410855293273926, "reward_change_mean": -0.38750957138836384, "reward_change_min": -0.6227229908108711, "reward_change_std": 0.2519225236028433, "reward_std": 0.6253353990614414, "rewards/cosine_scaled_reward": -0.06673851422965527, "rewards/format_reward": 0.833333358168602, "step": 293 }, { "advantage_max": 1.1145121417939663, "advantage_mean": 0.0, "advantage_min": -0.7262555472552776, "advantage_std": 0.6630832739174366, "completion_length": 2759.1875610351562, "epoch": 0.336, "grad_norm": 1.2123439311981201, "kl": 0.4388427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.904846243842949e-07, "loss": 0.0641, "reward": -0.08887020824477077, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08887020824477077, "reward_after_std": 0.6630832552909851, "reward_before_mean": 0.14825415145605803, "reward_before_std": 0.6850478500127792, "reward_change_max": 0.0, "reward_change_mean": -0.23712435737252235, "reward_change_min": -0.481619268655777, "reward_change_std": 0.19604469276964664, "reward_std": 0.663083266466856, "rewards/cosine_scaled_reward": -0.15503959357738495, "rewards/format_reward": 0.45833334885537624, "step": 294 }, { "advantage_max": 1.190081775188446, "advantage_mean": -1.552204376142896e-08, "advantage_min": -0.9659006744623184, "advantage_std": 0.7599161565303802, "completion_length": 2853.8959197998047, "epoch": 0.33714285714285713, "grad_norm": 0.5325373411178589, "kl": 0.446533203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.873721045679706e-07, "loss": 0.0451, "reward": 0.37088565714657307, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37088565714657307, "reward_after_std": 0.759916141629219, "reward_before_mean": 0.7698443452827632, "reward_before_std": 0.797116793692112, "reward_change_max": 0.0, "reward_change_mean": -0.3989586550742388, "reward_change_min": -0.7092138826847076, "reward_change_std": 0.289244526065886, "reward_std": 0.7599161565303802, "rewards/cosine_scaled_reward": 0.05158880911767483, "rewards/format_reward": 0.6666666753590107, "step": 295 }, { "advantage_max": 1.3938785642385483, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.9425633475184441, "advantage_std": 0.8566170409321785, "completion_length": 2891.2084045410156, "epoch": 0.3382857142857143, "grad_norm": 0.82956862449646, "kl": 0.4888916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.842626371469149e-07, "loss": 0.0317, "reward": 0.2034913629759103, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2034913629759103, "reward_after_std": 0.8566170409321785, "reward_before_mean": 0.5244719043839723, "reward_before_std": 0.903838749974966, "reward_change_max": 0.0012940466403961182, "reward_change_mean": -0.320980554446578, "reward_change_min": -0.6148804239928722, "reward_change_std": 0.25917847361415625, "reward_std": 0.8566170632839203, "rewards/cosine_scaled_reward": -0.039847382344305515, "rewards/format_reward": 0.6041666865348816, "step": 296 }, { "advantage_max": 1.4810718297958374, "advantage_mean": 8.071462442860167e-09, "advantage_min": -0.689074344933033, "advantage_std": 0.7899037785828114, "completion_length": 3057.8125610351562, "epoch": 0.3394285714285714, "grad_norm": 1.0994075536727905, "kl": 0.466552734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.811563736721829e-07, "loss": 0.0888, "reward": -0.15424572816118598, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15424572816118598, "reward_after_std": 0.7899037972092628, "reward_before_mean": 0.03237947775050998, "reward_before_std": 0.7759405970573425, "reward_change_max": 0.0008534565567970276, "reward_change_mean": -0.18662519752979279, "reward_change_min": -0.35279146023094654, "reward_change_std": 0.14328347519040108, "reward_std": 0.7899038307368755, "rewards/cosine_scaled_reward": -0.1921435995027423, "rewards/format_reward": 0.4166666753590107, "step": 297 }, { "advantage_max": 1.3388825692236423, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.6641395017504692, "advantage_std": 0.7280402891337872, "completion_length": 2240.27091217041, "epoch": 0.3405714285714286, "grad_norm": 0.6047587990760803, "kl": 0.290863037109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.780534655386743e-07, "loss": 0.0508, "reward": 0.3340705500449985, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3340705500449985, "reward_after_std": 0.728040311485529, "reward_before_mean": 0.7073127664625645, "reward_before_std": 0.6394331082701683, "reward_change_max": 0.0, "reward_change_mean": -0.3732422087341547, "reward_change_min": -0.6003867406398058, "reward_change_std": 0.22959734685719013, "reward_std": 0.728040311485529, "rewards/cosine_scaled_reward": -0.010926967253908515, "rewards/format_reward": 0.729166679084301, "step": 298 }, { "advantage_max": 1.0907358229160309, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.7735178507864475, "advantage_std": 0.6482625380158424, "completion_length": 2626.7500915527344, "epoch": 0.3417142857142857, "grad_norm": 0.729369580745697, "kl": 0.315673828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.749540639777539e-07, "loss": -0.0082, "reward": 0.482487186556682, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.482487186556682, "reward_after_std": 0.6482625491917133, "reward_before_mean": 0.9276544786989689, "reward_before_std": 0.6089337766170502, "reward_change_max": 0.0, "reward_change_mean": -0.44516725465655327, "reward_change_min": -0.714932031929493, "reward_change_std": 0.2732243649661541, "reward_std": 0.6482625640928745, "rewards/cosine_scaled_reward": 0.0784105621278286, "rewards/format_reward": 0.7708333507180214, "step": 299 }, { "advantage_max": 1.2775380536913872, "advantage_mean": -1.769512955607233e-08, "advantage_min": -0.8028685301542282, "advantage_std": 0.7613370381295681, "completion_length": 2808.166717529297, "epoch": 0.34285714285714286, "grad_norm": 0.932621419429779, "kl": 0.3758544921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.7185832004988133e-07, "loss": -0.0015, "reward": 0.24657689794548787, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24657689794548787, "reward_after_std": 0.7613370008766651, "reward_before_mean": 0.591511320322752, "reward_before_std": 0.7680493295192719, "reward_change_max": 0.0, "reward_change_mean": -0.3449344504624605, "reward_change_min": -0.6845588479191065, "reward_change_std": 0.2547345831990242, "reward_std": 0.7613370381295681, "rewards/cosine_scaled_reward": -0.0792443419340998, "rewards/format_reward": 0.7500000204890966, "step": 300 }, { "advantage_max": 1.2761215642094612, "advantage_mean": -3.1044084525255755e-09, "advantage_min": -0.8102873601019382, "advantage_std": 0.7457788214087486, "completion_length": 2302.5625762939453, "epoch": 0.344, "grad_norm": 0.4439832270145416, "kl": 0.370361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.68766384637248e-07, "loss": 0.0372, "reward": 0.20716924034059048, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20716924034059048, "reward_after_std": 0.7457788102328777, "reward_before_mean": 0.539110149256885, "reward_before_std": 0.7404397539794445, "reward_change_max": 0.0004123970866203308, "reward_change_mean": -0.33194091357290745, "reward_change_min": -0.6133960336446762, "reward_change_std": 0.2363328319042921, "reward_std": 0.7457788325846195, "rewards/cosine_scaled_reward": -0.053361592814326286, "rewards/format_reward": 0.6458333414047956, "step": 301 }, { "advantage_max": 1.0984256006777287, "advantage_mean": -1.4280279292400166e-08, "advantage_min": -0.6192377582192421, "advantage_std": 0.6156471632421017, "completion_length": 2580.812568664551, "epoch": 0.34514285714285714, "grad_norm": 0.5197194218635559, "kl": 0.38067626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.656784084364238e-07, "loss": 0.0277, "reward": 0.3671446368098259, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3671446368098259, "reward_after_std": 0.6156471632421017, "reward_before_mean": 0.7704743854701519, "reward_before_std": 0.5410328283905983, "reward_change_max": 0.0, "reward_change_mean": -0.4033297412097454, "reward_change_min": -0.6593691110610962, "reward_change_std": 0.24221038818359375, "reward_std": 0.6156471930444241, "rewards/cosine_scaled_reward": -0.02101281937211752, "rewards/format_reward": 0.8125000149011612, "step": 302 }, { "advantage_max": 1.365809440612793, "advantage_mean": 1.241763414316921e-09, "advantage_min": -0.6896493546664715, "advantage_std": 0.7584427632391453, "completion_length": 2604.4584197998047, "epoch": 0.3462857142857143, "grad_norm": 1.0810075998306274, "kl": 0.54681396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.6259454195101267e-07, "loss": 0.0158, "reward": 0.0789760680636391, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0789760680636391, "reward_after_std": 0.758442759513855, "reward_before_mean": 0.3580004023388028, "reward_before_std": 0.7291050627827644, "reward_change_max": 0.0002533271908760071, "reward_change_mean": -0.27902431692928076, "reward_change_min": -0.536742877215147, "reward_change_std": 0.20501808635890484, "reward_std": 0.7584427706897259, "rewards/cosine_scaled_reward": -0.14391647558659315, "rewards/format_reward": 0.6458333414047956, "step": 303 }, { "advantage_max": 1.1893558949232101, "advantage_mean": -1.4280279292400166e-08, "advantage_min": -0.6290902234613895, "advantage_std": 0.6690522991120815, "completion_length": 2497.666717529297, "epoch": 0.3474285714285714, "grad_norm": 0.39485907554626465, "kl": 0.443359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.59514935484316e-07, "loss": 0.0437, "reward": 0.2333060341188684, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2333060341188684, "reward_after_std": 0.6690522991120815, "reward_before_mean": 0.5804811692796648, "reward_before_std": 0.6259120032191277, "reward_change_max": 0.00022764503955841064, "reward_change_mean": -0.34717513993382454, "reward_change_min": -0.5779447555541992, "reward_change_std": 0.21656623855233192, "reward_std": 0.6690523251891136, "rewards/cosine_scaled_reward": -0.12642608489841223, "rewards/format_reward": 0.8333333507180214, "step": 304 }, { "advantage_max": 1.1147175133228302, "advantage_mean": 6.20881956958641e-10, "advantage_min": -0.6409763470292091, "advantage_std": 0.6258622892200947, "completion_length": 2925.6250610351562, "epoch": 0.3485714285714286, "grad_norm": 0.4802895784378052, "kl": 0.4498291015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.5643973913200837e-07, "loss": 0.0614, "reward": 0.07837151922285557, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07837151922285557, "reward_after_std": 0.6258622966706753, "reward_before_mean": 0.37614856101572514, "reward_before_std": 0.5923885479569435, "reward_change_max": 0.0028682053089141846, "reward_change_mean": -0.29777705390006304, "reward_change_min": -0.5343084167689085, "reward_change_std": 0.20146138314157724, "reward_std": 0.6258623190224171, "rewards/cosine_scaled_reward": -0.12442572601139545, "rewards/format_reward": 0.6250000093132257, "step": 305 }, { "advantage_max": 1.3190920874476433, "advantage_mean": -1.2417634476236117e-08, "advantage_min": -0.7708496078848839, "advantage_std": 0.7822548560798168, "completion_length": 2487.1458892822266, "epoch": 0.3497142857142857, "grad_norm": 0.4971684217453003, "kl": 0.333984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.5336910277482155e-07, "loss": 0.0417, "reward": 0.5616477080620825, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5616477080620825, "reward_after_std": 0.7822548523545265, "reward_before_mean": 1.0177391674369574, "reward_before_std": 0.7489114031195641, "reward_change_max": 0.0, "reward_change_mean": -0.4560914523899555, "reward_change_min": -0.7948772348463535, "reward_change_std": 0.29982269182801247, "reward_std": 0.7822548858821392, "rewards/cosine_scaled_reward": 0.13386957813054323, "rewards/format_reward": 0.7500000223517418, "step": 306 }, { "advantage_max": 1.3895053565502167, "advantage_mean": -1.1796753296433593e-08, "advantage_min": -0.8578041307628155, "advantage_std": 0.8069791980087757, "completion_length": 2686.4375610351562, "epoch": 0.35085714285714287, "grad_norm": 1.477962613105774, "kl": 0.44000244140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.503031760712397e-07, "loss": 0.0864, "reward": 0.186143385944888, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.186143385944888, "reward_after_std": 0.8069791980087757, "reward_before_mean": 0.5036964304745197, "reward_before_std": 0.8120075799524784, "reward_change_max": 0.0, "reward_change_mean": -0.3175530396401882, "reward_change_min": -0.6073685400187969, "reward_change_std": 0.23993523512035608, "reward_std": 0.8069792166352272, "rewards/cosine_scaled_reward": -0.07106846524402499, "rewards/format_reward": 0.6458333414047956, "step": 307 }, { "advantage_max": 1.0783714689314365, "advantage_mean": 1.3659398112597643e-08, "advantage_min": -0.8203104771673679, "advantage_std": 0.6680172383785248, "completion_length": 2831.5000610351562, "epoch": 0.352, "grad_norm": 0.9542275667190552, "kl": 0.50390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.4724210845020494e-07, "loss": 0.0664, "reward": -0.11247891874518245, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11247891874518245, "reward_after_std": 0.6680172309279442, "reward_before_mean": 0.1211111843585968, "reward_before_std": 0.7149896770715714, "reward_change_max": 7.31348991394043e-05, "reward_change_mean": -0.23359010089188814, "reward_change_min": -0.4645574577152729, "reward_change_std": 0.20687062293291092, "reward_std": 0.6680172458291054, "rewards/cosine_scaled_reward": -0.21027773432433605, "rewards/format_reward": 0.5416666809469461, "step": 308 }, { "advantage_max": 1.5179293677210808, "advantage_mean": -2.048909714114089e-08, "advantage_min": -0.8129667229950428, "advantage_std": 0.8496960774064064, "completion_length": 2704.0209350585938, "epoch": 0.35314285714285715, "grad_norm": 0.5584425330162048, "kl": 0.4093017578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.441860491038345e-07, "loss": 0.0375, "reward": 0.24515102710574865, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24515102710574865, "reward_after_std": 0.8496960625052452, "reward_before_mean": 0.5750349033623934, "reward_before_std": 0.8240993618965149, "reward_change_max": 0.0006732270121574402, "reward_change_mean": -0.3298839027993381, "reward_change_min": -0.6384319141507149, "reward_change_std": 0.239481333643198, "reward_std": 0.8496960885822773, "rewards/cosine_scaled_reward": -0.06664922530762851, "rewards/format_reward": 0.7083333414047956, "step": 309 }, { "advantage_max": 1.1005538403987885, "advantage_mean": -8.381903476850638e-09, "advantage_min": -0.5918949320912361, "advantage_std": 0.6197176352143288, "completion_length": 2167.2708892822266, "epoch": 0.35428571428571426, "grad_norm": 0.3327484726905823, "kl": 0.37042236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.4113514698014953e-07, "loss": 0.0355, "reward": 0.08579268056200817, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08579268056200817, "reward_after_std": 0.6197176389396191, "reward_before_mean": 0.38630370143800974, "reward_before_std": 0.5802172459661961, "reward_change_max": 0.0003741607069969177, "reward_change_mean": -0.3005110053345561, "reward_change_min": -0.5397593379020691, "reward_change_std": 0.1986795188859105, "reward_std": 0.6197176463901997, "rewards/cosine_scaled_reward": -0.18184817489236593, "rewards/format_reward": 0.7500000111758709, "step": 310 }, { "advantage_max": 1.617615930736065, "advantage_mean": -2.4835269396561444e-08, "advantage_min": -1.0272089317440987, "advantage_std": 0.9101916253566742, "completion_length": 2331.1459045410156, "epoch": 0.3554285714285714, "grad_norm": 0.9486208558082581, "kl": 0.318389892578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3808955077581546e-07, "loss": 0.0806, "reward": 0.6239680799189955, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6239680799189955, "reward_after_std": 0.9101915881037712, "reward_before_mean": 1.0810477957129478, "reward_before_std": 0.8660503253340721, "reward_change_max": 0.0, "reward_change_mean": -0.45707971416413784, "reward_change_min": -0.7482115887105465, "reward_change_std": 0.2896172106266022, "reward_std": 0.9101916253566742, "rewards/cosine_scaled_reward": 0.1238572234287858, "rewards/format_reward": 0.8333333507180214, "step": 311 }, { "advantage_max": 1.2817089334130287, "advantage_mean": -2.1730860721991263e-08, "advantage_min": -0.692761953920126, "advantage_std": 0.7200740836560726, "completion_length": 2084.145896911621, "epoch": 0.3565714285714286, "grad_norm": 0.6255805492401123, "kl": 0.417327880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.350494089288943e-07, "loss": 0.037, "reward": 0.521587735041976, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.521587735041976, "reward_after_std": 0.7200741060078144, "reward_before_mean": 0.9654922960326076, "reward_before_std": 0.6313992403447628, "reward_change_max": 0.00043123960494995117, "reward_change_mean": -0.4439045786857605, "reward_change_min": -0.6904230825603008, "reward_change_std": 0.26942718401551247, "reward_std": 0.720074113458395, "rewards/cosine_scaled_reward": 0.05566280521452427, "rewards/format_reward": 0.8541666716337204, "step": 312 }, { "advantage_max": 1.5071654841303825, "advantage_mean": -4.6566128730773926e-09, "advantage_min": -0.8955875560641289, "advantage_std": 0.8493525125086308, "completion_length": 2596.81258392334, "epoch": 0.3577142857142857, "grad_norm": 1.359320044517517, "kl": 0.553924560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3201486961161093e-07, "loss": 0.0083, "reward": 0.5998956672847271, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5998956672847271, "reward_after_std": 0.8493525348603725, "reward_before_mean": 1.054509874433279, "reward_before_std": 0.7884119637310505, "reward_change_max": 0.00043067336082458496, "reward_change_mean": -0.4546141605824232, "reward_change_min": -0.7211263254284859, "reward_change_std": 0.2862904816865921, "reward_std": 0.849352553486824, "rewards/cosine_scaled_reward": 0.1105882371775806, "rewards/format_reward": 0.8333333507180214, "step": 313 }, { "advantage_max": 1.2207506373524666, "advantage_mean": -2.17308601113686e-08, "advantage_min": -0.8295302875339985, "advantage_std": 0.718177042901516, "completion_length": 2326.1459045410156, "epoch": 0.3588571428571429, "grad_norm": 0.5819482207298279, "kl": 0.472625732421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2898608072313045e-07, "loss": 0.0495, "reward": 0.5721183009445667, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5721183009445667, "reward_after_std": 0.7181770466268063, "reward_before_mean": 1.0381627357564867, "reward_before_std": 0.6686963923275471, "reward_change_max": 0.0, "reward_change_mean": -0.4660444292239845, "reward_change_min": -0.7502425312995911, "reward_change_std": 0.29036774951964617, "reward_std": 0.7181770764291286, "rewards/cosine_scaled_reward": 0.1024146843701601, "rewards/format_reward": 0.833333333954215, "step": 314 }, { "advantage_max": 1.541440226137638, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.7856865674257278, "advantage_std": 0.8436717689037323, "completion_length": 2587.687545776367, "epoch": 0.36, "grad_norm": 0.8492938280105591, "kl": 0.5152587890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2596318988235037e-07, "loss": 0.0423, "reward": 0.2977000498212874, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2977000498212874, "reward_after_std": 0.8436717763543129, "reward_before_mean": 0.6422043442726135, "reward_before_std": 0.7983835525810719, "reward_change_max": 0.00015465915203094482, "reward_change_mean": -0.3445042800158262, "reward_change_min": -0.579395305365324, "reward_change_std": 0.23542877286672592, "reward_std": 0.8436717912554741, "rewards/cosine_scaled_reward": -0.04348117241170257, "rewards/format_reward": 0.7291666902601719, "step": 315 }, { "advantage_max": 1.1138798967003822, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -0.6890594810247421, "advantage_std": 0.6519434526562691, "completion_length": 3087.9376220703125, "epoch": 0.36114285714285715, "grad_norm": 0.9313336610794067, "kl": 0.517822265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2294634442070553e-07, "loss": 0.0346, "reward": 0.045592143665999174, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.045592143665999174, "reward_after_std": 0.6519434675574303, "reward_before_mean": 0.33301879093050957, "reward_before_std": 0.649714894592762, "reward_change_max": 0.0, "reward_change_mean": -0.28742665797472, "reward_change_min": -0.5346714928746223, "reward_change_std": 0.2103767693042755, "reward_std": 0.6519434861838818, "rewards/cosine_scaled_reward": -0.198073947802186, "rewards/format_reward": 0.7291666902601719, "step": 316 }, { "advantage_max": 1.530574381351471, "advantage_mean": -8.07146216530441e-09, "advantage_min": -1.0632232949137688, "advantage_std": 0.9659229815006256, "completion_length": 2734.979232788086, "epoch": 0.36228571428571427, "grad_norm": 0.9145365953445435, "kl": 0.66119384765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1993569137498776e-07, "loss": 0.0646, "reward": 0.2181464321911335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2181464321911335, "reward_after_std": 0.9659229628741741, "reward_before_mean": 0.5356616548378952, "reward_before_std": 1.0483863092958927, "reward_change_max": 0.00044549256563186646, "reward_change_mean": -0.31751522794365883, "reward_change_min": -0.7150661610066891, "reward_change_std": 0.3014825861901045, "reward_std": 0.9659229926764965, "rewards/cosine_scaled_reward": -0.05508585087954998, "rewards/format_reward": 0.6458333469927311, "step": 317 }, { "advantage_max": 1.8403086960315704, "advantage_mean": -3.042320562141043e-08, "advantage_min": -0.8629412576556206, "advantage_std": 0.9772805720567703, "completion_length": 2202.8334045410156, "epoch": 0.36342857142857143, "grad_norm": 0.6073242425918579, "kl": 0.37530517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1693137748017915e-07, "loss": 0.0146, "reward": 0.46245671808719635, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46245671808719635, "reward_after_std": 0.9772805646061897, "reward_before_mean": 0.8472766764461994, "reward_before_std": 0.8918176591396332, "reward_change_max": 0.0, "reward_change_mean": -0.38481999561190605, "reward_change_min": -0.6207333505153656, "reward_change_std": 0.23993216827511787, "reward_std": 0.9772805869579315, "rewards/cosine_scaled_reward": -0.01386166550219059, "rewards/format_reward": 0.8750000149011612, "step": 318 }, { "advantage_max": 1.1768467128276825, "advantage_mean": -9.313225912688239e-09, "advantage_min": -0.6424081809818745, "advantage_std": 0.6727948747575283, "completion_length": 2646.0625915527344, "epoch": 0.36457142857142855, "grad_norm": 0.823601245880127, "kl": 0.4171142578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1393354916230005e-07, "loss": 0.0132, "reward": 0.19451459869742393, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19451459869742393, "reward_after_std": 0.6727948673069477, "reward_before_mean": 0.5286380238831043, "reward_before_std": 0.6407083608210087, "reward_change_max": 0.0, "reward_change_mean": -0.33412344940006733, "reward_change_min": -0.583837378770113, "reward_change_std": 0.2194485031068325, "reward_std": 0.6727948747575283, "rewards/cosine_scaled_reward": -0.14193099546901067, "rewards/format_reward": 0.8125000149011612, "step": 319 }, { "advantage_max": 1.5140546262264252, "advantage_mean": -1.179675274132208e-08, "advantage_min": -0.8545507602393627, "advantage_std": 0.8494656905531883, "completion_length": 1993.4791946411133, "epoch": 0.3657142857142857, "grad_norm": 0.4081292748451233, "kl": 0.28387451171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1094235253127374e-07, "loss": 0.0282, "reward": 0.45085313729941845, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.45085313729941845, "reward_after_std": 0.8494656905531883, "reward_before_mean": 0.8511345311999321, "reward_before_std": 0.7997904568910599, "reward_change_max": 0.00048110634088516235, "reward_change_mean": -0.4002814181149006, "reward_change_min": -0.687841709703207, "reward_change_std": 0.25973583571612835, "reward_std": 0.8494657054543495, "rewards/cosine_scaled_reward": -0.02234938833862543, "rewards/format_reward": 0.8958333507180214, "step": 320 }, { "advantage_max": 1.6693382859230042, "advantage_mean": -8.071462331837864e-09, "advantage_min": -0.9148355275392532, "advantage_std": 0.9250027239322662, "completion_length": 2250.4583892822266, "epoch": 0.3668571428571429, "grad_norm": 0.8360119462013245, "kl": 0.370880126953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.079579333738039e-07, "loss": 0.054, "reward": 0.5935531364229973, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5935531364229973, "reward_after_std": 0.9250027164816856, "reward_before_mean": 1.0354133360087872, "reward_before_std": 0.8597222454845905, "reward_change_max": 0.0005154833197593689, "reward_change_mean": -0.4418601803481579, "reward_change_min": -0.750252865254879, "reward_change_std": 0.28662803769111633, "reward_std": 0.9250027611851692, "rewards/cosine_scaled_reward": 0.0697899884544313, "rewards/format_reward": 0.895833358168602, "step": 321 }, { "advantage_max": 1.4949319586157799, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.6884929984807968, "advantage_std": 0.8232560381293297, "completion_length": 2184.0417404174805, "epoch": 0.368, "grad_norm": 1.4889612197875977, "kl": 0.39019775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.0498043714627006e-07, "loss": 0.0637, "reward": 0.5103768724948168, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5103768724948168, "reward_after_std": 0.8232560381293297, "reward_before_mean": 0.9297751300036907, "reward_before_std": 0.7539035603404045, "reward_change_max": 0.0, "reward_change_mean": -0.4193982556462288, "reward_change_min": -0.7438863329589367, "reward_change_std": 0.26376455649733543, "reward_std": 0.8232560753822327, "rewards/cosine_scaled_reward": 0.027387551497668028, "rewards/format_reward": 0.8750000074505806, "step": 322 }, { "advantage_max": 1.387131430208683, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.9888906627893448, "advantage_std": 0.8229181803762913, "completion_length": 2645.2709197998047, "epoch": 0.36914285714285716, "grad_norm": 1.0869429111480713, "kl": 0.33551025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.020100089676376e-07, "loss": 0.0078, "reward": 0.4324415000155568, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4324415000155568, "reward_after_std": 0.8229181692004204, "reward_before_mean": 0.8359145727008581, "reward_before_std": 0.8219218775629997, "reward_change_max": 0.00012808293104171753, "reward_change_mean": -0.4034730438143015, "reward_change_min": -0.695304848253727, "reward_change_std": 0.27555028162896633, "reward_std": 0.8229181692004204, "rewards/cosine_scaled_reward": -0.009126080200076103, "rewards/format_reward": 0.8541666865348816, "step": 323 }, { "advantage_max": 1.194041632115841, "advantage_mean": -6.208817071584605e-09, "advantage_min": -0.6673163548111916, "advantage_std": 0.6746069677174091, "completion_length": 2896.729263305664, "epoch": 0.3702857142857143, "grad_norm": 2.350710868835449, "kl": 0.572998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.9904679361238526e-07, "loss": 0.0048, "reward": -0.01826178189367056, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.01826178189367056, "reward_after_std": 0.6746069677174091, "reward_before_mean": 0.23842260986566544, "reward_before_std": 0.6667893007397652, "reward_change_max": 0.0, "reward_change_mean": -0.2566843908280134, "reward_change_min": -0.5075510442256927, "reward_change_std": 0.19278783723711967, "reward_std": 0.67460697889328, "rewards/cosine_scaled_reward": -0.21412204019725323, "rewards/format_reward": 0.6666666828095913, "step": 324 }, { "advantage_max": 1.063421942293644, "advantage_mean": 8.692344399818808e-09, "advantage_min": -0.5553975813090801, "advantage_std": 0.600383810698986, "completion_length": 2946.7500534057617, "epoch": 0.37142857142857144, "grad_norm": 1.726870059967041, "kl": 0.43701171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.9609093550344907e-07, "loss": 0.0116, "reward": 0.06526811327785254, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06526811327785254, "reward_after_std": 0.6003838032484055, "reward_before_mean": 0.36415105359628797, "reward_before_std": 0.5600848067551851, "reward_change_max": 0.0, "reward_change_mean": -0.29888293612748384, "reward_change_min": -0.5316140241920948, "reward_change_std": 0.19572586566209793, "reward_std": 0.600383810698986, "rewards/cosine_scaled_reward": -0.2033411506563425, "rewards/format_reward": 0.7708333414047956, "step": 325 }, { "advantage_max": 1.0879326313734055, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.7306830547749996, "advantage_std": 0.6596686951816082, "completion_length": 2455.104217529297, "epoch": 0.37257142857142855, "grad_norm": 1.0197948217391968, "kl": 0.259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.931425787051832e-07, "loss": 0.0394, "reward": 0.2782456114073284, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2782456114073284, "reward_after_std": 0.6596686840057373, "reward_before_mean": 0.6505864365026355, "reward_before_std": 0.6487043891102076, "reward_change_max": 0.0, "reward_change_mean": -0.3723407946527004, "reward_change_min": -0.6594884321093559, "reward_change_std": 0.25413103960454464, "reward_std": 0.6596687138080597, "rewards/cosine_scaled_reward": -0.08095679804682732, "rewards/format_reward": 0.8125000149011612, "step": 326 }, { "advantage_max": 1.2405484095215797, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.6985969394445419, "advantage_std": 0.695731807500124, "completion_length": 2579.791732788086, "epoch": 0.3737142857142857, "grad_norm": 0.5163988471031189, "kl": 0.351715087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.902018669163384e-07, "loss": 0.0303, "reward": 0.534046346321702, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.534046346321702, "reward_after_std": 0.6957318224012852, "reward_before_mean": 0.9854510715231299, "reward_before_std": 0.6119641549885273, "reward_change_max": 0.0004111677408218384, "reward_change_mean": -0.4514046907424927, "reward_change_min": -0.7134763710200787, "reward_change_std": 0.27318305894732475, "reward_std": 0.6957318410277367, "rewards/cosine_scaled_reward": 0.07605885528028011, "rewards/format_reward": 0.8333333432674408, "step": 327 }, { "advantage_max": 1.5700553804636002, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7542428597807884, "advantage_std": 0.8464733026921749, "completion_length": 3031.8958435058594, "epoch": 0.37485714285714283, "grad_norm": 0.5918129682540894, "kl": 0.3701171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.872689434630585e-07, "loss": 0.0157, "reward": 0.1579922076780349, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1579922076780349, "reward_after_std": 0.846473291516304, "reward_before_mean": 0.4518141821026802, "reward_before_std": 0.8057435862720013, "reward_change_max": 0.00017704814672470093, "reward_change_mean": -0.29382197093218565, "reward_change_min": -0.5395024605095387, "reward_change_std": 0.2040925696492195, "reward_std": 0.8464733026921749, "rewards/cosine_scaled_reward": -0.06575959082692862, "rewards/format_reward": 0.5833333469927311, "step": 328 }, { "advantage_max": 1.4207844957709312, "advantage_mean": 6.8296994726324556e-09, "advantage_min": -0.958678312599659, "advantage_std": 0.8764909096062183, "completion_length": 2077.437572479248, "epoch": 0.376, "grad_norm": 0.4301084578037262, "kl": 0.289764404296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.843439512918949e-07, "loss": 0.0393, "reward": 0.46948680374771357, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.46948680374771357, "reward_after_std": 0.8764909207820892, "reward_before_mean": 0.8864495474845171, "reward_before_std": 0.9092329628765583, "reward_change_max": 0.0, "reward_change_mean": -0.416962718591094, "reward_change_min": -0.8044983074069023, "reward_change_std": 0.30942643620073795, "reward_std": 0.8764909394085407, "rewards/cosine_scaled_reward": 0.02655809542920906, "rewards/format_reward": 0.8333333432674408, "step": 329 }, { "advantage_max": 1.0345421582460403, "advantage_mean": -4.967053990334591e-09, "advantage_min": -0.5925486199557781, "advantage_std": 0.5856764316558838, "completion_length": 2163.291717529297, "epoch": 0.37714285714285717, "grad_norm": 0.9790158867835999, "kl": 0.21964263916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.8142703296283953e-07, "loss": -0.0266, "reward": 0.061649966053664684, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.061649966053664684, "reward_after_std": 0.5856764391064644, "reward_before_mean": 0.3572140075266361, "reward_before_std": 0.5511214323341846, "reward_change_max": 0.0014399513602256775, "reward_change_mean": -0.2955640386790037, "reward_change_min": -0.5271896719932556, "reward_change_std": 0.19697471428662539, "reward_std": 0.5856764689087868, "rewards/cosine_scaled_reward": -0.22764301113784313, "rewards/format_reward": 0.8125000055879354, "step": 330 }, { "advantage_max": 1.266398135572672, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.6501773856580257, "advantage_std": 0.7007499448955059, "completion_length": 2759.958381652832, "epoch": 0.3782857142857143, "grad_norm": 0.4184553623199463, "kl": 0.295257568359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.785183306423767e-07, "loss": 0.0313, "reward": 0.031125844456255436, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.031125844456255436, "reward_after_std": 0.7007499560713768, "reward_before_mean": 0.30181864835321903, "reward_before_std": 0.6772023364901543, "reward_change_max": 0.00038214027881622314, "reward_change_mean": -0.2706927992403507, "reward_change_min": -0.5030109845101833, "reward_change_std": 0.18806026875972748, "reward_std": 0.7007499635219574, "rewards/cosine_scaled_reward": -0.15117402002215385, "rewards/format_reward": 0.6041666716337204, "step": 331 }, { "advantage_max": 1.2278646379709244, "advantage_mean": -1.2107193636534674e-08, "advantage_min": -0.9190050959587097, "advantage_std": 0.7645938657224178, "completion_length": 2374.4583892822266, "epoch": 0.37942857142857145, "grad_norm": 1.026884913444519, "kl": 0.20819091796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.7561798609655373e-07, "loss": 0.0254, "reward": 0.26205848407698795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26205848407698795, "reward_after_std": 0.764593880623579, "reward_before_mean": 0.6165533754974604, "reward_before_std": 0.7967246845364571, "reward_change_max": 0.00045099854469299316, "reward_change_mean": -0.3544948771595955, "reward_change_min": -0.6746104620397091, "reward_change_std": 0.27001148648560047, "reward_std": 0.7645938880741596, "rewards/cosine_scaled_reward": -0.056306662037968636, "rewards/format_reward": 0.7291666828095913, "step": 332 }, { "advantage_max": 1.4431308880448341, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.721727728843689, "advantage_std": 0.7957188636064529, "completion_length": 2322.1458892822266, "epoch": 0.38057142857142856, "grad_norm": 0.2893451452255249, "kl": 0.182647705078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.72726140684072e-07, "loss": 0.0226, "reward": 0.37836203817278147, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37836203817278147, "reward_after_std": 0.7957188822329044, "reward_before_mean": 0.7605113498866558, "reward_before_std": 0.7327791415154934, "reward_change_max": 0.0002195313572883606, "reward_change_mean": -0.3821493051946163, "reward_change_min": -0.6533030718564987, "reward_change_std": 0.2390086129307747, "reward_std": 0.7957189120352268, "rewards/cosine_scaled_reward": -0.08849434833973646, "rewards/format_reward": 0.9375000149011612, "step": 333 }, { "advantage_max": 1.0318748727440834, "advantage_mean": -1.1796752574788627e-08, "advantage_min": -0.7106717079877853, "advantage_std": 0.6053128764033318, "completion_length": 3011.041748046875, "epoch": 0.38171428571428573, "grad_norm": 0.4309418201446533, "kl": 0.30096435546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6984293534939737e-07, "loss": 0.0272, "reward": 0.042655323166400194, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.042655323166400194, "reward_after_std": 0.6053128689527512, "reward_before_mean": 0.33556643780320883, "reward_before_std": 0.6005287803709507, "reward_change_max": 0.0002907887101173401, "reward_change_mean": -0.29291114024817944, "reward_change_min": -0.5190308559685946, "reward_change_std": 0.20768418349325657, "reward_std": 0.6053128838539124, "rewards/cosine_scaled_reward": -0.17596679739654064, "rewards/format_reward": 0.687500013038516, "step": 334 }, { "advantage_max": 1.6594262346625328, "advantage_mean": 1.241763464276957e-08, "advantage_min": -0.8757152184844017, "advantage_std": 0.9582944251596928, "completion_length": 2611.4584197998047, "epoch": 0.38285714285714284, "grad_norm": 0.9186263680458069, "kl": 0.2675628662109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6696851061588994e-07, "loss": 0.0586, "reward": 0.32363066729158163, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32363066729158163, "reward_after_std": 0.9582944102585316, "reward_before_mean": 0.6715507498010993, "reward_before_std": 0.9660878330469131, "reward_change_max": 0.0006835907697677612, "reward_change_mean": -0.3479200517758727, "reward_change_min": -0.7065860964357853, "reward_change_std": 0.27568814530968666, "reward_std": 0.9582944326102734, "rewards/cosine_scaled_reward": 0.002442028373479843, "rewards/format_reward": 0.6666666753590107, "step": 335 }, { "advantage_max": 1.6442746073007584, "advantage_mean": 3.7252906315288215e-09, "advantage_min": -0.803277987986803, "advantage_std": 0.9076324515044689, "completion_length": 2992.979217529297, "epoch": 0.384, "grad_norm": 0.8076133728027344, "kl": 0.2337646484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.641030065789562e-07, "loss": 0.0422, "reward": 0.41128079127520323, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41128079127520323, "reward_after_std": 0.907632440328598, "reward_before_mean": 0.7877331438940018, "reward_before_std": 0.861430436372757, "reward_change_max": 0.0009226053953170776, "reward_change_mean": -0.3764523593708873, "reward_change_min": -0.683083888143301, "reward_change_std": 0.2502218373119831, "reward_std": 0.9076324477791786, "rewards/cosine_scaled_reward": 0.008449911139905453, "rewards/format_reward": 0.7708333544433117, "step": 336 }, { "advantage_max": 1.3428704738616943, "advantage_mean": -1.2417637473838283e-09, "advantage_min": -0.8602436855435371, "advantage_std": 0.7921734526753426, "completion_length": 2696.7709045410156, "epoch": 0.3851428571428571, "grad_norm": 0.777777373790741, "kl": 0.247222900390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.612465628992203e-07, "loss": -0.0119, "reward": 0.3240007753483951, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3240007753483951, "reward_after_std": 0.7921734377741814, "reward_before_mean": 0.6951221290510148, "reward_before_std": 0.7921620719134808, "reward_change_max": 0.0005899891257286072, "reward_change_mean": -0.3711213394999504, "reward_change_min": -0.6649631895124912, "reward_change_std": 0.26434677839279175, "reward_std": 0.7921734377741814, "rewards/cosine_scaled_reward": -0.0691056028008461, "rewards/format_reward": 0.8333333358168602, "step": 337 }, { "advantage_max": 1.409317284822464, "advantage_mean": 6.208820124697922e-10, "advantage_min": -0.6859129592776299, "advantage_std": 0.7859718762338161, "completion_length": 2161.1250610351562, "epoch": 0.3862857142857143, "grad_norm": 0.6067075133323669, "kl": 0.23406982421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.5839931879571725e-07, "loss": 0.0089, "reward": 0.4811843913048506, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4811843913048506, "reward_after_std": 0.7859718687832355, "reward_before_mean": 0.9017840125598013, "reward_before_std": 0.7260171696543694, "reward_change_max": 0.0, "reward_change_mean": -0.42059963941574097, "reward_change_min": -0.7111620083451271, "reward_change_std": 0.26399547420442104, "reward_std": 0.7859718948602676, "rewards/cosine_scaled_reward": 0.0029753390699625015, "rewards/format_reward": 0.8958333358168602, "step": 338 }, { "advantage_max": 0.8729583323001862, "advantage_mean": 1.2417630257388623e-09, "advantage_min": -0.6809690743684769, "advantage_std": 0.538725059479475, "completion_length": 3008.0625610351562, "epoch": 0.38742857142857146, "grad_norm": 0.35310637950897217, "kl": 0.2740478515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.555614130391079e-07, "loss": 0.0412, "reward": -0.032884322106838226, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.032884322106838226, "reward_after_std": 0.5387250632047653, "reward_before_mean": 0.24240444134920835, "reward_before_std": 0.5509735830128193, "reward_change_max": 0.0, "reward_change_mean": -0.27528876066207886, "reward_change_min": -0.4679707083851099, "reward_change_std": 0.1936870813369751, "reward_std": 0.5387250930070877, "rewards/cosine_scaled_reward": -0.17046445421874523, "rewards/format_reward": 0.5833333525806665, "step": 339 }, { "advantage_max": 1.3509134501218796, "advantage_mean": -6.829699139565548e-09, "advantage_min": -0.9202584028244019, "advantage_std": 0.8368659019470215, "completion_length": 2576.7708587646484, "epoch": 0.38857142857142857, "grad_norm": 0.6532253623008728, "kl": 0.2698974609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.5273298394491515e-07, "loss": 0.0479, "reward": 0.22542057232931256, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22542057232931256, "reward_after_std": 0.8368658944964409, "reward_before_mean": 0.5590503867715597, "reward_before_std": 0.8876641169190407, "reward_change_max": 0.00047085434198379517, "reward_change_mean": -0.3336298204958439, "reward_change_min": -0.6656833551824093, "reward_change_std": 0.2725123818963766, "reward_std": 0.8368659354746342, "rewards/cosine_scaled_reward": -0.10589147731661797, "rewards/format_reward": 0.7708333507180214, "step": 340 }, { "advantage_max": 1.4389492645859718, "advantage_mean": -5.587936058315535e-09, "advantage_min": -0.8931010887026787, "advantage_std": 0.8441877365112305, "completion_length": 2549.479202270508, "epoch": 0.38971428571428574, "grad_norm": 0.7518829703330994, "kl": 0.2005615234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4991416936678276e-07, "loss": 0.0284, "reward": 0.6108754873275757, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6108754873275757, "reward_after_std": 0.8441877439618111, "reward_before_mean": 1.074859730899334, "reward_before_std": 0.790581751614809, "reward_change_max": 0.00037673860788345337, "reward_change_mean": -0.4639842379838228, "reward_change_min": -0.7642018236219883, "reward_change_std": 0.31235019117593765, "reward_std": 0.844187755137682, "rewards/cosine_scaled_reward": 0.17284652451053262, "rewards/format_reward": 0.7291666828095913, "step": 341 }, { "advantage_max": 1.3784963935613632, "advantage_mean": -1.55220432618286e-08, "advantage_min": -0.8081448301672935, "advantage_std": 0.7640027962625027, "completion_length": 2857.12508392334, "epoch": 0.39085714285714285, "grad_norm": 1.3091081380844116, "kl": 0.37322998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.471051066897562e-07, "loss": 0.0775, "reward": 0.2577698500826955, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2577698500826955, "reward_after_std": 0.7640027962625027, "reward_before_mean": 0.600545659661293, "reward_before_std": 0.7238595262169838, "reward_change_max": 0.00037366151809692383, "reward_change_mean": -0.3427758179605007, "reward_change_min": -0.6013978645205498, "reward_change_std": 0.22922171279788017, "reward_std": 0.7640028148889542, "rewards/cosine_scaled_reward": -0.13722718448843807, "rewards/format_reward": 0.8750000223517418, "step": 342 }, { "advantage_max": 1.09914218634367, "advantage_mean": -4.6566095424083187e-10, "advantage_min": -0.7929297350347042, "advantage_std": 0.6592183811590075, "completion_length": 2881.229248046875, "epoch": 0.392, "grad_norm": 0.8948997259140015, "kl": 0.30389404296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4430593282358777e-07, "loss": 0.0518, "reward": 0.15604414325207472, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15604414325207472, "reward_after_std": 0.6592183588072658, "reward_before_mean": 0.4839049391448498, "reward_before_std": 0.6639570314437151, "reward_change_max": 0.0, "reward_change_mean": -0.3278607833199203, "reward_change_min": -0.5611877627670765, "reward_change_std": 0.22848687833175063, "reward_std": 0.6592183625325561, "rewards/cosine_scaled_reward": -0.07054753974080086, "rewards/format_reward": 0.6250000149011612, "step": 343 }, { "advantage_max": 1.1419403105974197, "advantage_mean": -1.9247334503980085e-08, "advantage_min": -0.8298167437314987, "advantage_std": 0.7062124721705914, "completion_length": 2314.75008392334, "epoch": 0.3931428571428571, "grad_norm": 0.3817563056945801, "kl": 0.208465576171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4151678419606233e-07, "loss": 0.0339, "reward": 0.5916862864978611, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5916862864978611, "reward_after_std": 0.7062124647200108, "reward_before_mean": 1.0728607330238447, "reward_before_std": 0.6833640523254871, "reward_change_max": 0.0007588937878608704, "reward_change_mean": -0.48117440938949585, "reward_change_min": -0.7631802037358284, "reward_change_std": 0.30804815515875816, "reward_std": 0.7062124907970428, "rewards/cosine_scaled_reward": 0.1197636779397726, "rewards/format_reward": 0.8333333358168602, "step": 344 }, { "advantage_max": 1.2474537640810013, "advantage_mean": -2.7939678071131624e-08, "advantage_min": -0.850233644247055, "advantage_std": 0.7378453128039837, "completion_length": 2768.6250610351562, "epoch": 0.3942857142857143, "grad_norm": 0.7111788392066956, "kl": 0.31512451171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.387377967463493e-07, "loss": 0.0513, "reward": 0.7365547483786941, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7365547483786941, "reward_after_std": 0.7378453128039837, "reward_before_mean": 1.2583894599229097, "reward_before_std": 0.6701792404055595, "reward_change_max": 0.0, "reward_change_mean": -0.5218347907066345, "reward_change_min": -0.8273529559373856, "reward_change_std": 0.31641891226172447, "reward_std": 0.7378453314304352, "rewards/cosine_scaled_reward": 0.19169476255774498, "rewards/format_reward": 0.8750000149011612, "step": 345 }, { "advantage_max": 1.2420702129602432, "advantage_mean": -8.692343955729598e-09, "advantage_min": -0.8061654381453991, "advantage_std": 0.7235353961586952, "completion_length": 2846.0209197998047, "epoch": 0.3954285714285714, "grad_norm": 1.0861871242523193, "kl": 0.507080078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.359691059183761e-07, "loss": 0.0173, "reward": 0.304742424399592, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.304742424399592, "reward_after_std": 0.7235354036092758, "reward_before_mean": 0.6732838414609432, "reward_before_std": 0.7006465289741755, "reward_change_max": 0.0003463476896286011, "reward_change_mean": -0.36854144744575024, "reward_change_min": -0.6262233816087246, "reward_change_std": 0.2468447219580412, "reward_std": 0.723535418510437, "rewards/cosine_scaled_reward": -0.059191410429775715, "rewards/format_reward": 0.791666679084301, "step": 346 }, { "advantage_max": 1.2352950349450111, "advantage_mean": 4.346172310931706e-09, "advantage_min": -0.6556868124753237, "advantage_std": 0.6889407336711884, "completion_length": 2864.6250915527344, "epoch": 0.3965714285714286, "grad_norm": 0.6275542378425598, "kl": 0.4534912109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.3321084665422803e-07, "loss": 0.055, "reward": 0.13871129555627704, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13871129555627704, "reward_after_std": 0.6889407597482204, "reward_before_mean": 0.4508918561041355, "reward_before_std": 0.6534276064485312, "reward_change_max": 0.0008821189403533936, "reward_change_mean": -0.312180545181036, "reward_change_min": -0.5465703941881657, "reward_change_std": 0.21107300743460655, "reward_std": 0.6889407970011234, "rewards/cosine_scaled_reward": -0.18080408312380314, "rewards/format_reward": 0.8125000149011612, "step": 347 }, { "advantage_max": 1.0376718565821648, "advantage_mean": -1.6142925329809543e-08, "advantage_min": -0.5380270965397358, "advantage_std": 0.5884160585701466, "completion_length": 2268.916732788086, "epoch": 0.3977142857142857, "grad_norm": 0.8303443193435669, "kl": 0.32196044921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.3046315338757026e-07, "loss": 0.0055, "reward": 0.2341640405356884, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2341640405356884, "reward_after_std": 0.5884160622954369, "reward_before_mean": 0.5881709158420563, "reward_before_std": 0.537275068461895, "reward_change_max": 0.0, "reward_change_mean": -0.35400689020752907, "reward_change_min": -0.5834939926862717, "reward_change_std": 0.21537455171346664, "reward_std": 0.588416077196598, "rewards/cosine_scaled_reward": -0.1746645476669073, "rewards/format_reward": 0.9375000149011612, "step": 348 }, { "advantage_max": 1.6154710426926613, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.7679126597940922, "advantage_std": 0.8989286422729492, "completion_length": 2812.8125915527344, "epoch": 0.39885714285714285, "grad_norm": 1.0645766258239746, "kl": 0.5093994140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.2772616003709616e-07, "loss": 0.092, "reward": 0.19876206829212606, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19876206829212606, "reward_after_std": 0.8989286720752716, "reward_before_mean": 0.5034147594124079, "reward_before_std": 0.8784813545644283, "reward_change_max": 0.0, "reward_change_mean": -0.3046526936814189, "reward_change_min": -0.5808026678860188, "reward_change_std": 0.22330620884895325, "reward_std": 0.8989286981523037, "rewards/cosine_scaled_reward": -0.08162595890462399, "rewards/format_reward": 0.6666666809469461, "step": 349 }, { "advantage_max": 1.4590798914432526, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.8784847110509872, "advantage_std": 0.8449890464544296, "completion_length": 2441.9792556762695, "epoch": 0.4, "grad_norm": 1.5493332147598267, "kl": 0.555908203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.250000000000001e-07, "loss": 0.015, "reward": 0.3524491051211953, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3524491051211953, "reward_after_std": 0.8449890315532684, "reward_before_mean": 0.7244518399238586, "reward_before_std": 0.8319392949342728, "reward_change_max": 0.0, "reward_change_mean": -0.3720027543604374, "reward_change_min": -0.701644878834486, "reward_change_std": 0.2610305938869715, "reward_std": 0.8449890613555908, "rewards/cosine_scaled_reward": -0.07527408562600613, "rewards/format_reward": 0.8750000223517418, "step": 350 }, { "advantage_max": 1.1394439861178398, "advantage_mean": -3.7252904983020585e-08, "advantage_min": -0.6002353355288506, "advantage_std": 0.6371245309710503, "completion_length": 2274.8125610351562, "epoch": 0.40114285714285713, "grad_norm": 0.8637124300003052, "kl": 0.306060791015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.222848061454764e-07, "loss": -0.0094, "reward": 0.4534956933930516, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4534956933930516, "reward_after_std": 0.63712452724576, "reward_before_mean": 0.8838562555611134, "reward_before_std": 0.5522509068250656, "reward_change_max": 0.0, "reward_change_mean": -0.4303605705499649, "reward_change_min": -0.6738534830510616, "reward_change_std": 0.2491367096081376, "reward_std": 0.6371245495975018, "rewards/cosine_scaled_reward": 0.0044281138107180595, "rewards/format_reward": 0.875, "step": 351 }, { "advantage_max": 1.2284162268042564, "advantage_mean": -1.1796752907855534e-08, "advantage_min": -0.6765665486454964, "advantage_std": 0.6997440755367279, "completion_length": 2598.0000762939453, "epoch": 0.4022857142857143, "grad_norm": 0.720954418182373, "kl": 0.5006103515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.195807108082429e-07, "loss": 0.0359, "reward": 0.33301494736224413, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33301494736224413, "reward_after_std": 0.6997440531849861, "reward_before_mean": 0.7135494388639927, "reward_before_std": 0.6609387919306755, "reward_change_max": 0.0, "reward_change_mean": -0.38053448498249054, "reward_change_min": -0.6261219903826714, "reward_change_std": 0.23826817143708467, "reward_std": 0.6997440606355667, "rewards/cosine_scaled_reward": -0.007808626629412174, "rewards/format_reward": 0.7291666697710752, "step": 352 }, { "advantage_max": 1.3336387276649475, "advantage_mean": 1.11758712839638e-08, "advantage_min": -0.8034698478877544, "advantage_std": 0.7565931305289268, "completion_length": 2164.6875610351562, "epoch": 0.4034285714285714, "grad_norm": 0.525687575340271, "kl": 0.34417724609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.168878457820915e-07, "loss": 0.027, "reward": 0.5460577132180333, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5460577132180333, "reward_after_std": 0.756593145430088, "reward_before_mean": 0.9952808357775211, "reward_before_std": 0.7036684639751911, "reward_change_max": 8.256733417510986e-05, "reward_change_mean": -0.4492231123149395, "reward_change_min": -0.7174819149076939, "reward_change_std": 0.2743480708450079, "reward_std": 0.7565931528806686, "rewards/cosine_scaled_reward": 0.028890418354421854, "rewards/format_reward": 0.9375000074505806, "step": 353 }, { "advantage_max": 1.1443421319127083, "advantage_mean": -2.3903946488879058e-08, "advantage_min": -0.8811666816473007, "advantage_std": 0.6901189722120762, "completion_length": 1969.1042098999023, "epoch": 0.4045714285714286, "grad_norm": 0.5808809399604797, "kl": 0.221710205078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.142063423134644e-07, "loss": 0.0014, "reward": 0.5879236805485561, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5879236805485561, "reward_after_std": 0.6901189647614956, "reward_before_mean": 1.0658465139567852, "reward_before_std": 0.6480529569089413, "reward_change_max": 0.0016156435012817383, "reward_change_mean": -0.47792287170886993, "reward_change_min": -0.7460371851921082, "reward_change_std": 0.2975119221955538, "reward_std": 0.6901189796626568, "rewards/cosine_scaled_reward": 0.09542325511574745, "rewards/format_reward": 0.8750000149011612, "step": 354 }, { "advantage_max": 1.299523502588272, "advantage_mean": -2.048909669705168e-08, "advantage_min": -0.6467021554708481, "advantage_std": 0.723317988216877, "completion_length": 2410.6458740234375, "epoch": 0.4057142857142857, "grad_norm": 0.9375362992286682, "kl": 0.46929931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.115363310950578e-07, "loss": 0.0214, "reward": 0.2581765688955784, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2581765688955784, "reward_after_std": 0.7233179956674576, "reward_before_mean": 0.6042394880205393, "reward_before_std": 0.6638563796877861, "reward_change_max": 0.0006398856639862061, "reward_change_mean": -0.34606292354874313, "reward_change_min": -0.5550568662583828, "reward_change_std": 0.22587225958704948, "reward_std": 0.7233180180191994, "rewards/cosine_scaled_reward": -0.07288026809692383, "rewards/format_reward": 0.750000013038516, "step": 355 }, { "advantage_max": 1.4495657980442047, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.7594706527888775, "advantage_std": 0.7941495589911938, "completion_length": 2560.479248046875, "epoch": 0.40685714285714286, "grad_norm": 1.1204156875610352, "kl": 0.340087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0887794225945143e-07, "loss": 0.0194, "reward": 0.4108897661790252, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4108897661790252, "reward_after_std": 0.7941495440900326, "reward_before_mean": 0.8035768382251263, "reward_before_std": 0.7295331358909607, "reward_change_max": 0.0, "reward_change_mean": -0.3926870562136173, "reward_change_min": -0.645715419203043, "reward_change_std": 0.23967733141034842, "reward_std": 0.7941495478153229, "rewards/cosine_scaled_reward": -0.0044615985825657845, "rewards/format_reward": 0.812500013038516, "step": 356 }, { "advantage_max": 1.1339119970798492, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.8648699447512627, "advantage_std": 0.7139856666326523, "completion_length": 2902.604248046875, "epoch": 0.408, "grad_norm": 0.5644457340240479, "kl": 0.3636474609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.062313053727671e-07, "loss": 0.0355, "reward": 0.18295936728827655, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18295936728827655, "reward_after_std": 0.7139856740832329, "reward_before_mean": 0.5178206749260426, "reward_before_std": 0.7472909539937973, "reward_change_max": 0.00011270493268966675, "reward_change_mean": -0.3348613269627094, "reward_change_min": -0.6464793048799038, "reward_change_std": 0.25172287225723267, "reward_std": 0.7139856964349747, "rewards/cosine_scaled_reward": -0.1577563351020217, "rewards/format_reward": 0.8333333656191826, "step": 357 }, { "advantage_max": 1.388573870062828, "advantage_mean": -2.033387658251229e-08, "advantage_min": -0.8736642077565193, "advantage_std": 0.7979660034179688, "completion_length": 2549.666748046875, "epoch": 0.40914285714285714, "grad_norm": 1.1542965173721313, "kl": 0.3417816162109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0359654942835247e-07, "loss": -0.0082, "reward": 0.5488296616822481, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5488296616822481, "reward_after_std": 0.7979660108685493, "reward_before_mean": 0.9944317154586315, "reward_before_std": 0.7499481812119484, "reward_change_max": 0.0, "reward_change_mean": -0.4456020426005125, "reward_change_min": -0.7461185529828072, "reward_change_std": 0.2852071709930897, "reward_std": 0.7979660257697105, "rewards/cosine_scaled_reward": 0.07013251329772174, "rewards/format_reward": 0.854166679084301, "step": 358 }, { "advantage_max": 1.25496444106102, "advantage_mean": -1.9247333726823967e-08, "advantage_min": -0.6898075491189957, "advantage_std": 0.7037457078695297, "completion_length": 2656.104248046875, "epoch": 0.4102857142857143, "grad_norm": 0.7680111527442932, "kl": 0.3133544921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0097380284049523e-07, "loss": 0.0469, "reward": 0.3507824167609215, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3507824167609215, "reward_after_std": 0.70374571159482, "reward_before_mean": 0.7362727681174874, "reward_before_std": 0.6479121707379818, "reward_change_max": 0.00012759119272232056, "reward_change_mean": -0.3854903504252434, "reward_change_min": -0.6395407691597939, "reward_change_std": 0.23867321945726871, "reward_std": 0.7037457376718521, "rewards/cosine_scaled_reward": -0.0901969678234309, "rewards/format_reward": 0.9166666865348816, "step": 359 }, { "advantage_max": 1.4321295246481895, "advantage_mean": -2.235174156872688e-08, "advantage_min": -0.8640732020139694, "advantage_std": 0.8487196229398251, "completion_length": 2889.8959350585938, "epoch": 0.4114285714285714, "grad_norm": 0.6298926472663879, "kl": 0.30224609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9836319343816397e-07, "loss": 0.0255, "reward": 0.5272009279578924, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5272009279578924, "reward_after_std": 0.848719634115696, "reward_before_mean": 0.964709036052227, "reward_before_std": 0.8328141756355762, "reward_change_max": 0.0, "reward_change_mean": -0.43750810623168945, "reward_change_min": -0.7766466028988361, "reward_change_std": 0.29725635796785355, "reward_std": 0.8487196452915668, "rewards/cosine_scaled_reward": 0.03443782590329647, "rewards/format_reward": 0.8958333507180214, "step": 360 }, { "advantage_max": 1.3873552680015564, "advantage_mean": -4.594524749546025e-08, "advantage_min": -1.0418336614966393, "advantage_std": 0.8450100310146809, "completion_length": 2874.3959350585938, "epoch": 0.4125714285714286, "grad_norm": 0.6404920816421509, "kl": 0.262939453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9576484845877793e-07, "loss": 0.0174, "reward": 0.457249105675146, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.457249105675146, "reward_after_std": 0.8450100161135197, "reward_before_mean": 0.8721631693188101, "reward_before_std": 0.8606941141188145, "reward_change_max": 0.0, "reward_change_mean": -0.4149140939116478, "reward_change_min": -0.7242123819887638, "reward_change_std": 0.29377906769514084, "reward_std": 0.8450100235641003, "rewards/cosine_scaled_reward": 0.02983156335540116, "rewards/format_reward": 0.8125000149011612, "step": 361 }, { "advantage_max": 0.968044251203537, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.5936462879180908, "advantage_std": 0.5590002462267876, "completion_length": 1745.2708435058594, "epoch": 0.4137142857142857, "grad_norm": 0.1821906417608261, "kl": 0.17156982421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.931788945420058e-07, "loss": 0.0169, "reward": 0.6431191973388195, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6431191973388195, "reward_after_std": 0.5590002536773682, "reward_before_mean": 1.1544144650688395, "reward_before_std": 0.46106959506869316, "reward_change_max": 0.0, "reward_change_mean": -0.511295210570097, "reward_change_min": -0.7569701746106148, "reward_change_std": 0.28887542709708214, "reward_std": 0.5590002611279488, "rewards/cosine_scaled_reward": 0.11887386068701744, "rewards/format_reward": 0.9166666716337204, "step": 362 }, { "advantage_max": 0.97291599214077, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.6005025487393141, "advantage_std": 0.5606646221131086, "completion_length": 1856.6875457763672, "epoch": 0.41485714285714287, "grad_norm": 0.6652201414108276, "kl": 0.1973419189453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9060545772359305e-07, "loss": 0.031, "reward": 0.18876660615205765, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18876660615205765, "reward_after_std": 0.5606646202504635, "reward_before_mean": 0.5343197894544574, "reward_before_std": 0.5127148274332285, "reward_change_max": 0.0, "reward_change_mean": -0.3455531857907772, "reward_change_min": -0.5572856552898884, "reward_change_std": 0.2125458586961031, "reward_std": 0.5606646370142698, "rewards/cosine_scaled_reward": -0.04534011334180832, "rewards/format_reward": 0.625000013038516, "step": 363 }, { "advantage_max": 1.0218002647161484, "advantage_mean": -2.173086099954702e-09, "advantage_min": -0.6517962850630283, "advantage_std": 0.6047668792307377, "completion_length": 2830.687545776367, "epoch": 0.416, "grad_norm": 0.4466303884983063, "kl": 0.231109619140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8804466342921987e-07, "loss": 0.0254, "reward": -0.07767466246150434, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07767466246150434, "reward_after_std": 0.6047668866813183, "reward_before_mean": 0.1724892733618617, "reward_before_std": 0.6115720048546791, "reward_change_max": 0.0, "reward_change_mean": -0.25016394164413214, "reward_change_min": -0.48623254150152206, "reward_change_std": 0.1896633468568325, "reward_std": 0.6047668941318989, "rewards/cosine_scaled_reward": -0.23667203076183796, "rewards/format_reward": 0.6458333544433117, "step": 364 }, { "advantage_max": 1.1228408366441727, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.9194622822105885, "advantage_std": 0.6852251142263412, "completion_length": 2639.562545776367, "epoch": 0.41714285714285715, "grad_norm": 0.3083858788013458, "kl": 0.18316650390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.854966364683872e-07, "loss": 0.0081, "reward": 0.39128240814898163, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.39128240814898163, "reward_after_std": 0.6852251067757607, "reward_before_mean": 0.7988421376794577, "reward_before_std": 0.6811210848391056, "reward_change_max": 0.0, "reward_change_mean": -0.40755973756313324, "reward_change_min": -0.6659704782068729, "reward_change_std": 0.2675940692424774, "reward_std": 0.6852251254022121, "rewards/cosine_scaled_reward": -0.027662288397550583, "rewards/format_reward": 0.8541666865348816, "step": 365 }, { "advantage_max": 1.1428400948643684, "advantage_mean": -2.4524828834415757e-08, "advantage_min": -0.571011945605278, "advantage_std": 0.6341691315174103, "completion_length": 1841.208408355713, "epoch": 0.41828571428571426, "grad_norm": 1.0585683584213257, "kl": 0.09136962890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.829615010283344e-07, "loss": 0.0348, "reward": 0.40346649289131165, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.40346649289131165, "reward_after_std": 0.6341691240668297, "reward_before_mean": 0.8120505083352327, "reward_before_std": 0.5371465170755982, "reward_change_max": 0.0002114623785018921, "reward_change_mean": -0.40858402848243713, "reward_change_min": -0.6271373182535172, "reward_change_std": 0.2456482443958521, "reward_std": 0.6341691352427006, "rewards/cosine_scaled_reward": 0.010191927663981915, "rewards/format_reward": 0.7916666772216558, "step": 366 }, { "advantage_max": 1.2152687087655067, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.7091002613306046, "advantage_std": 0.685837309807539, "completion_length": 3005.5209045410156, "epoch": 0.41942857142857143, "grad_norm": 0.7061448693275452, "kl": 0.249267578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8043938066798645e-07, "loss": 0.0427, "reward": 0.2152902279049158, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2152902279049158, "reward_after_std": 0.6858373135328293, "reward_before_mean": 0.5551963672041893, "reward_before_std": 0.652896411716938, "reward_change_max": 0.000535130500793457, "reward_change_mean": -0.3399061169475317, "reward_change_min": -0.5558051727712154, "reward_change_std": 0.21386231295764446, "reward_std": 0.6858373135328293, "rewards/cosine_scaled_reward": -0.03490182477980852, "rewards/format_reward": 0.625000013038516, "step": 367 }, { "advantage_max": 1.115385714918375, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.7492459192872047, "advantage_std": 0.6838843766599894, "completion_length": 3076.7083740234375, "epoch": 0.4205714285714286, "grad_norm": 0.4976518154144287, "kl": 0.3193359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7793039831193133e-07, "loss": 0.0366, "reward": 0.13641435164026916, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13641435164026916, "reward_after_std": 0.6838843803852797, "reward_before_mean": 0.456346322549507, "reward_before_std": 0.7006695438176394, "reward_change_max": 0.0004066452383995056, "reward_change_mean": -0.3199319802224636, "reward_change_min": -0.607593409717083, "reward_change_std": 0.2362601924687624, "reward_std": 0.6838844045996666, "rewards/cosine_scaled_reward": -0.0739101842045784, "rewards/format_reward": 0.6041666865348816, "step": 368 }, { "advantage_max": 1.613924890756607, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.7823650315403938, "advantage_std": 0.9184759818017483, "completion_length": 2996.0000610351562, "epoch": 0.4217142857142857, "grad_norm": 1.507102131843567, "kl": 0.25439453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7543467624442956e-07, "loss": 0.0562, "reward": 0.19157198071479797, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19157198071479797, "reward_after_std": 0.9184759818017483, "reward_before_mean": 0.495089840143919, "reward_before_std": 0.9159770458936691, "reward_change_max": 0.0006600469350814819, "reward_change_mean": -0.3035178631544113, "reward_change_min": -0.6244982462376356, "reward_change_std": 0.2401027213782072, "reward_std": 0.9184760265052319, "rewards/cosine_scaled_reward": -0.09620509948581457, "rewards/format_reward": 0.6875000111758709, "step": 369 }, { "advantage_max": 0.9570782333612442, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.45735423266887665, "advantage_std": 0.5198530592024326, "completion_length": 2871.125045776367, "epoch": 0.4228571428571429, "grad_norm": 5.859915733337402, "kl": 0.2988128662109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.729523361034538e-07, "loss": -0.0005, "reward": 0.15831564646214247, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15831564646214247, "reward_after_std": 0.5198530443012714, "reward_before_mean": 0.49796394538134336, "reward_before_std": 0.42710920982062817, "reward_change_max": 0.0, "reward_change_mean": -0.3396483026444912, "reward_change_min": -0.5230873040854931, "reward_change_std": 0.20043556857854128, "reward_std": 0.5198530666530132, "rewards/cosine_scaled_reward": -0.09476803429424763, "rewards/format_reward": 0.6875000093132257, "step": 370 }, { "advantage_max": 1.4651240780949593, "advantage_mean": -3.78737858297562e-08, "advantage_min": -0.8076458275318146, "advantage_std": 0.8199820630252361, "completion_length": 2042.9792175292969, "epoch": 0.424, "grad_norm": 0.8236666917800903, "kl": 0.2234954833984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7048349887476037e-07, "loss": 0.0448, "reward": 0.6258745496161282, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6258745496161282, "reward_after_std": 0.8199820779263973, "reward_before_mean": 1.0955132944509387, "reward_before_std": 0.7609365023672581, "reward_change_max": 0.00015053898096084595, "reward_change_mean": -0.46963876485824585, "reward_change_min": -0.7598228193819523, "reward_change_std": 0.28789537586271763, "reward_std": 0.8199820891022682, "rewards/cosine_scaled_reward": 0.15192330069839954, "rewards/format_reward": 0.7916666734963655, "step": 371 }, { "advantage_max": 1.5770609304308891, "advantage_mean": 1.862645371275562e-09, "advantage_min": -0.9838331863284111, "advantage_std": 0.9379903674125671, "completion_length": 3058.2709197998047, "epoch": 0.42514285714285716, "grad_norm": 0.6419177651405334, "kl": 0.270416259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6802828488599294e-07, "loss": 0.0274, "reward": 0.2897559218108654, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2897559218108654, "reward_after_std": 0.9379903674125671, "reward_before_mean": 0.6321213558912859, "reward_before_std": 0.976532481610775, "reward_change_max": 0.0, "reward_change_mean": -0.34236546233296394, "reward_change_min": -0.69129853323102, "reward_change_std": 0.2746661426499486, "reward_std": 0.9379903674125671, "rewards/cosine_scaled_reward": -0.01727266050875187, "rewards/format_reward": 0.6666666753590107, "step": 372 }, { "advantage_max": 1.2619002610445023, "advantage_mean": -6.519258216597379e-09, "advantage_min": -0.6246639899909496, "advantage_std": 0.7037790045142174, "completion_length": 1801.416732788086, "epoch": 0.42628571428571427, "grad_norm": 1.0489673614501953, "kl": 0.2279205322265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.655868138008171e-07, "loss": 0.0008, "reward": 0.25850481167435646, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.25850481167435646, "reward_after_std": 0.7037790045142174, "reward_before_mean": 0.6074935798533261, "reward_before_std": 0.651880044490099, "reward_change_max": 0.0, "reward_change_mean": -0.3489887546747923, "reward_change_min": -0.634777944535017, "reward_change_std": 0.22416657023131847, "reward_std": 0.7037790305912495, "rewards/cosine_scaled_reward": -0.1441698971320875, "rewards/format_reward": 0.8958333395421505, "step": 373 }, { "advantage_max": 1.2929741963744164, "advantage_mean": -7.761021825203329e-09, "advantage_min": -0.8992895781993866, "advantage_std": 0.7896677516400814, "completion_length": 2458.354232788086, "epoch": 0.42742857142857144, "grad_norm": 1.0882432460784912, "kl": 0.290618896484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.631592046130896e-07, "loss": 0.0673, "reward": 0.441376032307744, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.441376032307744, "reward_after_std": 0.7896677367389202, "reward_before_mean": 0.8569105174392462, "reward_before_std": 0.7956783920526505, "reward_change_max": 0.00048802047967910767, "reward_change_mean": -0.41553447395563126, "reward_change_min": -0.7457053437829018, "reward_change_std": 0.28711700066924095, "reward_std": 0.7896677367389202, "rewards/cosine_scaled_reward": 0.011788577772676945, "rewards/format_reward": 0.8333333432674408, "step": 374 }, { "advantage_max": 1.397294044494629, "advantage_mean": -1.4280280125067435e-08, "advantage_min": -0.7589370906352997, "advantage_std": 0.7676047421991825, "completion_length": 2798.5209045410156, "epoch": 0.42857142857142855, "grad_norm": 0.46163445711135864, "kl": 0.3203582763671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6074557564105724e-07, "loss": 0.0239, "reward": 0.3821914023719728, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3821914023719728, "reward_after_std": 0.7676047645509243, "reward_before_mean": 0.7702061347663403, "reward_before_std": 0.689431369304657, "reward_change_max": 0.0, "reward_change_mean": -0.38801475055515766, "reward_change_min": -0.643918763846159, "reward_change_std": 0.2488219765946269, "reward_std": 0.767604798078537, "rewards/cosine_scaled_reward": 0.01010306237731129, "rewards/format_reward": 0.750000013038516, "step": 375 }, { "advantage_max": 1.0296322032809258, "advantage_mean": 4.656613011855271e-09, "advantage_min": -0.5118473805487156, "advantage_std": 0.5575856603682041, "completion_length": 2436.625015258789, "epoch": 0.4297142857142857, "grad_norm": 1.7255357503890991, "kl": 0.3665771484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.583460445215911e-07, "loss": 0.005, "reward": 0.1960609758971259, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1960609758971259, "reward_after_std": 0.5575856678187847, "reward_before_mean": 0.5448548486456275, "reward_before_std": 0.47852717712521553, "reward_change_max": 0.0, "reward_change_mean": -0.3487938679754734, "reward_change_min": -0.5355156362056732, "reward_change_std": 0.19799629971385002, "reward_std": 0.5575856789946556, "rewards/cosine_scaled_reward": -0.11298925429582596, "rewards/format_reward": 0.7708333376795053, "step": 376 }, { "advantage_max": 1.1096007898449898, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.5029036141932011, "advantage_std": 0.5966975335031748, "completion_length": 3197.8958740234375, "epoch": 0.4308571428571429, "grad_norm": 0.8108214139938354, "kl": 0.3704833984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.5596072820445254e-07, "loss": 0.0234, "reward": 0.28803440602496266, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28803440602496266, "reward_after_std": 0.5966975409537554, "reward_before_mean": 0.6619824189692736, "reward_before_std": 0.5019467137753963, "reward_change_max": 0.0002845451235771179, "reward_change_mean": -0.37394802644848824, "reward_change_min": -0.5833319462835789, "reward_change_std": 0.2154426146298647, "reward_std": 0.5966975726187229, "rewards/cosine_scaled_reward": -0.06484212819486856, "rewards/format_reward": 0.7916666716337204, "step": 377 }, { "advantage_max": 1.4894996359944344, "advantage_mean": -3.0733645128844245e-08, "advantage_min": -0.9004512503743172, "advantage_std": 0.8546633832156658, "completion_length": 2478.37504196167, "epoch": 0.432, "grad_norm": 0.7999647259712219, "kl": 0.309326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.5358974294659373e-07, "loss": 0.0116, "reward": 0.4863022118806839, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4863022118806839, "reward_after_std": 0.8546633496880531, "reward_before_mean": 0.9011543802917004, "reward_before_std": 0.8215220980346203, "reward_change_max": 0.0, "reward_change_mean": -0.41485217586159706, "reward_change_min": -0.7082854770123959, "reward_change_std": 0.27995297499001026, "reward_std": 0.8546633832156658, "rewards/cosine_scaled_reward": 0.06516051013022661, "rewards/format_reward": 0.7708333563059568, "step": 378 }, { "advantage_max": 1.4217093512415886, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.9056989178061485, "advantage_std": 0.8449318259954453, "completion_length": 3112.5000610351562, "epoch": 0.43314285714285716, "grad_norm": 0.805682361125946, "kl": 0.54296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.512332043064913e-07, "loss": 0.0326, "reward": 0.18402667669579387, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18402667669579387, "reward_after_std": 0.8449318036437035, "reward_before_mean": 0.49959153961390257, "reward_before_std": 0.8714813143014908, "reward_change_max": 0.0, "reward_change_mean": -0.3155648708343506, "reward_change_min": -0.7030841335654259, "reward_change_std": 0.25640485249459743, "reward_std": 0.8449318334460258, "rewards/cosine_scaled_reward": -0.09395423531532288, "rewards/format_reward": 0.6875000186264515, "step": 379 }, { "advantage_max": 1.1825830340385437, "advantage_mean": -4.9670543234014986e-09, "advantage_min": -0.6122145131230354, "advantage_std": 0.6642481423914433, "completion_length": 2171.0208587646484, "epoch": 0.4342857142857143, "grad_norm": 0.3546510934829712, "kl": 0.1728668212890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.488912271385139e-07, "loss": 0.0106, "reward": 0.3683098815381527, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3683098815381527, "reward_after_std": 0.6642481498420238, "reward_before_mean": 0.7669425271451473, "reward_before_std": 0.5984037779271603, "reward_change_max": 0.00131244957447052, "reward_change_mean": -0.3986326586455107, "reward_change_min": -0.6843216829001904, "reward_change_std": 0.25640712305903435, "reward_std": 0.6642481610178947, "rewards/cosine_scaled_reward": -0.022778733167797327, "rewards/format_reward": 0.8125000037252903, "step": 380 }, { "advantage_max": 1.3350362554192543, "advantage_mean": -1.179675274132208e-08, "advantage_min": -0.7753116935491562, "advantage_std": 0.7826059982180595, "completion_length": 2961.8750610351562, "epoch": 0.43542857142857144, "grad_norm": 0.44929471611976624, "kl": 0.3763427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.465639255873246e-07, "loss": 0.0373, "reward": 0.16396947141038254, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16396947141038254, "reward_after_std": 0.7826059982180595, "reward_before_mean": 0.47848290018737316, "reward_before_std": 0.7885253168642521, "reward_change_max": 0.0, "reward_change_mean": -0.3145134476944804, "reward_change_min": -0.6155954711139202, "reward_change_std": 0.24165144469588995, "reward_std": 0.7826060205698013, "rewards/cosine_scaled_reward": -0.09409188944846392, "rewards/format_reward": 0.6666666865348816, "step": 381 }, { "advantage_max": 0.8830334767699242, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.5198586732149124, "advantage_std": 0.5014585591852665, "completion_length": 2477.7500610351562, "epoch": 0.43657142857142855, "grad_norm": 0.9291367530822754, "kl": 0.3695831298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4425141308231765e-07, "loss": 0.0188, "reward": -0.028684318996965885, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.028684318996965885, "reward_after_std": 0.5014585554599762, "reward_before_mean": 0.24919481086544693, "reward_before_std": 0.4676024615764618, "reward_change_max": 0.0007557347416877747, "reward_change_mean": -0.27787913754582405, "reward_change_min": -0.47073232382535934, "reward_change_std": 0.17706115450710058, "reward_std": 0.5014585703611374, "rewards/cosine_scaled_reward": -0.23998594097793102, "rewards/format_reward": 0.7291666716337204, "step": 382 }, { "advantage_max": 1.5601003393530846, "advantage_mean": -2.1109979209121832e-08, "advantage_min": -0.9728534445166588, "advantage_std": 0.9514635317027569, "completion_length": 3067.5834045410156, "epoch": 0.4377142857142857, "grad_norm": 0.6326491832733154, "kl": 0.30438232421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4195380233209006e-07, "loss": 0.0372, "reward": 0.42157111782580614, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.42157111782580614, "reward_after_std": 0.9514635168015957, "reward_before_mean": 0.8118895404040813, "reward_before_std": 0.980067502707243, "reward_change_max": 0.0, "reward_change_mean": -0.3903184309601784, "reward_change_min": -0.7474622651934624, "reward_change_std": 0.29729065112769604, "reward_std": 0.951463520526886, "rewards/cosine_scaled_reward": 0.030944768339395523, "rewards/format_reward": 0.7500000149011612, "step": 383 }, { "advantage_max": 1.471701368689537, "advantage_mean": -2.7939678515220834e-08, "advantage_min": -0.8781173340976238, "advantage_std": 0.8436664417386055, "completion_length": 1930.3750305175781, "epoch": 0.43885714285714283, "grad_norm": 0.5232311487197876, "kl": 0.1711578369140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3967120531894857e-07, "loss": 0.0307, "reward": 0.804866848513484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.804866848513484, "reward_after_std": 0.8436664193868637, "reward_before_mean": 1.334816126152873, "reward_before_std": 0.76854282990098, "reward_change_max": 0.0, "reward_change_mean": -0.529949240386486, "reward_change_min": -0.8661737963557243, "reward_change_std": 0.3276502024382353, "reward_std": 0.8436664417386055, "rewards/cosine_scaled_reward": 0.28199137654155493, "rewards/format_reward": 0.7708333507180214, "step": 384 }, { "advantage_max": 1.4069309905171394, "advantage_mean": 0.0, "advantage_min": -0.9303930997848511, "advantage_std": 0.855097584426403, "completion_length": 2643.875045776367, "epoch": 0.44, "grad_norm": 1.061144471168518, "kl": 0.23193359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.374037332934512e-07, "loss": 0.0402, "reward": 0.3949592959834263, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3949592959834263, "reward_after_std": 0.8550975695252419, "reward_before_mean": 0.78522406402044, "reward_before_std": 0.868790153414011, "reward_change_max": 0.0, "reward_change_mean": -0.39026473090052605, "reward_change_min": -0.7018118984997272, "reward_change_std": 0.28549752198159695, "reward_std": 0.8550975993275642, "rewards/cosine_scaled_reward": -0.0032213227823376656, "rewards/format_reward": 0.7916666865348816, "step": 385 }, { "advantage_max": 1.5323580503463745, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.6946651190519333, "advantage_std": 0.813526090234518, "completion_length": 3014.1458892822266, "epoch": 0.44114285714285717, "grad_norm": 0.5251702666282654, "kl": 0.3226318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3515149676898552e-07, "loss": 0.035, "reward": 0.5693798456341028, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5693798456341028, "reward_after_std": 0.8135260716080666, "reward_before_mean": 1.0148430932313204, "reward_before_std": 0.6952022239565849, "reward_change_max": 0.0, "reward_change_mean": -0.44546326249837875, "reward_change_min": -0.6919324658811092, "reward_change_std": 0.2599577587097883, "reward_std": 0.8135261088609695, "rewards/cosine_scaled_reward": 0.09075488056987524, "rewards/format_reward": 0.833333358168602, "step": 386 }, { "advantage_max": 1.114190198481083, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.7234060317277908, "advantage_std": 0.6508563421666622, "completion_length": 2963.3334045410156, "epoch": 0.4422857142857143, "grad_norm": 0.4398019015789032, "kl": 0.31781005859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3291460551638237e-07, "loss": 0.0356, "reward": 0.19776187278330326, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19776187278330326, "reward_after_std": 0.6508563347160816, "reward_before_mean": 0.5393041241914034, "reward_before_std": 0.6317312866449356, "reward_change_max": 0.0, "reward_change_mean": -0.34154226537793875, "reward_change_min": -0.5750756375491619, "reward_change_std": 0.22550523653626442, "reward_std": 0.6508563458919525, "rewards/cosine_scaled_reward": -0.09493127651512623, "rewards/format_reward": 0.7291666716337204, "step": 387 }, { "advantage_max": 1.4288556650280952, "advantage_mean": -2.6697915267437367e-08, "advantage_min": -0.7844956144690514, "advantage_std": 0.7918211258947849, "completion_length": 2707.7709045410156, "epoch": 0.44342857142857145, "grad_norm": 0.48590895533561707, "kl": 0.3404541015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.306931685585657e-07, "loss": 0.0593, "reward": 0.569500157609582, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.569500157609582, "reward_after_std": 0.7918211333453655, "reward_before_mean": 1.0220210487022996, "reward_before_std": 0.7196073681116104, "reward_change_max": 0.0, "reward_change_mean": -0.4525208752602339, "reward_change_min": -0.7153974436223507, "reward_change_std": 0.2721158731728792, "reward_std": 0.7918211333453655, "rewards/cosine_scaled_reward": 0.12559382850304246, "rewards/format_reward": 0.7708333358168602, "step": 388 }, { "advantage_max": 1.5086243450641632, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.8356839530169964, "advantage_std": 0.8504573740065098, "completion_length": 2847.8125915527344, "epoch": 0.44457142857142856, "grad_norm": 0.5012574791908264, "kl": 0.409698486328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2848729416523859e-07, "loss": 0.0629, "reward": 0.3996206484735012, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3996206484735012, "reward_after_std": 0.8504573740065098, "reward_before_mean": 0.7850042954087257, "reward_before_std": 0.8103694636374712, "reward_change_max": 0.0005656033754348755, "reward_change_mean": -0.38538364320993423, "reward_change_min": -0.6802755519747734, "reward_change_std": 0.26358477119356394, "reward_std": 0.8504573963582516, "rewards/cosine_scaled_reward": 0.007085463672410697, "rewards/format_reward": 0.770833345130086, "step": 389 }, { "advantage_max": 1.4127248227596283, "advantage_mean": -2.235174201281609e-08, "advantage_min": -0.7147350683808327, "advantage_std": 0.7942103296518326, "completion_length": 2742.000068664551, "epoch": 0.44571428571428573, "grad_norm": 0.8033288717269897, "kl": 0.3431396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2629708984760706e-07, "loss": 0.0074, "reward": 0.2416452246834524, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2416452246834524, "reward_after_std": 0.7942103520035744, "reward_before_mean": 0.5779923680238426, "reward_before_std": 0.7717950865626335, "reward_change_max": 0.0, "reward_change_mean": -0.33634717389941216, "reward_change_min": -0.6063726805150509, "reward_change_std": 0.2278979104012251, "reward_std": 0.794210359454155, "rewards/cosine_scaled_reward": -0.06517048925161362, "rewards/format_reward": 0.7083333395421505, "step": 390 }, { "advantage_max": 1.413565844297409, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.6925171725451946, "advantage_std": 0.7850869931280613, "completion_length": 2662.6042251586914, "epoch": 0.44685714285714284, "grad_norm": 0.9450730681419373, "kl": 0.368194580078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2412266235313973e-07, "loss": -0.005, "reward": 0.48758782213553786, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48758782213553786, "reward_after_std": 0.7850869931280613, "reward_before_mean": 0.9113508481532335, "reward_before_std": 0.7137027382850647, "reward_change_max": 0.0, "reward_change_mean": -0.42376305535435677, "reward_change_min": -0.7457866631448269, "reward_change_std": 0.2688668370246887, "reward_std": 0.7850870080292225, "rewards/cosine_scaled_reward": 0.05984208732843399, "rewards/format_reward": 0.7916666828095913, "step": 391 }, { "advantage_max": 1.374567873775959, "advantage_mean": -3.72529057601767e-09, "advantage_min": -0.8890761323273182, "advantage_std": 0.8275346420705318, "completion_length": 2413.895881652832, "epoch": 0.448, "grad_norm": 1.1826577186584473, "kl": 0.32952880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2196411766036487e-07, "loss": 0.0639, "reward": 0.22747798508498818, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.22747798508498818, "reward_after_std": 0.827534630894661, "reward_before_mean": 0.5611926540732384, "reward_before_std": 0.8485117331147194, "reward_change_max": 0.0010716915130615234, "reward_change_mean": -0.3337146546691656, "reward_change_min": -0.670333344489336, "reward_change_std": 0.26952179335057735, "reward_std": 0.8275346383452415, "rewards/cosine_scaled_reward": -0.115237015648745, "rewards/format_reward": 0.791666679084301, "step": 392 }, { "advantage_max": 1.9456132277846336, "advantage_mean": -8.07146260939362e-09, "advantage_min": -1.0318707302212715, "advantage_std": 1.0898070186376572, "completion_length": 2754.8750915527344, "epoch": 0.4491428571428571, "grad_norm": 0.5890648365020752, "kl": 0.322509765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1982156097370557e-07, "loss": 0.0249, "reward": 0.5918875364586711, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5918875364586711, "reward_after_std": 1.089807003736496, "reward_before_mean": 1.0142197906970978, "reward_before_std": 1.0668868571519852, "reward_change_max": 0.0, "reward_change_mean": -0.42233227752149105, "reward_change_min": -0.8279923424124718, "reward_change_std": 0.2987551037222147, "reward_std": 1.0898070484399796, "rewards/cosine_scaled_reward": 0.09044323640409857, "rewards/format_reward": 0.8333333488553762, "step": 393 }, { "advantage_max": 1.3842891454696655, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.7427752874791622, "advantage_std": 0.7568569779396057, "completion_length": 3218.3959045410156, "epoch": 0.4502857142857143, "grad_norm": 1.3154881000518799, "kl": 0.46435546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1769509671835223e-07, "loss": 0.0223, "reward": 0.07046335702762008, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07046335702762008, "reward_after_std": 0.7568569853901863, "reward_before_mean": 0.3476231265813112, "reward_before_std": 0.7233721874654293, "reward_change_max": 0.00036607682704925537, "reward_change_mean": -0.2771597858518362, "reward_change_min": -0.5230727456510067, "reward_change_std": 0.19481442868709564, "reward_std": 0.7568569853901863, "rewards/cosine_scaled_reward": -0.16993844415992498, "rewards/format_reward": 0.6875000074505806, "step": 394 }, { "advantage_max": 1.5892572775483131, "advantage_mean": -2.4214388327781222e-08, "advantage_min": -0.9472315311431885, "advantage_std": 0.9160324931144714, "completion_length": 2425.645896911621, "epoch": 0.4514285714285714, "grad_norm": 0.4146610200405121, "kl": 0.232391357421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1558482853517253e-07, "loss": 0.023, "reward": 0.6786849615164101, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6786849615164101, "reward_after_std": 0.9160324931144714, "reward_before_mean": 1.1544897370040417, "reward_before_std": 0.8708195760846138, "reward_change_max": 3.794580698013306e-05, "reward_change_mean": -0.47580479457974434, "reward_change_min": -0.8236850872635841, "reward_change_std": 0.31363353319466114, "reward_std": 0.9160325489938259, "rewards/cosine_scaled_reward": 0.1501615282613784, "rewards/format_reward": 0.8541666865348816, "step": 395 }, { "advantage_max": 1.145592711865902, "advantage_mean": -4.2219958085176756e-08, "advantage_min": -0.6616269759833813, "advantage_std": 0.64500567689538, "completion_length": 2860.7083740234375, "epoch": 0.45257142857142857, "grad_norm": 0.9409205317497253, "kl": 0.27783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.134908592756607e-07, "loss": -0.0031, "reward": 0.48085956354043446, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.48085956354043446, "reward_after_std": 0.6450056806206703, "reward_before_mean": 0.9201758950948715, "reward_before_std": 0.5669548735022545, "reward_change_max": 0.00025184452533721924, "reward_change_mean": -0.4393163565546274, "reward_change_min": -0.6854121945798397, "reward_change_std": 0.2582757193595171, "reward_std": 0.6450057104229927, "rewards/cosine_scaled_reward": 0.012171266600489616, "rewards/format_reward": 0.8958333432674408, "step": 396 }, { "advantage_max": 1.4064282700419426, "advantage_mean": 1.5522043095295146e-09, "advantage_min": -0.8187252506613731, "advantage_std": 0.7925236187875271, "completion_length": 2702.3333892822266, "epoch": 0.45371428571428574, "grad_norm": 0.35432639718055725, "kl": 0.3092041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1141329099692406e-07, "loss": 0.0292, "reward": 0.27529994398355484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27529994398355484, "reward_after_std": 0.7925235889852047, "reward_before_mean": 0.6220556901535019, "reward_before_std": 0.7582901753485203, "reward_change_max": 0.0, "reward_change_mean": -0.3467557244002819, "reward_change_min": -0.5839594602584839, "reward_change_std": 0.23202148266136646, "reward_std": 0.7925236150622368, "rewards/cosine_scaled_reward": -0.10563884908333421, "rewards/format_reward": 0.8333333432674408, "step": 397 }, { "advantage_max": 1.1642621606588364, "advantage_mean": 5.5879355587151736e-09, "advantage_min": -0.6164032593369484, "advantage_std": 0.6434980258345604, "completion_length": 2782.166717529297, "epoch": 0.45485714285714285, "grad_norm": 0.5750638842582703, "kl": 0.3607177734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0935222495670968e-07, "loss": 0.0161, "reward": 0.20992313139140606, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20992313139140606, "reward_after_std": 0.6434980146586895, "reward_before_mean": 0.5533581459894776, "reward_before_std": 0.5759002082049847, "reward_change_max": 0.0002307295799255371, "reward_change_mean": -0.343435013666749, "reward_change_min": -0.5538477338850498, "reward_change_std": 0.21595249138772488, "reward_std": 0.6434980481863022, "rewards/cosine_scaled_reward": -0.10873759724199772, "rewards/format_reward": 0.7708333488553762, "step": 398 }, { "advantage_max": 1.4001071378588676, "advantage_mean": -2.2972624635908545e-08, "advantage_min": -0.8582115992903709, "advantage_std": 0.8050716407597065, "completion_length": 2384.500045776367, "epoch": 0.456, "grad_norm": 0.9607195258140564, "kl": 0.248504638671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0730776160846853e-07, "loss": 0.0479, "reward": 0.5995660796470474, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5995660796470474, "reward_after_std": 0.8050716295838356, "reward_before_mean": 1.0644009187817574, "reward_before_std": 0.7492502890527248, "reward_change_max": 0.0006214603781700134, "reward_change_mean": -0.46483487263321877, "reward_change_min": -0.728652473539114, "reward_change_std": 0.291376868262887, "reward_std": 0.8050716482102871, "rewards/cosine_scaled_reward": 0.07386713265441358, "rewards/format_reward": 0.9166666865348816, "step": 399 }, { "advantage_max": 1.5142693221569061, "advantage_mean": -2.7318796558262193e-08, "advantage_min": -0.7139600031077862, "advantage_std": 0.8169923797249794, "completion_length": 1973.4167175292969, "epoch": 0.45714285714285713, "grad_norm": 0.6028825044631958, "kl": 0.140106201171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0528000059645995e-07, "loss": -0.0066, "reward": 0.8798388665309176, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8798388665309176, "reward_after_std": 0.8169923909008503, "reward_before_mean": 1.4358258470892906, "reward_before_std": 0.6879842728376389, "reward_change_max": 0.0, "reward_change_mean": -0.5559869892895222, "reward_change_min": -0.8480464890599251, "reward_change_std": 0.3101024003699422, "reward_std": 0.8169924318790436, "rewards/cosine_scaled_reward": 0.26999625749886036, "rewards/format_reward": 0.8958333395421505, "step": 400 }, { "advantage_max": 1.2209898084402084, "advantage_mean": -1.055498977109437e-08, "advantage_min": -0.622258760035038, "advantage_std": 0.6894869990646839, "completion_length": 2769.64591217041, "epoch": 0.4582857142857143, "grad_norm": 0.736003041267395, "kl": 0.2721405029296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.032690407508949e-07, "loss": 0.0017, "reward": 0.48054402810521424, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.48054402810521424, "reward_after_std": 0.6894869841635227, "reward_before_mean": 0.9125964008271694, "reward_before_std": 0.6117764301598072, "reward_change_max": 0.0, "reward_change_mean": -0.43205239437520504, "reward_change_min": -0.7335581555962563, "reward_change_std": 0.2669758554548025, "reward_std": 0.6894870065152645, "rewards/cosine_scaled_reward": 0.10213153855875134, "rewards/format_reward": 0.7083333432674408, "step": 401 }, { "advantage_max": 1.2742392867803574, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.7246151715517044, "advantage_std": 0.744117021560669, "completion_length": 2256.979217529297, "epoch": 0.4594285714285714, "grad_norm": 1.0698230266571045, "kl": 0.2095489501953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0127498008311922e-07, "loss": 0.0454, "reward": 0.16344716679304838, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16344716679304838, "reward_after_std": 0.7441170252859592, "reward_before_mean": 0.4758140491321683, "reward_before_std": 0.7317548841238022, "reward_change_max": 0.00041125714778900146, "reward_change_mean": -0.31236686930060387, "reward_change_min": -0.6093603521585464, "reward_change_std": 0.24216584488749504, "reward_std": 0.7441170550882816, "rewards/cosine_scaled_reward": -0.08500966196879745, "rewards/format_reward": 0.6458333525806665, "step": 402 }, { "advantage_max": 1.4515544027090073, "advantage_mean": 9.934108091691485e-09, "advantage_min": -0.7735986150801182, "advantage_std": 0.8350824005901814, "completion_length": 2176.916736602783, "epoch": 0.4605714285714286, "grad_norm": 0.7707470655441284, "kl": 0.15277099609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9929791578083655e-07, "loss": 0.0315, "reward": 0.4309818516485393, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4309818516485393, "reward_after_std": 0.835082408040762, "reward_before_mean": 0.8304437976330519, "reward_before_std": 0.8025292456150055, "reward_change_max": 0.0, "reward_change_mean": -0.3994619268923998, "reward_change_min": -0.7102017849683762, "reward_change_std": 0.27272626385092735, "reward_std": 0.8350824564695358, "rewards/cosine_scaled_reward": 0.019388556480407715, "rewards/format_reward": 0.7916666753590107, "step": 403 }, { "advantage_max": 0.9113276973366737, "advantage_mean": -1.241763458725842e-08, "advantage_min": -0.5583123937249184, "advantage_std": 0.5214662775397301, "completion_length": 2384.7500534057617, "epoch": 0.4617142857142857, "grad_norm": 0.2345946878194809, "kl": 0.2875099182128906, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9733794420337213e-07, "loss": 0.0314, "reward": 0.3233966355910525, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3233966355910525, "reward_after_std": 0.521466288715601, "reward_before_mean": 0.7204841803759336, "reward_before_std": 0.4442944601178169, "reward_change_max": 0.00019099563360214233, "reward_change_mean": -0.3970875274389982, "reward_change_min": -0.6074875667691231, "reward_change_std": 0.23336594179272652, "reward_std": 0.5214662961661816, "rewards/cosine_scaled_reward": -0.025174589827656746, "rewards/format_reward": 0.7708333544433117, "step": 404 }, { "advantage_max": 1.4360693022608757, "advantage_mean": -2.7318797002351403e-08, "advantage_min": -0.9703242368996143, "advantage_std": 0.8638960532844067, "completion_length": 2237.750030517578, "epoch": 0.46285714285714286, "grad_norm": 0.9567005038261414, "kl": 0.2099609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9539516087697517e-07, "loss": 0.0385, "reward": 0.5817769723944366, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5817769723944366, "reward_after_std": 0.8638960495591164, "reward_before_mean": 1.034262259490788, "reward_before_std": 0.8446610942482948, "reward_change_max": 0.0016998574137687683, "reward_change_mean": -0.4524852652102709, "reward_change_min": -0.7840995192527771, "reward_change_std": 0.3182655703276396, "reward_std": 0.8638960756361485, "rewards/cosine_scaled_reward": 0.10046443715691566, "rewards/format_reward": 0.8333333507180214, "step": 405 }, { "advantage_max": 1.579980731010437, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.9753882475197315, "advantage_std": 0.9244318529963493, "completion_length": 2248.2500610351562, "epoch": 0.464, "grad_norm": 1.3583718538284302, "kl": 0.151763916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.934696604901642e-07, "loss": 0.0449, "reward": 0.6996534131467342, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6996534131467342, "reward_after_std": 0.9244318082928658, "reward_before_mean": 1.1870945319533348, "reward_before_std": 0.8938433974981308, "reward_change_max": 0.0, "reward_change_mean": -0.48744115233421326, "reward_change_min": -0.8586244881153107, "reward_change_std": 0.33273613080382347, "reward_std": 0.924431823194027, "rewards/cosine_scaled_reward": 0.1352139227092266, "rewards/format_reward": 0.916666679084301, "step": 406 }, { "advantage_max": 1.241002269089222, "advantage_mean": -1.1796752352744022e-08, "advantage_min": -0.8528676331043243, "advantage_std": 0.7451582886278629, "completion_length": 2460.3334045410156, "epoch": 0.46514285714285714, "grad_norm": 0.8853124380111694, "kl": 0.1806640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.915615368891117e-07, "loss": 0.0284, "reward": 0.5823654560372233, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5823654560372233, "reward_after_std": 0.745158277451992, "reward_before_mean": 1.0513971992768347, "reward_before_std": 0.7118194662034512, "reward_change_max": 0.0, "reward_change_mean": -0.46903173439204693, "reward_change_min": -0.7383575513958931, "reward_change_std": 0.2971672974526882, "reward_std": 0.745158277451992, "rewards/cosine_scaled_reward": 0.12986523960717022, "rewards/format_reward": 0.7916666828095913, "step": 407 }, { "advantage_max": 1.7559831738471985, "advantage_mean": -3.973643192267673e-08, "advantage_min": -0.9686634726822376, "advantage_std": 0.9539515525102615, "completion_length": 2662.1250610351562, "epoch": 0.4662857142857143, "grad_norm": 1.44785737991333, "kl": 0.2345733642578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8967088307307e-07, "loss": 0.0491, "reward": 0.7052465332672, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7052465332672, "reward_after_std": 0.9539515525102615, "reward_before_mean": 1.1805474273860455, "reward_before_std": 0.8772687762975693, "reward_change_max": 0.0, "reward_change_mean": -0.4753008782863617, "reward_change_min": -0.7417329847812653, "reward_change_std": 0.2891210447996855, "reward_std": 0.9539515823125839, "rewards/cosine_scaled_reward": 0.18402369134128094, "rewards/format_reward": 0.8125000204890966, "step": 408 }, { "advantage_max": 1.1871024146676064, "advantage_mean": -9.31322596819939e-09, "advantage_min": -0.7041791677474976, "advantage_std": 0.719702310860157, "completion_length": 3173.3959350585938, "epoch": 0.4674285714285714, "grad_norm": 0.584197998046875, "kl": 0.3837890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8779779118983867e-07, "loss": 0.0472, "reward": 0.2377706104889512, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2377706104889512, "reward_after_std": 0.7197023220360279, "reward_before_mean": 0.5875239875167608, "reward_before_std": 0.721150603145361, "reward_change_max": 0.00040981173515319824, "reward_change_mean": -0.34975339006632566, "reward_change_min": -0.6922573670744896, "reward_change_std": 0.2620809990912676, "reward_std": 0.7197023443877697, "rewards/cosine_scaled_reward": 0.002095327712595463, "rewards/format_reward": 0.5833333395421505, "step": 409 }, { "advantage_max": 1.271196611225605, "advantage_mean": -6.208817571184966e-09, "advantage_min": -0.6538401357829571, "advantage_std": 0.6966192908585072, "completion_length": 2520.9792098999023, "epoch": 0.4685714285714286, "grad_norm": 0.32509198784828186, "kl": 0.3772735595703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8594235253127372e-07, "loss": 0.0199, "reward": 0.250648136716336, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.250648136716336, "reward_after_std": 0.6966192834079266, "reward_before_mean": 0.5946154878474772, "reward_before_std": 0.6319909431040287, "reward_change_max": 0.0, "reward_change_mean": -0.343967342749238, "reward_change_min": -0.5719360187649727, "reward_change_std": 0.21776084788143635, "reward_std": 0.6966192871332169, "rewards/cosine_scaled_reward": -0.03602561820298433, "rewards/format_reward": 0.6666666679084301, "step": 410 }, { "advantage_max": 1.2601307108998299, "advantage_mean": -2.7939678626243136e-09, "advantage_min": -0.8391876332461834, "advantage_std": 0.7550375498831272, "completion_length": 2947.6875915527344, "epoch": 0.4697142857142857, "grad_norm": 0.7180549502372742, "kl": 0.292205810546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8410465752883758e-07, "loss": 0.0225, "reward": 0.5333940163254738, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5333940163254738, "reward_after_std": 0.7550375796854496, "reward_before_mean": 0.9847032781690359, "reward_before_std": 0.7263107262551785, "reward_change_max": 0.0003239363431930542, "reward_change_mean": -0.4513092339038849, "reward_change_min": -0.7858996503055096, "reward_change_std": 0.2997463811188936, "reward_std": 0.7550375871360302, "rewards/cosine_scaled_reward": 0.07568495441228151, "rewards/format_reward": 0.8333333432674408, "step": 411 }, { "advantage_max": 1.1648833081126213, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -0.8400361612439156, "advantage_std": 0.707458071410656, "completion_length": 2883.291717529297, "epoch": 0.47085714285714286, "grad_norm": 0.45658764243125916, "kl": 0.254974365234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.822847957491922e-07, "loss": 0.0437, "reward": 0.44985311944037676, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.44985311944037676, "reward_after_std": 0.707458071410656, "reward_before_mean": 0.8784957565367222, "reward_before_std": 0.6848769150674343, "reward_change_max": 4.801899194717407e-05, "reward_change_mean": -0.42864260636270046, "reward_change_min": -0.6926585100591183, "reward_change_std": 0.28065695613622665, "reward_std": 0.7074581012129784, "rewards/cosine_scaled_reward": 0.05383117590099573, "rewards/format_reward": 0.7708333469927311, "step": 412 }, { "advantage_max": 1.3362039625644684, "advantage_mean": -2.1109979431166437e-08, "advantage_min": -0.8293716721236706, "advantage_std": 0.7513575069606304, "completion_length": 2932.416748046875, "epoch": 0.472, "grad_norm": 0.4243045747280121, "kl": 0.34423828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.804828558898332e-07, "loss": 0.0409, "reward": 0.46855833008885384, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46855833008885384, "reward_after_std": 0.751357514411211, "reward_before_mean": 0.8912516199052334, "reward_before_std": 0.6947352215647697, "reward_change_max": 0.0, "reward_change_mean": -0.42269331961870193, "reward_change_min": -0.6819902062416077, "reward_change_std": 0.2646718043833971, "reward_std": 0.7513575218617916, "rewards/cosine_scaled_reward": 0.11229247087612748, "rewards/format_reward": 0.6666666734963655, "step": 413 }, { "advantage_max": 1.4965841621160507, "advantage_mean": -7.45058115203534e-09, "advantage_min": -0.9178449511528015, "advantage_std": 0.8657373040914536, "completion_length": 3184.791717529297, "epoch": 0.47314285714285714, "grad_norm": 0.5575371384620667, "kl": 0.42626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7869892577476722e-07, "loss": 0.04, "reward": 0.11716062761843204, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11716062761843204, "reward_after_std": 0.8657373264431953, "reward_before_mean": 0.401735796011053, "reward_before_std": 0.8855462893843651, "reward_change_max": 0.0011830329895019531, "reward_change_mean": -0.2845751619897783, "reward_change_min": -0.5535517930984497, "reward_change_std": 0.2295767618343234, "reward_std": 0.8657373674213886, "rewards/cosine_scaled_reward": -0.10121545614674687, "rewards/format_reward": 0.6041666883975267, "step": 414 }, { "advantage_max": 1.6141176372766495, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -1.0077509805560112, "advantage_std": 0.9576461650431156, "completion_length": 3334.0000610351562, "epoch": 0.4742857142857143, "grad_norm": 0.752418577671051, "kl": 0.502197265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7693309235023127e-07, "loss": 0.0569, "reward": 0.3126736783888191, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3126736783888191, "reward_after_std": 0.9576461650431156, "reward_before_mean": 0.6598517000675201, "reward_before_std": 0.989838071167469, "reward_change_max": 0.0, "reward_change_mean": -0.3471780326217413, "reward_change_min": -0.6480702608823776, "reward_change_std": 0.27285377867519855, "reward_std": 0.9576461873948574, "rewards/cosine_scaled_reward": -0.03465749090537429, "rewards/format_reward": 0.7291666865348816, "step": 415 }, { "advantage_max": 1.2872228100895882, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.7412748411297798, "advantage_std": 0.7350778169929981, "completion_length": 2092.854232788086, "epoch": 0.4754285714285714, "grad_norm": 0.7298946976661682, "kl": 0.1840667724609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7518544168045524e-07, "loss": 0.0363, "reward": 0.5364300422370434, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5364300422370434, "reward_after_std": 0.7350778095424175, "reward_before_mean": 0.9861254207789898, "reward_before_std": 0.6707355920225382, "reward_change_max": 0.0, "reward_change_mean": -0.44969535805284977, "reward_change_min": -0.7272941693663597, "reward_change_std": 0.27874294482171535, "reward_std": 0.7350778356194496, "rewards/cosine_scaled_reward": 0.07639601826667786, "rewards/format_reward": 0.8333333414047956, "step": 416 }, { "advantage_max": 1.5041983649134636, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.8219343274831772, "advantage_std": 0.8873535022139549, "completion_length": 3249.375030517578, "epoch": 0.4765714285714286, "grad_norm": 0.6985993981361389, "kl": 0.469482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7345605894346726e-07, "loss": 0.0623, "reward": 0.0966934897005558, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0966934897005558, "reward_after_std": 0.887353528290987, "reward_before_mean": 0.3739356682635844, "reward_before_std": 0.9190301541239023, "reward_change_max": 0.0009882226586341858, "reward_change_mean": -0.27724218368530273, "reward_change_min": -0.6620569825172424, "reward_change_std": 0.260542631149292, "reward_std": 0.8873535580933094, "rewards/cosine_scaled_reward": -0.12553217657841742, "rewards/format_reward": 0.6250000074505806, "step": 417 }, { "advantage_max": 1.4096960350871086, "advantage_mean": -3.725290376177526e-08, "advantage_min": -0.8126649931073189, "advantage_std": 0.8450727388262749, "completion_length": 2094.7292098999023, "epoch": 0.4777142857142857, "grad_norm": 1.2689417600631714, "kl": 0.41217803955078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7174502842694212e-07, "loss": -0.0456, "reward": 0.4172011539340019, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4172011539340019, "reward_after_std": 0.8450727574527264, "reward_before_mean": 0.8055177573114634, "reward_before_std": 0.8436265364289284, "reward_change_max": 0.0004687011241912842, "reward_change_mean": -0.3883166164159775, "reward_change_min": -0.7676271609961987, "reward_change_std": 0.29594606440514326, "reward_std": 0.8450727611780167, "rewards/cosine_scaled_reward": 0.05900887493044138, "rewards/format_reward": 0.6875000149011612, "step": 418 }, { "advantage_max": 1.4702820256352425, "advantage_mean": 3.725290298461914e-09, "advantage_min": -0.8695723973214626, "advantage_std": 0.8322137109935284, "completion_length": 2882.604278564453, "epoch": 0.47885714285714287, "grad_norm": 1.4612587690353394, "kl": 0.684814453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7005243352409333e-07, "loss": 0.0799, "reward": 0.5663878936320543, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5663878936320543, "reward_after_std": 0.832213718444109, "reward_before_mean": 1.0104223068337888, "reward_before_std": 0.7705448046326637, "reward_change_max": 0.000252746045589447, "reward_change_mean": -0.4440344120375812, "reward_change_min": -0.7140844948589802, "reward_change_std": 0.28817459661513567, "reward_std": 0.8322137407958508, "rewards/cosine_scaled_reward": 0.14062782609835267, "rewards/format_reward": 0.7291666716337204, "step": 419 }, { "advantage_max": 1.3699936755001545, "advantage_mean": -2.048909719665204e-08, "advantage_min": -0.8499936163425446, "advantage_std": 0.8008501157164574, "completion_length": 2282.854217529297, "epoch": 0.48, "grad_norm": 0.8160320520401001, "kl": 0.307464599609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6837835672960831e-07, "loss": 0.0613, "reward": 0.33678290247917175, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33678290247917175, "reward_after_std": 0.8008501008152962, "reward_before_mean": 0.707935786806047, "reward_before_std": 0.7854114696383476, "reward_change_max": 0.0, "reward_change_mean": -0.37115289457142353, "reward_change_min": -0.6540162637829781, "reward_change_std": 0.261508384719491, "reward_std": 0.8008501194417477, "rewards/cosine_scaled_reward": -0.031448788940906525, "rewards/format_reward": 0.7708333544433117, "step": 420 }, { "advantage_max": 1.5776860639452934, "advantage_mean": -1.3038516433194758e-08, "advantage_min": -0.8760270141065121, "advantage_std": 0.8757519386708736, "completion_length": 3130.416748046875, "epoch": 0.48114285714285715, "grad_norm": 1.04563570022583, "kl": 0.467529296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6672287963562852e-07, "loss": 0.0216, "reward": 0.24572166753932834, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24572166753932834, "reward_after_std": 0.8757519423961639, "reward_before_mean": 0.5705269044265151, "reward_before_std": 0.8527628295123577, "reward_change_max": 0.0, "reward_change_mean": -0.3248052569106221, "reward_change_min": -0.5897286981344223, "reward_change_std": 0.23443656880408525, "reward_std": 0.8757519759237766, "rewards/cosine_scaled_reward": -0.06890320917591453, "rewards/format_reward": 0.7083333488553762, "step": 421 }, { "advantage_max": 1.361400119960308, "advantage_mean": -1.2417634420724966e-08, "advantage_min": -0.6548620238900185, "advantage_std": 0.7440179772675037, "completion_length": 2844.229248046875, "epoch": 0.48228571428571426, "grad_norm": 0.553774893283844, "kl": 0.36322021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6508608292777203e-07, "loss": 0.0398, "reward": 0.4359323289245367, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4359323289245367, "reward_after_std": 0.7440179847180843, "reward_before_mean": 0.8442143835127354, "reward_before_std": 0.6683123558759689, "reward_change_max": 0.0007221251726150513, "reward_change_mean": -0.40828206948935986, "reward_change_min": -0.6951416172087193, "reward_change_std": 0.2510274276137352, "reward_std": 0.7440180070698261, "rewards/cosine_scaled_reward": -0.015392808709293604, "rewards/format_reward": 0.8750000111758709, "step": 422 }, { "advantage_max": 1.1223453134298325, "advantage_mean": 9.62366689116756e-09, "advantage_min": -0.7540298514068127, "advantage_std": 0.6678426675498486, "completion_length": 2842.916793823242, "epoch": 0.48342857142857143, "grad_norm": 0.4749550223350525, "kl": 0.377349853515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6346804638120098e-07, "loss": 0.0351, "reward": 0.1393016508081928, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1393016508081928, "reward_after_std": 0.6678426824510098, "reward_before_mean": 0.45855964068323374, "reward_before_std": 0.6641668677330017, "reward_change_max": 0.0, "reward_change_mean": -0.31925799883902073, "reward_change_min": -0.5939181633293629, "reward_change_std": 0.23012700304389, "reward_std": 0.6678427010774612, "rewards/cosine_scaled_reward": -0.14572018571197987, "rewards/format_reward": 0.7500000149011612, "step": 423 }, { "advantage_max": 1.4285841286182404, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.8253460563719273, "advantage_std": 0.8169199749827385, "completion_length": 3067.5000610351562, "epoch": 0.4845714285714286, "grad_norm": 0.528998613357544, "kl": 0.48583984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6186884885673413e-07, "loss": 0.0324, "reward": 0.16846600966528058, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16846600966528058, "reward_after_std": 0.8169199749827385, "reward_before_mean": 0.476221552118659, "reward_before_std": 0.8139840699732304, "reward_change_max": 0.0, "reward_change_mean": -0.3077555485069752, "reward_change_min": -0.5694200806319714, "reward_change_std": 0.22507017478346825, "reward_std": 0.81691999360919, "rewards/cosine_scaled_reward": -0.10563923278823495, "rewards/format_reward": 0.6875000260770321, "step": 424 }, { "advantage_max": 1.5528990626335144, "advantage_mean": -4.221995819619906e-08, "advantage_min": -1.2289166450500488, "advantage_std": 0.9778655990958214, "completion_length": 1963.1250610351562, "epoch": 0.4857142857142857, "grad_norm": 1.2701576948165894, "kl": 0.180633544921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6028856829700258e-07, "loss": 0.0464, "reward": 0.9164818078279495, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9164818078279495, "reward_after_std": 0.9778655841946602, "reward_before_mean": 1.4825816750526428, "reward_before_std": 0.9932750500738621, "reward_change_max": 0.0, "reward_change_mean": -0.5660999063402414, "reward_change_min": -0.950649194419384, "reward_change_std": 0.3851541206240654, "reward_std": 0.9778655916452408, "rewards/cosine_scaled_reward": 0.3142075026407838, "rewards/format_reward": 0.8541666828095913, "step": 425 }, { "advantage_max": 1.4850023314356804, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -0.8387530986219645, "advantage_std": 0.8622367791831493, "completion_length": 2385.000045776367, "epoch": 0.4868571428571429, "grad_norm": 0.5399412512779236, "kl": 0.41119384765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5872728172265146e-07, "loss": 0.0495, "reward": 0.41559531493112445, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41559531493112445, "reward_after_std": 0.8622367866337299, "reward_before_mean": 0.8041009623557329, "reward_before_std": 0.8447239138185978, "reward_change_max": 9.606033563613892e-05, "reward_change_mean": -0.3885056748986244, "reward_change_min": -0.6781072355806828, "reward_change_std": 0.2709171436727047, "reward_std": 0.8622367903590202, "rewards/cosine_scaled_reward": -0.00419950857758522, "rewards/format_reward": 0.8125000111758709, "step": 426 }, { "advantage_max": 1.783732920885086, "advantage_mean": -2.220446049250313e-16, "advantage_min": -1.004781313240528, "advantage_std": 1.019487425684929, "completion_length": 2844.291717529297, "epoch": 0.488, "grad_norm": 1.7400785684585571, "kl": 0.4984130859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5718506522858572e-07, "loss": 0.102, "reward": 0.2693352377973497, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2693352377973497, "reward_after_std": 1.0194874107837677, "reward_before_mean": 0.5815716478973627, "reward_before_std": 1.039123497903347, "reward_change_max": 0.0, "reward_change_mean": -0.3122364245355129, "reward_change_min": -0.6392947360873222, "reward_change_std": 0.2599599938839674, "reward_std": 1.0194874852895737, "rewards/cosine_scaled_reward": -0.04254750721156597, "rewards/format_reward": 0.6666666865348816, "step": 427 }, { "advantage_max": 1.3194306641817093, "advantage_mean": -6.829699084054397e-09, "advantage_min": -0.857621468603611, "advantage_std": 0.7734849788248539, "completion_length": 2835.6667098999023, "epoch": 0.48914285714285716, "grad_norm": 0.7393080592155457, "kl": 0.417510986328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5566199398026147e-07, "loss": 0.0293, "reward": 0.28968586708651856, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28968586708651856, "reward_after_std": 0.7734849452972412, "reward_before_mean": 0.6473546512424946, "reward_before_std": 0.7677021622657776, "reward_change_max": 0.0, "reward_change_mean": -0.35766879096627235, "reward_change_min": -0.6340950168669224, "reward_change_std": 0.24785144068300724, "reward_std": 0.7734849527478218, "rewards/cosine_scaled_reward": -0.11382270202739164, "rewards/format_reward": 0.8750000223517418, "step": 428 }, { "advantage_max": 0.9747552573680878, "advantage_mean": 9.62366689116756e-09, "advantage_min": -0.5368304066359997, "advantage_std": 0.5519351437687874, "completion_length": 2275.416763305664, "epoch": 0.49028571428571427, "grad_norm": 1.0370701551437378, "kl": 0.2674560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5415814221002265e-07, "loss": -0.0058, "reward": 0.38617168786004186, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38617168786004186, "reward_after_std": 0.5519351437687874, "reward_before_mean": 0.8017744198441505, "reward_before_std": 0.4633374772965908, "reward_change_max": 0.0, "reward_change_mean": -0.4156027212738991, "reward_change_min": -0.6280790641903877, "reward_change_std": 0.24106368236243725, "reward_std": 0.5519351549446583, "rewards/cosine_scaled_reward": -0.057446133345365524, "rewards/format_reward": 0.9166666679084301, "step": 429 }, { "advantage_max": 1.1260406970977783, "advantage_mean": -1.4901161971003773e-08, "advantage_min": -0.7276361249387264, "advantage_std": 0.6451876908540726, "completion_length": 2481.750030517578, "epoch": 0.49142857142857144, "grad_norm": 0.5541008710861206, "kl": 0.3700408935546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5267358321348285e-07, "loss": 0.0265, "reward": 0.3534410297870636, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3534410297870636, "reward_after_std": 0.6451876983046532, "reward_before_mean": 0.7495078779757023, "reward_before_std": 0.5930134803056717, "reward_change_max": 0.0, "reward_change_mean": -0.3960668696090579, "reward_change_min": -0.6170521266758442, "reward_change_std": 0.2415135744959116, "reward_std": 0.645187720656395, "rewards/cosine_scaled_reward": -0.00024606427177786827, "rewards/format_reward": 0.7500000111758709, "step": 430 }, { "advantage_max": 1.158790297806263, "advantage_mean": -2.7939677238464355e-09, "advantage_min": -0.6757703498005867, "advantage_std": 0.6554910391569138, "completion_length": 2506.8334045410156, "epoch": 0.49257142857142855, "grad_norm": 0.8864076137542725, "kl": 0.35559844970703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5120838934595337e-07, "loss": 0.0084, "reward": 0.20965594646986574, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20965594646986574, "reward_after_std": 0.6554910317063332, "reward_before_mean": 0.5490434896200895, "reward_before_std": 0.6157159730792046, "reward_change_max": 0.0006238818168640137, "reward_change_mean": -0.33938753232359886, "reward_change_min": -0.5494464412331581, "reward_change_std": 0.2182679483667016, "reward_std": 0.6554910577833652, "rewards/cosine_scaled_reward": -0.15256160404533148, "rewards/format_reward": 0.8541666716337204, "step": 431 }, { "advantage_max": 1.2552975341677666, "advantage_mean": -8.071462331837864e-09, "advantage_min": -0.7928625233471394, "advantage_std": 0.7409033365547657, "completion_length": 2835.7709045410156, "epoch": 0.4937142857142857, "grad_norm": 1.0404794216156006, "kl": 0.420684814453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4976263201891613e-07, "loss": 0.0706, "reward": 0.45932803535833955, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.45932803535833955, "reward_after_std": 0.7409033589065075, "reward_before_mean": 0.8826699033379555, "reward_before_std": 0.7020781114697456, "reward_change_max": 0.003414548933506012, "reward_change_mean": -0.42334189265966415, "reward_change_min": -0.6823976673185825, "reward_change_std": 0.2694571502506733, "reward_std": 0.7409033626317978, "rewards/cosine_scaled_reward": -0.01699838414788246, "rewards/format_reward": 0.9166666865348816, "step": 432 }, { "advantage_max": 1.3465048968791962, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.6381904669106007, "advantage_std": 0.7317006289958954, "completion_length": 2914.12508392334, "epoch": 0.4948571428571429, "grad_norm": 0.7161730527877808, "kl": 0.39739990234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.483363816965435e-07, "loss": 0.051, "reward": 0.5220309607684612, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5220309607684612, "reward_after_std": 0.731700599193573, "reward_before_mean": 0.9621444530785084, "reward_before_std": 0.6160903349518776, "reward_change_max": 0.0003250539302825928, "reward_change_mean": -0.44011346995830536, "reward_change_min": -0.6610860973596573, "reward_change_std": 0.25879207253456116, "reward_std": 0.7317006103694439, "rewards/cosine_scaled_reward": 0.07482220698148012, "rewards/format_reward": 0.8125000149011612, "step": 433 }, { "advantage_max": 1.2062464877963066, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.6216416116803885, "advantage_std": 0.6547112353146076, "completion_length": 3110.6459045410156, "epoch": 0.496, "grad_norm": 0.389948308467865, "kl": 0.447509765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.469297078922642e-07, "loss": 0.0442, "reward": -0.06265595648437738, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06265595648437738, "reward_after_std": 0.6547112427651882, "reward_before_mean": 0.17960346583276987, "reward_before_std": 0.6161167174577713, "reward_change_max": 0.0, "reward_change_mean": -0.2422594241797924, "reward_change_min": -0.4483148455619812, "reward_change_std": 0.17038149200379848, "reward_std": 0.6547112688422203, "rewards/cosine_scaled_reward": -0.28519828617572784, "rewards/format_reward": 0.7500000149011612, "step": 434 }, { "advantage_max": 1.1669103428721428, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -0.6414779610931873, "advantage_std": 0.6753768660128117, "completion_length": 2001.2292442321777, "epoch": 0.49714285714285716, "grad_norm": 0.7122639417648315, "kl": 0.27149200439453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4554267916537495e-07, "loss": -0.0088, "reward": 0.3204499026760459, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3204499026760459, "reward_after_std": 0.675376869738102, "reward_before_mean": 0.6983707807958126, "reward_before_std": 0.6387381218373775, "reward_change_max": 0.0, "reward_change_mean": -0.3779208790510893, "reward_change_min": -0.6663752608001232, "reward_change_std": 0.2425840962678194, "reward_std": 0.6753768734633923, "rewards/cosine_scaled_reward": -0.08831463009119034, "rewards/format_reward": 0.8750000074505806, "step": 435 }, { "advantage_max": 1.3380301296710968, "advantage_mean": -2.7318795670083773e-08, "advantage_min": -0.9807571396231651, "advantage_std": 0.8031731098890305, "completion_length": 2268.5208587646484, "epoch": 0.4982857142857143, "grad_norm": 0.5405041575431824, "kl": 0.2992401123046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4417536311769885e-07, "loss": 0.017, "reward": 0.6386113851331174, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6386113851331174, "reward_after_std": 0.803173117339611, "reward_before_mean": 1.1200417447835207, "reward_before_std": 0.7725371960550547, "reward_change_max": 0.0, "reward_change_mean": -0.48143038526177406, "reward_change_min": -0.7670682519674301, "reward_change_std": 0.3101059626787901, "reward_std": 0.8031731359660625, "rewards/cosine_scaled_reward": 0.13293753401376307, "rewards/format_reward": 0.8541666865348816, "step": 436 }, { "advantage_max": 1.1815454438328743, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.8695116825401783, "advantage_std": 0.7322442196309566, "completion_length": 3261.0625915527344, "epoch": 0.49942857142857144, "grad_norm": 0.8688717484474182, "kl": 0.55712890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4282782639029128e-07, "loss": 0.0415, "reward": 0.25387762673199177, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25387762673199177, "reward_after_std": 0.7322442270815372, "reward_before_mean": 0.6103640319779515, "reward_before_std": 0.7458185590803623, "reward_change_max": 0.0002872347831726074, "reward_change_mean": -0.35648639500141144, "reward_change_min": -0.6514874063432217, "reward_change_std": 0.2629950176924467, "reward_std": 0.732244249433279, "rewards/cosine_scaled_reward": -0.08023467287421227, "rewards/format_reward": 0.7708333507180214, "step": 437 }, { "advantage_max": 1.1966863423585892, "advantage_mean": -1.8316011179964065e-08, "advantage_min": -0.8341338858008385, "advantage_std": 0.7336725704371929, "completion_length": 2907.979217529297, "epoch": 0.5005714285714286, "grad_norm": 0.4294912815093994, "kl": 0.367462158203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4150013466019114e-07, "loss": 0.0373, "reward": 0.30324064660817385, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30324064660817385, "reward_after_std": 0.7336725741624832, "reward_before_mean": 0.6767560187727213, "reward_before_std": 0.7442520670592785, "reward_change_max": 0.0, "reward_change_mean": -0.37351538613438606, "reward_change_min": -0.688855767250061, "reward_change_std": 0.26820408646017313, "reward_std": 0.7336725778877735, "rewards/cosine_scaled_reward": -0.026205329224467278, "rewards/format_reward": 0.7291666734963655, "step": 438 }, { "advantage_max": 1.2665520757436752, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.6952036470174789, "advantage_std": 0.702237457036972, "completion_length": 2692.5833892822266, "epoch": 0.5017142857142857, "grad_norm": 0.6560401320457458, "kl": 0.3876953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4019235263722034e-07, "loss": 0.0416, "reward": 0.2241203337907791, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2241203337907791, "reward_after_std": 0.7022374682128429, "reward_before_mean": 0.5633864104747772, "reward_before_std": 0.6478680744767189, "reward_change_max": 0.0018536224961280823, "reward_change_mean": -0.3392660841345787, "reward_change_min": -0.567062571644783, "reward_change_std": 0.22123288549482822, "reward_std": 0.7022374868392944, "rewards/cosine_scaled_reward": -0.11414013616740704, "rewards/format_reward": 0.791666679084301, "step": 439 }, { "advantage_max": 1.1453857421875, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6617013551294804, "advantage_std": 0.6645463220775127, "completion_length": 2964.7083740234375, "epoch": 0.5028571428571429, "grad_norm": 1.013630986213684, "kl": 0.516143798828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3890454406082956e-07, "loss": 0.0305, "reward": -0.036612953059375286, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.036612953059375286, "reward_after_std": 0.664546325802803, "reward_before_mean": 0.218028850533301, "reward_before_std": 0.6687482669949532, "reward_change_max": 0.0, "reward_change_mean": -0.25464182160794735, "reward_change_min": -0.5325037129223347, "reward_change_std": 0.20117001980543137, "reward_std": 0.6645463369786739, "rewards/cosine_scaled_reward": -0.19306890480220318, "rewards/format_reward": 0.604166679084301, "step": 440 }, { "advantage_max": 1.3998192101716995, "advantage_mean": -1.800557042352935e-08, "advantage_min": -0.9868221804499626, "advantage_std": 0.8746149949729443, "completion_length": 2815.854217529297, "epoch": 0.504, "grad_norm": 1.0281097888946533, "kl": 0.436187744140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3763677169699217e-07, "loss": 0.0168, "reward": 0.45351985446177423, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.45351985446177423, "reward_after_std": 0.8746149949729443, "reward_before_mean": 0.8660605433396995, "reward_before_std": 0.9094558022916317, "reward_change_max": 0.000750415027141571, "reward_change_mean": -0.4125407300889492, "reward_change_min": -0.7904599867761135, "reward_change_std": 0.31091453321278095, "reward_std": 0.874615054577589, "rewards/cosine_scaled_reward": 0.005946941673755646, "rewards/format_reward": 0.8541666865348816, "step": 441 }, { "advantage_max": 1.6837698444724083, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -1.013794220983982, "advantage_std": 0.9842873066663742, "completion_length": 2870.7709045410156, "epoch": 0.5051428571428571, "grad_norm": 0.6417372822761536, "kl": 0.419921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3638909733514452e-07, "loss": 0.0562, "reward": 0.5628248087596148, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5628248087596148, "reward_after_std": 0.9842873364686966, "reward_before_mean": 0.9925806345418096, "reward_before_std": 0.9769185334444046, "reward_change_max": 0.0, "reward_change_mean": -0.4297558609396219, "reward_change_min": -0.7348082773387432, "reward_change_std": 0.30168131552636623, "reward_std": 0.9842873699963093, "rewards/cosine_scaled_reward": 0.10045698285102844, "rewards/format_reward": 0.7916666753590107, "step": 442 }, { "advantage_max": 1.337314061820507, "advantage_mean": -4.035731110407781e-09, "advantage_min": -0.8306575156748295, "advantage_std": 0.7948845028877258, "completion_length": 2973.6251068115234, "epoch": 0.5062857142857143, "grad_norm": 0.7379650473594666, "kl": 0.41070556640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.351615817851748e-07, "loss": 0.0476, "reward": 0.22690303064882755, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22690303064882755, "reward_after_std": 0.7948845438659191, "reward_before_mean": 0.5625253785401583, "reward_before_std": 0.8026487492024899, "reward_change_max": 0.0, "reward_change_mean": -0.33562235068529844, "reward_change_min": -0.6595030464231968, "reward_change_std": 0.2517847139388323, "reward_std": 0.7948845475912094, "rewards/cosine_scaled_reward": -0.08332065586000681, "rewards/format_reward": 0.7291666772216558, "step": 443 }, { "advantage_max": 1.0303258448839188, "advantage_mean": -5.898376453927767e-09, "advantage_min": -0.70342618227005, "advantage_std": 0.6188901476562023, "completion_length": 2797.3125762939453, "epoch": 0.5074285714285715, "grad_norm": 0.4941686987876892, "kl": 0.3975830078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3395428487445914e-07, "loss": 0.0183, "reward": 0.3297221283428371, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3297221283428371, "reward_after_std": 0.6188901402056217, "reward_before_mean": 0.7205270808190107, "reward_before_std": 0.5975214540958405, "reward_change_max": 0.0, "reward_change_mean": -0.3908049762248993, "reward_change_min": -0.6559524200856686, "reward_change_std": 0.24912833236157894, "reward_std": 0.6188901737332344, "rewards/cosine_scaled_reward": -0.098069803789258, "rewards/format_reward": 0.9166666865348816, "step": 444 }, { "advantage_max": 1.4425413608551025, "advantage_mean": -9.934107980669182e-09, "advantage_min": -0.8248635195195675, "advantage_std": 0.8054063022136688, "completion_length": 2815.104248046875, "epoch": 0.5085714285714286, "grad_norm": 0.6476776599884033, "kl": 0.385772705078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3276726544494571e-07, "loss": 0.0453, "reward": 0.23617349471896887, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23617349471896887, "reward_after_std": 0.8054062686860561, "reward_before_mean": 0.5678501327056438, "reward_before_std": 0.779238685965538, "reward_change_max": 0.0006808564066886902, "reward_change_mean": -0.3316766396164894, "reward_change_min": -0.6133453659713268, "reward_change_std": 0.22692137584090233, "reward_std": 0.8054062835872173, "rewards/cosine_scaled_reward": -0.1014916084241122, "rewards/format_reward": 0.7708333544433117, "step": 445 }, { "advantage_max": 1.4127313196659088, "advantage_mean": -4.967054045845742e-09, "advantage_min": -0.9514566399157047, "advantage_std": 0.841310903429985, "completion_length": 2577.4167404174805, "epoch": 0.5097142857142857, "grad_norm": 0.7943739891052246, "kl": 0.3366851806640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.316005813502869e-07, "loss": 0.0403, "reward": 0.5418746005743742, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5418746005743742, "reward_after_std": 0.8413108885288239, "reward_before_mean": 0.9842005173268262, "reward_before_std": 0.8326334059238434, "reward_change_max": 0.000830821692943573, "reward_change_mean": -0.4423258863389492, "reward_change_min": -0.7536700703203678, "reward_change_std": 0.30366277135908604, "reward_std": 0.8413109183311462, "rewards/cosine_scaled_reward": 0.1066835792735219, "rewards/format_reward": 0.7708333432674408, "step": 446 }, { "advantage_max": 1.3711147084832191, "advantage_mean": -2.359350548264416e-08, "advantage_min": -0.7326249293982983, "advantage_std": 0.7835522890090942, "completion_length": 2426.2500610351562, "epoch": 0.5108571428571429, "grad_norm": 0.4948989748954773, "kl": 0.27752685546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3045428945301953e-07, "loss": 0.0314, "reward": 0.3651602268218994, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3651602268218994, "reward_after_std": 0.7835522815585136, "reward_before_mean": 0.7487014681100845, "reward_before_std": 0.7524013314396143, "reward_change_max": 0.0001742541790008545, "reward_change_mean": -0.3835412599146366, "reward_change_min": -0.7031693980097771, "reward_change_std": 0.25586988404393196, "reward_std": 0.7835522890090942, "rewards/cosine_scaled_reward": -0.08398262108676136, "rewards/format_reward": 0.916666679084301, "step": 447 }, { "advantage_max": 1.3957402855157852, "advantage_mean": -3.10440864126349e-08, "advantage_min": -0.863853208720684, "advantage_std": 0.8466850370168686, "completion_length": 2403.354217529297, "epoch": 0.512, "grad_norm": 0.5587709546089172, "kl": 0.33380126953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2932844562179352e-07, "loss": 0.0388, "reward": 0.6120043303817511, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6120043303817511, "reward_after_std": 0.846685029566288, "reward_before_mean": 1.0778508316725492, "reward_before_std": 0.8333419039845467, "reward_change_max": 0.0, "reward_change_mean": -0.4658465310931206, "reward_change_min": -0.8283277899026871, "reward_change_std": 0.3207920826971531, "reward_std": 0.8466850444674492, "rewards/cosine_scaled_reward": 0.14309207256883383, "rewards/format_reward": 0.7916666716337204, "step": 448 }, { "advantage_max": 1.3688802272081375, "advantage_mean": -2.1109978987077227e-08, "advantage_min": -0.874987531453371, "advantage_std": 0.8195151649415493, "completion_length": 2257.6875610351562, "epoch": 0.5131428571428571, "grad_norm": 0.8245042562484741, "kl": 0.3472442626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2822310472864885e-07, "loss": 0.0005, "reward": 0.45789824426174164, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.45789824426174164, "reward_after_std": 0.8195151649415493, "reward_before_mean": 0.8723139688372612, "reward_before_std": 0.8171190805733204, "reward_change_max": 0.0, "reward_change_mean": -0.4144157078117132, "reward_change_min": -0.7483086436986923, "reward_change_std": 0.2881466280668974, "reward_std": 0.8195151947438717, "rewards/cosine_scaled_reward": -0.011759708635509014, "rewards/format_reward": 0.8958333358168602, "step": 449 }, { "advantage_max": 1.13484638184309, "advantage_mean": -1.8316011374253094e-08, "advantage_min": -0.6597550548613071, "advantage_std": 0.6347230449318886, "completion_length": 2701.2917404174805, "epoch": 0.5142857142857142, "grad_norm": 1.3705062866210938, "kl": 0.351776123046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2713832064634125e-07, "loss": -0.003, "reward": 0.4212948093190789, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4212948093190789, "reward_after_std": 0.6347230412065983, "reward_before_mean": 0.8418345140962629, "reward_before_std": 0.5590377673506737, "reward_change_max": 0.0, "reward_change_mean": -0.4205396883189678, "reward_change_min": -0.6680282019078732, "reward_change_std": 0.24988417513668537, "reward_std": 0.6347230449318886, "rewards/cosine_scaled_reward": -0.0061661116778850555, "rewards/format_reward": 0.854166679084301, "step": 450 }, { "advantage_max": 1.2935975268483162, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -0.9024942293763161, "advantage_std": 0.769032035022974, "completion_length": 2333.854232788086, "epoch": 0.5154285714285715, "grad_norm": 0.4481179416179657, "kl": 0.3267059326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.260741462457165e-07, "loss": 0.0365, "reward": 0.5793941374868155, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5793941374868155, "reward_after_std": 0.769032035022974, "reward_before_mean": 1.0413884930312634, "reward_before_std": 0.7353867180645466, "reward_change_max": 0.00039821118116378784, "reward_change_mean": -0.46199433878064156, "reward_change_min": -0.7502287328243256, "reward_change_std": 0.29962888173758984, "reward_std": 0.7690320760011673, "rewards/cosine_scaled_reward": 0.09361090138554573, "rewards/format_reward": 0.854166679084301, "step": 451 }, { "advantage_max": 1.2705382034182549, "advantage_mean": -1.1796752463766325e-08, "advantage_min": -0.9169280380010605, "advantage_std": 0.780293669551611, "completion_length": 3047.479217529297, "epoch": 0.5165714285714286, "grad_norm": 0.4624117314815521, "kl": 0.39569091796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2503063339313356e-07, "loss": 0.0313, "reward": 0.299836840480566, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.299836840480566, "reward_after_std": 0.7802936770021915, "reward_before_mean": 0.66663564927876, "reward_before_std": 0.7914271615445614, "reward_change_max": 0.0013251379132270813, "reward_change_mean": -0.3667988320812583, "reward_change_min": -0.6514892093837261, "reward_change_std": 0.27619097754359245, "reward_std": 0.7802936919033527, "rewards/cosine_scaled_reward": 0.020817823708057404, "rewards/format_reward": 0.6250000149011612, "step": 452 }, { "advantage_max": 1.450104333460331, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -0.8548537157475948, "advantage_std": 0.8533807098865509, "completion_length": 2728.0834197998047, "epoch": 0.5177142857142857, "grad_norm": 0.5218294858932495, "kl": 0.4306640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2400783294793668e-07, "loss": 0.0286, "reward": 0.4595272596925497, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4595272596925497, "reward_after_std": 0.8533807396888733, "reward_before_mean": 0.868656799197197, "reward_before_std": 0.8462558500468731, "reward_change_max": 0.0, "reward_change_mean": -0.4091295227408409, "reward_change_min": -0.7668772041797638, "reward_change_std": 0.28244134970009327, "reward_std": 0.8533807471394539, "rewards/cosine_scaled_reward": 0.007245063781738281, "rewards/format_reward": 0.854166679084301, "step": 453 }, { "advantage_max": 1.288001649081707, "advantage_mean": -4.035731554496991e-09, "advantage_min": -0.6806519478559494, "advantage_std": 0.7259433791041374, "completion_length": 2560.291717529297, "epoch": 0.5188571428571429, "grad_norm": 0.8109725117683411, "kl": 0.325469970703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2300579475997657e-07, "loss": 0.0179, "reward": 0.2342965486459434, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2342965486459434, "reward_after_std": 0.7259433902800083, "reward_before_mean": 0.5756424032151699, "reward_before_std": 0.6870302464812994, "reward_change_max": 0.000321120023727417, "reward_change_mean": -0.3413458652794361, "reward_change_min": -0.5898742564022541, "reward_change_std": 0.2277445401996374, "reward_std": 0.7259434051811695, "rewards/cosine_scaled_reward": -0.0975954644382, "rewards/format_reward": 0.7708333376795053, "step": 454 }, { "advantage_max": 1.7102677822113037, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.7576536983251572, "advantage_std": 0.9212291911244392, "completion_length": 3278.5834350585938, "epoch": 0.52, "grad_norm": 0.6358473300933838, "kl": 0.513427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.220245676671809e-07, "loss": 0.0445, "reward": 0.1089541104156524, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1089541104156524, "reward_after_std": 0.921229176223278, "reward_before_mean": 0.3775138114579022, "reward_before_std": 0.8881769068539143, "reward_change_max": 0.00048607587814331055, "reward_change_mean": -0.26855968311429024, "reward_change_min": -0.5115904286503792, "reward_change_std": 0.20105735212564468, "reward_std": 0.921229176223278, "rewards/cosine_scaled_reward": -0.16540978103876114, "rewards/format_reward": 0.7083333507180214, "step": 455 }, { "advantage_max": 1.6177105605602264, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.9958631098270416, "advantage_std": 0.9880196638405323, "completion_length": 2972.5625610351562, "epoch": 0.5211428571428571, "grad_norm": 0.6450056433677673, "kl": 0.3536376953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2106419949317388e-07, "loss": 0.0434, "reward": 0.42031298764050007, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42031298764050007, "reward_after_std": 0.9880196675658226, "reward_before_mean": 0.8057602029293776, "reward_before_std": 1.0287685617804527, "reward_change_max": 0.0007295906543731689, "reward_change_mean": -0.3854472152888775, "reward_change_min": -0.78760626912117, "reward_change_std": 0.3133588805794716, "reward_std": 0.9880196936428547, "rewards/cosine_scaled_reward": 0.017463432624936104, "rewards/format_reward": 0.7708333432674408, "step": 456 }, { "advantage_max": 0.9447000622749329, "advantage_mean": 7.761021519891997e-09, "advantage_min": -0.4586385563015938, "advantage_std": 0.5160973146557808, "completion_length": 2758.812545776367, "epoch": 0.5222857142857142, "grad_norm": 0.8224807977676392, "kl": 0.35028076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2012473704494537e-07, "loss": 0.007, "reward": 0.26081686979159713, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.26081686979159713, "reward_after_std": 0.5160973146557808, "reward_before_mean": 0.6370738223195076, "reward_before_std": 0.4221321437507868, "reward_change_max": 0.00038155168294906616, "reward_change_mean": -0.3762569138780236, "reward_change_min": -0.6008391063660383, "reward_change_std": 0.21958772093057632, "reward_std": 0.5160973332822323, "rewards/cosine_scaled_reward": -0.03562977910041809, "rewards/format_reward": 0.7083333469927311, "step": 457 }, { "advantage_max": 1.000264324247837, "advantage_mean": 3.4148496808050766e-09, "advantage_min": -0.4859766326844692, "advantage_std": 0.5605919919908047, "completion_length": 2269.2708740234375, "epoch": 0.5234285714285715, "grad_norm": 0.8835322856903076, "kl": 0.3087158203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1920622611056974e-07, "loss": -0.0088, "reward": -0.009334953036159277, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.009334953036159277, "reward_after_std": 0.5605919994413853, "reward_before_mean": 0.25875352788716555, "reward_before_std": 0.5173221491277218, "reward_change_max": 0.0, "reward_change_mean": -0.2680884934961796, "reward_change_min": -0.4858828894793987, "reward_change_std": 0.17110472451895475, "reward_std": 0.5605920068919659, "rewards/cosine_scaled_reward": -0.23520657513290644, "rewards/format_reward": 0.7291666679084301, "step": 458 }, { "advantage_max": 1.5267757251858711, "advantage_mean": -1.2417634920325327e-08, "advantage_min": -0.8131808936595917, "advantage_std": 0.844078216701746, "completion_length": 2138.666732788086, "epoch": 0.5245714285714286, "grad_norm": 0.5719569325447083, "kl": 0.26104736328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1830871145697412e-07, "loss": 0.0098, "reward": 0.483590893127257, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.483590893127257, "reward_after_std": 0.8440782129764557, "reward_before_mean": 0.894639240577817, "reward_before_std": 0.7770277038216591, "reward_change_max": 0.0004179328680038452, "reward_change_mean": -0.4110483396798372, "reward_change_min": -0.682177871465683, "reward_change_std": 0.2591621521860361, "reward_std": 0.8440782576799393, "rewards/cosine_scaled_reward": 0.020236277021467686, "rewards/format_reward": 0.854166679084301, "step": 459 }, { "advantage_max": 1.5692550614476204, "advantage_mean": -8.692344150018627e-09, "advantage_min": -0.9461584500968456, "advantage_std": 0.9103639535605907, "completion_length": 2925.6876220703125, "epoch": 0.5257142857142857, "grad_norm": 0.9705744385719299, "kl": 0.40936279296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1743223682775649e-07, "loss": 0.058, "reward": 0.281633076723665, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.281633076723665, "reward_after_std": 0.9103639498353004, "reward_before_mean": 0.6173821464180946, "reward_before_std": 0.9208094067871571, "reward_change_max": 0.0, "reward_change_mean": -0.3357490822672844, "reward_change_min": -0.6764678545296192, "reward_change_std": 0.2633221186697483, "reward_std": 0.9103639684617519, "rewards/cosine_scaled_reward": -0.06630893185501918, "rewards/format_reward": 0.7500000260770321, "step": 460 }, { "advantage_max": 1.1545908525586128, "advantage_mean": 7.450581318568794e-09, "advantage_min": -0.699158314615488, "advantage_std": 0.6645155474543571, "completion_length": 2859.291732788086, "epoch": 0.5268571428571428, "grad_norm": 1.0532013177871704, "kl": 0.40289306640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1657684494105386e-07, "loss": -0.0062, "reward": 0.392983645782806, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.392983645782806, "reward_after_std": 0.6645155362784863, "reward_before_mean": 0.7981845624744892, "reward_before_std": 0.6197777800261974, "reward_change_max": 0.00014375895261764526, "reward_change_mean": -0.40520089864730835, "reward_change_min": -0.6614694185554981, "reward_change_std": 0.2514055222272873, "reward_std": 0.6645155511796474, "rewards/cosine_scaled_reward": 0.034508924931287766, "rewards/format_reward": 0.729166679084301, "step": 461 }, { "advantage_max": 1.0376139730215073, "advantage_mean": 8.071462498371318e-09, "advantage_min": -0.48623234406113625, "advantage_std": 0.5613770298659801, "completion_length": 2777.041763305664, "epoch": 0.528, "grad_norm": 0.3270750045776367, "kl": 0.333648681640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1574257748745986e-07, "loss": 0.0547, "reward": -0.12515896558761597, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12515896558761597, "reward_after_std": 0.5613770335912704, "reward_before_mean": 0.10399177204817533, "reward_before_std": 0.5172784253954887, "reward_change_max": 0.0003483295440673828, "reward_change_mean": -0.22915074229240417, "reward_change_min": -0.3955531381070614, "reward_change_std": 0.14966793870553374, "reward_std": 0.5613770447671413, "rewards/cosine_scaled_reward": -0.28133745677769184, "rewards/format_reward": 0.6666666753590107, "step": 462 }, { "advantage_max": 1.6833451092243195, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.8412944078445435, "advantage_std": 0.9715849310159683, "completion_length": 3012.3334350585938, "epoch": 0.5291428571428571, "grad_norm": 0.6381751298904419, "kl": 0.445709228515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1492947512799328e-07, "loss": 0.0375, "reward": 0.26589803770184517, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.26589803770184517, "reward_after_std": 0.9715849310159683, "reward_before_mean": 0.588882073876448, "reward_before_std": 0.9903637580573559, "reward_change_max": 0.0003296881914138794, "reward_change_mean": -0.32298404537141323, "reward_change_min": -0.715866107493639, "reward_change_std": 0.26778385415673256, "reward_std": 0.9715849570930004, "rewards/cosine_scaled_reward": -0.028475646511651576, "rewards/format_reward": 0.6458333469927311, "step": 463 }, { "advantage_max": 1.3124027475714684, "advantage_mean": -1.0554989715583218e-08, "advantage_min": -0.788254126906395, "advantage_std": 0.7402537241578102, "completion_length": 2271.979217529297, "epoch": 0.5302857142857142, "grad_norm": 1.9013720750808716, "kl": 0.29579925537109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1413757749211602e-07, "loss": -0.0279, "reward": 0.3903045654296875, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3903045654296875, "reward_after_std": 0.7402537241578102, "reward_before_mean": 0.7856322703883052, "reward_before_std": 0.6768908370286226, "reward_change_max": 0.0, "reward_change_mean": -0.39532770961523056, "reward_change_min": -0.6510465815663338, "reward_change_std": 0.2512638717889786, "reward_std": 0.740253746509552, "rewards/cosine_scaled_reward": -0.01343385933432728, "rewards/format_reward": 0.8125000223517418, "step": 464 }, { "advantage_max": 1.3665404319763184, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.8858865313231945, "advantage_std": 0.8127275332808495, "completion_length": 2940.9375915527344, "epoch": 0.5314285714285715, "grad_norm": 1.5533735752105713, "kl": 0.3443603515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1336692317580158e-07, "loss": 0.0716, "reward": 0.25389866065233946, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25389866065233946, "reward_after_std": 0.81272754073143, "reward_before_mean": 0.5967785839457065, "reward_before_std": 0.8133799955248833, "reward_change_max": 0.000377558171749115, "reward_change_mean": -0.342879943549633, "reward_change_min": -0.5939531847834587, "reward_change_std": 0.253477456048131, "reward_std": 0.8127275817096233, "rewards/cosine_scaled_reward": -0.09744403883814812, "rewards/format_reward": 0.7916666865348816, "step": 465 }, { "advantage_max": 1.525937169790268, "advantage_mean": -1.1796752796833232e-08, "advantage_min": -1.1690080612897873, "advantage_std": 0.9380715265870094, "completion_length": 2885.4375762939453, "epoch": 0.5325714285714286, "grad_norm": 0.5980718731880188, "kl": 0.3075103759765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1261754973965422e-07, "loss": 0.0252, "reward": 0.4429134102538228, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4429134102538228, "reward_after_std": 0.9380715265870094, "reward_before_mean": 0.842274374968838, "reward_before_std": 0.9796821102499962, "reward_change_max": 0.0005110055208206177, "reward_change_mean": -0.39936098270118237, "reward_change_min": -0.7218204885721207, "reward_change_std": 0.3039932679384947, "reward_std": 0.9380715452134609, "rewards/cosine_scaled_reward": 0.09822050668299198, "rewards/format_reward": 0.6458333525806665, "step": 466 }, { "advantage_max": 1.35072410851717, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.7130578570067883, "advantage_std": 0.7507331855595112, "completion_length": 2964.8125610351562, "epoch": 0.5337142857142857, "grad_norm": 0.589440643787384, "kl": 0.342010498046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1188949370707787e-07, "loss": 0.014, "reward": 0.38083328772336245, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38083328772336245, "reward_after_std": 0.7507331930100918, "reward_before_mean": 0.771749480314611, "reward_before_std": 0.6903218924999237, "reward_change_max": 0.00035600364208221436, "reward_change_mean": -0.39091616682708263, "reward_change_min": -0.6508742086589336, "reward_change_std": 0.24533732421696186, "reward_std": 0.7507332153618336, "rewards/cosine_scaled_reward": -0.041208596900105476, "rewards/format_reward": 0.854166679084301, "step": 467 }, { "advantage_max": 1.5255406275391579, "advantage_mean": -1.1175871006408045e-08, "advantage_min": -1.018823392689228, "advantage_std": 0.896932028234005, "completion_length": 2998.6250610351562, "epoch": 0.5348571428571428, "grad_norm": 0.43171975016593933, "kl": 0.34735107421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1118279056249653e-07, "loss": 0.0287, "reward": 0.31930156861199066, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31930156861199066, "reward_after_std": 0.8969319984316826, "reward_before_mean": 0.675044497475028, "reward_before_std": 0.9102982468903065, "reward_change_max": 0.0, "reward_change_mean": -0.35574290715157986, "reward_change_min": -0.6775732263922691, "reward_change_std": 0.26923401467502117, "reward_std": 0.8969319984316826, "rewards/cosine_scaled_reward": -0.02706110430881381, "rewards/format_reward": 0.7291666865348816, "step": 468 }, { "advantage_max": 1.3851486891508102, "advantage_mean": 6.208820124697922e-10, "advantage_min": -0.8695004656910896, "advantage_std": 0.8268131166696548, "completion_length": 2835.854217529297, "epoch": 0.536, "grad_norm": 0.6295157670974731, "kl": 0.373138427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1049747474962444e-07, "loss": 0.0329, "reward": 0.3110335245728493, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3110335245728493, "reward_after_std": 0.8268131241202354, "reward_before_mean": 0.674715653527528, "reward_before_std": 0.8407481797039509, "reward_change_max": 0.0008903965353965759, "reward_change_mean": -0.3636821284890175, "reward_change_min": -0.7130404487252235, "reward_change_std": 0.28268345445394516, "reward_std": 0.8268131501972675, "rewards/cosine_scaled_reward": 0.004024487920105457, "rewards/format_reward": 0.6666666753590107, "step": 469 }, { "advantage_max": 0.9884959533810616, "advantage_mean": -1.5522042540183634e-09, "advantage_min": -0.8061573393642902, "advantage_std": 0.6135408133268356, "completion_length": 3324.604217529297, "epoch": 0.5371428571428571, "grad_norm": 0.3777313232421875, "kl": 0.4073486328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0983357966978745e-07, "loss": 0.0369, "reward": -0.020457723177969456, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.020457723177969456, "reward_after_std": 0.613540805876255, "reward_before_mean": 0.25258609745651484, "reward_before_std": 0.6398132536560297, "reward_change_max": 0.0014644190669059753, "reward_change_mean": -0.2730438318103552, "reward_change_min": -0.49486764147877693, "reward_change_std": 0.21054570376873016, "reward_std": 0.6135408133268356, "rewards/cosine_scaled_reward": -0.19662363454699516, "rewards/format_reward": 0.6458333563059568, "step": 470 }, { "advantage_max": 1.4752235859632492, "advantage_mean": 1.8626451714354175e-08, "advantage_min": -0.7741179168224335, "advantage_std": 0.833162184804678, "completion_length": 3127.3333892822266, "epoch": 0.5382857142857143, "grad_norm": 0.8081554174423218, "kl": 0.361053466796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0919113768029517e-07, "loss": 0.0207, "reward": 0.24363027699291706, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24363027699291706, "reward_after_std": 0.8331621624529362, "reward_before_mean": 0.5768796000629663, "reward_before_std": 0.8027565106749535, "reward_change_max": 0.0004920586943626404, "reward_change_mean": -0.3332492858171463, "reward_change_min": -0.6620456390082836, "reward_change_std": 0.2511415099725127, "reward_std": 0.8331621661782265, "rewards/cosine_scaled_reward": -0.04489355348050594, "rewards/format_reward": 0.6666666809469461, "step": 471 }, { "advantage_max": 1.4293014854192734, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.6859598383307457, "advantage_std": 0.7660287953913212, "completion_length": 2807.2708587646484, "epoch": 0.5394285714285715, "grad_norm": 0.43928098678588867, "kl": 0.34246826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0857018009286381e-07, "loss": 0.0105, "reward": 0.15427241090219468, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15427241090219468, "reward_after_std": 0.7660288400948048, "reward_before_mean": 0.45309646893292665, "reward_before_std": 0.7070276569575071, "reward_change_max": 0.0, "reward_change_mean": -0.2988240495324135, "reward_change_min": -0.5285344235599041, "reward_change_std": 0.19260118901729584, "reward_std": 0.7660288661718369, "rewards/cosine_scaled_reward": -0.14845177298411727, "rewards/format_reward": 0.7500000149011612, "step": 472 }, { "advantage_max": 1.2765984535217285, "advantage_mean": -3.104407841902912e-10, "advantage_min": -0.575638035312295, "advantage_std": 0.6808191984891891, "completion_length": 2983.8333892822266, "epoch": 0.5405714285714286, "grad_norm": 0.332226037979126, "kl": 0.30999755859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0797073717209013e-07, "loss": 0.0312, "reward": 0.12082979548722506, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12082979548722506, "reward_after_std": 0.6808191984891891, "reward_before_mean": 0.4231628682464361, "reward_before_std": 0.614310803823173, "reward_change_max": 0.0002312883734703064, "reward_change_mean": -0.3023330457508564, "reward_change_min": -0.4855576269328594, "reward_change_std": 0.1949680121615529, "reward_std": 0.6808192264288664, "rewards/cosine_scaled_reward": -0.13216857006773353, "rewards/format_reward": 0.6875000149011612, "step": 473 }, { "advantage_max": 1.580621987581253, "advantage_mean": -2.1109978876054925e-08, "advantage_min": -0.785576019436121, "advantage_std": 0.8718631789088249, "completion_length": 2244.5625610351562, "epoch": 0.5417142857142857, "grad_norm": 0.7054795622825623, "kl": 0.220245361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0739283813397639e-07, "loss": 0.0127, "reward": 0.6519424570724368, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6519424570724368, "reward_after_std": 0.8718631565570831, "reward_before_mean": 1.1209867387078702, "reward_before_std": 0.7825071476399899, "reward_change_max": 0.0, "reward_change_mean": -0.469044242054224, "reward_change_min": -0.7807438485324383, "reward_change_std": 0.28917416650801897, "reward_std": 0.8718631789088249, "rewards/cosine_scaled_reward": 0.15424335189163685, "rewards/format_reward": 0.812500013038516, "step": 474 }, { "advantage_max": 1.4571957886219025, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -1.061225775629282, "advantage_std": 0.897326685488224, "completion_length": 2246.208381652832, "epoch": 0.5428571428571428, "grad_norm": 1.8836877346038818, "kl": 0.2695465087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.068365111445064e-07, "loss": 0.0519, "reward": 0.4812218938022852, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4812218938022852, "reward_after_std": 0.8973266705870628, "reward_before_mean": 0.8961574863642454, "reward_before_std": 0.9338822811841965, "reward_change_max": 0.001050010323524475, "reward_change_mean": -0.414935564622283, "reward_change_min": -0.7467651851475239, "reward_change_std": 0.3112996993586421, "reward_std": 0.8973267190158367, "rewards/cosine_scaled_reward": 0.08349537872709334, "rewards/format_reward": 0.7291666772216558, "step": 475 }, { "advantage_max": 1.702410340309143, "advantage_mean": -2.235174290099451e-08, "advantage_min": -1.2538457065820694, "advantage_std": 1.0458894148468971, "completion_length": 2918.7500915527344, "epoch": 0.544, "grad_norm": 2.251671075820923, "kl": 0.2965087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.063017833182728e-07, "loss": 0.0935, "reward": 0.7583688944578171, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7583688944578171, "reward_after_std": 1.0458894148468971, "reward_before_mean": 1.2590843848884106, "reward_before_std": 1.0712070390582085, "reward_change_max": 0.0, "reward_change_mean": -0.5007154755294323, "reward_change_min": -0.853624165058136, "reward_change_std": 0.3549340758472681, "reward_std": 1.0458894520998, "rewards/cosine_scaled_reward": 0.21287550101988018, "rewards/format_reward": 0.833333358168602, "step": 476 }, { "advantage_max": 1.2137524485588074, "advantage_mean": -9.313225801665936e-09, "advantage_min": -0.5264289863407612, "advantage_std": 0.6505212225019932, "completion_length": 1786.9167137145996, "epoch": 0.5451428571428572, "grad_norm": 0.17899537086486816, "kl": 0.1691131591796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0578868071715544e-07, "loss": 0.0155, "reward": 0.9699084199965, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9699084199965, "reward_after_std": 0.6505212299525738, "reward_before_mean": 1.573659099638462, "reward_before_std": 0.4683607667684555, "reward_change_max": 0.0, "reward_change_mean": -0.6037506610155106, "reward_change_min": -0.8868011832237244, "reward_change_std": 0.32529093883931637, "reward_std": 0.6505212485790253, "rewards/cosine_scaled_reward": 0.31807953119277954, "rewards/format_reward": 0.9375000074505806, "step": 477 }, { "advantage_max": 1.3267375081777573, "advantage_mean": -2.607703308843412e-08, "advantage_min": -0.6465108655393124, "advantage_std": 0.724626038223505, "completion_length": 2695.3958587646484, "epoch": 0.5462857142857143, "grad_norm": 0.6639411449432373, "kl": 0.230682373046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0529722834905125e-07, "loss": 0.0217, "reward": 0.21232910081744194, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21232910081744194, "reward_after_std": 0.724626038223505, "reward_before_mean": 0.5422618109732866, "reward_before_std": 0.6659068446606398, "reward_change_max": 0.0008394867181777954, "reward_change_mean": -0.3299326840788126, "reward_change_min": -0.5855097156018019, "reward_change_std": 0.2148620719090104, "reward_std": 0.7246260643005371, "rewards/cosine_scaled_reward": -0.03095244988799095, "rewards/format_reward": 0.6041666828095913, "step": 478 }, { "advantage_max": 1.374097228050232, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.7297849971801043, "advantage_std": 0.7507626861333847, "completion_length": 3014.8125610351562, "epoch": 0.5474285714285714, "grad_norm": 0.46662241220474243, "kl": 0.305755615234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0482745016665526e-07, "loss": 0.0175, "reward": 0.2535403287038207, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2535403287038207, "reward_after_std": 0.7507626786828041, "reward_before_mean": 0.5968595510348678, "reward_before_std": 0.6907813530415297, "reward_change_max": 0.0, "reward_change_mean": -0.34331923350691795, "reward_change_min": -0.6154851168394089, "reward_change_std": 0.2200963729992509, "reward_std": 0.7507626973092556, "rewards/cosine_scaled_reward": -0.11823690216988325, "rewards/format_reward": 0.833333358168602, "step": 479 }, { "advantage_max": 1.3359937444329262, "advantage_mean": -2.0489097418696645e-08, "advantage_min": -0.7955343127250671, "advantage_std": 0.7900065630674362, "completion_length": 2454.7708740234375, "epoch": 0.5485714285714286, "grad_norm": 0.6866908669471741, "kl": 0.47723388671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0437936906629334e-07, "loss": 0.0103, "reward": 0.34717184118926525, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34717184118926525, "reward_after_std": 0.7900065779685974, "reward_before_mean": 0.7200711611658335, "reward_before_std": 0.7834494672715664, "reward_change_max": 0.00013305246829986572, "reward_change_mean": -0.3728993311524391, "reward_change_min": -0.6839794237166643, "reward_change_std": 0.26153867691755295, "reward_std": 0.7900065779685974, "rewards/cosine_scaled_reward": 0.026702251750975847, "rewards/format_reward": 0.6666666734963655, "step": 480 }, { "advantage_max": 1.258769951760769, "advantage_mean": 9.934107703113426e-09, "advantage_min": -0.8331233933568001, "advantage_std": 0.7410633154213428, "completion_length": 3171.854217529297, "epoch": 0.5497142857142857, "grad_norm": 0.6480934619903564, "kl": 0.3162841796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0395300688680625e-07, "loss": 0.046, "reward": 0.2729811486788094, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2729811486788094, "reward_after_std": 0.7410633079707623, "reward_before_mean": 0.6314380564726889, "reward_before_std": 0.738004770129919, "reward_change_max": 0.0006511285901069641, "reward_change_mean": -0.3584568817168474, "reward_change_min": -0.62668626755476, "reward_change_std": 0.24528701975941658, "reward_std": 0.7410633154213428, "rewards/cosine_scaled_reward": -0.038447652012109756, "rewards/format_reward": 0.7083333507180214, "step": 481 }, { "advantage_max": 1.3237205818295479, "advantage_mean": 2.856055936195645e-08, "advantage_min": -0.6684618592262268, "advantage_std": 0.7145509906113148, "completion_length": 2819.520896911621, "epoch": 0.5508571428571428, "grad_norm": 0.4941161274909973, "kl": 0.3163299560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0354838440848501e-07, "loss": 0.0221, "reward": 0.6409013960510492, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6409013960510492, "reward_after_std": 0.7145509831607342, "reward_before_mean": 1.1262214393354952, "reward_before_std": 0.5935944020748138, "reward_change_max": 0.00171564519405365, "reward_change_mean": -0.48531997948884964, "reward_change_min": -0.730036336928606, "reward_change_std": 0.2870408296585083, "reward_std": 0.7145509868860245, "rewards/cosine_scaled_reward": 0.25061069428920746, "rewards/format_reward": 0.6250000018626451, "step": 482 }, { "advantage_max": 1.198041632771492, "advantage_mean": -6.208817349140361e-10, "advantage_min": -0.46576736494898796, "advantage_std": 0.6329626999795437, "completion_length": 2924.0833740234375, "epoch": 0.552, "grad_norm": 0.48551997542381287, "kl": 0.335968017578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0316552135205837e-07, "loss": 0.012, "reward": 0.1743651172146201, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1743651172146201, "reward_after_std": 0.6329627186059952, "reward_before_mean": 0.5006456337869167, "reward_before_std": 0.5492782052606344, "reward_change_max": 0.0, "reward_change_mean": -0.326280502602458, "reward_change_min": -0.5173515044152737, "reward_change_std": 0.18664393853396177, "reward_std": 0.6329627446830273, "rewards/cosine_scaled_reward": -0.14551052823662758, "rewards/format_reward": 0.791666679084301, "step": 483 }, { "advantage_max": 1.2706203386187553, "advantage_mean": -4.718701185346674e-08, "advantage_min": -0.780962735414505, "advantage_std": 0.7774913385510445, "completion_length": 2575.895896911621, "epoch": 0.5531428571428572, "grad_norm": 0.9153090119361877, "kl": 0.226165771484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0280443637773163e-07, "loss": 0.0402, "reward": 0.5697587521281093, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5697587521281093, "reward_after_std": 0.7774913720786572, "reward_before_mean": 1.0327607281506062, "reward_before_std": 0.7620697543025017, "reward_change_max": 0.0, "reward_change_mean": -0.46300200559198856, "reward_change_min": -0.813766747713089, "reward_change_std": 0.3095168676227331, "reward_std": 0.7774913795292377, "rewards/cosine_scaled_reward": 0.12054700963199139, "rewards/format_reward": 0.7916666828095913, "step": 484 }, { "advantage_max": 1.4597226828336716, "advantage_mean": -2.793967884828774e-08, "advantage_min": -0.8125118277966976, "advantage_std": 0.8193517737090588, "completion_length": 2764.0625762939453, "epoch": 0.5542857142857143, "grad_norm": 0.40035754442214966, "kl": 0.3151092529296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0246514708427701e-07, "loss": 0.0403, "reward": 0.3696631761267781, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3696631761267781, "reward_after_std": 0.8193517588078976, "reward_before_mean": 0.7487029042094946, "reward_before_std": 0.783175889402628, "reward_change_max": 0.0, "reward_change_mean": -0.3790397383272648, "reward_change_min": -0.6763305589556694, "reward_change_std": 0.24684625305235386, "reward_std": 0.8193517737090588, "rewards/cosine_scaled_reward": -0.08398190187290311, "rewards/format_reward": 0.9166666865348816, "step": 485 }, { "advantage_max": 1.3625748306512833, "advantage_mean": -1.8316011041186187e-08, "advantage_min": -0.6952449381351471, "advantage_std": 0.7565643563866615, "completion_length": 2407.000072479248, "epoch": 0.5554285714285714, "grad_norm": 0.5384291410446167, "kl": 0.4549407958984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0214767000817596e-07, "loss": 0.0137, "reward": 0.41442851535975933, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41442851535975933, "reward_after_std": 0.7565643414855003, "reward_before_mean": 0.812147680670023, "reward_before_std": 0.6902235746383667, "reward_change_max": 7.747858762741089e-05, "reward_change_mean": -0.3977191895246506, "reward_change_min": -0.6768068000674248, "reward_change_std": 0.26034667529165745, "reward_std": 0.7565643452107906, "rewards/cosine_scaled_reward": -0.0001761619932949543, "rewards/format_reward": 0.8125000074505806, "step": 486 }, { "advantage_max": 1.2316907346248627, "advantage_mean": -4.656613034059731e-08, "advantage_min": -0.8126912750303745, "advantage_std": 0.7212287411093712, "completion_length": 2273.229217529297, "epoch": 0.5565714285714286, "grad_norm": 0.3203495442867279, "kl": 0.17083740234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0185202062281336e-07, "loss": 0.0127, "reward": 0.7203271514736116, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7203271514736116, "reward_after_std": 0.7212287411093712, "reward_before_mean": 1.2395121343433857, "reward_before_std": 0.6601413935422897, "reward_change_max": 0.0, "reward_change_mean": -0.5191849693655968, "reward_change_min": -0.8080818802118301, "reward_change_std": 0.3102181311696768, "reward_std": 0.7212287522852421, "rewards/cosine_scaled_reward": 0.1510060466825962, "rewards/format_reward": 0.9375000149011612, "step": 487 }, { "advantage_max": 1.3180788084864616, "advantage_mean": -1.2417634975836478e-08, "advantage_min": -0.6756866686046124, "advantage_std": 0.7199700437486172, "completion_length": 2206.312530517578, "epoch": 0.5577142857142857, "grad_norm": 1.3325201272964478, "kl": 0.20296478271484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0157821333772304e-07, "loss": -0.0222, "reward": 0.2248132168315351, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2248132168315351, "reward_after_std": 0.7199700064957142, "reward_before_mean": 0.5571361780166626, "reward_before_std": 0.6680998187512159, "reward_change_max": 0.0004176497459411621, "reward_change_mean": -0.33232299983501434, "reward_change_min": -0.5621999390423298, "reward_change_std": 0.21261890977621078, "reward_std": 0.7199700102210045, "rewards/cosine_scaled_reward": -0.11726525146514177, "rewards/format_reward": 0.7916666828095913, "step": 488 }, { "advantage_max": 1.1426765695214272, "advantage_mean": 9.313226301266297e-10, "advantage_min": -0.537463366985321, "advantage_std": 0.6189225316047668, "completion_length": 3095.541717529297, "epoch": 0.5588571428571428, "grad_norm": 0.9554708003997803, "kl": 0.415283203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.013262614978859e-07, "loss": -0.0094, "reward": -0.1730261892080307, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1730261892080307, "reward_after_std": 0.6189225241541862, "reward_before_mean": 0.030127177014946938, "reward_before_std": 0.5958229564130306, "reward_change_max": 0.0011682584881782532, "reward_change_mean": -0.2031533746048808, "reward_change_min": -0.4085076302289963, "reward_change_std": 0.15374962240457535, "reward_std": 0.6189225278794765, "rewards/cosine_scaled_reward": -0.23493642359972, "rewards/format_reward": 0.5000000111758709, "step": 489 }, { "advantage_max": 1.2859749644994736, "advantage_mean": -1.3038516433194758e-08, "advantage_min": -0.6581551507115364, "advantage_std": 0.7169382348656654, "completion_length": 2327.3333740234375, "epoch": 0.56, "grad_norm": 0.44198766350746155, "kl": 0.2081146240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0109617738307911e-07, "loss": 0.0099, "reward": 0.37883203383535147, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37883203383535147, "reward_after_std": 0.7169382683932781, "reward_before_mean": 0.7729843532433733, "reward_before_std": 0.6514980234205723, "reward_change_max": 0.0, "reward_change_mean": -0.3941523414105177, "reward_change_min": -0.6781666129827499, "reward_change_std": 0.2485183533281088, "reward_std": 0.7169382870197296, "rewards/cosine_scaled_reward": -0.030174492858350277, "rewards/format_reward": 0.8333333432674408, "step": 490 }, { "advantage_max": 1.3834428116679192, "advantage_mean": -2.7318796169684134e-08, "advantage_min": -0.9954207092523575, "advantage_std": 0.8297108337283134, "completion_length": 2634.62508392334, "epoch": 0.5611428571428572, "grad_norm": 1.119695782661438, "kl": 0.3593902587890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0088797220727779e-07, "loss": 0.0477, "reward": 0.46813568845391273, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46813568845391273, "reward_after_std": 0.8297108709812164, "reward_before_mean": 0.883242666721344, "reward_before_std": 0.8290642537176609, "reward_change_max": 0.000342443585395813, "reward_change_mean": -0.415106987580657, "reward_change_min": -0.7718001753091812, "reward_change_std": 0.29161699395626783, "reward_std": 0.8297109119594097, "rewards/cosine_scaled_reward": 0.09787133475765586, "rewards/format_reward": 0.6875000093132257, "step": 491 }, { "advantage_max": 1.1452015675604343, "advantage_mean": -6.829698862009792e-09, "advantage_min": -0.7990109957754612, "advantage_std": 0.7219687141478062, "completion_length": 2643.562568664551, "epoch": 0.5622857142857143, "grad_norm": 0.5378697514533997, "kl": 0.319671630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0070165611810855e-07, "loss": 0.0246, "reward": 0.2805432486347854, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2805432486347854, "reward_after_std": 0.7219687290489674, "reward_before_mean": 0.646415838971734, "reward_before_std": 0.745473101735115, "reward_change_max": 0.0002370402216911316, "reward_change_mean": -0.3658725507557392, "reward_change_min": -0.6684302501380444, "reward_change_std": 0.2682249080389738, "reward_std": 0.72196876257658, "rewards/cosine_scaled_reward": -0.020542113110423088, "rewards/format_reward": 0.6875000111758709, "step": 492 }, { "advantage_max": 1.3004106357693672, "advantage_mean": -2.1420419660245216e-08, "advantage_min": -0.918564785271883, "advantage_std": 0.8005609177052975, "completion_length": 2551.6459045410156, "epoch": 0.5634285714285714, "grad_norm": 0.9306224584579468, "kl": 0.24664306640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.005372381963547e-07, "loss": 0.0584, "reward": 0.5436392567353323, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5436392567353323, "reward_after_std": 0.8005609251558781, "reward_before_mean": 0.995521822333103, "reward_before_std": 0.7887520510703325, "reward_change_max": 0.0, "reward_change_mean": -0.451882591471076, "reward_change_min": -0.7738821282982826, "reward_change_std": 0.3092910312116146, "reward_std": 0.8005609400570393, "rewards/cosine_scaled_reward": 0.07067757099866867, "rewards/format_reward": 0.8541666865348816, "step": 493 }, { "advantage_max": 1.4173968508839607, "advantage_mean": -2.8560559528489904e-08, "advantage_min": -0.6377351954579353, "advantage_std": 0.7632331699132919, "completion_length": 1996.0625457763672, "epoch": 0.5645714285714286, "grad_norm": 0.3634704053401947, "kl": 0.2783660888671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0039472645551372e-07, "loss": -0.0001, "reward": 0.677189095877111, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.677189095877111, "reward_after_std": 0.7632331699132919, "reward_before_mean": 1.163249596953392, "reward_before_std": 0.6305623799562454, "reward_change_max": 0.0, "reward_change_mean": -0.4860605113208294, "reward_change_min": -0.7516691125929356, "reward_change_std": 0.27895483560860157, "reward_std": 0.7632332071661949, "rewards/cosine_scaled_reward": 0.1128747807815671, "rewards/format_reward": 0.9375000074505806, "step": 494 }, { "advantage_max": 1.2059417739510536, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.9517855867743492, "advantage_std": 0.7393425740301609, "completion_length": 2863.8958892822266, "epoch": 0.5657142857142857, "grad_norm": 0.5228192806243896, "kl": 0.309600830078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.002741278414069e-07, "loss": 0.0412, "reward": 0.40756674110889435, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40756674110889435, "reward_after_std": 0.7393425889313221, "reward_before_mean": 0.8172311699017882, "reward_before_std": 0.7446677684783936, "reward_change_max": 0.0, "reward_change_mean": -0.4096644464880228, "reward_change_min": -0.6881908997893333, "reward_change_std": 0.2763666333630681, "reward_std": 0.7393426224589348, "rewards/cosine_scaled_reward": 0.07528225053101778, "rewards/format_reward": 0.6666666753590107, "step": 495 }, { "advantage_max": 1.2969953119754791, "advantage_mean": -2.421438738409165e-08, "advantage_min": -0.8419771865010262, "advantage_std": 0.7619081437587738, "completion_length": 1974.2708892822266, "epoch": 0.5668571428571428, "grad_norm": 0.2679189145565033, "kl": 0.5582504272460938, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0017544823184055e-07, "loss": 0.0025, "reward": 0.564401363953948, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.564401363953948, "reward_after_std": 0.7619081437587738, "reward_before_mean": 1.0211270181462169, "reward_before_std": 0.7228041291236877, "reward_change_max": 0.0, "reward_change_mean": -0.45672566443681717, "reward_change_min": -0.793279368430376, "reward_change_std": 0.2969939485192299, "reward_std": 0.7619081512093544, "rewards/cosine_scaled_reward": 0.11473017372190952, "rewards/format_reward": 0.7916666753590107, "step": 496 }, { "advantage_max": 1.4672126322984695, "advantage_mean": -3.166496814754893e-08, "advantage_min": -0.7556325867772102, "advantage_std": 0.8255305737257004, "completion_length": 2442.0000610351562, "epoch": 0.568, "grad_norm": 0.3775382339954376, "kl": 0.1978759765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0009869243631952e-07, "loss": -0.0023, "reward": 0.7381462557241321, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7381462557241321, "reward_after_std": 0.8255305588245392, "reward_before_mean": 1.2490927260369062, "reward_before_std": 0.7295142617076635, "reward_change_max": 0.0, "reward_change_mean": -0.5109464433044195, "reward_change_min": -0.7986562699079514, "reward_change_std": 0.308968473225832, "reward_std": 0.8255305588245392, "rewards/cosine_scaled_reward": 0.20787964761257172, "rewards/format_reward": 0.8333333432674408, "step": 497 }, { "advantage_max": 1.5589833036065102, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.9517237991094589, "advantage_std": 0.9105304293334484, "completion_length": 2816.0209350585938, "epoch": 0.5691428571428572, "grad_norm": 0.9435493350028992, "kl": 0.3084716796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.000438641958131e-07, "loss": 0.0553, "reward": 0.4708832767792046, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4708832767792046, "reward_after_std": 0.9105304293334484, "reward_before_mean": 0.8774845395237207, "reward_before_std": 0.9078313298523426, "reward_change_max": 0.0, "reward_change_mean": -0.4066012669354677, "reward_change_min": -0.7389591410756111, "reward_change_std": 0.28658714331686497, "reward_std": 0.9105304405093193, "rewards/cosine_scaled_reward": 0.0741589218378067, "rewards/format_reward": 0.7291666902601719, "step": 498 }, { "advantage_max": 1.5296600610017776, "advantage_mean": -1.2417634254191512e-08, "advantage_min": -1.004964530467987, "advantage_std": 0.9031927324831486, "completion_length": 2717.0000610351562, "epoch": 0.5702857142857143, "grad_norm": 0.9648796319961548, "kl": 0.1967926025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0001096618257236e-07, "loss": 0.0385, "reward": 0.6673821806907654, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6673821806907654, "reward_after_std": 0.9031927324831486, "reward_before_mean": 1.146652415394783, "reward_before_std": 0.884892612695694, "reward_change_max": 0.0, "reward_change_mean": -0.47927025333046913, "reward_change_min": -0.8178643807768822, "reward_change_std": 0.31639137864112854, "reward_std": 0.9031927511096001, "rewards/cosine_scaled_reward": 0.10457620583474636, "rewards/format_reward": 0.9375000074505806, "step": 499 }, { "advantage_max": 1.4438907951116562, "advantage_mean": -4.9049656836164246e-08, "advantage_min": -0.9703863263130188, "advantage_std": 0.8778162263333797, "completion_length": 3062.916748046875, "epoch": 0.5714285714285714, "grad_norm": 2.0350399017333984, "kl": 0.3018798828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-07, "loss": 0.0634, "reward": 0.5093031972646713, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5093031972646713, "reward_after_std": 0.8778162263333797, "reward_before_mean": 0.9389307349920273, "reward_before_std": 0.8889699075371027, "reward_change_max": 0.0, "reward_change_mean": -0.4296275693923235, "reward_change_min": -0.7938886098563671, "reward_change_std": 0.31151862256228924, "reward_std": 0.8778162784874439, "rewards/cosine_scaled_reward": 0.13613202422857285, "rewards/format_reward": 0.6666666809469461, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.006108085031155497, "train_runtime": 18366.0905, "train_samples_per_second": 1.307, "train_steps_per_second": 0.027 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }