diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.3805946558713913, + "advantage_mean": -8.0714617212152e-09, + "advantage_min": -1.278314545750618, + "advantage_std": 0.9998298436403275, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.19454674422740936, + "kl": 0.0, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2e-08, + "loss": 0.0, + "reward": 0.383966077119112, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.383966077119112, + "reward_after_std": 0.8095231093466282, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.000140361487865448, + "reward_change_mean": -0.10579865705221891, + "reward_change_min": -0.2073100507259369, + "reward_change_std": 0.08411919022910297, + "reward_std": 0.8095231391489506, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 1.2630222663283348, + "advantage_mean": -2.6077034975813262e-08, + "advantage_min": -1.2786083295941353, + "advantage_std": 0.9997444376349449, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.18217992782592773, + "kl": 0.0, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4e-08, + "loss": 0.0, + "reward": 0.17750850692391396, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.17750850692391396, + "reward_after_std": 0.42011942341923714, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.0003265589475631714, + "reward_change_mean": -0.09788906387984753, + "reward_change_min": -0.1594111192971468, + "reward_change_std": 0.06503142253495753, + "reward_std": 0.42011944204568863, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 1.5949327051639557, + "advantage_mean": 4.346172421954009e-09, + "advantage_min": -0.991000697016716, + "advantage_std": 0.9998101443052292, + "completion_length": 3280.854217529297, + "epoch": 0.0034285714285714284, + "grad_norm": 0.1597713828086853, + "kl": 3.88026237487793e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": -0.27885486651211977, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.27885486651211977, + "reward_after_std": 0.7552903220057487, + "reward_before_mean": -0.23804896231740713, + "reward_before_std": 0.7629885245114565, + "reward_change_max": 0.0004154220223426819, + "reward_change_mean": -0.04080591048114002, + "reward_change_min": -0.09831635467708111, + "reward_change_std": 0.04355394095182419, + "reward_std": 0.7552903480827808, + "rewards/cosine_scaled_reward": -0.24402447510510683, + "rewards/format_reward": 0.2500000074505806, + "step": 3 + }, + { + "advantage_max": 1.553865224123001, + "advantage_mean": 7.450580485901526e-09, + "advantage_min": -1.1104755029082298, + "advantage_std": 0.9998296648263931, + "completion_length": 2339.729217529297, + "epoch": 0.004571428571428572, + "grad_norm": 0.24378903210163116, + "kl": 2.9033049941062927e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": 0.3669445291161537, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3669445291161537, + "reward_after_std": 0.9222714062780142, + "reward_before_mean": 0.4652156475931406, + "reward_before_std": 0.932275427505374, + "reward_change_max": 0.0, + "reward_change_mean": -0.09827110765036196, + "reward_change_min": -0.19664788339287043, + "reward_change_std": 0.07947756757494062, + "reward_std": 0.9222714211791754, + "rewards/cosine_scaled_reward": -0.05905885813990608, + "rewards/format_reward": 0.5833333414047956, + "step": 4 + }, + { + "advantage_max": 1.449422225356102, + "advantage_mean": 4.967053546245381e-09, + "advantage_min": -1.0410668477416039, + "advantage_std": 0.9998126924037933, + "completion_length": 3355.2291870117188, + "epoch": 0.005714285714285714, + "grad_norm": 0.18919876217842102, + "kl": 4.6834349632263184e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": -0.07166258245706558, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": -0.07166258245706558, + "reward_after_std": 0.7799306400120258, + "reward_before_mean": -0.007116513326764107, + "reward_before_std": 0.8074344918131828, + "reward_change_max": 0.000532977283000946, + "reward_change_mean": -0.06454607425257564, + "reward_change_min": -0.17053373903036118, + "reward_change_std": 0.06985534727573395, + "reward_std": 0.7799306474626064, + "rewards/cosine_scaled_reward": -0.14939158782362938, + "rewards/format_reward": 0.2916666716337204, + "step": 5 + }, + { + "advantage_max": 1.7041800767183304, + "advantage_mean": 9.934107986220297e-08, + "advantage_min": -1.0263897106051445, + "advantage_std": 0.9997144341468811, + "completion_length": 2931.4167098999023, + "epoch": 0.006857142857142857, + "grad_norm": 0.23432040214538574, + "kl": 3.852322697639465e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": -0.2719933092594147, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2719933092594147, + "reward_after_std": 0.4997922573238611, + "reward_before_mean": -0.2224076190032065, + "reward_before_std": 0.4970291517674923, + "reward_change_max": 0.0001704096794128418, + "reward_change_mean": -0.04958567628636956, + "reward_change_min": -0.09600676875561476, + "reward_change_std": 0.03900796617381275, + "reward_std": 0.4997922834008932, + "rewards/cosine_scaled_reward": -0.25703714042901993, + "rewards/format_reward": 0.29166666977107525, + "step": 6 + }, + { + "advantage_max": 1.5957663804292679, + "advantage_mean": -1.241763414316921e-09, + "advantage_min": -1.015457857400179, + "advantage_std": 0.999874085187912, + "completion_length": 3199.0000915527344, + "epoch": 0.008, + "grad_norm": 0.15826046466827393, + "kl": 2.849102020263672e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4e-07, + "loss": 0.0, + "reward": 0.19284541113302112, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19284541113302112, + "reward_after_std": 0.9743777364492416, + "reward_before_mean": 0.27275329316034913, + "reward_before_std": 0.984375350177288, + "reward_change_max": 0.00023509562015533447, + "reward_change_mean": -0.07990789820905775, + "reward_change_min": -0.17123806476593018, + "reward_change_std": 0.06935694860294461, + "reward_std": 0.9743777737021446, + "rewards/cosine_scaled_reward": -0.10320668725762516, + "rewards/format_reward": 0.4791666828095913, + "step": 7 + }, + { + "advantage_max": 1.558206208050251, + "advantage_mean": 7.202228147207279e-08, + "advantage_min": -1.082590851932764, + "advantage_std": 0.9997908994555473, + "completion_length": 2662.1666946411133, + "epoch": 0.009142857142857144, + "grad_norm": 0.16711029410362244, + "kl": 2.8438866138458252e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6e-07, + "loss": 0.0, + "reward": 0.41234924644231796, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41234924644231796, + "reward_after_std": 0.6741374768316746, + "reward_before_mean": 0.5237455815076828, + "reward_before_std": 0.6730234175920486, + "reward_change_max": 0.0003030449151992798, + "reward_change_mean": -0.11139630584511906, + "reward_change_min": -0.1782920677214861, + "reward_change_std": 0.07700008910614997, + "reward_std": 0.6741374880075455, + "rewards/cosine_scaled_reward": 0.06395611725747585, + "rewards/format_reward": 0.3958333358168602, + "step": 8 + }, + { + "advantage_max": 1.541668102145195, + "advantage_mean": 1.2417634809303024e-08, + "advantage_min": -1.2456592470407486, + "advantage_std": 0.9998108074069023, + "completion_length": 3136.7709045410156, + "epoch": 0.010285714285714285, + "grad_norm": 0.18616381287574768, + "kl": 4.213303327560425e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": -0.056825272273272276, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.056825272273272276, + "reward_after_std": 0.7951892241835594, + "reward_before_mean": 0.004031727090477943, + "reward_before_std": 0.8006694987416267, + "reward_change_max": 0.0, + "reward_change_mean": -0.06085700215771794, + "reward_change_min": -0.12736657354980707, + "reward_change_std": 0.0515642911195755, + "reward_std": 0.7951892279088497, + "rewards/cosine_scaled_reward": -0.16465081088244915, + "rewards/format_reward": 0.3333333432674408, + "step": 9 + }, + { + "advantage_max": 1.451988160610199, + "advantage_mean": 2.4835279388568665e-09, + "advantage_min": -1.1254910752177238, + "advantage_std": 0.9998168796300888, + "completion_length": 2620.104179382324, + "epoch": 0.011428571428571429, + "grad_norm": 0.2192392647266388, + "kl": 2.3484230041503906e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2e-07, + "loss": 0.0, + "reward": 0.05896207131445408, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.05896207131445408, + "reward_after_std": 0.7329425290226936, + "reward_before_mean": 0.1349986456334591, + "reward_before_std": 0.7468922697007656, + "reward_change_max": 0.0001971423625946045, + "reward_change_mean": -0.07603657222352922, + "reward_change_min": -0.16776021383702755, + "reward_change_std": 0.06868458609096706, + "reward_std": 0.732942558825016, + "rewards/cosine_scaled_reward": -0.1304173544049263, + "rewards/format_reward": 0.39583334513008595, + "step": 10 + }, + { + "advantage_max": 1.5240405201911926, + "advantage_mean": 1.018246036377235e-07, + "advantage_min": -0.9174718670547009, + "advantage_std": 0.9997430071234703, + "completion_length": 3468.9166870117188, + "epoch": 0.012571428571428572, + "grad_norm": 0.1580207347869873, + "kl": 3.941357135772705e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "reward": -0.44767653942108154, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.44767653942108154, + "reward_after_std": 0.6254625134170055, + "reward_before_mean": -0.41805149242281914, + "reward_before_std": 0.6336532738059759, + "reward_change_max": 0.0004032254219055176, + "reward_change_mean": -0.029625033494085073, + "reward_change_min": -0.0750849274918437, + "reward_change_std": 0.03149611933622509, + "reward_std": 0.6254625394940376, + "rewards/cosine_scaled_reward": -0.25069241458550096, + "rewards/format_reward": 0.0833333358168602, + "step": 11 + }, + { + "advantage_max": 1.3734028935432434, + "advantage_mean": 7.450581707146853e-09, + "advantage_min": -1.2330540791153908, + "advantage_std": 0.9998399540781975, + "completion_length": 2658.8750762939453, + "epoch": 0.013714285714285714, + "grad_norm": 0.18816447257995605, + "kl": 4.194304347038269e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.4e-07, + "loss": 0.0, + "reward": 0.40494780242443085, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.40494780242443085, + "reward_after_std": 0.773624949157238, + "reward_before_mean": 0.5151724070310593, + "reward_before_std": 0.7934709116816521, + "reward_change_max": 0.0, + "reward_change_mean": -0.11022460693493485, + "reward_change_min": -0.21708102989941835, + "reward_change_std": 0.08380235452204943, + "reward_std": 0.7736249603331089, + "rewards/cosine_scaled_reward": -0.05491379927843809, + "rewards/format_reward": 0.6250000223517418, + "step": 12 + }, + { + "advantage_max": 1.3199822530150414, + "advantage_mean": -4.097819361614796e-08, + "advantage_min": -1.100662998855114, + "advantage_std": 0.9997796267271042, + "completion_length": 2828.354217529297, + "epoch": 0.014857142857142857, + "grad_norm": 0.19753533601760864, + "kl": 3.528594970703125e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": -0.02724500745534897, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02724500745534897, + "reward_after_std": 0.6626134999096394, + "reward_before_mean": 0.04541436675935984, + "reward_before_std": 0.6859034113585949, + "reward_change_max": 8.57040286064148e-05, + "reward_change_mean": -0.07265940262004733, + "reward_change_min": -0.16889882646501064, + "reward_change_std": 0.06863630237057805, + "reward_std": 0.6626135222613811, + "rewards/cosine_scaled_reward": -0.16479281801730394, + "rewards/format_reward": 0.37500000558793545, + "step": 13 + }, + { + "advantage_max": 1.5738040059804916, + "advantage_mean": -4.049313184761871e-08, + "advantage_min": -1.0287166237831116, + "advantage_std": 0.9998800083994865, + "completion_length": 2633.0833892822266, + "epoch": 0.016, + "grad_norm": 0.2650340795516968, + "kl": 2.3968517780303955e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.8e-07, + "loss": 0.0, + "reward": 0.4404462520033121, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4404462520033121, + "reward_after_std": 0.9656060226261616, + "reward_before_mean": 0.5435896124690771, + "reward_before_std": 0.9739357531070709, + "reward_change_max": 0.00042141228914260864, + "reward_change_mean": -0.10314338165335357, + "reward_change_min": -0.19937315676361322, + "reward_change_std": 0.08145514910575002, + "reward_std": 0.9656060636043549, + "rewards/cosine_scaled_reward": 0.03221147443400696, + "rewards/format_reward": 0.47916666977107525, + "step": 14 + }, + { + "advantage_max": 1.5372154638171196, + "advantage_mean": -2.0489098417897367e-08, + "advantage_min": -1.1531073153018951, + "advantage_std": 0.9997818544507027, + "completion_length": 2792.6458587646484, + "epoch": 0.017142857142857144, + "grad_norm": 0.19909434020519257, + "kl": 2.8697148081846535e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": 0.36440867744386196, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.36440867744386196, + "reward_after_std": 0.5662843585014343, + "reward_before_mean": 0.4735768185928464, + "reward_before_std": 0.5562909506261349, + "reward_change_max": 0.00023984909057617188, + "reward_change_mean": -0.10916817560791969, + "reward_change_min": -0.16707832738757133, + "reward_change_std": 0.06875652400776744, + "reward_std": 0.5662843994796276, + "rewards/cosine_scaled_reward": 0.028455082327127457, + "rewards/format_reward": 0.41666667349636555, + "step": 15 + }, + { + "advantage_max": 1.5482462048530579, + "advantage_mean": 2.607703303292297e-08, + "advantage_min": -1.0424293726682663, + "advantage_std": 0.9997147470712662, + "completion_length": 3578.25, + "epoch": 0.018285714285714287, + "grad_norm": 0.17111748456954956, + "kl": 5.213916301727295e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.2e-07, + "loss": 0.0, + "reward": -0.4694888131925836, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.4694888131925836, + "reward_after_std": 0.4447050001472235, + "reward_before_mean": -0.4354347405023873, + "reward_before_std": 0.4501433949917555, + "reward_change_max": 8.464604616165161e-05, + "reward_change_mean": -0.03405407292302698, + "reward_change_min": -0.07718909159302711, + "reward_change_std": 0.03179531981004402, + "reward_std": 0.44470502249896526, + "rewards/cosine_scaled_reward": -0.22813403699547052, + "rewards/format_reward": 0.02083333395421505, + "step": 16 + }, + { + "advantage_max": 1.3803511783480644, + "advantage_mean": -9.313226301266297e-09, + "advantage_min": -1.3098116517066956, + "advantage_std": 0.9998393729329109, + "completion_length": 2443.937557220459, + "epoch": 0.019428571428571427, + "grad_norm": 0.2514090836048126, + "kl": 4.226714372634888e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "reward": 0.5570252109318972, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5570252109318972, + "reward_after_std": 0.7874607294797897, + "reward_before_mean": 0.6806772872805595, + "reward_before_std": 0.8015068471431732, + "reward_change_max": 0.00028277933597564697, + "reward_change_mean": -0.12365204491652548, + "reward_change_min": -0.21179709024727345, + "reward_change_std": 0.08916230010800064, + "reward_std": 0.7874607406556606, + "rewards/cosine_scaled_reward": 0.048671944066882133, + "rewards/format_reward": 0.5833333488553762, + "step": 17 + }, + { + "advantage_max": 1.4020969048142433, + "advantage_mean": 1.614292566287645e-08, + "advantage_min": -1.123589277267456, + "advantage_std": 0.9997856467962265, + "completion_length": 2848.1458892822266, + "epoch": 0.02057142857142857, + "grad_norm": 0.17697472870349884, + "kl": 3.02046537399292e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": 0.20538506656885147, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.20538506656885147, + "reward_after_std": 0.8300341553986073, + "reward_before_mean": 0.29380474984645844, + "reward_before_std": 0.852685147896409, + "reward_change_max": 7.675588130950928e-05, + "reward_change_mean": -0.08841968746855855, + "reward_change_min": -0.20666884537786245, + "reward_change_std": 0.08047383697703481, + "reward_std": 0.8300341740250587, + "rewards/cosine_scaled_reward": -0.0614309649245115, + "rewards/format_reward": 0.41666667722165585, + "step": 18 + }, + { + "advantage_max": 1.4702775925397873, + "advantage_mean": -2.173086599555063e-09, + "advantage_min": -1.1005384474992752, + "advantage_std": 0.9998194351792336, + "completion_length": 2901.3542098999023, + "epoch": 0.021714285714285714, + "grad_norm": 0.21968965232372284, + "kl": 3.5278499126434326e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "reward": 0.20741954632103443, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.20741954632103443, + "reward_after_std": 0.8663461040705442, + "reward_before_mean": 0.2943375655449927, + "reward_before_std": 0.8890400361269712, + "reward_change_max": 0.00013860315084457397, + "reward_change_mean": -0.08691801549866796, + "reward_change_min": -0.19336348306387663, + "reward_change_std": 0.07870251836720854, + "reward_std": 0.8663461394608021, + "rewards/cosine_scaled_reward": -0.02991456165909767, + "rewards/format_reward": 0.3541666716337204, + "step": 19 + }, + { + "advantage_max": 1.6149840354919434, + "advantage_mean": -8.69234404454744e-08, + "advantage_min": -1.064434602856636, + "advantage_std": 0.9997932985424995, + "completion_length": 2344.500026702881, + "epoch": 0.022857142857142857, + "grad_norm": 0.22625480592250824, + "kl": 1.2205913662910461e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.5863076392561197, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5863076392561197, + "reward_after_std": 0.7506540268659592, + "reward_before_mean": 0.7089644218795002, + "reward_before_std": 0.7365398444235325, + "reward_change_max": 0.0, + "reward_change_mean": -0.12265676353126764, + "reward_change_min": -0.2063779616728425, + "reward_change_std": 0.08107350138016045, + "reward_std": 0.7506540361791849, + "rewards/cosine_scaled_reward": 0.03156551416032016, + "rewards/format_reward": 0.6458333414047956, + "step": 20 + }, + { + "advantage_max": 1.485236831009388, + "advantage_mean": 3.104408285992122e-09, + "advantage_min": -1.123526617884636, + "advantage_std": 0.9998413771390915, + "completion_length": 2848.5000610351562, + "epoch": 0.024, + "grad_norm": 0.24100278317928314, + "kl": 4.163384437561035e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "reward": 0.33815048914402723, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33815048914402723, + "reward_after_std": 0.8459835797548294, + "reward_before_mean": 0.4365639388561249, + "reward_before_std": 0.8539957068860531, + "reward_change_max": 0.0003660544753074646, + "reward_change_mean": -0.0984134313184768, + "reward_change_min": -0.1952377436682582, + "reward_change_std": 0.07635108148679137, + "reward_std": 0.8459835983812809, + "rewards/cosine_scaled_reward": -0.00046803371515125036, + "rewards/format_reward": 0.4375000074505806, + "step": 21 + }, + { + "advantage_max": 1.5112750977277756, + "advantage_mean": -6.208819014474898e-10, + "advantage_min": -1.1027398481965065, + "advantage_std": 0.9998446479439735, + "completion_length": 1809.458381652832, + "epoch": 0.025142857142857144, + "grad_norm": 0.3369239568710327, + "kl": 2.700556069612503e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "reward": 0.5723168756812811, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5723168756812811, + "reward_after_std": 0.7695174552500248, + "reward_before_mean": 0.6950296210125089, + "reward_before_std": 0.7699864134192467, + "reward_change_max": 0.00015548616647720337, + "reward_change_mean": -0.12271274626255035, + "reward_change_min": -0.21874888613820076, + "reward_change_std": 0.0842174394056201, + "reward_std": 0.7695174664258957, + "rewards/cosine_scaled_reward": -0.02748518972657621, + "rewards/format_reward": 0.7500000074505806, + "step": 22 + }, + { + "advantage_max": 1.5562490671873093, + "advantage_mean": -1.6142924885720333e-08, + "advantage_min": -1.1134375929832458, + "advantage_std": 0.9998349696397781, + "completion_length": 2557.354202270508, + "epoch": 0.026285714285714287, + "grad_norm": 0.18720196187496185, + "kl": 2.9034912586212158e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.6e-07, + "loss": 0.0, + "reward": 0.24941763281822205, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.24941763281822205, + "reward_after_std": 0.7656088657677174, + "reward_before_mean": 0.34069389663636684, + "reward_before_std": 0.7691123522818089, + "reward_change_max": 0.00048488378524780273, + "reward_change_mean": -0.09127628512214869, + "reward_change_min": -0.1845838474109769, + "reward_change_std": 0.06947551434859633, + "reward_std": 0.7656089253723621, + "rewards/cosine_scaled_reward": -0.10048638191074133, + "rewards/format_reward": 0.5416666828095913, + "step": 23 + }, + { + "advantage_max": 1.5228987485170364, + "advantage_mean": 1.2417644690287943e-09, + "advantage_min": -1.1863074079155922, + "advantage_std": 0.9997837617993355, + "completion_length": 2850.333396911621, + "epoch": 0.027428571428571427, + "grad_norm": 0.22365504503250122, + "kl": 2.4393200874328613e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.8e-07, + "loss": 0.0, + "reward": 0.3684218265116215, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.3684218265116215, + "reward_after_std": 0.7231812328100204, + "reward_before_mean": 0.47270913142710924, + "reward_before_std": 0.7200934160500765, + "reward_change_max": 2.9243528842926025e-05, + "reward_change_mean": -0.10428728186525404, + "reward_change_min": -0.19938689470291138, + "reward_change_std": 0.07831439608708024, + "reward_std": 0.7231812477111816, + "rewards/cosine_scaled_reward": 0.02802122524008155, + "rewards/format_reward": 0.41666667349636555, + "step": 24 + }, + { + "advantage_max": 1.4767991751432419, + "advantage_mean": -1.8471231544303635e-08, + "advantage_min": -1.182334378361702, + "advantage_std": 0.9998176693916321, + "completion_length": 2888.2083892822266, + "epoch": 0.02857142857142857, + "grad_norm": 0.1811361014842987, + "kl": 3.592018038034439e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": 0.05788080208003521, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05788080208003521, + "reward_after_std": 0.8258396983146667, + "reward_before_mean": 0.12952115014195442, + "reward_before_std": 0.8361939936876297, + "reward_change_max": 0.00030659139156341553, + "reward_change_mean": -0.07164037134498358, + "reward_change_min": -0.15691961627453566, + "reward_change_std": 0.06482615089043975, + "reward_std": 0.8258397094905376, + "rewards/cosine_scaled_reward": -0.12273942306637764, + "rewards/format_reward": 0.37500000558793545, + "step": 25 + }, + { + "advantage_max": 1.4326291382312775, + "advantage_mean": 2.1109979653211042e-08, + "advantage_min": -1.20679422467947, + "advantage_std": 0.999689131975174, + "completion_length": 2975.3541870117188, + "epoch": 0.029714285714285714, + "grad_norm": 0.1737651377916336, + "kl": 2.80626118183136e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.2e-07, + "loss": 0.0, + "reward": 0.21969399228692055, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21969399228692055, + "reward_after_std": 0.4889183659106493, + "reward_before_mean": 0.31834758073091507, + "reward_before_std": 0.4868295267224312, + "reward_change_max": 0.00041982531547546387, + "reward_change_mean": -0.09865356958471239, + "reward_change_min": -0.17028795275837183, + "reward_change_std": 0.06675215833820403, + "reward_std": 0.48891837801784277, + "rewards/cosine_scaled_reward": -0.059576213359832764, + "rewards/format_reward": 0.4375000149011612, + "step": 26 + }, + { + "advantage_max": 1.3971495032310486, + "advantage_mean": 7.450580596923828e-09, + "advantage_min": -1.1024487167596817, + "advantage_std": 0.9997835606336594, + "completion_length": 3062.916717529297, + "epoch": 0.030857142857142857, + "grad_norm": 0.18261635303497314, + "kl": 3.505777567625046e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.4e-07, + "loss": 0.0, + "reward": 0.23383064568042755, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23383064568042755, + "reward_after_std": 0.7971065267920494, + "reward_before_mean": 0.3276177365332842, + "reward_before_std": 0.8222432602196932, + "reward_change_max": 0.00037420541048049927, + "reward_change_mean": -0.0937870959751308, + "reward_change_min": -0.20607583597302437, + "reward_change_std": 0.08386349817737937, + "reward_std": 0.7971065677702427, + "rewards/cosine_scaled_reward": -0.06535779661498964, + "rewards/format_reward": 0.4583333432674408, + "step": 27 + }, + { + "advantage_max": 1.602249875664711, + "advantage_mean": 1.0554989660072067e-08, + "advantage_min": -0.8943894132971764, + "advantage_std": 0.9998579323291779, + "completion_length": 2890.2500610351562, + "epoch": 0.032, + "grad_norm": 0.19755809009075165, + "kl": 3.156531602144241e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.6e-07, + "loss": 0.0, + "reward": 0.28285503294318914, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.28285503294318914, + "reward_after_std": 0.9709917679429054, + "reward_before_mean": 0.37220960669219494, + "reward_before_std": 0.9838091358542442, + "reward_change_max": 0.00012604892253875732, + "reward_change_mean": -0.08935456513427198, + "reward_change_min": -0.19135633949190378, + "reward_change_std": 0.0756442949641496, + "reward_std": 0.9709917902946472, + "rewards/cosine_scaled_reward": -0.02222853573039174, + "rewards/format_reward": 0.4166666679084301, + "step": 28 + }, + { + "advantage_max": 1.4943844228982925, + "advantage_mean": 2.8560558806844938e-08, + "advantage_min": -1.1554220169782639, + "advantage_std": 0.9997913166880608, + "completion_length": 3206.6875610351562, + "epoch": 0.03314285714285714, + "grad_norm": 0.14532366394996643, + "kl": 2.2741500288248062e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.8e-07, + "loss": 0.0, + "reward": -0.20698433928191662, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.20698433928191662, + "reward_after_std": 0.6183616258203983, + "reward_before_mean": -0.1521163946017623, + "reward_before_std": 0.6331501640379429, + "reward_change_max": 0.00013259053230285645, + "reward_change_mean": -0.05486793885938823, + "reward_change_min": -0.13878423906862736, + "reward_change_std": 0.05339451297186315, + "reward_std": 0.6183616407215595, + "rewards/cosine_scaled_reward": -0.2114748670719564, + "rewards/format_reward": 0.2708333395421505, + "step": 29 + }, + { + "advantage_max": 1.3567621260881424, + "advantage_mean": 1.0632599911630791e-08, + "advantage_min": -1.3685436844825745, + "advantage_std": 0.9998265281319618, + "completion_length": 3011.229202270508, + "epoch": 0.03428571428571429, + "grad_norm": 0.17021258175373077, + "kl": 2.570822834968567e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6e-07, + "loss": 0.0, + "reward": 0.3891737814992666, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3891737814992666, + "reward_after_std": 0.8715236745774746, + "reward_before_mean": 0.4956340156495571, + "reward_before_std": 0.9000084735453129, + "reward_change_max": 0.00030282139778137207, + "reward_change_mean": -0.10646022134460509, + "reward_change_min": -0.2039351612329483, + "reward_change_std": 0.09085589437745512, + "reward_std": 0.8715236894786358, + "rewards/cosine_scaled_reward": 0.029066994786262512, + "rewards/format_reward": 0.4375000149011612, + "step": 30 + }, + { + "advantage_max": 1.5537595748901367, + "advantage_mean": -3.6011139736835673e-08, + "advantage_min": -1.0320660769939423, + "advantage_std": 0.9998223856091499, + "completion_length": 3004.0417709350586, + "epoch": 0.03542857142857143, + "grad_norm": 0.22154660522937775, + "kl": 2.0623207092285156e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.2e-07, + "loss": 0.0, + "reward": 0.30279272235929966, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.30279272235929966, + "reward_after_std": 0.7506419010460377, + "reward_before_mean": 0.4000659354496747, + "reward_before_std": 0.7510486245155334, + "reward_change_max": 0.0001844540238380432, + "reward_change_mean": -0.09727319562807679, + "reward_change_min": -0.20195122808218002, + "reward_change_std": 0.0752708266954869, + "reward_std": 0.750641942024231, + "rewards/cosine_scaled_reward": 0.02294962201267481, + "rewards/format_reward": 0.35416666977107525, + "step": 31 + }, + { + "advantage_max": 1.525789052248001, + "advantage_mean": 1.6653345369377348e-15, + "advantage_min": -1.0015061795711517, + "advantage_std": 0.9997410029172897, + "completion_length": 3162.6666870117188, + "epoch": 0.036571428571428574, + "grad_norm": 0.1808365285396576, + "kl": 2.628657966852188e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.4e-07, + "loss": 0.0, + "reward": 0.20195652917027473, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.20195652917027473, + "reward_after_std": 0.7237186953425407, + "reward_before_mean": 0.2911367453634739, + "reward_before_std": 0.7264976073056459, + "reward_change_max": 0.00014059245586395264, + "reward_change_mean": -0.0891802167170681, + "reward_change_min": -0.1922367298975587, + "reward_change_std": 0.07259462832007557, + "reward_std": 0.7237186953425407, + "rewards/cosine_scaled_reward": -0.052348305471241474, + "rewards/format_reward": 0.39583334513008595, + "step": 32 + }, + { + "advantage_max": 1.3504931405186653, + "advantage_mean": 6.208817904251873e-09, + "advantage_min": -1.2225516885519028, + "advantage_std": 0.9997994303703308, + "completion_length": 3309.8958740234375, + "epoch": 0.037714285714285714, + "grad_norm": 0.14805929362773895, + "kl": 3.3639371395111084e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.6e-07, + "loss": 0.0, + "reward": -0.009339592419564724, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.009339592419564724, + "reward_after_std": 0.7747208327054977, + "reward_before_mean": 0.06072163209319115, + "reward_before_std": 0.7972827106714249, + "reward_change_max": 8.016079664230347e-05, + "reward_change_mean": -0.07006121682934463, + "reward_change_min": -0.16277748066931963, + "reward_change_std": 0.06796340085566044, + "reward_std": 0.774720836430788, + "rewards/cosine_scaled_reward": -0.10505585628561676, + "rewards/format_reward": 0.27083334140479565, + "step": 33 + }, + { + "advantage_max": 1.3841820135712624, + "advantage_mean": -3.7252905427109795e-08, + "advantage_min": -1.2319133207201958, + "advantage_std": 0.9998511150479317, + "completion_length": 2577.3333892822266, + "epoch": 0.038857142857142854, + "grad_norm": 0.2878997027873993, + "kl": 2.3380503989756107e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "reward": 0.6522115813568234, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6522115813568234, + "reward_after_std": 0.8838492073118687, + "reward_before_mean": 0.7806666740216315, + "reward_before_std": 0.8991155996918678, + "reward_change_max": 0.00018197298049926758, + "reward_change_mean": -0.12845506332814693, + "reward_change_min": -0.22763802763074636, + "reward_change_std": 0.09426811803132296, + "reward_std": 0.883849237114191, + "rewards/cosine_scaled_reward": 0.1194999860599637, + "rewards/format_reward": 0.541666679084301, + "step": 34 + }, + { + "advantage_max": 1.50138621032238, + "advantage_mean": 3.7252904094842165e-08, + "advantage_min": -0.9604013338685036, + "advantage_std": 0.9998384490609169, + "completion_length": 3031.50004196167, + "epoch": 0.04, + "grad_norm": 0.2110377997159958, + "kl": 5.46872615814209e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7e-07, + "loss": 0.0, + "reward": 0.15888013318181038, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.15888013318181038, + "reward_after_std": 0.9893754497170448, + "reward_before_mean": 0.2364537250250578, + "reward_before_std": 1.0095467530190945, + "reward_change_max": 8.15466046333313e-05, + "reward_change_mean": -0.07757358253002167, + "reward_change_min": -0.21603084448724985, + "reward_change_std": 0.07964391424320638, + "reward_std": 0.9893755055963993, + "rewards/cosine_scaled_reward": -0.027606474235653877, + "rewards/format_reward": 0.2916666716337204, + "step": 35 + }, + { + "advantage_max": 1.6357170641422272, + "advantage_mean": 6.829699250587851e-09, + "advantage_min": -0.9163132086396217, + "advantage_std": 0.9998221769928932, + "completion_length": 3327.7916870117188, + "epoch": 0.04114285714285714, + "grad_norm": 0.18343685567378998, + "kl": 3.866851329803467e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.2e-07, + "loss": 0.0, + "reward": -0.29037621850147843, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.29037621850147843, + "reward_after_std": 0.7336566708981991, + "reward_before_mean": -0.24998886417597532, + "reward_before_std": 0.7397099249064922, + "reward_change_max": 0.00043429434299468994, + "reward_change_mean": -0.040387358982115984, + "reward_change_min": -0.09777700062841177, + "reward_change_std": 0.04080916219390929, + "reward_std": 0.7336566857993603, + "rewards/cosine_scaled_reward": -0.239577763248235, + "rewards/format_reward": 0.2291666716337204, + "step": 36 + }, + { + "advantage_max": 1.3579955101013184, + "advantage_mean": 4.0667752942979973e-08, + "advantage_min": -1.2495231628417969, + "advantage_std": 0.9997347593307495, + "completion_length": 3411.875, + "epoch": 0.04228571428571429, + "grad_norm": 0.19917240738868713, + "kl": 3.3661723136901855e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.4e-07, + "loss": 0.0, + "reward": -0.3309951778501272, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.3309951778501272, + "reward_after_std": 0.3882594630122185, + "reward_before_mean": -0.2804699596017599, + "reward_before_std": 0.3967503234744072, + "reward_change_max": 0.00010347366333007812, + "reward_change_mean": -0.05052520358003676, + "reward_change_min": -0.09982103761285543, + "reward_change_std": 0.04131218558177352, + "reward_std": 0.3882594667375088, + "rewards/cosine_scaled_reward": -0.22356831841170788, + "rewards/format_reward": 0.1666666679084301, + "step": 37 + }, + { + "advantage_max": 1.2970862612128258, + "advantage_mean": 3.9736431256542915e-08, + "advantage_min": -1.3777238950133324, + "advantage_std": 0.9996457993984222, + "completion_length": 3378.6041717529297, + "epoch": 0.04342857142857143, + "grad_norm": 0.17214582860469818, + "kl": 3.484450280666351e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "reward": -0.2649919129908085, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2649919129908085, + "reward_after_std": 0.34833380207419395, + "reward_before_mean": -0.2071321178227663, + "reward_before_std": 0.35493761859834194, + "reward_change_max": 0.00027345120906829834, + "reward_change_mean": -0.05785980122163892, + "reward_change_min": -0.11019740533083677, + "reward_change_std": 0.04111600434407592, + "reward_std": 0.3483338113874197, + "rewards/cosine_scaled_reward": -0.16606605518609285, + "rewards/format_reward": 0.125, + "step": 38 + }, + { + "advantage_max": 1.611236572265625, + "advantage_mean": -3.9736430812453705e-08, + "advantage_min": -1.0035665556788445, + "advantage_std": 0.9996546432375908, + "completion_length": 2727.7292289733887, + "epoch": 0.044571428571428574, + "grad_norm": 0.20894435048103333, + "kl": 2.6639550924301147e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0, + "reward": 0.3717892151325941, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3717892151325941, + "reward_after_std": 0.3282108139246702, + "reward_before_mean": 0.48738136142492294, + "reward_before_std": 0.29373720567673445, + "reward_change_max": 0.0, + "reward_change_mean": -0.11559212068095803, + "reward_change_min": -0.16906874999403954, + "reward_change_std": 0.0631270776502788, + "reward_std": 0.3282108213752508, + "rewards/cosine_scaled_reward": -0.027142662554979324, + "rewards/format_reward": 0.5416666716337204, + "step": 39 + }, + { + "advantage_max": 1.560842514038086, + "advantage_mean": 7.388492573312533e-08, + "advantage_min": -1.1241285800933838, + "advantage_std": 0.9998079016804695, + "completion_length": 2497.312568664551, + "epoch": 0.045714285714285714, + "grad_norm": 0.1783609390258789, + "kl": 9.907619096338749e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8e-07, + "loss": 0.0, + "reward": 0.34173901937901974, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.34173901937901974, + "reward_after_std": 0.6312458626925945, + "reward_before_mean": 0.4451894119847566, + "reward_before_std": 0.6229474730789661, + "reward_change_max": 0.0002436712384223938, + "reward_change_mean": -0.1034503523260355, + "reward_change_min": -0.17544407304376364, + "reward_change_std": 0.06937033985741436, + "reward_std": 0.6312458775937557, + "rewards/cosine_scaled_reward": -0.04823863413184881, + "rewards/format_reward": 0.5416666697710752, + "step": 40 + }, + { + "advantage_max": 1.6749724745750427, + "advantage_mean": 2.7939677238464355e-08, + "advantage_min": -0.9673251286149025, + "advantage_std": 0.9998568594455719, + "completion_length": 2899.3125762939453, + "epoch": 0.046857142857142854, + "grad_norm": 0.18267039954662323, + "kl": 3.07522714138031e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "reward": -0.018273118417710066, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.018273118417710066, + "reward_after_std": 0.8944275602698326, + "reward_before_mean": 0.04296614229679108, + "reward_before_std": 0.9022683948278427, + "reward_change_max": 0.0005041435360908508, + "reward_change_mean": -0.06123924785060808, + "reward_change_min": -0.14666928444057703, + "reward_change_std": 0.06066753948107362, + "reward_std": 0.8944276012480259, + "rewards/cosine_scaled_reward": -0.17643359955400229, + "rewards/format_reward": 0.39583334513008595, + "step": 41 + }, + { + "advantage_max": 1.458516851067543, + "advantage_mean": 1.9868214629070735e-08, + "advantage_min": -1.1069196499884129, + "advantage_std": 0.9997414946556091, + "completion_length": 2711.3333854675293, + "epoch": 0.048, + "grad_norm": 0.3004877269268036, + "kl": 6.502866744995117e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "reward": -0.21551374037517235, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.21551374037517235, + "reward_after_std": 0.4664292559027672, + "reward_before_mean": -0.1585762370377779, + "reward_before_std": 0.4684213399887085, + "reward_change_max": 0.00046293437480926514, + "reward_change_mean": -0.05693749734200537, + "reward_change_min": -0.10906532034277916, + "reward_change_std": 0.04194885538890958, + "reward_std": 0.4664292633533478, + "rewards/cosine_scaled_reward": -0.26678812876343727, + "rewards/format_reward": 0.3750000037252903, + "step": 42 + }, + { + "advantage_max": 1.5238156765699387, + "advantage_mean": 4.315127999365842e-08, + "advantage_min": -1.2874258160591125, + "advantage_std": 0.9997437745332718, + "completion_length": 2902.937515258789, + "epoch": 0.04914285714285714, + "grad_norm": 0.1742953509092331, + "kl": 4.782527685165405e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "reward": 0.19388390332460403, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.19388390332460403, + "reward_after_std": 0.5199645850807428, + "reward_before_mean": 0.2874040777387563, + "reward_before_std": 0.5115220807492733, + "reward_change_max": 0.00013028830289840698, + "reward_change_mean": -0.09352018125355244, + "reward_change_min": -0.15791374817490578, + "reward_change_std": 0.060954955872148275, + "reward_std": 0.5199645888060331, + "rewards/cosine_scaled_reward": -0.033381287939846516, + "rewards/format_reward": 0.3541666753590107, + "step": 43 + }, + { + "advantage_max": 1.3402893841266632, + "advantage_mean": 2.7318795670083773e-08, + "advantage_min": -1.2359666973352432, + "advantage_std": 0.9998237863183022, + "completion_length": 2733.2500610351562, + "epoch": 0.05028571428571429, + "grad_norm": 0.33008483052253723, + "kl": 0.0001117512583732605, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "reward": 0.367175517603755, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.367175517603755, + "reward_after_std": 0.7814797982573509, + "reward_before_mean": 0.4731551297008991, + "reward_before_std": 0.8076624237000942, + "reward_change_max": 0.00021695345640182495, + "reward_change_mean": -0.10597959894221276, + "reward_change_min": -0.21262322179973125, + "reward_change_std": 0.08547356608323753, + "reward_std": 0.7814798168838024, + "rewards/cosine_scaled_reward": -0.01342245377600193, + "rewards/format_reward": 0.5000000093132257, + "step": 44 + }, + { + "advantage_max": 1.481304481625557, + "advantage_mean": 1.9247334503980085e-08, + "advantage_min": -1.104643315076828, + "advantage_std": 0.999833881855011, + "completion_length": 3342.854217529297, + "epoch": 0.05142857142857143, + "grad_norm": 0.14805112779140472, + "kl": 4.836916923522949e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9e-07, + "loss": 0.0, + "reward": 0.10503099672496319, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10503099672496319, + "reward_after_std": 0.8778037875890732, + "reward_before_mean": 0.17967192456126213, + "reward_before_std": 0.8892814107239246, + "reward_change_max": 0.0, + "reward_change_mean": -0.07464091246947646, + "reward_change_min": -0.15921855811029673, + "reward_change_std": 0.06344135943800211, + "reward_std": 0.8778038173913956, + "rewards/cosine_scaled_reward": -0.04558071191422641, + "rewards/format_reward": 0.2708333395421505, + "step": 45 + }, + { + "advantage_max": 1.521193027496338, + "advantage_mean": 2.9802322498717615e-08, + "advantage_min": -1.1753706708550453, + "advantage_std": 0.9997015595436096, + "completion_length": 3199.8541870117188, + "epoch": 0.052571428571428575, + "grad_norm": 0.21178790926933289, + "kl": 8.314847946166992e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.2e-07, + "loss": 0.0, + "reward": -0.3450825661420822, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.3450825661420822, + "reward_after_std": 0.36790336668491364, + "reward_before_mean": -0.29655372351408005, + "reward_before_std": 0.3680526949465275, + "reward_change_max": 0.0004460737109184265, + "reward_change_mean": -0.04852884029969573, + "reward_change_min": -0.0890387985855341, + "reward_change_std": 0.03501270990818739, + "reward_std": 0.36790337786078453, + "rewards/cosine_scaled_reward": -0.23161020036786795, + "rewards/format_reward": 0.1666666679084301, + "step": 46 + }, + { + "advantage_max": 1.335110366344452, + "advantage_mean": 1.1796752574788627e-08, + "advantage_min": -1.17454382032156, + "advantage_std": 0.9998417124152184, + "completion_length": 2774.854232788086, + "epoch": 0.053714285714285714, + "grad_norm": 0.20298603177070618, + "kl": 4.419684410095215e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "reward": 0.3838723013177514, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3838723013177514, + "reward_after_std": 0.9062189273536205, + "reward_before_mean": 0.48803192749619484, + "reward_before_std": 0.9351296909153461, + "reward_change_max": 4.4032931327819824e-05, + "reward_change_mean": -0.10415959800593555, + "reward_change_min": -0.2301958166062832, + "reward_change_std": 0.09293584548868239, + "reward_std": 0.9062189720571041, + "rewards/cosine_scaled_reward": 0.0044326139613986015, + "rewards/format_reward": 0.4791666753590107, + "step": 47 + }, + { + "advantage_max": 1.4299870878458023, + "advantage_mean": -8.61473536950541e-09, + "advantage_min": -1.2005236744880676, + "advantage_std": 0.999784804880619, + "completion_length": 2806.3125228881836, + "epoch": 0.054857142857142854, + "grad_norm": 0.23084399104118347, + "kl": 0.00018368917517364025, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.6e-07, + "loss": 0.0, + "reward": 0.12521709315478802, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12521709315478802, + "reward_after_std": 0.6359206773340702, + "reward_before_mean": 0.20962151745334268, + "reward_before_std": 0.6417857967317104, + "reward_change_max": 0.0002502724528312683, + "reward_change_mean": -0.0844044117256999, + "reward_change_min": -0.14903380069881678, + "reward_change_std": 0.05997437797486782, + "reward_std": 0.6359206922352314, + "rewards/cosine_scaled_reward": -0.07227259315550327, + "rewards/format_reward": 0.35416666977107525, + "step": 48 + }, + { + "advantage_max": 1.4477006047964096, + "advantage_mean": -6.208817793229571e-09, + "advantage_min": -1.0258841067552567, + "advantage_std": 0.9997946843504906, + "completion_length": 2393.3542289733887, + "epoch": 0.056, + "grad_norm": 0.21073652803897858, + "kl": 5.701184272766113e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.8e-07, + "loss": 0.0, + "reward": 0.2004772163927555, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2004772163927555, + "reward_after_std": 0.7987145818769932, + "reward_before_mean": 0.2891058661043644, + "reward_before_std": 0.8171101789921522, + "reward_change_max": 0.00024158507585525513, + "reward_change_mean": -0.08862866298295557, + "reward_change_min": -0.20624815300107002, + "reward_change_std": 0.07846913067623973, + "reward_std": 0.7987145818769932, + "rewards/cosine_scaled_reward": -0.11586374510079622, + "rewards/format_reward": 0.5208333376795053, + "step": 49 + }, + { + "advantage_max": 1.3314997255802155, + "advantage_mean": -9.934107092490763e-09, + "advantage_min": -1.2882369980216026, + "advantage_std": 0.9997088387608528, + "completion_length": 2960.645835876465, + "epoch": 0.05714285714285714, + "grad_norm": 0.16604925692081451, + "kl": 0.00013734400272369385, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.34677931293845177, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34677931293845177, + "reward_after_std": 0.6038327347487211, + "reward_before_mean": 0.4557027849368751, + "reward_before_std": 0.6080019045621157, + "reward_change_max": 2.8908252716064453e-06, + "reward_change_mean": -0.1089234659448266, + "reward_change_min": -0.19173666648566723, + "reward_change_std": 0.07837402378208935, + "reward_std": 0.603832745924592, + "rewards/cosine_scaled_reward": 0.04035138082690537, + "rewards/format_reward": 0.3750000111758709, + "step": 50 + }, + { + "advantage_max": 1.3933206498622894, + "advantage_mean": 7.450581041013038e-09, + "advantage_min": -1.2193833366036415, + "advantage_std": 0.9997348785400391, + "completion_length": 2253.7500228881836, + "epoch": 0.05828571428571429, + "grad_norm": 0.2345164716243744, + "kl": 0.0003926903009414673, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0, + "reward": 0.28201270662248135, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.28201270662248135, + "reward_after_std": 0.5423499569296837, + "reward_before_mean": 0.38460008054971695, + "reward_before_std": 0.5425751395523548, + "reward_change_max": 0.00022970139980316162, + "reward_change_mean": -0.10258737101685256, + "reward_change_min": -0.17983837984502316, + "reward_change_std": 0.06753439782187343, + "reward_std": 0.5423499867320061, + "rewards/cosine_scaled_reward": -0.09936663717962801, + "rewards/format_reward": 0.5833333358168602, + "step": 51 + }, + { + "advantage_max": 1.5688743889331818, + "advantage_mean": 1.8626451714354175e-08, + "advantage_min": -0.8412662744522095, + "advantage_std": 0.9998848661780357, + "completion_length": 2888.187530517578, + "epoch": 0.05942857142857143, + "grad_norm": 0.21857954561710358, + "kl": 0.00026175379753112793, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "reward": 0.34447018057107925, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.34447018057107925, + "reward_after_std": 1.2108869962394238, + "reward_before_mean": 0.4332852326333523, + "reward_before_std": 1.240879662334919, + "reward_change_max": 0.00027269870042800903, + "reward_change_mean": -0.0888150431565009, + "reward_change_min": -0.24372821487486362, + "reward_change_std": 0.09732173680095002, + "reward_std": 1.210887011140585, + "rewards/cosine_scaled_reward": 0.008309275843203068, + "rewards/format_reward": 0.4166666753590107, + "step": 52 + }, + { + "advantage_max": 1.6007621735334396, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -0.9977999553084373, + "advantage_std": 0.9997362941503525, + "completion_length": 2766.4791870117188, + "epoch": 0.060571428571428575, + "grad_norm": 0.1986720860004425, + "kl": 0.0002454817295074463, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "reward": 0.26487368531525135, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26487368531525135, + "reward_after_std": 0.6082321088761091, + "reward_before_mean": 0.3611887330189347, + "reward_before_std": 0.5937345344573259, + "reward_change_max": 0.0001229792833328247, + "reward_change_mean": -0.0963150686584413, + "reward_change_min": -0.1630243817344308, + "reward_change_std": 0.06432746141217649, + "reward_std": 0.6082321219146252, + "rewards/cosine_scaled_reward": -0.07982230000197887, + "rewards/format_reward": 0.5208333376795053, + "step": 53 + }, + { + "advantage_max": 1.4808703660964966, + "advantage_mean": 6.829698140364826e-09, + "advantage_min": -1.1947182267904282, + "advantage_std": 0.9997754395008087, + "completion_length": 2824.041732788086, + "epoch": 0.061714285714285715, + "grad_norm": 0.17189155519008636, + "kl": 5.392730236053467e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0, + "reward": 0.5566112250089645, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5566112250089645, + "reward_after_std": 0.9852496441453695, + "reward_before_mean": 0.674728263169527, + "reward_before_std": 1.0110351080074906, + "reward_change_max": 0.00023486465215682983, + "reward_change_mean": -0.1181170241907239, + "reward_change_min": -0.24376624636352062, + "reward_change_std": 0.10137569368816912, + "reward_std": 0.9852496590465307, + "rewards/cosine_scaled_reward": 0.09778079111129045, + "rewards/format_reward": 0.4791666828095913, + "step": 54 + }, + { + "advantage_max": 1.4139875769615173, + "advantage_mean": 6.208817238118058e-08, + "advantage_min": -1.2174672558903694, + "advantage_std": 0.9997905045747757, + "completion_length": 3000.250015258789, + "epoch": 0.06285714285714286, + "grad_norm": 0.1520586907863617, + "kl": 0.0002251937985420227, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "reward": 0.23542099818587303, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23542099818587303, + "reward_after_std": 0.6617230176925659, + "reward_before_mean": 0.3305197563022375, + "reward_before_std": 0.6710530370473862, + "reward_change_max": 0.001324571669101715, + "reward_change_mean": -0.09509875881485641, + "reward_change_min": -0.16870635468512774, + "reward_change_std": 0.07222678326070309, + "reward_std": 0.6617230512201786, + "rewards/cosine_scaled_reward": -0.011823451146483421, + "rewards/format_reward": 0.35416667349636555, + "step": 55 + }, + { + "advantage_max": 1.1939893886446953, + "advantage_mean": 1.8626444830971423e-09, + "advantage_min": -1.2510383129119873, + "advantage_std": 0.9998167455196381, + "completion_length": 3025.2291870117188, + "epoch": 0.064, + "grad_norm": 0.1891845017671585, + "kl": 0.0001361072063446045, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "reward": 0.10598282422870398, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.10598282422870398, + "reward_after_std": 0.7244430258870125, + "reward_before_mean": 0.19185005594044924, + "reward_before_std": 0.7621579542756081, + "reward_change_max": 0.0002643391489982605, + "reward_change_mean": -0.08586725732311606, + "reward_change_min": -0.17667766101658344, + "reward_change_std": 0.0800751845818013, + "reward_std": 0.7244430519640446, + "rewards/cosine_scaled_reward": -0.09157497808337212, + "rewards/format_reward": 0.3750000074505806, + "step": 56 + }, + { + "advantage_max": 1.4618928134441376, + "advantage_mean": 2.607703264434491e-08, + "advantage_min": -1.0425853356719017, + "advantage_std": 0.9997790455818176, + "completion_length": 3318.625030517578, + "epoch": 0.06514285714285714, + "grad_norm": 0.12933112680912018, + "kl": 5.304068326950073e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0, + "reward": -0.21856810012832284, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.21856810012832284, + "reward_after_std": 0.5995266698300838, + "reward_before_mean": -0.16476159170269966, + "reward_before_std": 0.6078551784157753, + "reward_change_max": 0.00038677453994750977, + "reward_change_mean": -0.05380649887956679, + "reward_change_min": -0.12232645880430937, + "reward_change_std": 0.04918327531777322, + "reward_std": 0.5995266884565353, + "rewards/cosine_scaled_reward": -0.22821413166821003, + "rewards/format_reward": 0.2916666753590107, + "step": 57 + }, + { + "advantage_max": 1.490548200905323, + "advantage_mean": 1.800557053455165e-08, + "advantage_min": -1.1940131336450577, + "advantage_std": 0.9998381435871124, + "completion_length": 2338.750045776367, + "epoch": 0.06628571428571428, + "grad_norm": 0.20656853914260864, + "kl": 0.0009194463491439819, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0, + "reward": 0.5056098848581314, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5056098848581314, + "reward_after_std": 0.8406195119023323, + "reward_before_mean": 0.6206929110921919, + "reward_before_std": 0.8440921474248171, + "reward_change_max": 6.683915853500366e-05, + "reward_change_mean": -0.11508298618718982, + "reward_change_min": -0.21354464441537857, + "reward_change_std": 0.08731875498779118, + "reward_std": 0.8406195193529129, + "rewards/cosine_scaled_reward": -0.012570214690640569, + "rewards/format_reward": 0.6458333469927311, + "step": 58 + }, + { + "advantage_max": 1.4613443687558174, + "advantage_mean": -2.5145709625640222e-08, + "advantage_min": -1.219063676893711, + "advantage_std": 0.9997749403119087, + "completion_length": 2888.750030517578, + "epoch": 0.06742857142857143, + "grad_norm": 0.15128706395626068, + "kl": 7.972121238708496e-05, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0, + "reward": 0.15974653512239456, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15974653512239456, + "reward_after_std": 0.6801816318184137, + "reward_before_mean": 0.2470409832894802, + "reward_before_std": 0.6890074703842402, + "reward_change_max": 0.0, + "reward_change_mean": -0.08729447645600885, + "reward_change_min": -0.16269752010703087, + "reward_change_std": 0.06480636191554368, + "reward_std": 0.680181659758091, + "rewards/cosine_scaled_reward": -0.053562849294394255, + "rewards/format_reward": 0.35416667722165585, + "step": 59 + }, + { + "advantage_max": 1.449000284075737, + "advantage_mean": 3.973643103449831e-08, + "advantage_min": -1.0444062128663063, + "advantage_std": 0.9997691512107849, + "completion_length": 2974.312530517578, + "epoch": 0.06857142857142857, + "grad_norm": 0.17451316118240356, + "kl": 0.00019010156393051147, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": -0.003677740693092346, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.003677740693092346, + "reward_after_std": 0.6160192638635635, + "reward_before_mean": 0.07069200649857521, + "reward_before_std": 0.6240869630128145, + "reward_change_max": 0.0004219040274620056, + "reward_change_mean": -0.0743697372963652, + "reward_change_min": -0.14743295591324568, + "reward_change_std": 0.061072568874806166, + "reward_std": 0.6160192675888538, + "rewards/cosine_scaled_reward": -0.14173733163625002, + "rewards/format_reward": 0.3541666679084301, + "step": 60 + }, + { + "advantage_max": 1.4675796553492546, + "advantage_mean": 1.6763807009212428e-08, + "advantage_min": -1.1424203887581825, + "advantage_std": 0.9997835755348206, + "completion_length": 2999.4375610351562, + "epoch": 0.06971428571428571, + "grad_norm": 0.15803340077400208, + "kl": 0.000301167368888855, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "reward": 0.13608455285429955, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13608455285429955, + "reward_after_std": 0.7962324041873217, + "reward_before_mean": 0.21927554439753294, + "reward_before_std": 0.8183390758931637, + "reward_change_max": 7.96392560005188e-05, + "reward_change_mean": -0.08319097117055207, + "reward_change_min": -0.17873274348676205, + "reward_change_std": 0.07595815474633127, + "reward_std": 0.7962324265390635, + "rewards/cosine_scaled_reward": -0.11952891387045383, + "rewards/format_reward": 0.4583333395421505, + "step": 61 + }, + { + "advantage_max": 1.6056253165006638, + "advantage_mean": -1.0865429223017031e-08, + "advantage_min": -0.9415153115987778, + "advantage_std": 0.9997888281941414, + "completion_length": 2589.770896911621, + "epoch": 0.07085714285714285, + "grad_norm": 0.22276122868061066, + "kl": 0.0006796866655349731, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "reward": 0.35923552978783846, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.35923552978783846, + "reward_after_std": 0.6406212784349918, + "reward_before_mean": 0.4651615908369422, + "reward_before_std": 0.6320403479039669, + "reward_change_max": 0.0006706267595291138, + "reward_change_mean": -0.10592610284220427, + "reward_change_min": -0.19946615397930145, + "reward_change_std": 0.07461804489139467, + "reward_std": 0.6406213119626045, + "rewards/cosine_scaled_reward": -0.027835868299007416, + "rewards/format_reward": 0.520833333954215, + "step": 62 + }, + { + "advantage_max": 1.5056186392903328, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -1.066870667040348, + "advantage_std": 0.9998953640460968, + "completion_length": 2393.5208587646484, + "epoch": 0.072, + "grad_norm": 0.19597779214382172, + "kl": 0.0006195306777954102, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0, + "reward": 0.7143217946140794, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.7143217946140794, + "reward_after_std": 1.1465397253632545, + "reward_before_mean": 0.8397536515258253, + "reward_before_std": 1.1638507843017578, + "reward_change_max": 6.40377402305603e-05, + "reward_change_mean": -0.12543187430128455, + "reward_change_min": -0.25196870043873787, + "reward_change_std": 0.1010968410409987, + "reward_std": 1.1465397775173187, + "rewards/cosine_scaled_reward": 0.08654349111020565, + "rewards/format_reward": 0.6666666772216558, + "step": 63 + }, + { + "advantage_max": 1.208019107580185, + "advantage_mean": -1.2417635031347629e-08, + "advantage_min": -1.371815674006939, + "advantage_std": 0.9997898116707802, + "completion_length": 2820.354202270508, + "epoch": 0.07314285714285715, + "grad_norm": 0.1696338802576065, + "kl": 0.00028708577156066895, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "reward": 0.304183728992939, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.304183728992939, + "reward_after_std": 0.7283022552728653, + "reward_before_mean": 0.4062739387154579, + "reward_before_std": 0.7515294570475817, + "reward_change_max": 0.00043839961290359497, + "reward_change_mean": -0.10209022578783333, + "reward_change_min": -0.19523747824132442, + "reward_change_std": 0.08049859874881804, + "reward_std": 0.7283022850751877, + "rewards/cosine_scaled_reward": -0.015613039955496788, + "rewards/format_reward": 0.4375000149011612, + "step": 64 + }, + { + "advantage_max": 1.6743332594633102, + "advantage_mean": 1.862645232497684e-08, + "advantage_min": -0.969209760427475, + "advantage_std": 0.9997278153896332, + "completion_length": 2733.708354949951, + "epoch": 0.07428571428571429, + "grad_norm": 0.20424288511276245, + "kl": 0.0002609342336654663, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "reward": 0.15900675393640995, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15900675393640995, + "reward_after_std": 0.6618925724178553, + "reward_before_mean": 0.24430988542735577, + "reward_before_std": 0.6556311100721359, + "reward_change_max": 0.00014291703701019287, + "reward_change_mean": -0.08530310750938952, + "reward_change_min": -0.16443136800080538, + "reward_change_std": 0.061483539175242186, + "reward_std": 0.6618925910443068, + "rewards/cosine_scaled_reward": -0.07576173637062311, + "rewards/format_reward": 0.39583333767950535, + "step": 65 + }, + { + "advantage_max": 1.4976499304175377, + "advantage_mean": -6.022552873075071e-08, + "advantage_min": -1.0167640447616577, + "advantage_std": 0.9998143464326859, + "completion_length": 2099.729175567627, + "epoch": 0.07542857142857143, + "grad_norm": 0.24778138101100922, + "kl": 0.00030663609504699707, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0, + "reward": 0.38356152176856995, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.38356152176856995, + "reward_after_std": 0.6583958007395267, + "reward_before_mean": 0.49129557237029076, + "reward_before_std": 0.6529230363667011, + "reward_change_max": 0.00029415637254714966, + "reward_change_mean": -0.10773406224325299, + "reward_change_min": -0.18134311586618423, + "reward_change_std": 0.07149266311898828, + "reward_std": 0.6583958119153976, + "rewards/cosine_scaled_reward": -0.014768877997994423, + "rewards/format_reward": 0.520833333954215, + "step": 66 + }, + { + "advantage_max": 1.3264884650707245, + "advantage_mean": 4.6566129618952345e-08, + "advantage_min": -1.1300052106380463, + "advantage_std": 0.9996982514858246, + "completion_length": 3386.4166717529297, + "epoch": 0.07657142857142857, + "grad_norm": 0.13453011214733124, + "kl": 0.00039284047670662403, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0, + "reward": -0.4645117961335927, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4645117961335927, + "reward_after_std": 0.36321923695504665, + "reward_before_mean": -0.42523948568850756, + "reward_before_std": 0.37532838620245457, + "reward_change_max": 0.00018453598022460938, + "reward_change_mean": -0.03927231300622225, + "reward_change_min": -0.08630170952528715, + "reward_change_std": 0.036291120108217, + "reward_std": 0.36321924813091755, + "rewards/cosine_scaled_reward": -0.2542864102870226, + "rewards/format_reward": 0.0833333358168602, + "step": 67 + }, + { + "advantage_max": 1.334203988313675, + "advantage_mean": -2.7939686120248552e-09, + "advantage_min": -1.3702456429600716, + "advantage_std": 0.9997404292225838, + "completion_length": 1854.4167098999023, + "epoch": 0.07771428571428571, + "grad_norm": 0.27549850940704346, + "kl": 0.0012511014938354492, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0001, + "reward": 0.4696199508616701, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.4696199508616701, + "reward_after_std": 0.6607894655317068, + "reward_before_mean": 0.5879831919446588, + "reward_before_std": 0.6720792315900326, + "reward_change_max": 0.0, + "reward_change_mean": -0.11836327239871025, + "reward_change_min": -0.20884494576603174, + "reward_change_std": 0.08378471713513136, + "reward_std": 0.660789493471384, + "rewards/cosine_scaled_reward": -0.028925069607794285, + "rewards/format_reward": 0.6458333358168602, + "step": 68 + }, + { + "advantage_max": 1.56258724629879, + "advantage_mean": 1.8626451381287268e-08, + "advantage_min": -1.1549096181988716, + "advantage_std": 0.9997214451432228, + "completion_length": 2427.2708587646484, + "epoch": 0.07885714285714286, + "grad_norm": 0.26802805066108704, + "kl": 0.0012503266334533691, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0001, + "reward": -0.025797476526349783, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.025797476526349783, + "reward_after_std": 0.5156911239027977, + "reward_before_mean": 0.046627337113022804, + "reward_before_std": 0.5075650177896023, + "reward_change_max": 0.00010112673044204712, + "reward_change_mean": -0.07242480898275971, + "reward_change_min": -0.12345962692052126, + "reward_change_std": 0.04638163233175874, + "reward_std": 0.5156911425292492, + "rewards/cosine_scaled_reward": -0.20585301099345088, + "rewards/format_reward": 0.45833333395421505, + "step": 69 + }, + { + "advantage_max": 1.4540487378835678, + "advantage_mean": 3.4148494476582414e-08, + "advantage_min": -1.126408912241459, + "advantage_std": 0.9997123554348946, + "completion_length": 3066.8333740234375, + "epoch": 0.08, + "grad_norm": 0.21717911958694458, + "kl": 0.0016925111413002014, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0001, + "reward": -0.06509184092283249, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06509184092283249, + "reward_after_std": 0.5979281403124332, + "reward_before_mean": 0.003177594393491745, + "reward_before_std": 0.6060793250799179, + "reward_change_max": 0.00020104646682739258, + "reward_change_mean": -0.06826943415217102, + "reward_change_min": -0.1556295258924365, + "reward_change_std": 0.05811656138394028, + "reward_std": 0.5979281663894653, + "rewards/cosine_scaled_reward": -0.1546612000092864, + "rewards/format_reward": 0.3125000074505806, + "step": 70 + }, + { + "advantage_max": 1.4608792811632156, + "advantage_mean": 3.725289798861553e-09, + "advantage_min": -1.152649886906147, + "advantage_std": 0.9997194185853004, + "completion_length": 2654.1041946411133, + "epoch": 0.08114285714285714, + "grad_norm": 0.1895046830177307, + "kl": 0.0008933022618293762, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "reward": 0.2874251026660204, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2874251026660204, + "reward_after_std": 0.5808608923107386, + "reward_before_mean": 0.3901416026055813, + "reward_before_std": 0.5837310794740915, + "reward_change_max": 0.0, + "reward_change_mean": -0.10271649085916579, + "reward_change_min": -0.1831564288586378, + "reward_change_std": 0.06829000089783221, + "reward_std": 0.580860897898674, + "rewards/cosine_scaled_reward": -0.023679209873080254, + "rewards/format_reward": 0.4375000074505806, + "step": 71 + }, + { + "advantage_max": 1.3443211615085602, + "advantage_mean": -3.104409007637088e-09, + "advantage_min": -1.3195882812142372, + "advantage_std": 0.9997728690505028, + "completion_length": 2894.9583740234375, + "epoch": 0.08228571428571428, + "grad_norm": 0.23933842778205872, + "kl": 0.000815272331237793, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "reward": -0.11035427264869213, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.11035427264869213, + "reward_after_std": 0.5387446023523808, + "reward_before_mean": -0.04381885752081871, + "reward_before_std": 0.5496399514377117, + "reward_change_max": 0.00013368576765060425, + "reward_change_mean": -0.0665354230441153, + "reward_change_min": -0.12944668717682362, + "reward_change_std": 0.05370501568540931, + "reward_std": 0.5387446247041225, + "rewards/cosine_scaled_reward": -0.18857610132545233, + "rewards/format_reward": 0.33333334140479565, + "step": 72 + }, + { + "advantage_max": 1.3321927040815353, + "advantage_mean": 4.4703484247676784e-08, + "advantage_min": -1.1550491526722908, + "advantage_std": 0.9997767359018326, + "completion_length": 3468.6666870117188, + "epoch": 0.08342857142857144, + "grad_norm": 0.1596253216266632, + "kl": 0.00013802945613861084, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0, + "reward": -0.16815321380272508, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": -0.16815321380272508, + "reward_after_std": 0.7371582835912704, + "reward_before_mean": -0.11063026264309883, + "reward_before_std": 0.7664970513433218, + "reward_change_max": 0.00023616105318069458, + "reward_change_mean": -0.05752295721322298, + "reward_change_min": -0.14209103304892778, + "reward_change_std": 0.06510339491069317, + "reward_std": 0.7371583022177219, + "rewards/cosine_scaled_reward": -0.1386484676040709, + "rewards/format_reward": 0.1666666679084301, + "step": 73 + }, + { + "advantage_max": 1.6054811775684357, + "advantage_mean": -9.93410742555767e-09, + "advantage_min": -0.9970987290143967, + "advantage_std": 0.9997713267803192, + "completion_length": 3130.3750610351562, + "epoch": 0.08457142857142858, + "grad_norm": 0.16316719353199005, + "kl": 0.0006289742887020111, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0, + "reward": 0.04533570492640138, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.04533570492640138, + "reward_after_std": 0.6655574645847082, + "reward_before_mean": 0.12103927996940911, + "reward_before_std": 0.6674526929855347, + "reward_change_max": 0.0002670586109161377, + "reward_change_mean": -0.07570357341319323, + "reward_change_min": -0.1453123176470399, + "reward_change_std": 0.058132898062467575, + "reward_std": 0.6655574906617403, + "rewards/cosine_scaled_reward": -0.0748970415443182, + "rewards/format_reward": 0.27083333767950535, + "step": 74 + }, + { + "advantage_max": 1.3084037005901337, + "advantage_mean": 2.1730860721991263e-08, + "advantage_min": -1.0659999400377274, + "advantage_std": 0.9998039081692696, + "completion_length": 3022.291702270508, + "epoch": 0.08571428571428572, + "grad_norm": 0.15364454686641693, + "kl": 0.0014982819557189941, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0001, + "reward": 0.3326728269457817, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.3326728269457817, + "reward_after_std": 0.6384584531188011, + "reward_before_mean": 0.43834810703992844, + "reward_before_std": 0.6464258171617985, + "reward_change_max": 0.0001539289951324463, + "reward_change_mean": -0.1056752463337034, + "reward_change_min": -0.19835533201694489, + "reward_change_std": 0.07731911540031433, + "reward_std": 0.6384584605693817, + "rewards/cosine_scaled_reward": 0.042090704664587975, + "rewards/format_reward": 0.35416666977107525, + "step": 75 + }, + { + "advantage_max": 1.4036643505096436, + "advantage_mean": 1.3659398168108794e-08, + "advantage_min": -1.2448914647102356, + "advantage_std": 0.999701626598835, + "completion_length": 2984.4583892822266, + "epoch": 0.08685714285714285, + "grad_norm": 0.17103531956672668, + "kl": 0.00016760081052780151, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "reward": -0.15381433628499508, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15381433628499508, + "reward_after_std": 0.4996340088546276, + "reward_before_mean": -0.09012758638709784, + "reward_before_std": 0.5069925468415022, + "reward_change_max": 0.00010403245687484741, + "reward_change_mean": -0.0636867480352521, + "reward_change_min": -0.132554329931736, + "reward_change_std": 0.052232020534574986, + "reward_std": 0.4996340125799179, + "rewards/cosine_scaled_reward": -0.22214713506400585, + "rewards/format_reward": 0.3541666679084301, + "step": 76 + }, + { + "advantage_max": 1.5948710143566132, + "advantage_mean": 6.829698806498641e-09, + "advantage_min": -1.0013544484972954, + "advantage_std": 0.9997521862387657, + "completion_length": 3199.875030517578, + "epoch": 0.088, + "grad_norm": 0.14091943204402924, + "kl": 0.00019103288650512695, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0, + "reward": 0.07242773100733757, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07242773100733757, + "reward_after_std": 0.460830919444561, + "reward_before_mean": 0.15626836940646172, + "reward_before_std": 0.44724370911717415, + "reward_change_max": 9.12696123123169e-06, + "reward_change_mean": -0.08384064945857972, + "reward_change_min": -0.13726599793881178, + "reward_change_std": 0.05241412605391815, + "reward_std": 0.4608309231698513, + "rewards/cosine_scaled_reward": -0.08853249228559434, + "rewards/format_reward": 0.33333333395421505, + "step": 77 + }, + { + "advantage_max": 1.372763067483902, + "advantage_mean": 2.8870999813079834e-08, + "advantage_min": -1.1454579532146454, + "advantage_std": 0.9997924268245697, + "completion_length": 2975.812530517578, + "epoch": 0.08914285714285715, + "grad_norm": 0.15570473670959473, + "kl": 0.0012784861028194427, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0001, + "reward": 0.16491154581308365, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.16491154581308365, + "reward_after_std": 0.605958666652441, + "reward_before_mean": 0.25589458271861076, + "reward_before_std": 0.6145712472498417, + "reward_change_max": 0.00023362040519714355, + "reward_change_mean": -0.09098305949009955, + "reward_change_min": -0.17649622447788715, + "reward_change_std": 0.06956856162287295, + "reward_std": 0.6059587113559246, + "rewards/cosine_scaled_reward": -0.03871938120573759, + "rewards/format_reward": 0.33333333395421505, + "step": 78 + }, + { + "advantage_max": 1.5386330038309097, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -1.075737252831459, + "advantage_std": 0.9998145774006844, + "completion_length": 2298.500030517578, + "epoch": 0.09028571428571429, + "grad_norm": 0.21232455968856812, + "kl": 0.0015820115804672241, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0001, + "reward": 0.3763896021991968, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3763896021991968, + "reward_after_std": 0.719805970788002, + "reward_before_mean": 0.4805713724344969, + "reward_before_std": 0.7157487347722054, + "reward_change_max": 0.00013425201177597046, + "reward_change_mean": -0.10418177908286452, + "reward_change_min": -0.20248535182327032, + "reward_change_std": 0.07452570833265781, + "reward_std": 0.7198059931397438, + "rewards/cosine_scaled_reward": -0.06179766240529716, + "rewards/format_reward": 0.6041666697710752, + "step": 79 + }, + { + "advantage_max": 1.529449224472046, + "advantage_mean": 2.9181441485448545e-08, + "advantage_min": -1.0704481154680252, + "advantage_std": 0.9997870698571205, + "completion_length": 3156.6875610351562, + "epoch": 0.09142857142857143, + "grad_norm": 0.20058980584144592, + "kl": 0.0004749912768602371, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "reward": -0.10771899670362473, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.10771899670362473, + "reward_after_std": 0.6453639008104801, + "reward_before_mean": -0.044033898040652275, + "reward_before_std": 0.6592339612543583, + "reward_change_max": 0.0, + "reward_change_mean": -0.06368510669562966, + "reward_change_min": -0.13146696891635656, + "reward_change_std": 0.05567331635393202, + "reward_std": 0.6453639306128025, + "rewards/cosine_scaled_reward": -0.16785027831792831, + "rewards/format_reward": 0.291666679084301, + "step": 80 + }, + { + "advantage_max": 1.2946752682328224, + "advantage_mean": 3.973643103449831e-08, + "advantage_min": -1.3097454234957695, + "advantage_std": 0.9997820854187012, + "completion_length": 3027.062530517578, + "epoch": 0.09257142857142857, + "grad_norm": 0.26107150316238403, + "kl": 0.0017225146293640137, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0001, + "reward": -0.09596222266554832, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.09596222266554832, + "reward_after_std": 0.6142298579216003, + "reward_before_mean": -0.030082307755947113, + "reward_before_std": 0.6317219771444798, + "reward_change_max": 0.00025378912687301636, + "reward_change_mean": -0.06587989977560937, + "reward_change_min": -0.14147170074284077, + "reward_change_std": 0.05747503787279129, + "reward_std": 0.6142298653721809, + "rewards/cosine_scaled_reward": -0.16087449342012405, + "rewards/format_reward": 0.291666679084301, + "step": 81 + }, + { + "advantage_max": 1.4243344590067863, + "advantage_mean": -6.208817460162663e-09, + "advantage_min": -1.1731738597154617, + "advantage_std": 0.9997696131467819, + "completion_length": 2904.520851135254, + "epoch": 0.09371428571428571, + "grad_norm": 0.22400455176830292, + "kl": 0.0046500712633132935, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0002, + "reward": 0.29460637643933296, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.29460637643933296, + "reward_after_std": 0.6816530674695969, + "reward_before_mean": 0.39471880346536636, + "reward_before_std": 0.6884325593709946, + "reward_change_max": 9.835511445999146e-05, + "reward_change_mean": -0.10011240118183196, + "reward_change_min": -0.17668104264885187, + "reward_change_std": 0.06958513834979385, + "reward_std": 0.6816530898213387, + "rewards/cosine_scaled_reward": 0.009859389916528016, + "rewards/format_reward": 0.3750000037252903, + "step": 82 + }, + { + "advantage_max": 1.6047600284218788, + "advantage_mean": -1.3504177553969043e-08, + "advantage_min": -1.2123665064573288, + "advantage_std": 0.9998186007142067, + "completion_length": 2714.4791870117188, + "epoch": 0.09485714285714286, + "grad_norm": 0.2228856235742569, + "kl": 0.0006010010838508606, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "reward": 0.2101850677281618, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2101850677281618, + "reward_after_std": 0.7800624221563339, + "reward_before_mean": 0.29517440497875214, + "reward_before_std": 0.7766251862049103, + "reward_change_max": 7.58543610572815e-05, + "reward_change_mean": -0.0849893398117274, + "reward_change_min": -0.15232862625271082, + "reward_change_std": 0.06034258124418557, + "reward_std": 0.7800624407827854, + "rewards/cosine_scaled_reward": -0.060746138100512326, + "rewards/format_reward": 0.4166666679084301, + "step": 83 + }, + { + "advantage_max": 1.3094022646546364, + "advantage_mean": 1.3659398279131096e-08, + "advantage_min": -1.1910891830921173, + "advantage_std": 0.9997829273343086, + "completion_length": 3066.6875228881836, + "epoch": 0.096, + "grad_norm": 0.17936787009239197, + "kl": 0.0003447532653808594, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0, + "reward": 0.2950674742460251, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2950674742460251, + "reward_after_std": 0.8434533104300499, + "reward_before_mean": 0.39375803247094154, + "reward_before_std": 0.8733946792781353, + "reward_change_max": 8.602440357208252e-05, + "reward_change_mean": -0.09869056491879746, + "reward_change_min": -0.2063098233193159, + "reward_change_std": 0.0874484230298549, + "reward_std": 0.8434533290565014, + "rewards/cosine_scaled_reward": 0.009379002032801509, + "rewards/format_reward": 0.3750000074505806, + "step": 84 + }, + { + "advantage_max": 1.6304249167442322, + "advantage_mean": 2.4835269951672956e-09, + "advantage_min": -1.043467827141285, + "advantage_std": 0.9997950419783592, + "completion_length": 3137.0208740234375, + "epoch": 0.09714285714285714, + "grad_norm": 0.14530478417873383, + "kl": 0.00024145841598510742, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "reward": 0.005932248197495937, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.005932248197495937, + "reward_after_std": 0.754243042320013, + "reward_before_mean": 0.07403266115579754, + "reward_before_std": 0.7538498565554619, + "reward_change_max": 0.00023034214973449707, + "reward_change_mean": -0.06810041260905564, + "reward_change_min": -0.15122622810304165, + "reward_change_std": 0.05614574020728469, + "reward_std": 0.754243042320013, + "rewards/cosine_scaled_reward": -0.140067002736032, + "rewards/format_reward": 0.3541666753590107, + "step": 85 + }, + { + "advantage_max": 1.32743688672781, + "advantage_mean": 1.2417634920325327e-08, + "advantage_min": -1.3129910230636597, + "advantage_std": 0.9997938498854637, + "completion_length": 2772.020851135254, + "epoch": 0.09828571428571428, + "grad_norm": 0.19045515358448029, + "kl": 0.001046299934387207, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0, + "reward": 0.2901347801089287, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.2901347801089287, + "reward_after_std": 0.6530134379863739, + "reward_before_mean": 0.3909718096256256, + "reward_before_std": 0.6593408472836018, + "reward_change_max": 0.0, + "reward_change_mean": -0.1008370304480195, + "reward_change_min": -0.1854435782879591, + "reward_change_std": 0.0712063885293901, + "reward_std": 0.6530134417116642, + "rewards/cosine_scaled_reward": -0.02326410636305809, + "rewards/format_reward": 0.4375000074505806, + "step": 86 + }, + { + "advantage_max": 1.4831641167402267, + "advantage_mean": -3.973643103449831e-08, + "advantage_min": -1.1471149325370789, + "advantage_std": 0.9998367726802826, + "completion_length": 2631.0208740234375, + "epoch": 0.09942857142857142, + "grad_norm": 0.19404949247837067, + "kl": 0.0006144046783447266, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0, + "reward": 0.41755142249166965, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.41755142249166965, + "reward_after_std": 0.858353029936552, + "reward_before_mean": 0.5248608682304621, + "reward_before_std": 0.8716229237616062, + "reward_change_max": 5.677342414855957e-06, + "reward_change_mean": -0.10730946669355035, + "reward_change_min": -0.20732402987778187, + "reward_change_std": 0.08647632226347923, + "reward_std": 0.858353067189455, + "rewards/cosine_scaled_reward": -0.018819569377228618, + "rewards/format_reward": 0.5625000093132257, + "step": 87 + }, + { + "advantage_max": 1.4660617038607597, + "advantage_mean": -7.140139923755839e-09, + "advantage_min": -1.3561210632324219, + "advantage_std": 0.9998756051063538, + "completion_length": 2669.541732788086, + "epoch": 0.10057142857142858, + "grad_norm": 0.1991894394159317, + "kl": 0.0010406449437141418, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0, + "reward": 0.5971436947584152, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5971436947584152, + "reward_after_std": 0.9840230047702789, + "reward_before_mean": 0.7174625433981419, + "reward_before_std": 1.0027831830084324, + "reward_change_max": 5.602836608886719e-05, + "reward_change_mean": -0.12031882908195257, + "reward_change_min": -0.22533655166625977, + "reward_change_std": 0.09558681072667241, + "reward_std": 0.9840230494737625, + "rewards/cosine_scaled_reward": 0.05664793308824301, + "rewards/format_reward": 0.6041666865348816, + "step": 88 + }, + { + "advantage_max": 1.3210382387042046, + "advantage_mean": 1.6142924885720333e-08, + "advantage_min": -1.2246809154748917, + "advantage_std": 0.9998392388224602, + "completion_length": 2891.8958740234375, + "epoch": 0.10171428571428572, + "grad_norm": 0.20064914226531982, + "kl": 0.0010943412780761719, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "reward": 0.2721131080761552, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2721131080761552, + "reward_after_std": 0.9016778543591499, + "reward_before_mean": 0.36736097862012684, + "reward_before_std": 0.9379100240767002, + "reward_change_max": 9.276717901229858e-05, + "reward_change_mean": -0.09524787031114101, + "reward_change_min": -0.197541574947536, + "reward_change_std": 0.08959966944530606, + "reward_std": 0.9016778841614723, + "rewards/cosine_scaled_reward": -0.014236186631023884, + "rewards/format_reward": 0.3958333432674408, + "step": 89 + }, + { + "advantage_max": 1.4915196597576141, + "advantage_mean": -8.692343844707295e-09, + "advantage_min": -1.0883212387561798, + "advantage_std": 0.999758742749691, + "completion_length": 2403.3541870117188, + "epoch": 0.10285714285714286, + "grad_norm": 0.2652307152748108, + "kl": 0.001546025276184082, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0001, + "reward": -0.029102535918354988, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.029102535918354988, + "reward_after_std": 0.5665853600949049, + "reward_before_mean": 0.041870216839015484, + "reward_before_std": 0.5661425106227398, + "reward_change_max": 0.00011952966451644897, + "reward_change_mean": -0.07097276439890265, + "reward_change_min": -0.13266034051775932, + "reward_change_std": 0.05150189925916493, + "reward_std": 0.5665853675454855, + "rewards/cosine_scaled_reward": -0.22906489111483097, + "rewards/format_reward": 0.5000000055879354, + "step": 90 + }, + { + "advantage_max": 1.3845188915729523, + "advantage_mean": 8.692343955729598e-09, + "advantage_min": -1.2692686691880226, + "advantage_std": 0.9997860342264175, + "completion_length": 3016.2083587646484, + "epoch": 0.104, + "grad_norm": 0.17810925841331482, + "kl": 0.0007269233465194702, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "reward": 0.19745041709393263, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.19745041709393263, + "reward_after_std": 0.740229532122612, + "reward_before_mean": 0.28715684451162815, + "reward_before_std": 0.7560277283191681, + "reward_change_max": 0.0003509148955345154, + "reward_change_mean": -0.08970644162036479, + "reward_change_min": -0.16745029855519533, + "reward_change_std": 0.07019369956105947, + "reward_std": 0.7402295880019665, + "rewards/cosine_scaled_reward": -0.05433825249201618, + "rewards/format_reward": 0.39583334885537624, + "step": 91 + }, + { + "advantage_max": 1.495618462562561, + "advantage_mean": -1.614292477469803e-08, + "advantage_min": -1.1609731614589691, + "advantage_std": 0.9998172298073769, + "completion_length": 2539.1458892822266, + "epoch": 0.10514285714285715, + "grad_norm": 0.2165587693452835, + "kl": 0.0017741471529006958, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0001, + "reward": 0.377059874124825, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.377059874124825, + "reward_after_std": 0.7129791099578142, + "reward_before_mean": 0.4834042700531427, + "reward_before_std": 0.7142429128289223, + "reward_change_max": 9.88692045211792e-05, + "reward_change_mean": -0.10634440556168556, + "reward_change_min": -0.1946062110364437, + "reward_change_std": 0.07394323078915477, + "reward_std": 0.7129791136831045, + "rewards/cosine_scaled_reward": -0.0603812150657177, + "rewards/format_reward": 0.6041666734963655, + "step": 92 + }, + { + "advantage_max": 1.4186953604221344, + "advantage_mean": 2.8871000368191346e-08, + "advantage_min": -1.1192193031311035, + "advantage_std": 0.9997008293867111, + "completion_length": 3446.8958740234375, + "epoch": 0.10628571428571429, + "grad_norm": 0.17092525959014893, + "kl": 0.0012556910514831543, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0001, + "reward": -0.4768864205107093, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": -0.4768864205107093, + "reward_after_std": 0.4022520687431097, + "reward_before_mean": -0.4405721053481102, + "reward_before_std": 0.4141699206084013, + "reward_change_max": 0.0004946738481521606, + "reward_change_mean": -0.03631432238034904, + "reward_change_min": -0.08117994200438261, + "reward_change_std": 0.03509921010117978, + "reward_std": 0.4022520836442709, + "rewards/cosine_scaled_reward": -0.26195271871984005, + "rewards/format_reward": 0.0833333358168602, + "step": 93 + }, + { + "advantage_max": 1.506947785615921, + "advantage_mean": -1.6763807453301638e-08, + "advantage_min": -1.0155241936445236, + "advantage_std": 0.999764122068882, + "completion_length": 2987.7291870117188, + "epoch": 0.10742857142857143, + "grad_norm": 0.1818607598543167, + "kl": 0.0012969821691513062, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0001, + "reward": -0.0261713950894773, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0261713950894773, + "reward_after_std": 0.539334274828434, + "reward_before_mean": 0.046400152146816254, + "reward_before_std": 0.5371452905237675, + "reward_change_max": 0.0005335435271263123, + "reward_change_mean": -0.07257156854029745, + "reward_change_min": -0.13221831247210503, + "reward_change_std": 0.0521798743866384, + "reward_std": 0.5393342785537243, + "rewards/cosine_scaled_reward": -0.11221659136936069, + "rewards/format_reward": 0.27083333395421505, + "step": 94 + }, + { + "advantage_max": 1.4179429858922958, + "advantage_mean": 2.5456151686586992e-08, + "advantage_min": -1.117660902440548, + "advantage_std": 0.9998129159212112, + "completion_length": 3445.9166870117188, + "epoch": 0.10857142857142857, + "grad_norm": 0.15445290505886078, + "kl": 0.00019112974405288696, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "reward": -0.17061306349933147, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.17061306349933147, + "reward_after_std": 0.7453450746834278, + "reward_before_mean": -0.11620625574141741, + "reward_before_std": 0.7634413428604603, + "reward_change_max": 0.00022859126329421997, + "reward_change_mean": -0.05440680589526892, + "reward_change_min": -0.1367551926523447, + "reward_change_std": 0.05708932294510305, + "reward_std": 0.745345089584589, + "rewards/cosine_scaled_reward": -0.14143646706361324, + "rewards/format_reward": 0.1666666716337204, + "step": 95 + }, + { + "advantage_max": 1.4602340012788773, + "advantage_mean": 6.208817904251873e-10, + "advantage_min": -1.1855292618274689, + "advantage_std": 0.9998244643211365, + "completion_length": 2690.7083587646484, + "epoch": 0.10971428571428571, + "grad_norm": 0.22608739137649536, + "kl": 0.0011289715766906738, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0, + "reward": 0.38900233432650566, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.38900233432650566, + "reward_after_std": 0.7013615928590298, + "reward_before_mean": 0.49687390495091677, + "reward_before_std": 0.7065553367137909, + "reward_change_max": 0.00010736286640167236, + "reward_change_mean": -0.10787154478020966, + "reward_change_min": -0.19875967130064964, + "reward_change_std": 0.07827508868649602, + "reward_std": 0.7013616152107716, + "rewards/cosine_scaled_reward": 0.02968693384900689, + "rewards/format_reward": 0.4375000074505806, + "step": 96 + }, + { + "advantage_max": 1.4047540798783302, + "advantage_mean": -4.346173199110126e-09, + "advantage_min": -1.2560274973511696, + "advantage_std": 0.9997810050845146, + "completion_length": 2811.1458892822266, + "epoch": 0.11085714285714286, + "grad_norm": 0.16837795078754425, + "kl": 0.001363903284072876, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0001, + "reward": 0.3484401609748602, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3484401609748602, + "reward_after_std": 0.5126909576356411, + "reward_before_mean": 0.4589174911379814, + "reward_before_std": 0.5126852281391621, + "reward_change_max": 0.00011277198791503906, + "reward_change_mean": -0.1104773310944438, + "reward_change_min": -0.1841278411448002, + "reward_change_std": 0.07085376582108438, + "reward_std": 0.5126909799873829, + "rewards/cosine_scaled_reward": -0.010124601423740387, + "rewards/format_reward": 0.47916668094694614, + "step": 97 + }, + { + "advantage_max": 1.5295169353485107, + "advantage_mean": 3.725290742551124e-09, + "advantage_min": -1.2989156991243362, + "advantage_std": 0.9997530430555344, + "completion_length": 2568.416717529297, + "epoch": 0.112, + "grad_norm": 0.17439162731170654, + "kl": 0.0003739595413208008, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0, + "reward": 0.26688409969210625, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26688409969210625, + "reward_after_std": 0.45362649485468864, + "reward_before_mean": 0.3693796396255493, + "reward_before_std": 0.438557505607605, + "reward_change_max": 0.0001835152506828308, + "reward_change_mean": -0.10249554133042693, + "reward_change_min": -0.16221447475254536, + "reward_change_std": 0.06244846456684172, + "reward_std": 0.45362651348114014, + "rewards/cosine_scaled_reward": -0.0965601853094995, + "rewards/format_reward": 0.5625000074505806, + "step": 98 + }, + { + "advantage_max": 1.59602090716362, + "advantage_mean": 9.002784850942191e-09, + "advantage_min": -1.0146455764770508, + "advantage_std": 0.9997052848339081, + "completion_length": 2898.1458587646484, + "epoch": 0.11314285714285714, + "grad_norm": 0.20550180971622467, + "kl": 0.0007301568984985352, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "reward": -0.011777647770941257, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.011777647770941257, + "reward_after_std": 0.5849963650107384, + "reward_before_mean": 0.059426740277558565, + "reward_before_std": 0.5733112944290042, + "reward_change_max": 7.230788469314575e-05, + "reward_change_mean": -0.07120441918959841, + "reward_change_min": -0.11866667028516531, + "reward_change_std": 0.04729985597077757, + "reward_std": 0.5849963836371899, + "rewards/cosine_scaled_reward": -0.10570329218171537, + "rewards/format_reward": 0.2708333395421505, + "step": 99 + }, + { + "advantage_max": 1.4574847668409348, + "advantage_mean": 2.7318797668485217e-08, + "advantage_min": -1.1436650529503822, + "advantage_std": 0.9998161122202873, + "completion_length": 2717.0625610351562, + "epoch": 0.11428571428571428, + "grad_norm": 0.19857947528362274, + "kl": 0.0009911060333251953, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "reward": 0.3121153700631112, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3121153700631112, + "reward_after_std": 0.7690652385354042, + "reward_before_mean": 0.41096168756484985, + "reward_before_std": 0.7780533917248249, + "reward_change_max": 6.656348705291748e-05, + "reward_change_mean": -0.09884630842134356, + "reward_change_min": -0.1944613279774785, + "reward_change_std": 0.07431896426714957, + "reward_std": 0.7690652459859848, + "rewards/cosine_scaled_reward": -0.023685835301876068, + "rewards/format_reward": 0.45833334885537624, + "step": 100 + }, + { + "advantage_max": 1.4642290025949478, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -1.23203906416893, + "advantage_std": 0.9997976124286652, + "completion_length": 2641.625, + "epoch": 0.11542857142857142, + "grad_norm": 0.20073696970939636, + "kl": 0.0007115602493286133, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "reward": 0.13412395305931568, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.13412395305931568, + "reward_after_std": 0.6262147203087807, + "reward_before_mean": 0.21931741666048765, + "reward_before_std": 0.6254003196954727, + "reward_change_max": 0.0007505491375923157, + "reward_change_mean": -0.08519347733817995, + "reward_change_min": -0.15299372747540474, + "reward_change_std": 0.0630116555839777, + "reward_std": 0.6262147352099419, + "rewards/cosine_scaled_reward": -0.09867463074624538, + "rewards/format_reward": 0.41666667722165585, + "step": 101 + }, + { + "advantage_max": 1.5872334390878677, + "advantage_mean": -1.8936891610366047e-08, + "advantage_min": -1.1092576533555984, + "advantage_std": 0.9997419193387032, + "completion_length": 2135.000045776367, + "epoch": 0.11657142857142858, + "grad_norm": 0.2481817603111267, + "kl": 0.001399993896484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0001, + "reward": 0.40515281772240996, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.40515281772240996, + "reward_after_std": 0.7024665623903275, + "reward_before_mean": 0.512839687988162, + "reward_before_std": 0.6963064633309841, + "reward_change_max": 0.00012967735528945923, + "reward_change_mean": -0.10768683551577851, + "reward_change_min": -0.18208745121955872, + "reward_change_std": 0.07141236204188317, + "reward_std": 0.7024666033685207, + "rewards/cosine_scaled_reward": -0.09774684254080057, + "rewards/format_reward": 0.7083333395421505, + "step": 102 + }, + { + "advantage_max": 1.360763557255268, + "advantage_mean": 9.623669194880335e-09, + "advantage_min": -1.057634711265564, + "advantage_std": 0.999780036509037, + "completion_length": 2951.833354949951, + "epoch": 0.11771428571428572, + "grad_norm": 0.2257830947637558, + "kl": 0.001177072525024414, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0, + "reward": 0.3502454627305269, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3502454627305269, + "reward_after_std": 0.704563008621335, + "reward_before_mean": 0.4569867327809334, + "reward_before_std": 0.72186528891325, + "reward_change_max": 0.00033936649560928345, + "reward_change_mean": -0.10674124653451145, + "reward_change_min": -0.20691397693008184, + "reward_change_std": 0.08697378146462142, + "reward_std": 0.704563032835722, + "rewards/cosine_scaled_reward": -0.0006733201444149017, + "rewards/format_reward": 0.4583333395421505, + "step": 103 + }, + { + "advantage_max": 1.2722968012094498, + "advantage_mean": -2.110997954218874e-08, + "advantage_min": -1.3262568935751915, + "advantage_std": 0.9997180476784706, + "completion_length": 2898.4166870117188, + "epoch": 0.11885714285714286, + "grad_norm": 0.22004500031471252, + "kl": 0.0033478736877441406, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0001, + "reward": -0.08824241906404495, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.08824241906404495, + "reward_after_std": 0.4880238436162472, + "reward_before_mean": -0.017093989998102188, + "reward_before_std": 0.5020418781787157, + "reward_change_max": 0.0, + "reward_change_mean": -0.07114846212789416, + "reward_change_min": -0.13775192759931087, + "reward_change_std": 0.056608869694173336, + "reward_std": 0.4880238547921181, + "rewards/cosine_scaled_reward": -0.16479699313640594, + "rewards/format_reward": 0.3125, + "step": 104 + }, + { + "advantage_max": 1.2704541832208633, + "advantage_mean": 2.980232283178452e-08, + "advantage_min": -1.264593780040741, + "advantage_std": 0.9998151138424873, + "completion_length": 2905.125030517578, + "epoch": 0.12, + "grad_norm": 0.20489485561847687, + "kl": 0.0008729100227355957, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "reward": 0.08965863287448883, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.08965863287448883, + "reward_after_std": 0.7559684477746487, + "reward_before_mean": 0.16935227066278458, + "reward_before_std": 0.7788311205804348, + "reward_change_max": 0.0003646537661552429, + "reward_change_mean": -0.07969362242147326, + "reward_change_min": -0.16236806381493807, + "reward_change_std": 0.06786630826536566, + "reward_std": 0.755968451499939, + "rewards/cosine_scaled_reward": -0.08199054421857, + "rewards/format_reward": 0.3333333395421505, + "step": 105 + }, + { + "advantage_max": 1.5852757394313812, + "advantage_mean": -3.911554924407312e-08, + "advantage_min": -1.0061208456754684, + "advantage_std": 0.9997430816292763, + "completion_length": 2090.75008392334, + "epoch": 0.12114285714285715, + "grad_norm": 0.2063012570142746, + "kl": 0.0014823079109191895, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0001, + "reward": 0.9065777286887169, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.9065777286887169, + "reward_after_std": 0.7232242524623871, + "reward_before_mean": 1.0624435134232044, + "reward_before_std": 0.7090621958486736, + "reward_change_max": 0.0, + "reward_change_mean": -0.15586584899574518, + "reward_change_min": -0.2597513496875763, + "reward_change_std": 0.10303916921839118, + "reward_std": 0.7232242971658707, + "rewards/cosine_scaled_reward": 0.19788843393325806, + "rewards/format_reward": 0.6666666772216558, + "step": 106 + }, + { + "advantage_max": 1.5181275755167007, + "advantage_mean": -2.4369608844776458e-08, + "advantage_min": -1.2018900960683823, + "advantage_std": 0.9997393265366554, + "completion_length": 2681.9584045410156, + "epoch": 0.12228571428571429, + "grad_norm": 0.22282981872558594, + "kl": 0.0009338855743408203, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0, + "reward": 0.24523488990962505, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.24523488990962505, + "reward_after_std": 0.5135970376431942, + "reward_before_mean": 0.34461949206888676, + "reward_before_std": 0.5058441665023565, + "reward_change_max": 0.00038911402225494385, + "reward_change_mean": -0.09938463289290667, + "reward_change_min": -0.17489725723862648, + "reward_change_std": 0.06601686554495245, + "reward_std": 0.5135970450937748, + "rewards/cosine_scaled_reward": -0.09852358978241682, + "rewards/format_reward": 0.5416666734963655, + "step": 107 + }, + { + "advantage_max": 1.462399698793888, + "advantage_mean": 1.6763807397790487e-08, + "advantage_min": -1.1984619572758675, + "advantage_std": 0.9997933208942413, + "completion_length": 2448.0625381469727, + "epoch": 0.12342857142857143, + "grad_norm": 0.20489199459552765, + "kl": 0.0010381042957305908, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0, + "reward": 0.3353926707059145, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3353926707059145, + "reward_after_std": 0.7036140821874142, + "reward_before_mean": 0.4386548884212971, + "reward_before_std": 0.7110070791095495, + "reward_change_max": 2.4124979972839355e-05, + "reward_change_mean": -0.1032621799968183, + "reward_change_min": -0.19217858277261257, + "reward_change_std": 0.07401760248467326, + "reward_std": 0.7036141231656075, + "rewards/cosine_scaled_reward": -0.041089228354394436, + "rewards/format_reward": 0.5208333432674408, + "step": 108 + }, + { + "advantage_max": 1.6706776022911072, + "advantage_mean": -2.297262446937509e-08, + "advantage_min": -0.9789133369922638, + "advantage_std": 0.9997392222285271, + "completion_length": 2940.6458587646484, + "epoch": 0.12457142857142857, + "grad_norm": 0.18210139870643616, + "kl": 0.0003904104232788086, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "reward": 0.035868662409484386, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.035868662409484386, + "reward_after_std": 0.4906224813312292, + "reward_before_mean": 0.11451574973762035, + "reward_before_std": 0.47176924906671047, + "reward_change_max": 2.3439526557922363e-05, + "reward_change_mean": -0.07864708849228919, + "reward_change_min": -0.1277305269613862, + "reward_change_std": 0.05187747371383011, + "reward_std": 0.49062250182032585, + "rewards/cosine_scaled_reward": -0.09899212668460677, + "rewards/format_reward": 0.31250000186264515, + "step": 109 + }, + { + "advantage_max": 1.6114717870950699, + "advantage_mean": -5.029142124968189e-08, + "advantage_min": -1.1079509481787682, + "advantage_std": 0.9998557791113853, + "completion_length": 2539.500072479248, + "epoch": 0.12571428571428572, + "grad_norm": 0.24719710648059845, + "kl": 0.0007413476705551147, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "reward": 0.21266168262809515, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21266168262809515, + "reward_after_std": 0.9514952898025513, + "reward_before_mean": 0.29369947547093034, + "reward_before_std": 0.9561396837234497, + "reward_change_max": 0.0003879815340042114, + "reward_change_mean": -0.08103783521801233, + "reward_change_min": -0.1750015551224351, + "reward_change_std": 0.071824872167781, + "reward_std": 0.9514953121542931, + "rewards/cosine_scaled_reward": -0.11356692761182785, + "rewards/format_reward": 0.520833345130086, + "step": 110 + }, + { + "advantage_max": 1.3271641284227371, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -1.1762759760022163, + "advantage_std": 0.9998204931616783, + "completion_length": 2878.6250762939453, + "epoch": 0.12685714285714286, + "grad_norm": 0.1850922554731369, + "kl": 0.001230478286743164, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0, + "reward": 0.26679439563304186, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26679439563304186, + "reward_after_std": 0.8421963788568974, + "reward_before_mean": 0.36285027489066124, + "reward_before_std": 0.8735892362892628, + "reward_change_max": 8.895248174667358e-05, + "reward_change_mean": -0.0960558783262968, + "reward_change_min": -0.20756731741130352, + "reward_change_std": 0.08794838469475508, + "reward_std": 0.84219641238451, + "rewards/cosine_scaled_reward": -0.04774153791368008, + "rewards/format_reward": 0.45833334140479565, + "step": 111 + }, + { + "advantage_max": 1.4141745269298553, + "advantage_mean": 5.743156217263845e-08, + "advantage_min": -1.172879695892334, + "advantage_std": 0.9997918605804443, + "completion_length": 3140.3334045410156, + "epoch": 0.128, + "grad_norm": 0.16904281079769135, + "kl": 0.0007680952548980713, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0, + "reward": 0.37912358343601227, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37912358343601227, + "reward_after_std": 0.7805259078741074, + "reward_before_mean": 0.4845715146511793, + "reward_before_std": 0.7915312610566616, + "reward_change_max": 0.00028561800718307495, + "reward_change_mean": -0.10544790513813496, + "reward_change_min": -0.22313793655484915, + "reward_change_std": 0.08740153908729553, + "reward_std": 0.7805259451270103, + "rewards/cosine_scaled_reward": 0.002702411264181137, + "rewards/format_reward": 0.47916666977107525, + "step": 112 + }, + { + "advantage_max": 1.441021591424942, + "advantage_mean": 8.692345065952622e-09, + "advantage_min": -1.151847779750824, + "advantage_std": 0.9997981712222099, + "completion_length": 2436.9167289733887, + "epoch": 0.12914285714285714, + "grad_norm": 0.2858113944530487, + "kl": 0.001512289047241211, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0001, + "reward": 0.18071626406162977, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.18071626406162977, + "reward_after_std": 0.6407562829554081, + "reward_before_mean": 0.2715048464015126, + "reward_before_std": 0.6499359104782343, + "reward_change_max": 7.875263690948486e-05, + "reward_change_mean": -0.09078857069835067, + "reward_change_min": -0.17216388322412968, + "reward_change_std": 0.06758308736607432, + "reward_std": 0.6407563146203756, + "rewards/cosine_scaled_reward": -0.13508092612028122, + "rewards/format_reward": 0.5416666846722364, + "step": 113 + }, + { + "advantage_max": 1.5190544202923775, + "advantage_mean": 2.7939677682553565e-08, + "advantage_min": -1.1916342675685883, + "advantage_std": 0.9997197538614273, + "completion_length": 2483.9792289733887, + "epoch": 0.13028571428571428, + "grad_norm": 0.20862361788749695, + "kl": 0.0018885135650634766, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0001, + "reward": 0.12357044592499733, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12357044592499733, + "reward_after_std": 0.5326991295441985, + "reward_before_mean": 0.21228991076350212, + "reward_before_std": 0.5345789343118668, + "reward_change_max": 2.765655517578125e-05, + "reward_change_mean": -0.0887194707756862, + "reward_change_min": -0.1558008911088109, + "reward_change_std": 0.06081947742495686, + "reward_std": 0.5326991518959403, + "rewards/cosine_scaled_reward": -0.185521719744429, + "rewards/format_reward": 0.5833333488553762, + "step": 114 + }, + { + "advantage_max": 1.3991148322820663, + "advantage_mean": -1.4280280069556284e-08, + "advantage_min": -1.182186022400856, + "advantage_std": 0.9997821226716042, + "completion_length": 2882.6875, + "epoch": 0.13142857142857142, + "grad_norm": 0.1963007003068924, + "kl": 0.0019685029983520508, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0001, + "reward": 0.06682339310646057, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06682339310646057, + "reward_after_std": 0.6091874223202467, + "reward_before_mean": 0.14801817759871483, + "reward_before_std": 0.6182383522391319, + "reward_change_max": 0.0005025044083595276, + "reward_change_mean": -0.08119479194283485, + "reward_change_min": -0.16018072236329317, + "reward_change_std": 0.0620639375410974, + "reward_std": 0.6091874409466982, + "rewards/cosine_scaled_reward": -0.08224091539159417, + "rewards/format_reward": 0.31250000186264515, + "step": 115 + }, + { + "advantage_max": 1.428558573126793, + "advantage_mean": 3.228585032655218e-08, + "advantage_min": -1.140651598572731, + "advantage_std": 0.9997088760137558, + "completion_length": 3213.5416717529297, + "epoch": 0.13257142857142856, + "grad_norm": 0.18319779634475708, + "kl": 0.0013089179992675781, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0001, + "reward": -0.28241080418229103, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.28241080418229103, + "reward_after_std": 0.42994930408895016, + "reward_before_mean": -0.22914370521903038, + "reward_before_std": 0.43542555905878544, + "reward_change_max": 0.00030046701431274414, + "reward_change_mean": -0.053267103619873524, + "reward_change_min": -0.11103585828095675, + "reward_change_std": 0.04199374059680849, + "reward_std": 0.4299493171274662, + "rewards/cosine_scaled_reward": -0.17707185074687004, + "rewards/format_reward": 0.125, + "step": 116 + }, + { + "advantage_max": 1.6159285753965378, + "advantage_mean": 4.284083932049043e-08, + "advantage_min": -1.0283942744135857, + "advantage_std": 0.9997640401124954, + "completion_length": 3145.166717529297, + "epoch": 0.1337142857142857, + "grad_norm": 0.1719525307416916, + "kl": 0.0014429092407226562, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0001, + "reward": -0.07115666568279266, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.07115666568279266, + "reward_after_std": 0.711862625554204, + "reward_before_mean": -0.00940685998648405, + "reward_before_std": 0.7143301498144865, + "reward_change_max": 0.00015719234943389893, + "reward_change_mean": -0.06174980604555458, + "reward_change_min": -0.13204221613705158, + "reward_change_std": 0.049385225865989923, + "reward_std": 0.7118626423180103, + "rewards/cosine_scaled_reward": -0.16095343511551619, + "rewards/format_reward": 0.3125000074505806, + "step": 117 + }, + { + "advantage_max": 1.6019388288259506, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -0.964250460267067, + "advantage_std": 0.9998467639088631, + "completion_length": 3069.104217529297, + "epoch": 0.13485714285714287, + "grad_norm": 0.1549428105354309, + "kl": 0.0012319087982177734, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0, + "reward": 0.4462295286357403, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4462295286357403, + "reward_after_std": 0.9829271957278252, + "reward_before_mean": 0.5496396627277136, + "reward_before_std": 0.9876304157078266, + "reward_change_max": 0.00024300813674926758, + "reward_change_mean": -0.103410127107054, + "reward_change_min": -0.20373185630887747, + "reward_change_std": 0.08672826108522713, + "reward_std": 0.982927218079567, + "rewards/cosine_scaled_reward": 0.0560698164626956, + "rewards/format_reward": 0.43750000186264515, + "step": 118 + }, + { + "advantage_max": 1.2626687735319138, + "advantage_mean": -2.700835521896039e-08, + "advantage_min": -1.3746841996908188, + "advantage_std": 0.9998002350330353, + "completion_length": 2507.187545776367, + "epoch": 0.136, + "grad_norm": 0.20802848041057587, + "kl": 0.0029039382934570312, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0001, + "reward": 0.3486227598041296, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3486227598041296, + "reward_after_std": 0.6806661561131477, + "reward_before_mean": 0.4562889579683542, + "reward_before_std": 0.6996967010200024, + "reward_change_max": 0.00010123103857040405, + "reward_change_mean": -0.10766624030657113, + "reward_change_min": -0.197446602396667, + "reward_change_std": 0.08037910354323685, + "reward_std": 0.6806661747395992, + "rewards/cosine_scaled_reward": -0.06352218613028526, + "rewards/format_reward": 0.5833333507180214, + "step": 119 + }, + { + "advantage_max": 1.5879197269678116, + "advantage_mean": -4.842877543431712e-08, + "advantage_min": -1.06450717151165, + "advantage_std": 0.9998263493180275, + "completion_length": 2468.0833702087402, + "epoch": 0.13714285714285715, + "grad_norm": 0.2512477934360504, + "kl": 0.003391742706298828, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0001, + "reward": 0.3673525620251894, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3673525620251894, + "reward_after_std": 0.7729743756353855, + "reward_before_mean": 0.4686034247279167, + "reward_before_std": 0.7684299945831299, + "reward_change_max": 0.00028771162033081055, + "reward_change_mean": -0.1012508855201304, + "reward_change_min": -0.1895952159538865, + "reward_change_std": 0.07404675986617804, + "reward_std": 0.7729743979871273, + "rewards/cosine_scaled_reward": -0.015698293107561767, + "rewards/format_reward": 0.5000000074505806, + "step": 120 + }, + { + "advantage_max": 1.5347543805837631, + "advantage_mean": -4.2219957530065244e-08, + "advantage_min": -1.1780244708061218, + "advantage_std": 0.9997963011264801, + "completion_length": 1739.1875305175781, + "epoch": 0.1382857142857143, + "grad_norm": 0.20562413334846497, + "kl": 0.0033533573150634766, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0001, + "reward": 0.6855917517095804, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6855917517095804, + "reward_after_std": 0.636465273797512, + "reward_before_mean": 0.8219538982957602, + "reward_before_std": 0.624314408749342, + "reward_change_max": 6.859749555587769e-05, + "reward_change_mean": -0.1363621219061315, + "reward_change_min": -0.22026861924678087, + "reward_change_std": 0.08315270929597318, + "reward_std": 0.636465273797512, + "rewards/cosine_scaled_reward": 0.004726927087176591, + "rewards/format_reward": 0.8125, + "step": 121 + }, + { + "advantage_max": 1.4387590885162354, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -1.0619560778141022, + "advantage_std": 0.9998273774981499, + "completion_length": 2961.5208740234375, + "epoch": 0.13942857142857143, + "grad_norm": 0.1977798044681549, + "kl": 0.001184701919555664, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0, + "reward": 0.3571953661739826, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3571953661739826, + "reward_after_std": 0.952107772231102, + "reward_before_mean": 0.4578540176153183, + "reward_before_std": 0.9842019788920879, + "reward_change_max": 0.0, + "reward_change_mean": -0.10065865784417838, + "reward_change_min": -0.22830870375037193, + "reward_change_std": 0.09368588705547154, + "reward_std": 0.9521077759563923, + "rewards/cosine_scaled_reward": 0.01017700880765915, + "rewards/format_reward": 0.43750000558793545, + "step": 122 + }, + { + "advantage_max": 1.7321224063634872, + "advantage_mean": -1.2262414239572195e-07, + "advantage_min": -0.9814025685191154, + "advantage_std": 0.9997626096010208, + "completion_length": 2672.5833892822266, + "epoch": 0.14057142857142857, + "grad_norm": 0.18867647647857666, + "kl": 0.0012271404266357422, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0, + "reward": 0.12601416371762753, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12601416371762753, + "reward_after_std": 0.5985247995704412, + "reward_before_mean": 0.2089059054851532, + "reward_before_std": 0.584993964061141, + "reward_change_max": 0.000914938747882843, + "reward_change_mean": -0.08289175282698125, + "reward_change_min": -0.13586501125246286, + "reward_change_std": 0.054501404985785484, + "reward_std": 0.598524821922183, + "rewards/cosine_scaled_reward": -0.11429706169292331, + "rewards/format_reward": 0.4375000037252903, + "step": 123 + }, + { + "advantage_max": 1.4781872406601906, + "advantage_mean": -2.483527050678447e-09, + "advantage_min": -1.183127485215664, + "advantage_std": 0.9998154491186142, + "completion_length": 2473.4792251586914, + "epoch": 0.1417142857142857, + "grad_norm": 0.23775885999202728, + "kl": 0.006164073944091797, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0002, + "reward": 0.46917978674173355, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.46917978674173355, + "reward_after_std": 0.8176695536822081, + "reward_before_mean": 0.5814000591635704, + "reward_before_std": 0.8219790123403072, + "reward_change_max": 0.0, + "reward_change_mean": -0.11222026916220784, + "reward_change_min": -0.19535045605152845, + "reward_change_std": 0.08150730514898896, + "reward_std": 0.8176696002483368, + "rewards/cosine_scaled_reward": 0.04070002248045057, + "rewards/format_reward": 0.5000000074505806, + "step": 124 + }, + { + "advantage_max": 1.4708463251590729, + "advantage_mean": 3.632158163124899e-08, + "advantage_min": -1.1529624238610268, + "advantage_std": 0.9997900947928429, + "completion_length": 2675.0625, + "epoch": 0.14285714285714285, + "grad_norm": 0.1962793618440628, + "kl": 0.0011947154998779297, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "reward": 0.34733692556619644, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34733692556619644, + "reward_after_std": 0.6260436102747917, + "reward_before_mean": 0.4519122443161905, + "reward_before_std": 0.6228771880269051, + "reward_change_max": 0.00011279433965682983, + "reward_change_mean": -0.10457530617713928, + "reward_change_min": -0.17521720752120018, + "reward_change_std": 0.06829811399802566, + "reward_std": 0.6260436214506626, + "rewards/cosine_scaled_reward": 0.04887278733076528, + "rewards/format_reward": 0.35416666977107525, + "step": 125 + }, + { + "advantage_max": 1.6396605372428894, + "advantage_mean": 2.3593506592867186e-08, + "advantage_min": -0.9779276698827744, + "advantage_std": 0.9998251125216484, + "completion_length": 2967.4583740234375, + "epoch": 0.144, + "grad_norm": 0.1698831021785736, + "kl": 0.0008451938629150391, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0, + "reward": 0.2825273647904396, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.2825273647904396, + "reward_after_std": 0.7569136507809162, + "reward_before_mean": 0.37666072975844145, + "reward_before_std": 0.7486708983778954, + "reward_change_max": 0.00029872357845306396, + "reward_change_mean": -0.09413336007855833, + "reward_change_min": -0.17554103955626488, + "reward_change_std": 0.06924102012999356, + "reward_std": 0.7569136843085289, + "rewards/cosine_scaled_reward": -0.030419636983424425, + "rewards/format_reward": 0.43750000558793545, + "step": 126 + }, + { + "advantage_max": 1.590319201350212, + "advantage_mean": -2.2351742345882997e-08, + "advantage_min": -1.1233162581920624, + "advantage_std": 0.999773882329464, + "completion_length": 3154.3333892822266, + "epoch": 0.14514285714285713, + "grad_norm": 0.172366663813591, + "kl": 0.0011754035949707031, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0, + "reward": -0.15957457711920142, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.15957457711920142, + "reward_after_std": 0.5303029231727123, + "reward_before_mean": -0.09974119672551751, + "reward_before_std": 0.5254558436572552, + "reward_change_max": 0.0, + "reward_change_mean": -0.05983339576050639, + "reward_change_min": -0.10755488555878401, + "reward_change_std": 0.04164266283623874, + "reward_std": 0.5303029306232929, + "rewards/cosine_scaled_reward": -0.2165372660383582, + "rewards/format_reward": 0.33333334140479565, + "step": 127 + }, + { + "advantage_max": 1.3794664293527603, + "advantage_mean": -3.3527613574335646e-08, + "advantage_min": -1.1035713329911232, + "advantage_std": 0.9998351410031319, + "completion_length": 2929.9375076293945, + "epoch": 0.1462857142857143, + "grad_norm": 0.1915324479341507, + "kl": 0.0020612478256225586, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0001, + "reward": 0.5117706246674061, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5117706246674061, + "reward_after_std": 0.8629733137786388, + "reward_before_mean": 0.62900335714221, + "reward_before_std": 0.8841073550283909, + "reward_change_max": 0.00022004544734954834, + "reward_change_mean": -0.11723275459371507, + "reward_change_min": -0.22613342199474573, + "reward_change_std": 0.09391661314293742, + "reward_std": 0.8629733547568321, + "rewards/cosine_scaled_reward": 0.09575167298316956, + "rewards/format_reward": 0.4375000037252903, + "step": 128 + }, + { + "advantage_max": 1.6466744989156723, + "advantage_mean": 4.253039909141165e-08, + "advantage_min": -0.9315560981631279, + "advantage_std": 0.9997737854719162, + "completion_length": 3490.229217529297, + "epoch": 0.14742857142857144, + "grad_norm": 0.16802391409873962, + "kl": 0.0023403167724609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0001, + "reward": -0.21124888956546783, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.21124888956546783, + "reward_after_std": 0.849559772759676, + "reward_before_mean": -0.1677091233432293, + "reward_before_std": 0.8529805261641741, + "reward_change_max": 0.000441625714302063, + "reward_change_mean": -0.04353977448772639, + "reward_change_min": -0.11069826781749725, + "reward_change_std": 0.04446559911593795, + "reward_std": 0.849559810012579, + "rewards/cosine_scaled_reward": -0.15677122166380286, + "rewards/format_reward": 0.1458333358168602, + "step": 129 + }, + { + "advantage_max": 1.3871336728334427, + "advantage_mean": -1.241763691872677e-09, + "advantage_min": -1.234124794602394, + "advantage_std": 0.9997271597385406, + "completion_length": 3101.0416717529297, + "epoch": 0.14857142857142858, + "grad_norm": 0.19376447796821594, + "kl": 0.002029895782470703, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0001, + "reward": -0.15444228425621986, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.15444228425621986, + "reward_after_std": 0.44813977740705013, + "reward_before_mean": -0.08898514322936535, + "reward_before_std": 0.4583962671458721, + "reward_change_max": 0.0003493502736091614, + "reward_change_mean": -0.0654571489430964, + "reward_change_min": -0.1214989572763443, + "reward_change_std": 0.048809306579642, + "reward_std": 0.4481397867202759, + "rewards/cosine_scaled_reward": -0.13824257254600525, + "rewards/format_reward": 0.1875, + "step": 130 + }, + { + "advantage_max": 1.4599091708660126, + "advantage_mean": -1.9247333615801665e-08, + "advantage_min": -1.2544859647750854, + "advantage_std": 0.999752089381218, + "completion_length": 2915.7916679382324, + "epoch": 0.14971428571428572, + "grad_norm": 0.18682855367660522, + "kl": 0.003175973892211914, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0001, + "reward": 0.5291292034089565, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.5291292034089565, + "reward_after_std": 0.6431342400610447, + "reward_before_mean": 0.6541959419846535, + "reward_before_std": 0.6478817011229694, + "reward_change_max": 0.0002513080835342407, + "reward_change_mean": -0.12506670271977782, + "reward_change_min": -0.20734062790870667, + "reward_change_std": 0.08662228705361485, + "reward_std": 0.6431342735886574, + "rewards/cosine_scaled_reward": 0.13959795609116554, + "rewards/format_reward": 0.3750000074505806, + "step": 131 + }, + { + "advantage_max": 1.4901214316487312, + "advantage_mean": 9.002783629696864e-09, + "advantage_min": -1.1353254616260529, + "advantage_std": 0.9997986853122711, + "completion_length": 2639.0208435058594, + "epoch": 0.15085714285714286, + "grad_norm": 0.1656983196735382, + "kl": 0.0013974905014038086, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0001, + "reward": 0.3720488026738167, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3720488026738167, + "reward_after_std": 0.8538518510758877, + "reward_before_mean": 0.4742213059216738, + "reward_before_std": 0.8687582034617662, + "reward_change_max": 0.0, + "reward_change_mean": -0.1021725102327764, + "reward_change_min": -0.19822395592927933, + "reward_change_std": 0.07776921393815428, + "reward_std": 0.8538518510758877, + "rewards/cosine_scaled_reward": 0.01836064923554659, + "rewards/format_reward": 0.43750000558793545, + "step": 132 + }, + { + "advantage_max": 1.3228430151939392, + "advantage_mean": 2.7318796114172983e-08, + "advantage_min": -1.146996609866619, + "advantage_std": 0.9996031150221825, + "completion_length": 3313.8958435058594, + "epoch": 0.152, + "grad_norm": 0.207058846950531, + "kl": 0.0017418861389160156, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0001, + "reward": -0.23692141473293304, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.23692141473293304, + "reward_after_std": 0.5058922655880451, + "reward_before_mean": -0.18047994375228882, + "reward_before_std": 0.5220023482106626, + "reward_change_max": 0.00026736408472061157, + "reward_change_mean": -0.05644145607948303, + "reward_change_min": -0.1346017699688673, + "reward_change_std": 0.0541907181032002, + "reward_std": 0.5058922655880451, + "rewards/cosine_scaled_reward": -0.18398998258635402, + "rewards/format_reward": 0.1875000074505806, + "step": 133 + }, + { + "advantage_max": 1.4404646754264832, + "advantage_mean": -8.133550766231679e-08, + "advantage_min": -1.1880767047405243, + "advantage_std": 0.9997904896736145, + "completion_length": 2402.2291946411133, + "epoch": 0.15314285714285714, + "grad_norm": 0.2038743793964386, + "kl": 0.0018963813781738281, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0001, + "reward": 0.526889817789197, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.526889817789197, + "reward_after_std": 0.7365444861352444, + "reward_before_mean": 0.6467488976195455, + "reward_before_std": 0.7403203975409269, + "reward_change_max": 0.00013075023889541626, + "reward_change_mean": -0.11985907377675176, + "reward_change_min": -0.21406998671591282, + "reward_change_std": 0.08232975355349481, + "reward_std": 0.7365445084869862, + "rewards/cosine_scaled_reward": 0.04212443716824055, + "rewards/format_reward": 0.5625000055879354, + "step": 134 + }, + { + "advantage_max": 1.5359413474798203, + "advantage_mean": -1.0399769101443468e-07, + "advantage_min": -0.9698811173439026, + "advantage_std": 0.9997820109128952, + "completion_length": 2010.0208892822266, + "epoch": 0.15428571428571428, + "grad_norm": 0.2191932201385498, + "kl": 0.0039234161376953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0002, + "reward": 0.9419772960245609, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9419772960245609, + "reward_after_std": 0.7658766210079193, + "reward_before_mean": 1.1002132706344128, + "reward_before_std": 0.7517257547006011, + "reward_change_max": 0.0, + "reward_change_mean": -0.15823592338711023, + "reward_change_min": -0.2634607693180442, + "reward_change_std": 0.10368040250614285, + "reward_std": 0.7658766359090805, + "rewards/cosine_scaled_reward": 0.2271899450570345, + "rewards/format_reward": 0.6458333414047956, + "step": 135 + }, + { + "advantage_max": 1.3615388423204422, + "advantage_mean": -7.450580152834618e-09, + "advantage_min": -1.1105039417743683, + "advantage_std": 0.9997578710317612, + "completion_length": 2758.9791870117188, + "epoch": 0.15542857142857142, + "grad_norm": 0.2017042189836502, + "kl": 0.0016570091247558594, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0001, + "reward": 0.3180756554938853, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3180756554938853, + "reward_after_std": 0.9733518315479159, + "reward_before_mean": 0.41393173237884184, + "reward_before_std": 1.0054605212062597, + "reward_change_max": 0.0001665651798248291, + "reward_change_mean": -0.09585604723542929, + "reward_change_min": -0.2182399481534958, + "reward_change_std": 0.08857251331210136, + "reward_std": 0.9733518492430449, + "rewards/cosine_scaled_reward": 0.00904920045286417, + "rewards/format_reward": 0.39583333395421505, + "step": 136 + }, + { + "advantage_max": 1.5188078880310059, + "advantage_mean": 1.3659397168908072e-08, + "advantage_min": -0.969260111451149, + "advantage_std": 0.9996920749545097, + "completion_length": 3099.6875228881836, + "epoch": 0.15657142857142858, + "grad_norm": 0.1994733065366745, + "kl": 0.0014867782592773438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0001, + "reward": -0.09599984437227249, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.09599984437227249, + "reward_after_std": 0.6841760762035847, + "reward_before_mean": -0.033508097752928734, + "reward_before_std": 0.6963941073045135, + "reward_change_max": 0.0004359111189842224, + "reward_change_mean": -0.06249175767879933, + "reward_change_min": -0.14749382436275482, + "reward_change_std": 0.05822044319938868, + "reward_std": 0.684176079928875, + "rewards/cosine_scaled_reward": -0.15217071864753962, + "rewards/format_reward": 0.2708333358168602, + "step": 137 + }, + { + "advantage_max": 1.5097112655639648, + "advantage_mean": 3.91155504098073e-08, + "advantage_min": -1.1656879857182503, + "advantage_std": 0.9996949210762978, + "completion_length": 2763.395866394043, + "epoch": 0.15771428571428572, + "grad_norm": 0.18249079585075378, + "kl": 0.004346370697021484, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0002, + "reward": 0.09496973222121596, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09496973222121596, + "reward_after_std": 0.4213468311354518, + "reward_before_mean": 0.1828093589283526, + "reward_before_std": 0.4119232380762696, + "reward_change_max": 0.00017911195755004883, + "reward_change_mean": -0.0878396132029593, + "reward_change_min": -0.14762252569198608, + "reward_change_std": 0.05701114097610116, + "reward_std": 0.4213468497619033, + "rewards/cosine_scaled_reward": -0.14817864634096622, + "rewards/format_reward": 0.4791666716337204, + "step": 138 + }, + { + "advantage_max": 1.5847613364458084, + "advantage_mean": -2.173085378309736e-09, + "advantage_min": -1.175045009702444, + "advantage_std": 0.9997278079390526, + "completion_length": 3105.812545776367, + "epoch": 0.15885714285714286, + "grad_norm": 0.20554403960704803, + "kl": 0.0026154518127441406, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0001, + "reward": 0.07944915629923344, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07944915629923344, + "reward_after_std": 0.7219101591035724, + "reward_before_mean": 0.1550278328359127, + "reward_before_std": 0.719237182289362, + "reward_change_max": 0.00016336888074874878, + "reward_change_mean": -0.07557868165895343, + "reward_change_min": -0.14963595662266016, + "reward_change_std": 0.05737433675676584, + "reward_std": 0.7219101591035724, + "rewards/cosine_scaled_reward": -0.12040275533217937, + "rewards/format_reward": 0.39583333767950535, + "step": 139 + }, + { + "advantage_max": 1.6046061217784882, + "advantage_mean": 6.084640968850863e-08, + "advantage_min": -1.058633465319872, + "advantage_std": 0.999708391726017, + "completion_length": 2978.166748046875, + "epoch": 0.16, + "grad_norm": 0.1818619817495346, + "kl": 0.004107475280761719, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0002, + "reward": 0.10122170485556126, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10122170485556126, + "reward_after_std": 0.6357193235307932, + "reward_before_mean": 0.18152375193312764, + "reward_before_std": 0.623972998932004, + "reward_change_max": 0.0006061270833015442, + "reward_change_mean": -0.08030204870738089, + "reward_change_min": -0.13752434495836496, + "reward_change_std": 0.05715995456557721, + "reward_std": 0.6357193402945995, + "rewards/cosine_scaled_reward": -0.06548812706023455, + "rewards/format_reward": 0.3125000037252903, + "step": 140 + }, + { + "advantage_max": 1.4406469464302063, + "advantage_mean": 6.643434380393387e-08, + "advantage_min": -1.0366877242922783, + "advantage_std": 0.9997889995574951, + "completion_length": 2887.916732788086, + "epoch": 0.16114285714285714, + "grad_norm": 0.18227314949035645, + "kl": 0.0030608177185058594, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0001, + "reward": 0.023926494643092155, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.023926494643092155, + "reward_after_std": 0.7154865898191929, + "reward_before_mean": 0.09898238629102707, + "reward_before_std": 0.736889086663723, + "reward_change_max": 0.000539436936378479, + "reward_change_mean": -0.07505585625767708, + "reward_change_min": -0.17021092772483826, + "reward_change_std": 0.06994302081875503, + "reward_std": 0.7154866233468056, + "rewards/cosine_scaled_reward": -0.19009215137339197, + "rewards/format_reward": 0.479166679084301, + "step": 141 + }, + { + "advantage_max": 1.3935022801160812, + "advantage_mean": -9.934107536579972e-09, + "advantage_min": -1.0459297895431519, + "advantage_std": 0.9998486042022705, + "completion_length": 2663.416732788086, + "epoch": 0.16228571428571428, + "grad_norm": 0.19613024592399597, + "kl": 0.0020918846130371094, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0001, + "reward": 0.2961191050708294, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2961191050708294, + "reward_after_std": 0.9062751308083534, + "reward_before_mean": 0.3927744999527931, + "reward_before_std": 0.939884040504694, + "reward_change_max": 0.00010180473327636719, + "reward_change_mean": -0.09665541374124587, + "reward_change_min": -0.22028039954602718, + "reward_change_std": 0.09074757865164429, + "reward_std": 0.9062751643359661, + "rewards/cosine_scaled_reward": -0.07444609270896763, + "rewards/format_reward": 0.5416666772216558, + "step": 142 + }, + { + "advantage_max": 1.5322048366069794, + "advantage_mean": -1.3504177776013648e-08, + "advantage_min": -1.0548894479870796, + "advantage_std": 0.9998358264565468, + "completion_length": 2548.041732788086, + "epoch": 0.16342857142857142, + "grad_norm": 0.30651625990867615, + "kl": 0.00348663330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0001, + "reward": 0.1695709004998207, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1695709004998207, + "reward_after_std": 0.8127848580479622, + "reward_before_mean": 0.2530234828591347, + "reward_before_std": 0.8218814432621002, + "reward_change_max": 0.0, + "reward_change_mean": -0.08345259842462838, + "reward_change_min": -0.16000903863459826, + "reward_change_std": 0.06607106560841203, + "reward_std": 0.8127848766744137, + "rewards/cosine_scaled_reward": -0.12348826136440039, + "rewards/format_reward": 0.5000000093132257, + "step": 143 + }, + { + "advantage_max": 1.6216581761837006, + "advantage_mean": 3.7252856355252106e-09, + "advantage_min": -1.0678609758615494, + "advantage_std": 0.9997515752911568, + "completion_length": 2930.3125228881836, + "epoch": 0.16457142857142856, + "grad_norm": 0.20165316760540009, + "kl": 0.0022115707397460938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0001, + "reward": 0.2244191411882639, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2244191411882639, + "reward_after_std": 0.8113269321620464, + "reward_before_mean": 0.310532383620739, + "reward_before_std": 0.8061074055731297, + "reward_change_max": 0.0006739497184753418, + "reward_change_mean": -0.08611326036043465, + "reward_change_min": -0.172080148011446, + "reward_change_std": 0.06757508893497288, + "reward_std": 0.8113269321620464, + "rewards/cosine_scaled_reward": -0.03223381540738046, + "rewards/format_reward": 0.37500000558793545, + "step": 144 + }, + { + "advantage_max": 1.495003655552864, + "advantage_mean": -4.346171922353648e-08, + "advantage_min": -1.0665920972824097, + "advantage_std": 0.999808594584465, + "completion_length": 1915.729232788086, + "epoch": 0.1657142857142857, + "grad_norm": 0.30377116799354553, + "kl": 0.002331256866455078, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0001, + "reward": 0.47825442533940077, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.47825442533940077, + "reward_after_std": 0.7022902369499207, + "reward_before_mean": 0.5941783878952265, + "reward_before_std": 0.7010281160473824, + "reward_change_max": 0.0, + "reward_change_mean": -0.11592395044863224, + "reward_change_min": -0.2171989856287837, + "reward_change_std": 0.08219394693151116, + "reward_std": 0.7022902630269527, + "rewards/cosine_scaled_reward": -0.04666082002222538, + "rewards/format_reward": 0.6875000074505806, + "step": 145 + }, + { + "advantage_max": 1.4068877398967743, + "advantage_mean": 2.980232360894064e-08, + "advantage_min": -1.1271022856235504, + "advantage_std": 0.9998052194714546, + "completion_length": 2556.812530517578, + "epoch": 0.16685714285714287, + "grad_norm": 0.18746472895145416, + "kl": 0.0015310049057006836, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0001, + "reward": 0.08086108416318893, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08086108416318893, + "reward_after_std": 0.665203008800745, + "reward_before_mean": 0.1612489800900221, + "reward_before_std": 0.6762508265674114, + "reward_change_max": 0.00020448118448257446, + "reward_change_mean": -0.08038788754492998, + "reward_change_min": -0.17192814219743013, + "reward_change_std": 0.06585183460265398, + "reward_std": 0.6652030423283577, + "rewards/cosine_scaled_reward": -0.20062552206218243, + "rewards/format_reward": 0.5625000018626451, + "step": 146 + }, + { + "advantage_max": 1.5742152035236359, + "advantage_mean": 6.829699250587851e-09, + "advantage_min": -1.0765742659568787, + "advantage_std": 0.9998117387294769, + "completion_length": 2837.437530517578, + "epoch": 0.168, + "grad_norm": 0.19602230191230774, + "kl": 0.00308990478515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0001, + "reward": 0.08351327944546938, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08351327944546938, + "reward_after_std": 0.6948803104460239, + "reward_before_mean": 0.16174614802002907, + "reward_before_std": 0.6967041678726673, + "reward_change_max": 0.0001440197229385376, + "reward_change_mean": -0.07823285344056785, + "reward_change_min": -0.1446439679712057, + "reward_change_std": 0.05933955032378435, + "reward_std": 0.694880336523056, + "rewards/cosine_scaled_reward": -0.10662694118218496, + "rewards/format_reward": 0.3750000037252903, + "step": 147 + }, + { + "advantage_max": 1.5195300206542015, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -1.1114639192819595, + "advantage_std": 0.9998055621981621, + "completion_length": 2250.354202270508, + "epoch": 0.16914285714285715, + "grad_norm": 0.16835594177246094, + "kl": 0.0018110275268554688, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0001, + "reward": 0.31845202576369047, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.31845202576369047, + "reward_after_std": 0.6970034800469875, + "reward_before_mean": 0.4189865104854107, + "reward_before_std": 0.6898063011467457, + "reward_change_max": 0.0010981261730194092, + "reward_change_mean": -0.10053447051905096, + "reward_change_min": -0.1857745312154293, + "reward_change_std": 0.07377739227376878, + "reward_std": 0.6970034874975681, + "rewards/cosine_scaled_reward": -0.10300674941390753, + "rewards/format_reward": 0.6250000111758709, + "step": 148 + }, + { + "advantage_max": 1.4426256641745567, + "advantage_mean": 1.924733339375706e-08, + "advantage_min": -1.2441855445504189, + "advantage_std": 0.9998246803879738, + "completion_length": 2690.6458740234375, + "epoch": 0.1702857142857143, + "grad_norm": 0.3846557140350342, + "kl": 0.016859054565429688, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0007, + "reward": 0.3900892809033394, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3900892809033394, + "reward_after_std": 0.8488783277571201, + "reward_before_mean": 0.49406828731298447, + "reward_before_std": 0.8630441017448902, + "reward_change_max": 0.00035718828439712524, + "reward_change_mean": -0.10397897334769368, + "reward_change_min": -0.1900622034445405, + "reward_change_std": 0.07942482200451195, + "reward_std": 0.8488783352077007, + "rewards/cosine_scaled_reward": -0.013382526114583015, + "rewards/format_reward": 0.5208333414047956, + "step": 149 + }, + { + "advantage_max": 1.6954808682203293, + "advantage_mean": -1.8626452047421083e-08, + "advantage_min": -0.899493508040905, + "advantage_std": 0.9998697191476822, + "completion_length": 2483.479217529297, + "epoch": 0.17142857142857143, + "grad_norm": 0.224741131067276, + "kl": 0.0034275054931640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0001, + "reward": 0.13289955770596862, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13289955770596862, + "reward_after_std": 0.9514123201370239, + "reward_before_mean": 0.20388731081038713, + "reward_before_std": 0.9427561610937119, + "reward_change_max": 0.0, + "reward_change_mean": -0.07098775426857173, + "reward_change_min": -0.14540079329162836, + "reward_change_std": 0.057504348922520876, + "reward_std": 0.9514123611152172, + "rewards/cosine_scaled_reward": -0.12722301567555405, + "rewards/format_reward": 0.4583333358168602, + "step": 150 + }, + { + "advantage_max": 1.5135483890771866, + "advantage_mean": -8.816520657983773e-08, + "advantage_min": -1.2416688278317451, + "advantage_std": 0.9997973516583443, + "completion_length": 2449.729232788086, + "epoch": 0.17257142857142857, + "grad_norm": 0.2216656506061554, + "kl": 0.0034198760986328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0001, + "reward": 0.48375364703679224, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.48375364703679224, + "reward_after_std": 0.6876551508903503, + "reward_before_mean": 0.5997938713990152, + "reward_before_std": 0.6790660806000233, + "reward_change_max": 8.161365985870361e-05, + "reward_change_mean": -0.11604023166000843, + "reward_change_min": -0.18250472843647003, + "reward_change_std": 0.0721652910578996, + "reward_std": 0.6876551546156406, + "rewards/cosine_scaled_reward": 0.01864692009985447, + "rewards/format_reward": 0.5625000204890966, + "step": 151 + }, + { + "advantage_max": 1.4979790449142456, + "advantage_mean": 5.960464721788128e-08, + "advantage_min": -1.2011219523847103, + "advantage_std": 0.9997016414999962, + "completion_length": 2874.708366394043, + "epoch": 0.1737142857142857, + "grad_norm": 0.23708555102348328, + "kl": 0.0017731189727783203, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0001, + "reward": -0.11927625350654125, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11927625350654125, + "reward_after_std": 0.4810872804373503, + "reward_before_mean": -0.0537937730550766, + "reward_before_std": 0.476759847253561, + "reward_change_max": 0.00021364539861679077, + "reward_change_mean": -0.06548247905448079, + "reward_change_min": -0.110728794708848, + "reward_change_std": 0.043877444695681334, + "reward_std": 0.48108728788793087, + "rewards/cosine_scaled_reward": -0.19356355350464582, + "rewards/format_reward": 0.33333333395421505, + "step": 152 + }, + { + "advantage_max": 1.5236635357141495, + "advantage_mean": 2.6697914878859308e-08, + "advantage_min": -1.2076719664037228, + "advantage_std": 0.9997353553771973, + "completion_length": 2614.6459045410156, + "epoch": 0.17485714285714285, + "grad_norm": 0.33291175961494446, + "kl": 0.003040790557861328, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0001, + "reward": 0.16083110310137272, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.16083110310137272, + "reward_after_std": 0.5439409669488668, + "reward_before_mean": 0.25147375743836164, + "reward_before_std": 0.539504618383944, + "reward_change_max": 0.0001697242259979248, + "reward_change_mean": -0.09064263617619872, + "reward_change_min": -0.15238108951598406, + "reward_change_std": 0.06364887952804565, + "reward_std": 0.5439409743994474, + "rewards/cosine_scaled_reward": -0.11384646594524384, + "rewards/format_reward": 0.4791666679084301, + "step": 153 + }, + { + "advantage_max": 1.6322798505425453, + "advantage_mean": -1.9868215073159945e-08, + "advantage_min": -0.8856546506285667, + "advantage_std": 0.9998943731188774, + "completion_length": 3126.916717529297, + "epoch": 0.176, + "grad_norm": 0.15531423687934875, + "kl": 0.0019125938415527344, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0001, + "reward": 0.2853868268430233, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.2853868268430233, + "reward_after_std": 1.2025733813643456, + "reward_before_mean": 0.3669017466454534, + "reward_before_std": 1.2222107723355293, + "reward_change_max": 0.0006478503346443176, + "reward_change_mean": -0.081514913123101, + "reward_change_min": -0.20529971737414598, + "reward_change_std": 0.08442414319142699, + "reward_std": 1.2025734297931194, + "rewards/cosine_scaled_reward": -0.024882478785002604, + "rewards/format_reward": 0.4166666753590107, + "step": 154 + }, + { + "advantage_max": 1.6839442551136017, + "advantage_mean": -4.221995830722136e-08, + "advantage_min": -0.941412091255188, + "advantage_std": 0.9998479187488556, + "completion_length": 2498.583381652832, + "epoch": 0.17714285714285713, + "grad_norm": 0.19921159744262695, + "kl": 0.0031003952026367188, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0001, + "reward": 0.569856112357229, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.569856112357229, + "reward_after_std": 0.9291363656520844, + "reward_before_mean": 0.6841814294457436, + "reward_before_std": 0.9151368550956249, + "reward_change_max": 0.0002962574362754822, + "reward_change_mean": -0.11432529683224857, + "reward_change_min": -0.20955283753573895, + "reward_change_std": 0.08066396252252162, + "reward_std": 0.9291363768279552, + "rewards/cosine_scaled_reward": 0.07125736703164876, + "rewards/format_reward": 0.5416666679084301, + "step": 155 + }, + { + "advantage_max": 1.5021150261163712, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -1.1296715438365936, + "advantage_std": 0.9998017847537994, + "completion_length": 2816.125030517578, + "epoch": 0.1782857142857143, + "grad_norm": 0.19361698627471924, + "kl": 0.0018634796142578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0001, + "reward": 0.07529625482857227, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07529625482857227, + "reward_after_std": 0.6703125648200512, + "reward_before_mean": 0.15437587723135948, + "reward_before_std": 0.6760777495801449, + "reward_change_max": 0.0002835988998413086, + "reward_change_mean": -0.07907962403260171, + "reward_change_min": -0.15298352297395468, + "reward_change_std": 0.060570935369469225, + "reward_std": 0.6703126020729542, + "rewards/cosine_scaled_reward": -0.08947873779106885, + "rewards/format_reward": 0.3333333358168602, + "step": 156 + }, + { + "advantage_max": 1.5404149293899536, + "advantage_mean": 8.863086731203396e-08, + "advantage_min": -1.1716954857110977, + "advantage_std": 0.9997392222285271, + "completion_length": 2796.541702270508, + "epoch": 0.17942857142857144, + "grad_norm": 0.28218579292297363, + "kl": 0.004803657531738281, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0002, + "reward": -0.06934672966599464, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06934672966599464, + "reward_after_std": 0.5129354447126389, + "reward_before_mean": 0.001629834994673729, + "reward_before_std": 0.5202076155692339, + "reward_change_max": 6.977468729019165e-05, + "reward_change_mean": -0.07097655383404344, + "reward_change_min": -0.1306041106581688, + "reward_change_std": 0.05284939787816256, + "reward_std": 0.5129354521632195, + "rewards/cosine_scaled_reward": -0.22835176065564156, + "rewards/format_reward": 0.4583333469927311, + "step": 157 + }, + { + "advantage_max": 1.551323488354683, + "advantage_mean": -5.463759245039057e-08, + "advantage_min": -0.998228020966053, + "advantage_std": 0.9998158514499664, + "completion_length": 3241.2083740234375, + "epoch": 0.18057142857142858, + "grad_norm": 0.1757160723209381, + "kl": 0.0034437179565429688, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0001, + "reward": 0.4231220823712647, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4231220823712647, + "reward_after_std": 0.6513553634285927, + "reward_before_mean": 0.533676290884614, + "reward_before_std": 0.6418947353959084, + "reward_change_max": 0.00046910345554351807, + "reward_change_mean": -0.11055420152842999, + "reward_change_min": -0.19882715586572886, + "reward_change_std": 0.07370695215649903, + "reward_std": 0.651355367153883, + "rewards/cosine_scaled_reward": 0.11058812821283937, + "rewards/format_reward": 0.31250000558793545, + "step": 158 + }, + { + "advantage_max": 1.4679991006851196, + "advantage_mean": -2.2351742678949904e-08, + "advantage_min": -1.1628785654902458, + "advantage_std": 0.9997387602925301, + "completion_length": 2613.8958740234375, + "epoch": 0.18171428571428572, + "grad_norm": 0.1862615942955017, + "kl": 0.003566741943359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0001, + "reward": -0.05008651316165924, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.05008651316165924, + "reward_after_std": 0.4757378753274679, + "reward_before_mean": 0.023678398691117764, + "reward_before_std": 0.48022647947072983, + "reward_change_max": 0.0001963600516319275, + "reward_change_mean": -0.07376493467018008, + "reward_change_min": -0.13229443226009607, + "reward_change_std": 0.05242071067914367, + "reward_std": 0.4757378753274679, + "rewards/cosine_scaled_reward": -0.22774413786828518, + "rewards/format_reward": 0.4791666679084301, + "step": 159 + }, + { + "advantage_max": 1.4685984998941422, + "advantage_mean": -1.1796752907855534e-08, + "advantage_min": -1.1256564185023308, + "advantage_std": 0.9998102709650993, + "completion_length": 2535.125015258789, + "epoch": 0.18285714285714286, + "grad_norm": 0.2220413088798523, + "kl": 0.005443572998046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0002, + "reward": 0.42259710282087326, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.42259710282087326, + "reward_after_std": 0.904817771166563, + "reward_before_mean": 0.5270836362615228, + "reward_before_std": 0.9138357825577259, + "reward_change_max": 7.965415716171265e-05, + "reward_change_mean": -0.10448656626977026, + "reward_change_min": -0.21647041756659746, + "reward_change_std": 0.08518222416751087, + "reward_std": 0.9048177935183048, + "rewards/cosine_scaled_reward": 0.06562515255063772, + "rewards/format_reward": 0.39583333395421505, + "step": 160 + }, + { + "advantage_max": 1.5048878341913223, + "advantage_mean": -1.9247333005179e-08, + "advantage_min": -1.1224373206496239, + "advantage_std": 0.9997854977846146, + "completion_length": 2536.5625610351562, + "epoch": 0.184, + "grad_norm": 0.21401101350784302, + "kl": 0.0053272247314453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0002, + "reward": 0.34695685049518943, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34695685049518943, + "reward_after_std": 0.7117992416024208, + "reward_before_mean": 0.44934904761612415, + "reward_before_std": 0.7109068520367146, + "reward_change_max": 3.3311545848846436e-05, + "reward_change_mean": -0.10239221714437008, + "reward_change_min": -0.17907678615301847, + "reward_change_std": 0.06959949876181781, + "reward_std": 0.7117992583662271, + "rewards/cosine_scaled_reward": -0.04615880874916911, + "rewards/format_reward": 0.5416666697710752, + "step": 161 + }, + { + "advantage_max": 1.2772861272096634, + "advantage_mean": -2.483526606589237e-09, + "advantage_min": -1.1813317835330963, + "advantage_std": 0.9998279586434364, + "completion_length": 2731.312515258789, + "epoch": 0.18514285714285714, + "grad_norm": 0.2270742654800415, + "kl": 0.006060600280761719, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0002, + "reward": 0.3163473308086395, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3163473308086395, + "reward_after_std": 0.8020492419600487, + "reward_before_mean": 0.41752319410443306, + "reward_before_std": 0.827314168214798, + "reward_change_max": 0.0001702532172203064, + "reward_change_mean": -0.10117586515843868, + "reward_change_min": -0.2129524489864707, + "reward_change_std": 0.08441391820088029, + "reward_std": 0.8020492419600487, + "rewards/cosine_scaled_reward": -0.009988403879106045, + "rewards/format_reward": 0.43750000558793545, + "step": 162 + }, + { + "advantage_max": 1.4550439938902855, + "advantage_mean": 3.104407619858307e-09, + "advantage_min": -1.16783557087183, + "advantage_std": 0.9997671395540237, + "completion_length": 2460.4583892822266, + "epoch": 0.18628571428571428, + "grad_norm": 0.20131473243236542, + "kl": 0.0035390853881835938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0001, + "reward": 0.6401930401916616, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6401930401916616, + "reward_after_std": 0.7289017662405968, + "reward_before_mean": 0.7708904361352324, + "reward_before_std": 0.7250148011371493, + "reward_change_max": 0.0, + "reward_change_mean": -0.13069738494232297, + "reward_change_min": -0.21607584971934557, + "reward_change_std": 0.0865078882779926, + "reward_std": 0.7289017718285322, + "rewards/cosine_scaled_reward": 0.09377853712067008, + "rewards/format_reward": 0.5833333488553762, + "step": 163 + }, + { + "advantage_max": 1.3210031017661095, + "advantage_mean": -5.774200284580644e-08, + "advantage_min": -1.2639915123581886, + "advantage_std": 0.9998083263635635, + "completion_length": 2332.500045776367, + "epoch": 0.18742857142857142, + "grad_norm": 0.20200245082378387, + "kl": 0.004815101623535156, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0002, + "reward": 0.545483585447073, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.545483585447073, + "reward_after_std": 0.7864016853272915, + "reward_before_mean": 0.6684325840324163, + "reward_before_std": 0.8049891255795956, + "reward_change_max": 0.00083199143409729, + "reward_change_mean": -0.12294902792200446, + "reward_change_min": -0.23326233215630054, + "reward_change_std": 0.09470712952315807, + "reward_std": 0.7864017356187105, + "rewards/cosine_scaled_reward": 0.042549606412649155, + "rewards/format_reward": 0.5833333414047956, + "step": 164 + }, + { + "advantage_max": 1.3517840281128883, + "advantage_mean": -3.4148494032493204e-09, + "advantage_min": -1.3420864343643188, + "advantage_std": 0.9997818022966385, + "completion_length": 2552.541679382324, + "epoch": 0.18857142857142858, + "grad_norm": 0.22521643340587616, + "kl": 0.004207611083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0002, + "reward": -0.1310715600848198, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1310715600848198, + "reward_after_std": 0.4917597956955433, + "reward_before_mean": -0.06424580817110837, + "reward_before_std": 0.5039008930325508, + "reward_change_max": 7.886439561843872e-05, + "reward_change_mean": -0.0668257491197437, + "reward_change_min": -0.13053180649876595, + "reward_change_std": 0.053368333261460066, + "reward_std": 0.4917598068714142, + "rewards/cosine_scaled_reward": -0.250872902572155, + "rewards/format_reward": 0.4375000074505806, + "step": 165 + }, + { + "advantage_max": 1.5099513083696365, + "advantage_mean": 1.9247334170913177e-08, + "advantage_min": -1.0428318604826927, + "advantage_std": 0.9998095035552979, + "completion_length": 2697.6458740234375, + "epoch": 0.18971428571428572, + "grad_norm": 0.24739572405815125, + "kl": 0.0027518272399902344, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0001, + "reward": 0.15346253104507923, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.15346253104507923, + "reward_after_std": 0.7710051350295544, + "reward_before_mean": 0.23662841320037842, + "reward_before_std": 0.7812362983822823, + "reward_change_max": 0.00016464293003082275, + "reward_change_mean": -0.08316585887223482, + "reward_change_min": -0.1772454548627138, + "reward_change_std": 0.07039047335274518, + "reward_std": 0.7710051573812962, + "rewards/cosine_scaled_reward": -0.10043580364435911, + "rewards/format_reward": 0.4375, + "step": 166 + }, + { + "advantage_max": 1.4335933923721313, + "advantage_mean": -1.5522043650406658e-09, + "advantage_min": -1.2036648765206337, + "advantage_std": 0.9998134821653366, + "completion_length": 2456.541732788086, + "epoch": 0.19085714285714286, + "grad_norm": 0.2020367830991745, + "kl": 0.0030541419982910156, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0001, + "reward": 0.34286654088646173, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.34286654088646173, + "reward_after_std": 0.8823619969189167, + "reward_before_mean": 0.4425388155505061, + "reward_before_std": 0.903926894068718, + "reward_change_max": 0.00046034157276153564, + "reward_change_mean": -0.09967226395383477, + "reward_change_min": -0.22274313494563103, + "reward_change_std": 0.08939659083262086, + "reward_std": 0.882362000644207, + "rewards/cosine_scaled_reward": -0.09123059129342437, + "rewards/format_reward": 0.6250000111758709, + "step": 167 + }, + { + "advantage_max": 1.4103640839457512, + "advantage_mean": -2.173086061096896e-08, + "advantage_min": -1.219421647489071, + "advantage_std": 0.9998368471860886, + "completion_length": 2891.375, + "epoch": 0.192, + "grad_norm": 0.18209148943424225, + "kl": 0.0035066604614257812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0001, + "reward": 0.23698728531599045, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.23698728531599045, + "reward_after_std": 0.8624007292091846, + "reward_before_mean": 0.326487647369504, + "reward_before_std": 0.8801949247717857, + "reward_change_max": 0.0002350136637687683, + "reward_change_mean": -0.08950036205351353, + "reward_change_min": -0.19595735147595406, + "reward_change_std": 0.0768729702103883, + "reward_std": 0.8624007403850555, + "rewards/cosine_scaled_reward": -0.07633952237665653, + "rewards/format_reward": 0.47916668094694614, + "step": 168 + }, + { + "advantage_max": 1.5285159349441528, + "advantage_mean": 3.725290464995368e-08, + "advantage_min": -1.2007123529911041, + "advantage_std": 0.9997627809643745, + "completion_length": 1906.6250457763672, + "epoch": 0.19314285714285714, + "grad_norm": 0.4720163941383362, + "kl": 0.0038690567016601562, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0002, + "reward": 1.2066361154429615, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.2066361154429615, + "reward_after_std": 0.6004377212375402, + "reward_before_mean": 1.3932382222265005, + "reward_before_std": 0.5733307562768459, + "reward_change_max": 0.00018994510173797607, + "reward_change_mean": -0.18660208210349083, + "reward_change_min": -0.27627282589673996, + "reward_change_std": 0.11074877297505736, + "reward_std": 0.6004377249628305, + "rewards/cosine_scaled_reward": 0.3007857669144869, + "rewards/format_reward": 0.7916666697710752, + "step": 169 + }, + { + "advantage_max": 1.613743469119072, + "advantage_mean": 5.3395831089986245e-08, + "advantage_min": -1.0112006813287735, + "advantage_std": 0.999798871576786, + "completion_length": 2178.145896911621, + "epoch": 0.19428571428571428, + "grad_norm": 0.2254769206047058, + "kl": 0.003116130828857422, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0001, + "reward": 0.48212924622930586, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.48212924622930586, + "reward_after_std": 0.6476732306182384, + "reward_before_mean": 0.5986989340744913, + "reward_before_std": 0.6297142971307039, + "reward_change_max": 0.00013228505849838257, + "reward_change_mean": -0.11656963010318577, + "reward_change_min": -0.20530240423977375, + "reward_change_std": 0.08074354752898216, + "reward_std": 0.6476732343435287, + "rewards/cosine_scaled_reward": 0.018099449574947357, + "rewards/format_reward": 0.5625000055879354, + "step": 170 + }, + { + "advantage_max": 1.4391934275627136, + "advantage_mean": -9.934107980669182e-09, + "advantage_min": -1.1576652973890305, + "advantage_std": 0.9997934475541115, + "completion_length": 2325.0000610351562, + "epoch": 0.19542857142857142, + "grad_norm": 0.19575172662734985, + "kl": 0.0027556419372558594, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0001, + "reward": 0.4846016988158226, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4846016988158226, + "reward_after_std": 0.7373366430401802, + "reward_before_mean": 0.6001935666427016, + "reward_before_std": 0.743379769846797, + "reward_change_max": 9.828805923461914e-05, + "reward_change_mean": -0.11559185804799199, + "reward_change_min": -0.20142329763621092, + "reward_change_std": 0.07610836182720959, + "reward_std": 0.7373366467654705, + "rewards/cosine_scaled_reward": 0.029263429809361696, + "rewards/format_reward": 0.5416666679084301, + "step": 171 + }, + { + "advantage_max": 1.5662109777331352, + "advantage_mean": -9.809931267312777e-08, + "advantage_min": -1.1238925158977509, + "advantage_std": 0.9997661337256432, + "completion_length": 2761.604179382324, + "epoch": 0.19657142857142856, + "grad_norm": 0.2239156812429428, + "kl": 0.0050258636474609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0002, + "reward": 0.48637349624186754, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.48637349624186754, + "reward_after_std": 0.5919462293386459, + "reward_before_mean": 0.6056147422641516, + "reward_before_std": 0.5790109075605869, + "reward_change_max": 0.0006321147084236145, + "reward_change_mean": -0.1192412911914289, + "reward_change_min": -0.19068229012191296, + "reward_change_std": 0.07716060453094542, + "reward_std": 0.5919462330639362, + "rewards/cosine_scaled_reward": 0.11530737672001123, + "rewards/format_reward": 0.3750000037252903, + "step": 172 + }, + { + "advantage_max": 1.5436064153909683, + "advantage_mean": -8.785476313111218e-08, + "advantage_min": -0.9907987825572491, + "advantage_std": 0.9997530058026314, + "completion_length": 1651.7917175292969, + "epoch": 0.1977142857142857, + "grad_norm": 0.270830363035202, + "kl": 0.004105567932128906, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0002, + "reward": 0.38264737790450454, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.38264737790450454, + "reward_after_std": 0.6842790376394987, + "reward_before_mean": 0.4890690501779318, + "reward_before_std": 0.6811281414702535, + "reward_change_max": 0.000122949481010437, + "reward_change_mean": -0.10642170021310449, + "reward_change_min": -0.1934090843424201, + "reward_change_std": 0.07530650636181235, + "reward_std": 0.6842790395021439, + "rewards/cosine_scaled_reward": -0.1304654898121953, + "rewards/format_reward": 0.75, + "step": 173 + }, + { + "advantage_max": 1.4847132563591003, + "advantage_mean": 2.2817404077279946e-08, + "advantage_min": -1.3459831699728966, + "advantage_std": 0.9998100623488426, + "completion_length": 2408.666702270508, + "epoch": 0.19885714285714284, + "grad_norm": 0.25999969244003296, + "kl": 0.0075206756591796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0003, + "reward": 0.32197510451078415, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.32197510451078415, + "reward_after_std": 0.6608857288956642, + "reward_before_mean": 0.42462080453697126, + "reward_before_std": 0.6630401350557804, + "reward_change_max": 0.00017840415239334106, + "reward_change_mean": -0.10264569194987416, + "reward_change_min": -0.1863954644650221, + "reward_change_std": 0.07458728924393654, + "reward_std": 0.6608857549726963, + "rewards/cosine_scaled_reward": -0.10018960013985634, + "rewards/format_reward": 0.6250000074505806, + "step": 174 + }, + { + "advantage_max": 1.4769642427563667, + "advantage_mean": 1.2852251740635978e-07, + "advantage_min": -1.2436860725283623, + "advantage_std": 0.9996817782521248, + "completion_length": 2774.625045776367, + "epoch": 0.2, + "grad_norm": 0.17187584936618805, + "kl": 0.0044689178466796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0002, + "reward": 0.5743710789829493, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5743710789829493, + "reward_after_std": 0.5426803417503834, + "reward_before_mean": 0.7062350884079933, + "reward_before_std": 0.538358046207577, + "reward_change_max": 0.0001729428768157959, + "reward_change_mean": -0.1318639765959233, + "reward_change_min": -0.1999199902638793, + "reward_change_std": 0.08338183793239295, + "reward_std": 0.5426803696900606, + "rewards/cosine_scaled_reward": 0.10311753861606121, + "rewards/format_reward": 0.5000000055879354, + "step": 175 + }, + { + "advantage_max": 1.4718908816576004, + "advantage_mean": -3.476937759927523e-08, + "advantage_min": -0.9987376481294632, + "advantage_std": 0.9998609870672226, + "completion_length": 2058.0625610351562, + "epoch": 0.20114285714285715, + "grad_norm": 0.2753830552101135, + "kl": 0.0045604705810546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0002, + "reward": 0.6023684218525887, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.6023684218525887, + "reward_after_std": 0.9104252867400646, + "reward_before_mean": 0.7241084277629852, + "reward_before_std": 0.9173421896994114, + "reward_change_max": 2.7664005756378174e-05, + "reward_change_mean": -0.12173997610807419, + "reward_change_min": -0.2230681637302041, + "reward_change_std": 0.09054337767884135, + "reward_std": 0.9104253090918064, + "rewards/cosine_scaled_reward": 0.018304186407476664, + "rewards/format_reward": 0.6875000055879354, + "step": 176 + }, + { + "advantage_max": 1.3292637690901756, + "advantage_mean": -4.346171089686379e-09, + "advantage_min": -1.3279682248830795, + "advantage_std": 0.9997624382376671, + "completion_length": 2565.750045776367, + "epoch": 0.2022857142857143, + "grad_norm": 0.33500978350639343, + "kl": 0.006360054016113281, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0003, + "reward": 0.4371657082810998, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4371657082810998, + "reward_after_std": 0.706968255341053, + "reward_before_mean": 0.552201054408215, + "reward_before_std": 0.7253951374441385, + "reward_change_max": 0.0002434253692626953, + "reward_change_mean": -0.11503535695374012, + "reward_change_min": -0.20339765585958958, + "reward_change_std": 0.08484502066858113, + "reward_std": 0.7069682665169239, + "rewards/cosine_scaled_reward": -0.015566141344606876, + "rewards/format_reward": 0.5833333432674408, + "step": 177 + }, + { + "advantage_max": 1.3190075904130936, + "advantage_mean": -2.607703353252333e-08, + "advantage_min": -1.2449935302138329, + "advantage_std": 0.9998176321387291, + "completion_length": 2517.604202270508, + "epoch": 0.20342857142857143, + "grad_norm": 0.2425314486026764, + "kl": 0.009618759155273438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0004, + "reward": 0.3113237756770104, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3113237756770104, + "reward_after_std": 0.8060121387243271, + "reward_before_mean": 0.4120410708710551, + "reward_before_std": 0.8318094648420811, + "reward_change_max": 3.4168362617492676e-05, + "reward_change_mean": -0.10071731125935912, + "reward_change_min": -0.2089853510260582, + "reward_change_std": 0.08568191109225154, + "reward_std": 0.8060121387243271, + "rewards/cosine_scaled_reward": -0.0543961301445961, + "rewards/format_reward": 0.5208333358168602, + "step": 178 + }, + { + "advantage_max": 1.666494145989418, + "advantage_mean": 6.457170187434969e-08, + "advantage_min": -0.9503717795014381, + "advantage_std": 0.9996859654784203, + "completion_length": 2578.2916870117188, + "epoch": 0.20457142857142857, + "grad_norm": 0.18489985167980194, + "kl": 0.0033893585205078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0001, + "reward": -0.011646052822470665, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.011646052822470665, + "reward_after_std": 0.5355609776452184, + "reward_before_mean": 0.06075285002589226, + "reward_before_std": 0.5213664807379246, + "reward_change_max": 8.018314838409424e-05, + "reward_change_mean": -0.07239888049662113, + "reward_change_min": -0.12431587558239698, + "reward_change_std": 0.04638653458096087, + "reward_std": 0.5355609860271215, + "rewards/cosine_scaled_reward": -0.18837358802556992, + "rewards/format_reward": 0.4375, + "step": 179 + }, + { + "advantage_max": 1.6271196901798248, + "advantage_mean": -2.173086099954702e-08, + "advantage_min": -1.081467144191265, + "advantage_std": 0.9998771697282791, + "completion_length": 1909.5625381469727, + "epoch": 0.2057142857142857, + "grad_norm": 0.28603145480155945, + "kl": 0.0059146881103515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0002, + "reward": 0.6225273078307509, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6225273078307509, + "reward_after_std": 1.0300602950155735, + "reward_before_mean": 0.7391243830788881, + "reward_before_std": 1.0257667750120163, + "reward_change_max": 0.00026930123567581177, + "reward_change_mean": -0.11659704940393567, + "reward_change_min": -0.22699564415961504, + "reward_change_std": 0.08568047219887376, + "reward_std": 1.0300603322684765, + "rewards/cosine_scaled_reward": 0.015395501744933426, + "rewards/format_reward": 0.7083333414047956, + "step": 180 + }, + { + "advantage_max": 1.4436516463756561, + "advantage_mean": -4.718701129835523e-08, + "advantage_min": -1.2535812556743622, + "advantage_std": 0.9998086541891098, + "completion_length": 2982.5625610351562, + "epoch": 0.20685714285714285, + "grad_norm": 0.18748371303081512, + "kl": 0.0066986083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0003, + "reward": 0.33492581988684833, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33492581988684833, + "reward_after_std": 0.6875773407518864, + "reward_before_mean": 0.4378726640716195, + "reward_before_std": 0.688941452652216, + "reward_change_max": 0.0001654699444770813, + "reward_change_mean": -0.10294683976098895, + "reward_change_min": -0.18089472129940987, + "reward_change_std": 0.07351072086021304, + "reward_std": 0.6875773705542088, + "rewards/cosine_scaled_reward": 0.00018630968406796455, + "rewards/format_reward": 0.43750000186264515, + "step": 181 + }, + { + "advantage_max": 1.4636868089437485, + "advantage_mean": -2.793967790459817e-08, + "advantage_min": -1.205622598528862, + "advantage_std": 0.9998370632529259, + "completion_length": 2015.520866394043, + "epoch": 0.208, + "grad_norm": 0.18570247292518616, + "kl": 0.0036478042602539062, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0001, + "reward": 0.6111249923706055, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6111249923706055, + "reward_after_std": 0.8329652547836304, + "reward_before_mean": 0.7355204597115517, + "reward_before_std": 0.8338058553636074, + "reward_change_max": 0.0002494007349014282, + "reward_change_mean": -0.12439546594396234, + "reward_change_min": -0.2062538806349039, + "reward_change_std": 0.0867111561819911, + "reward_std": 0.8329653143882751, + "rewards/cosine_scaled_reward": 0.03442687960341573, + "rewards/format_reward": 0.6666666716337204, + "step": 182 + }, + { + "advantage_max": 1.3377911821007729, + "advantage_mean": -4.03573219287523e-09, + "advantage_min": -1.3339354917407036, + "advantage_std": 0.9998266994953156, + "completion_length": 1787.0209045410156, + "epoch": 0.20914285714285713, + "grad_norm": 0.23595218360424042, + "kl": 0.00778961181640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0003, + "reward": 0.7993416367098689, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7993416367098689, + "reward_after_std": 0.7656833119690418, + "reward_before_mean": 0.9461353290826082, + "reward_before_std": 0.7740000747144222, + "reward_change_max": 0.0, + "reward_change_mean": -0.14679364021867514, + "reward_change_min": -0.25638253428041935, + "reward_change_std": 0.09901249408721924, + "reward_std": 0.7656833156943321, + "rewards/cosine_scaled_reward": 0.06681763380765915, + "rewards/format_reward": 0.812500013038516, + "step": 183 + }, + { + "advantage_max": 1.4900680482387543, + "advantage_mean": -3.973643020183104e-08, + "advantage_min": -0.9935832992196083, + "advantage_std": 0.9997612237930298, + "completion_length": 2492.583366394043, + "epoch": 0.2102857142857143, + "grad_norm": 0.2290632426738739, + "kl": 0.005229949951171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0002, + "reward": -0.07437216304242611, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.07437216304242611, + "reward_after_std": 0.46923505514860153, + "reward_before_mean": -0.003364154603332281, + "reward_before_std": 0.46698611602187157, + "reward_change_max": 0.00026264041662216187, + "reward_change_mean": -0.07100802287459373, + "reward_change_min": -0.13311471231281757, + "reward_change_std": 0.05143853323534131, + "reward_std": 0.46923505887389183, + "rewards/cosine_scaled_reward": -0.23084875382483006, + "rewards/format_reward": 0.4583333358168602, + "step": 184 + }, + { + "advantage_max": 1.5278790444135666, + "advantage_mean": -4.346172866043219e-09, + "advantage_min": -1.13626679033041, + "advantage_std": 0.9996984899044037, + "completion_length": 2207.1666870117188, + "epoch": 0.21142857142857144, + "grad_norm": 0.24160081148147583, + "kl": 0.005153656005859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0002, + "reward": -0.17866181893623434, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.17866181893623434, + "reward_after_std": 0.42477146722376347, + "reward_before_mean": -0.1162537019699812, + "reward_before_std": 0.4256909815594554, + "reward_change_max": 0.0010868832468986511, + "reward_change_mean": -0.0624081171117723, + "reward_change_min": -0.11495065130293369, + "reward_change_std": 0.04580365400761366, + "reward_std": 0.42477147467434406, + "rewards/cosine_scaled_reward": -0.30812685564160347, + "rewards/format_reward": 0.5000000111758709, + "step": 185 + }, + { + "advantage_max": 1.419815257191658, + "advantage_mean": -2.2972623581196672e-08, + "advantage_min": -1.1030875816941261, + "advantage_std": 0.9998009353876114, + "completion_length": 2772.6666870117188, + "epoch": 0.21257142857142858, + "grad_norm": 0.19030563533306122, + "kl": 0.005794525146484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0002, + "reward": 0.13442187756299973, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.13442187756299973, + "reward_after_std": 0.6170549169182777, + "reward_before_mean": 0.22131559252738953, + "reward_before_std": 0.6231498755514622, + "reward_change_max": 0.0005284175276756287, + "reward_change_mean": -0.08689371170476079, + "reward_change_min": -0.16148117743432522, + "reward_change_std": 0.06356596038676798, + "reward_std": 0.6170549280941486, + "rewards/cosine_scaled_reward": -0.08725888282060623, + "rewards/format_reward": 0.39583333395421505, + "step": 186 + }, + { + "advantage_max": 1.4794887602329254, + "advantage_mean": 2.095475859498208e-08, + "advantage_min": -1.2403139621019363, + "advantage_std": 0.999807707965374, + "completion_length": 1984.0208587646484, + "epoch": 0.21371428571428572, + "grad_norm": 0.22693683207035065, + "kl": 0.006984710693359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0003, + "reward": 0.44079733826220036, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.44079733826220036, + "reward_after_std": 0.6123870965093374, + "reward_before_mean": 0.5555229228921235, + "reward_before_std": 0.6088758781552315, + "reward_change_max": 0.00014670193195343018, + "reward_change_mean": -0.11472555063664913, + "reward_change_min": -0.19294564425945282, + "reward_change_std": 0.07350781094282866, + "reward_std": 0.6123871095478535, + "rewards/cosine_scaled_reward": -0.07640522718429565, + "rewards/format_reward": 0.7083333395421505, + "step": 187 + }, + { + "advantage_max": 1.4657558798789978, + "advantage_mean": 4.967053546245381e-09, + "advantage_min": -1.0956667438149452, + "advantage_std": 0.9997967481613159, + "completion_length": 3006.062530517578, + "epoch": 0.21485714285714286, + "grad_norm": 0.1615750640630722, + "kl": 0.0056781768798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0002, + "reward": -0.03509191796183586, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.03509191796183586, + "reward_after_std": 0.7077936753630638, + "reward_before_mean": 0.03107014298439026, + "reward_before_std": 0.7118645720183849, + "reward_change_max": 0.0002888292074203491, + "reward_change_mean": -0.0661620597820729, + "reward_change_min": -0.12805389054119587, + "reward_change_std": 0.04991923741181381, + "reward_std": 0.7077937126159668, + "rewards/cosine_scaled_reward": -0.15113160910550505, + "rewards/format_reward": 0.33333333395421505, + "step": 188 + }, + { + "advantage_max": 1.4838997721672058, + "advantage_mean": -1.862645193639878e-08, + "advantage_min": -1.0723033919930458, + "advantage_std": 0.9998086094856262, + "completion_length": 2195.3750228881836, + "epoch": 0.216, + "grad_norm": 0.2977884113788605, + "kl": 0.005596160888671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0002, + "reward": 0.4038459522125777, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4038459522125777, + "reward_after_std": 0.660543642938137, + "reward_before_mean": 0.5137117095291615, + "reward_before_std": 0.6600365824997425, + "reward_change_max": 1.96993350982666e-05, + "reward_change_mean": -0.10986578883603215, + "reward_change_min": -0.18296963907778263, + "reward_change_std": 0.07029087585397065, + "reward_std": 0.660543654114008, + "rewards/cosine_scaled_reward": -0.05564414896070957, + "rewards/format_reward": 0.6250000037252903, + "step": 189 + }, + { + "advantage_max": 1.502543568611145, + "advantage_mean": -2.5766593525489156e-08, + "advantage_min": -1.2239737287163734, + "advantage_std": 0.9998229667544365, + "completion_length": 2273.187545776367, + "epoch": 0.21714285714285714, + "grad_norm": 0.1777806133031845, + "kl": 0.005923271179199219, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0002, + "reward": 0.5966947921551764, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5966947921551764, + "reward_after_std": 0.7224652580916882, + "reward_before_mean": 0.7232264764606953, + "reward_before_std": 0.7153876610100269, + "reward_change_max": 0.00013045966625213623, + "reward_change_mean": -0.12653166661038995, + "reward_change_min": -0.21457305550575256, + "reward_change_std": 0.08450583834201097, + "reward_std": 0.722465269267559, + "rewards/cosine_scaled_reward": 0.038696552932378836, + "rewards/format_reward": 0.6458333376795053, + "step": 190 + }, + { + "advantage_max": 1.5981075763702393, + "advantage_mean": 3.042320584345504e-08, + "advantage_min": -1.092733584344387, + "advantage_std": 0.9998094141483307, + "completion_length": 1768.2292175292969, + "epoch": 0.21828571428571428, + "grad_norm": 0.22896713018417358, + "kl": 0.005808830261230469, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0002, + "reward": 0.5460129454731941, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5460129454731941, + "reward_after_std": 0.7109896428883076, + "reward_before_mean": 0.6658507529646158, + "reward_before_std": 0.6980491913855076, + "reward_change_max": 0.00015036016702651978, + "reward_change_mean": -0.11983778630383313, + "reward_change_min": -0.2126011624932289, + "reward_change_std": 0.0784097551368177, + "reward_std": 0.7109896764159203, + "rewards/cosine_scaled_reward": -0.021241309586912394, + "rewards/format_reward": 0.7083333395421505, + "step": 191 + }, + { + "advantage_max": 1.5088998228311539, + "advantage_mean": 1.3659398834242609e-08, + "advantage_min": -1.123813882470131, + "advantage_std": 0.9998009204864502, + "completion_length": 2736.5625610351562, + "epoch": 0.21942857142857142, + "grad_norm": 0.26386189460754395, + "kl": 0.0050640106201171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0002, + "reward": 0.0696917362511158, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0696917362511158, + "reward_after_std": 0.7020151615142822, + "reward_before_mean": 0.1468326561152935, + "reward_before_std": 0.7101566754281521, + "reward_change_max": 0.0003652498126029968, + "reward_change_mean": -0.0771409273147583, + "reward_change_min": -0.16264318861067295, + "reward_change_std": 0.06438883999362588, + "reward_std": 0.7020151950418949, + "rewards/cosine_scaled_reward": -0.1661670026369393, + "rewards/format_reward": 0.47916667722165585, + "step": 192 + }, + { + "advantage_max": 1.4421148598194122, + "advantage_mean": 2.545615107596433e-08, + "advantage_min": -1.2296575456857681, + "advantage_std": 0.9997586086392403, + "completion_length": 2928.250030517578, + "epoch": 0.22057142857142858, + "grad_norm": 0.2043694406747818, + "kl": 0.005222320556640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0002, + "reward": -0.035729264840483665, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.035729264840483665, + "reward_after_std": 0.5215943157672882, + "reward_before_mean": 0.0388828688301146, + "reward_before_std": 0.5313386619091034, + "reward_change_max": 0.00044645369052886963, + "reward_change_mean": -0.07461211644113064, + "reward_change_min": -0.13615632615983486, + "reward_change_std": 0.057447490049526095, + "reward_std": 0.5215943232178688, + "rewards/cosine_scaled_reward": -0.1680585816502571, + "rewards/format_reward": 0.3750000037252903, + "step": 193 + }, + { + "advantage_max": 1.486984170973301, + "advantage_mean": -7.2022281916162e-08, + "advantage_min": -1.0792049020528793, + "advantage_std": 0.9998373165726662, + "completion_length": 2874.937545776367, + "epoch": 0.22171428571428572, + "grad_norm": 0.20345580577850342, + "kl": 0.006009101867675781, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0002, + "reward": 0.7174287736415863, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7174287736415863, + "reward_after_std": 0.8496483601629734, + "reward_before_mean": 0.8520793356001377, + "reward_before_std": 0.8528431281447411, + "reward_change_max": 0.0001317933201789856, + "reward_change_mean": -0.1346505917608738, + "reward_change_min": -0.2605106784030795, + "reward_change_std": 0.10104376077651978, + "reward_std": 0.8496483750641346, + "rewards/cosine_scaled_reward": 0.16562299244105816, + "rewards/format_reward": 0.5208333358168602, + "step": 194 + }, + { + "advantage_max": 1.2939805686473846, + "advantage_mean": 8.881784197001252e-16, + "advantage_min": -1.2831865474581718, + "advantage_std": 0.9998500421643257, + "completion_length": 2426.7083892822266, + "epoch": 0.22285714285714286, + "grad_norm": 0.25405922532081604, + "kl": 0.0049610137939453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0002, + "reward": 0.44910696102306247, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.44910696102306247, + "reward_after_std": 0.856367252767086, + "reward_before_mean": 0.5602726228535175, + "reward_before_std": 0.8805490881204605, + "reward_change_max": 2.874433994293213e-05, + "reward_change_mean": -0.11116565251722932, + "reward_change_min": -0.24337132275104523, + "reward_change_std": 0.09182561980560422, + "reward_std": 0.8563672751188278, + "rewards/cosine_scaled_reward": -0.04278036626055837, + "rewards/format_reward": 0.6458333544433117, + "step": 195 + }, + { + "advantage_max": 1.2606007531285286, + "advantage_mean": 1.2417634254191512e-08, + "advantage_min": -1.360006719827652, + "advantage_std": 0.9998254328966141, + "completion_length": 3241.041717529297, + "epoch": 0.224, + "grad_norm": 0.15946926176548004, + "kl": 0.0063762664794921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0003, + "reward": 0.1911745136603713, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1911745136603713, + "reward_after_std": 0.7220529839396477, + "reward_before_mean": 0.28443292406154796, + "reward_before_std": 0.753384817391634, + "reward_change_max": 0.0001894533634185791, + "reward_change_mean": -0.09325835760682821, + "reward_change_min": -0.18131935968995094, + "reward_change_std": 0.07906473823823035, + "reward_std": 0.7220530100166798, + "rewards/cosine_scaled_reward": -0.03486689180135727, + "rewards/format_reward": 0.354166679084301, + "step": 196 + }, + { + "advantage_max": 1.3448470905423164, + "advantage_mean": -2.980232305382913e-08, + "advantage_min": -1.2865313589572906, + "advantage_std": 0.9998331740498543, + "completion_length": 1494.7083740234375, + "epoch": 0.22514285714285714, + "grad_norm": 0.2721844017505646, + "kl": 0.0056476593017578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0002, + "reward": 0.8248235955834389, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8248235955834389, + "reward_after_std": 0.8203364051878452, + "reward_before_mean": 0.9715809598565102, + "reward_before_std": 0.8329006358981133, + "reward_change_max": 0.00021795183420181274, + "reward_change_mean": -0.14675737638026476, + "reward_change_min": -0.2606231663376093, + "reward_change_std": 0.10089114168658853, + "reward_std": 0.8203364387154579, + "rewards/cosine_scaled_reward": 0.08995713107287884, + "rewards/format_reward": 0.791666679084301, + "step": 197 + }, + { + "advantage_max": 1.4269006997346878, + "advantage_mean": 2.4835268952472234e-08, + "advantage_min": -1.2689987570047379, + "advantage_std": 0.9997787326574326, + "completion_length": 2175.0000381469727, + "epoch": 0.22628571428571428, + "grad_norm": 0.22729894518852234, + "kl": 0.0071544647216796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0003, + "reward": 0.43701247684657574, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.43701247684657574, + "reward_after_std": 0.6505181044340134, + "reward_before_mean": 0.5518691539764404, + "reward_before_std": 0.6552633382380009, + "reward_change_max": 5.833804607391357e-06, + "reward_change_mean": -0.11485666548833251, + "reward_change_min": -0.18947429209947586, + "reward_change_std": 0.07660498935729265, + "reward_std": 0.6505181305110455, + "rewards/cosine_scaled_reward": -0.026148765347898006, + "rewards/format_reward": 0.6041666828095913, + "step": 198 + }, + { + "advantage_max": 1.5523432940244675, + "advantage_mean": -1.7384688688615313e-08, + "advantage_min": -1.0372936129570007, + "advantage_std": 0.9998035654425621, + "completion_length": 2083.229248046875, + "epoch": 0.22742857142857142, + "grad_norm": 0.20395267009735107, + "kl": 0.0061740875244140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0002, + "reward": 0.25621967762708664, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.25621967762708664, + "reward_after_std": 0.6264994069933891, + "reward_before_mean": 0.35232863295823336, + "reward_before_std": 0.6190488487482071, + "reward_change_max": 0.0001118779182434082, + "reward_change_mean": -0.09610894275829196, + "reward_change_min": -0.17422006744891405, + "reward_change_std": 0.06503989174962044, + "reward_std": 0.62649941816926, + "rewards/cosine_scaled_reward": -0.2092523672617972, + "rewards/format_reward": 0.7708333414047956, + "step": 199 + }, + { + "advantage_max": 1.3771865218877792, + "advantage_mean": -4.097819494841559e-08, + "advantage_min": -1.263512298464775, + "advantage_std": 0.999853827059269, + "completion_length": 1518.083381652832, + "epoch": 0.22857142857142856, + "grad_norm": 0.22084669768810272, + "kl": 0.004482269287109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.75e-07, + "loss": 0.0002, + "reward": 0.930808313190937, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.930808313190937, + "reward_after_std": 0.9711525030434132, + "reward_before_mean": 1.083461206406355, + "reward_before_std": 0.9862293135374784, + "reward_change_max": 0.0, + "reward_change_mean": -0.152652895078063, + "reward_change_min": -0.27695308346301317, + "reward_change_std": 0.1092049004510045, + "reward_std": 0.9711525365710258, + "rewards/cosine_scaled_reward": 0.08339726109988987, + "rewards/format_reward": 0.916666679084301, + "step": 200 + }, + { + "advantage_max": 1.2871046587824821, + "advantage_mean": -8.881784197001252e-16, + "advantage_min": -1.2733745723962784, + "advantage_std": 0.9998413845896721, + "completion_length": 2204.479179382324, + "epoch": 0.2297142857142857, + "grad_norm": 0.19999176263809204, + "kl": 0.00505828857421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0002, + "reward": 1.3313297554850578, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.3313297554850578, + "reward_after_std": 0.8409098871052265, + "reward_before_mean": 1.5278500989079475, + "reward_before_std": 0.8497489392757416, + "reward_change_max": 0.00014547258615493774, + "reward_change_mean": -0.1965202996507287, + "reward_change_min": -0.3145089754834771, + "reward_change_std": 0.12920579919591546, + "reward_std": 0.8409099094569683, + "rewards/cosine_scaled_reward": 0.37850836105644703, + "rewards/format_reward": 0.7708333432674408, + "step": 201 + }, + { + "advantage_max": 1.4420902132987976, + "advantage_mean": -4.842877432409409e-08, + "advantage_min": -1.206528678536415, + "advantage_std": 0.9997935071587563, + "completion_length": 1855.604190826416, + "epoch": 0.23085714285714284, + "grad_norm": 0.21113544702529907, + "kl": 0.00447845458984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0002, + "reward": 0.9050917774438858, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9050917774438858, + "reward_after_std": 0.6540236826986074, + "reward_before_mean": 1.0633059060201049, + "reward_before_std": 0.6418928802013397, + "reward_change_max": 4.6059489250183105e-05, + "reward_change_mean": -0.15821410715579987, + "reward_change_min": -0.24650216195732355, + "reward_change_std": 0.09680199483409524, + "reward_std": 0.6540237125009298, + "rewards/cosine_scaled_reward": 0.17748626694083214, + "rewards/format_reward": 0.708333333954215, + "step": 202 + }, + { + "advantage_max": 1.6368324905633926, + "advantage_mean": 1.862646592520889e-09, + "advantage_min": -1.08430115878582, + "advantage_std": 0.9998248592019081, + "completion_length": 2852.2084350585938, + "epoch": 0.232, + "grad_norm": 0.18059028685092926, + "kl": 0.008449554443359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0003, + "reward": 0.41513045597821474, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.41513045597821474, + "reward_after_std": 0.8612460158765316, + "reward_before_mean": 0.5171395465731621, + "reward_before_std": 0.8522833585739136, + "reward_change_max": 7.91698694229126e-05, + "reward_change_mean": -0.10200908035039902, + "reward_change_min": -0.1845270236954093, + "reward_change_std": 0.0716489371843636, + "reward_std": 0.8612460680305958, + "rewards/cosine_scaled_reward": 0.029403111548162997, + "rewards/format_reward": 0.45833333767950535, + "step": 203 + }, + { + "advantage_max": 1.44488774985075, + "advantage_mean": -3.104408552445648e-08, + "advantage_min": -1.3494350239634514, + "advantage_std": 0.9998263940215111, + "completion_length": 2016.4375534057617, + "epoch": 0.23314285714285715, + "grad_norm": 0.2973000109195709, + "kl": 0.008507728576660156, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0003, + "reward": 0.6798251471482217, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6798251471482217, + "reward_after_std": 0.7647205218672752, + "reward_before_mean": 0.8129912074655294, + "reward_before_std": 0.7654086537659168, + "reward_change_max": 0.0, + "reward_change_mean": -0.13316606543958187, + "reward_change_min": -0.21146363113075495, + "reward_change_std": 0.0862268814817071, + "reward_std": 0.764720544219017, + "rewards/cosine_scaled_reward": 0.03149560187011957, + "rewards/format_reward": 0.7500000186264515, + "step": 204 + }, + { + "advantage_max": 1.324913576245308, + "advantage_mean": -3.849466811978175e-08, + "advantage_min": -1.3274166509509087, + "advantage_std": 0.9998679384589195, + "completion_length": 2227.4375762939453, + "epoch": 0.2342857142857143, + "grad_norm": 0.20827358961105347, + "kl": 0.0046520233154296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0002, + "reward": 0.9521224275231361, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9521224275231361, + "reward_after_std": 0.9761649705469608, + "reward_before_mean": 1.1085763350129128, + "reward_before_std": 1.002737108618021, + "reward_change_max": 0.0003203153610229492, + "reward_change_mean": -0.15645390911959112, + "reward_change_min": -0.27211152762174606, + "reward_change_std": 0.11479099653661251, + "reward_std": 0.9761650040745735, + "rewards/cosine_scaled_reward": 0.18970483355224133, + "rewards/format_reward": 0.729166679084301, + "step": 205 + }, + { + "advantage_max": 1.5361033529043198, + "advantage_mean": -1.6142925440831846e-08, + "advantage_min": -1.174651451408863, + "advantage_std": 0.9998372942209244, + "completion_length": 2660.250072479248, + "epoch": 0.23542857142857143, + "grad_norm": 0.20716576278209686, + "kl": 0.0052280426025390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0002, + "reward": 0.29744825698435307, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.29744825698435307, + "reward_after_std": 0.8509851843118668, + "reward_before_mean": 0.39010776579380035, + "reward_before_std": 0.8526091985404491, + "reward_change_max": 0.0002521201968193054, + "reward_change_mean": -0.09265952045097947, + "reward_change_min": -0.1899539204314351, + "reward_change_std": 0.07156781386584044, + "reward_std": 0.8509852103888988, + "rewards/cosine_scaled_reward": -0.10702944942750037, + "rewards/format_reward": 0.604166679084301, + "step": 206 + }, + { + "advantage_max": 1.5442499741911888, + "advantage_mean": -4.9049656836164246e-08, + "advantage_min": -1.2186319679021835, + "advantage_std": 0.9998384788632393, + "completion_length": 2184.0834045410156, + "epoch": 0.23657142857142857, + "grad_norm": 0.24978166818618774, + "kl": 0.006916046142578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0003, + "reward": 0.32800598815083504, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32800598815083504, + "reward_after_std": 0.746575552970171, + "reward_before_mean": 0.42756956070661545, + "reward_before_std": 0.7494839541614056, + "reward_change_max": 8.022040128707886e-05, + "reward_change_mean": -0.09956359711941332, + "reward_change_min": -0.17093972861766815, + "reward_change_std": 0.06876332219690084, + "reward_std": 0.7465755566954613, + "rewards/cosine_scaled_reward": -0.14038189128041267, + "rewards/format_reward": 0.7083333525806665, + "step": 207 + }, + { + "advantage_max": 1.4164851307868958, + "advantage_mean": -1.3659397279930374e-08, + "advantage_min": -1.213052585721016, + "advantage_std": 0.9997989684343338, + "completion_length": 2149.2709197998047, + "epoch": 0.2377142857142857, + "grad_norm": 0.2106243371963501, + "kl": 0.0059680938720703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0002, + "reward": 0.5530873071402311, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5530873071402311, + "reward_after_std": 0.6851087436079979, + "reward_before_mean": 0.6776117645204067, + "reward_before_std": 0.691135261207819, + "reward_change_max": 0.0, + "reward_change_mean": -0.12452444294467568, + "reward_change_min": -0.22678932081907988, + "reward_change_std": 0.08868333138525486, + "reward_std": 0.6851087622344494, + "rewards/cosine_scaled_reward": 0.01588919758796692, + "rewards/format_reward": 0.6458333395421505, + "step": 208 + }, + { + "advantage_max": 1.6769540160894394, + "advantage_mean": -7.82310966007671e-08, + "advantage_min": -0.9029154442250729, + "advantage_std": 0.9998540878295898, + "completion_length": 2148.1875381469727, + "epoch": 0.23885714285714285, + "grad_norm": 0.2742181420326233, + "kl": 0.00687408447265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0003, + "reward": 0.6304308408871293, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6304308408871293, + "reward_after_std": 0.7765806205570698, + "reward_before_mean": 0.7553490996360779, + "reward_before_std": 0.7598440460860729, + "reward_change_max": 0.00027485936880111694, + "reward_change_mean": -0.12491830640647095, + "reward_change_min": -0.21811598259955645, + "reward_change_std": 0.0800136974430643, + "reward_std": 0.7765806391835213, + "rewards/cosine_scaled_reward": 0.07559122750535607, + "rewards/format_reward": 0.6041666697710752, + "step": 209 + }, + { + "advantage_max": 1.2896546870470047, + "advantage_mean": -1.3659397724019584e-08, + "advantage_min": -1.2374619990587234, + "advantage_std": 0.9997692629694939, + "completion_length": 2183.437530517578, + "epoch": 0.24, + "grad_norm": 0.18591295182704926, + "kl": 0.0058994293212890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0002, + "reward": 0.26463001780211926, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26463001780211926, + "reward_after_std": 0.5814108606427908, + "reward_before_mean": 0.36643529776483774, + "reward_before_std": 0.5933164171874523, + "reward_change_max": 0.00011651962995529175, + "reward_change_mean": -0.10180524515453726, + "reward_change_min": -0.1865832544863224, + "reward_change_std": 0.07230387115851045, + "reward_std": 0.5814108960330486, + "rewards/cosine_scaled_reward": -0.12928236462175846, + "rewards/format_reward": 0.6250000037252903, + "step": 210 + }, + { + "advantage_max": 1.483919121325016, + "advantage_mean": -4.1599077849063804e-08, + "advantage_min": -1.2655752003192902, + "advantage_std": 0.9997261986136436, + "completion_length": 2075.4583740234375, + "epoch": 0.24114285714285713, + "grad_norm": 0.19888249039649963, + "kl": 0.006458282470703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0003, + "reward": 0.43373306343937656, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.43373306343937656, + "reward_after_std": 0.5451980549842119, + "reward_before_mean": 0.5491739911958575, + "reward_before_std": 0.5315751228481531, + "reward_change_max": 0.00027501583099365234, + "reward_change_mean": -0.11544095072895288, + "reward_change_min": -0.1886872909963131, + "reward_change_std": 0.07334151910617948, + "reward_std": 0.5451980568468571, + "rewards/cosine_scaled_reward": -0.037913015112280846, + "rewards/format_reward": 0.6250000111758709, + "step": 211 + }, + { + "advantage_max": 1.543802410364151, + "advantage_mean": -6.519258100023961e-08, + "advantage_min": -0.9943611100316048, + "advantage_std": 0.9998081922531128, + "completion_length": 1795.4375610351562, + "epoch": 0.2422857142857143, + "grad_norm": 0.2262740135192871, + "kl": 0.00582122802734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0002, + "reward": 0.8692614883184433, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8692614883184433, + "reward_after_std": 0.7634583115577698, + "reward_before_mean": 1.0197798013687134, + "reward_before_std": 0.752265190705657, + "reward_change_max": 0.0, + "reward_change_mean": -0.15051830047741532, + "reward_change_min": -0.2744205743074417, + "reward_change_std": 0.10307842120528221, + "reward_std": 0.7634583376348019, + "rewards/cosine_scaled_reward": 0.1244732104241848, + "rewards/format_reward": 0.7708333376795053, + "step": 212 + }, + { + "advantage_max": 1.3446763902902603, + "advantage_mean": 2.483527050678447e-09, + "advantage_min": -1.2655547559261322, + "advantage_std": 0.9998160228133202, + "completion_length": 2237.6875076293945, + "epoch": 0.24342857142857144, + "grad_norm": 0.2671678960323334, + "kl": 0.011074066162109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0004, + "reward": 0.5273108333349228, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5273108333349228, + "reward_after_std": 0.727337971329689, + "reward_before_mean": 0.6491544246673584, + "reward_before_std": 0.737940214574337, + "reward_change_max": 0.0, + "reward_change_mean": -0.12184353871271014, + "reward_change_min": -0.21076182555407286, + "reward_change_std": 0.08359152404591441, + "reward_std": 0.7273379862308502, + "rewards/cosine_scaled_reward": 0.032910510897636414, + "rewards/format_reward": 0.5833333414047956, + "step": 213 + }, + { + "advantage_max": 1.522321492433548, + "advantage_mean": -1.2169282392893166e-07, + "advantage_min": -1.2173484787344933, + "advantage_std": 0.9997310861945152, + "completion_length": 2353.854232788086, + "epoch": 0.24457142857142858, + "grad_norm": 0.2423325479030609, + "kl": 0.00769805908203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0003, + "reward": 0.5639788303524256, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5639788303524256, + "reward_after_std": 0.790889865718782, + "reward_before_mean": 0.684724192135036, + "reward_before_std": 0.7886148598045111, + "reward_change_max": 0.00029237568378448486, + "reward_change_mean": -0.12074539810419083, + "reward_change_min": -0.2257308717817068, + "reward_change_std": 0.08881830563768744, + "reward_std": 0.790889891795814, + "rewards/cosine_scaled_reward": 0.0402787746861577, + "rewards/format_reward": 0.6041666753590107, + "step": 214 + }, + { + "advantage_max": 1.440918743610382, + "advantage_mean": 5.587935336670569e-09, + "advantage_min": -1.2240000292658806, + "advantage_std": 0.9997818693518639, + "completion_length": 1874.0417022705078, + "epoch": 0.24571428571428572, + "grad_norm": 0.2190721482038498, + "kl": 0.0040130615234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0002, + "reward": 0.46307086013257504, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.46307086013257504, + "reward_after_std": 0.5368920117616653, + "reward_before_mean": 0.5826256424188614, + "reward_before_std": 0.5326369069516659, + "reward_change_max": 0.00018672645092010498, + "reward_change_mean": -0.11955476645380259, + "reward_change_min": -0.1934266872704029, + "reward_change_std": 0.07289872295223176, + "reward_std": 0.5368920303881168, + "rewards/cosine_scaled_reward": -0.11493718903511763, + "rewards/format_reward": 0.8125000149011612, + "step": 215 + }, + { + "advantage_max": 1.4966644793748856, + "advantage_mean": -5.587935592021864e-08, + "advantage_min": -1.1622079610824585, + "advantage_std": 0.9997863173484802, + "completion_length": 1281.31254196167, + "epoch": 0.24685714285714286, + "grad_norm": 0.26192766427993774, + "kl": 0.00527191162109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0002, + "reward": 0.8690951648168266, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8690951648168266, + "reward_after_std": 0.7011812217533588, + "reward_before_mean": 1.022690481506288, + "reward_before_std": 0.6952761113643646, + "reward_change_max": 0.0, + "reward_change_mean": -0.15359534416347742, + "reward_change_min": -0.2577803088352084, + "reward_change_std": 0.09748534904792905, + "reward_std": 0.7011812403798103, + "rewards/cosine_scaled_reward": 0.07384524680674076, + "rewards/format_reward": 0.8750000111758709, + "step": 216 + }, + { + "advantage_max": 1.5052600800991058, + "advantage_mean": -4.221995697495373e-08, + "advantage_min": -1.0247588083148003, + "advantage_std": 0.999771237373352, + "completion_length": 1675.0416793823242, + "epoch": 0.248, + "grad_norm": 0.22367313504219055, + "kl": 0.0055751800537109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0002, + "reward": 0.8154791872948408, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8154791872948408, + "reward_after_std": 0.7668120842427015, + "reward_before_mean": 0.9590049833059311, + "reward_before_std": 0.7504305392503738, + "reward_change_max": 0.0, + "reward_change_mean": -0.14352576108649373, + "reward_change_min": -0.24628907442092896, + "reward_change_std": 0.08640740357805043, + "reward_std": 0.7668121261522174, + "rewards/cosine_scaled_reward": 0.10450247849803418, + "rewards/format_reward": 0.7500000055879354, + "step": 217 + }, + { + "advantage_max": 1.5412231981754303, + "advantage_mean": 1.738468902168222e-08, + "advantage_min": -0.9499373137950897, + "advantage_std": 0.9997916966676712, + "completion_length": 2136.4167098999023, + "epoch": 0.24914285714285714, + "grad_norm": 0.20119324326515198, + "kl": 0.007266998291015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0003, + "reward": 0.28319047950208187, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.28319047950208187, + "reward_after_std": 0.6024854965507984, + "reward_before_mean": 0.3830933915451169, + "reward_before_std": 0.5972005352377892, + "reward_change_max": 0.00016600638628005981, + "reward_change_mean": -0.09990292368456721, + "reward_change_min": -0.17767559923231602, + "reward_change_std": 0.06573458341881633, + "reward_std": 0.6024855300784111, + "rewards/cosine_scaled_reward": -0.14178664050996304, + "rewards/format_reward": 0.6666666716337204, + "step": 218 + }, + { + "advantage_max": 1.5693671107292175, + "advantage_mean": 2.1109978987077227e-08, + "advantage_min": -1.0280758142471313, + "advantage_std": 0.9998404160141945, + "completion_length": 1953.8750457763672, + "epoch": 0.2502857142857143, + "grad_norm": 0.2856943905353546, + "kl": 0.009334564208984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0004, + "reward": 0.4099986110813916, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4099986110813916, + "reward_after_std": 0.7541120164096355, + "reward_before_mean": 0.5179123759735376, + "reward_before_std": 0.7560647651553154, + "reward_change_max": 0.00018017739057540894, + "reward_change_mean": -0.10791374277323484, + "reward_change_min": -0.2042955392971635, + "reward_change_std": 0.07851645350456238, + "reward_std": 0.7541120536625385, + "rewards/cosine_scaled_reward": -0.084793820977211, + "rewards/format_reward": 0.6875000093132257, + "step": 219 + }, + { + "advantage_max": 1.5397311598062515, + "advantage_mean": -3.197540970889534e-08, + "advantage_min": -1.2701191380620003, + "advantage_std": 0.9997361525893211, + "completion_length": 1732.9583854675293, + "epoch": 0.25142857142857145, + "grad_norm": 0.2450757920742035, + "kl": 0.006439208984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0003, + "reward": 0.11072067031636834, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.11072067031636834, + "reward_after_std": 0.4471112582832575, + "reward_before_mean": 0.1992113790474832, + "reward_before_std": 0.4396040756255388, + "reward_change_max": 0.0002147480845451355, + "reward_change_mean": -0.08849071525037289, + "reward_change_min": -0.14833337999880314, + "reward_change_std": 0.05763935064896941, + "reward_std": 0.4471112657338381, + "rewards/cosine_scaled_reward": -0.2858109883964062, + "rewards/format_reward": 0.7708333432674408, + "step": 220 + }, + { + "advantage_max": 1.6834911704063416, + "advantage_mean": -7.698933590649659e-08, + "advantage_min": -0.9756602942943573, + "advantage_std": 0.9997707083821297, + "completion_length": 1674.0416870117188, + "epoch": 0.25257142857142856, + "grad_norm": 0.2080233097076416, + "kl": 0.0053730010986328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0002, + "reward": 0.7887346247443929, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7887346247443929, + "reward_after_std": 0.6305991001427174, + "reward_before_mean": 0.9336526445113122, + "reward_before_std": 0.5975028732791543, + "reward_change_max": 0.00025875866413116455, + "reward_change_mean": -0.1449180180206895, + "reward_change_min": -0.22527676168829203, + "reward_change_std": 0.08657489949837327, + "reward_std": 0.6305991094559431, + "rewards/cosine_scaled_reward": 0.0814096424728632, + "rewards/format_reward": 0.7708333395421505, + "step": 221 + }, + { + "advantage_max": 1.2725271508097649, + "advantage_mean": -2.483526884144993e-08, + "advantage_min": -1.186394453048706, + "advantage_std": 0.9998194724321365, + "completion_length": 1854.083381652832, + "epoch": 0.2537142857142857, + "grad_norm": 0.23417192697525024, + "kl": 0.005588531494140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0002, + "reward": 0.6199398525059223, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6199398525059223, + "reward_after_std": 0.8259677402675152, + "reward_before_mean": 0.7486249133944511, + "reward_before_std": 0.8462961055338383, + "reward_change_max": 0.0, + "reward_change_mean": -0.1286850469186902, + "reward_change_min": -0.2560490546748042, + "reward_change_std": 0.09627352794632316, + "reward_std": 0.8259677961468697, + "rewards/cosine_scaled_reward": 0.03056243620812893, + "rewards/format_reward": 0.6875, + "step": 222 + }, + { + "advantage_max": 1.4652435034513474, + "advantage_mean": -4.439304501779873e-08, + "advantage_min": -1.1945854425430298, + "advantage_std": 0.9997480288147926, + "completion_length": 2072.9375228881836, + "epoch": 0.25485714285714284, + "grad_norm": 0.21853138506412506, + "kl": 0.0057468414306640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0002, + "reward": 0.5158516289666295, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5158516289666295, + "reward_after_std": 0.5560441594570875, + "reward_before_mean": 0.6397079173475504, + "reward_before_std": 0.5492154005914927, + "reward_change_max": 0.0, + "reward_change_mean": -0.12385630467906594, + "reward_change_min": -0.20062089059501886, + "reward_change_std": 0.07504534930922091, + "reward_std": 0.5560441724956036, + "rewards/cosine_scaled_reward": 0.017770618200302124, + "rewards/format_reward": 0.6041666679084301, + "step": 223 + }, + { + "advantage_max": 1.2781447544693947, + "advantage_mean": 9.002784961964494e-09, + "advantage_min": -1.224984422326088, + "advantage_std": 0.9998601526021957, + "completion_length": 2220.416732788086, + "epoch": 0.256, + "grad_norm": 0.18231956660747528, + "kl": 0.0063304901123046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0003, + "reward": 0.49894432350993156, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.49894432350993156, + "reward_after_std": 0.9219648316502571, + "reward_before_mean": 0.6136458464898169, + "reward_before_std": 0.9475033320486546, + "reward_change_max": 0.0, + "reward_change_mean": -0.11470149923115969, + "reward_change_min": -0.23749073594808578, + "reward_change_std": 0.09353986661881208, + "reward_std": 0.9219648726284504, + "rewards/cosine_scaled_reward": -0.05776042211800814, + "rewards/format_reward": 0.729166679084301, + "step": 224 + }, + { + "advantage_max": 1.6483723670244217, + "advantage_mean": 1.2417633921124605e-08, + "advantage_min": -1.0310075506567955, + "advantage_std": 0.9998345449566841, + "completion_length": 2474.9792556762695, + "epoch": 0.2571428571428571, + "grad_norm": 0.2757589519023895, + "kl": 0.011646270751953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0005, + "reward": 0.39897412806749344, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.39897412806749344, + "reward_after_std": 0.7204340994358063, + "reward_before_mean": 0.5055959932506084, + "reward_before_std": 0.713385995477438, + "reward_change_max": 5.5462121963500977e-05, + "reward_change_mean": -0.10662184376269579, + "reward_change_min": -0.1872426439076662, + "reward_change_std": 0.07501717936247587, + "reward_std": 0.7204341255128384, + "rewards/cosine_scaled_reward": -0.04928534850478172, + "rewards/format_reward": 0.6041666734963655, + "step": 225 + }, + { + "advantage_max": 1.368759848177433, + "advantage_mean": -1.800557003495129e-08, + "advantage_min": -1.1878944411873817, + "advantage_std": 0.9998493194580078, + "completion_length": 1795.6458892822266, + "epoch": 0.2582857142857143, + "grad_norm": 0.19780239462852478, + "kl": 0.005168914794921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0002, + "reward": 0.9348589247092605, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9348589247092605, + "reward_after_std": 0.9241700284183025, + "reward_before_mean": 1.0913225067779422, + "reward_before_std": 0.9455209393054247, + "reward_change_max": 0.0, + "reward_change_mean": -0.1564636155962944, + "reward_change_min": -0.28128004260361195, + "reward_change_std": 0.11181944841518998, + "reward_std": 0.924170047044754, + "rewards/cosine_scaled_reward": 0.13941125571727753, + "rewards/format_reward": 0.8125000111758709, + "step": 226 + }, + { + "advantage_max": 1.5920889675617218, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -1.0377077758312225, + "advantage_std": 0.9998761713504791, + "completion_length": 1348.6875534057617, + "epoch": 0.25942857142857145, + "grad_norm": 0.26142218708992004, + "kl": 0.009868621826171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0004, + "reward": 0.7289588078856468, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7289588078856468, + "reward_after_std": 0.9257365316152573, + "reward_before_mean": 0.8592088520526886, + "reward_before_std": 0.9165628142654896, + "reward_change_max": 0.0, + "reward_change_mean": -0.1302500069141388, + "reward_change_min": -0.24056414235383272, + "reward_change_std": 0.0870923982001841, + "reward_std": 0.9257365316152573, + "rewards/cosine_scaled_reward": -0.028728928649798036, + "rewards/format_reward": 0.916666679084301, + "step": 227 + }, + { + "advantage_max": 1.4398110508918762, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -1.056349277496338, + "advantage_std": 0.9998444691300392, + "completion_length": 1577.3125381469727, + "epoch": 0.26057142857142856, + "grad_norm": 0.25881561636924744, + "kl": 0.006450653076171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0003, + "reward": 0.7870747782289982, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.7870747782289982, + "reward_after_std": 0.8033650256693363, + "reward_before_mean": 0.929570865817368, + "reward_before_std": 0.8072608970105648, + "reward_change_max": 0.0, + "reward_change_mean": -0.1424960799049586, + "reward_change_min": -0.25784505158662796, + "reward_change_std": 0.09646610415074974, + "reward_std": 0.803365059196949, + "rewards/cosine_scaled_reward": 0.08978541730903089, + "rewards/format_reward": 0.7500000074505806, + "step": 228 + }, + { + "advantage_max": 1.4671648442745209, + "advantage_mean": -3.7563344879032456e-08, + "advantage_min": -1.1452326700091362, + "advantage_std": 0.9997798949480057, + "completion_length": 1936.2500228881836, + "epoch": 0.26171428571428573, + "grad_norm": 0.25452426075935364, + "kl": 0.007841110229492188, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0003, + "reward": 0.6133114844560623, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6133114844560623, + "reward_after_std": 0.6665918864309788, + "reward_before_mean": 0.7417216263711452, + "reward_before_std": 0.6557564614340663, + "reward_change_max": 6.188452243804932e-05, + "reward_change_mean": -0.12841017404571176, + "reward_change_min": -0.2055542655289173, + "reward_change_std": 0.08059329586103559, + "reward_std": 0.66659190133214, + "rewards/cosine_scaled_reward": 0.006277475506067276, + "rewards/format_reward": 0.7291666753590107, + "step": 229 + }, + { + "advantage_max": 1.4177044332027435, + "advantage_mean": -1.4280279403422469e-08, + "advantage_min": -1.315854400396347, + "advantage_std": 0.9998226910829544, + "completion_length": 2070.937530517578, + "epoch": 0.26285714285714284, + "grad_norm": 0.23040302097797394, + "kl": 0.0074100494384765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0003, + "reward": 0.22305097430944443, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.22305097430944443, + "reward_after_std": 0.6350849457085133, + "reward_before_mean": 0.3175716698169708, + "reward_before_std": 0.6423088535666466, + "reward_change_max": 0.0005049854516983032, + "reward_change_mean": -0.09452070062980056, + "reward_change_min": -0.16764249559491873, + "reward_change_std": 0.06898301001638174, + "reward_std": 0.6350849494338036, + "rewards/cosine_scaled_reward": -0.1849641827866435, + "rewards/format_reward": 0.6875000111758709, + "step": 230 + }, + { + "advantage_max": 1.4866014271974564, + "advantage_mean": -1.1051695258945671e-07, + "advantage_min": -1.219824656844139, + "advantage_std": 0.9998104348778725, + "completion_length": 1799.8750686645508, + "epoch": 0.264, + "grad_norm": 0.21469198167324066, + "kl": 0.0066547393798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0003, + "reward": 0.8363642990589142, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8363642990589142, + "reward_after_std": 0.7526835240423679, + "reward_before_mean": 0.9858303721994162, + "reward_before_std": 0.7550496160984039, + "reward_change_max": 0.0, + "reward_change_mean": -0.1494661532342434, + "reward_change_min": -0.2536203945055604, + "reward_change_std": 0.10055593773722649, + "reward_std": 0.7526835426688194, + "rewards/cosine_scaled_reward": 0.0762485321611166, + "rewards/format_reward": 0.8333333469927311, + "step": 231 + }, + { + "advantage_max": 1.6209463626146317, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -1.0295084789395332, + "advantage_std": 0.9998287931084633, + "completion_length": 2159.2083892822266, + "epoch": 0.2651428571428571, + "grad_norm": 0.24421940743923187, + "kl": 0.0079345703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0003, + "reward": 0.21903796587139368, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.21903796587139368, + "reward_after_std": 0.6888058856129646, + "reward_before_mean": 0.3088764566928148, + "reward_before_std": 0.6795613020658493, + "reward_change_max": 0.0005556866526603699, + "reward_change_mean": -0.0898384740576148, + "reward_change_min": -0.15906614251434803, + "reward_change_std": 0.061581351794302464, + "reward_std": 0.6888059228658676, + "rewards/cosine_scaled_reward": -0.18931178748607635, + "rewards/format_reward": 0.687500013038516, + "step": 232 + }, + { + "advantage_max": 1.5372837334871292, + "advantage_mean": -3.802900583327329e-08, + "advantage_min": -1.0135958343744278, + "advantage_std": 0.9997899383306503, + "completion_length": 1570.333396911621, + "epoch": 0.2662857142857143, + "grad_norm": 0.267768532037735, + "kl": 0.00606536865234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0002, + "reward": 0.48989987885579467, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.48989987885579467, + "reward_after_std": 0.6716056112200022, + "reward_before_mean": 0.6066783485002816, + "reward_before_std": 0.6613761279731989, + "reward_change_max": 0.0, + "reward_change_mean": -0.11677848640829325, + "reward_change_min": -0.2160600544884801, + "reward_change_std": 0.07633232930675149, + "reward_std": 0.671605659648776, + "rewards/cosine_scaled_reward": -0.12374416552484035, + "rewards/format_reward": 0.854166679084301, + "step": 233 + }, + { + "advantage_max": 1.4362711906433105, + "advantage_mean": 8.6923440667519e-09, + "advantage_min": -1.2135839760303497, + "advantage_std": 0.999776653945446, + "completion_length": 2142.3750228881836, + "epoch": 0.2674285714285714, + "grad_norm": 0.2719072103500366, + "kl": 0.00940704345703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0004, + "reward": 0.3730768244713545, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3730768244713545, + "reward_after_std": 0.5923286378383636, + "reward_before_mean": 0.4837719602510333, + "reward_before_std": 0.5916496124118567, + "reward_change_max": 0.0, + "reward_change_mean": -0.11069511156529188, + "reward_change_min": -0.1783575415611267, + "reward_change_std": 0.07707143994048238, + "reward_std": 0.592328667640686, + "rewards/cosine_scaled_reward": -0.0914473719894886, + "rewards/format_reward": 0.6666666772216558, + "step": 234 + }, + { + "advantage_max": 1.5493723526597023, + "advantage_mean": -1.0679166395632933e-07, + "advantage_min": -1.0974969416856766, + "advantage_std": 0.9997179284691811, + "completion_length": 1460.270866394043, + "epoch": 0.26857142857142857, + "grad_norm": 0.24478395283222198, + "kl": 0.007293701171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0003, + "reward": 1.0726623684167862, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.0726623684167862, + "reward_after_std": 0.5861970726400614, + "reward_before_mean": 1.2478751856833696, + "reward_before_std": 0.5587137271650136, + "reward_change_max": 0.00025247782468795776, + "reward_change_mean": -0.1752128468360752, + "reward_change_min": -0.2610089210793376, + "reward_change_std": 0.10384868178516626, + "reward_std": 0.5861970763653517, + "rewards/cosine_scaled_reward": 0.21768759936094284, + "rewards/format_reward": 0.8125000074505806, + "step": 235 + }, + { + "advantage_max": 1.740464448928833, + "advantage_mean": 4.440892098500626e-16, + "advantage_min": -0.8375889658927917, + "advantage_std": 0.9998397752642632, + "completion_length": 2148.5416946411133, + "epoch": 0.26971428571428574, + "grad_norm": 0.19493846595287323, + "kl": 0.0068416595458984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0003, + "reward": 0.37424314580857754, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.37424314580857754, + "reward_after_std": 0.8406364396214485, + "reward_before_mean": 0.47235390916466713, + "reward_before_std": 0.8247300237417221, + "reward_change_max": 0.0, + "reward_change_mean": -0.098110759165138, + "reward_change_min": -0.177609003148973, + "reward_change_std": 0.062242650194093585, + "reward_std": 0.8406364470720291, + "rewards/cosine_scaled_reward": -0.09715638670604676, + "rewards/format_reward": 0.6666666679084301, + "step": 236 + }, + { + "advantage_max": 1.5386316254734993, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -1.1397172287106514, + "advantage_std": 0.9997890964150429, + "completion_length": 1514.7500228881836, + "epoch": 0.27085714285714285, + "grad_norm": 0.21677693724632263, + "kl": 0.00567626953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0002, + "reward": 0.7754391804337502, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7754391804337502, + "reward_after_std": 0.5829839073121548, + "reward_before_mean": 0.9211319833993912, + "reward_before_std": 0.5614862740039825, + "reward_change_max": 0.0, + "reward_change_mean": -0.14569283719174564, + "reward_change_min": -0.21358582749962807, + "reward_change_std": 0.0806428431533277, + "reward_std": 0.5829839296638966, + "rewards/cosine_scaled_reward": 0.03348265402019024, + "rewards/format_reward": 0.8541666716337204, + "step": 237 + }, + { + "advantage_max": 1.4392332583665848, + "advantage_mean": -4.346172211011634e-08, + "advantage_min": -1.210955560207367, + "advantage_std": 0.9997563362121582, + "completion_length": 1502.4791946411133, + "epoch": 0.272, + "grad_norm": 0.22297178208827972, + "kl": 0.008113861083984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0003, + "reward": 0.7160059418529272, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7160059418529272, + "reward_after_std": 0.625291345641017, + "reward_before_mean": 0.8567305975593626, + "reward_before_std": 0.6137831769883633, + "reward_change_max": 0.0, + "reward_change_mean": -0.14072464779019356, + "reward_change_min": -0.23825540859252214, + "reward_change_std": 0.08909846004098654, + "reward_std": 0.6252913642674685, + "rewards/cosine_scaled_reward": -0.009134718915447593, + "rewards/format_reward": 0.8750000074505806, + "step": 238 + }, + { + "advantage_max": 1.581406444311142, + "advantage_mean": -2.6573738720614415e-07, + "advantage_min": -0.9953296408057213, + "advantage_std": 0.999721497297287, + "completion_length": 1504.1458473205566, + "epoch": 0.27314285714285713, + "grad_norm": 0.2068232297897339, + "kl": 0.0054912567138671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0002, + "reward": 1.0969355329871178, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0969355329871178, + "reward_after_std": 0.6276315655559301, + "reward_before_mean": 1.2723851148039103, + "reward_before_std": 0.5967890359461308, + "reward_change_max": 0.0002623945474624634, + "reward_change_mean": -0.17544960090890527, + "reward_change_min": -0.26079559326171875, + "reward_change_std": 0.10241867695003748, + "reward_std": 0.6276315916329622, + "rewards/cosine_scaled_reward": 0.25077585806138813, + "rewards/format_reward": 0.770833333954215, + "step": 239 + }, + { + "advantage_max": 1.5168597102165222, + "advantage_mean": 8.692343844707295e-09, + "advantage_min": -1.131408378481865, + "advantage_std": 0.9997967407107353, + "completion_length": 1919.458381652832, + "epoch": 0.2742857142857143, + "grad_norm": 0.29865968227386475, + "kl": 0.00994873046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0004, + "reward": 0.2035725242458284, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2035725242458284, + "reward_after_std": 0.5868874341249466, + "reward_before_mean": 0.29640037566423416, + "reward_before_std": 0.582292553037405, + "reward_change_max": 0.00010397285223007202, + "reward_change_mean": -0.09282782999798656, + "reward_change_min": -0.1734831389039755, + "reward_change_std": 0.06268787989392877, + "reward_std": 0.5868874527513981, + "rewards/cosine_scaled_reward": -0.1955498280003667, + "rewards/format_reward": 0.687500013038516, + "step": 240 + }, + { + "advantage_max": 1.626302644610405, + "advantage_mean": -2.235174290099451e-08, + "advantage_min": -0.99515251070261, + "advantage_std": 0.999805323779583, + "completion_length": 2014.3333435058594, + "epoch": 0.2754285714285714, + "grad_norm": 0.26837101578712463, + "kl": 0.01031494140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0004, + "reward": 0.06266416236758232, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06266416236758232, + "reward_after_std": 0.5582391433417797, + "reward_before_mean": 0.1420099101960659, + "reward_before_std": 0.5503151379525661, + "reward_change_max": 0.00012226402759552002, + "reward_change_mean": -0.07934575085528195, + "reward_change_min": -0.1459060488268733, + "reward_change_std": 0.05173483118414879, + "reward_std": 0.5582391582429409, + "rewards/cosine_scaled_reward": -0.2727450542151928, + "rewards/format_reward": 0.6875000074505806, + "step": 241 + }, + { + "advantage_max": 1.4509450048208237, + "advantage_mean": 3.1044156134640843e-10, + "advantage_min": -1.3363563306629658, + "advantage_std": 0.9995779171586037, + "completion_length": 1342.4166946411133, + "epoch": 0.2765714285714286, + "grad_norm": 0.2745201289653778, + "kl": 0.013671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0005, + "reward": 0.4477707026526332, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4477707026526332, + "reward_after_std": 0.5168483178131282, + "reward_before_mean": 0.5664155303966254, + "reward_before_std": 0.5090636860113591, + "reward_change_max": 0.00021380186080932617, + "reward_change_mean": -0.11864481214433908, + "reward_change_min": -0.18819516710937023, + "reward_change_std": 0.07314185099676251, + "reward_std": 0.5168483252637088, + "rewards/cosine_scaled_reward": -0.1751255802810192, + "rewards/format_reward": 0.916666679084301, + "step": 242 + }, + { + "advantage_max": 1.4236368983983994, + "advantage_mean": 3.1044091741705415e-09, + "advantage_min": -1.275793395936489, + "advantage_std": 0.9998544678092003, + "completion_length": 1794.9584045410156, + "epoch": 0.2777142857142857, + "grad_norm": 0.20937462151050568, + "kl": 0.00710296630859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0003, + "reward": 0.6848674118518829, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6848674118518829, + "reward_after_std": 0.8243732713162899, + "reward_before_mean": 0.8182772938162088, + "reward_before_std": 0.8360154293477535, + "reward_change_max": 9.438395500183105e-05, + "reward_change_mean": -0.1334098584484309, + "reward_change_min": -0.23731573671102524, + "reward_change_std": 0.09346593916416168, + "reward_std": 0.8243732936680317, + "rewards/cosine_scaled_reward": 0.002888637245632708, + "rewards/format_reward": 0.812500013038516, + "step": 243 + }, + { + "advantage_max": 1.5768461674451828, + "advantage_mean": -3.725290742551124e-09, + "advantage_min": -1.05535177141428, + "advantage_std": 0.9997763559222221, + "completion_length": 1677.4791984558105, + "epoch": 0.27885714285714286, + "grad_norm": 0.24402037262916565, + "kl": 0.006748199462890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0003, + "reward": 0.6520390259101987, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6520390259101987, + "reward_after_std": 0.6846922170370817, + "reward_before_mean": 0.7843025382608175, + "reward_before_std": 0.6760292388498783, + "reward_change_max": 0.00015122443437576294, + "reward_change_mean": -0.13226348999887705, + "reward_change_min": -0.22804296016693115, + "reward_change_std": 0.08690834417939186, + "reward_std": 0.6846922319382429, + "rewards/cosine_scaled_reward": -0.003682076930999756, + "rewards/format_reward": 0.7916666734963655, + "step": 244 + }, + { + "advantage_max": 1.397861048579216, + "advantage_mean": 9.3132264122886e-09, + "advantage_min": -1.0036931559443474, + "advantage_std": 0.9998713657259941, + "completion_length": 2007.2916946411133, + "epoch": 0.28, + "grad_norm": 0.2225971817970276, + "kl": 0.007488250732421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0003, + "reward": 0.7011168226599693, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7011168226599693, + "reward_after_std": 1.021484598517418, + "reward_before_mean": 0.8311672285199165, + "reward_before_std": 1.0451684147119522, + "reward_change_max": 4.5515596866607666e-05, + "reward_change_mean": -0.13005035952664912, + "reward_change_min": -0.26285023987293243, + "reward_change_std": 0.10422047041356564, + "reward_std": 1.0214846432209015, + "rewards/cosine_scaled_reward": 0.0510002663359046, + "rewards/format_reward": 0.7291666716337204, + "step": 245 + }, + { + "advantage_max": 1.483703851699829, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -1.0376464948058128, + "advantage_std": 0.9998078644275665, + "completion_length": 1695.2500381469727, + "epoch": 0.28114285714285714, + "grad_norm": 0.23753374814987183, + "kl": 0.0087890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0004, + "reward": 0.5922103077173233, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5922103077173233, + "reward_after_std": 0.6839786767959595, + "reward_before_mean": 0.7197319120168686, + "reward_before_std": 0.6809311471879482, + "reward_change_max": 0.0, + "reward_change_mean": -0.127521563321352, + "reward_change_min": -0.22557256184518337, + "reward_change_std": 0.08379548555240035, + "reward_std": 0.6839786805212498, + "rewards/cosine_scaled_reward": -0.04638407193124294, + "rewards/format_reward": 0.8125, + "step": 246 + }, + { + "advantage_max": 1.560398355126381, + "advantage_mean": 3.849466889693787e-08, + "advantage_min": -0.9917672201991081, + "advantage_std": 0.9997923448681831, + "completion_length": 2328.666748046875, + "epoch": 0.2822857142857143, + "grad_norm": 0.2871028780937195, + "kl": 0.01140594482421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0005, + "reward": 0.1551234694197774, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1551234694197774, + "reward_after_std": 0.6376011576503515, + "reward_before_mean": 0.24224435538053513, + "reward_before_std": 0.6399637795984745, + "reward_change_max": 6.29127025604248e-05, + "reward_change_mean": -0.08712084114085883, + "reward_change_min": -0.17339404299855232, + "reward_change_std": 0.0648341947235167, + "reward_std": 0.6376011855900288, + "rewards/cosine_scaled_reward": -0.14971117489039898, + "rewards/format_reward": 0.5416666753590107, + "step": 247 + }, + { + "advantage_max": 1.6397739797830582, + "advantage_mean": 3.47693762670076e-08, + "advantage_min": -1.076673448085785, + "advantage_std": 0.9997544661164284, + "completion_length": 1653.187515258789, + "epoch": 0.2834285714285714, + "grad_norm": 0.2525901794433594, + "kl": 0.00815582275390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0003, + "reward": 0.8820497170090675, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8820497170090675, + "reward_after_std": 0.615868715569377, + "reward_before_mean": 1.0375536493957043, + "reward_before_std": 0.5880707837641239, + "reward_change_max": 0.0007249712944030762, + "reward_change_mean": -0.15550392726436257, + "reward_change_min": -0.22709692269563675, + "reward_change_std": 0.09177486295811832, + "reward_std": 0.6158687248826027, + "rewards/cosine_scaled_reward": 0.1646101539954543, + "rewards/format_reward": 0.7083333432674408, + "step": 248 + }, + { + "advantage_max": 1.4647027403116226, + "advantage_mean": 1.4280280291600889e-08, + "advantage_min": -1.1837932839989662, + "advantage_std": 0.9998209699988365, + "completion_length": 1363.208366394043, + "epoch": 0.2845714285714286, + "grad_norm": 0.25547295808792114, + "kl": 0.0075168609619140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0003, + "reward": 1.1282103421690408, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 1.1282103421690408, + "reward_after_std": 0.6968370862305164, + "reward_before_mean": 1.305668581277132, + "reward_before_std": 0.6800988968461752, + "reward_change_max": 9.535253047943115e-05, + "reward_change_mean": -0.1774582201614976, + "reward_change_min": -0.27143237367272377, + "reward_change_std": 0.10626805946230888, + "reward_std": 0.696837093681097, + "rewards/cosine_scaled_reward": 0.20491759851574898, + "rewards/format_reward": 0.8958333432674408, + "step": 249 + }, + { + "advantage_max": 1.692870482802391, + "advantage_mean": -2.173086016687975e-08, + "advantage_min": -1.0667153745889664, + "advantage_std": 0.999844454228878, + "completion_length": 1385.1250381469727, + "epoch": 0.2857142857142857, + "grad_norm": 0.31402283906936646, + "kl": 0.01108551025390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0004, + "reward": 0.6773853991180658, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6773853991180658, + "reward_after_std": 0.768508791923523, + "reward_before_mean": 0.8056108057498932, + "reward_before_std": 0.7412783540785313, + "reward_change_max": 0.0, + "reward_change_mean": -0.12822541315108538, + "reward_change_min": -0.1969393789768219, + "reward_change_std": 0.07280616229400039, + "reward_std": 0.7685088030993938, + "rewards/cosine_scaled_reward": -0.04511125944554806, + "rewards/format_reward": 0.8958333432674408, + "step": 250 + }, + { + "advantage_max": 1.5190437734127045, + "advantage_mean": -5.463759289447978e-08, + "advantage_min": -1.0659090280532837, + "advantage_std": 0.9997875168919563, + "completion_length": 1294.6041870117188, + "epoch": 0.28685714285714287, + "grad_norm": 0.3337409794330597, + "kl": 0.011199951171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0004, + "reward": 0.6588802421465516, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6588802421465516, + "reward_after_std": 0.7135784067213535, + "reward_before_mean": 0.7902223952114582, + "reward_before_std": 0.7042027357965708, + "reward_change_max": 0.0, + "reward_change_mean": -0.13134220987558365, + "reward_change_min": -0.23708409443497658, + "reward_change_std": 0.08308501448482275, + "reward_std": 0.7135784365236759, + "rewards/cosine_scaled_reward": -0.0528054665774107, + "rewards/format_reward": 0.8958333358168602, + "step": 251 + }, + { + "advantage_max": 1.5606249049305916, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -1.11652322858572, + "advantage_std": 0.9998264908790588, + "completion_length": 1679.3125534057617, + "epoch": 0.288, + "grad_norm": 0.23594596982002258, + "kl": 0.009107589721679688, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0004, + "reward": 0.5095890890806913, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5095890890806913, + "reward_after_std": 0.7146609388291836, + "reward_before_mean": 0.626227805390954, + "reward_before_std": 0.6994800176471472, + "reward_change_max": 0.0003551766276359558, + "reward_change_mean": -0.11663870047777891, + "reward_change_min": -0.19144857861101627, + "reward_change_std": 0.07572575053200126, + "reward_std": 0.7146609574556351, + "rewards/cosine_scaled_reward": -0.1035527940839529, + "rewards/format_reward": 0.8333333469927311, + "step": 252 + }, + { + "advantage_max": 1.6786980479955673, + "advantage_mean": 2.5766592637310737e-08, + "advantage_min": -0.9841240048408508, + "advantage_std": 0.9998369589447975, + "completion_length": 1716.2500457763672, + "epoch": 0.28914285714285715, + "grad_norm": 0.27604344487190247, + "kl": 0.009691238403320312, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0004, + "reward": 0.6863379459828138, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6863379459828138, + "reward_after_std": 0.8101256862282753, + "reward_before_mean": 0.8162229340523481, + "reward_before_std": 0.7945797778666019, + "reward_change_max": 4.842877388000488e-07, + "reward_change_mean": -0.129884981084615, + "reward_change_min": -0.22787339333444834, + "reward_change_std": 0.08715493371710181, + "reward_std": 0.8101256936788559, + "rewards/cosine_scaled_reward": -0.008555212989449501, + "rewards/format_reward": 0.8333333395421505, + "step": 253 + }, + { + "advantage_max": 1.6549670845270157, + "advantage_mean": -1.3038516155639002e-08, + "advantage_min": -1.0284245312213898, + "advantage_std": 0.9998267441987991, + "completion_length": 1541.7500381469727, + "epoch": 0.29028571428571426, + "grad_norm": 0.32521700859069824, + "kl": 0.009090423583984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0004, + "reward": 0.600863391533494, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.600863391533494, + "reward_after_std": 0.6789968274533749, + "reward_before_mean": 0.7261254452168941, + "reward_before_std": 0.6610924564301968, + "reward_change_max": 0.0, + "reward_change_mean": -0.12526203296147287, + "reward_change_min": -0.20569150894880295, + "reward_change_std": 0.07578376494348049, + "reward_std": 0.6789968274533749, + "rewards/cosine_scaled_reward": -0.05360395833849907, + "rewards/format_reward": 0.8333333488553762, + "step": 254 + }, + { + "advantage_max": 1.5857353210449219, + "advantage_mean": -2.4835269396561444e-08, + "advantage_min": -0.9731541946530342, + "advantage_std": 0.9997934550046921, + "completion_length": 2176.4166870117188, + "epoch": 0.2914285714285714, + "grad_norm": 0.25210484862327576, + "kl": 0.009918212890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0004, + "reward": 0.2443622061982751, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2443622061982751, + "reward_after_std": 0.6697471439838409, + "reward_before_mean": 0.3375973515212536, + "reward_before_std": 0.6620416026562452, + "reward_change_max": 5.364418029785156e-06, + "reward_change_mean": -0.09323514788411558, + "reward_change_min": -0.16571150813251734, + "reward_change_std": 0.061854652129113674, + "reward_std": 0.6697471439838409, + "rewards/cosine_scaled_reward": -0.164534667506814, + "rewards/format_reward": 0.6666666679084301, + "step": 255 + }, + { + "advantage_max": 1.2646159529685974, + "advantage_mean": -1.1175872338675674e-08, + "advantage_min": -1.523250088095665, + "advantage_std": 0.9998194351792336, + "completion_length": 1493.7500534057617, + "epoch": 0.2925714285714286, + "grad_norm": 0.32058438658714294, + "kl": 0.009937286376953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0004, + "reward": 0.7117386423051357, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7117386423051357, + "reward_after_std": 0.7076647616922855, + "reward_before_mean": 0.8520172564312816, + "reward_before_std": 0.7215098179876804, + "reward_change_max": 0.0, + "reward_change_mean": -0.14027859549969435, + "reward_change_min": -0.24180615320801735, + "reward_change_std": 0.09425603970885277, + "reward_std": 0.7076647914946079, + "rewards/cosine_scaled_reward": -0.011491380631923676, + "rewards/format_reward": 0.8750000223517418, + "step": 256 + }, + { + "advantage_max": 1.4877362996339798, + "advantage_mean": -4.967052102955449e-09, + "advantage_min": -1.1290940716862679, + "advantage_std": 0.9998562559485435, + "completion_length": 2158.229263305664, + "epoch": 0.2937142857142857, + "grad_norm": 0.264924556016922, + "kl": 0.00928497314453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0004, + "reward": 0.9049882646650076, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9049882646650076, + "reward_after_std": 0.8671839684247971, + "reward_before_mean": 1.0569295436143875, + "reward_before_std": 0.8683264292776585, + "reward_change_max": 0.0, + "reward_change_mean": -0.15194128267467022, + "reward_change_min": -0.26315687224268913, + "reward_change_std": 0.10091545199975371, + "reward_std": 0.8671840019524097, + "rewards/cosine_scaled_reward": 0.15346478174615186, + "rewards/format_reward": 0.7500000055879354, + "step": 257 + }, + { + "advantage_max": 1.61009082198143, + "advantage_mean": -3.9736431700632124e-08, + "advantage_min": -1.1207184195518494, + "advantage_std": 0.9997821226716042, + "completion_length": 1955.9375534057617, + "epoch": 0.2948571428571429, + "grad_norm": 0.2173985093832016, + "kl": 0.00830078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0003, + "reward": 0.3813736569136381, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3813736569136381, + "reward_after_std": 0.7218630239367485, + "reward_before_mean": 0.4860916808247566, + "reward_before_std": 0.7180778924375772, + "reward_change_max": 0.00016684085130691528, + "reward_change_mean": -0.10471805580891669, + "reward_change_min": -0.18698874861001968, + "reward_change_std": 0.07169946609064937, + "reward_std": 0.7218630462884903, + "rewards/cosine_scaled_reward": -0.10070415772497654, + "rewards/format_reward": 0.6875000074505806, + "step": 258 + }, + { + "advantage_max": 1.4760468155145645, + "advantage_mean": -7.078051778020011e-08, + "advantage_min": -1.27405995875597, + "advantage_std": 0.9997367337346077, + "completion_length": 1573.7916984558105, + "epoch": 0.296, + "grad_norm": 0.29860401153564453, + "kl": 0.010494232177734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0004, + "reward": 0.7439992446452379, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7439992446452379, + "reward_after_std": 0.6591152455657721, + "reward_before_mean": 0.8863520976155996, + "reward_before_std": 0.6521930769085884, + "reward_change_max": 0.0, + "reward_change_mean": -0.14235285948961973, + "reward_change_min": -0.22541704028844833, + "reward_change_std": 0.09098386578261852, + "reward_std": 0.659115256741643, + "rewards/cosine_scaled_reward": 0.016092704609036446, + "rewards/format_reward": 0.854166679084301, + "step": 259 + }, + { + "advantage_max": 1.50406713783741, + "advantage_mean": -6.022552734297193e-08, + "advantage_min": -1.1367171704769135, + "advantage_std": 0.999772198498249, + "completion_length": 1268.145851135254, + "epoch": 0.29714285714285715, + "grad_norm": 0.3464038372039795, + "kl": 0.009735107421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0004, + "reward": 0.9734570910222828, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9734570910222828, + "reward_after_std": 0.6558955330401659, + "reward_before_mean": 1.1375967441126704, + "reward_before_std": 0.6387905618175864, + "reward_change_max": 0.0003156587481498718, + "reward_change_mean": -0.16413968708366156, + "reward_change_min": -0.26056714355945587, + "reward_change_std": 0.10325025115162134, + "reward_std": 0.6558955684304237, + "rewards/cosine_scaled_reward": 0.1625483650714159, + "rewards/format_reward": 0.8125000111758709, + "step": 260 + }, + { + "advantage_max": 1.6002927124500275, + "advantage_mean": 6.208817460162663e-09, + "advantage_min": -1.0500682145357132, + "advantage_std": 0.9997867494821548, + "completion_length": 2184.229179382324, + "epoch": 0.29828571428571427, + "grad_norm": 0.20467397570610046, + "kl": 0.009990692138671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0004, + "reward": 0.2399905025959015, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2399905025959015, + "reward_after_std": 0.6328884586691856, + "reward_before_mean": 0.3335062563419342, + "reward_before_std": 0.6211761645972729, + "reward_change_max": 7.826089859008789e-05, + "reward_change_mean": -0.09351573511958122, + "reward_change_min": -0.16378842666745186, + "reward_change_std": 0.060065632220357656, + "reward_std": 0.6328884772956371, + "rewards/cosine_scaled_reward": -0.1457468868829892, + "rewards/format_reward": 0.6250000055879354, + "step": 261 + }, + { + "advantage_max": 1.3524987325072289, + "advantage_mean": -1.6453366669111347e-08, + "advantage_min": -1.279630459845066, + "advantage_std": 0.9997990727424622, + "completion_length": 1722.2500686645508, + "epoch": 0.29942857142857143, + "grad_norm": 0.32207873463630676, + "kl": 0.012050628662109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0005, + "reward": 0.26394235249608755, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.26394235249608755, + "reward_after_std": 0.6132156141102314, + "reward_before_mean": 0.3629047591239214, + "reward_before_std": 0.6177421398460865, + "reward_change_max": 0.0011852085590362549, + "reward_change_mean": -0.09896239778026938, + "reward_change_min": -0.176782650873065, + "reward_change_std": 0.06817787745967507, + "reward_std": 0.6132156550884247, + "rewards/cosine_scaled_reward": -0.18313096463680267, + "rewards/format_reward": 0.7291666734963655, + "step": 262 + }, + { + "advantage_max": 1.4976128190755844, + "advantage_mean": -1.8626453490711015e-08, + "advantage_min": -1.0512003675103188, + "advantage_std": 0.9997568875551224, + "completion_length": 1497.9167175292969, + "epoch": 0.30057142857142854, + "grad_norm": 0.2446172684431076, + "kl": 0.006046295166015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0002, + "reward": 0.45462223142385483, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.45462223142385483, + "reward_after_std": 0.5269899610430002, + "reward_before_mean": 0.5730956010520458, + "reward_before_std": 0.5175444334745407, + "reward_change_max": 0.0, + "reward_change_mean": -0.11847336497157812, + "reward_change_min": -0.20368537306785583, + "reward_change_std": 0.07308742869645357, + "reward_std": 0.5269899740815163, + "rewards/cosine_scaled_reward": -0.1926188673824072, + "rewards/format_reward": 0.9583333358168602, + "step": 263 + }, + { + "advantage_max": 1.5773601084947586, + "advantage_mean": -1.4901161971003773e-08, + "advantage_min": -1.1340029016137123, + "advantage_std": 0.9998199939727783, + "completion_length": 1489.395881652832, + "epoch": 0.3017142857142857, + "grad_norm": 0.24546962976455688, + "kl": 0.00830841064453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0003, + "reward": 0.43727186508476734, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.43727186508476734, + "reward_after_std": 0.715681679546833, + "reward_before_mean": 0.5463062226772308, + "reward_before_std": 0.7034594938158989, + "reward_change_max": 0.0, + "reward_change_mean": -0.10903437249362469, + "reward_change_min": -0.18007494881749153, + "reward_change_std": 0.06830112263560295, + "reward_std": 0.7156816907227039, + "rewards/cosine_scaled_reward": -0.15393022983334959, + "rewards/format_reward": 0.8541666772216558, + "step": 264 + }, + { + "advantage_max": 1.5012609884142876, + "advantage_mean": -4.967053435223079e-09, + "advantage_min": -1.0871346518397331, + "advantage_std": 0.9997839033603668, + "completion_length": 1440.5833892822266, + "epoch": 0.3028571428571429, + "grad_norm": 0.25864601135253906, + "kl": 0.008214950561523438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0003, + "reward": 0.7960514797596261, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7960514797596261, + "reward_after_std": 0.747637789696455, + "reward_before_mean": 0.9413503212854266, + "reward_before_std": 0.7453129384666681, + "reward_change_max": 0.0, + "reward_change_mean": -0.14529882464557886, + "reward_change_min": -0.24091583769768476, + "reward_change_std": 0.09140054974704981, + "reward_std": 0.7476377971470356, + "rewards/cosine_scaled_reward": 0.0019251517951488495, + "rewards/format_reward": 0.9375000074505806, + "step": 265 + }, + { + "advantage_max": 1.3270168602466583, + "advantage_mean": -2.6697914656814703e-08, + "advantage_min": -1.3128254860639572, + "advantage_std": 0.9998119845986366, + "completion_length": 1821.958381652832, + "epoch": 0.304, + "grad_norm": 0.2732686698436737, + "kl": 0.01064300537109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0004, + "reward": 0.5604800856672227, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5604800856672227, + "reward_after_std": 0.672345083206892, + "reward_before_mean": 0.6874868590384722, + "reward_before_std": 0.6820431463420391, + "reward_change_max": 0.0002983957529067993, + "reward_change_mean": -0.12700679013505578, + "reward_change_min": -0.21464761439710855, + "reward_change_std": 0.08562309807166457, + "reward_std": 0.6723450906574726, + "rewards/cosine_scaled_reward": -0.05208991654217243, + "rewards/format_reward": 0.7916666772216558, + "step": 266 + }, + { + "advantage_max": 1.5514462441205978, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -1.1182539835572243, + "advantage_std": 0.999804675579071, + "completion_length": 2261.3958892822266, + "epoch": 0.30514285714285716, + "grad_norm": 0.25328734517097473, + "kl": 0.013637542724609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0005, + "reward": 0.20815421640872955, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.20815421640872955, + "reward_after_std": 0.6557923518121243, + "reward_before_mean": 0.29744547605514526, + "reward_before_std": 0.6443983167409897, + "reward_change_max": 0.00010949373245239258, + "reward_change_mean": -0.08929126942530274, + "reward_change_min": -0.15922965481877327, + "reward_change_std": 0.057676469441503286, + "reward_std": 0.655792374163866, + "rewards/cosine_scaled_reward": -0.17419393052114174, + "rewards/format_reward": 0.6458333432674408, + "step": 267 + }, + { + "advantage_max": 1.5531953871250153, + "advantage_mean": -1.9868215850316062e-08, + "advantage_min": -0.9885692149400711, + "advantage_std": 0.9998269900679588, + "completion_length": 1502.3542022705078, + "epoch": 0.3062857142857143, + "grad_norm": 0.3057520091533661, + "kl": 0.014217376708984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0006, + "reward": 0.5444807633757591, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5444807633757591, + "reward_after_std": 0.8374427780508995, + "reward_before_mean": 0.6610086187720299, + "reward_before_std": 0.8377688638865948, + "reward_change_max": 0.00028542429208755493, + "reward_change_mean": -0.11652784794569016, + "reward_change_min": -0.23292355239391327, + "reward_change_std": 0.0859985020942986, + "reward_std": 0.837442796677351, + "rewards/cosine_scaled_reward": -0.09657904086634517, + "rewards/format_reward": 0.8541666753590107, + "step": 268 + }, + { + "advantage_max": 1.3833930268883705, + "advantage_mean": 2.980232349791834e-08, + "advantage_min": -1.3209297060966492, + "advantage_std": 0.9998177289962769, + "completion_length": 1695.3750457763672, + "epoch": 0.30742857142857144, + "grad_norm": 0.2443539798259735, + "kl": 0.00933074951171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0004, + "reward": 0.4807785237208009, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4807785237208009, + "reward_after_std": 0.6509964242577553, + "reward_before_mean": 0.5990621582604945, + "reward_before_std": 0.6480329409241676, + "reward_change_max": 0.0, + "reward_change_mean": -0.11828359961509705, + "reward_change_min": -0.2115028277039528, + "reward_change_std": 0.08078650198876858, + "reward_std": 0.6509964615106583, + "rewards/cosine_scaled_reward": -0.07546893320977688, + "rewards/format_reward": 0.7500000149011612, + "step": 269 + }, + { + "advantage_max": 1.6883545815944672, + "advantage_mean": -1.9557773400791234e-08, + "advantage_min": -0.9836084470152855, + "advantage_std": 0.999851755797863, + "completion_length": 1690.3333892822266, + "epoch": 0.30857142857142855, + "grad_norm": 0.21359197795391083, + "kl": 0.009889602661132812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0004, + "reward": 0.8308562897145748, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8308562897145748, + "reward_after_std": 0.8897613398730755, + "reward_before_mean": 0.9719878695905209, + "reward_before_std": 0.8724946435540915, + "reward_change_max": 0.0002977624535560608, + "reward_change_mean": -0.1411315887235105, + "reward_change_min": -0.23690782114863396, + "reward_change_std": 0.09405350359156728, + "reward_std": 0.8897613659501076, + "rewards/cosine_scaled_reward": 0.04849394381744787, + "rewards/format_reward": 0.8750000074505806, + "step": 270 + }, + { + "advantage_max": 1.497256375849247, + "advantage_mean": -1.2728075704515618e-07, + "advantage_min": -1.2109168618917465, + "advantage_std": 0.999859169125557, + "completion_length": 1477.8958740234375, + "epoch": 0.3097142857142857, + "grad_norm": 0.5373415350914001, + "kl": 0.014141082763671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0006, + "reward": 1.2130113132297993, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.2130113132297993, + "reward_after_std": 0.9313858449459076, + "reward_before_mean": 1.3908941932022572, + "reward_before_std": 0.9274521321058273, + "reward_change_max": 0.0, + "reward_change_mean": -0.1778829018585384, + "reward_change_min": -0.2849385794252157, + "reward_change_std": 0.11208993848413229, + "reward_std": 0.9313858933746815, + "rewards/cosine_scaled_reward": 0.268363754323218, + "rewards/format_reward": 0.8541666828095913, + "step": 271 + }, + { + "advantage_max": 1.5334807932376862, + "advantage_mean": -2.3903947654613233e-08, + "advantage_min": -1.0556566417217255, + "advantage_std": 0.9998181089758873, + "completion_length": 1630.1042022705078, + "epoch": 0.31085714285714283, + "grad_norm": 0.2611698508262634, + "kl": 0.0101165771484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0004, + "reward": 0.5528686475008726, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5528686475008726, + "reward_after_std": 0.7255290150642395, + "reward_before_mean": 0.6753423325717449, + "reward_before_std": 0.7237700335681438, + "reward_change_max": 0.00016976892948150635, + "reward_change_mean": -0.12247373699210584, + "reward_change_min": -0.2163546048104763, + "reward_change_std": 0.08112134877592325, + "reward_std": 0.7255290448665619, + "rewards/cosine_scaled_reward": -0.08941216330276802, + "rewards/format_reward": 0.8541666716337204, + "step": 272 + }, + { + "advantage_max": 1.5685235261917114, + "advantage_mean": -2.142041988228982e-08, + "advantage_min": -1.220929853618145, + "advantage_std": 0.9997903630137444, + "completion_length": 1454.270896911621, + "epoch": 0.312, + "grad_norm": 0.2637239694595337, + "kl": 0.01152801513671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0005, + "reward": 0.7724597938358784, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7724597938358784, + "reward_after_std": 0.6019731983542442, + "reward_before_mean": 0.917079396545887, + "reward_before_std": 0.5814417097717524, + "reward_change_max": 0.00026979297399520874, + "reward_change_mean": -0.1446195626631379, + "reward_change_min": -0.21904787234961987, + "reward_change_std": 0.08566631795838475, + "reward_std": 0.601973220705986, + "rewards/cosine_scaled_reward": 0.03145633079111576, + "rewards/format_reward": 0.8541666753590107, + "step": 273 + }, + { + "advantage_max": 1.531281739473343, + "advantage_mean": -5.4637593560613595e-08, + "advantage_min": -0.9971391409635544, + "advantage_std": 0.9998548924922943, + "completion_length": 1111.5833587646484, + "epoch": 0.31314285714285717, + "grad_norm": 0.3014761507511139, + "kl": 0.011322021484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0005, + "reward": 1.0097886063158512, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.0097886063158512, + "reward_after_std": 0.8508143164217472, + "reward_before_mean": 1.169247966259718, + "reward_before_std": 0.8384442552924156, + "reward_change_max": 0.0, + "reward_change_mean": -0.1594593795016408, + "reward_change_min": -0.2814331278204918, + "reward_change_std": 0.09933338640257716, + "reward_std": 0.8508143350481987, + "rewards/cosine_scaled_reward": 0.09504064894281328, + "rewards/format_reward": 0.9791666716337204, + "step": 274 + }, + { + "advantage_max": 1.46129010617733, + "advantage_mean": 3.725290853573426e-09, + "advantage_min": -1.2289179787039757, + "advantage_std": 0.9998204484581947, + "completion_length": 1571.020866394043, + "epoch": 0.3142857142857143, + "grad_norm": 0.18821187317371368, + "kl": 0.00789642333984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.5e-07, + "loss": 0.0003, + "reward": 1.1050619557499886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.1050619557499886, + "reward_after_std": 0.6740875914692879, + "reward_before_mean": 1.2809049189090729, + "reward_before_std": 0.6542658880352974, + "reward_change_max": 0.0001346990466117859, + "reward_change_mean": -0.1758428937755525, + "reward_change_min": -0.2756085656583309, + "reward_change_std": 0.10608048643916845, + "reward_std": 0.6740876249969006, + "rewards/cosine_scaled_reward": 0.2342024319805205, + "rewards/format_reward": 0.8125000149011612, + "step": 275 + }, + { + "advantage_max": 1.4276919662952423, + "advantage_mean": -9.002785084089027e-08, + "advantage_min": -1.2498956099152565, + "advantage_std": 0.9998070895671844, + "completion_length": 1362.5208740234375, + "epoch": 0.31542857142857145, + "grad_norm": 0.2566883862018585, + "kl": 0.01116180419921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0004, + "reward": 0.8555725496262312, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8555725496262312, + "reward_after_std": 0.6251602806150913, + "reward_before_mean": 1.0092867035418749, + "reward_before_std": 0.6183755993843079, + "reward_change_max": 0.0001868903636932373, + "reward_change_mean": -0.15371418604627252, + "reward_change_min": -0.24152507819235325, + "reward_change_std": 0.09109621308743954, + "reward_std": 0.6251602955162525, + "rewards/cosine_scaled_reward": 0.0671433275565505, + "rewards/format_reward": 0.8750000055879354, + "step": 276 + }, + { + "advantage_max": 1.3485763520002365, + "advantage_mean": -4.47034849138106e-08, + "advantage_min": -1.2358265295624733, + "advantage_std": 0.9997855946421623, + "completion_length": 1636.8750381469727, + "epoch": 0.31657142857142856, + "grad_norm": 0.38714054226875305, + "kl": 0.015163421630859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0006, + "reward": 0.8174463622272015, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8174463622272015, + "reward_after_std": 0.5525830537080765, + "reward_before_mean": 0.969972088932991, + "reward_before_std": 0.5424490142613649, + "reward_change_max": 0.0, + "reward_change_mean": -0.1525257029570639, + "reward_change_min": -0.228757094591856, + "reward_change_std": 0.08871490228921175, + "reward_std": 0.5525830574333668, + "rewards/cosine_scaled_reward": 0.08915269374847412, + "rewards/format_reward": 0.7916666716337204, + "step": 277 + }, + { + "advantage_max": 1.5771578699350357, + "advantage_mean": -5.836288130556255e-08, + "advantage_min": -1.211281694471836, + "advantage_std": 0.9996704533696175, + "completion_length": 1442.1042022705078, + "epoch": 0.3177142857142857, + "grad_norm": 0.33007821440696716, + "kl": 0.009634017944335938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0004, + "reward": 0.8426887975074351, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8426887975074351, + "reward_after_std": 0.4258726928383112, + "reward_before_mean": 0.9998213611543179, + "reward_before_std": 0.3926746714860201, + "reward_change_max": 0.0, + "reward_change_mean": -0.15713255293667316, + "reward_change_min": -0.2242979882284999, + "reward_change_std": 0.08671380672603846, + "reward_std": 0.42587270215153694, + "rewards/cosine_scaled_reward": 0.07282732427120209, + "rewards/format_reward": 0.8541666679084301, + "step": 278 + }, + { + "advantage_max": 1.3795539736747742, + "advantage_mean": -6.643434363740042e-08, + "advantage_min": -1.1235552951693535, + "advantage_std": 0.9998089745640755, + "completion_length": 1575.3333892822266, + "epoch": 0.31885714285714284, + "grad_norm": 0.28037822246551514, + "kl": 0.0097198486328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0004, + "reward": 0.6290068812668324, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.6290068812668324, + "reward_after_std": 0.6324650943279266, + "reward_before_mean": 0.7617004364728928, + "reward_before_std": 0.6272139437496662, + "reward_change_max": 0.0, + "reward_change_mean": -0.13269356172531843, + "reward_change_min": -0.22322656959295273, + "reward_change_std": 0.08410935197025537, + "reward_std": 0.6324650980532169, + "rewards/cosine_scaled_reward": -0.06706646271049976, + "rewards/format_reward": 0.8958333507180214, + "step": 279 + }, + { + "advantage_max": 1.4074894785881042, + "advantage_mean": -6.332993707225398e-08, + "advantage_min": -1.261429451406002, + "advantage_std": 0.9998380243778229, + "completion_length": 1622.937515258789, + "epoch": 0.32, + "grad_norm": 0.29624009132385254, + "kl": 0.011272430419921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0005, + "reward": 1.2476790957152843, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.2476790957152843, + "reward_after_std": 0.8106604292988777, + "reward_before_mean": 1.434355091303587, + "reward_before_std": 0.810054725036025, + "reward_change_max": 0.0, + "reward_change_mean": -0.18667601607739925, + "reward_change_min": -0.30536388978362083, + "reward_change_std": 0.12097588926553726, + "reward_std": 0.8106604591012001, + "rewards/cosine_scaled_reward": 0.2796775340102613, + "rewards/format_reward": 0.875, + "step": 280 + }, + { + "advantage_max": 1.5743074342608452, + "advantage_mean": 7.450580374879223e-09, + "advantage_min": -1.2013270854949951, + "advantage_std": 0.9998085796833038, + "completion_length": 2353.2708587646484, + "epoch": 0.3211428571428571, + "grad_norm": 0.24173757433891296, + "kl": 0.01459503173828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0006, + "reward": 0.36519142519682646, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36519142519682646, + "reward_after_std": 0.7732009813189507, + "reward_before_mean": 0.4665646404027939, + "reward_before_std": 0.7668895106762648, + "reward_change_max": 9.001791477203369e-05, + "reward_change_mean": -0.10137320728972554, + "reward_change_min": -0.18519249744713306, + "reward_change_std": 0.07093417691066861, + "reward_std": 0.7732009924948215, + "rewards/cosine_scaled_reward": -0.047967685444746166, + "rewards/format_reward": 0.5625000018626451, + "step": 281 + }, + { + "advantage_max": 1.4620432406663895, + "advantage_mean": -2.0489098639941972e-08, + "advantage_min": -1.1684705764055252, + "advantage_std": 0.9997751787304878, + "completion_length": 1570.333396911621, + "epoch": 0.3222857142857143, + "grad_norm": 0.25614941120147705, + "kl": 0.01031494140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0004, + "reward": 0.73689816147089, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.73689816147089, + "reward_after_std": 0.5660558789968491, + "reward_before_mean": 0.8810554593801498, + "reward_before_std": 0.5520604215562344, + "reward_change_max": 0.0, + "reward_change_mean": -0.14415732212364674, + "reward_change_min": -0.23753152042627335, + "reward_change_std": 0.08675737353041768, + "reward_std": 0.566055903211236, + "rewards/cosine_scaled_reward": 0.02386106736958027, + "rewards/format_reward": 0.8333333414047956, + "step": 282 + }, + { + "advantage_max": 1.3014950826764107, + "advantage_mean": -3.756334410187634e-08, + "advantage_min": -1.3910346552729607, + "advantage_std": 0.9998150020837784, + "completion_length": 2028.458366394043, + "epoch": 0.32342857142857145, + "grad_norm": 0.30256274342536926, + "kl": 0.01206207275390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0005, + "reward": 0.7930786944925785, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7930786944925785, + "reward_after_std": 0.7125616930425167, + "reward_before_mean": 0.9397026058286428, + "reward_before_std": 0.7213702015578747, + "reward_change_max": 0.0005670338869094849, + "reward_change_mean": -0.14662390458397567, + "reward_change_min": -0.24198182485997677, + "reward_change_std": 0.09706047433428466, + "reward_std": 0.7125617042183876, + "rewards/cosine_scaled_reward": 0.09485127124935389, + "rewards/format_reward": 0.7500000093132257, + "step": 283 + }, + { + "advantage_max": 1.5812103599309921, + "advantage_mean": -1.490116185998147e-08, + "advantage_min": -1.0278535932302475, + "advantage_std": 0.9998149424791336, + "completion_length": 1272.0417098999023, + "epoch": 0.32457142857142857, + "grad_norm": 0.2702529728412628, + "kl": 0.01029205322265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0004, + "reward": 0.47102506645023823, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.47102506645023823, + "reward_after_std": 0.6693096347153187, + "reward_before_mean": 0.5846154130995274, + "reward_before_std": 0.6555521786212921, + "reward_change_max": 0.0, + "reward_change_mean": -0.11359035596251488, + "reward_change_min": -0.20026837661862373, + "reward_change_std": 0.07205808768048882, + "reward_std": 0.6693096496164799, + "rewards/cosine_scaled_reward": -0.13477563112974167, + "rewards/format_reward": 0.8541666716337204, + "step": 284 + }, + { + "advantage_max": 1.6508907973766327, + "advantage_mean": -3.228585088166369e-08, + "advantage_min": -1.0195088982582092, + "advantage_std": 0.9997524991631508, + "completion_length": 1204.8750305175781, + "epoch": 0.32571428571428573, + "grad_norm": 0.31728941202163696, + "kl": 0.01241302490234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0005, + "reward": 0.7143117673695087, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7143117673695087, + "reward_after_std": 0.6237810291349888, + "reward_before_mean": 0.8525383183732629, + "reward_before_std": 0.5971423909068108, + "reward_change_max": 0.0, + "reward_change_mean": -0.1382265416905284, + "reward_change_min": -0.21934391558170319, + "reward_change_std": 0.08337379200384021, + "reward_std": 0.6237810496240854, + "rewards/cosine_scaled_reward": -0.03206418454647064, + "rewards/format_reward": 0.916666679084301, + "step": 285 + }, + { + "advantage_max": 1.6236571073532104, + "advantage_mean": -4.967053657267684e-09, + "advantage_min": -0.982652448117733, + "advantage_std": 0.9998085200786591, + "completion_length": 1341.4792022705078, + "epoch": 0.32685714285714285, + "grad_norm": 0.3013690710067749, + "kl": 0.01345062255859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0005, + "reward": 0.5443090852349997, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.5443090852349997, + "reward_after_std": 0.6821228787302971, + "reward_before_mean": 0.6657579094171524, + "reward_before_std": 0.6722039449959993, + "reward_change_max": 0.0, + "reward_change_mean": -0.12144879251718521, + "reward_change_min": -0.21625848673284054, + "reward_change_std": 0.07772636087611318, + "reward_std": 0.682122889906168, + "rewards/cosine_scaled_reward": -0.13587106950581074, + "rewards/format_reward": 0.9375000074505806, + "step": 286 + }, + { + "advantage_max": 1.6009186208248138, + "advantage_mean": 2.6077033421501028e-08, + "advantage_min": -1.0167246609926224, + "advantage_std": 0.9997030347585678, + "completion_length": 1400.854175567627, + "epoch": 0.328, + "grad_norm": 0.333176851272583, + "kl": 0.0135955810546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0005, + "reward": 0.6055421698838472, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6055421698838472, + "reward_after_std": 0.6879148874431849, + "reward_before_mean": 0.7314797258004546, + "reward_before_std": 0.6727271899580956, + "reward_change_max": 3.1247735023498535e-05, + "reward_change_mean": -0.1259375517256558, + "reward_change_min": -0.21384302619844675, + "reward_change_std": 0.07986640278249979, + "reward_std": 0.6879148911684752, + "rewards/cosine_scaled_reward": 0.01157319126650691, + "rewards/format_reward": 0.7083333358168602, + "step": 287 + }, + { + "advantage_max": 1.5019276440143585, + "advantage_mean": -4.2530398758344745e-08, + "advantage_min": -1.243957407772541, + "advantage_std": 0.9997709915041924, + "completion_length": 1464.291732788086, + "epoch": 0.3291428571428571, + "grad_norm": 0.24288351833820343, + "kl": 0.00968170166015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0004, + "reward": 0.5588183682411909, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5588183682411909, + "reward_after_std": 0.5841691084206104, + "reward_before_mean": 0.6852227933704853, + "reward_before_std": 0.5734246857464314, + "reward_change_max": 0.0, + "reward_change_mean": -0.12640444561839104, + "reward_change_min": -0.19520024210214615, + "reward_change_std": 0.07665029587224126, + "reward_std": 0.584169115871191, + "rewards/cosine_scaled_reward": -0.12613860587589443, + "rewards/format_reward": 0.9375000074505806, + "step": 288 + }, + { + "advantage_max": 1.6838382929563522, + "advantage_mean": -5.774200317887335e-08, + "advantage_min": -1.0431829616427422, + "advantage_std": 0.9997774288058281, + "completion_length": 1470.9792137145996, + "epoch": 0.3302857142857143, + "grad_norm": 0.4302460253238678, + "kl": 0.015533447265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0006, + "reward": 0.6976730767637491, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6976730767637491, + "reward_after_std": 0.5832541156560183, + "reward_before_mean": 0.8337808940559626, + "reward_before_std": 0.5487283486872911, + "reward_change_max": 0.0005718618631362915, + "reward_change_mean": -0.13610781356692314, + "reward_change_min": -0.2004237212240696, + "reward_change_std": 0.07571555068716407, + "reward_std": 0.5832541491836309, + "rewards/cosine_scaled_reward": 0.00022377073764801025, + "rewards/format_reward": 0.8333333414047956, + "step": 289 + }, + { + "advantage_max": 1.4299821257591248, + "advantage_mean": 1.7384689798838338e-08, + "advantage_min": -1.2245251014828682, + "advantage_std": 0.999837800860405, + "completion_length": 1135.0625457763672, + "epoch": 0.3314285714285714, + "grad_norm": 0.29410240054130554, + "kl": 0.011016845703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0004, + "reward": 1.0313334502279758, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0313334502279758, + "reward_after_std": 0.8016556054353714, + "reward_before_mean": 1.1964529231190681, + "reward_before_std": 0.7968671545386314, + "reward_change_max": 0.0, + "reward_change_mean": -0.16511950362473726, + "reward_change_min": -0.2797320708632469, + "reward_change_std": 0.10460527054965496, + "reward_std": 0.801655612885952, + "rewards/cosine_scaled_reward": 0.12947646714746952, + "rewards/format_reward": 0.9375, + "step": 290 + }, + { + "advantage_max": 1.6369030177593231, + "advantage_mean": -5.494803323458086e-08, + "advantage_min": -1.0549881234765053, + "advantage_std": 0.9998100474476814, + "completion_length": 1349.6041870117188, + "epoch": 0.3325714285714286, + "grad_norm": 0.2570025324821472, + "kl": 0.0112152099609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0004, + "reward": 0.8352435231208801, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8352435231208801, + "reward_after_std": 0.7465978749096394, + "reward_before_mean": 0.980657309293747, + "reward_before_std": 0.7230622582137585, + "reward_change_max": 0.0, + "reward_change_mean": -0.14541381038725376, + "reward_change_min": -0.23240702971816063, + "reward_change_std": 0.08540754904970527, + "reward_std": 0.7465978935360909, + "rewards/cosine_scaled_reward": 0.021578645333647728, + "rewards/format_reward": 0.9375, + "step": 291 + }, + { + "advantage_max": 1.4761302471160889, + "advantage_mean": 9.934107703113426e-09, + "advantage_min": -1.1501412615180016, + "advantage_std": 0.9998109415173531, + "completion_length": 1796.5000457763672, + "epoch": 0.33371428571428574, + "grad_norm": 0.3171732425689697, + "kl": 0.015542984008789062, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0006, + "reward": 0.47690540738403797, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.47690540738403797, + "reward_after_std": 0.8478581756353378, + "reward_before_mean": 0.5877604689449072, + "reward_before_std": 0.8521979209035635, + "reward_change_max": 0.0, + "reward_change_mean": -0.11085503036156297, + "reward_change_min": -0.20827853865921497, + "reward_change_std": 0.08406012458726764, + "reward_std": 0.8478581979870796, + "rewards/cosine_scaled_reward": -0.10195311531424522, + "rewards/format_reward": 0.7916666716337204, + "step": 292 + }, + { + "advantage_max": 1.2476003393530846, + "advantage_mean": -3.601114129114791e-08, + "advantage_min": -1.526408739387989, + "advantage_std": 0.9998006299138069, + "completion_length": 1279.9167098999023, + "epoch": 0.33485714285714285, + "grad_norm": 0.35458728671073914, + "kl": 0.0106201171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0004, + "reward": 0.6448268890380859, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6448268890380859, + "reward_after_std": 0.5995666980743408, + "reward_before_mean": 0.7823301900643855, + "reward_before_std": 0.6108702011406422, + "reward_change_max": 0.0, + "reward_change_mean": -0.13750330917537212, + "reward_change_min": -0.21467959508299828, + "reward_change_std": 0.08796036243438721, + "reward_std": 0.599566712975502, + "rewards/cosine_scaled_reward": -0.06716826558113098, + "rewards/format_reward": 0.916666679084301, + "step": 293 + }, + { + "advantage_max": 1.4925710558891296, + "advantage_mean": 4.035731804297171e-09, + "advantage_min": -1.0893485471606255, + "advantage_std": 0.9998233541846275, + "completion_length": 2047.1875457763672, + "epoch": 0.336, + "grad_norm": 0.31637120246887207, + "kl": 0.017940521240234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0007, + "reward": 0.6296005487674847, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6296005487674847, + "reward_after_std": 0.735579501837492, + "reward_before_mean": 0.7591220289468765, + "reward_before_std": 0.7338115274906158, + "reward_change_max": 3.84598970413208e-05, + "reward_change_mean": -0.1295214667916298, + "reward_change_min": -0.23670313879847527, + "reward_change_std": 0.08958509005606174, + "reward_std": 0.7355795204639435, + "rewards/cosine_scaled_reward": 0.01497767074033618, + "rewards/format_reward": 0.729166679084301, + "step": 294 + }, + { + "advantage_max": 1.6160722076892853, + "advantage_mean": -1.8626462594539817e-09, + "advantage_min": -1.1032218933105469, + "advantage_std": 0.999823197722435, + "completion_length": 1671.166732788086, + "epoch": 0.33714285714285713, + "grad_norm": 0.31535738706588745, + "kl": 0.015350341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0006, + "reward": 0.7431380706839263, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7431380706839263, + "reward_after_std": 0.6490283533930779, + "reward_before_mean": 0.883342670276761, + "reward_before_std": 0.6273828744888306, + "reward_change_max": 0.0, + "reward_change_mean": -0.14020460075698793, + "reward_change_min": -0.22164242342114449, + "reward_change_std": 0.08250434487126768, + "reward_std": 0.6490283794701099, + "rewards/cosine_scaled_reward": 0.06667132629081607, + "rewards/format_reward": 0.7500000093132257, + "step": 295 + }, + { + "advantage_max": 1.403956413269043, + "advantage_mean": -4.656612906384083e-08, + "advantage_min": -1.2638613507151604, + "advantage_std": 0.9998411238193512, + "completion_length": 1765.3125610351562, + "epoch": 0.3382857142857143, + "grad_norm": 0.2900753319263458, + "kl": 0.014621734619140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0006, + "reward": 0.6206641308963299, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6206641308963299, + "reward_after_std": 0.7382680289447308, + "reward_before_mean": 0.7495931871235371, + "reward_before_std": 0.7414855919778347, + "reward_change_max": 0.0, + "reward_change_mean": -0.12892907345667481, + "reward_change_min": -0.22145743668079376, + "reward_change_std": 0.08637001179158688, + "reward_std": 0.7382680363953114, + "rewards/cosine_scaled_reward": -0.04187007714062929, + "rewards/format_reward": 0.8333333432674408, + "step": 296 + }, + { + "advantage_max": 1.5119258910417557, + "advantage_mean": -4.2219957530065244e-08, + "advantage_min": -1.21080182492733, + "advantage_std": 0.9998287558555603, + "completion_length": 1928.7708892822266, + "epoch": 0.3394285714285714, + "grad_norm": 0.2618260085582733, + "kl": 0.01506805419921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0006, + "reward": 0.43504673708230257, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.43504673708230257, + "reward_after_std": 0.7578357718884945, + "reward_before_mean": 0.5440489100292325, + "reward_before_std": 0.7604426443576813, + "reward_change_max": 0.0001793503761291504, + "reward_change_mean": -0.10900220740586519, + "reward_change_min": -0.1914794910699129, + "reward_change_std": 0.07616990571841598, + "reward_std": 0.7578357979655266, + "rewards/cosine_scaled_reward": -0.06130887754261494, + "rewards/format_reward": 0.6666666772216558, + "step": 297 + }, + { + "advantage_max": 1.5896563529968262, + "advantage_mean": -7.761025155872403e-10, + "advantage_min": -1.1579310297966003, + "advantage_std": 0.999824695289135, + "completion_length": 1441.7083740234375, + "epoch": 0.3405714285714286, + "grad_norm": 0.24601784348487854, + "kl": 0.01018524169921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0004, + "reward": 0.7649918240495026, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7649918240495026, + "reward_after_std": 0.7487938515841961, + "reward_before_mean": 0.904022048576735, + "reward_before_std": 0.7324915304780006, + "reward_change_max": 0.0, + "reward_change_mean": -0.13903021812438965, + "reward_change_min": -0.23804667592048645, + "reward_change_std": 0.08871197002008557, + "reward_std": 0.7487938888370991, + "rewards/cosine_scaled_reward": 0.024927678401581943, + "rewards/format_reward": 0.8541666716337204, + "step": 298 + }, + { + "advantage_max": 1.439231514930725, + "advantage_mean": 2.9802322498717615e-08, + "advantage_min": -1.1201618686318398, + "advantage_std": 0.9998530372977257, + "completion_length": 1864.520881652832, + "epoch": 0.3417142857142857, + "grad_norm": 0.4852619767189026, + "kl": 0.019191741943359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0008, + "reward": 0.5580803826451302, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5580803826451302, + "reward_after_std": 0.8004944212734699, + "reward_before_mean": 0.6778634660877287, + "reward_before_std": 0.802312109619379, + "reward_change_max": 0.0004041343927383423, + "reward_change_mean": -0.1197830568999052, + "reward_change_min": -0.22889439016580582, + "reward_change_std": 0.08444441203027964, + "reward_std": 0.8004944249987602, + "rewards/cosine_scaled_reward": -0.05690161604434252, + "rewards/format_reward": 0.7916666716337204, + "step": 299 + }, + { + "advantage_max": 1.560862809419632, + "advantage_mean": -6.208817904251873e-09, + "advantage_min": -1.2231401279568672, + "advantage_std": 0.9997982382774353, + "completion_length": 2020.458366394043, + "epoch": 0.34285714285714286, + "grad_norm": 0.38628125190734863, + "kl": 0.020538330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0008, + "reward": 0.2665316807106137, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.2665316807106137, + "reward_after_std": 0.7272491361945868, + "reward_before_mean": 0.3611962553113699, + "reward_before_std": 0.7304719872772694, + "reward_change_max": 0.00027470290660858154, + "reward_change_mean": -0.09466456901282072, + "reward_change_min": -0.18015100061893463, + "reward_change_std": 0.07119207771029323, + "reward_std": 0.727249139919877, + "rewards/cosine_scaled_reward": -0.13190188258886337, + "rewards/format_reward": 0.625000013038516, + "step": 300 + }, + { + "advantage_max": 1.3870003148913383, + "advantage_mean": -3.290673178391046e-08, + "advantage_min": -1.3263009116053581, + "advantage_std": 0.9997928962111473, + "completion_length": 1702.6250457763672, + "epoch": 0.344, + "grad_norm": 0.3162795901298523, + "kl": 0.021331787109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0009, + "reward": 0.550446767359972, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.550446767359972, + "reward_after_std": 0.6271288879215717, + "reward_before_mean": 0.675824873149395, + "reward_before_std": 0.6298750899732113, + "reward_change_max": 0.0, + "reward_change_mean": -0.12537812907248735, + "reward_change_min": -0.20914898626506329, + "reward_change_std": 0.08048908645287156, + "reward_std": 0.6271288879215717, + "rewards/cosine_scaled_reward": -0.08917090541217476, + "rewards/format_reward": 0.8541666865348816, + "step": 301 + }, + { + "advantage_max": 1.5091819912195206, + "advantage_mean": -6.519258155535113e-08, + "advantage_min": -1.0816588401794434, + "advantage_std": 0.9997987672686577, + "completion_length": 2065.7291984558105, + "epoch": 0.34514285714285714, + "grad_norm": 0.3135543167591095, + "kl": 0.023223876953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0009, + "reward": 0.4963846392929554, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4963846392929554, + "reward_after_std": 0.739615261554718, + "reward_before_mean": 0.6132568791508675, + "reward_before_std": 0.7420662026852369, + "reward_change_max": 0.0004647970199584961, + "reward_change_mean": -0.11687223275657743, + "reward_change_min": -0.22242337465286255, + "reward_change_std": 0.08635350060649216, + "reward_std": 0.7396152671426535, + "rewards/cosine_scaled_reward": 0.00454508513212204, + "rewards/format_reward": 0.6041666753590107, + "step": 302 + }, + { + "advantage_max": 1.5902022868394852, + "advantage_mean": -8.257727046601104e-08, + "advantage_min": -1.086832880973816, + "advantage_std": 0.999818466603756, + "completion_length": 1145.2916793823242, + "epoch": 0.3462857142857143, + "grad_norm": 0.45699557662010193, + "kl": 0.011653900146484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0005, + "reward": 0.9359966441988945, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9359966441988945, + "reward_after_std": 0.7046908959746361, + "reward_before_mean": 1.0924376100301743, + "reward_before_std": 0.6804439015686512, + "reward_change_max": 0.0, + "reward_change_mean": -0.15644100308418274, + "reward_change_min": -0.25441403687000275, + "reward_change_std": 0.09159657126292586, + "reward_std": 0.7046909183263779, + "rewards/cosine_scaled_reward": 0.056635468266904354, + "rewards/format_reward": 0.9791666716337204, + "step": 303 + }, + { + "advantage_max": 1.500855103135109, + "advantage_mean": -1.2417633477035395e-08, + "advantage_min": -1.2662229239940643, + "advantage_std": 0.9998027309775352, + "completion_length": 1537.2916870117188, + "epoch": 0.3474285714285714, + "grad_norm": 0.29932647943496704, + "kl": 0.0164794921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0007, + "reward": 0.5928112086839974, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5928112086839974, + "reward_after_std": 0.669714767485857, + "reward_before_mean": 0.7199195058783516, + "reward_before_std": 0.6636870130896568, + "reward_change_max": 0.0, + "reward_change_mean": -0.12710829265415668, + "reward_change_min": -0.19983693584799767, + "reward_change_std": 0.07841200614348054, + "reward_std": 0.6697147898375988, + "rewards/cosine_scaled_reward": -0.05670691654086113, + "rewards/format_reward": 0.8333333358168602, + "step": 304 + }, + { + "advantage_max": 1.5782776921987534, + "advantage_mean": 7.4505804303903744e-09, + "advantage_min": -1.0678502842783928, + "advantage_std": 0.9998413845896721, + "completion_length": 1363.020866394043, + "epoch": 0.3485714285714286, + "grad_norm": 0.3479137122631073, + "kl": 0.0138092041015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0006, + "reward": 0.5404365761205554, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5404365761205554, + "reward_after_std": 0.7375712133944035, + "reward_before_mean": 0.659806574229151, + "reward_before_std": 0.7277458868920803, + "reward_change_max": 0.00011269748210906982, + "reward_change_mean": -0.11936996411532164, + "reward_change_min": -0.21077474392950535, + "reward_change_std": 0.07811526395380497, + "reward_std": 0.7375712543725967, + "rewards/cosine_scaled_reward": -0.1180134043097496, + "rewards/format_reward": 0.8958333432674408, + "step": 305 + }, + { + "advantage_max": 1.4038033783435822, + "advantage_mean": -3.414849530924968e-08, + "advantage_min": -1.2565943449735641, + "advantage_std": 0.9998504742980003, + "completion_length": 1403.1875228881836, + "epoch": 0.3497142857142857, + "grad_norm": 0.4971572756767273, + "kl": 0.037811279296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0015, + "reward": 1.033497937489301, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.033497937489301, + "reward_after_std": 0.8184244558215141, + "reward_before_mean": 1.1987580558052287, + "reward_before_std": 0.8222080878913403, + "reward_change_max": 0.0, + "reward_change_mean": -0.16526012308895588, + "reward_change_min": -0.2801430709660053, + "reward_change_std": 0.10898207128047943, + "reward_std": 0.8184244707226753, + "rewards/cosine_scaled_reward": 0.2035456746816635, + "rewards/format_reward": 0.7916666753590107, + "step": 306 + }, + { + "advantage_max": 1.546189859509468, + "advantage_mean": -3.849466811978175e-08, + "advantage_min": -0.9951371252536774, + "advantage_std": 0.999823309481144, + "completion_length": 1248.8750381469727, + "epoch": 0.35085714285714287, + "grad_norm": 0.28933218121528625, + "kl": 0.010631561279296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0004, + "reward": 0.6945029981434345, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6945029981434345, + "reward_after_std": 0.8165526390075684, + "reward_before_mean": 0.8262498378753662, + "reward_before_std": 0.8094968199729919, + "reward_change_max": 0.0, + "reward_change_mean": -0.13174681551754475, + "reward_change_min": -0.2479863427579403, + "reward_change_std": 0.08963473793119192, + "reward_std": 0.8165526390075684, + "rewards/cosine_scaled_reward": -0.02437510807067156, + "rewards/format_reward": 0.8750000037252903, + "step": 307 + }, + { + "advantage_max": 1.5855407267808914, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -1.0699032694101334, + "advantage_std": 0.9998343363404274, + "completion_length": 2460.3959197998047, + "epoch": 0.352, + "grad_norm": 0.26973065733909607, + "kl": 0.026277542114257812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0011, + "reward": 0.2941260999068618, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2941260999068618, + "reward_after_std": 0.710686132311821, + "reward_before_mean": 0.3919203635305166, + "reward_before_std": 0.7093313783407211, + "reward_change_max": 0.00016189366579055786, + "reward_change_mean": -0.09779428178444505, + "reward_change_min": -0.17198238987475634, + "reward_change_std": 0.0679063880816102, + "reward_std": 0.7106861583888531, + "rewards/cosine_scaled_reward": -0.10612315125763416, + "rewards/format_reward": 0.604166679084301, + "step": 308 + }, + { + "advantage_max": 1.566720575094223, + "advantage_mean": -2.235174206832724e-08, + "advantage_min": -1.2411763966083527, + "advantage_std": 0.9997689723968506, + "completion_length": 2128.5000762939453, + "epoch": 0.35314285714285715, + "grad_norm": 0.35043805837631226, + "kl": 0.02182769775390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0009, + "reward": 0.3403874337673187, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3403874337673187, + "reward_after_std": 0.5281771644949913, + "reward_before_mean": 0.4473396446555853, + "reward_before_std": 0.5186953172087669, + "reward_change_max": 0.0, + "reward_change_mean": -0.1069522425532341, + "reward_change_min": -0.1729181855916977, + "reward_change_std": 0.06512200646102428, + "reward_std": 0.5281771682202816, + "rewards/cosine_scaled_reward": -0.12008016742765903, + "rewards/format_reward": 0.6875000186264515, + "step": 309 + }, + { + "advantage_max": 1.3736951127648354, + "advantage_mean": -4.4703484802788296e-08, + "advantage_min": -1.4934279769659042, + "advantage_std": 0.9997905939817429, + "completion_length": 1532.770881652832, + "epoch": 0.35428571428571426, + "grad_norm": 0.5251758098602295, + "kl": 0.027561187744140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0011, + "reward": 0.6881696791388094, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6881696791388094, + "reward_after_std": 0.5913071930408478, + "reward_before_mean": 0.8264183476567268, + "reward_before_std": 0.5859585925936699, + "reward_change_max": 0.00013815611600875854, + "reward_change_mean": -0.13824867643415928, + "reward_change_min": -0.22185775637626648, + "reward_change_std": 0.0831848056986928, + "reward_std": 0.5913072191178799, + "rewards/cosine_scaled_reward": -0.024290837347507477, + "rewards/format_reward": 0.8750000074505806, + "step": 310 + }, + { + "advantage_max": 1.3643943965435028, + "advantage_mean": -1.3721486946671746e-07, + "advantage_min": -1.2142458334565163, + "advantage_std": 0.9997890368103981, + "completion_length": 1374.0000381469727, + "epoch": 0.3554285714285714, + "grad_norm": 0.4340468943119049, + "kl": 0.015289306640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0006, + "reward": 0.8373637902550399, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8373637902550399, + "reward_after_std": 0.6764303985983133, + "reward_before_mean": 0.9891079477965832, + "reward_before_std": 0.6704716719686985, + "reward_change_max": 0.00013653188943862915, + "reward_change_mean": -0.15174414590001106, + "reward_change_min": -0.2510115969926119, + "reward_change_std": 0.09827401582151651, + "reward_std": 0.67643041908741, + "rewards/cosine_scaled_reward": 0.0362206120043993, + "rewards/format_reward": 0.9166666716337204, + "step": 311 + }, + { + "advantage_max": 1.7767666429281235, + "advantage_mean": -6.239861538581692e-08, + "advantage_min": -0.9839174374938011, + "advantage_std": 0.9997939020395279, + "completion_length": 1366.6250305175781, + "epoch": 0.3565714285714286, + "grad_norm": 0.2758287489414215, + "kl": 0.016445159912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0007, + "reward": 1.218240201473236, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.218240201473236, + "reward_after_std": 0.7059788070619106, + "reward_before_mean": 1.3989666923880577, + "reward_before_std": 0.6575389942154288, + "reward_change_max": 9.416043758392334e-05, + "reward_change_mean": -0.18072648905217648, + "reward_change_min": -0.27589481323957443, + "reward_change_std": 0.1028918290976435, + "reward_std": 0.7059788201004267, + "rewards/cosine_scaled_reward": 0.282816668972373, + "rewards/format_reward": 0.8333333432674408, + "step": 312 + }, + { + "advantage_max": 1.3789242804050446, + "advantage_mean": -5.712111927902441e-08, + "advantage_min": -1.2307011783123016, + "advantage_std": 0.9997663050889969, + "completion_length": 1910.125015258789, + "epoch": 0.3577142857142857, + "grad_norm": 0.4083651304244995, + "kl": 0.025684356689453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.001, + "reward": 0.6246846728026867, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6246846728026867, + "reward_after_std": 0.7261706329882145, + "reward_before_mean": 0.7556617148220539, + "reward_before_std": 0.7316101198084652, + "reward_change_max": 2.294778823852539e-05, + "reward_change_mean": -0.1309770462103188, + "reward_change_min": -0.2411420177668333, + "reward_change_std": 0.09405340254306793, + "reward_std": 0.7261706478893757, + "rewards/cosine_scaled_reward": 0.05491418088786304, + "rewards/format_reward": 0.6458333432674408, + "step": 313 + }, + { + "advantage_max": 1.4088439345359802, + "advantage_mean": 1.5211602422127157e-08, + "advantage_min": -1.309675395488739, + "advantage_std": 0.9997310861945152, + "completion_length": 1676.4583549499512, + "epoch": 0.3588571428571429, + "grad_norm": 0.2689584493637085, + "kl": 0.0266265869140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0011, + "reward": 0.6452626027166843, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6452626027166843, + "reward_after_std": 0.536408307030797, + "reward_before_mean": 0.7821810264140368, + "reward_before_std": 0.5282867718487978, + "reward_change_max": 0.00013487786054611206, + "reward_change_mean": -0.13691840320825577, + "reward_change_min": -0.21895418595522642, + "reward_change_std": 0.08299481403082609, + "reward_std": 0.5364083256572485, + "rewards/cosine_scaled_reward": 0.03692383784800768, + "rewards/format_reward": 0.7083333432674408, + "step": 314 + }, + { + "advantage_max": 1.4416030943393707, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -1.1885623559355736, + "advantage_std": 0.9998395070433617, + "completion_length": 2093.750045776367, + "epoch": 0.36, + "grad_norm": 0.5383113026618958, + "kl": 0.047454833984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0019, + "reward": 0.42393129877746105, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.42393129877746105, + "reward_after_std": 0.8001789897680283, + "reward_before_mean": 0.5337961576879025, + "reward_before_std": 0.8166777454316616, + "reward_change_max": 1.0460615158081055e-05, + "reward_change_mean": -0.10986482561565936, + "reward_change_min": -0.2027928214520216, + "reward_change_std": 0.08343219291418791, + "reward_std": 0.8001790381968021, + "rewards/cosine_scaled_reward": -0.06643527187407017, + "rewards/format_reward": 0.6666666809469461, + "step": 315 + }, + { + "advantage_max": 1.5823724269866943, + "advantage_mean": -1.0865429667106241e-09, + "advantage_min": -1.2184803411364555, + "advantage_std": 0.9997778832912445, + "completion_length": 2214.354248046875, + "epoch": 0.36114285714285715, + "grad_norm": 0.6452406048774719, + "kl": 0.03017425537109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0012, + "reward": 0.042976333759725094, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.042976333759725094, + "reward_after_std": 0.5025542117655277, + "reward_before_mean": 0.12296128878369927, + "reward_before_std": 0.4968912973999977, + "reward_change_max": 0.0006083101034164429, + "reward_change_mean": -0.07998494571074843, + "reward_change_min": -0.14237389434129, + "reward_change_std": 0.05578152043744922, + "reward_std": 0.5025542229413986, + "rewards/cosine_scaled_reward": -0.29268604703247547, + "rewards/format_reward": 0.7083333469927311, + "step": 316 + }, + { + "advantage_max": 1.4967581778764725, + "advantage_mean": -7.32640422773656e-08, + "advantage_min": -1.2174543887376785, + "advantage_std": 0.9997687339782715, + "completion_length": 2063.270881652832, + "epoch": 0.36228571428571427, + "grad_norm": 0.46099042892456055, + "kl": 0.03884124755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0016, + "reward": 0.45239776093512774, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.45239776093512774, + "reward_after_std": 0.6172072049230337, + "reward_before_mean": 0.5681051621213555, + "reward_before_std": 0.6142678093165159, + "reward_change_max": 0.00023806840181350708, + "reward_change_mean": -0.11570744588971138, + "reward_change_min": -0.2016570856794715, + "reward_change_std": 0.08009989093989134, + "reward_std": 0.6172072291374207, + "rewards/cosine_scaled_reward": 0.013219265267252922, + "rewards/format_reward": 0.5416666753590107, + "step": 317 + }, + { + "advantage_max": 1.567556545138359, + "advantage_mean": -1.9868215073159945e-08, + "advantage_min": -1.2147565111517906, + "advantage_std": 0.9997292533516884, + "completion_length": 1573.3750457763672, + "epoch": 0.36342857142857143, + "grad_norm": 0.33571857213974, + "kl": 0.038578033447265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0015, + "reward": 0.3722683619707823, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3722683619707823, + "reward_after_std": 0.486986355856061, + "reward_before_mean": 0.48412251146510243, + "reward_before_std": 0.4724613279104233, + "reward_change_max": 0.00047060102224349976, + "reward_change_mean": -0.11185414809733629, + "reward_change_min": -0.17265755124390125, + "reward_change_std": 0.06754804449155927, + "reward_std": 0.486986368894577, + "rewards/cosine_scaled_reward": -0.1641887491568923, + "rewards/format_reward": 0.8125000055879354, + "step": 318 + }, + { + "advantage_max": 1.5715395361185074, + "advantage_mean": -8.071462553882469e-09, + "advantage_min": -1.107521429657936, + "advantage_std": 0.9997441917657852, + "completion_length": 1662.8125457763672, + "epoch": 0.36457142857142855, + "grad_norm": 0.44830116629600525, + "kl": 0.024005889892578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.001, + "reward": 0.41969322599470615, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41969322599470615, + "reward_after_std": 0.5853044390678406, + "reward_before_mean": 0.5322350189089775, + "reward_before_std": 0.5741870794445276, + "reward_change_max": 0.0, + "reward_change_mean": -0.1125417877919972, + "reward_change_min": -0.1796498317271471, + "reward_change_std": 0.07052160147577524, + "reward_std": 0.5853044539690018, + "rewards/cosine_scaled_reward": -0.15054917754605412, + "rewards/format_reward": 0.8333333488553762, + "step": 319 + }, + { + "advantage_max": 1.5654927790164948, + "advantage_mean": -3.0423205343854676e-08, + "advantage_min": -0.9983152002096176, + "advantage_std": 0.9998209252953529, + "completion_length": 1109.3333587646484, + "epoch": 0.3657142857142857, + "grad_norm": 0.4801042079925537, + "kl": 0.0272216796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0011, + "reward": 0.6591540115623502, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6591540115623502, + "reward_after_std": 0.7177729271352291, + "reward_before_mean": 0.7893197052180767, + "reward_before_std": 0.7012372482568026, + "reward_change_max": 0.0, + "reward_change_mean": -0.13016566913574934, + "reward_change_min": -0.22104015946388245, + "reward_change_std": 0.07897943677380681, + "reward_std": 0.7177729494869709, + "rewards/cosine_scaled_reward": -0.06367350154323503, + "rewards/format_reward": 0.9166666716337204, + "step": 320 + }, + { + "advantage_max": 1.5770218819379807, + "advantage_mean": -7.698933512934047e-08, + "advantage_min": -1.113427273929119, + "advantage_std": 0.9998316392302513, + "completion_length": 1142.2292022705078, + "epoch": 0.3668571428571429, + "grad_norm": 0.3772043287754059, + "kl": 0.019496917724609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0008, + "reward": 1.0956639312207699, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.0956639312207699, + "reward_after_std": 0.7559323236346245, + "reward_before_mean": 1.2653162218630314, + "reward_before_std": 0.7302691843360662, + "reward_change_max": 0.0, + "reward_change_mean": -0.16965226829051971, + "reward_change_min": -0.264015831053257, + "reward_change_std": 0.09789514960721135, + "reward_std": 0.755932342261076, + "rewards/cosine_scaled_reward": 0.15349140530452132, + "rewards/format_reward": 0.9583333358168602, + "step": 321 + }, + { + "advantage_max": 1.6221220940351486, + "advantage_mean": -2.6077033199456423e-08, + "advantage_min": -0.9467347487807274, + "advantage_std": 0.9998131617903709, + "completion_length": 1825.3333740234375, + "epoch": 0.368, + "grad_norm": 0.6722792983055115, + "kl": 0.0529022216796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0021, + "reward": 0.15624785982072353, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15624785982072353, + "reward_after_std": 0.6913126781582832, + "reward_before_mean": 0.24026533402502537, + "reward_before_std": 0.6873844414949417, + "reward_change_max": 0.00028805434703826904, + "reward_change_mean": -0.08401747269090265, + "reward_change_min": -0.1541888639330864, + "reward_change_std": 0.06128338072448969, + "reward_std": 0.6913126930594444, + "rewards/cosine_scaled_reward": -0.18195067904889584, + "rewards/format_reward": 0.6041666679084301, + "step": 322 + }, + { + "advantage_max": 1.512984074652195, + "advantage_mean": -2.2662183907229405e-08, + "advantage_min": -1.2236190289258957, + "advantage_std": 0.999837301671505, + "completion_length": 1798.4166946411133, + "epoch": 0.36914285714285716, + "grad_norm": 0.6877778172492981, + "kl": 0.04708099365234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0019, + "reward": 0.39301418559625745, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.39301418559625745, + "reward_after_std": 0.8095175884664059, + "reward_before_mean": 0.49585794657468796, + "reward_before_std": 0.8129806108772755, + "reward_change_max": 8.770078420639038e-05, + "reward_change_mean": -0.10284376982599497, + "reward_change_min": -0.17867287807166576, + "reward_change_std": 0.0713571673259139, + "reward_std": 0.8095176108181477, + "rewards/cosine_scaled_reward": -0.03332102671265602, + "rewards/format_reward": 0.5625000111758709, + "step": 323 + }, + { + "advantage_max": 1.4784416109323502, + "advantage_mean": 3.725290298461914e-09, + "advantage_min": -1.0888723582029343, + "advantage_std": 0.9997969344258308, + "completion_length": 1833.7500381469727, + "epoch": 0.3702857142857143, + "grad_norm": 0.6865483522415161, + "kl": 0.05321502685546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0021, + "reward": 0.2840481363236904, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.2840481363236904, + "reward_after_std": 0.6864129733294249, + "reward_before_mean": 0.38250038865953684, + "reward_before_std": 0.6952138058841228, + "reward_change_max": 2.1405518054962158e-05, + "reward_change_mean": -0.09845226665493101, + "reward_change_min": -0.20211569592356682, + "reward_change_std": 0.074963403865695, + "reward_std": 0.68641297519207, + "rewards/cosine_scaled_reward": -0.1316664731130004, + "rewards/format_reward": 0.6458333488553762, + "step": 324 + }, + { + "advantage_max": 1.582233265042305, + "advantage_mean": -4.159907507350624e-08, + "advantage_min": -0.9999347180128098, + "advantage_std": 0.9998515471816063, + "completion_length": 2004.9167022705078, + "epoch": 0.37142857142857144, + "grad_norm": 0.4835790693759918, + "kl": 0.04077911376953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0016, + "reward": 0.5837226863950491, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5837226863950491, + "reward_after_std": 0.8696581162512302, + "reward_before_mean": 0.7024334259331226, + "reward_before_std": 0.8600073345005512, + "reward_change_max": 0.00013503432273864746, + "reward_change_mean": -0.11871073208749294, + "reward_change_min": -0.22101427800953388, + "reward_change_std": 0.08400990348309278, + "reward_std": 0.8696581199765205, + "rewards/cosine_scaled_reward": -0.013366644561756402, + "rewards/format_reward": 0.7291666753590107, + "step": 325 + }, + { + "advantage_max": 1.466815024614334, + "advantage_mean": -9.934107980669182e-09, + "advantage_min": -1.0336529538035393, + "advantage_std": 0.9998352974653244, + "completion_length": 1691.750057220459, + "epoch": 0.37257142857142855, + "grad_norm": 0.4159682095050812, + "kl": 0.050548553466796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.931425787051832e-07, + "loss": 0.002, + "reward": 0.5287162624299526, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5287162624299526, + "reward_after_std": 0.7326546274125576, + "reward_before_mean": 0.6483225747942924, + "reward_before_std": 0.7302737832069397, + "reward_change_max": 0.0, + "reward_change_mean": -0.11960631795227528, + "reward_change_min": -0.2105935337021947, + "reward_change_std": 0.08047133404761553, + "reward_std": 0.7326546646654606, + "rewards/cosine_scaled_reward": -0.0404220474883914, + "rewards/format_reward": 0.7291666734963655, + "step": 326 + }, + { + "advantage_max": 1.6500040888786316, + "advantage_mean": -7.140140018124796e-08, + "advantage_min": -0.9951028749346733, + "advantage_std": 0.999831348657608, + "completion_length": 1675.0000457763672, + "epoch": 0.3737142857142857, + "grad_norm": 0.4650628864765167, + "kl": 0.031185150146484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0012, + "reward": 0.8487563850358129, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.8487563850358129, + "reward_after_std": 0.6994316130876541, + "reward_before_mean": 0.9964957498013973, + "reward_before_std": 0.6714913509786129, + "reward_change_max": 0.0004935041069984436, + "reward_change_mean": -0.1477393419481814, + "reward_change_min": -0.24240724183619022, + "reward_change_std": 0.09056165255606174, + "reward_std": 0.6994316205382347, + "rewards/cosine_scaled_reward": 0.1128311650827527, + "rewards/format_reward": 0.7708333376795053, + "step": 327 + }, + { + "advantage_max": 1.593785047531128, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -1.053187295794487, + "advantage_std": 0.9997126534581184, + "completion_length": 1876.0208740234375, + "epoch": 0.37485714285714283, + "grad_norm": 0.43940746784210205, + "kl": 0.047298431396484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0019, + "reward": 0.23065691691590473, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23065691691590473, + "reward_after_std": 0.6557473633438349, + "reward_before_mean": 0.3228796450421214, + "reward_before_std": 0.6465305294841528, + "reward_change_max": 0.0001626908779144287, + "reward_change_mean": -0.09222274320200086, + "reward_change_min": -0.16623501293361187, + "reward_change_std": 0.06370436307042837, + "reward_std": 0.6557473940774798, + "rewards/cosine_scaled_reward": -0.19272685050964355, + "rewards/format_reward": 0.7083333414047956, + "step": 328 + }, + { + "advantage_max": 1.5264313220977783, + "advantage_mean": -1.5149514409618092e-07, + "advantage_min": -1.2123200222849846, + "advantage_std": 0.9997549876570702, + "completion_length": 1171.4375495910645, + "epoch": 0.376, + "grad_norm": 0.5207266211509705, + "kl": 0.03118896484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0012, + "reward": 0.9414299409836531, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9414299409836531, + "reward_after_std": 0.6094411462545395, + "reward_before_mean": 1.1017081029713154, + "reward_before_std": 0.5816697841510177, + "reward_change_max": 0.0, + "reward_change_mean": -0.16027818620204926, + "reward_change_min": -0.2311301939189434, + "reward_change_std": 0.09064106363803148, + "reward_std": 0.6094411574304104, + "rewards/cosine_scaled_reward": 0.08210405427962542, + "rewards/format_reward": 0.9375000149011612, + "step": 329 + }, + { + "advantage_max": 1.5140583962202072, + "advantage_mean": -3.476937660007451e-08, + "advantage_min": -1.1497740894556046, + "advantage_std": 0.9998085647821426, + "completion_length": 1699.7292022705078, + "epoch": 0.37714285714285717, + "grad_norm": 0.9815267324447632, + "kl": 0.0795135498046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0032, + "reward": 0.28797438461333513, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.28797438461333513, + "reward_after_std": 0.6291838064789772, + "reward_before_mean": 0.38602944649755955, + "reward_before_std": 0.6264833547174931, + "reward_change_max": 9.696930646896362e-05, + "reward_change_mean": -0.09805506188422441, + "reward_change_min": -0.17832140903919935, + "reward_change_std": 0.06699541071429849, + "reward_std": 0.6291838251054287, + "rewards/cosine_scaled_reward": -0.17156862188130617, + "rewards/format_reward": 0.7291666734963655, + "step": 330 + }, + { + "advantage_max": 1.549956038594246, + "advantage_mean": -6.208817682207268e-09, + "advantage_min": -1.2178971469402313, + "advantage_std": 0.9997905716300011, + "completion_length": 1860.1250686645508, + "epoch": 0.3782857142857143, + "grad_norm": 0.8947551250457764, + "kl": 0.05895233154296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0024, + "reward": 0.31997561175376177, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.31997561175376177, + "reward_after_std": 0.5985409691929817, + "reward_before_mean": 0.4238438168540597, + "reward_before_std": 0.5951807573437691, + "reward_change_max": 0.0001405850052833557, + "reward_change_mean": -0.10386821650899947, + "reward_change_min": -0.1779593052342534, + "reward_change_std": 0.06808420806191862, + "reward_std": 0.5985409840941429, + "rewards/cosine_scaled_reward": -0.07974475575610995, + "rewards/format_reward": 0.583333345130086, + "step": 331 + }, + { + "advantage_max": 1.4827670902013779, + "advantage_mean": -4.967053213178474e-09, + "advantage_min": -1.1617141589522362, + "advantage_std": 0.9997115060687065, + "completion_length": 1583.0000457763672, + "epoch": 0.37942857142857145, + "grad_norm": 0.36066368222236633, + "kl": 0.03369140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0013, + "reward": 0.3906476739794016, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3906476739794016, + "reward_after_std": 0.47099397890269756, + "reward_before_mean": 0.5055399723351002, + "reward_before_std": 0.46263338066637516, + "reward_change_max": 0.0, + "reward_change_mean": -0.11489230440929532, + "reward_change_min": -0.1840682066977024, + "reward_change_std": 0.06766448728740215, + "reward_std": 0.4709939956665039, + "rewards/cosine_scaled_reward": -0.12223003013059497, + "rewards/format_reward": 0.75, + "step": 332 + }, + { + "advantage_max": 1.4400066137313843, + "advantage_mean": -2.4835271617007493e-09, + "advantage_min": -1.3157860115170479, + "advantage_std": 0.9998525753617287, + "completion_length": 1373.9375457763672, + "epoch": 0.38057142857142856, + "grad_norm": 0.4405366778373718, + "kl": 0.037105560302734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0015, + "reward": 0.878742154687643, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.878742154687643, + "reward_after_std": 0.8494612164795399, + "reward_before_mean": 1.027712881565094, + "reward_before_std": 0.8498156182467937, + "reward_change_max": 0.0, + "reward_change_mean": -0.14897070918232203, + "reward_change_min": -0.2575899437069893, + "reward_change_std": 0.09805800672620535, + "reward_std": 0.8494612462818623, + "rewards/cosine_scaled_reward": 0.06593976262956858, + "rewards/format_reward": 0.8958333507180214, + "step": 333 + }, + { + "advantage_max": 1.479415774345398, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -1.1129272356629372, + "advantage_std": 0.999847337603569, + "completion_length": 2343.5833892822266, + "epoch": 0.38171428571428573, + "grad_norm": 0.7468224763870239, + "kl": 0.11907958984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0048, + "reward": 0.07790927402675152, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07790927402675152, + "reward_after_std": 0.8249085582792759, + "reward_before_mean": 0.15207926771836355, + "reward_before_std": 0.8387485817074776, + "reward_change_max": 0.00037413090467453003, + "reward_change_mean": -0.07416999898850918, + "reward_change_min": -0.16692013293504715, + "reward_change_std": 0.06690044444985688, + "reward_std": 0.8249085918068886, + "rewards/cosine_scaled_reward": -0.18437703466042876, + "rewards/format_reward": 0.5208333488553762, + "step": 334 + }, + { + "advantage_max": 1.5890333950519562, + "advantage_mean": -6.581346334577631e-08, + "advantage_min": -1.1691535487771034, + "advantage_std": 0.9998458698391914, + "completion_length": 1481.020866394043, + "epoch": 0.38285714285714284, + "grad_norm": 0.3421805202960968, + "kl": 0.036113739013671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0014, + "reward": 0.7999673548620194, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7999673548620194, + "reward_after_std": 0.7872883416712284, + "reward_before_mean": 0.9402303099632263, + "reward_before_std": 0.7693048864603043, + "reward_change_max": 0.00010892003774642944, + "reward_change_mean": -0.1402629679068923, + "reward_change_min": -0.21821013279259205, + "reward_change_std": 0.08301450358703732, + "reward_std": 0.7872883789241314, + "rewards/cosine_scaled_reward": 0.043031807988882065, + "rewards/format_reward": 0.8541666772216558, + "step": 335 + }, + { + "advantage_max": 1.5050934553146362, + "advantage_mean": -1.8936892387522164e-08, + "advantage_min": -1.2547463476657867, + "advantage_std": 0.999822124838829, + "completion_length": 1533.645896911621, + "epoch": 0.384, + "grad_norm": 0.4388027489185333, + "kl": 0.04041290283203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0016, + "reward": 0.6022516712546349, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6022516712546349, + "reward_after_std": 0.6543654501438141, + "reward_before_mean": 0.7301745153963566, + "reward_before_std": 0.6454097218811512, + "reward_change_max": 0.000157281756401062, + "reward_change_mean": -0.12792286463081837, + "reward_change_min": -0.20177920907735825, + "reward_change_std": 0.07864083210006356, + "reward_std": 0.6543654501438141, + "rewards/cosine_scaled_reward": 0.03175393491983414, + "rewards/format_reward": 0.6666666772216558, + "step": 336 + }, + { + "advantage_max": 1.5357903391122818, + "advantage_mean": -3.97364304793868e-08, + "advantage_min": -1.2102145925164223, + "advantage_std": 0.9998451396822929, + "completion_length": 1482.979232788086, + "epoch": 0.3851428571428571, + "grad_norm": 0.6808515787124634, + "kl": 0.035221099853515625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0014, + "reward": 0.8413368645124137, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8413368645124137, + "reward_after_std": 0.8354975320398808, + "reward_before_mean": 0.9865454584360123, + "reward_before_std": 0.8282735012471676, + "reward_change_max": 0.0, + "reward_change_mean": -0.1452085990458727, + "reward_change_min": -0.25506412237882614, + "reward_change_std": 0.09339386876672506, + "reward_std": 0.8354975394904613, + "rewards/cosine_scaled_reward": 0.0349393846699968, + "rewards/format_reward": 0.9166666865348816, + "step": 337 + }, + { + "advantage_max": 1.5026657208800316, + "advantage_mean": -5.4637592117323663e-08, + "advantage_min": -1.128111731261015, + "advantage_std": 0.999842643737793, + "completion_length": 1445.9792251586914, + "epoch": 0.3862857142857143, + "grad_norm": 0.4162408709526062, + "kl": 0.04589080810546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0018, + "reward": 0.8082408686168492, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8082408686168492, + "reward_after_std": 0.8124595545232296, + "reward_before_mean": 0.9513399479910731, + "reward_before_std": 0.8106804341077805, + "reward_change_max": 0.0, + "reward_change_mean": -0.14309907238930464, + "reward_change_min": -0.23549943324178457, + "reward_change_std": 0.09040046157315373, + "reward_std": 0.8124595619738102, + "rewards/cosine_scaled_reward": 0.04858662304468453, + "rewards/format_reward": 0.8541666772216558, + "step": 338 + }, + { + "advantage_max": 1.317950114607811, + "advantage_mean": 8.381903254806033e-09, + "advantage_min": -1.4605398029088974, + "advantage_std": 0.9998449608683586, + "completion_length": 1954.3125457763672, + "epoch": 0.38742857142857146, + "grad_norm": 1.3577280044555664, + "kl": 0.08642578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0035, + "reward": 0.30176051147282124, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30176051147282124, + "reward_after_std": 0.6594284176826477, + "reward_before_mean": 0.4035015068948269, + "reward_before_std": 0.674022451043129, + "reward_change_max": 0.0011976435780525208, + "reward_change_mean": -0.10174098331481218, + "reward_change_min": -0.18010628037154675, + "reward_change_std": 0.0762557522393763, + "reward_std": 0.6594284400343895, + "rewards/cosine_scaled_reward": -0.110749252140522, + "rewards/format_reward": 0.6250000167638063, + "step": 339 + }, + { + "advantage_max": 1.4834815636277199, + "advantage_mean": -4.594524882772788e-08, + "advantage_min": -1.2522487416863441, + "advantage_std": 0.9998084381222725, + "completion_length": 1579.2500381469727, + "epoch": 0.38857142857142857, + "grad_norm": 0.33722105622291565, + "kl": 0.04709053039550781, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0019, + "reward": 0.6650699935853481, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6650699935853481, + "reward_after_std": 0.6854303106665611, + "reward_before_mean": 0.7988633252680302, + "reward_before_std": 0.6824867390096188, + "reward_change_max": 8.215010166168213e-05, + "reward_change_mean": -0.133793359156698, + "reward_change_min": -0.21632769331336021, + "reward_change_std": 0.08351973094977438, + "reward_std": 0.6854303181171417, + "rewards/cosine_scaled_reward": -0.017234998289495707, + "rewards/format_reward": 0.8333333488553762, + "step": 340 + }, + { + "advantage_max": 1.689264178276062, + "advantage_mean": -2.7877590941249863e-07, + "advantage_min": -1.024775207042694, + "advantage_std": 0.9997997581958771, + "completion_length": 1468.0417022705078, + "epoch": 0.38971428571428574, + "grad_norm": 0.68641597032547, + "kl": 0.0506744384765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.002, + "reward": 0.6998622994869947, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6998622994869947, + "reward_after_std": 0.736074797809124, + "reward_before_mean": 0.8322503371164203, + "reward_before_std": 0.70997529104352, + "reward_change_max": 0.0, + "reward_change_mean": -0.13238807581365108, + "reward_change_min": -0.21726097911596298, + "reward_change_std": 0.08524594688788056, + "reward_std": 0.7360748127102852, + "rewards/cosine_scaled_reward": 0.04112516465829685, + "rewards/format_reward": 0.7500000037252903, + "step": 341 + }, + { + "advantage_max": 1.5366226136684418, + "advantage_mean": -3.911554946611773e-08, + "advantage_min": -1.165475644171238, + "advantage_std": 0.9998475164175034, + "completion_length": 1879.416748046875, + "epoch": 0.39085714285714285, + "grad_norm": 0.9024969339370728, + "kl": 0.09264373779296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0037, + "reward": 0.7040500938892365, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7040500938892365, + "reward_after_std": 0.9079513140022755, + "reward_before_mean": 0.8336313590407372, + "reward_before_std": 0.9082721434533596, + "reward_change_max": 0.0, + "reward_change_mean": -0.12958126701414585, + "reward_change_min": -0.227171890437603, + "reward_change_std": 0.08963154442608356, + "reward_std": 0.907951358705759, + "rewards/cosine_scaled_reward": 0.010565669741481543, + "rewards/format_reward": 0.8125000186264515, + "step": 342 + }, + { + "advantage_max": 1.412947177886963, + "advantage_mean": -2.017865641246175e-08, + "advantage_min": -1.2453822493553162, + "advantage_std": 0.9998679906129837, + "completion_length": 1775.9792022705078, + "epoch": 0.392, + "grad_norm": 0.6765278577804565, + "kl": 0.08543014526367188, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0034, + "reward": 0.6520730927586555, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6520730927586555, + "reward_after_std": 0.9671961665153503, + "reward_before_mean": 0.776901314035058, + "reward_before_std": 0.9851310290396214, + "reward_change_max": 0.0001538395881652832, + "reward_change_mean": -0.12482821242883801, + "reward_change_min": -0.24192855693399906, + "reward_change_std": 0.09555056085810065, + "reward_std": 0.9671961963176727, + "rewards/cosine_scaled_reward": 0.0342839767690748, + "rewards/format_reward": 0.708333345130086, + "step": 343 + }, + { + "advantage_max": 1.5145802944898605, + "advantage_mean": -1.2728075615697776e-07, + "advantage_min": -1.318326160311699, + "advantage_std": 0.9997060596942902, + "completion_length": 1550.6875534057617, + "epoch": 0.3931428571428571, + "grad_norm": 0.8243290781974792, + "kl": 0.084259033203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0034, + "reward": 1.1622946355491877, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.1622946355491877, + "reward_after_std": 0.5587964607402682, + "reward_before_mean": 1.3463120497763157, + "reward_before_std": 0.5346446477342397, + "reward_change_max": 0.0005112588405609131, + "reward_change_mean": -0.1840174519456923, + "reward_change_min": -0.2756340950727463, + "reward_change_std": 0.10637355549260974, + "reward_std": 0.5587964681908488, + "rewards/cosine_scaled_reward": 0.24607268278487027, + "rewards/format_reward": 0.8541666753590107, + "step": 344 + }, + { + "advantage_max": 1.540076158940792, + "advantage_mean": -5.215406517766752e-08, + "advantage_min": -1.1613540425896645, + "advantage_std": 0.9998455420136452, + "completion_length": 1671.3750305175781, + "epoch": 0.3942857142857143, + "grad_norm": 1.7589654922485352, + "kl": 0.1017913818359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0041, + "reward": 0.5623795920982957, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5623795920982957, + "reward_after_std": 0.8140008598566055, + "reward_before_mean": 0.6826059645973146, + "reward_before_std": 0.8165422268211842, + "reward_change_max": 0.0010628923773765564, + "reward_change_mean": -0.12022638134658337, + "reward_change_min": -0.21904029790312052, + "reward_change_std": 0.08774518640711904, + "reward_std": 0.8140008747577667, + "rewards/cosine_scaled_reward": 0.04963629972189665, + "rewards/format_reward": 0.5833333414047956, + "step": 345 + }, + { + "advantage_max": 1.645007699728012, + "advantage_mean": -5.0912302596017867e-08, + "advantage_min": -1.0988484546542168, + "advantage_std": 0.9998019188642502, + "completion_length": 1430.3959045410156, + "epoch": 0.3954285714285714, + "grad_norm": 0.32801398634910583, + "kl": 0.046825408935546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0019, + "reward": 0.5887374058365822, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5887374058365822, + "reward_after_std": 0.7088357359170914, + "reward_before_mean": 0.7109762877225876, + "reward_before_std": 0.6876601781696081, + "reward_change_max": 0.0, + "reward_change_mean": -0.12223889189772308, + "reward_change_min": -0.17916064709424973, + "reward_change_std": 0.06939612235873938, + "reward_std": 0.7088357619941235, + "rewards/cosine_scaled_reward": -0.09242854062176775, + "rewards/format_reward": 0.895833333954215, + "step": 346 + }, + { + "advantage_max": 1.5532027930021286, + "advantage_mean": -1.179675318541129e-08, + "advantage_min": -1.0655813068151474, + "advantage_std": 0.9997884854674339, + "completion_length": 1528.8750457763672, + "epoch": 0.3965714285714286, + "grad_norm": 0.578608512878418, + "kl": 0.04645538330078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0019, + "reward": 0.30343328788876534, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.30343328788876534, + "reward_after_std": 0.5521973073482513, + "reward_before_mean": 0.4068300393410027, + "reward_before_std": 0.5445283949375153, + "reward_change_max": 0.0, + "reward_change_mean": -0.10339675471186638, + "reward_change_min": -0.1844671368598938, + "reward_change_std": 0.06681121652945876, + "reward_std": 0.5521973147988319, + "rewards/cosine_scaled_reward": -0.22366831824183464, + "rewards/format_reward": 0.8541666865348816, + "step": 347 + }, + { + "advantage_max": 1.4550811648368835, + "advantage_mean": -3.7563345905988754e-08, + "advantage_min": -1.2732831984758377, + "advantage_std": 0.9997940734028816, + "completion_length": 1711.5000228881836, + "epoch": 0.3977142857142857, + "grad_norm": 0.9890735149383545, + "kl": 0.12113571166992188, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0048, + "reward": 0.6786003398301546, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.6786003398301546, + "reward_after_std": 0.6397302523255348, + "reward_before_mean": 0.8155205333605409, + "reward_before_std": 0.6368211433291435, + "reward_change_max": 0.00017334520816802979, + "reward_change_mean": -0.13692022114992142, + "reward_change_min": -0.22533964831382036, + "reward_change_std": 0.0878034750930965, + "reward_std": 0.6397302746772766, + "rewards/cosine_scaled_reward": 0.011926926672458649, + "rewards/format_reward": 0.791666679084301, + "step": 348 + }, + { + "advantage_max": 1.5810499042272568, + "advantage_mean": -3.973643136756522e-08, + "advantage_min": -1.0555768236517906, + "advantage_std": 0.9997776970267296, + "completion_length": 1204.1667022705078, + "epoch": 0.39885714285714285, + "grad_norm": 0.6008383631706238, + "kl": 0.03905487060546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0016, + "reward": 0.6463835099712014, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6463835099712014, + "reward_after_std": 0.5708205699920654, + "reward_before_mean": 0.7810069844126701, + "reward_before_std": 0.5535554438829422, + "reward_change_max": 0.0, + "reward_change_mean": -0.13462348422035575, + "reward_change_min": -0.2155060712248087, + "reward_change_std": 0.07849409175105393, + "reward_std": 0.5708205848932266, + "rewards/cosine_scaled_reward": -0.0053298622369766235, + "rewards/format_reward": 0.7916666716337204, + "step": 349 + }, + { + "advantage_max": 1.4653938859701157, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -1.2062378972768784, + "advantage_std": 0.9998128265142441, + "completion_length": 917.1250305175781, + "epoch": 0.4, + "grad_norm": 0.537183403968811, + "kl": 0.012424468994140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0005, + "reward": 0.6380548775196075, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6380548775196075, + "reward_after_std": 0.6736834794282913, + "reward_before_mean": 0.7693752646446228, + "reward_before_std": 0.6650245450437069, + "reward_change_max": 0.0, + "reward_change_mean": -0.13132039550691843, + "reward_change_min": -0.22087561339139938, + "reward_change_std": 0.08057826245203614, + "reward_std": 0.6736834980547428, + "rewards/cosine_scaled_reward": -0.09447903372347355, + "rewards/format_reward": 0.9583333432674408, + "step": 350 + }, + { + "advantage_max": 1.5197671800851822, + "advantage_mean": -4.532436648219118e-08, + "advantage_min": -1.1577540412545204, + "advantage_std": 0.9998496472835541, + "completion_length": 1553.4792022705078, + "epoch": 0.40114285714285713, + "grad_norm": 0.8147112727165222, + "kl": 0.12945556640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0052, + "reward": 0.7770597245544195, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7770597245544195, + "reward_after_std": 0.7373732291162014, + "reward_before_mean": 0.9183787778019905, + "reward_before_std": 0.7273126542568207, + "reward_change_max": 0.0005416646599769592, + "reward_change_mean": -0.14131904486566782, + "reward_change_min": -0.23502979520708323, + "reward_change_std": 0.08941868646070361, + "reward_std": 0.737373273819685, + "rewards/cosine_scaled_reward": 0.06335602421313524, + "rewards/format_reward": 0.7916666828095913, + "step": 351 + }, + { + "advantage_max": 1.569848746061325, + "advantage_mean": -4.190951918836561e-09, + "advantage_min": -1.2060598954558372, + "advantage_std": 0.9997753575444221, + "completion_length": 1646.0834045410156, + "epoch": 0.4022857142857143, + "grad_norm": 0.8164442181587219, + "kl": 0.11374664306640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0046, + "reward": 0.5288133807480335, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5288133807480335, + "reward_after_std": 0.610162977129221, + "reward_before_mean": 0.6497844681143761, + "reward_before_std": 0.5933005921542645, + "reward_change_max": 0.0, + "reward_change_mean": -0.12097106548026204, + "reward_change_min": -0.1816613031551242, + "reward_change_std": 0.06907373643480241, + "reward_std": 0.6101629845798016, + "rewards/cosine_scaled_reward": -0.029274450847879052, + "rewards/format_reward": 0.708333333954215, + "step": 352 + }, + { + "advantage_max": 1.3838004171848297, + "advantage_mean": -4.718701124284408e-08, + "advantage_min": -1.350169561803341, + "advantage_std": 0.9998253807425499, + "completion_length": 1370.3750381469727, + "epoch": 0.4034285714285714, + "grad_norm": 1.0508337020874023, + "kl": 0.0648651123046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0026, + "reward": 0.8240749211981893, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8240749211981893, + "reward_after_std": 0.718726497143507, + "reward_before_mean": 0.9740468331146985, + "reward_before_std": 0.7258542701601982, + "reward_change_max": 0.0, + "reward_change_mean": -0.14997189305722713, + "reward_change_min": -0.24729277566075325, + "reward_change_std": 0.0964922783896327, + "reward_std": 0.7187265120446682, + "rewards/cosine_scaled_reward": 0.08077336475253105, + "rewards/format_reward": 0.8125000111758709, + "step": 353 + }, + { + "advantage_max": 1.486832708120346, + "advantage_mean": -1.9868215184182247e-08, + "advantage_min": -1.33545982837677, + "advantage_std": 0.9997362196445465, + "completion_length": 1081.6250305175781, + "epoch": 0.4045714285714286, + "grad_norm": 0.6051336526870728, + "kl": 0.052875518798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0021, + "reward": 0.933193551376462, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.933193551376462, + "reward_after_std": 0.5013358984142542, + "reward_before_mean": 1.097759174183011, + "reward_before_std": 0.48550001345574856, + "reward_change_max": 0.0, + "reward_change_mean": -0.16456563211977482, + "reward_change_min": -0.23568181600421667, + "reward_change_std": 0.09174512466415763, + "reward_std": 0.5013359021395445, + "rewards/cosine_scaled_reward": 0.06971290893852711, + "rewards/format_reward": 0.9583333358168602, + "step": 354 + }, + { + "advantage_max": 1.3949600085616112, + "advantage_mean": -4.842877388000488e-08, + "advantage_min": -1.2098092809319496, + "advantage_std": 0.9998485893011093, + "completion_length": 1034.3125190734863, + "epoch": 0.4057142857142857, + "grad_norm": 0.4866308271884918, + "kl": 0.037677764892578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0015, + "reward": 0.8702779617160559, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8702779617160559, + "reward_after_std": 0.8237923942506313, + "reward_before_mean": 1.020679783076048, + "reward_before_std": 0.8269104994833469, + "reward_change_max": 0.00039453059434890747, + "reward_change_mean": -0.1504018036648631, + "reward_change_min": -0.28018930554389954, + "reward_change_std": 0.10402650013566017, + "reward_std": 0.8237924389541149, + "rewards/cosine_scaled_reward": 0.08325653476640582, + "rewards/format_reward": 0.8541666716337204, + "step": 355 + }, + { + "advantage_max": 1.4376270696520805, + "advantage_mean": -2.2351742789972207e-08, + "advantage_min": -1.1074972301721573, + "advantage_std": 0.9998405128717422, + "completion_length": 1922.1875381469727, + "epoch": 0.40685714285714286, + "grad_norm": 0.8787837028503418, + "kl": 0.18633270263671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0075, + "reward": 0.5590815953910351, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5590815953910351, + "reward_after_std": 0.7449811734259129, + "reward_before_mean": 0.6816292963922024, + "reward_before_std": 0.7479454576969147, + "reward_change_max": 0.00029243528842926025, + "reward_change_mean": -0.12254770565778017, + "reward_change_min": -0.23000308498740196, + "reward_change_std": 0.08476777747273445, + "reward_std": 0.7449811846017838, + "rewards/cosine_scaled_reward": -0.08626868622377515, + "rewards/format_reward": 0.8541666865348816, + "step": 356 + }, + { + "advantage_max": 1.421733245253563, + "advantage_mean": -1.3038516599728212e-08, + "advantage_min": -1.3238706812262535, + "advantage_std": 0.9998385459184647, + "completion_length": 2226.3125610351562, + "epoch": 0.408, + "grad_norm": 2.134793519973755, + "kl": 0.2264862060546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0091, + "reward": 0.31191481556743383, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.31191481556743383, + "reward_after_std": 0.698531374335289, + "reward_before_mean": 0.4118203781545162, + "reward_before_std": 0.7032448500394821, + "reward_change_max": 0.0, + "reward_change_mean": -0.09990557003766298, + "reward_change_min": -0.17775661405175924, + "reward_change_std": 0.06979941623285413, + "reward_std": 0.6985313966870308, + "rewards/cosine_scaled_reward": -0.13783981930464506, + "rewards/format_reward": 0.6875000186264515, + "step": 357 + }, + { + "advantage_max": 1.2833376079797745, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -1.2507511153817177, + "advantage_std": 0.9998571276664734, + "completion_length": 1489.083351135254, + "epoch": 0.40914285714285714, + "grad_norm": 0.5106679201126099, + "kl": 0.06841659545898438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0027, + "reward": 1.0083662807010114, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0083662807010114, + "reward_after_std": 0.8467534109950066, + "reward_before_mean": 1.171830676496029, + "reward_before_std": 0.8582195043563843, + "reward_change_max": 0.0, + "reward_change_mean": -0.16346431523561478, + "reward_change_min": -0.27867283672094345, + "reward_change_std": 0.10701682418584824, + "reward_std": 0.846753440797329, + "rewards/cosine_scaled_reward": 0.16924862004816532, + "rewards/format_reward": 0.833333333954215, + "step": 358 + }, + { + "advantage_max": 1.3780936226248741, + "advantage_mean": -4.594524882772788e-08, + "advantage_min": -1.2478744611144066, + "advantage_std": 0.9997677430510521, + "completion_length": 1134.2292175292969, + "epoch": 0.4102857142857143, + "grad_norm": 0.6363142132759094, + "kl": 0.10742950439453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0043, + "reward": 0.6243336275219917, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6243336275219917, + "reward_after_std": 0.5924880225211382, + "reward_before_mean": 0.7577880509197712, + "reward_before_std": 0.590686340816319, + "reward_change_max": 0.0007399767637252808, + "reward_change_mean": -0.13345444854348898, + "reward_change_min": -0.21509629674255848, + "reward_change_std": 0.08265585312619805, + "reward_std": 0.5924880467355251, + "rewards/cosine_scaled_reward": -0.0690226498991251, + "rewards/format_reward": 0.8958333432674408, + "step": 359 + }, + { + "advantage_max": 1.388232484459877, + "advantage_mean": -6.519258266557415e-08, + "advantage_min": -1.3220400288701057, + "advantage_std": 0.9998097345232964, + "completion_length": 1587.1250610351562, + "epoch": 0.4114285714285714, + "grad_norm": 0.7408303022384644, + "kl": 0.2151947021484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0086, + "reward": 0.841188732534647, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.841188732534647, + "reward_after_std": 0.7126155123114586, + "reward_before_mean": 0.992447454482317, + "reward_before_std": 0.7154055722057819, + "reward_change_max": 0.0, + "reward_change_mean": -0.15125871915370226, + "reward_change_min": -0.24961007386446, + "reward_change_std": 0.09662879165261984, + "reward_std": 0.7126155272126198, + "rewards/cosine_scaled_reward": 0.04830702394247055, + "rewards/format_reward": 0.8958333507180214, + "step": 360 + }, + { + "advantage_max": 1.5855596661567688, + "advantage_mean": 4.2840838099245104e-08, + "advantage_min": -1.1460353285074234, + "advantage_std": 0.9997694715857506, + "completion_length": 1471.395866394043, + "epoch": 0.4125714285714286, + "grad_norm": 1.182603359222412, + "kl": 0.10492706298828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0042, + "reward": 0.470302056055516, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.470302056055516, + "reward_after_std": 0.6256013102829456, + "reward_before_mean": 0.5854224106296897, + "reward_before_std": 0.6100865183398128, + "reward_change_max": 0.0, + "reward_change_mean": -0.11512034106999636, + "reward_change_min": -0.19347173534333706, + "reward_change_std": 0.07023542281240225, + "reward_std": 0.6256013140082359, + "rewards/cosine_scaled_reward": -0.13437213900033385, + "rewards/format_reward": 0.854166679084301, + "step": 361 + }, + { + "advantage_max": 1.4198757410049438, + "advantage_mean": -6.953875464343895e-08, + "advantage_min": -1.3393580988049507, + "advantage_std": 0.9997484311461449, + "completion_length": 925.854190826416, + "epoch": 0.4137142857142857, + "grad_norm": 0.6270393133163452, + "kl": 0.06605148315429688, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0026, + "reward": 0.8872093297541142, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8872093297541142, + "reward_after_std": 0.4866991452872753, + "reward_before_mean": 1.0480562169104815, + "reward_before_std": 0.46841985266655684, + "reward_change_max": 0.0, + "reward_change_mean": -0.1608469020575285, + "reward_change_min": -0.2332429401576519, + "reward_change_std": 0.09089976316317916, + "reward_std": 0.4866991676390171, + "rewards/cosine_scaled_reward": 0.04486143495887518, + "rewards/format_reward": 0.9583333358168602, + "step": 362 + }, + { + "advantage_max": 1.4218028262257576, + "advantage_mean": 8.19563861220729e-08, + "advantage_min": -1.1774882376194, + "advantage_std": 0.9997980892658234, + "completion_length": 1123.0416984558105, + "epoch": 0.41485714285714287, + "grad_norm": 0.6843811869621277, + "kl": 0.14236831665039062, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0057, + "reward": 1.0501264370977879, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.0501264370977879, + "reward_after_std": 0.6898505575954914, + "reward_before_mean": 1.2204857654869556, + "reward_before_std": 0.6796518871560693, + "reward_change_max": 0.0, + "reward_change_mean": -0.17035925202071667, + "reward_change_min": -0.26173054426908493, + "reward_change_std": 0.10411297017708421, + "reward_std": 0.689850565046072, + "rewards/cosine_scaled_reward": 0.19357617758214474, + "rewards/format_reward": 0.8333333358168602, + "step": 363 + }, + { + "advantage_max": 1.5182277262210846, + "advantage_mean": -2.5456150964942026e-08, + "advantage_min": -1.3673394322395325, + "advantage_std": 0.9997450858354568, + "completion_length": 1234.833381652832, + "epoch": 0.416, + "grad_norm": 1.0001448392868042, + "kl": 0.08144378662109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0033, + "reward": 0.3523000096902251, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3523000096902251, + "reward_after_std": 0.48937233351171017, + "reward_before_mean": 0.46050266548991203, + "reward_before_std": 0.47428031265735626, + "reward_change_max": 0.0, + "reward_change_mean": -0.10820264089852571, + "reward_change_min": -0.1646072268486023, + "reward_change_std": 0.062361706513911486, + "reward_std": 0.4893723502755165, + "rewards/cosine_scaled_reward": -0.217665349598974, + "rewards/format_reward": 0.8958333432674408, + "step": 364 + }, + { + "advantage_max": 1.3571320548653603, + "advantage_mean": 4.65661231796588e-09, + "advantage_min": -1.4831850975751877, + "advantage_std": 0.9996974319219589, + "completion_length": 2165.5833740234375, + "epoch": 0.41714285714285715, + "grad_norm": 1.301094889640808, + "kl": 0.2874908447265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0115, + "reward": 0.5533080464228988, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5533080464228988, + "reward_after_std": 0.5401174901053309, + "reward_before_mean": 0.6822809707373381, + "reward_before_std": 0.5410689422860742, + "reward_change_max": 0.00028949230909347534, + "reward_change_mean": -0.12897292617708445, + "reward_change_min": -0.20345229096710682, + "reward_change_std": 0.08068801555782557, + "reward_std": 0.5401174938306212, + "rewards/cosine_scaled_reward": 0.03905716352164745, + "rewards/format_reward": 0.6041666734963655, + "step": 365 + }, + { + "advantage_max": 1.5208699703216553, + "advantage_mean": -4.8428774879205605e-08, + "advantage_min": -1.1703914254903793, + "advantage_std": 0.9997816234827042, + "completion_length": 1370.395881652832, + "epoch": 0.41828571428571426, + "grad_norm": 0.7431749105453491, + "kl": 0.10540771484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0042, + "reward": 1.0022234451025724, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0022234451025724, + "reward_after_std": 0.6891144718974829, + "reward_before_mean": 1.1668678969144821, + "reward_before_std": 0.6747738681733608, + "reward_change_max": 0.0, + "reward_change_mean": -0.1646444108337164, + "reward_change_min": -0.25490029994398355, + "reward_change_std": 0.09806211944669485, + "reward_std": 0.6891144774854183, + "rewards/cosine_scaled_reward": 0.12510060099884868, + "rewards/format_reward": 0.9166666679084301, + "step": 366 + }, + { + "advantage_max": 1.5256575047969818, + "advantage_mean": -4.718701029915451e-08, + "advantage_min": -1.1822673827409744, + "advantage_std": 0.9997625052928925, + "completion_length": 1623.8958892822266, + "epoch": 0.41942857142857143, + "grad_norm": 0.8612035512924194, + "kl": 0.1331787109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0053, + "reward": 0.6489149909466505, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6489149909466505, + "reward_after_std": 0.704158004373312, + "reward_before_mean": 0.7802309468388557, + "reward_before_std": 0.6997333746403456, + "reward_change_max": 0.0003588870167732239, + "reward_change_mean": -0.13131593633443117, + "reward_change_min": -0.22332393191754818, + "reward_change_std": 0.08702679723501205, + "reward_std": 0.7041580304503441, + "rewards/cosine_scaled_reward": -0.057801210321485996, + "rewards/format_reward": 0.8958333432674408, + "step": 367 + }, + { + "advantage_max": 1.5013212859630585, + "advantage_mean": -1.8626449271863521e-09, + "advantage_min": -1.1598549410700798, + "advantage_std": 0.9998278617858887, + "completion_length": 2047.5625534057617, + "epoch": 0.4205714285714286, + "grad_norm": 1.596835970878601, + "kl": 0.2228240966796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.0089, + "reward": 0.2928560241125524, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2928560241125524, + "reward_after_std": 0.7190740033984184, + "reward_before_mean": 0.3913656147196889, + "reward_before_std": 0.7281620763242245, + "reward_change_max": 0.0003599002957344055, + "reward_change_mean": -0.09850956918671727, + "reward_change_min": -0.1961950259283185, + "reward_change_std": 0.07544383313506842, + "reward_std": 0.7190740182995796, + "rewards/cosine_scaled_reward": -0.12723387405276299, + "rewards/format_reward": 0.6458333414047956, + "step": 368 + }, + { + "advantage_max": 1.5033908188343048, + "advantage_mean": -3.042320473323201e-08, + "advantage_min": -1.068381130695343, + "advantage_std": 0.9998282641172409, + "completion_length": 1654.1875534057617, + "epoch": 0.4217142857142857, + "grad_norm": 1.0307308435440063, + "kl": 0.21329116821289062, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0085, + "reward": 0.639374952763319, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.639374952763319, + "reward_after_std": 0.68315190076828, + "reward_before_mean": 0.7693626917898655, + "reward_before_std": 0.6706436909735203, + "reward_change_max": 0.0, + "reward_change_mean": -0.12998773623257875, + "reward_change_min": -0.2221611039713025, + "reward_change_std": 0.07870964519679546, + "reward_std": 0.6831519119441509, + "rewards/cosine_scaled_reward": -0.03198532899841666, + "rewards/format_reward": 0.8333333469927311, + "step": 369 + }, + { + "advantage_max": 1.6313235014677048, + "advantage_mean": 4.9049655004296255e-08, + "advantage_min": -1.0117665193974972, + "advantage_std": 0.9997687488794327, + "completion_length": 2152.3750610351562, + "epoch": 0.4228571428571429, + "grad_norm": 0.996197521686554, + "kl": 0.40521240234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0162, + "reward": 0.4512446033768356, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4512446033768356, + "reward_after_std": 0.591382460668683, + "reward_before_mean": 0.5645157024264336, + "reward_before_std": 0.5668165199458599, + "reward_change_max": 0.0011430829763412476, + "reward_change_mean": -0.11327109299600124, + "reward_change_min": -0.18050487712025642, + "reward_change_std": 0.07153899781405926, + "reward_std": 0.5913824774324894, + "rewards/cosine_scaled_reward": -0.030242161825299263, + "rewards/format_reward": 0.6250000055879354, + "step": 370 + }, + { + "advantage_max": 1.6017784476280212, + "advantage_mean": -2.607703397661254e-08, + "advantage_min": -0.9585048705339432, + "advantage_std": 0.999807633459568, + "completion_length": 1053.56254196167, + "epoch": 0.424, + "grad_norm": 1.5119962692260742, + "kl": 0.134368896484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0054, + "reward": 0.8266900572925806, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8266900572925806, + "reward_after_std": 0.6163216028362513, + "reward_before_mean": 0.975990392267704, + "reward_before_std": 0.5858824178576469, + "reward_change_max": 0.0008676201105117798, + "reward_change_mean": -0.14930032240226865, + "reward_change_min": -0.23938406724482775, + "reward_change_std": 0.09261591965332627, + "reward_std": 0.6163216270506382, + "rewards/cosine_scaled_reward": 0.11299517937004566, + "rewards/format_reward": 0.7500000111758709, + "step": 371 + }, + { + "advantage_max": 1.4052574709057808, + "advantage_mean": 3.6011141624214815e-08, + "advantage_min": -1.2968919053673744, + "advantage_std": 0.9998442903161049, + "completion_length": 1930.7292175292969, + "epoch": 0.42514285714285716, + "grad_norm": 1.3718596696853638, + "kl": 0.19745254516601562, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0079, + "reward": 0.7759344661608338, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7759344661608338, + "reward_after_std": 0.8030894659459591, + "reward_before_mean": 0.9179776012897491, + "reward_before_std": 0.8054710291326046, + "reward_change_max": 0.00045900046825408936, + "reward_change_mean": -0.1420431211590767, + "reward_change_min": -0.24235581792891026, + "reward_change_std": 0.09905825974419713, + "reward_std": 0.8030894808471203, + "rewards/cosine_scaled_reward": 0.05273880437016487, + "rewards/format_reward": 0.8125000223517418, + "step": 372 + }, + { + "advantage_max": 1.5664848685264587, + "advantage_mean": -6.20881684954e-09, + "advantage_min": -1.2124197706580162, + "advantage_std": 0.9998014271259308, + "completion_length": 1053.2500381469727, + "epoch": 0.42628571428571427, + "grad_norm": 0.9296903610229492, + "kl": 0.08166122436523438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0033, + "reward": 0.5102668823674321, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5102668823674321, + "reward_after_std": 0.7223041206598282, + "reward_before_mean": 0.6259184032678604, + "reward_before_std": 0.7110856045037508, + "reward_change_max": 0.0005139410495758057, + "reward_change_mean": -0.11565148271620274, + "reward_change_min": -0.18873510602861643, + "reward_change_std": 0.07335183955729008, + "reward_std": 0.72230414301157, + "rewards/cosine_scaled_reward": -0.14537415117956698, + "rewards/format_reward": 0.916666679084301, + "step": 373 + }, + { + "advantage_max": 1.3690541833639145, + "advantage_mean": -4.035731082652205e-09, + "advantage_min": -1.3715066015720367, + "advantage_std": 0.9997926205396652, + "completion_length": 1368.4167251586914, + "epoch": 0.42742857142857144, + "grad_norm": 0.9108665585517883, + "kl": 0.12201690673828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0049, + "reward": 0.7745124213397503, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7745124213397503, + "reward_after_std": 0.5470051765441895, + "reward_before_mean": 0.9227103255689144, + "reward_before_std": 0.5317458175122738, + "reward_change_max": 0.0, + "reward_change_mean": -0.14819789212197065, + "reward_change_min": -0.22565262019634247, + "reward_change_std": 0.08558421535417438, + "reward_std": 0.547005195170641, + "rewards/cosine_scaled_reward": 0.023855158127844334, + "rewards/format_reward": 0.8750000223517418, + "step": 374 + }, + { + "advantage_max": 1.5265842229127884, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -1.228366658091545, + "advantage_std": 0.9998601377010345, + "completion_length": 1652.5625686645508, + "epoch": 0.42857142857142855, + "grad_norm": 0.9376990795135498, + "kl": 0.26873016357421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0107, + "reward": 0.812188274692744, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.812188274692744, + "reward_after_std": 0.9057653173804283, + "reward_before_mean": 0.9515043869614601, + "reward_before_std": 0.9038424082100391, + "reward_change_max": 0.0001263841986656189, + "reward_change_mean": -0.13931614579632878, + "reward_change_min": -0.24343886598944664, + "reward_change_std": 0.09589880565181375, + "reward_std": 0.9057653844356537, + "rewards/cosine_scaled_reward": 0.0799188744276762, + "rewards/format_reward": 0.7916666753590107, + "step": 375 + }, + { + "advantage_max": 1.5179670602083206, + "advantage_mean": -2.421438682898014e-08, + "advantage_min": -1.1654746755957603, + "advantage_std": 0.9998091086745262, + "completion_length": 1489.7292404174805, + "epoch": 0.4297142857142857, + "grad_norm": 1.5551291704177856, + "kl": 0.2132110595703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0085, + "reward": 0.5052896784618497, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5052896784618497, + "reward_after_std": 0.6814988665282726, + "reward_before_mean": 0.6225324124097824, + "reward_before_std": 0.6745848506689072, + "reward_change_max": 0.0, + "reward_change_mean": -0.1172427274286747, + "reward_change_min": -0.19756066799163818, + "reward_change_std": 0.07500209799036384, + "reward_std": 0.6814988739788532, + "rewards/cosine_scaled_reward": -0.11581713845953345, + "rewards/format_reward": 0.854166679084301, + "step": 376 + }, + { + "advantage_max": 1.4136180728673935, + "advantage_mean": -2.3593505260599557e-08, + "advantage_min": -1.2113258317112923, + "advantage_std": 0.999827466905117, + "completion_length": 1939.833381652832, + "epoch": 0.4308571428571429, + "grad_norm": 0.8605461120605469, + "kl": 0.288116455078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0115, + "reward": 0.47748872451484203, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.47748872451484203, + "reward_after_std": 0.7632182575762272, + "reward_before_mean": 0.5919480286538601, + "reward_before_std": 0.7724727466702461, + "reward_change_max": 0.0, + "reward_change_mean": -0.11445930134505033, + "reward_change_min": -0.2100103199481964, + "reward_change_std": 0.08319698181003332, + "reward_std": 0.763218279927969, + "rewards/cosine_scaled_reward": -0.11027600057423115, + "rewards/format_reward": 0.8125000149011612, + "step": 377 + }, + { + "advantage_max": 1.4481362104415894, + "advantage_mean": -6.581346156941947e-08, + "advantage_min": -1.1869488134980202, + "advantage_std": 0.9998489990830421, + "completion_length": 1121.0208950042725, + "epoch": 0.432, + "grad_norm": 0.6732988953590393, + "kl": 0.09349822998046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0037, + "reward": 0.9872011113911867, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9872011113911867, + "reward_after_std": 0.7485641278326511, + "reward_before_mean": 1.1486159078776836, + "reward_before_std": 0.7412164583802223, + "reward_change_max": 0.0, + "reward_change_mean": -0.16141479928046465, + "reward_change_min": -0.25539009273052216, + "reward_change_std": 0.09596162987872958, + "reward_std": 0.7485641501843929, + "rewards/cosine_scaled_reward": 0.0951412720605731, + "rewards/format_reward": 0.9583333358168602, + "step": 378 + }, + { + "advantage_max": 1.4938328862190247, + "advantage_mean": -2.980232349791834e-08, + "advantage_min": -1.1671525463461876, + "advantage_std": 0.9998154565691948, + "completion_length": 1591.1458740234375, + "epoch": 0.43314285714285716, + "grad_norm": 1.0055720806121826, + "kl": 0.21600341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0086, + "reward": 0.607740237377584, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.607740237377584, + "reward_after_std": 0.7770608048886061, + "reward_before_mean": 0.7329375743865967, + "reward_before_std": 0.775208655744791, + "reward_change_max": 0.0, + "reward_change_mean": -0.12519735004752874, + "reward_change_min": -0.21551300026476383, + "reward_change_std": 0.0834552114829421, + "reward_std": 0.7770608123391867, + "rewards/cosine_scaled_reward": -0.07103121210820973, + "rewards/format_reward": 0.8750000074505806, + "step": 379 + }, + { + "advantage_max": 1.4799382463097572, + "advantage_mean": -3.414849480964932e-08, + "advantage_min": -1.0861377716064453, + "advantage_std": 0.9998506456613541, + "completion_length": 1408.06254196167, + "epoch": 0.4342857142857143, + "grad_norm": 1.4439901113510132, + "kl": 0.22924041748046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0092, + "reward": 0.6337316166609526, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6337316166609526, + "reward_after_std": 0.8558752126991749, + "reward_before_mean": 0.7605880023911595, + "reward_before_std": 0.8635857813060284, + "reward_change_max": 0.0, + "reward_change_mean": -0.12685640575364232, + "reward_change_min": -0.2282596305012703, + "reward_change_std": 0.0902928994037211, + "reward_std": 0.8558752313256264, + "rewards/cosine_scaled_reward": -0.015539344982244074, + "rewards/format_reward": 0.7916666753590107, + "step": 380 + }, + { + "advantage_max": 1.7427802830934525, + "advantage_mean": 5.5879355587151736e-09, + "advantage_min": -1.088472604751587, + "advantage_std": 0.9997963383793831, + "completion_length": 1819.583351135254, + "epoch": 0.43542857142857144, + "grad_norm": 1.249358892440796, + "kl": 0.3687019348144531, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0147, + "reward": 0.2271743305027485, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2271743305027485, + "reward_after_std": 0.5816497430205345, + "reward_before_mean": 0.31924774509388953, + "reward_before_std": 0.561051607131958, + "reward_change_max": 0.0, + "reward_change_mean": -0.09207342192530632, + "reward_change_min": -0.14389674551784992, + "reward_change_std": 0.05652611888945103, + "reward_std": 0.5816497728228569, + "rewards/cosine_scaled_reward": -0.2049594670534134, + "rewards/format_reward": 0.7291666809469461, + "step": 381 + }, + { + "advantage_max": 1.4776546210050583, + "advantage_mean": -3.197540932031728e-08, + "advantage_min": -1.2696739807724953, + "advantage_std": 0.9998131394386292, + "completion_length": 1192.9583587646484, + "epoch": 0.43657142857142855, + "grad_norm": 1.0161633491516113, + "kl": 0.13286590576171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0053, + "reward": 0.49811657425016165, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.49811657425016165, + "reward_after_std": 0.7143443040549755, + "reward_before_mean": 0.6148605179041624, + "reward_before_std": 0.7126270364969969, + "reward_change_max": 2.530217170715332e-05, + "reward_change_mean": -0.1167439166456461, + "reward_change_min": -0.2098684161901474, + "reward_change_std": 0.07956612063571811, + "reward_std": 0.7143443375825882, + "rewards/cosine_scaled_reward": -0.14048643223941326, + "rewards/format_reward": 0.895833358168602, + "step": 382 + }, + { + "advantage_max": 1.3780869543552399, + "advantage_mean": -1.2107194136135035e-08, + "advantage_min": -1.2136836722493172, + "advantage_std": 0.9998874962329865, + "completion_length": 1577.4375610351562, + "epoch": 0.4377142857142857, + "grad_norm": 1.369926929473877, + "kl": 0.3840484619140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0154, + "reward": 0.6665458576753736, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6665458576753736, + "reward_after_std": 1.086703211069107, + "reward_before_mean": 0.7915002275258303, + "reward_before_std": 1.1197119541466236, + "reward_change_max": 0.00027097761631011963, + "reward_change_mean": -0.12495436519384384, + "reward_change_min": -0.2746748309582472, + "reward_change_std": 0.10923281265422702, + "reward_std": 1.0867032408714294, + "rewards/cosine_scaled_reward": -8.322112262248993e-05, + "rewards/format_reward": 0.7916666865348816, + "step": 383 + }, + { + "advantage_max": 1.5250187814235687, + "advantage_mean": -1.1734665017471002e-07, + "advantage_min": -1.0902344584465027, + "advantage_std": 0.9998218566179276, + "completion_length": 1203.895851135254, + "epoch": 0.43885714285714283, + "grad_norm": 1.4417327642440796, + "kl": 0.1812744140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0072, + "reward": 1.1576847899705172, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 1.1576847899705172, + "reward_after_std": 0.8532119113951921, + "reward_before_mean": 1.3330178875476122, + "reward_before_std": 0.8452331945300102, + "reward_change_max": 7.525086402893066e-05, + "reward_change_mean": -0.17533311154693365, + "reward_change_min": -0.29297889675945044, + "reward_change_std": 0.11483225552365184, + "reward_std": 0.853211922571063, + "rewards/cosine_scaled_reward": 0.2602589353919029, + "rewards/format_reward": 0.8125000149011612, + "step": 384 + }, + { + "advantage_max": 1.4220309108495712, + "advantage_mean": -1.459072063170197e-08, + "advantage_min": -1.322306603193283, + "advantage_std": 0.9997835829854012, + "completion_length": 1766.1250457763672, + "epoch": 0.44, + "grad_norm": 1.2814421653747559, + "kl": 0.4825325012207031, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0193, + "reward": 0.3343192981556058, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3343192981556058, + "reward_after_std": 0.6680205948650837, + "reward_before_mean": 0.43759361281991005, + "reward_before_std": 0.6733908774331212, + "reward_change_max": 0.0005120784044265747, + "reward_change_mean": -0.10327431745827198, + "reward_change_min": -0.19344800151884556, + "reward_change_std": 0.07480754144489765, + "reward_std": 0.6680206172168255, + "rewards/cosine_scaled_reward": -0.06245320290327072, + "rewards/format_reward": 0.5625000167638063, + "step": 385 + }, + { + "advantage_max": 1.5401915460824966, + "advantage_mean": -3.539025822396624e-08, + "advantage_min": -1.2550265565514565, + "advantage_std": 0.9997994750738144, + "completion_length": 1499.6875305175781, + "epoch": 0.44114285714285717, + "grad_norm": 1.1877615451812744, + "kl": 0.3754119873046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.015, + "reward": 0.7220457578077912, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.7220457578077912, + "reward_after_std": 0.5384745597839355, + "reward_before_mean": 0.863738858141005, + "reward_before_std": 0.5172322764992714, + "reward_change_max": 0.0008039996027946472, + "reward_change_mean": -0.14169306913390756, + "reward_change_min": -0.20913443807512522, + "reward_change_std": 0.08123286440968513, + "reward_std": 0.5384745635092258, + "rewards/cosine_scaled_reward": 0.025619419291615486, + "rewards/format_reward": 0.8125000149011612, + "step": 386 + }, + { + "advantage_max": 1.525049164891243, + "advantage_mean": -4.3461719556603384e-08, + "advantage_min": -1.3049319833517075, + "advantage_std": 0.9997809082269669, + "completion_length": 2077.104232788086, + "epoch": 0.4422857142857143, + "grad_norm": 2.191981554031372, + "kl": 0.593994140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0238, + "reward": 0.6428085435181856, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6428085435181856, + "reward_after_std": 0.6132344920188189, + "reward_before_mean": 0.7758152484893799, + "reward_before_std": 0.6024291254580021, + "reward_change_max": 0.0003018230199813843, + "reward_change_mean": -0.1330067147500813, + "reward_change_min": -0.20926320180296898, + "reward_change_std": 0.08395129209384322, + "reward_std": 0.6132345125079155, + "rewards/cosine_scaled_reward": -0.007925715297460556, + "rewards/format_reward": 0.7916666753590107, + "step": 387 + }, + { + "advantage_max": 1.5071382969617844, + "advantage_mean": -1.2440917529499274e-07, + "advantage_min": -1.265833929181099, + "advantage_std": 0.9998080208897591, + "completion_length": 1300.2292175292969, + "epoch": 0.44342857142857145, + "grad_norm": 1.908517837524414, + "kl": 0.2214508056640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0089, + "reward": 0.8852858282625675, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8852858282625675, + "reward_after_std": 0.7093720734119415, + "reward_before_mean": 1.0393500942736864, + "reward_before_std": 0.6999770719558001, + "reward_change_max": 0.00048132985830307007, + "reward_change_mean": -0.15406430745497346, + "reward_change_min": -0.25052541866898537, + "reward_change_std": 0.09875405393540859, + "reward_std": 0.7093720883131027, + "rewards/cosine_scaled_reward": 0.11342504154890776, + "rewards/format_reward": 0.8125000111758709, + "step": 388 + }, + { + "advantage_max": 1.4574102386832237, + "advantage_mean": -4.967053768289986e-08, + "advantage_min": -1.299857720732689, + "advantage_std": 0.9997845217585564, + "completion_length": 1413.4167022705078, + "epoch": 0.44457142857142856, + "grad_norm": 0.9911835193634033, + "kl": 0.2678680419921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0107, + "reward": 0.7586075998842716, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7586075998842716, + "reward_after_std": 0.6198668666183949, + "reward_before_mean": 0.9024220667779446, + "reward_before_std": 0.6068241856992245, + "reward_change_max": 0.0, + "reward_change_mean": -0.14381443057209253, + "reward_change_min": -0.2356892367824912, + "reward_change_std": 0.08788485545665026, + "reward_std": 0.619866881519556, + "rewards/cosine_scaled_reward": 0.003294333815574646, + "rewards/format_reward": 0.8958333432674408, + "step": 389 + }, + { + "advantage_max": 1.5046132057905197, + "advantage_mean": -3.166496836959354e-08, + "advantage_min": -1.114329144358635, + "advantage_std": 0.9997953996062279, + "completion_length": 1732.0000228881836, + "epoch": 0.44571428571428573, + "grad_norm": 1.1819705963134766, + "kl": 0.4037322998046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0162, + "reward": 0.6063512277323753, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6063512277323753, + "reward_after_std": 0.8176131937652826, + "reward_before_mean": 0.7317489488050342, + "reward_before_std": 0.8221844676882029, + "reward_change_max": 0.0, + "reward_change_mean": -0.12539772223681211, + "reward_change_min": -0.24319585226476192, + "reward_change_std": 0.09066474437713623, + "reward_std": 0.8176132310181856, + "rewards/cosine_scaled_reward": -0.009125546552240849, + "rewards/format_reward": 0.7500000074505806, + "step": 390 + }, + { + "advantage_max": 1.3619165793061256, + "advantage_mean": -2.793967834868738e-08, + "advantage_min": -1.2452645674347878, + "advantage_std": 0.9998195543885231, + "completion_length": 1382.3125267028809, + "epoch": 0.44685714285714284, + "grad_norm": 2.8021371364593506, + "kl": 0.3901214599609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0156, + "reward": 0.8029541606083512, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8029541606083512, + "reward_after_std": 0.6770164184272289, + "reward_before_mean": 0.9506847970187664, + "reward_before_std": 0.6766212470829487, + "reward_change_max": 0.00014794617891311646, + "reward_change_mean": -0.14773060707375407, + "reward_change_min": -0.24860516004264355, + "reward_change_std": 0.09467540634796023, + "reward_std": 0.6770164258778095, + "rewards/cosine_scaled_reward": 0.058675711043179035, + "rewards/format_reward": 0.8333333469927311, + "step": 391 + }, + { + "advantage_max": 1.5857295244932175, + "advantage_mean": -1.5522043150806297e-08, + "advantage_min": -1.0785433277487755, + "advantage_std": 0.9998374804854393, + "completion_length": 1728.770881652832, + "epoch": 0.448, + "grad_norm": 1.6562546491622925, + "kl": 0.5024871826171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0201, + "reward": 0.4962415201589465, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4962415201589465, + "reward_after_std": 0.8928264938294888, + "reward_before_mean": 0.6067917384207249, + "reward_before_std": 0.893587950617075, + "reward_change_max": 0.0, + "reward_change_mean": -0.11055022478103638, + "reward_change_min": -0.2252865731716156, + "reward_change_std": 0.08553655026480556, + "reward_std": 0.89282650873065, + "rewards/cosine_scaled_reward": -0.07160413172096014, + "rewards/format_reward": 0.7500000093132257, + "step": 392 + }, + { + "advantage_max": 1.5273478254675865, + "advantage_mean": -4.967054101356894e-09, + "advantage_min": -1.0835141614079475, + "advantage_std": 0.9998887106776237, + "completion_length": 1534.2291946411133, + "epoch": 0.4491428571428571, + "grad_norm": 1.2979336977005005, + "kl": 0.332183837890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0133, + "reward": 0.658613370731473, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.658613370731473, + "reward_after_std": 1.0538778081536293, + "reward_before_mean": 0.7801401242613792, + "reward_before_std": 1.0620463229715824, + "reward_change_max": 0.0003866031765937805, + "reward_change_mean": -0.12152672559022903, + "reward_change_min": -0.24229469522833824, + "reward_change_std": 0.09234014619141817, + "reward_std": 1.053877830505371, + "rewards/cosine_scaled_reward": -0.016179951839148998, + "rewards/format_reward": 0.812500013038516, + "step": 393 + }, + { + "advantage_max": 1.4706476479768753, + "advantage_mean": -7.326404349861093e-08, + "advantage_min": -1.205642156302929, + "advantage_std": 0.9997804909944534, + "completion_length": 1600.2500457763672, + "epoch": 0.4502857142857143, + "grad_norm": 3.4596385955810547, + "kl": 0.54315185546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0217, + "reward": 0.4269302450120449, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4269302450120449, + "reward_after_std": 0.5508731566369534, + "reward_before_mean": 0.5413588918745518, + "reward_before_std": 0.5420444570481777, + "reward_change_max": 0.0, + "reward_change_mean": -0.11442865990102291, + "reward_change_min": -0.1859685741364956, + "reward_change_std": 0.07011770131066442, + "reward_std": 0.5508731603622437, + "rewards/cosine_scaled_reward": -0.1564038973301649, + "rewards/format_reward": 0.854166679084301, + "step": 394 + }, + { + "advantage_max": 1.6219586357474327, + "advantage_mean": -6.891787274199146e-08, + "advantage_min": -1.0342840030789375, + "advantage_std": 0.9997628480195999, + "completion_length": 1293.2500267028809, + "epoch": 0.4514285714285714, + "grad_norm": 1.1021445989608765, + "kl": 0.4151763916015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0166, + "reward": 0.6994353365153074, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6994353365153074, + "reward_after_std": 0.5848857667297125, + "reward_before_mean": 0.837128933519125, + "reward_before_std": 0.5620684530586004, + "reward_change_max": 0.0002937912940979004, + "reward_change_mean": -0.13769357604905963, + "reward_change_min": -0.2083145957440138, + "reward_change_std": 0.07981055462732911, + "reward_std": 0.5848857890814543, + "rewards/cosine_scaled_reward": 0.022731118835508823, + "rewards/format_reward": 0.7916666679084301, + "step": 395 + }, + { + "advantage_max": 1.5212388187646866, + "advantage_mean": -1.5211602200082552e-08, + "advantage_min": -1.202341765165329, + "advantage_std": 0.9998112320899963, + "completion_length": 1216.1875228881836, + "epoch": 0.45257142857142857, + "grad_norm": 1.7865386009216309, + "kl": 0.31902313232421875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.134908592756607e-07, + "loss": 0.0128, + "reward": 0.5041277073323727, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5041277073323727, + "reward_after_std": 0.7280652057379484, + "reward_before_mean": 0.6195306107401848, + "reward_before_std": 0.724288634955883, + "reward_change_max": 0.0, + "reward_change_mean": -0.11540291737765074, + "reward_change_min": -0.19210434705018997, + "reward_change_std": 0.07592118624597788, + "reward_std": 0.728065237402916, + "rewards/cosine_scaled_reward": -0.09648469707462937, + "rewards/format_reward": 0.812500013038516, + "step": 396 + }, + { + "advantage_max": 1.4690123051404953, + "advantage_mean": -1.7384688910659918e-08, + "advantage_min": -1.2216744720935822, + "advantage_std": 0.9997849240899086, + "completion_length": 1188.9791870117188, + "epoch": 0.45371428571428574, + "grad_norm": 1.075379729270935, + "kl": 0.24041748046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0096, + "reward": 0.5569867407903075, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5569867407903075, + "reward_after_std": 0.7179490067064762, + "reward_before_mean": 0.6796721828486625, + "reward_before_std": 0.716059971600771, + "reward_change_max": 0.0, + "reward_change_mean": -0.12268543615937233, + "reward_change_min": -0.2074470091611147, + "reward_change_std": 0.08350208308547735, + "reward_std": 0.7179490327835083, + "rewards/cosine_scaled_reward": -0.06641392130404711, + "rewards/format_reward": 0.8125000186264515, + "step": 397 + }, + { + "advantage_max": 1.5344518646597862, + "advantage_mean": -3.57006997298015e-08, + "advantage_min": -1.1795164123177528, + "advantage_std": 0.999800331890583, + "completion_length": 1327.895881652832, + "epoch": 0.45485714285714285, + "grad_norm": 1.5244557857513428, + "kl": 0.3427886962890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0137, + "reward": 0.5405859863385558, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5405859863385558, + "reward_after_std": 0.6886147819459438, + "reward_before_mean": 0.6625741459429264, + "reward_before_std": 0.6864035166800022, + "reward_change_max": 0.0, + "reward_change_mean": -0.12198818568140268, + "reward_change_min": -0.20752749498933554, + "reward_change_std": 0.07999280700460076, + "reward_std": 0.6886148191988468, + "rewards/cosine_scaled_reward": -0.07496293634176254, + "rewards/format_reward": 0.812500013038516, + "step": 398 + }, + { + "advantage_max": 1.5416178330779076, + "advantage_mean": -3.539025855703315e-08, + "advantage_min": -1.1947909593582153, + "advantage_std": 0.9998286813497543, + "completion_length": 1322.5417175292969, + "epoch": 0.456, + "grad_norm": 1.22747802734375, + "kl": 0.246826171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0099, + "reward": 0.8436704650521278, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8436704650521278, + "reward_after_std": 0.8051801025867462, + "reward_before_mean": 0.9887518286705017, + "reward_before_std": 0.7946115285158157, + "reward_change_max": 0.0, + "reward_change_mean": -0.1450813477858901, + "reward_change_min": -0.23790498822927475, + "reward_change_std": 0.09142806520685554, + "reward_std": 0.8051801361143589, + "rewards/cosine_scaled_reward": 0.05687588080763817, + "rewards/format_reward": 0.8750000149011612, + "step": 399 + }, + { + "advantage_max": 1.418625384569168, + "advantage_mean": -9.685755153476805e-08, + "advantage_min": -1.2240911647677422, + "advantage_std": 0.999837763607502, + "completion_length": 1084.7291793823242, + "epoch": 0.45714285714285713, + "grad_norm": 1.2166296243667603, + "kl": 0.27674102783203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0111, + "reward": 1.2985866218805313, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 1.2985866218805313, + "reward_after_std": 0.8396326303482056, + "reward_before_mean": 1.488419085741043, + "reward_before_std": 0.8349230848252773, + "reward_change_max": 0.00015439093112945557, + "reward_change_mean": -0.18983250577002764, + "reward_change_min": -0.311465822160244, + "reward_change_std": 0.11947321891784668, + "reward_std": 0.8396326526999474, + "rewards/cosine_scaled_reward": 0.2962928842753172, + "rewards/format_reward": 0.8958333358168602, + "step": 400 + }, + { + "advantage_max": 1.3777280449867249, + "advantage_mean": -4.4082603789519226e-08, + "advantage_min": -1.185884103178978, + "advantage_std": 0.9997954741120338, + "completion_length": 1630.770866394043, + "epoch": 0.4582857142857143, + "grad_norm": 1.5643917322158813, + "kl": 0.403472900390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0161, + "reward": 0.5990715604275465, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5990715604275465, + "reward_after_std": 0.5103095322847366, + "reward_before_mean": 0.7318368963897228, + "reward_before_std": 0.5031833313405514, + "reward_change_max": 0.0, + "reward_change_mean": -0.1327653443440795, + "reward_change_min": -0.21006182581186295, + "reward_change_std": 0.07824637182056904, + "reward_std": 0.5103095509111881, + "rewards/cosine_scaled_reward": -0.040331561118364334, + "rewards/format_reward": 0.8125000149011612, + "step": 401 + }, + { + "advantage_max": 1.3117186725139618, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -1.4394584074616432, + "advantage_std": 0.99981340020895, + "completion_length": 1447.3333740234375, + "epoch": 0.4594285714285714, + "grad_norm": 1.5800583362579346, + "kl": 0.4939117431640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0198, + "reward": 0.7458519488573074, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7458519488573074, + "reward_after_std": 0.7214295417070389, + "reward_before_mean": 0.8878723345696926, + "reward_before_std": 0.7308622431010008, + "reward_change_max": 0.0, + "reward_change_mean": -0.14202034566551447, + "reward_change_min": -0.22765328735113144, + "reward_change_std": 0.0920207086019218, + "reward_std": 0.7214295528829098, + "rewards/cosine_scaled_reward": 0.006436141207814217, + "rewards/format_reward": 0.8750000223517418, + "step": 402 + }, + { + "advantage_max": 1.5777545720338821, + "advantage_mean": -1.4901161582425715e-08, + "advantage_min": -1.0918622389435768, + "advantage_std": 0.9997143223881721, + "completion_length": 1011.1458587646484, + "epoch": 0.4605714285714286, + "grad_norm": 2.2372660636901855, + "kl": 0.34453582763671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0138, + "reward": 0.7979833465069532, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7979833465069532, + "reward_after_std": 0.46419939398765564, + "reward_before_mean": 0.9484133645892143, + "reward_before_std": 0.43025317415595055, + "reward_change_max": 0.0, + "reward_change_mean": -0.15042999852448702, + "reward_change_min": -0.22080524545162916, + "reward_change_std": 0.08285338198766112, + "reward_std": 0.46419941633939743, + "rewards/cosine_scaled_reward": 0.0783733231946826, + "rewards/format_reward": 0.7916666697710752, + "step": 403 + }, + { + "advantage_max": 1.499540537595749, + "advantage_mean": -1.204510566843453e-07, + "advantage_min": -1.2422676607966423, + "advantage_std": 0.9997927024960518, + "completion_length": 1443.4375228881836, + "epoch": 0.4617142857142857, + "grad_norm": 1.9905834197998047, + "kl": 0.4494171142578125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.018, + "reward": 0.7095265840180218, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7095265840180218, + "reward_after_std": 0.5653872638940811, + "reward_before_mean": 0.8499099458567798, + "reward_before_std": 0.5469077229499817, + "reward_change_max": 0.0, + "reward_change_mean": -0.1403833832591772, + "reward_change_min": -0.212770015001297, + "reward_change_std": 0.0808732183650136, + "reward_std": 0.5653872825205326, + "rewards/cosine_scaled_reward": -0.022961702197790146, + "rewards/format_reward": 0.8958333432674408, + "step": 404 + }, + { + "advantage_max": 1.7194660305976868, + "advantage_mean": 4.470348680118974e-08, + "advantage_min": -1.0948041006922722, + "advantage_std": 0.9997475519776344, + "completion_length": 1167.2917251586914, + "epoch": 0.46285714285714286, + "grad_norm": 1.041972041130066, + "kl": 0.289215087890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0116, + "reward": 1.2832879004999995, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.2832879004999995, + "reward_after_std": 0.5691489465534687, + "reward_before_mean": 1.4747139997780323, + "reward_before_std": 0.5159460082650185, + "reward_change_max": 0.0, + "reward_change_mean": -0.19142606016248465, + "reward_change_min": -0.26474354043602943, + "reward_change_std": 0.10462368186563253, + "reward_std": 0.5691489800810814, + "rewards/cosine_scaled_reward": 0.27902365755289793, + "rewards/format_reward": 0.916666679084301, + "step": 405 + }, + { + "advantage_max": 1.6274562031030655, + "advantage_mean": -3.2906732005955064e-08, + "advantage_min": -1.0492018535733223, + "advantage_std": 0.9997759088873863, + "completion_length": 1456.5833740234375, + "epoch": 0.464, + "grad_norm": 2.3249237537384033, + "kl": 0.54071044921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0216, + "reward": 0.8334343023598194, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8334343023598194, + "reward_after_std": 0.7299204599112272, + "reward_before_mean": 0.9799941023811698, + "reward_before_std": 0.7119660619646311, + "reward_change_max": 0.0, + "reward_change_mean": -0.14655979629606009, + "reward_change_min": -0.24078886583447456, + "reward_change_std": 0.0892445114441216, + "reward_std": 0.7299204748123884, + "rewards/cosine_scaled_reward": 0.010830356506630778, + "rewards/format_reward": 0.9583333358168602, + "step": 406 + }, + { + "advantage_max": 1.3725356981158257, + "advantage_mean": -2.2351741679749182e-08, + "advantage_min": -1.2306954599916935, + "advantage_std": 0.9997261166572571, + "completion_length": 1250.5625381469727, + "epoch": 0.46514285714285714, + "grad_norm": 1.371690034866333, + "kl": 0.2197418212890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0088, + "reward": 0.752561591565609, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.752561591565609, + "reward_after_std": 0.566648356616497, + "reward_before_mean": 0.9003201862797141, + "reward_before_std": 0.5630356483161449, + "reward_change_max": 0.0, + "reward_change_mean": -0.1477585742250085, + "reward_change_min": -0.22929508332163095, + "reward_change_std": 0.09294451726600528, + "reward_std": 0.5666483640670776, + "rewards/cosine_scaled_reward": 0.002243412658572197, + "rewards/format_reward": 0.8958333358168602, + "step": 407 + }, + { + "advantage_max": 1.6451401710510254, + "advantage_mean": -1.5770395678238458e-07, + "advantage_min": -0.9748510047793388, + "advantage_std": 0.9998021051287651, + "completion_length": 1450.2709045410156, + "epoch": 0.4662857142857143, + "grad_norm": 1.4156643152236938, + "kl": 0.18245697021484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0073, + "reward": 0.9860383477061987, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9860383477061987, + "reward_after_std": 0.7557676881551743, + "reward_before_mean": 1.1457962021231651, + "reward_before_std": 0.7287969561293721, + "reward_change_max": 0.0, + "reward_change_mean": -0.1597578590735793, + "reward_change_min": -0.262321799993515, + "reward_change_std": 0.09481826471164823, + "reward_std": 0.7557677067816257, + "rewards/cosine_scaled_reward": 0.10414808837231249, + "rewards/format_reward": 0.9375000074505806, + "step": 408 + }, + { + "advantage_max": 1.4862465560436249, + "advantage_mean": -9.31322596819939e-09, + "advantage_min": -1.1640625074505806, + "advantage_std": 0.999810591340065, + "completion_length": 1856.8333740234375, + "epoch": 0.4674285714285714, + "grad_norm": 1.2075356245040894, + "kl": 0.50775146484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0203, + "reward": 0.48998264502733946, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.48998264502733946, + "reward_after_std": 0.6932985447347164, + "reward_before_mean": 0.6081876456737518, + "reward_before_std": 0.6996648348867893, + "reward_change_max": 0.000249423086643219, + "reward_change_mean": -0.11820495565189049, + "reward_change_min": -0.21144821494817734, + "reward_change_std": 0.08027452556416392, + "reward_std": 0.6932985782623291, + "rewards/cosine_scaled_reward": -0.09173952601850033, + "rewards/format_reward": 0.791666679084301, + "step": 409 + }, + { + "advantage_max": 1.6873383074998856, + "advantage_mean": -1.2417640249395845e-09, + "advantage_min": -1.0137062221765518, + "advantage_std": 0.9998016655445099, + "completion_length": 1704.4792213439941, + "epoch": 0.4685714285714286, + "grad_norm": 1.8969690799713135, + "kl": 0.7142829895019531, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0287, + "reward": 0.48481374606490135, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.48481374606490135, + "reward_after_std": 0.7214395180344582, + "reward_before_mean": 0.5968669969588518, + "reward_before_std": 0.7014450505375862, + "reward_change_max": 0.00011499971151351929, + "reward_change_mean": -0.11205322481691837, + "reward_change_min": -0.17780397459864616, + "reward_change_std": 0.06741911824792624, + "reward_std": 0.7214395329356194, + "rewards/cosine_scaled_reward": -0.09739985689520836, + "rewards/format_reward": 0.7916666734963655, + "step": 410 + }, + { + "advantage_max": 1.344476506114006, + "advantage_mean": 6.208816794028849e-10, + "advantage_min": -1.2045889720320702, + "advantage_std": 0.999833382666111, + "completion_length": 1973.6458740234375, + "epoch": 0.4697142857142857, + "grad_norm": 1.6440776586532593, + "kl": 0.85833740234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0343, + "reward": 0.42943368293344975, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.42943368293344975, + "reward_after_std": 0.8023342192173004, + "reward_before_mean": 0.5399559205397964, + "reward_before_std": 0.821831464767456, + "reward_change_max": 0.0005313009023666382, + "reward_change_mean": -0.11052223108708858, + "reward_change_min": -0.21459924895316362, + "reward_change_std": 0.08706922875717282, + "reward_std": 0.8023342490196228, + "rewards/cosine_scaled_reward": -0.06335536949336529, + "rewards/format_reward": 0.6666666734963655, + "step": 411 + }, + { + "advantage_max": 1.4021871536970139, + "advantage_mean": -6.270905494876189e-08, + "advantage_min": -1.19430410861969, + "advantage_std": 0.9998315647244453, + "completion_length": 1140.3958702087402, + "epoch": 0.47085714285714286, + "grad_norm": 1.5791558027267456, + "kl": 0.285064697265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0114, + "reward": 0.7805769965052605, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7805769965052605, + "reward_after_std": 0.8468297980725765, + "reward_before_mean": 0.9220419060438871, + "reward_before_std": 0.8579179160296917, + "reward_change_max": 0.0002397671341896057, + "reward_change_mean": -0.14146495051681995, + "reward_change_min": -0.25670984014868736, + "reward_change_std": 0.09965648734942079, + "reward_std": 0.8468298017978668, + "rewards/cosine_scaled_reward": 0.0235209371894598, + "rewards/format_reward": 0.8750000149011612, + "step": 412 + }, + { + "advantage_max": 1.4463470578193665, + "advantage_mean": -3.352761379638025e-08, + "advantage_min": -1.2316881269216537, + "advantage_std": 0.9998175576329231, + "completion_length": 1210.2500610351562, + "epoch": 0.472, + "grad_norm": 1.284919261932373, + "kl": 0.293426513671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0117, + "reward": 0.8780468343757093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8780468343757093, + "reward_after_std": 0.7943502962589264, + "reward_before_mean": 1.0290323235094547, + "reward_before_std": 0.7974189501255751, + "reward_change_max": 0.0, + "reward_change_mean": -0.15098547749221325, + "reward_change_min": -0.2520767832174897, + "reward_change_std": 0.09872942883521318, + "reward_std": 0.7943503148853779, + "rewards/cosine_scaled_reward": 0.06659948639571667, + "rewards/format_reward": 0.8958333358168602, + "step": 413 + }, + { + "advantage_max": 1.4396483451128006, + "advantage_mean": -2.2351742789972207e-08, + "advantage_min": -1.1945699751377106, + "advantage_std": 0.9998373538255692, + "completion_length": 1927.7083892822266, + "epoch": 0.47314285714285714, + "grad_norm": 1.1197619438171387, + "kl": 0.7069091796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0283, + "reward": 0.33350180089473724, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.33350180089473724, + "reward_after_std": 0.7306559979915619, + "reward_before_mean": 0.43494257144629955, + "reward_before_std": 0.7363609932363033, + "reward_change_max": 0.00020420551300048828, + "reward_change_mean": -0.10144078405573964, + "reward_change_min": -0.20094910357147455, + "reward_change_std": 0.07442115899175406, + "reward_std": 0.7306560054421425, + "rewards/cosine_scaled_reward": -0.14711205288767815, + "rewards/format_reward": 0.7291666772216558, + "step": 414 + }, + { + "advantage_max": 1.3788573667407036, + "advantage_mean": -3.601114040296949e-08, + "advantage_min": -1.2387224435806274, + "advantage_std": 0.9998515993356705, + "completion_length": 1650.0000381469727, + "epoch": 0.4742857142857143, + "grad_norm": 1.4316720962524414, + "kl": 0.5947723388671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0238, + "reward": 0.5648692059330642, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5648692059330642, + "reward_after_std": 0.8455432876944542, + "reward_before_mean": 0.685730435885489, + "reward_before_std": 0.862141527235508, + "reward_change_max": 0.0007596015930175781, + "reward_change_mean": -0.12086124438792467, + "reward_change_min": -0.23698932025581598, + "reward_change_std": 0.09399270685389638, + "reward_std": 0.8455432951450348, + "rewards/cosine_scaled_reward": -0.04255145916249603, + "rewards/format_reward": 0.7708333469927311, + "step": 415 + }, + { + "advantage_max": 1.5565531551837921, + "advantage_mean": -3.725290476097598e-08, + "advantage_min": -1.1521182730793953, + "advantage_std": 0.9998418241739273, + "completion_length": 1293.5000457763672, + "epoch": 0.4754285714285714, + "grad_norm": 1.0963540077209473, + "kl": 0.36377716064453125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0146, + "reward": 1.0165877528488636, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0165877528488636, + "reward_after_std": 0.7492444217205048, + "reward_before_mean": 1.1790525019168854, + "reward_before_std": 0.7278725281357765, + "reward_change_max": 0.0, + "reward_change_mean": -0.16246474720537663, + "reward_change_min": -0.24140873830765486, + "reward_change_std": 0.09044545330107212, + "reward_std": 0.7492444440722466, + "rewards/cosine_scaled_reward": 0.11035957233980298, + "rewards/format_reward": 0.9583333432674408, + "step": 416 + }, + { + "advantage_max": 1.604439303278923, + "advantage_mean": -3.539025844601085e-08, + "advantage_min": -1.240592211484909, + "advantage_std": 0.9998293966054916, + "completion_length": 1556.0000610351562, + "epoch": 0.4765714285714286, + "grad_norm": 2.053286075592041, + "kl": 0.60235595703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0241, + "reward": 0.44515037967357785, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.44515037967357785, + "reward_after_std": 0.6873351112008095, + "reward_before_mean": 0.5556222386658192, + "reward_before_std": 0.6735187582671642, + "reward_change_max": 0.0002534613013267517, + "reward_change_mean": -0.11047186609357595, + "reward_change_min": -0.18285326100885868, + "reward_change_std": 0.0725367316044867, + "reward_std": 0.6873351410031319, + "rewards/cosine_scaled_reward": -0.09718889463692904, + "rewards/format_reward": 0.7500000186264515, + "step": 417 + }, + { + "advantage_max": 1.4074752032756805, + "advantage_mean": -8.257727068805565e-08, + "advantage_min": -1.2240310907363892, + "advantage_std": 0.9998078942298889, + "completion_length": 1065.7708587646484, + "epoch": 0.4777142857142857, + "grad_norm": 0.9967370629310608, + "kl": 0.18988037109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0076, + "reward": 1.1236931383609772, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.1236931383609772, + "reward_after_std": 0.6934684608131647, + "reward_before_mean": 1.3006681762635708, + "reward_before_std": 0.6769684087485075, + "reward_change_max": 0.00027079880237579346, + "reward_change_mean": -0.17697507236152887, + "reward_change_min": -0.2605667933821678, + "reward_change_std": 0.10411691945046186, + "reward_std": 0.6934684999287128, + "rewards/cosine_scaled_reward": 0.18158408568706363, + "rewards/format_reward": 0.9375000149011612, + "step": 418 + }, + { + "advantage_max": 1.5458858013153076, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -1.1178877651691437, + "advantage_std": 0.9998831227421761, + "completion_length": 1536.9375534057617, + "epoch": 0.47885714285714287, + "grad_norm": 2.0971181392669678, + "kl": 0.45917510986328125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0184, + "reward": 0.719907971099019, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.719907971099019, + "reward_after_std": 0.9108130037784576, + "reward_before_mean": 0.8503690185025334, + "reward_before_std": 0.909957580268383, + "reward_change_max": 0.00023179501295089722, + "reward_change_mean": -0.13046098314225674, + "reward_change_min": -0.23345648124814034, + "reward_change_std": 0.08942006062716246, + "reward_std": 0.9108130559325218, + "rewards/cosine_scaled_reward": 0.008517796639353037, + "rewards/format_reward": 0.8333333507180214, + "step": 419 + }, + { + "advantage_max": 1.655995175242424, + "advantage_mean": -3.6011140958081e-08, + "advantage_min": -1.1085584685206413, + "advantage_std": 0.9997412338852882, + "completion_length": 1058.2916870117188, + "epoch": 0.48, + "grad_norm": 2.4459757804870605, + "kl": 0.248291015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0099, + "reward": 0.5104020063299686, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5104020063299686, + "reward_after_std": 0.5309116821736097, + "reward_before_mean": 0.632997702807188, + "reward_before_std": 0.5120272561907768, + "reward_change_max": 0.0, + "reward_change_mean": -0.12259569112211466, + "reward_change_min": -0.18111898750066757, + "reward_change_std": 0.07065618922933936, + "reward_std": 0.5309117008000612, + "rewards/cosine_scaled_reward": -0.1105844946578145, + "rewards/format_reward": 0.8541666865348816, + "step": 420 + }, + { + "advantage_max": 1.58281809091568, + "advantage_mean": -1.2728075760026769e-08, + "advantage_min": -1.1530317813158035, + "advantage_std": 0.9998201727867126, + "completion_length": 1308.645881652832, + "epoch": 0.48114285714285715, + "grad_norm": 1.251358985900879, + "kl": 0.3505859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.014, + "reward": 0.489888122305274, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.489888122305274, + "reward_after_std": 0.712477371096611, + "reward_before_mean": 0.6046459935605526, + "reward_before_std": 0.7005138099193573, + "reward_change_max": 0.00022549182176589966, + "reward_change_mean": -0.11475785728543997, + "reward_change_min": -0.20619327947497368, + "reward_change_std": 0.07444826699793339, + "reward_std": 0.712477408349514, + "rewards/cosine_scaled_reward": -0.16642701055388898, + "rewards/format_reward": 0.9375000149011612, + "step": 421 + }, + { + "advantage_max": 1.1890934333205223, + "advantage_mean": -1.30385160446167e-08, + "advantage_min": -1.4458886981010437, + "advantage_std": 0.9998152554035187, + "completion_length": 1730.9375762939453, + "epoch": 0.48228571428571426, + "grad_norm": 1.2435977458953857, + "kl": 0.5941848754882812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0237, + "reward": 0.5873144883662462, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5873144883662462, + "reward_after_std": 0.6254005320370197, + "reward_before_mean": 0.7183939876267686, + "reward_before_std": 0.6377482563257217, + "reward_change_max": 0.00011101365089416504, + "reward_change_mean": -0.13107949635013938, + "reward_change_min": -0.2110171616077423, + "reward_change_std": 0.08531484520062804, + "reward_std": 0.6254005543887615, + "rewards/cosine_scaled_reward": -0.057469683699309826, + "rewards/format_reward": 0.8333333469927311, + "step": 422 + }, + { + "advantage_max": 1.4273897409439087, + "advantage_mean": -3.8494666954047574e-08, + "advantage_min": -1.2456609457731247, + "advantage_std": 0.9997662082314491, + "completion_length": 1455.6667098999023, + "epoch": 0.48342857142857143, + "grad_norm": 1.3013380765914917, + "kl": 0.4249420166015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.017, + "reward": 0.41419703885912895, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41419703885912895, + "reward_after_std": 0.6243961397558451, + "reward_before_mean": 0.5267076604068279, + "reward_before_std": 0.6270595081150532, + "reward_change_max": 0.0, + "reward_change_mean": -0.11251062992960215, + "reward_change_min": -0.19462671875953674, + "reward_change_std": 0.0770272184163332, + "reward_std": 0.6243961472064257, + "rewards/cosine_scaled_reward": -0.1324795256368816, + "rewards/format_reward": 0.7916666772216558, + "step": 423 + }, + { + "advantage_max": 1.5881786197423935, + "advantage_mean": -3.725290464995368e-08, + "advantage_min": -0.9874609559774399, + "advantage_std": 0.9997924491763115, + "completion_length": 1607.1458740234375, + "epoch": 0.4845714285714286, + "grad_norm": 2.638868570327759, + "kl": 0.5507659912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.022, + "reward": 0.493446989916265, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.493446989916265, + "reward_after_std": 0.5820804536342621, + "reward_before_mean": 0.611056812107563, + "reward_before_std": 0.5580542217940092, + "reward_change_max": 0.0, + "reward_change_mean": -0.11760981846600771, + "reward_change_min": -0.18333129212260246, + "reward_change_std": 0.06667398847639561, + "reward_std": 0.5820804722607136, + "rewards/cosine_scaled_reward": -0.16322161629796028, + "rewards/format_reward": 0.9375000149011612, + "step": 424 + }, + { + "advantage_max": 1.5448092222213745, + "advantage_mean": -1.297642824305001e-07, + "advantage_min": -1.1348483115434647, + "advantage_std": 0.9997741878032684, + "completion_length": 1215.4791793823242, + "epoch": 0.4857142857142857, + "grad_norm": 2.198212146759033, + "kl": 0.18170928955078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0073, + "reward": 1.1097919731400907, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.1097919731400907, + "reward_after_std": 0.7644475474953651, + "reward_before_mean": 1.2826090063899755, + "reward_before_std": 0.7507690298371017, + "reward_change_max": 0.00022091716527938843, + "reward_change_mean": -0.1728170160204172, + "reward_change_min": -0.28230510652065277, + "reward_change_std": 0.11318794079124928, + "reward_std": 0.764447558671236, + "rewards/cosine_scaled_reward": 0.18297115061432123, + "rewards/format_reward": 0.916666679084301, + "step": 425 + }, + { + "advantage_max": 1.6671266108751297, + "advantage_mean": -1.0461857269383756e-07, + "advantage_min": -0.9880219921469688, + "advantage_std": 0.9998507276177406, + "completion_length": 1172.0000381469727, + "epoch": 0.4868571428571429, + "grad_norm": 1.7160371541976929, + "kl": 0.5969390869140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0239, + "reward": 0.8034339547157288, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8034339547157288, + "reward_after_std": 0.8283376954495907, + "reward_before_mean": 0.9418845884501934, + "reward_before_std": 0.8046383187174797, + "reward_change_max": 0.00014852732419967651, + "reward_change_mean": -0.13845067285001278, + "reward_change_min": -0.2270393744111061, + "reward_change_std": 0.08508762950077653, + "reward_std": 0.8283377438783646, + "rewards/cosine_scaled_reward": -0.008224384859204292, + "rewards/format_reward": 0.9583333432674408, + "step": 426 + }, + { + "advantage_max": 1.3611183911561966, + "advantage_mean": -2.359350548264416e-08, + "advantage_min": -1.2368653267621994, + "advantage_std": 0.9998747855424881, + "completion_length": 1647.8541870117188, + "epoch": 0.488, + "grad_norm": 1.7571823596954346, + "kl": 0.385345458984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0154, + "reward": 0.8747087176889181, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8747087176889181, + "reward_after_std": 0.9652072787284851, + "reward_before_mean": 1.0225733071565628, + "reward_before_std": 0.9864769503474236, + "reward_change_max": 0.0, + "reward_change_mean": -0.14786457549780607, + "reward_change_min": -0.2746960259974003, + "reward_change_std": 0.1091885594651103, + "reward_std": 0.9652072936296463, + "rewards/cosine_scaled_reward": 0.10503664053976536, + "rewards/format_reward": 0.8125000074505806, + "step": 427 + }, + { + "advantage_max": 1.4560476392507553, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -1.1091465428471565, + "advantage_std": 0.999860942363739, + "completion_length": 1579.708396911621, + "epoch": 0.48914285714285716, + "grad_norm": 1.801839828491211, + "kl": 0.40151214599609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.016, + "reward": 0.5794356926344335, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5794356926344335, + "reward_after_std": 0.8744179606437683, + "reward_before_mean": 0.699925497174263, + "reward_before_std": 0.8827432319521904, + "reward_change_max": 0.00021196156740188599, + "reward_change_mean": -0.12048980919644237, + "reward_change_min": -0.23152614384889603, + "reward_change_std": 0.08844160987064242, + "reward_std": 0.8744179755449295, + "rewards/cosine_scaled_reward": -0.05628725979477167, + "rewards/format_reward": 0.8125000074505806, + "step": 428 + }, + { + "advantage_max": 1.6187241524457932, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -1.0603943690657616, + "advantage_std": 0.9998475760221481, + "completion_length": 1263.1042098999023, + "epoch": 0.49028571428571427, + "grad_norm": 1.4531282186508179, + "kl": 0.6030960083007812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0241, + "reward": 0.43697307258844376, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.43697307258844376, + "reward_after_std": 0.7528307847678661, + "reward_before_mean": 0.5441529117524624, + "reward_before_std": 0.7386700659990311, + "reward_change_max": 0.0006353110074996948, + "reward_change_mean": -0.10717981401830912, + "reward_change_min": -0.18293942417949438, + "reward_change_std": 0.06950774369761348, + "reward_std": 0.7528308033943176, + "rewards/cosine_scaled_reward": -0.16542356554418802, + "rewards/format_reward": 0.8750000074505806, + "step": 429 + }, + { + "advantage_max": 1.566932499408722, + "advantage_mean": 1.0865429667106241e-08, + "advantage_min": -1.0978027358651161, + "advantage_std": 0.9998347610235214, + "completion_length": 1255.0417098999023, + "epoch": 0.49142857142857144, + "grad_norm": 1.7678227424621582, + "kl": 0.28025054931640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0112, + "reward": 0.6415118533186615, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6415118533186615, + "reward_after_std": 0.7025103121995926, + "reward_before_mean": 0.7709160801023245, + "reward_before_std": 0.6886178329586983, + "reward_change_max": 0.00012315809726715088, + "reward_change_mean": -0.12940420676022768, + "reward_change_min": -0.21598459593951702, + "reward_change_std": 0.08058440638706088, + "reward_std": 0.7025103233754635, + "rewards/cosine_scaled_reward": -0.06245864322409034, + "rewards/format_reward": 0.8958333507180214, + "step": 430 + }, + { + "advantage_max": 1.4055536314845085, + "advantage_mean": 3.725291186640334e-09, + "advantage_min": -1.2864673808217049, + "advantage_std": 0.9997454509139061, + "completion_length": 1462.4375457763672, + "epoch": 0.49257142857142855, + "grad_norm": 1.5079882144927979, + "kl": 0.5954742431640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0238, + "reward": 0.5773325273767114, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5773325273767114, + "reward_after_std": 0.5111439414322376, + "reward_before_mean": 0.7075983798131347, + "reward_before_std": 0.5024440437555313, + "reward_change_max": 0.0, + "reward_change_mean": -0.1302658156491816, + "reward_change_min": -0.2057191450148821, + "reward_change_std": 0.07728143502026796, + "reward_std": 0.5111439451575279, + "rewards/cosine_scaled_reward": -0.10453415662050247, + "rewards/format_reward": 0.9166666716337204, + "step": 431 + }, + { + "advantage_max": 1.557257518172264, + "advantage_mean": -2.7318796336217588e-08, + "advantage_min": -0.9859335571527481, + "advantage_std": 0.9998309835791588, + "completion_length": 1686.395851135254, + "epoch": 0.4937142857142857, + "grad_norm": 2.2990598678588867, + "kl": 0.598297119140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0239, + "reward": 0.37111999094486237, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.37111999094486237, + "reward_after_std": 0.7255898527801037, + "reward_before_mean": 0.47415912989526987, + "reward_before_std": 0.7179553136229515, + "reward_change_max": 0.0, + "reward_change_mean": -0.10303913801908493, + "reward_change_min": -0.1894255429506302, + "reward_change_std": 0.06987693347036839, + "reward_std": 0.7255898788571358, + "rewards/cosine_scaled_reward": -0.15875378297641873, + "rewards/format_reward": 0.7916666753590107, + "step": 432 + }, + { + "advantage_max": 1.3101054728031158, + "advantage_mean": -5.587935669737476e-09, + "advantage_min": -1.4408425688743591, + "advantage_std": 0.9997941181063652, + "completion_length": 1531.2917251586914, + "epoch": 0.4948571428571429, + "grad_norm": 1.3603363037109375, + "kl": 0.4079132080078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0163, + "reward": 0.7492245864123106, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7492245864123106, + "reward_after_std": 0.6005501635372639, + "reward_before_mean": 0.894620930776, + "reward_before_std": 0.6026262082159519, + "reward_change_max": 2.2485852241516113e-05, + "reward_change_mean": -0.14539633970707655, + "reward_change_min": -0.23189252614974976, + "reward_change_std": 0.09261250263080001, + "reward_std": 0.6005501784384251, + "rewards/cosine_scaled_reward": 0.030643776757642627, + "rewards/format_reward": 0.8333333507180214, + "step": 433 + }, + { + "advantage_max": 1.482622116804123, + "advantage_mean": -3.321717245707845e-08, + "advantage_min": -1.2409061938524246, + "advantage_std": 0.9998137354850769, + "completion_length": 1580.020881652832, + "epoch": 0.496, + "grad_norm": 2.1808040142059326, + "kl": 0.7606887817382812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0305, + "reward": 0.13291472848504782, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13291472848504782, + "reward_after_std": 0.6066357120871544, + "reward_before_mean": 0.21792132034897804, + "reward_before_std": 0.6074311584234238, + "reward_change_max": 0.0010739341378211975, + "reward_change_mean": -0.08500660490244627, + "reward_change_min": -0.1500966176390648, + "reward_change_std": 0.06016435939818621, + "reward_std": 0.6066357530653477, + "rewards/cosine_scaled_reward": -0.25562268076464534, + "rewards/format_reward": 0.729166679084301, + "step": 434 + }, + { + "advantage_max": 1.4640971571207047, + "advantage_mean": -1.3038516710750514e-08, + "advantage_min": -1.2968028336763382, + "advantage_std": 0.9997572973370552, + "completion_length": 1197.9792098999023, + "epoch": 0.49714285714285716, + "grad_norm": 1.8218203783035278, + "kl": 0.5258941650390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0211, + "reward": 0.5289339208975434, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5289339208975434, + "reward_after_std": 0.48837145417928696, + "reward_before_mean": 0.6544436899712309, + "reward_before_std": 0.47141289338469505, + "reward_change_max": 0.00042301416397094727, + "reward_change_mean": -0.1255097622051835, + "reward_change_min": -0.19645695015788078, + "reward_change_std": 0.07207289850339293, + "reward_std": 0.48837146535515785, + "rewards/cosine_scaled_reward": -0.1311115063726902, + "rewards/format_reward": 0.916666679084301, + "step": 435 + }, + { + "advantage_max": 1.34403195977211, + "advantage_mean": -3.4769377377230626e-08, + "advantage_min": -1.3322007581591606, + "advantage_std": 0.9998322278261185, + "completion_length": 1186.458366394043, + "epoch": 0.4982857142857143, + "grad_norm": 1.2338637113571167, + "kl": 0.35684967041015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.0143, + "reward": 1.0391714964061975, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.0391714964061975, + "reward_after_std": 0.7189607694745064, + "reward_before_mean": 1.2088293116539717, + "reward_before_std": 0.7188755720853806, + "reward_change_max": 0.0, + "reward_change_mean": -0.169657819904387, + "reward_change_min": -0.25869670882821083, + "reward_change_std": 0.10130301676690578, + "reward_std": 0.7189607881009579, + "rewards/cosine_scaled_reward": 0.1460813172161579, + "rewards/format_reward": 0.9166666679084301, + "step": 436 + }, + { + "advantage_max": 1.476868376135826, + "advantage_mean": -2.6077032533322608e-08, + "advantage_min": -1.2403504475951195, + "advantage_std": 0.9998257905244827, + "completion_length": 1359.9583740234375, + "epoch": 0.49942857142857144, + "grad_norm": 1.9391827583312988, + "kl": 0.3791656494140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0152, + "reward": 0.6853760741651058, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6853760741651058, + "reward_after_std": 0.6799805872142315, + "reward_before_mean": 0.8209675564430654, + "reward_before_std": 0.6743724048137665, + "reward_change_max": 0.0, + "reward_change_mean": -0.1355915078893304, + "reward_change_min": -0.2195442169904709, + "reward_change_std": 0.0846588434651494, + "reward_std": 0.679980605840683, + "rewards/cosine_scaled_reward": -0.05826622620224953, + "rewards/format_reward": 0.9375000074505806, + "step": 437 + }, + { + "advantage_max": 1.4305167347192764, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -1.2116082832217216, + "advantage_std": 0.9997850134968758, + "completion_length": 1912.6667098999023, + "epoch": 0.5005714285714286, + "grad_norm": 1.7599682807922363, + "kl": 0.93927001953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0376, + "reward": 0.30255572497844696, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30255572497844696, + "reward_after_std": 0.5124889835715294, + "reward_before_mean": 0.4072955325245857, + "reward_before_std": 0.5075165517628193, + "reward_change_max": 0.0, + "reward_change_mean": -0.10473981127142906, + "reward_change_min": -0.17432630248367786, + "reward_change_std": 0.06469799065962434, + "reward_std": 0.5124890096485615, + "rewards/cosine_scaled_reward": -0.2130189104937017, + "rewards/format_reward": 0.8333333432674408, + "step": 438 + }, + { + "advantage_max": 1.3590333685278893, + "advantage_mean": -1.8626451603331873e-08, + "advantage_min": -1.317535161972046, + "advantage_std": 0.9997846111655235, + "completion_length": 1555.5625534057617, + "epoch": 0.5017142857142857, + "grad_norm": 2.5428316593170166, + "kl": 0.6335601806640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0253, + "reward": 0.33497130312025547, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.33497130312025547, + "reward_after_std": 0.5517369508743286, + "reward_before_mean": 0.44286380242556334, + "reward_before_std": 0.5564365647733212, + "reward_change_max": 0.0, + "reward_change_mean": -0.10789249558001757, + "reward_change_min": -0.18508377857506275, + "reward_change_std": 0.07193143153563142, + "reward_std": 0.5517369657754898, + "rewards/cosine_scaled_reward": -0.16398477833718061, + "rewards/format_reward": 0.7708333469927311, + "step": 439 + }, + { + "advantage_max": 1.687167003750801, + "advantage_mean": -3.1664968702660445e-08, + "advantage_min": -0.9845371693372726, + "advantage_std": 0.9997920244932175, + "completion_length": 1395.2708702087402, + "epoch": 0.5028571428571429, + "grad_norm": 1.3478094339370728, + "kl": 0.426025390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.017, + "reward": 0.4049868443980813, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4049868443980813, + "reward_after_std": 0.5812089741230011, + "reward_before_mean": 0.5134525969624519, + "reward_before_std": 0.5556198842823505, + "reward_change_max": 0.0, + "reward_change_mean": -0.1084657683968544, + "reward_change_min": -0.17195551469922066, + "reward_change_std": 0.06114753941074014, + "reward_std": 0.5812089964747429, + "rewards/cosine_scaled_reward": -0.19119038060307503, + "rewards/format_reward": 0.8958333507180214, + "step": 440 + }, + { + "advantage_max": 1.3728258907794952, + "advantage_mean": -3.166496831408239e-08, + "advantage_min": -1.2612786442041397, + "advantage_std": 0.9998279735445976, + "completion_length": 1365.6041946411133, + "epoch": 0.504, + "grad_norm": 1.260138750076294, + "kl": 0.362945556640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0145, + "reward": 0.8448034885077504, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8448034885077504, + "reward_after_std": 0.7627089060842991, + "reward_before_mean": 0.9935674387961626, + "reward_before_std": 0.7690016403794289, + "reward_change_max": 8.557736873626709e-05, + "reward_change_mean": -0.1487639732658863, + "reward_change_min": -0.24890049546957016, + "reward_change_std": 0.0970719731412828, + "reward_std": 0.7627089321613312, + "rewards/cosine_scaled_reward": 0.048867044039070606, + "rewards/format_reward": 0.8958333432674408, + "step": 441 + }, + { + "advantage_max": 1.6671375334262848, + "advantage_mean": -4.346172155500483e-08, + "advantage_min": -1.0221184343099594, + "advantage_std": 0.9998374804854393, + "completion_length": 1317.5000457763672, + "epoch": 0.5051428571428571, + "grad_norm": 1.947296142578125, + "kl": 0.659881591796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0264, + "reward": 0.846515204757452, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.846515204757452, + "reward_after_std": 0.7835588194429874, + "reward_before_mean": 0.9904348067939281, + "reward_before_std": 0.7543639913201332, + "reward_change_max": 9.492039680480957e-06, + "reward_change_mean": -0.1439195815473795, + "reward_change_min": -0.2259671613574028, + "reward_change_std": 0.0858937781304121, + "reward_std": 0.7835588529706001, + "rewards/cosine_scaled_reward": 0.07855071779340506, + "rewards/format_reward": 0.8333333432674408, + "step": 442 + }, + { + "advantage_max": 1.4112332686781883, + "advantage_mean": -4.967053890414519e-08, + "advantage_min": -1.2800021320581436, + "advantage_std": 0.9997948706150055, + "completion_length": 1655.5625839233398, + "epoch": 0.5062857142857143, + "grad_norm": 1.9380803108215332, + "kl": 0.643463134765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0257, + "reward": 0.4038530308753252, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4038530308753252, + "reward_after_std": 0.5651445239782333, + "reward_before_mean": 0.516314348205924, + "reward_before_std": 0.5649879388511181, + "reward_change_max": 0.0004429370164871216, + "reward_change_mean": -0.11246132105588913, + "reward_change_min": -0.18150242697447538, + "reward_change_std": 0.07103017391636968, + "reward_std": 0.5651445314288139, + "rewards/cosine_scaled_reward": -0.10642617009580135, + "rewards/format_reward": 0.7291666697710752, + "step": 443 + }, + { + "advantage_max": 1.4937669187784195, + "advantage_mean": -3.663202252646158e-08, + "advantage_min": -1.1797254905104637, + "advantage_std": 0.9998287782073021, + "completion_length": 1368.9791870117188, + "epoch": 0.5074285714285715, + "grad_norm": 1.5908373594284058, + "kl": 0.542572021484375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0217, + "reward": 0.5830896962434053, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5830896962434053, + "reward_after_std": 0.8080550767481327, + "reward_before_mean": 0.7053059078752995, + "reward_before_std": 0.810644131153822, + "reward_change_max": 0.0, + "reward_change_mean": -0.12221621721982956, + "reward_change_min": -0.22589535266160965, + "reward_change_std": 0.08656603936105967, + "reward_std": 0.8080550953745842, + "rewards/cosine_scaled_reward": -0.10568038653582335, + "rewards/format_reward": 0.9166666865348816, + "step": 444 + }, + { + "advantage_max": 1.6786562949419022, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -1.1170127242803574, + "advantage_std": 0.9997536465525627, + "completion_length": 1343.583366394043, + "epoch": 0.5085714285714286, + "grad_norm": 1.6728626489639282, + "kl": 0.6574554443359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0263, + "reward": 0.4528891518712044, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4528891518712044, + "reward_after_std": 0.5791403837502003, + "reward_before_mean": 0.5661626718938351, + "reward_before_std": 0.5518585778772831, + "reward_change_max": 0.0, + "reward_change_mean": -0.11327349953353405, + "reward_change_min": -0.16920770704746246, + "reward_change_std": 0.06325740413740277, + "reward_std": 0.5791404116898775, + "rewards/cosine_scaled_reward": -0.17525201058015227, + "rewards/format_reward": 0.916666679084301, + "step": 445 + }, + { + "advantage_max": 1.405263438820839, + "advantage_mean": -1.862645060413115e-08, + "advantage_min": -1.3909134268760681, + "advantage_std": 0.9997911751270294, + "completion_length": 1314.9167022705078, + "epoch": 0.5097142857142857, + "grad_norm": 2.1797311305999756, + "kl": 0.2393798828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0096, + "reward": 0.8286987226456404, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8286987226456404, + "reward_after_std": 0.6055082138627768, + "reward_before_mean": 0.9798233285546303, + "reward_before_std": 0.5961595419794321, + "reward_change_max": 0.0, + "reward_change_mean": -0.151124594733119, + "reward_change_min": -0.23226046934723854, + "reward_change_std": 0.09119729977101088, + "reward_std": 0.605508241802454, + "rewards/cosine_scaled_reward": 0.031578321009874344, + "rewards/format_reward": 0.916666679084301, + "step": 446 + }, + { + "advantage_max": 1.4877047389745712, + "advantage_mean": -2.1109979653211042e-08, + "advantage_min": -1.2679708823561668, + "advantage_std": 0.9997538030147552, + "completion_length": 1493.2292098999023, + "epoch": 0.5108571428571429, + "grad_norm": 1.5136258602142334, + "kl": 0.736480712890625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0295, + "reward": 0.4689008966088295, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4689008966088295, + "reward_after_std": 0.5583977196365595, + "reward_before_mean": 0.5876877517439425, + "reward_before_std": 0.5523215904831886, + "reward_change_max": 0.0, + "reward_change_mean": -0.11878686537966132, + "reward_change_min": -0.20156334061175585, + "reward_change_std": 0.07434120122343302, + "reward_std": 0.5583977401256561, + "rewards/cosine_scaled_reward": -0.10198946483433247, + "rewards/format_reward": 0.791666679084301, + "step": 447 + }, + { + "advantage_max": 1.3867352455854416, + "advantage_mean": -5.712112005618053e-08, + "advantage_min": -1.2888628989458084, + "advantage_std": 0.9998233914375305, + "completion_length": 1300.645866394043, + "epoch": 0.512, + "grad_norm": 2.749136447906494, + "kl": 0.6655731201171875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0266, + "reward": 0.5472166938707232, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5472166938707232, + "reward_after_std": 0.6558889821171761, + "reward_before_mean": 0.6711777672171593, + "reward_before_std": 0.6643517129123211, + "reward_change_max": 0.00017968565225601196, + "reward_change_mean": -0.1239611036144197, + "reward_change_min": -0.21109597571194172, + "reward_change_std": 0.08364540711045265, + "reward_std": 0.6558890230953693, + "rewards/cosine_scaled_reward": -0.060244444757699966, + "rewards/format_reward": 0.791666679084301, + "step": 448 + }, + { + "advantage_max": 1.5808791145682335, + "advantage_mean": -3.60111408470587e-08, + "advantage_min": -1.0195664539933205, + "advantage_std": 0.9997886493802071, + "completion_length": 1297.833381652832, + "epoch": 0.5131428571428571, + "grad_norm": 3.942514419555664, + "kl": 0.5755157470703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.023, + "reward": 0.3039223924279213, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3039223924279213, + "reward_after_std": 0.5791955068707466, + "reward_before_mean": 0.4055069088935852, + "reward_before_std": 0.5674984790384769, + "reward_change_max": 0.0, + "reward_change_mean": -0.10158453835174441, + "reward_change_min": -0.17059927806258202, + "reward_change_std": 0.06457435572519898, + "reward_std": 0.5791955254971981, + "rewards/cosine_scaled_reward": -0.2555798841640353, + "rewards/format_reward": 0.9166666716337204, + "step": 449 + }, + { + "advantage_max": 1.3982073590159416, + "advantage_mean": -8.381903204845997e-08, + "advantage_min": -1.2664097175002098, + "advantage_std": 0.9997754022479057, + "completion_length": 1329.541732788086, + "epoch": 0.5142857142857142, + "grad_norm": 1.6348826885223389, + "kl": 0.3032073974609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0121, + "reward": 0.43079722626134753, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.43079722626134753, + "reward_after_std": 0.5070368982851505, + "reward_before_mean": 0.548844444565475, + "reward_before_std": 0.5014333333820105, + "reward_change_max": 0.00021466612815856934, + "reward_change_mean": -0.11804721876978874, + "reward_change_min": -0.1936736386269331, + "reward_change_std": 0.07638330943882465, + "reward_std": 0.5070369057357311, + "rewards/cosine_scaled_reward": -0.15266112051904202, + "rewards/format_reward": 0.8541666828095913, + "step": 450 + }, + { + "advantage_max": 1.440886214375496, + "advantage_mean": -3.47693762670076e-08, + "advantage_min": -1.1266870200634003, + "advantage_std": 0.9997949972748756, + "completion_length": 1200.8542098999023, + "epoch": 0.5154285714285715, + "grad_norm": 1.7086747884750366, + "kl": 0.6575164794921875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0263, + "reward": 0.6295475661754608, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6295475661754608, + "reward_after_std": 0.7362482752650976, + "reward_before_mean": 0.7578316442668438, + "reward_before_std": 0.7389062829315662, + "reward_change_max": 0.00040249526500701904, + "reward_change_mean": -0.12828407809138298, + "reward_change_min": -0.24284372478723526, + "reward_change_std": 0.08936772076413035, + "reward_std": 0.7362482752650976, + "rewards/cosine_scaled_reward": -0.06900085625238717, + "rewards/format_reward": 0.8958333395421505, + "step": 451 + }, + { + "advantage_max": 1.4972828030586243, + "advantage_mean": -4.221995908437748e-08, + "advantage_min": -1.2125985845923424, + "advantage_std": 0.9998395889997482, + "completion_length": 1394.4375610351562, + "epoch": 0.5165714285714286, + "grad_norm": 1.2934503555297852, + "kl": 0.5374298095703125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0215, + "reward": 0.8485817462205887, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8485817462205887, + "reward_after_std": 0.748363334685564, + "reward_before_mean": 0.9961145296692848, + "reward_before_std": 0.7355900332331657, + "reward_change_max": 0.0, + "reward_change_mean": -0.14753276854753494, + "reward_change_min": -0.23141445498913527, + "reward_change_std": 0.09052707394585013, + "reward_std": 0.7483633458614349, + "rewards/cosine_scaled_reward": 0.06055724306497723, + "rewards/format_reward": 0.8750000074505806, + "step": 452 + }, + { + "advantage_max": 1.4784182906150818, + "advantage_mean": -2.3593506592867186e-08, + "advantage_min": -1.2552871480584145, + "advantage_std": 0.999831311404705, + "completion_length": 1348.562557220459, + "epoch": 0.5177142857142857, + "grad_norm": 1.3329824209213257, + "kl": 0.463165283203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0185, + "reward": 0.6962817385792732, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6962817385792732, + "reward_after_std": 0.6942555904388428, + "reward_before_mean": 0.8324069250375032, + "reward_before_std": 0.6887706816196442, + "reward_change_max": 0.00016242265701293945, + "reward_change_mean": -0.1361251873895526, + "reward_change_min": -0.22889462485909462, + "reward_change_std": 0.08675140561535954, + "reward_std": 0.6942556016147137, + "rewards/cosine_scaled_reward": 0.0203701239079237, + "rewards/format_reward": 0.7916666772216558, + "step": 453 + }, + { + "advantage_max": 1.4930087327957153, + "advantage_mean": -3.4458937342440876e-08, + "advantage_min": -1.2544832825660706, + "advantage_std": 0.9997552409768105, + "completion_length": 1382.6875228881836, + "epoch": 0.5188571428571429, + "grad_norm": 1.1521000862121582, + "kl": 0.7465667724609375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0298, + "reward": 0.500964343547821, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.500964343547821, + "reward_after_std": 0.6572593171149492, + "reward_before_mean": 0.6187019534409046, + "reward_before_std": 0.6519017405807972, + "reward_change_max": 9.223073720932007e-05, + "reward_change_mean": -0.1177376201376319, + "reward_change_min": -0.19812804460525513, + "reward_change_std": 0.07715894654393196, + "reward_std": 0.6572593450546265, + "rewards/cosine_scaled_reward": -0.09689902793616056, + "rewards/format_reward": 0.8125000149011612, + "step": 454 + }, + { + "advantage_max": 1.551740899682045, + "advantage_mean": -1.8626451714354175e-08, + "advantage_min": -1.0669294819235802, + "advantage_std": 0.9997464343905449, + "completion_length": 1280.770866394043, + "epoch": 0.52, + "grad_norm": 1.89170503616333, + "kl": 0.398681640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.220245676671809e-07, + "loss": 0.016, + "reward": 0.3647182397544384, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3647182397544384, + "reward_after_std": 0.4574992284178734, + "reward_before_mean": 0.4750360809266567, + "reward_before_std": 0.4399758540093899, + "reward_change_max": 0.0, + "reward_change_mean": -0.1103178346529603, + "reward_change_min": -0.173393864184618, + "reward_change_std": 0.06261738482862711, + "reward_std": 0.4574992470443249, + "rewards/cosine_scaled_reward": -0.22081530094146729, + "rewards/format_reward": 0.9166666716337204, + "step": 455 + }, + { + "advantage_max": 1.487000197172165, + "advantage_mean": -4.563480771047068e-08, + "advantage_min": -1.0528950244188309, + "advantage_std": 0.9998614117503166, + "completion_length": 1337.0417137145996, + "epoch": 0.5211428571428571, + "grad_norm": 6.231215953826904, + "kl": 0.7177734375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0287, + "reward": 0.5764753445982933, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5764753445982933, + "reward_after_std": 0.8477189987897873, + "reward_before_mean": 0.6952435150742531, + "reward_before_std": 0.8480991721153259, + "reward_change_max": 0.00027041882276535034, + "reward_change_mean": -0.11876817606389523, + "reward_change_min": -0.22616205736994743, + "reward_change_std": 0.08259849390015006, + "reward_std": 0.8477190397679806, + "rewards/cosine_scaled_reward": -0.08987825782969594, + "rewards/format_reward": 0.8750000223517418, + "step": 456 + }, + { + "advantage_max": 1.4224179536104202, + "advantage_mean": -3.725289521305797e-09, + "advantage_min": -1.3244177401065826, + "advantage_std": 0.9997441843152046, + "completion_length": 1460.5000381469727, + "epoch": 0.5222857142857142, + "grad_norm": 2.2744693756103516, + "kl": 0.912109375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0365, + "reward": 0.576944915112108, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.576944915112108, + "reward_after_std": 0.5542113147675991, + "reward_before_mean": 0.7066980539821088, + "reward_before_std": 0.5536979511380196, + "reward_change_max": 0.0001699402928352356, + "reward_change_mean": -0.12975311558693647, + "reward_change_min": -0.2050950825214386, + "reward_change_std": 0.0803024135529995, + "reward_std": 0.5542113371193409, + "rewards/cosine_scaled_reward": -0.07373432070016861, + "rewards/format_reward": 0.8541666865348816, + "step": 457 + }, + { + "advantage_max": 1.5208216905593872, + "advantage_mean": -3.601114051399179e-08, + "advantage_min": -1.1582210585474968, + "advantage_std": 0.9998268038034439, + "completion_length": 1149.7708587646484, + "epoch": 0.5234285714285715, + "grad_norm": 1.9077669382095337, + "kl": 0.43023681640625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0172, + "reward": 0.7393535878509283, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7393535878509283, + "reward_after_std": 0.6870045587420464, + "reward_before_mean": 0.8786043375730515, + "reward_before_std": 0.6741609685122967, + "reward_change_max": 0.0, + "reward_change_mean": -0.13925075996667147, + "reward_change_min": -0.22290120273828506, + "reward_change_std": 0.08484920859336853, + "reward_std": 0.6870045736432076, + "rewards/cosine_scaled_reward": -0.039864509366452694, + "rewards/format_reward": 0.9583333432674408, + "step": 458 + }, + { + "advantage_max": 1.2947088852524757, + "advantage_mean": -1.8626451825376478e-08, + "advantage_min": -1.2270531356334686, + "advantage_std": 0.999851182103157, + "completion_length": 1194.8958740234375, + "epoch": 0.5245714285714286, + "grad_norm": 2.4065475463867188, + "kl": 0.529296875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0212, + "reward": 0.8759551551192999, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8759551551192999, + "reward_after_std": 0.869237381964922, + "reward_before_mean": 1.0280643533915281, + "reward_before_std": 0.8942184932529926, + "reward_change_max": 0.0, + "reward_change_mean": -0.15210918057709932, + "reward_change_min": -0.2711542509496212, + "reward_change_std": 0.10925045888870955, + "reward_std": 0.8692374229431152, + "rewards/cosine_scaled_reward": 0.08694883063435555, + "rewards/format_reward": 0.854166679084301, + "step": 459 + }, + { + "advantage_max": 1.47978987544775, + "advantage_mean": 1.2417635808503746e-09, + "advantage_min": -1.13826222717762, + "advantage_std": 0.9998352378606796, + "completion_length": 1906.5417175292969, + "epoch": 0.5257142857142857, + "grad_norm": 2.8608667850494385, + "kl": 1.1624984741210938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0465, + "reward": 0.4057202450931072, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4057202450931072, + "reward_after_std": 0.7728376425802708, + "reward_before_mean": 0.511981688439846, + "reward_before_std": 0.7764072492718697, + "reward_change_max": 0.0001369267702102661, + "reward_change_mean": -0.10626146895810962, + "reward_change_min": -0.19404671341180801, + "reward_change_std": 0.07556618331000209, + "reward_std": 0.772837657481432, + "rewards/cosine_scaled_reward": -0.12942582089453936, + "rewards/format_reward": 0.7708333469927311, + "step": 460 + }, + { + "advantage_max": 1.5734613537788391, + "advantage_mean": -2.7318796114172983e-08, + "advantage_min": -1.161512367427349, + "advantage_std": 0.9997789859771729, + "completion_length": 1528.0209045410156, + "epoch": 0.5268571428571428, + "grad_norm": 1.8937029838562012, + "kl": 0.5341567993164062, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0214, + "reward": 0.5294137634336948, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5294137634336948, + "reward_after_std": 0.6099908128380775, + "reward_before_mean": 0.6501117488369346, + "reward_before_std": 0.592260580509901, + "reward_change_max": 0.0, + "reward_change_mean": -0.1206979900598526, + "reward_change_min": -0.19131910055875778, + "reward_change_std": 0.07167271664366126, + "reward_std": 0.6099908147007227, + "rewards/cosine_scaled_reward": -0.11244414187967777, + "rewards/format_reward": 0.8750000149011612, + "step": 461 + }, + { + "advantage_max": 1.5587438941001892, + "advantage_mean": -5.587935503204022e-09, + "advantage_min": -1.1386344656348228, + "advantage_std": 0.999779962003231, + "completion_length": 1313.5000228881836, + "epoch": 0.528, + "grad_norm": 2.6109707355499268, + "kl": 0.7901763916015625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0317, + "reward": 0.2571336994878948, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2571336994878948, + "reward_after_std": 0.6080882269889116, + "reward_before_mean": 0.35165482480078936, + "reward_before_std": 0.5977811366319656, + "reward_change_max": 0.00030838698148727417, + "reward_change_mean": -0.094521121121943, + "reward_change_min": -0.15098145883530378, + "reward_change_std": 0.059791785664856434, + "reward_std": 0.6080882381647825, + "rewards/cosine_scaled_reward": -0.2304226029664278, + "rewards/format_reward": 0.8125000149011612, + "step": 462 + }, + { + "advantage_max": 1.3307348042726517, + "advantage_mean": -2.3903946239078877e-08, + "advantage_min": -1.3292298913002014, + "advantage_std": 0.9998722821474075, + "completion_length": 1411.1458740234375, + "epoch": 0.5291428571428571, + "grad_norm": 1.875565528869629, + "kl": 0.419036865234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0168, + "reward": 0.7753808298148215, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7753808298148215, + "reward_after_std": 0.9464625902473927, + "reward_before_mean": 0.9129773788154125, + "reward_before_std": 0.9668588750064373, + "reward_change_max": 0.000978812575340271, + "reward_change_mean": -0.13759654574096203, + "reward_change_min": -0.27825625985860825, + "reward_change_std": 0.10437601897865534, + "reward_std": 0.9464626125991344, + "rewards/cosine_scaled_reward": 0.02940535603556782, + "rewards/format_reward": 0.8541666865348816, + "step": 463 + }, + { + "advantage_max": 1.618887484073639, + "advantage_mean": -2.5766591477127676e-07, + "advantage_min": -1.086423322558403, + "advantage_std": 0.9997170269489288, + "completion_length": 958.895866394043, + "epoch": 0.5302857142857142, + "grad_norm": 1.9415994882583618, + "kl": 0.5035934448242188, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0202, + "reward": 1.0455838665366173, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.0455838665366173, + "reward_after_std": 0.3843573573976755, + "reward_before_mean": 1.2220981623977423, + "reward_before_std": 0.3369241552427411, + "reward_change_max": 0.0, + "reward_change_mean": -0.1765143796801567, + "reward_change_min": -0.24797379225492477, + "reward_change_std": 0.09332408988848329, + "reward_std": 0.38435736298561096, + "rewards/cosine_scaled_reward": 0.12146575003862381, + "rewards/format_reward": 0.9791666716337204, + "step": 464 + }, + { + "advantage_max": 1.4793611317873, + "advantage_mean": -3.8494666565469515e-08, + "advantage_min": -1.103214792907238, + "advantage_std": 0.9998826235532761, + "completion_length": 1525.7083587646484, + "epoch": 0.5314285714285715, + "grad_norm": 1.4860286712646484, + "kl": 0.501861572265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.02, + "reward": 0.8048709314316511, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8048709314316511, + "reward_after_std": 0.9882866255939007, + "reward_before_mean": 0.9428267925977707, + "reward_before_std": 0.9986936561763287, + "reward_change_max": 0.0, + "reward_change_mean": -0.1379559077322483, + "reward_change_min": -0.25486108660697937, + "reward_change_std": 0.09958967566490173, + "reward_std": 0.9882866889238358, + "rewards/cosine_scaled_reward": 0.013080062344670296, + "rewards/format_reward": 0.916666679084301, + "step": 465 + }, + { + "advantage_max": 1.4345777779817581, + "advantage_mean": -7.885197977897107e-08, + "advantage_min": -1.2190702483057976, + "advantage_std": 0.9998356327414513, + "completion_length": 1442.8541870117188, + "epoch": 0.5325714285714286, + "grad_norm": 1.3499559164047241, + "kl": 0.7032089233398438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0282, + "reward": 0.836047200486064, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.836047200486064, + "reward_after_std": 0.7958601415157318, + "reward_before_mean": 0.9831163268536329, + "reward_before_std": 0.7962427716702223, + "reward_change_max": 0.0007705315947532654, + "reward_change_mean": -0.14706913474947214, + "reward_change_min": -0.2674461305141449, + "reward_change_std": 0.10129892360419035, + "reward_std": 0.795860156416893, + "rewards/cosine_scaled_reward": 0.0853081488457974, + "rewards/format_reward": 0.812500013038516, + "step": 466 + }, + { + "advantage_max": 1.5279236733913422, + "advantage_mean": -4.967053546245381e-09, + "advantage_min": -1.1415907591581345, + "advantage_std": 0.9998429045081139, + "completion_length": 1567.8750457763672, + "epoch": 0.5337142857142857, + "grad_norm": 3.2998692989349365, + "kl": 0.63177490234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0253, + "reward": 0.2946251416578889, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2946251416578889, + "reward_after_std": 0.7547971494495869, + "reward_before_mean": 0.3902467442676425, + "reward_before_std": 0.7570151165127754, + "reward_change_max": 0.00019928067922592163, + "reward_change_mean": -0.09562160400673747, + "reward_change_min": -0.18911895900964737, + "reward_change_std": 0.07319759530946612, + "reward_std": 0.7547971531748772, + "rewards/cosine_scaled_reward": -0.19029329670593143, + "rewards/format_reward": 0.7708333432674408, + "step": 467 + }, + { + "advantage_max": 1.6056719273328781, + "advantage_mean": -1.862645193639878e-08, + "advantage_min": -1.0361272692680359, + "advantage_std": 0.9998692721128464, + "completion_length": 1681.145866394043, + "epoch": 0.5348571428571428, + "grad_norm": 1.96733820438385, + "kl": 1.0497665405273438, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.042, + "reward": 0.25632472475990653, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.25632472475990653, + "reward_after_std": 0.8274875283241272, + "reward_before_mean": 0.34465243108570576, + "reward_before_std": 0.828721173107624, + "reward_change_max": 0.0021148771047592163, + "reward_change_mean": -0.08832769468426704, + "reward_change_min": -0.1855623424053192, + "reward_change_std": 0.06841085152700543, + "reward_std": 0.8274875581264496, + "rewards/cosine_scaled_reward": -0.14017378957942128, + "rewards/format_reward": 0.6250000037252903, + "step": 468 + }, + { + "advantage_max": 1.6077563017606735, + "advantage_mean": -1.5522043039783995e-08, + "advantage_min": -1.0340360701084137, + "advantage_std": 0.9998153671622276, + "completion_length": 1132.208351135254, + "epoch": 0.536, + "grad_norm": 2.0020787715911865, + "kl": 0.48426055908203125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0194, + "reward": 0.5653686504811049, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5653686504811049, + "reward_after_std": 0.7147700805217028, + "reward_before_mean": 0.6868355348706245, + "reward_before_std": 0.6977136358618736, + "reward_change_max": 0.0, + "reward_change_mean": -0.12146689835935831, + "reward_change_min": -0.20983821526169777, + "reward_change_std": 0.07952037081122398, + "reward_std": 0.7147701065987349, + "rewards/cosine_scaled_reward": -0.1044988944195211, + "rewards/format_reward": 0.8958333507180214, + "step": 469 + }, + { + "advantage_max": 1.4791221618652344, + "advantage_mean": -1.9247334059890875e-08, + "advantage_min": -1.3737546727061272, + "advantage_std": 0.9998335763812065, + "completion_length": 1760.1250381469727, + "epoch": 0.5371428571428571, + "grad_norm": 2.24651837348938, + "kl": 1.114990234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0447, + "reward": 0.2855025250464678, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2855025250464678, + "reward_after_std": 0.675163846462965, + "reward_before_mean": 0.38193464977666736, + "reward_before_std": 0.6749286688864231, + "reward_change_max": 0.0002874135971069336, + "reward_change_mean": -0.09643213078379631, + "reward_change_min": -0.16725542396306992, + "reward_change_std": 0.06746510276570916, + "reward_std": 0.6751638650894165, + "rewards/cosine_scaled_reward": -0.15278268977999687, + "rewards/format_reward": 0.6875000186264515, + "step": 470 + }, + { + "advantage_max": 1.1827488467097282, + "advantage_mean": -2.545615063187512e-08, + "advantage_min": -1.522478125989437, + "advantage_std": 0.9998515471816063, + "completion_length": 1635.645896911621, + "epoch": 0.5382857142857143, + "grad_norm": 2.085747718811035, + "kl": 0.7410888671875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0297, + "reward": 0.8258672105148435, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8258672105148435, + "reward_after_std": 0.7812116891145706, + "reward_before_mean": 0.9759960640221834, + "reward_before_std": 0.806895449757576, + "reward_change_max": 0.0, + "reward_change_mean": -0.15012879762798548, + "reward_change_min": -0.25330642610788345, + "reward_change_std": 0.10425947420299053, + "reward_std": 0.7812117375433445, + "rewards/cosine_scaled_reward": 0.09216467384248972, + "rewards/format_reward": 0.7916666865348816, + "step": 471 + }, + { + "advantage_max": 1.6919645369052887, + "advantage_mean": -2.6077032311278003e-08, + "advantage_min": -1.075069934129715, + "advantage_std": 0.9997465685009956, + "completion_length": 1520.1667098999023, + "epoch": 0.5394285714285715, + "grad_norm": 1.32899010181427, + "kl": 0.627593994140625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0251, + "reward": 0.35412935609929264, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.35412935609929264, + "reward_after_std": 0.6487985327839851, + "reward_before_mean": 0.45631421357393265, + "reward_before_std": 0.6299792267382145, + "reward_change_max": 0.0, + "reward_change_mean": -0.10218486096709967, + "reward_change_min": -0.17434173543006182, + "reward_change_std": 0.062425535172224045, + "reward_std": 0.6487985476851463, + "rewards/cosine_scaled_reward": -0.209342903457582, + "rewards/format_reward": 0.8750000074505806, + "step": 472 + }, + { + "advantage_max": 1.3841595649719238, + "advantage_mean": -4.23751782552273e-08, + "advantage_min": -1.1973591893911362, + "advantage_std": 0.9998006895184517, + "completion_length": 1474.208381652832, + "epoch": 0.5405714285714286, + "grad_norm": 3.4161858558654785, + "kl": 0.3667144775390625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0147, + "reward": 0.33687769807875156, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.33687769807875156, + "reward_after_std": 0.5530556403100491, + "reward_before_mean": 0.44355832412838936, + "reward_before_std": 0.5523912459611893, + "reward_change_max": 0.0, + "reward_change_mean": -0.10668064840137959, + "reward_change_min": -0.1907523050904274, + "reward_change_std": 0.06848618015646935, + "reward_std": 0.5530556440353394, + "rewards/cosine_scaled_reward": -0.19488751143217087, + "rewards/format_reward": 0.8333333469927311, + "step": 473 + }, + { + "advantage_max": 1.3778210431337357, + "advantage_mean": -6.705522881400583e-08, + "advantage_min": -1.3139918148517609, + "advantage_std": 0.9998078420758247, + "completion_length": 1363.9375305175781, + "epoch": 0.5417142857142857, + "grad_norm": 3.2250938415527344, + "kl": 0.5262603759765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0211, + "reward": 1.2284482046961784, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 1.2284482046961784, + "reward_after_std": 0.7306374609470367, + "reward_before_mean": 1.415264431387186, + "reward_before_std": 0.7239528931677341, + "reward_change_max": 0.0, + "reward_change_mean": -0.18681625835597515, + "reward_change_min": -0.2915050946176052, + "reward_change_std": 0.11583147803321481, + "reward_std": 0.7306374758481979, + "rewards/cosine_scaled_reward": 0.2805488705635071, + "rewards/format_reward": 0.8541666939854622, + "step": 474 + }, + { + "advantage_max": 1.5258950591087341, + "advantage_mean": -2.4524827946237338e-08, + "advantage_min": -1.0890448316931725, + "advantage_std": 0.9997752085328102, + "completion_length": 1477.833381652832, + "epoch": 0.5428571428571428, + "grad_norm": 1.7994341850280762, + "kl": 0.28729248046875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0115, + "reward": 0.5002380846999586, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5002380846999586, + "reward_after_std": 0.5856530722230673, + "reward_before_mean": 0.6214745305478573, + "reward_before_std": 0.5763567853718996, + "reward_change_max": 0.0, + "reward_change_mean": -0.12123645003885031, + "reward_change_min": -0.20592597592622042, + "reward_change_std": 0.07472612150013447, + "reward_std": 0.5856531001627445, + "rewards/cosine_scaled_reward": -0.16842940403148532, + "rewards/format_reward": 0.9583333432674408, + "step": 475 + }, + { + "advantage_max": 1.3964376598596573, + "advantage_mean": -2.2351742123838392e-08, + "advantage_min": -1.254280962049961, + "advantage_std": 0.9997788667678833, + "completion_length": 1635.4375610351562, + "epoch": 0.544, + "grad_norm": 2.187488555908203, + "kl": 0.5134124755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0205, + "reward": 0.6432311162352562, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6432311162352562, + "reward_after_std": 0.6332471240311861, + "reward_before_mean": 0.7760857921093702, + "reward_before_std": 0.6299569476395845, + "reward_change_max": 0.00022538751363754272, + "reward_change_mean": -0.13285462884232402, + "reward_change_min": -0.21562502346932888, + "reward_change_std": 0.0833498639985919, + "reward_std": 0.6332471258938313, + "rewards/cosine_scaled_reward": -0.07029045931994915, + "rewards/format_reward": 0.9166666716337204, + "step": 476 + }, + { + "advantage_max": 1.7337168902158737, + "advantage_mean": 6.2088170160734535e-09, + "advantage_min": -1.0874443799257278, + "advantage_std": 0.999891571700573, + "completion_length": 1237.0625305175781, + "epoch": 0.5451428571428572, + "grad_norm": 1.326654314994812, + "kl": 0.44366455078125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0178, + "reward": 0.8383656330406666, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8383656330406666, + "reward_after_std": 0.9872477427124977, + "reward_before_mean": 0.9752098955214024, + "reward_before_std": 0.9661254063248634, + "reward_change_max": 0.0, + "reward_change_mean": -0.13684424851089716, + "reward_change_min": -0.2305159643292427, + "reward_change_std": 0.08920324314385653, + "reward_std": 0.9872477427124977, + "rewards/cosine_scaled_reward": 0.029271604435052723, + "rewards/format_reward": 0.916666679084301, + "step": 477 + }, + { + "advantage_max": 1.573157086968422, + "advantage_mean": -3.0423204788743163e-08, + "advantage_min": -1.1611996442079544, + "advantage_std": 0.9998464584350586, + "completion_length": 1965.833366394043, + "epoch": 0.5462857142857143, + "grad_norm": 1.792330265045166, + "kl": 0.904815673828125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0362, + "reward": 0.48183274059556425, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.48183274059556425, + "reward_after_std": 0.8001515120267868, + "reward_before_mean": 0.5936882272362709, + "reward_before_std": 0.7990560345351696, + "reward_change_max": 0.0, + "reward_change_mean": -0.11185550084337592, + "reward_change_min": -0.20309627056121826, + "reward_change_std": 0.07982863765209913, + "reward_std": 0.800151526927948, + "rewards/cosine_scaled_reward": -0.06773922825232148, + "rewards/format_reward": 0.7291666846722364, + "step": 478 + }, + { + "advantage_max": 1.4396943747997284, + "advantage_mean": -9.623667751590403e-09, + "advantage_min": -1.3289758563041687, + "advantage_std": 0.9998451471328735, + "completion_length": 1589.4792098999023, + "epoch": 0.5474285714285714, + "grad_norm": 1.5058865547180176, + "kl": 0.708740234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0284, + "reward": 0.4269852042198181, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4269852042198181, + "reward_after_std": 0.8025585748255253, + "reward_before_mean": 0.5355709902942181, + "reward_before_std": 0.8142498098313808, + "reward_change_max": 0.0011701732873916626, + "reward_change_mean": -0.10858577489852905, + "reward_change_min": -0.19784590601921082, + "reward_change_std": 0.08253068244084716, + "reward_std": 0.8025586009025574, + "rewards/cosine_scaled_reward": -0.12804784905165434, + "rewards/format_reward": 0.7916666902601719, + "step": 479 + }, + { + "advantage_max": 1.5044909566640854, + "advantage_mean": -2.2972623581196672e-08, + "advantage_min": -1.1191659942269325, + "advantage_std": 0.9997437074780464, + "completion_length": 1412.8333587646484, + "epoch": 0.5485714285714286, + "grad_norm": 1.9218823909759521, + "kl": 0.5767822265625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.023, + "reward": 0.40811170265078545, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.40811170265078545, + "reward_after_std": 0.5391519796103239, + "reward_before_mean": 0.5209373664110899, + "reward_before_std": 0.5283264443278313, + "reward_change_max": 0.0, + "reward_change_mean": -0.11282562743872404, + "reward_change_min": -0.19051403924822807, + "reward_change_std": 0.07031045900657773, + "reward_std": 0.5391519945114851, + "rewards/cosine_scaled_reward": -0.17703134287148714, + "rewards/format_reward": 0.8750000074505806, + "step": 480 + }, + { + "advantage_max": 1.4230820909142494, + "advantage_mean": -2.297262435835279e-08, + "advantage_min": -1.2027820497751236, + "advantage_std": 0.9998055920004845, + "completion_length": 1643.5833892822266, + "epoch": 0.5497142857142857, + "grad_norm": 1.3952871561050415, + "kl": 0.5593643188476562, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0224, + "reward": 0.23358649760484695, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.23358649760484695, + "reward_after_std": 0.6157410964369774, + "reward_before_mean": 0.3300169431604445, + "reward_before_std": 0.6253011748194695, + "reward_change_max": 0.0006894916296005249, + "reward_change_mean": -0.09643045393750072, + "reward_change_min": -0.1772970948368311, + "reward_change_std": 0.07094644149765372, + "reward_std": 0.6157411076128483, + "rewards/cosine_scaled_reward": -0.22040820494294167, + "rewards/format_reward": 0.7708333544433117, + "step": 481 + }, + { + "advantage_max": 1.3363105058670044, + "advantage_mean": -3.3527614351491764e-08, + "advantage_min": -1.3465068489313126, + "advantage_std": 0.9998397678136826, + "completion_length": 1521.4792251586914, + "epoch": 0.5508571428571428, + "grad_norm": 1.705883502960205, + "kl": 0.665130615234375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0266, + "reward": 0.8281307835131884, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8281307835131884, + "reward_after_std": 0.7850172445178032, + "reward_before_mean": 0.9757171748206019, + "reward_before_std": 0.7954017668962479, + "reward_change_max": 0.0005815252661705017, + "reward_change_mean": -0.14758640620857477, + "reward_change_min": -0.26257020607590675, + "reward_change_std": 0.09904332272708416, + "reward_std": 0.785017266869545, + "rewards/cosine_scaled_reward": 0.019108579959720373, + "rewards/format_reward": 0.9375000149011612, + "step": 482 + }, + { + "advantage_max": 1.423334315419197, + "advantage_mean": -8.69234451084111e-09, + "advantage_min": -1.1823545172810555, + "advantage_std": 0.9998378828167915, + "completion_length": 1683.3125305175781, + "epoch": 0.552, + "grad_norm": 2.6206977367401123, + "kl": 0.6625442504882812, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0265, + "reward": 0.7536204941570759, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7536204941570759, + "reward_after_std": 0.8240559808909893, + "reward_before_mean": 0.8923968635499477, + "reward_before_std": 0.8315610997378826, + "reward_change_max": 0.0, + "reward_change_mean": -0.13877638336271048, + "reward_change_min": -0.24121517688035965, + "reward_change_std": 0.09393159532919526, + "reward_std": 0.8240560032427311, + "rewards/cosine_scaled_reward": 0.03994842991232872, + "rewards/format_reward": 0.812500013038516, + "step": 483 + }, + { + "advantage_max": 1.4809504449367523, + "advantage_mean": -4.7187011964489045e-08, + "advantage_min": -1.1418846175074577, + "advantage_std": 0.9997940585017204, + "completion_length": 1221.270851135254, + "epoch": 0.5531428571428572, + "grad_norm": 1.6962131261825562, + "kl": 0.384124755859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0154, + "reward": 0.7981872851960361, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7981872851960361, + "reward_after_std": 0.8525705002248287, + "reward_before_mean": 0.9407486170530319, + "reward_before_std": 0.8583914032205939, + "reward_change_max": 0.00044048577547073364, + "reward_change_mean": -0.14256139378994703, + "reward_change_min": -0.25064817070961, + "reward_change_std": 0.09835958620533347, + "reward_std": 0.8525705300271511, + "rewards/cosine_scaled_reward": 0.001624307595193386, + "rewards/format_reward": 0.9375000149011612, + "step": 484 + }, + { + "advantage_max": 1.3821987211704254, + "advantage_mean": -5.308538825188336e-08, + "advantage_min": -1.3728350549936295, + "advantage_std": 0.9997694715857506, + "completion_length": 1498.8125610351562, + "epoch": 0.5542857142857143, + "grad_norm": 1.9051717519760132, + "kl": 0.82757568359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0331, + "reward": 0.486922824755311, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.486922824755311, + "reward_after_std": 0.6849659271538258, + "reward_before_mean": 0.6047742627561092, + "reward_before_std": 0.6905794739723206, + "reward_change_max": 0.0, + "reward_change_mean": -0.11785144358873367, + "reward_change_min": -0.20198489725589752, + "reward_change_std": 0.08204239793121815, + "reward_std": 0.6849659346044064, + "rewards/cosine_scaled_reward": -0.12469621049240232, + "rewards/format_reward": 0.854166679084301, + "step": 485 + }, + { + "advantage_max": 1.7733388990163803, + "advantage_mean": -5.0136199525319114e-08, + "advantage_min": -0.9555219374597073, + "advantage_std": 0.99978306889534, + "completion_length": 884.2500305175781, + "epoch": 0.5554285714285714, + "grad_norm": 2.335850715637207, + "kl": 0.38094329833984375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0152, + "reward": 0.5196775365620852, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5196775365620852, + "reward_after_std": 0.5223635025322437, + "reward_before_mean": 0.6409784676507115, + "reward_before_std": 0.4872880391776562, + "reward_change_max": 0.0, + "reward_change_mean": -0.12130093993619084, + "reward_change_min": -0.1774542685598135, + "reward_change_std": 0.06820766627788544, + "reward_std": 0.5223635211586952, + "rewards/cosine_scaled_reward": -0.13784411549568176, + "rewards/format_reward": 0.916666679084301, + "step": 486 + }, + { + "advantage_max": 1.4368427097797394, + "advantage_mean": -4.796311425803168e-08, + "advantage_min": -1.1753825396299362, + "advantage_std": 0.9998473450541496, + "completion_length": 973.0625228881836, + "epoch": 0.5565714285714286, + "grad_norm": 1.179042100906372, + "kl": 0.1561431884765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0063, + "reward": 1.0191260538995266, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.0191260538995266, + "reward_after_std": 0.8055990971624851, + "reward_before_mean": 1.1827868521213531, + "reward_before_std": 0.802632138133049, + "reward_change_max": 0.0003419220447540283, + "reward_change_mean": -0.1636607814580202, + "reward_change_min": -0.28896985203027725, + "reward_change_std": 0.10505983280017972, + "reward_std": 0.8055991157889366, + "rewards/cosine_scaled_reward": 0.13306006882339716, + "rewards/format_reward": 0.9166666716337204, + "step": 487 + }, + { + "advantage_max": 1.3047830387949944, + "advantage_mean": -7.823109782201243e-08, + "advantage_min": -1.430648073554039, + "advantage_std": 0.9997356534004211, + "completion_length": 1126.270839691162, + "epoch": 0.5577142857142857, + "grad_norm": 1.4515074491500854, + "kl": 0.390380859375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0156, + "reward": 0.49460936337709427, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.49460936337709427, + "reward_after_std": 0.45185090601444244, + "reward_before_mean": 0.6208971869200468, + "reward_before_std": 0.4511682763695717, + "reward_change_max": 0.0, + "reward_change_mean": -0.12628783658146858, + "reward_change_min": -0.19421625509858131, + "reward_change_std": 0.07442756928503513, + "reward_std": 0.4518509153276682, + "rewards/cosine_scaled_reward": -0.16871808469295502, + "rewards/format_reward": 0.9583333432674408, + "step": 488 + }, + { + "advantage_max": 1.4228082448244095, + "advantage_mean": -2.359350631531143e-08, + "advantage_min": -1.3723071962594986, + "advantage_std": 0.9997459053993225, + "completion_length": 1585.041706085205, + "epoch": 0.5588571428571428, + "grad_norm": 1.8846523761749268, + "kl": 0.680572509765625, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0272, + "reward": 0.3229170944541693, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3229170944541693, + "reward_after_std": 0.5093741305172443, + "reward_before_mean": 0.4295815769582987, + "reward_before_std": 0.5072203408926725, + "reward_change_max": 5.799531936645508e-05, + "reward_change_mean": -0.10666447039693594, + "reward_change_min": -0.17716624028980732, + "reward_change_std": 0.06782869761809707, + "reward_std": 0.5093741416931152, + "rewards/cosine_scaled_reward": -0.20187588641420007, + "rewards/format_reward": 0.8333333432674408, + "step": 489 + }, + { + "advantage_max": 1.6553240045905113, + "advantage_mean": -2.545615163107584e-08, + "advantage_min": -1.0602488964796066, + "advantage_std": 0.9997842386364937, + "completion_length": 1372.4375610351562, + "epoch": 0.56, + "grad_norm": 1.7037945985794067, + "kl": 0.612060546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0245, + "reward": 0.5970811229199171, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5970811229199171, + "reward_after_std": 0.6770407669246197, + "reward_before_mean": 0.7228398490697145, + "reward_before_std": 0.6587517447769642, + "reward_change_max": 0.0, + "reward_change_mean": -0.12575868796557188, + "reward_change_min": -0.19899218156933784, + "reward_change_std": 0.07600692426785827, + "reward_std": 0.6770407911390066, + "rewards/cosine_scaled_reward": -0.0969134415499866, + "rewards/format_reward": 0.9166666716337204, + "step": 490 + }, + { + "advantage_max": 1.5155636966228485, + "advantage_mean": -1.9247334392957782e-08, + "advantage_min": -1.1463100016117096, + "advantage_std": 0.9998896941542625, + "completion_length": 1695.854232788086, + "epoch": 0.5611428571428572, + "grad_norm": 1.6566426753997803, + "kl": 0.6379241943359375, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0255, + "reward": 0.7128791492432356, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7128791492432356, + "reward_after_std": 0.9784063994884491, + "reward_before_mean": 0.8413580115884542, + "reward_before_std": 0.9818304255604744, + "reward_change_max": 9.988248348236084e-05, + "reward_change_mean": -0.12847886700183153, + "reward_change_min": -0.25756747275590897, + "reward_change_std": 0.093171376734972, + "reward_std": 0.9784064367413521, + "rewards/cosine_scaled_reward": -0.03765433467924595, + "rewards/format_reward": 0.9166666865348816, + "step": 491 + }, + { + "advantage_max": 1.4262542724609375, + "advantage_mean": -2.7939677682553565e-08, + "advantage_min": -1.191407211124897, + "advantage_std": 0.9998179897665977, + "completion_length": 1575.583396911621, + "epoch": 0.5622857142857143, + "grad_norm": 3.479654550552368, + "kl": 0.9051971435546875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0362, + "reward": 0.5793958441354334, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5793958441354334, + "reward_after_std": 0.6907056048512459, + "reward_before_mean": 0.7047333084046841, + "reward_before_std": 0.6915153935551643, + "reward_change_max": 0.0, + "reward_change_mean": -0.1253374693915248, + "reward_change_min": -0.2124233189970255, + "reward_change_std": 0.08227851102128625, + "reward_std": 0.6907056123018265, + "rewards/cosine_scaled_reward": -0.07471668440848589, + "rewards/format_reward": 0.8541666753590107, + "step": 492 + }, + { + "advantage_max": 1.4624339193105698, + "advantage_mean": -2.235174201281609e-08, + "advantage_min": -1.2118503227829933, + "advantage_std": 0.9998716413974762, + "completion_length": 1361.3542175292969, + "epoch": 0.5634285714285714, + "grad_norm": 2.0603668689727783, + "kl": 0.7406387329101562, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0296, + "reward": 0.84825224801898, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.84825224801898, + "reward_after_std": 0.9701013043522835, + "reward_before_mean": 0.9899251163005829, + "reward_before_std": 0.9750980362296104, + "reward_change_max": 0.0, + "reward_change_mean": -0.14167285151779652, + "reward_change_min": -0.2806529551744461, + "reward_change_std": 0.10109834838658571, + "reward_std": 0.9701013043522835, + "rewards/cosine_scaled_reward": 0.057462539232801646, + "rewards/format_reward": 0.8750000149011612, + "step": 493 + }, + { + "advantage_max": 1.5129066854715347, + "advantage_mean": -3.849466811978175e-08, + "advantage_min": -1.1654788628220558, + "advantage_std": 0.9998171105980873, + "completion_length": 1063.9791946411133, + "epoch": 0.5645714285714286, + "grad_norm": 1.551969289779663, + "kl": 0.3380584716796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0135, + "reward": 0.7001366913318634, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.7001366913318634, + "reward_after_std": 0.7168645672500134, + "reward_before_mean": 0.8348819054663181, + "reward_before_std": 0.7078965455293655, + "reward_change_max": 0.0, + "reward_change_mean": -0.13474521692842245, + "reward_change_min": -0.23480108752846718, + "reward_change_std": 0.08507911022752523, + "reward_std": 0.7168645933270454, + "rewards/cosine_scaled_reward": -0.04089239612221718, + "rewards/format_reward": 0.916666679084301, + "step": 494 + }, + { + "advantage_max": 1.4147690832614899, + "advantage_mean": -7.823109826610164e-08, + "advantage_min": -1.255192093551159, + "advantage_std": 0.9998476803302765, + "completion_length": 1362.250015258789, + "epoch": 0.5657142857142857, + "grad_norm": 1.8394250869750977, + "kl": 0.43761444091796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0175, + "reward": 0.8606002209708095, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8606002209708095, + "reward_after_std": 0.8067436181008816, + "reward_before_mean": 1.0086772553622723, + "reward_before_std": 0.8072432391345501, + "reward_change_max": 0.0, + "reward_change_mean": -0.1480770716443658, + "reward_change_min": -0.2526344172656536, + "reward_change_std": 0.09575034817680717, + "reward_std": 0.8067436292767525, + "rewards/cosine_scaled_reward": 0.046005279291421175, + "rewards/format_reward": 0.916666679084301, + "step": 495 + }, + { + "advantage_max": 1.4694621339440346, + "advantage_mean": -6.27090555038734e-08, + "advantage_min": -1.134453445672989, + "advantage_std": 0.9998220652341843, + "completion_length": 1408.083366394043, + "epoch": 0.5668571428571428, + "grad_norm": 2.118278980255127, + "kl": 0.7026290893554688, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0281, + "reward": 0.821080063469708, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.821080063469708, + "reward_after_std": 0.70627411454916, + "reward_before_mean": 0.968472232343629, + "reward_before_std": 0.7007058300077915, + "reward_change_max": 0.0, + "reward_change_mean": -0.14739223755896091, + "reward_change_min": -0.24292783066630363, + "reward_change_std": 0.09062009677290916, + "reward_std": 0.7062741294503212, + "rewards/cosine_scaled_reward": 0.03631945559754968, + "rewards/format_reward": 0.8958333395421505, + "step": 496 + }, + { + "advantage_max": 1.4108513593673706, + "advantage_mean": -1.0927518634407107e-07, + "advantage_min": -1.1535490825772285, + "advantage_std": 0.9997596219182014, + "completion_length": 1099.5833625793457, + "epoch": 0.568, + "grad_norm": 1.8349206447601318, + "kl": 0.5226058959960938, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0209, + "reward": 0.819844264537096, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.819844264537096, + "reward_after_std": 0.4837149791419506, + "reward_before_mean": 0.9735658243298531, + "reward_before_std": 0.46303862147033215, + "reward_change_max": 0.0, + "reward_change_mean": -0.1537215718999505, + "reward_change_min": -0.2222919762134552, + "reward_change_std": 0.08454245794564486, + "reward_std": 0.4837149903178215, + "rewards/cosine_scaled_reward": -0.013217097148299217, + "rewards/format_reward": 1.0, + "step": 497 + }, + { + "advantage_max": 1.4655998349189758, + "advantage_mean": -3.10440866346795e-08, + "advantage_min": -1.3093776553869247, + "advantage_std": 0.9998064488172531, + "completion_length": 1443.1875305175781, + "epoch": 0.5691428571428572, + "grad_norm": 1.950182557106018, + "kl": 0.6890029907226562, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0276, + "reward": 0.593856418505311, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.593856418505311, + "reward_after_std": 0.7217038404196501, + "reward_before_mean": 0.719802012128639, + "reward_before_std": 0.7204098887741566, + "reward_change_max": 0.0, + "reward_change_mean": -0.125945626758039, + "reward_change_min": -0.20720598101615906, + "reward_change_std": 0.08468122826889157, + "reward_std": 0.721703888848424, + "rewards/cosine_scaled_reward": -0.06718233320862055, + "rewards/format_reward": 0.854166679084301, + "step": 498 + }, + { + "advantage_max": 1.6709688156843185, + "advantage_mean": -3.663202252646158e-08, + "advantage_min": -1.1494032144546509, + "advantage_std": 0.9997295960783958, + "completion_length": 1472.3958740234375, + "epoch": 0.5702857142857143, + "grad_norm": 1.0532234907150269, + "kl": 0.49327850341796875, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0197, + "reward": 0.5505574708804488, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5505574708804488, + "reward_after_std": 0.6247312221676111, + "reward_before_mean": 0.6727077215909958, + "reward_before_std": 0.6021678037941456, + "reward_change_max": 0.0, + "reward_change_mean": -0.12215026002377272, + "reward_change_min": -0.18972956016659737, + "reward_change_std": 0.0723155359737575, + "reward_std": 0.6247312305495143, + "rewards/cosine_scaled_reward": -0.14281281549483538, + "rewards/format_reward": 0.9583333432674408, + "step": 499 + }, + { + "advantage_max": 1.541480839252472, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -1.154192365705967, + "advantage_std": 0.9998290911316872, + "completion_length": 1295.6250610351562, + "epoch": 0.5714285714285714, + "grad_norm": 1.619492769241333, + "kl": 0.4624786376953125, + "lambda_div_used": 0.9000000000000001, + "learning_rate": 1e-07, + "loss": 0.0185, + "reward": 0.6967308446764946, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6967308446764946, + "reward_after_std": 0.736558023840189, + "reward_before_mean": 0.8310529440641403, + "reward_before_std": 0.7277755029499531, + "reward_change_max": 0.0, + "reward_change_mean": -0.13432206492871046, + "reward_change_min": -0.2356853261590004, + "reward_change_std": 0.0854858374223113, + "reward_std": 0.7365580387413502, + "rewards/cosine_scaled_reward": -0.053223551250994205, + "rewards/format_reward": 0.9375000074505806, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0055785658547727055, + "train_runtime": 58514.5956, + "train_samples_per_second": 0.41, + "train_steps_per_second": 0.009 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}