{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.3805946558713913, "advantage_mean": -8.0714617212152e-09, "advantage_min": -1.278314545750618, "advantage_std": 0.9998298436403275, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.19454674422740936, "kl": 0.0, "lambda_div_used": 0.9000000000000001, "learning_rate": 2e-08, "loss": 0.0, "reward": 0.383966077119112, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.383966077119112, "reward_after_std": 0.8095231093466282, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.000140361487865448, "reward_change_mean": -0.10579865705221891, "reward_change_min": -0.2073100507259369, "reward_change_std": 0.08411919022910297, "reward_std": 0.8095231391489506, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 1.2630222663283348, "advantage_mean": -2.6077034975813262e-08, "advantage_min": -1.2786083295941353, "advantage_std": 0.9997444376349449, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18217992782592773, "kl": 0.0, "lambda_div_used": 0.9000000000000001, "learning_rate": 4e-08, "loss": 0.0, "reward": 0.17750850692391396, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.17750850692391396, "reward_after_std": 0.42011942341923714, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0003265589475631714, "reward_change_mean": -0.09788906387984753, "reward_change_min": -0.1594111192971468, "reward_change_std": 0.06503142253495753, "reward_std": 0.42011944204568863, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 1.5949327051639557, "advantage_mean": 4.346172421954009e-09, "advantage_min": -0.991000697016716, "advantage_std": 0.9998101443052292, "completion_length": 3280.854217529297, "epoch": 0.0034285714285714284, "grad_norm": 0.1597713828086853, "kl": 3.88026237487793e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.27885486651211977, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.27885486651211977, "reward_after_std": 0.7552903220057487, "reward_before_mean": -0.23804896231740713, "reward_before_std": 0.7629885245114565, "reward_change_max": 0.0004154220223426819, "reward_change_mean": -0.04080591048114002, "reward_change_min": -0.09831635467708111, "reward_change_std": 0.04355394095182419, "reward_std": 0.7552903480827808, "rewards/cosine_scaled_reward": -0.24402447510510683, "rewards/format_reward": 0.2500000074505806, "step": 3 }, { "advantage_max": 1.553865224123001, "advantage_mean": 7.450580485901526e-09, "advantage_min": -1.1104755029082298, "advantage_std": 0.9998296648263931, "completion_length": 2339.729217529297, "epoch": 0.004571428571428572, "grad_norm": 0.24378903210163116, "kl": 2.9033049941062927e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.3669445291161537, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3669445291161537, "reward_after_std": 0.9222714062780142, "reward_before_mean": 0.4652156475931406, "reward_before_std": 0.932275427505374, "reward_change_max": 0.0, "reward_change_mean": -0.09827110765036196, "reward_change_min": -0.19664788339287043, "reward_change_std": 0.07947756757494062, "reward_std": 0.9222714211791754, "rewards/cosine_scaled_reward": -0.05905885813990608, "rewards/format_reward": 0.5833333414047956, "step": 4 }, { "advantage_max": 1.449422225356102, "advantage_mean": 4.967053546245381e-09, "advantage_min": -1.0410668477416039, "advantage_std": 0.9998126924037933, "completion_length": 3355.2291870117188, "epoch": 0.005714285714285714, "grad_norm": 0.18919876217842102, "kl": 4.6834349632263184e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.07166258245706558, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": -0.07166258245706558, "reward_after_std": 0.7799306400120258, "reward_before_mean": -0.007116513326764107, "reward_before_std": 0.8074344918131828, "reward_change_max": 0.000532977283000946, "reward_change_mean": -0.06454607425257564, "reward_change_min": -0.17053373903036118, "reward_change_std": 0.06985534727573395, "reward_std": 0.7799306474626064, "rewards/cosine_scaled_reward": -0.14939158782362938, "rewards/format_reward": 0.2916666716337204, "step": 5 }, { "advantage_max": 1.7041800767183304, "advantage_mean": 9.934107986220297e-08, "advantage_min": -1.0263897106051445, "advantage_std": 0.9997144341468811, "completion_length": 2931.4167098999023, "epoch": 0.006857142857142857, "grad_norm": 0.23432040214538574, "kl": 3.852322697639465e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.2719933092594147, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2719933092594147, "reward_after_std": 0.4997922573238611, "reward_before_mean": -0.2224076190032065, "reward_before_std": 0.4970291517674923, "reward_change_max": 0.0001704096794128418, "reward_change_mean": -0.04958567628636956, "reward_change_min": -0.09600676875561476, "reward_change_std": 0.03900796617381275, "reward_std": 0.4997922834008932, "rewards/cosine_scaled_reward": -0.25703714042901993, "rewards/format_reward": 0.29166666977107525, "step": 6 }, { "advantage_max": 1.5957663804292679, "advantage_mean": -1.241763414316921e-09, "advantage_min": -1.015457857400179, "advantage_std": 0.999874085187912, "completion_length": 3199.0000915527344, "epoch": 0.008, "grad_norm": 0.15826046466827393, "kl": 2.849102020263672e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.19284541113302112, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19284541113302112, "reward_after_std": 0.9743777364492416, "reward_before_mean": 0.27275329316034913, "reward_before_std": 0.984375350177288, "reward_change_max": 0.00023509562015533447, "reward_change_mean": -0.07990789820905775, "reward_change_min": -0.17123806476593018, "reward_change_std": 0.06935694860294461, "reward_std": 0.9743777737021446, "rewards/cosine_scaled_reward": -0.10320668725762516, "rewards/format_reward": 0.4791666828095913, "step": 7 }, { "advantage_max": 1.558206208050251, "advantage_mean": 7.202228147207279e-08, "advantage_min": -1.082590851932764, "advantage_std": 0.9997908994555473, "completion_length": 2662.1666946411133, "epoch": 0.009142857142857144, "grad_norm": 0.16711029410362244, "kl": 2.8438866138458252e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.41234924644231796, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41234924644231796, "reward_after_std": 0.6741374768316746, "reward_before_mean": 0.5237455815076828, "reward_before_std": 0.6730234175920486, "reward_change_max": 0.0003030449151992798, "reward_change_mean": -0.11139630584511906, "reward_change_min": -0.1782920677214861, "reward_change_std": 0.07700008910614997, "reward_std": 0.6741374880075455, "rewards/cosine_scaled_reward": 0.06395611725747585, "rewards/format_reward": 0.3958333358168602, "step": 8 }, { "advantage_max": 1.541668102145195, "advantage_mean": 1.2417634809303024e-08, "advantage_min": -1.2456592470407486, "advantage_std": 0.9998108074069023, "completion_length": 3136.7709045410156, "epoch": 0.010285714285714285, "grad_norm": 0.18616381287574768, "kl": 4.213303327560425e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.056825272273272276, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.056825272273272276, "reward_after_std": 0.7951892241835594, "reward_before_mean": 0.004031727090477943, "reward_before_std": 0.8006694987416267, "reward_change_max": 0.0, "reward_change_mean": -0.06085700215771794, "reward_change_min": -0.12736657354980707, "reward_change_std": 0.0515642911195755, "reward_std": 0.7951892279088497, "rewards/cosine_scaled_reward": -0.16465081088244915, "rewards/format_reward": 0.3333333432674408, "step": 9 }, { "advantage_max": 1.451988160610199, "advantage_mean": 2.4835279388568665e-09, "advantage_min": -1.1254910752177238, "advantage_std": 0.9998168796300888, "completion_length": 2620.104179382324, "epoch": 0.011428571428571429, "grad_norm": 0.2192392647266388, "kl": 2.3484230041503906e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.05896207131445408, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.05896207131445408, "reward_after_std": 0.7329425290226936, "reward_before_mean": 0.1349986456334591, "reward_before_std": 0.7468922697007656, "reward_change_max": 0.0001971423625946045, "reward_change_mean": -0.07603657222352922, "reward_change_min": -0.16776021383702755, "reward_change_std": 0.06868458609096706, "reward_std": 0.732942558825016, "rewards/cosine_scaled_reward": -0.1304173544049263, "rewards/format_reward": 0.39583334513008595, "step": 10 }, { "advantage_max": 1.5240405201911926, "advantage_mean": 1.018246036377235e-07, "advantage_min": -0.9174718670547009, "advantage_std": 0.9997430071234703, "completion_length": 3468.9166870117188, "epoch": 0.012571428571428572, "grad_norm": 0.1580207347869873, "kl": 3.941357135772705e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.44767653942108154, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.44767653942108154, "reward_after_std": 0.6254625134170055, "reward_before_mean": -0.41805149242281914, "reward_before_std": 0.6336532738059759, "reward_change_max": 0.0004032254219055176, "reward_change_mean": -0.029625033494085073, "reward_change_min": -0.0750849274918437, "reward_change_std": 0.03149611933622509, "reward_std": 0.6254625394940376, "rewards/cosine_scaled_reward": -0.25069241458550096, "rewards/format_reward": 0.0833333358168602, "step": 11 }, { "advantage_max": 1.3734028935432434, "advantage_mean": 7.450581707146853e-09, "advantage_min": -1.2330540791153908, "advantage_std": 0.9998399540781975, "completion_length": 2658.8750762939453, "epoch": 0.013714285714285714, "grad_norm": 0.18816447257995605, "kl": 4.194304347038269e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.40494780242443085, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.40494780242443085, "reward_after_std": 0.773624949157238, "reward_before_mean": 0.5151724070310593, "reward_before_std": 0.7934709116816521, "reward_change_max": 0.0, "reward_change_mean": -0.11022460693493485, "reward_change_min": -0.21708102989941835, "reward_change_std": 0.08380235452204943, "reward_std": 0.7736249603331089, "rewards/cosine_scaled_reward": -0.05491379927843809, "rewards/format_reward": 0.6250000223517418, "step": 12 }, { "advantage_max": 1.3199822530150414, "advantage_mean": -4.097819361614796e-08, "advantage_min": -1.100662998855114, "advantage_std": 0.9997796267271042, "completion_length": 2828.354217529297, "epoch": 0.014857142857142857, "grad_norm": 0.19753533601760864, "kl": 3.528594970703125e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.6e-07, "loss": 0.0, "reward": -0.02724500745534897, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02724500745534897, "reward_after_std": 0.6626134999096394, "reward_before_mean": 0.04541436675935984, "reward_before_std": 0.6859034113585949, "reward_change_max": 8.57040286064148e-05, "reward_change_mean": -0.07265940262004733, "reward_change_min": -0.16889882646501064, "reward_change_std": 0.06863630237057805, "reward_std": 0.6626135222613811, "rewards/cosine_scaled_reward": -0.16479281801730394, "rewards/format_reward": 0.37500000558793545, "step": 13 }, { "advantage_max": 1.5738040059804916, "advantage_mean": -4.049313184761871e-08, "advantage_min": -1.0287166237831116, "advantage_std": 0.9998800083994865, "completion_length": 2633.0833892822266, "epoch": 0.016, "grad_norm": 0.2650340795516968, "kl": 2.3968517780303955e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.4404462520033121, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4404462520033121, "reward_after_std": 0.9656060226261616, "reward_before_mean": 0.5435896124690771, "reward_before_std": 0.9739357531070709, "reward_change_max": 0.00042141228914260864, "reward_change_mean": -0.10314338165335357, "reward_change_min": -0.19937315676361322, "reward_change_std": 0.08145514910575002, "reward_std": 0.9656060636043549, "rewards/cosine_scaled_reward": 0.03221147443400696, "rewards/format_reward": 0.47916666977107525, "step": 14 }, { "advantage_max": 1.5372154638171196, "advantage_mean": -2.0489098417897367e-08, "advantage_min": -1.1531073153018951, "advantage_std": 0.9997818544507027, "completion_length": 2792.6458587646484, "epoch": 0.017142857142857144, "grad_norm": 0.19909434020519257, "kl": 2.8697148081846535e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.36440867744386196, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.36440867744386196, "reward_after_std": 0.5662843585014343, "reward_before_mean": 0.4735768185928464, "reward_before_std": 0.5562909506261349, "reward_change_max": 0.00023984909057617188, "reward_change_mean": -0.10916817560791969, "reward_change_min": -0.16707832738757133, "reward_change_std": 0.06875652400776744, "reward_std": 0.5662843994796276, "rewards/cosine_scaled_reward": 0.028455082327127457, "rewards/format_reward": 0.41666667349636555, "step": 15 }, { "advantage_max": 1.5482462048530579, "advantage_mean": 2.607703303292297e-08, "advantage_min": -1.0424293726682663, "advantage_std": 0.9997147470712662, "completion_length": 3578.25, "epoch": 0.018285714285714287, "grad_norm": 0.17111748456954956, "kl": 5.213916301727295e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.4694888131925836, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.4694888131925836, "reward_after_std": 0.4447050001472235, "reward_before_mean": -0.4354347405023873, "reward_before_std": 0.4501433949917555, "reward_change_max": 8.464604616165161e-05, "reward_change_mean": -0.03405407292302698, "reward_change_min": -0.07718909159302711, "reward_change_std": 0.03179531981004402, "reward_std": 0.44470502249896526, "rewards/cosine_scaled_reward": -0.22813403699547052, "rewards/format_reward": 0.02083333395421505, "step": 16 }, { "advantage_max": 1.3803511783480644, "advantage_mean": -9.313226301266297e-09, "advantage_min": -1.3098116517066956, "advantage_std": 0.9998393729329109, "completion_length": 2443.937557220459, "epoch": 0.019428571428571427, "grad_norm": 0.2514090836048126, "kl": 4.226714372634888e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.5570252109318972, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5570252109318972, "reward_after_std": 0.7874607294797897, "reward_before_mean": 0.6806772872805595, "reward_before_std": 0.8015068471431732, "reward_change_max": 0.00028277933597564697, "reward_change_mean": -0.12365204491652548, "reward_change_min": -0.21179709024727345, "reward_change_std": 0.08916230010800064, "reward_std": 0.7874607406556606, "rewards/cosine_scaled_reward": 0.048671944066882133, "rewards/format_reward": 0.5833333488553762, "step": 17 }, { "advantage_max": 1.4020969048142433, "advantage_mean": 1.614292566287645e-08, "advantage_min": -1.123589277267456, "advantage_std": 0.9997856467962265, "completion_length": 2848.1458892822266, "epoch": 0.02057142857142857, "grad_norm": 0.17697472870349884, "kl": 3.02046537399292e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 0.20538506656885147, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20538506656885147, "reward_after_std": 0.8300341553986073, "reward_before_mean": 0.29380474984645844, "reward_before_std": 0.852685147896409, "reward_change_max": 7.675588130950928e-05, "reward_change_mean": -0.08841968746855855, "reward_change_min": -0.20666884537786245, "reward_change_std": 0.08047383697703481, "reward_std": 0.8300341740250587, "rewards/cosine_scaled_reward": -0.0614309649245115, "rewards/format_reward": 0.41666667722165585, "step": 18 }, { "advantage_max": 1.4702775925397873, "advantage_mean": -2.173086599555063e-09, "advantage_min": -1.1005384474992752, "advantage_std": 0.9998194351792336, "completion_length": 2901.3542098999023, "epoch": 0.021714285714285714, "grad_norm": 0.21968965232372284, "kl": 3.5278499126434326e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.20741954632103443, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.20741954632103443, "reward_after_std": 0.8663461040705442, "reward_before_mean": 0.2943375655449927, "reward_before_std": 0.8890400361269712, "reward_change_max": 0.00013860315084457397, "reward_change_mean": -0.08691801549866796, "reward_change_min": -0.19336348306387663, "reward_change_std": 0.07870251836720854, "reward_std": 0.8663461394608021, "rewards/cosine_scaled_reward": -0.02991456165909767, "rewards/format_reward": 0.3541666716337204, "step": 19 }, { "advantage_max": 1.6149840354919434, "advantage_mean": -8.69234404454744e-08, "advantage_min": -1.064434602856636, "advantage_std": 0.9997932985424995, "completion_length": 2344.500026702881, "epoch": 0.022857142857142857, "grad_norm": 0.22625480592250824, "kl": 1.2205913662910461e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.5863076392561197, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5863076392561197, "reward_after_std": 0.7506540268659592, "reward_before_mean": 0.7089644218795002, "reward_before_std": 0.7365398444235325, "reward_change_max": 0.0, "reward_change_mean": -0.12265676353126764, "reward_change_min": -0.2063779616728425, "reward_change_std": 0.08107350138016045, "reward_std": 0.7506540361791849, "rewards/cosine_scaled_reward": 0.03156551416032016, "rewards/format_reward": 0.6458333414047956, "step": 20 }, { "advantage_max": 1.485236831009388, "advantage_mean": 3.104408285992122e-09, "advantage_min": -1.123526617884636, "advantage_std": 0.9998413771390915, "completion_length": 2848.5000610351562, "epoch": 0.024, "grad_norm": 0.24100278317928314, "kl": 4.163384437561035e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.33815048914402723, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33815048914402723, "reward_after_std": 0.8459835797548294, "reward_before_mean": 0.4365639388561249, "reward_before_std": 0.8539957068860531, "reward_change_max": 0.0003660544753074646, "reward_change_mean": -0.0984134313184768, "reward_change_min": -0.1952377436682582, "reward_change_std": 0.07635108148679137, "reward_std": 0.8459835983812809, "rewards/cosine_scaled_reward": -0.00046803371515125036, "rewards/format_reward": 0.4375000074505806, "step": 21 }, { "advantage_max": 1.5112750977277756, "advantage_mean": -6.208819014474898e-10, "advantage_min": -1.1027398481965065, "advantage_std": 0.9998446479439735, "completion_length": 1809.458381652832, "epoch": 0.025142857142857144, "grad_norm": 0.3369239568710327, "kl": 2.700556069612503e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.5723168756812811, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5723168756812811, "reward_after_std": 0.7695174552500248, "reward_before_mean": 0.6950296210125089, "reward_before_std": 0.7699864134192467, "reward_change_max": 0.00015548616647720337, "reward_change_mean": -0.12271274626255035, "reward_change_min": -0.21874888613820076, "reward_change_std": 0.0842174394056201, "reward_std": 0.7695174664258957, "rewards/cosine_scaled_reward": -0.02748518972657621, "rewards/format_reward": 0.7500000074505806, "step": 22 }, { "advantage_max": 1.5562490671873093, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -1.1134375929832458, "advantage_std": 0.9998349696397781, "completion_length": 2557.354202270508, "epoch": 0.026285714285714287, "grad_norm": 0.18720196187496185, "kl": 2.9034912586212158e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.24941763281822205, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.24941763281822205, "reward_after_std": 0.7656088657677174, "reward_before_mean": 0.34069389663636684, "reward_before_std": 0.7691123522818089, "reward_change_max": 0.00048488378524780273, "reward_change_mean": -0.09127628512214869, "reward_change_min": -0.1845838474109769, "reward_change_std": 0.06947551434859633, "reward_std": 0.7656089253723621, "rewards/cosine_scaled_reward": -0.10048638191074133, "rewards/format_reward": 0.5416666828095913, "step": 23 }, { "advantage_max": 1.5228987485170364, "advantage_mean": 1.2417644690287943e-09, "advantage_min": -1.1863074079155922, "advantage_std": 0.9997837617993355, "completion_length": 2850.333396911621, "epoch": 0.027428571428571427, "grad_norm": 0.22365504503250122, "kl": 2.4393200874328613e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.3684218265116215, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.3684218265116215, "reward_after_std": 0.7231812328100204, "reward_before_mean": 0.47270913142710924, "reward_before_std": 0.7200934160500765, "reward_change_max": 2.9243528842926025e-05, "reward_change_mean": -0.10428728186525404, "reward_change_min": -0.19938689470291138, "reward_change_std": 0.07831439608708024, "reward_std": 0.7231812477111816, "rewards/cosine_scaled_reward": 0.02802122524008155, "rewards/format_reward": 0.41666667349636555, "step": 24 }, { "advantage_max": 1.4767991751432419, "advantage_mean": -1.8471231544303635e-08, "advantage_min": -1.182334378361702, "advantage_std": 0.9998176693916321, "completion_length": 2888.2083892822266, "epoch": 0.02857142857142857, "grad_norm": 0.1811361014842987, "kl": 3.592018038034439e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.05788080208003521, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05788080208003521, "reward_after_std": 0.8258396983146667, "reward_before_mean": 0.12952115014195442, "reward_before_std": 0.8361939936876297, "reward_change_max": 0.00030659139156341553, "reward_change_mean": -0.07164037134498358, "reward_change_min": -0.15691961627453566, "reward_change_std": 0.06482615089043975, "reward_std": 0.8258397094905376, "rewards/cosine_scaled_reward": -0.12273942306637764, "rewards/format_reward": 0.37500000558793545, "step": 25 }, { "advantage_max": 1.4326291382312775, "advantage_mean": 2.1109979653211042e-08, "advantage_min": -1.20679422467947, "advantage_std": 0.999689131975174, "completion_length": 2975.3541870117188, "epoch": 0.029714285714285714, "grad_norm": 0.1737651377916336, "kl": 2.80626118183136e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.21969399228692055, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21969399228692055, "reward_after_std": 0.4889183659106493, "reward_before_mean": 0.31834758073091507, "reward_before_std": 0.4868295267224312, "reward_change_max": 0.00041982531547546387, "reward_change_mean": -0.09865356958471239, "reward_change_min": -0.17028795275837183, "reward_change_std": 0.06675215833820403, "reward_std": 0.48891837801784277, "rewards/cosine_scaled_reward": -0.059576213359832764, "rewards/format_reward": 0.4375000149011612, "step": 26 }, { "advantage_max": 1.3971495032310486, "advantage_mean": 7.450580596923828e-09, "advantage_min": -1.1024487167596817, "advantage_std": 0.9997835606336594, "completion_length": 3062.916717529297, "epoch": 0.030857142857142857, "grad_norm": 0.18261635303497314, "kl": 3.505777567625046e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.23383064568042755, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23383064568042755, "reward_after_std": 0.7971065267920494, "reward_before_mean": 0.3276177365332842, "reward_before_std": 0.8222432602196932, "reward_change_max": 0.00037420541048049927, "reward_change_mean": -0.0937870959751308, "reward_change_min": -0.20607583597302437, "reward_change_std": 0.08386349817737937, "reward_std": 0.7971065677702427, "rewards/cosine_scaled_reward": -0.06535779661498964, "rewards/format_reward": 0.4583333432674408, "step": 27 }, { "advantage_max": 1.602249875664711, "advantage_mean": 1.0554989660072067e-08, "advantage_min": -0.8943894132971764, "advantage_std": 0.9998579323291779, "completion_length": 2890.2500610351562, "epoch": 0.032, "grad_norm": 0.19755809009075165, "kl": 3.156531602144241e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.28285503294318914, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28285503294318914, "reward_after_std": 0.9709917679429054, "reward_before_mean": 0.37220960669219494, "reward_before_std": 0.9838091358542442, "reward_change_max": 0.00012604892253875732, "reward_change_mean": -0.08935456513427198, "reward_change_min": -0.19135633949190378, "reward_change_std": 0.0756442949641496, "reward_std": 0.9709917902946472, "rewards/cosine_scaled_reward": -0.02222853573039174, "rewards/format_reward": 0.4166666679084301, "step": 28 }, { "advantage_max": 1.4943844228982925, "advantage_mean": 2.8560558806844938e-08, "advantage_min": -1.1554220169782639, "advantage_std": 0.9997913166880608, "completion_length": 3206.6875610351562, "epoch": 0.03314285714285714, "grad_norm": 0.14532366394996643, "kl": 2.2741500288248062e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.20698433928191662, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.20698433928191662, "reward_after_std": 0.6183616258203983, "reward_before_mean": -0.1521163946017623, "reward_before_std": 0.6331501640379429, "reward_change_max": 0.00013259053230285645, "reward_change_mean": -0.05486793885938823, "reward_change_min": -0.13878423906862736, "reward_change_std": 0.05339451297186315, "reward_std": 0.6183616407215595, "rewards/cosine_scaled_reward": -0.2114748670719564, "rewards/format_reward": 0.2708333395421505, "step": 29 }, { "advantage_max": 1.3567621260881424, "advantage_mean": 1.0632599911630791e-08, "advantage_min": -1.3685436844825745, "advantage_std": 0.9998265281319618, "completion_length": 3011.229202270508, "epoch": 0.03428571428571429, "grad_norm": 0.17021258175373077, "kl": 2.570822834968567e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.3891737814992666, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3891737814992666, "reward_after_std": 0.8715236745774746, "reward_before_mean": 0.4956340156495571, "reward_before_std": 0.9000084735453129, "reward_change_max": 0.00030282139778137207, "reward_change_mean": -0.10646022134460509, "reward_change_min": -0.2039351612329483, "reward_change_std": 0.09085589437745512, "reward_std": 0.8715236894786358, "rewards/cosine_scaled_reward": 0.029066994786262512, "rewards/format_reward": 0.4375000149011612, "step": 30 }, { "advantage_max": 1.5537595748901367, "advantage_mean": -3.6011139736835673e-08, "advantage_min": -1.0320660769939423, "advantage_std": 0.9998223856091499, "completion_length": 3004.0417709350586, "epoch": 0.03542857142857143, "grad_norm": 0.22154660522937775, "kl": 2.0623207092285156e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.2e-07, "loss": 0.0, "reward": 0.30279272235929966, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30279272235929966, "reward_after_std": 0.7506419010460377, "reward_before_mean": 0.4000659354496747, "reward_before_std": 0.7510486245155334, "reward_change_max": 0.0001844540238380432, "reward_change_mean": -0.09727319562807679, "reward_change_min": -0.20195122808218002, "reward_change_std": 0.0752708266954869, "reward_std": 0.750641942024231, "rewards/cosine_scaled_reward": 0.02294962201267481, "rewards/format_reward": 0.35416666977107525, "step": 31 }, { "advantage_max": 1.525789052248001, "advantage_mean": 1.6653345369377348e-15, "advantage_min": -1.0015061795711517, "advantage_std": 0.9997410029172897, "completion_length": 3162.6666870117188, "epoch": 0.036571428571428574, "grad_norm": 0.1808365285396576, "kl": 2.628657966852188e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.20195652917027473, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20195652917027473, "reward_after_std": 0.7237186953425407, "reward_before_mean": 0.2911367453634739, "reward_before_std": 0.7264976073056459, "reward_change_max": 0.00014059245586395264, "reward_change_mean": -0.0891802167170681, "reward_change_min": -0.1922367298975587, "reward_change_std": 0.07259462832007557, "reward_std": 0.7237186953425407, "rewards/cosine_scaled_reward": -0.052348305471241474, "rewards/format_reward": 0.39583334513008595, "step": 32 }, { "advantage_max": 1.3504931405186653, "advantage_mean": 6.208817904251873e-09, "advantage_min": -1.2225516885519028, "advantage_std": 0.9997994303703308, "completion_length": 3309.8958740234375, "epoch": 0.037714285714285714, "grad_norm": 0.14805929362773895, "kl": 3.3639371395111084e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.009339592419564724, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.009339592419564724, "reward_after_std": 0.7747208327054977, "reward_before_mean": 0.06072163209319115, "reward_before_std": 0.7972827106714249, "reward_change_max": 8.016079664230347e-05, "reward_change_mean": -0.07006121682934463, "reward_change_min": -0.16277748066931963, "reward_change_std": 0.06796340085566044, "reward_std": 0.774720836430788, "rewards/cosine_scaled_reward": -0.10505585628561676, "rewards/format_reward": 0.27083334140479565, "step": 33 }, { "advantage_max": 1.3841820135712624, "advantage_mean": -3.7252905427109795e-08, "advantage_min": -1.2319133207201958, "advantage_std": 0.9998511150479317, "completion_length": 2577.3333892822266, "epoch": 0.038857142857142854, "grad_norm": 0.2878997027873993, "kl": 2.3380503989756107e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.6522115813568234, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6522115813568234, "reward_after_std": 0.8838492073118687, "reward_before_mean": 0.7806666740216315, "reward_before_std": 0.8991155996918678, "reward_change_max": 0.00018197298049926758, "reward_change_mean": -0.12845506332814693, "reward_change_min": -0.22763802763074636, "reward_change_std": 0.09426811803132296, "reward_std": 0.883849237114191, "rewards/cosine_scaled_reward": 0.1194999860599637, "rewards/format_reward": 0.541666679084301, "step": 34 }, { "advantage_max": 1.50138621032238, "advantage_mean": 3.7252904094842165e-08, "advantage_min": -0.9604013338685036, "advantage_std": 0.9998384490609169, "completion_length": 3031.50004196167, "epoch": 0.04, "grad_norm": 0.2110377997159958, "kl": 5.46872615814209e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.15888013318181038, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15888013318181038, "reward_after_std": 0.9893754497170448, "reward_before_mean": 0.2364537250250578, "reward_before_std": 1.0095467530190945, "reward_change_max": 8.15466046333313e-05, "reward_change_mean": -0.07757358253002167, "reward_change_min": -0.21603084448724985, "reward_change_std": 0.07964391424320638, "reward_std": 0.9893755055963993, "rewards/cosine_scaled_reward": -0.027606474235653877, "rewards/format_reward": 0.2916666716337204, "step": 35 }, { "advantage_max": 1.6357170641422272, "advantage_mean": 6.829699250587851e-09, "advantage_min": -0.9163132086396217, "advantage_std": 0.9998221769928932, "completion_length": 3327.7916870117188, "epoch": 0.04114285714285714, "grad_norm": 0.18343685567378998, "kl": 3.866851329803467e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.29037621850147843, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.29037621850147843, "reward_after_std": 0.7336566708981991, "reward_before_mean": -0.24998886417597532, "reward_before_std": 0.7397099249064922, "reward_change_max": 0.00043429434299468994, "reward_change_mean": -0.040387358982115984, "reward_change_min": -0.09777700062841177, "reward_change_std": 0.04080916219390929, "reward_std": 0.7336566857993603, "rewards/cosine_scaled_reward": -0.239577763248235, "rewards/format_reward": 0.2291666716337204, "step": 36 }, { "advantage_max": 1.3579955101013184, "advantage_mean": 4.0667752942979973e-08, "advantage_min": -1.2495231628417969, "advantage_std": 0.9997347593307495, "completion_length": 3411.875, "epoch": 0.04228571428571429, "grad_norm": 0.19917240738868713, "kl": 3.3661723136901855e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.3309951778501272, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.3309951778501272, "reward_after_std": 0.3882594630122185, "reward_before_mean": -0.2804699596017599, "reward_before_std": 0.3967503234744072, "reward_change_max": 0.00010347366333007812, "reward_change_mean": -0.05052520358003676, "reward_change_min": -0.09982103761285543, "reward_change_std": 0.04131218558177352, "reward_std": 0.3882594667375088, "rewards/cosine_scaled_reward": -0.22356831841170788, "rewards/format_reward": 0.1666666679084301, "step": 37 }, { "advantage_max": 1.2970862612128258, "advantage_mean": 3.9736431256542915e-08, "advantage_min": -1.3777238950133324, "advantage_std": 0.9996457993984222, "completion_length": 3378.6041717529297, "epoch": 0.04342857142857143, "grad_norm": 0.17214582860469818, "kl": 3.484450280666351e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.2649919129908085, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2649919129908085, "reward_after_std": 0.34833380207419395, "reward_before_mean": -0.2071321178227663, "reward_before_std": 0.35493761859834194, "reward_change_max": 0.00027345120906829834, "reward_change_mean": -0.05785980122163892, "reward_change_min": -0.11019740533083677, "reward_change_std": 0.04111600434407592, "reward_std": 0.3483338113874197, "rewards/cosine_scaled_reward": -0.16606605518609285, "rewards/format_reward": 0.125, "step": 38 }, { "advantage_max": 1.611236572265625, "advantage_mean": -3.9736430812453705e-08, "advantage_min": -1.0035665556788445, "advantage_std": 0.9996546432375908, "completion_length": 2727.7292289733887, "epoch": 0.044571428571428574, "grad_norm": 0.20894435048103333, "kl": 2.6639550924301147e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.3717892151325941, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3717892151325941, "reward_after_std": 0.3282108139246702, "reward_before_mean": 0.48738136142492294, "reward_before_std": 0.29373720567673445, "reward_change_max": 0.0, "reward_change_mean": -0.11559212068095803, "reward_change_min": -0.16906874999403954, "reward_change_std": 0.0631270776502788, "reward_std": 0.3282108213752508, "rewards/cosine_scaled_reward": -0.027142662554979324, "rewards/format_reward": 0.5416666716337204, "step": 39 }, { "advantage_max": 1.560842514038086, "advantage_mean": 7.388492573312533e-08, "advantage_min": -1.1241285800933838, "advantage_std": 0.9998079016804695, "completion_length": 2497.312568664551, "epoch": 0.045714285714285714, "grad_norm": 0.1783609390258789, "kl": 9.907619096338749e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.34173901937901974, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34173901937901974, "reward_after_std": 0.6312458626925945, "reward_before_mean": 0.4451894119847566, "reward_before_std": 0.6229474730789661, "reward_change_max": 0.0002436712384223938, "reward_change_mean": -0.1034503523260355, "reward_change_min": -0.17544407304376364, "reward_change_std": 0.06937033985741436, "reward_std": 0.6312458775937557, "rewards/cosine_scaled_reward": -0.04823863413184881, "rewards/format_reward": 0.5416666697710752, "step": 40 }, { "advantage_max": 1.6749724745750427, "advantage_mean": 2.7939677238464355e-08, "advantage_min": -0.9673251286149025, "advantage_std": 0.9998568594455719, "completion_length": 2899.3125762939453, "epoch": 0.046857142857142854, "grad_norm": 0.18267039954662323, "kl": 3.07522714138031e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.018273118417710066, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.018273118417710066, "reward_after_std": 0.8944275602698326, "reward_before_mean": 0.04296614229679108, "reward_before_std": 0.9022683948278427, "reward_change_max": 0.0005041435360908508, "reward_change_mean": -0.06123924785060808, "reward_change_min": -0.14666928444057703, "reward_change_std": 0.06066753948107362, "reward_std": 0.8944276012480259, "rewards/cosine_scaled_reward": -0.17643359955400229, "rewards/format_reward": 0.39583334513008595, "step": 41 }, { "advantage_max": 1.458516851067543, "advantage_mean": 1.9868214629070735e-08, "advantage_min": -1.1069196499884129, "advantage_std": 0.9997414946556091, "completion_length": 2711.3333854675293, "epoch": 0.048, "grad_norm": 0.3004877269268036, "kl": 6.502866744995117e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.21551374037517235, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.21551374037517235, "reward_after_std": 0.4664292559027672, "reward_before_mean": -0.1585762370377779, "reward_before_std": 0.4684213399887085, "reward_change_max": 0.00046293437480926514, "reward_change_mean": -0.05693749734200537, "reward_change_min": -0.10906532034277916, "reward_change_std": 0.04194885538890958, "reward_std": 0.4664292633533478, "rewards/cosine_scaled_reward": -0.26678812876343727, "rewards/format_reward": 0.3750000037252903, "step": 42 }, { "advantage_max": 1.5238156765699387, "advantage_mean": 4.315127999365842e-08, "advantage_min": -1.2874258160591125, "advantage_std": 0.9997437745332718, "completion_length": 2902.937515258789, "epoch": 0.04914285714285714, "grad_norm": 0.1742953509092331, "kl": 4.782527685165405e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": 0.19388390332460403, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.19388390332460403, "reward_after_std": 0.5199645850807428, "reward_before_mean": 0.2874040777387563, "reward_before_std": 0.5115220807492733, "reward_change_max": 0.00013028830289840698, "reward_change_mean": -0.09352018125355244, "reward_change_min": -0.15791374817490578, "reward_change_std": 0.060954955872148275, "reward_std": 0.5199645888060331, "rewards/cosine_scaled_reward": -0.033381287939846516, "rewards/format_reward": 0.3541666753590107, "step": 43 }, { "advantage_max": 1.3402893841266632, "advantage_mean": 2.7318795670083773e-08, "advantage_min": -1.2359666973352432, "advantage_std": 0.9998237863183022, "completion_length": 2733.2500610351562, "epoch": 0.05028571428571429, "grad_norm": 0.33008483052253723, "kl": 0.0001117512583732605, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.367175517603755, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.367175517603755, "reward_after_std": 0.7814797982573509, "reward_before_mean": 0.4731551297008991, "reward_before_std": 0.8076624237000942, "reward_change_max": 0.00021695345640182495, "reward_change_mean": -0.10597959894221276, "reward_change_min": -0.21262322179973125, "reward_change_std": 0.08547356608323753, "reward_std": 0.7814798168838024, "rewards/cosine_scaled_reward": -0.01342245377600193, "rewards/format_reward": 0.5000000093132257, "step": 44 }, { "advantage_max": 1.481304481625557, "advantage_mean": 1.9247334503980085e-08, "advantage_min": -1.104643315076828, "advantage_std": 0.999833881855011, "completion_length": 3342.854217529297, "epoch": 0.05142857142857143, "grad_norm": 0.14805112779140472, "kl": 4.836916923522949e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.10503099672496319, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10503099672496319, "reward_after_std": 0.8778037875890732, "reward_before_mean": 0.17967192456126213, "reward_before_std": 0.8892814107239246, "reward_change_max": 0.0, "reward_change_mean": -0.07464091246947646, "reward_change_min": -0.15921855811029673, "reward_change_std": 0.06344135943800211, "reward_std": 0.8778038173913956, "rewards/cosine_scaled_reward": -0.04558071191422641, "rewards/format_reward": 0.2708333395421505, "step": 45 }, { "advantage_max": 1.521193027496338, "advantage_mean": 2.9802322498717615e-08, "advantage_min": -1.1753706708550453, "advantage_std": 0.9997015595436096, "completion_length": 3199.8541870117188, "epoch": 0.052571428571428575, "grad_norm": 0.21178790926933289, "kl": 8.314847946166992e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.3450825661420822, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3450825661420822, "reward_after_std": 0.36790336668491364, "reward_before_mean": -0.29655372351408005, "reward_before_std": 0.3680526949465275, "reward_change_max": 0.0004460737109184265, "reward_change_mean": -0.04852884029969573, "reward_change_min": -0.0890387985855341, "reward_change_std": 0.03501270990818739, "reward_std": 0.36790337786078453, "rewards/cosine_scaled_reward": -0.23161020036786795, "rewards/format_reward": 0.1666666679084301, "step": 46 }, { "advantage_max": 1.335110366344452, "advantage_mean": 1.1796752574788627e-08, "advantage_min": -1.17454382032156, "advantage_std": 0.9998417124152184, "completion_length": 2774.854232788086, "epoch": 0.053714285714285714, "grad_norm": 0.20298603177070618, "kl": 4.419684410095215e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.3838723013177514, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3838723013177514, "reward_after_std": 0.9062189273536205, "reward_before_mean": 0.48803192749619484, "reward_before_std": 0.9351296909153461, "reward_change_max": 4.4032931327819824e-05, "reward_change_mean": -0.10415959800593555, "reward_change_min": -0.2301958166062832, "reward_change_std": 0.09293584548868239, "reward_std": 0.9062189720571041, "rewards/cosine_scaled_reward": 0.0044326139613986015, "rewards/format_reward": 0.4791666753590107, "step": 47 }, { "advantage_max": 1.4299870878458023, "advantage_mean": -8.61473536950541e-09, "advantage_min": -1.2005236744880676, "advantage_std": 0.999784804880619, "completion_length": 2806.3125228881836, "epoch": 0.054857142857142854, "grad_norm": 0.23084399104118347, "kl": 0.00018368917517364025, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.12521709315478802, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12521709315478802, "reward_after_std": 0.6359206773340702, "reward_before_mean": 0.20962151745334268, "reward_before_std": 0.6417857967317104, "reward_change_max": 0.0002502724528312683, "reward_change_mean": -0.0844044117256999, "reward_change_min": -0.14903380069881678, "reward_change_std": 0.05997437797486782, "reward_std": 0.6359206922352314, "rewards/cosine_scaled_reward": -0.07227259315550327, "rewards/format_reward": 0.35416666977107525, "step": 48 }, { "advantage_max": 1.4477006047964096, "advantage_mean": -6.208817793229571e-09, "advantage_min": -1.0258841067552567, "advantage_std": 0.9997946843504906, "completion_length": 2393.3542289733887, "epoch": 0.056, "grad_norm": 0.21073652803897858, "kl": 5.701184272766113e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.2004772163927555, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2004772163927555, "reward_after_std": 0.7987145818769932, "reward_before_mean": 0.2891058661043644, "reward_before_std": 0.8171101789921522, "reward_change_max": 0.00024158507585525513, "reward_change_mean": -0.08862866298295557, "reward_change_min": -0.20624815300107002, "reward_change_std": 0.07846913067623973, "reward_std": 0.7987145818769932, "rewards/cosine_scaled_reward": -0.11586374510079622, "rewards/format_reward": 0.5208333376795053, "step": 49 }, { "advantage_max": 1.3314997255802155, "advantage_mean": -9.934107092490763e-09, "advantage_min": -1.2882369980216026, "advantage_std": 0.9997088387608528, "completion_length": 2960.645835876465, "epoch": 0.05714285714285714, "grad_norm": 0.16604925692081451, "kl": 0.00013734400272369385, "lambda_div_used": 0.9000000000000001, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.34677931293845177, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34677931293845177, "reward_after_std": 0.6038327347487211, "reward_before_mean": 0.4557027849368751, "reward_before_std": 0.6080019045621157, "reward_change_max": 2.8908252716064453e-06, "reward_change_mean": -0.1089234659448266, "reward_change_min": -0.19173666648566723, "reward_change_std": 0.07837402378208935, "reward_std": 0.603832745924592, "rewards/cosine_scaled_reward": 0.04035138082690537, "rewards/format_reward": 0.3750000111758709, "step": 50 }, { "advantage_max": 1.3933206498622894, "advantage_mean": 7.450581041013038e-09, "advantage_min": -1.2193833366036415, "advantage_std": 0.9997348785400391, "completion_length": 2253.7500228881836, "epoch": 0.05828571428571429, "grad_norm": 0.2345164716243744, "kl": 0.0003926903009414673, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.28201270662248135, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.28201270662248135, "reward_after_std": 0.5423499569296837, "reward_before_mean": 0.38460008054971695, "reward_before_std": 0.5425751395523548, "reward_change_max": 0.00022970139980316162, "reward_change_mean": -0.10258737101685256, "reward_change_min": -0.17983837984502316, "reward_change_std": 0.06753439782187343, "reward_std": 0.5423499867320061, "rewards/cosine_scaled_reward": -0.09936663717962801, "rewards/format_reward": 0.5833333358168602, "step": 51 }, { "advantage_max": 1.5688743889331818, "advantage_mean": 1.8626451714354175e-08, "advantage_min": -0.8412662744522095, "advantage_std": 0.9998848661780357, "completion_length": 2888.187530517578, "epoch": 0.05942857142857143, "grad_norm": 0.21857954561710358, "kl": 0.00026175379753112793, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.34447018057107925, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.34447018057107925, "reward_after_std": 1.2108869962394238, "reward_before_mean": 0.4332852326333523, "reward_before_std": 1.240879662334919, "reward_change_max": 0.00027269870042800903, "reward_change_mean": -0.0888150431565009, "reward_change_min": -0.24372821487486362, "reward_change_std": 0.09732173680095002, "reward_std": 1.210887011140585, "rewards/cosine_scaled_reward": 0.008309275843203068, "rewards/format_reward": 0.4166666753590107, "step": 52 }, { "advantage_max": 1.6007621735334396, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -0.9977999553084373, "advantage_std": 0.9997362941503525, "completion_length": 2766.4791870117188, "epoch": 0.060571428571428575, "grad_norm": 0.1986720860004425, "kl": 0.0002454817295074463, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.26487368531525135, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26487368531525135, "reward_after_std": 0.6082321088761091, "reward_before_mean": 0.3611887330189347, "reward_before_std": 0.5937345344573259, "reward_change_max": 0.0001229792833328247, "reward_change_mean": -0.0963150686584413, "reward_change_min": -0.1630243817344308, "reward_change_std": 0.06432746141217649, "reward_std": 0.6082321219146252, "rewards/cosine_scaled_reward": -0.07982230000197887, "rewards/format_reward": 0.5208333376795053, "step": 53 }, { "advantage_max": 1.4808703660964966, "advantage_mean": 6.829698140364826e-09, "advantage_min": -1.1947182267904282, "advantage_std": 0.9997754395008087, "completion_length": 2824.041732788086, "epoch": 0.061714285714285715, "grad_norm": 0.17189155519008636, "kl": 5.392730236053467e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.5566112250089645, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5566112250089645, "reward_after_std": 0.9852496441453695, "reward_before_mean": 0.674728263169527, "reward_before_std": 1.0110351080074906, "reward_change_max": 0.00023486465215682983, "reward_change_mean": -0.1181170241907239, "reward_change_min": -0.24376624636352062, "reward_change_std": 0.10137569368816912, "reward_std": 0.9852496590465307, "rewards/cosine_scaled_reward": 0.09778079111129045, "rewards/format_reward": 0.4791666828095913, "step": 54 }, { "advantage_max": 1.4139875769615173, "advantage_mean": 6.208817238118058e-08, "advantage_min": -1.2174672558903694, "advantage_std": 0.9997905045747757, "completion_length": 3000.250015258789, "epoch": 0.06285714285714286, "grad_norm": 0.1520586907863617, "kl": 0.0002251937985420227, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.23542099818587303, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23542099818587303, "reward_after_std": 0.6617230176925659, "reward_before_mean": 0.3305197563022375, "reward_before_std": 0.6710530370473862, "reward_change_max": 0.001324571669101715, "reward_change_mean": -0.09509875881485641, "reward_change_min": -0.16870635468512774, "reward_change_std": 0.07222678326070309, "reward_std": 0.6617230512201786, "rewards/cosine_scaled_reward": -0.011823451146483421, "rewards/format_reward": 0.35416667349636555, "step": 55 }, { "advantage_max": 1.1939893886446953, "advantage_mean": 1.8626444830971423e-09, "advantage_min": -1.2510383129119873, "advantage_std": 0.9998167455196381, "completion_length": 3025.2291870117188, "epoch": 0.064, "grad_norm": 0.1891845017671585, "kl": 0.0001361072063446045, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.10598282422870398, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10598282422870398, "reward_after_std": 0.7244430258870125, "reward_before_mean": 0.19185005594044924, "reward_before_std": 0.7621579542756081, "reward_change_max": 0.0002643391489982605, "reward_change_mean": -0.08586725732311606, "reward_change_min": -0.17667766101658344, "reward_change_std": 0.0800751845818013, "reward_std": 0.7244430519640446, "rewards/cosine_scaled_reward": -0.09157497808337212, "rewards/format_reward": 0.3750000074505806, "step": 56 }, { "advantage_max": 1.4618928134441376, "advantage_mean": 2.607703264434491e-08, "advantage_min": -1.0425853356719017, "advantage_std": 0.9997790455818176, "completion_length": 3318.625030517578, "epoch": 0.06514285714285714, "grad_norm": 0.12933112680912018, "kl": 5.304068326950073e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.21856810012832284, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.21856810012832284, "reward_after_std": 0.5995266698300838, "reward_before_mean": -0.16476159170269966, "reward_before_std": 0.6078551784157753, "reward_change_max": 0.00038677453994750977, "reward_change_mean": -0.05380649887956679, "reward_change_min": -0.12232645880430937, "reward_change_std": 0.04918327531777322, "reward_std": 0.5995266884565353, "rewards/cosine_scaled_reward": -0.22821413166821003, "rewards/format_reward": 0.2916666753590107, "step": 57 }, { "advantage_max": 1.490548200905323, "advantage_mean": 1.800557053455165e-08, "advantage_min": -1.1940131336450577, "advantage_std": 0.9998381435871124, "completion_length": 2338.750045776367, "epoch": 0.06628571428571428, "grad_norm": 0.20656853914260864, "kl": 0.0009194463491439819, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.5056098848581314, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5056098848581314, "reward_after_std": 0.8406195119023323, "reward_before_mean": 0.6206929110921919, "reward_before_std": 0.8440921474248171, "reward_change_max": 6.683915853500366e-05, "reward_change_mean": -0.11508298618718982, "reward_change_min": -0.21354464441537857, "reward_change_std": 0.08731875498779118, "reward_std": 0.8406195193529129, "rewards/cosine_scaled_reward": -0.012570214690640569, "rewards/format_reward": 0.6458333469927311, "step": 58 }, { "advantage_max": 1.4613443687558174, "advantage_mean": -2.5145709625640222e-08, "advantage_min": -1.219063676893711, "advantage_std": 0.9997749403119087, "completion_length": 2888.750030517578, "epoch": 0.06742857142857143, "grad_norm": 0.15128706395626068, "kl": 7.972121238708496e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.15974653512239456, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15974653512239456, "reward_after_std": 0.6801816318184137, "reward_before_mean": 0.2470409832894802, "reward_before_std": 0.6890074703842402, "reward_change_max": 0.0, "reward_change_mean": -0.08729447645600885, "reward_change_min": -0.16269752010703087, "reward_change_std": 0.06480636191554368, "reward_std": 0.680181659758091, "rewards/cosine_scaled_reward": -0.053562849294394255, "rewards/format_reward": 0.35416667722165585, "step": 59 }, { "advantage_max": 1.449000284075737, "advantage_mean": 3.973643103449831e-08, "advantage_min": -1.0444062128663063, "advantage_std": 0.9997691512107849, "completion_length": 2974.312530517578, "epoch": 0.06857142857142857, "grad_norm": 0.17451316118240356, "kl": 0.00019010156393051147, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.003677740693092346, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.003677740693092346, "reward_after_std": 0.6160192638635635, "reward_before_mean": 0.07069200649857521, "reward_before_std": 0.6240869630128145, "reward_change_max": 0.0004219040274620056, "reward_change_mean": -0.0743697372963652, "reward_change_min": -0.14743295591324568, "reward_change_std": 0.061072568874806166, "reward_std": 0.6160192675888538, "rewards/cosine_scaled_reward": -0.14173733163625002, "rewards/format_reward": 0.3541666679084301, "step": 60 }, { "advantage_max": 1.4675796553492546, "advantage_mean": 1.6763807009212428e-08, "advantage_min": -1.1424203887581825, "advantage_std": 0.9997835755348206, "completion_length": 2999.4375610351562, "epoch": 0.06971428571428571, "grad_norm": 0.15803340077400208, "kl": 0.000301167368888855, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.13608455285429955, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13608455285429955, "reward_after_std": 0.7962324041873217, "reward_before_mean": 0.21927554439753294, "reward_before_std": 0.8183390758931637, "reward_change_max": 7.96392560005188e-05, "reward_change_mean": -0.08319097117055207, "reward_change_min": -0.17873274348676205, "reward_change_std": 0.07595815474633127, "reward_std": 0.7962324265390635, "rewards/cosine_scaled_reward": -0.11952891387045383, "rewards/format_reward": 0.4583333395421505, "step": 61 }, { "advantage_max": 1.6056253165006638, "advantage_mean": -1.0865429223017031e-08, "advantage_min": -0.9415153115987778, "advantage_std": 0.9997888281941414, "completion_length": 2589.770896911621, "epoch": 0.07085714285714285, "grad_norm": 0.22276122868061066, "kl": 0.0006796866655349731, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.35923552978783846, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35923552978783846, "reward_after_std": 0.6406212784349918, "reward_before_mean": 0.4651615908369422, "reward_before_std": 0.6320403479039669, "reward_change_max": 0.0006706267595291138, "reward_change_mean": -0.10592610284220427, "reward_change_min": -0.19946615397930145, "reward_change_std": 0.07461804489139467, "reward_std": 0.6406213119626045, "rewards/cosine_scaled_reward": -0.027835868299007416, "rewards/format_reward": 0.520833333954215, "step": 62 }, { "advantage_max": 1.5056186392903328, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -1.066870667040348, "advantage_std": 0.9998953640460968, "completion_length": 2393.5208587646484, "epoch": 0.072, "grad_norm": 0.19597779214382172, "kl": 0.0006195306777954102, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.7143217946140794, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.7143217946140794, "reward_after_std": 1.1465397253632545, "reward_before_mean": 0.8397536515258253, "reward_before_std": 1.1638507843017578, "reward_change_max": 6.40377402305603e-05, "reward_change_mean": -0.12543187430128455, "reward_change_min": -0.25196870043873787, "reward_change_std": 0.1010968410409987, "reward_std": 1.1465397775173187, "rewards/cosine_scaled_reward": 0.08654349111020565, "rewards/format_reward": 0.6666666772216558, "step": 63 }, { "advantage_max": 1.208019107580185, "advantage_mean": -1.2417635031347629e-08, "advantage_min": -1.371815674006939, "advantage_std": 0.9997898116707802, "completion_length": 2820.354202270508, "epoch": 0.07314285714285715, "grad_norm": 0.1696338802576065, "kl": 0.00028708577156066895, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.304183728992939, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.304183728992939, "reward_after_std": 0.7283022552728653, "reward_before_mean": 0.4062739387154579, "reward_before_std": 0.7515294570475817, "reward_change_max": 0.00043839961290359497, "reward_change_mean": -0.10209022578783333, "reward_change_min": -0.19523747824132442, "reward_change_std": 0.08049859874881804, "reward_std": 0.7283022850751877, "rewards/cosine_scaled_reward": -0.015613039955496788, "rewards/format_reward": 0.4375000149011612, "step": 64 }, { "advantage_max": 1.6743332594633102, "advantage_mean": 1.862645232497684e-08, "advantage_min": -0.969209760427475, "advantage_std": 0.9997278153896332, "completion_length": 2733.708354949951, "epoch": 0.07428571428571429, "grad_norm": 0.20424288511276245, "kl": 0.0002609342336654663, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.15900675393640995, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15900675393640995, "reward_after_std": 0.6618925724178553, "reward_before_mean": 0.24430988542735577, "reward_before_std": 0.6556311100721359, "reward_change_max": 0.00014291703701019287, "reward_change_mean": -0.08530310750938952, "reward_change_min": -0.16443136800080538, "reward_change_std": 0.061483539175242186, "reward_std": 0.6618925910443068, "rewards/cosine_scaled_reward": -0.07576173637062311, "rewards/format_reward": 0.39583333767950535, "step": 65 }, { "advantage_max": 1.4976499304175377, "advantage_mean": -6.022552873075071e-08, "advantage_min": -1.0167640447616577, "advantage_std": 0.9998143464326859, "completion_length": 2099.729175567627, "epoch": 0.07542857142857143, "grad_norm": 0.24778138101100922, "kl": 0.00030663609504699707, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.38356152176856995, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.38356152176856995, "reward_after_std": 0.6583958007395267, "reward_before_mean": 0.49129557237029076, "reward_before_std": 0.6529230363667011, "reward_change_max": 0.00029415637254714966, "reward_change_mean": -0.10773406224325299, "reward_change_min": -0.18134311586618423, "reward_change_std": 0.07149266311898828, "reward_std": 0.6583958119153976, "rewards/cosine_scaled_reward": -0.014768877997994423, "rewards/format_reward": 0.520833333954215, "step": 66 }, { "advantage_max": 1.3264884650707245, "advantage_mean": 4.6566129618952345e-08, "advantage_min": -1.1300052106380463, "advantage_std": 0.9996982514858246, "completion_length": 3386.4166717529297, "epoch": 0.07657142857142857, "grad_norm": 0.13453011214733124, "kl": 0.00039284047670662403, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.4645117961335927, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4645117961335927, "reward_after_std": 0.36321923695504665, "reward_before_mean": -0.42523948568850756, "reward_before_std": 0.37532838620245457, "reward_change_max": 0.00018453598022460938, "reward_change_mean": -0.03927231300622225, "reward_change_min": -0.08630170952528715, "reward_change_std": 0.036291120108217, "reward_std": 0.36321924813091755, "rewards/cosine_scaled_reward": -0.2542864102870226, "rewards/format_reward": 0.0833333358168602, "step": 67 }, { "advantage_max": 1.334203988313675, "advantage_mean": -2.7939686120248552e-09, "advantage_min": -1.3702456429600716, "advantage_std": 0.9997404292225838, "completion_length": 1854.4167098999023, "epoch": 0.07771428571428571, "grad_norm": 0.27549850940704346, "kl": 0.0012511014938354492, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": 0.4696199508616701, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.4696199508616701, "reward_after_std": 0.6607894655317068, "reward_before_mean": 0.5879831919446588, "reward_before_std": 0.6720792315900326, "reward_change_max": 0.0, "reward_change_mean": -0.11836327239871025, "reward_change_min": -0.20884494576603174, "reward_change_std": 0.08378471713513136, "reward_std": 0.660789493471384, "rewards/cosine_scaled_reward": -0.028925069607794285, "rewards/format_reward": 0.6458333358168602, "step": 68 }, { "advantage_max": 1.56258724629879, "advantage_mean": 1.8626451381287268e-08, "advantage_min": -1.1549096181988716, "advantage_std": 0.9997214451432228, "completion_length": 2427.2708587646484, "epoch": 0.07885714285714286, "grad_norm": 0.26802805066108704, "kl": 0.0012503266334533691, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.960469931131936e-07, "loss": 0.0001, "reward": -0.025797476526349783, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.025797476526349783, "reward_after_std": 0.5156911239027977, "reward_before_mean": 0.046627337113022804, "reward_before_std": 0.5075650177896023, "reward_change_max": 0.00010112673044204712, "reward_change_mean": -0.07242480898275971, "reward_change_min": -0.12345962692052126, "reward_change_std": 0.04638163233175874, "reward_std": 0.5156911425292492, "rewards/cosine_scaled_reward": -0.20585301099345088, "rewards/format_reward": 0.45833333395421505, "step": 69 }, { "advantage_max": 1.4540487378835678, "advantage_mean": 3.4148494476582414e-08, "advantage_min": -1.126408912241459, "advantage_std": 0.9997123554348946, "completion_length": 3066.8333740234375, "epoch": 0.08, "grad_norm": 0.21717911958694458, "kl": 0.0016925111413002014, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.956206309337066e-07, "loss": 0.0001, "reward": -0.06509184092283249, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06509184092283249, "reward_after_std": 0.5979281403124332, "reward_before_mean": 0.003177594393491745, "reward_before_std": 0.6060793250799179, "reward_change_max": 0.00020104646682739258, "reward_change_mean": -0.06826943415217102, "reward_change_min": -0.1556295258924365, "reward_change_std": 0.05811656138394028, "reward_std": 0.5979281663894653, "rewards/cosine_scaled_reward": -0.1546612000092864, "rewards/format_reward": 0.3125000074505806, "step": 70 }, { "advantage_max": 1.4608792811632156, "advantage_mean": 3.725289798861553e-09, "advantage_min": -1.152649886906147, "advantage_std": 0.9997194185853004, "completion_length": 2654.1041946411133, "epoch": 0.08114285714285714, "grad_norm": 0.1895046830177307, "kl": 0.0008933022618293762, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": 0.2874251026660204, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2874251026660204, "reward_after_std": 0.5808608923107386, "reward_before_mean": 0.3901416026055813, "reward_before_std": 0.5837310794740915, "reward_change_max": 0.0, "reward_change_mean": -0.10271649085916579, "reward_change_min": -0.1831564288586378, "reward_change_std": 0.06829000089783221, "reward_std": 0.580860897898674, "rewards/cosine_scaled_reward": -0.023679209873080254, "rewards/format_reward": 0.4375000074505806, "step": 71 }, { "advantage_max": 1.3443211615085602, "advantage_mean": -3.104409007637088e-09, "advantage_min": -1.3195882812142372, "advantage_std": 0.9997728690505028, "completion_length": 2894.9583740234375, "epoch": 0.08228571428571428, "grad_norm": 0.23933842778205872, "kl": 0.000815272331237793, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": -0.11035427264869213, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.11035427264869213, "reward_after_std": 0.5387446023523808, "reward_before_mean": -0.04381885752081871, "reward_before_std": 0.5496399514377117, "reward_change_max": 0.00013368576765060425, "reward_change_mean": -0.0665354230441153, "reward_change_min": -0.12944668717682362, "reward_change_std": 0.05370501568540931, "reward_std": 0.5387446247041225, "rewards/cosine_scaled_reward": -0.18857610132545233, "rewards/format_reward": 0.33333334140479565, "step": 72 }, { "advantage_max": 1.3321927040815353, "advantage_mean": 4.4703484247676784e-08, "advantage_min": -1.1550491526722908, "advantage_std": 0.9997767359018326, "completion_length": 3468.6666870117188, "epoch": 0.08342857142857144, "grad_norm": 0.1596253216266632, "kl": 0.00013802945613861084, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.16815321380272508, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": -0.16815321380272508, "reward_after_std": 0.7371582835912704, "reward_before_mean": -0.11063026264309883, "reward_before_std": 0.7664970513433218, "reward_change_max": 0.00023616105318069458, "reward_change_mean": -0.05752295721322298, "reward_change_min": -0.14209103304892778, "reward_change_std": 0.06510339491069317, "reward_std": 0.7371583022177219, "rewards/cosine_scaled_reward": -0.1386484676040709, "rewards/format_reward": 0.1666666679084301, "step": 73 }, { "advantage_max": 1.6054811775684357, "advantage_mean": -9.93410742555767e-09, "advantage_min": -0.9970987290143967, "advantage_std": 0.9997713267803192, "completion_length": 3130.3750610351562, "epoch": 0.08457142857142858, "grad_norm": 0.16316719353199005, "kl": 0.0006289742887020111, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": 0.04533570492640138, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.04533570492640138, "reward_after_std": 0.6655574645847082, "reward_before_mean": 0.12103927996940911, "reward_before_std": 0.6674526929855347, "reward_change_max": 0.0002670586109161377, "reward_change_mean": -0.07570357341319323, "reward_change_min": -0.1453123176470399, "reward_change_std": 0.058132898062467575, "reward_std": 0.6655574906617403, "rewards/cosine_scaled_reward": -0.0748970415443182, "rewards/format_reward": 0.27083333767950535, "step": 74 }, { "advantage_max": 1.3084037005901337, "advantage_mean": 2.1730860721991263e-08, "advantage_min": -1.0659999400377274, "advantage_std": 0.9998039081692696, "completion_length": 3022.291702270508, "epoch": 0.08571428571428572, "grad_norm": 0.15364454686641693, "kl": 0.0014982819557189941, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.931634888554935e-07, "loss": 0.0001, "reward": 0.3326728269457817, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.3326728269457817, "reward_after_std": 0.6384584531188011, "reward_before_mean": 0.43834810703992844, "reward_before_std": 0.6464258171617985, "reward_change_max": 0.0001539289951324463, "reward_change_mean": -0.1056752463337034, "reward_change_min": -0.19835533201694489, "reward_change_std": 0.07731911540031433, "reward_std": 0.6384584605693817, "rewards/cosine_scaled_reward": 0.042090704664587975, "rewards/format_reward": 0.35416666977107525, "step": 75 }, { "advantage_max": 1.4036643505096436, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -1.2448914647102356, "advantage_std": 0.999701626598835, "completion_length": 2984.4583892822266, "epoch": 0.08685714285714285, "grad_norm": 0.17103531956672668, "kl": 0.00016760081052780151, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.15381433628499508, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15381433628499508, "reward_after_std": 0.4996340088546276, "reward_before_mean": -0.09012758638709784, "reward_before_std": 0.5069925468415022, "reward_change_max": 0.00010403245687484741, "reward_change_mean": -0.0636867480352521, "reward_change_min": -0.132554329931736, "reward_change_std": 0.052232020534574986, "reward_std": 0.4996340125799179, "rewards/cosine_scaled_reward": -0.22214713506400585, "rewards/format_reward": 0.3541666679084301, "step": 76 }, { "advantage_max": 1.5948710143566132, "advantage_mean": 6.829698806498641e-09, "advantage_min": -1.0013544484972954, "advantage_std": 0.9997521862387657, "completion_length": 3199.875030517578, "epoch": 0.088, "grad_norm": 0.14091943204402924, "kl": 0.00019103288650512695, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": 0.07242773100733757, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07242773100733757, "reward_after_std": 0.460830919444561, "reward_before_mean": 0.15626836940646172, "reward_before_std": 0.44724370911717415, "reward_change_max": 9.12696123123169e-06, "reward_change_mean": -0.08384064945857972, "reward_change_min": -0.13726599793881178, "reward_change_std": 0.05241412605391815, "reward_std": 0.4608309231698513, "rewards/cosine_scaled_reward": -0.08853249228559434, "rewards/format_reward": 0.33333333395421505, "step": 77 }, { "advantage_max": 1.372763067483902, "advantage_mean": 2.8870999813079834e-08, "advantage_min": -1.1454579532146454, "advantage_std": 0.9997924268245697, "completion_length": 2975.812530517578, "epoch": 0.08914285714285715, "grad_norm": 0.15570473670959473, "kl": 0.0012784861028194427, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.91429819907136e-07, "loss": 0.0001, "reward": 0.16491154581308365, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.16491154581308365, "reward_after_std": 0.605958666652441, "reward_before_mean": 0.25589458271861076, "reward_before_std": 0.6145712472498417, "reward_change_max": 0.00023362040519714355, "reward_change_mean": -0.09098305949009955, "reward_change_min": -0.17649622447788715, "reward_change_std": 0.06956856162287295, "reward_std": 0.6059587113559246, "rewards/cosine_scaled_reward": -0.03871938120573759, "rewards/format_reward": 0.33333333395421505, "step": 78 }, { "advantage_max": 1.5386330038309097, "advantage_mean": -9.934107647602275e-09, "advantage_min": -1.075737252831459, "advantage_std": 0.9998145774006844, "completion_length": 2298.500030517578, "epoch": 0.09028571428571429, "grad_norm": 0.21232455968856812, "kl": 0.0015820115804672241, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.908088623197048e-07, "loss": 0.0001, "reward": 0.3763896021991968, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3763896021991968, "reward_after_std": 0.719805970788002, "reward_before_mean": 0.4805713724344969, "reward_before_std": 0.7157487347722054, "reward_change_max": 0.00013425201177597046, "reward_change_mean": -0.10418177908286452, "reward_change_min": -0.20248535182327032, "reward_change_std": 0.07452570833265781, "reward_std": 0.7198059931397438, "rewards/cosine_scaled_reward": -0.06179766240529716, "rewards/format_reward": 0.6041666697710752, "step": 79 }, { "advantage_max": 1.529449224472046, "advantage_mean": 2.9181441485448545e-08, "advantage_min": -1.0704481154680252, "advantage_std": 0.9997870698571205, "completion_length": 3156.6875610351562, "epoch": 0.09142857142857143, "grad_norm": 0.20058980584144592, "kl": 0.0004749912768602371, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.10771899670362473, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.10771899670362473, "reward_after_std": 0.6453639008104801, "reward_before_mean": -0.044033898040652275, "reward_before_std": 0.6592339612543583, "reward_change_max": 0.0, "reward_change_mean": -0.06368510669562966, "reward_change_min": -0.13146696891635656, "reward_change_std": 0.05567331635393202, "reward_std": 0.6453639306128025, "rewards/cosine_scaled_reward": -0.16785027831792831, "rewards/format_reward": 0.291666679084301, "step": 80 }, { "advantage_max": 1.2946752682328224, "advantage_mean": 3.973643103449831e-08, "advantage_min": -1.3097454234957695, "advantage_std": 0.9997820854187012, "completion_length": 3027.062530517578, "epoch": 0.09257142857142857, "grad_norm": 0.26107150316238403, "kl": 0.0017225146293640137, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.09596222266554832, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.09596222266554832, "reward_after_std": 0.6142298579216003, "reward_before_mean": -0.030082307755947113, "reward_before_std": 0.6317219771444798, "reward_change_max": 0.00025378912687301636, "reward_change_mean": -0.06587989977560937, "reward_change_min": -0.14147170074284077, "reward_change_std": 0.05747503787279129, "reward_std": 0.6142298653721809, "rewards/cosine_scaled_reward": -0.16087449342012405, "rewards/format_reward": 0.291666679084301, "step": 81 }, { "advantage_max": 1.4243344590067863, "advantage_mean": -6.208817460162663e-09, "advantage_min": -1.1731738597154617, "advantage_std": 0.9997696131467819, "completion_length": 2904.520851135254, "epoch": 0.09371428571428571, "grad_norm": 0.22400455176830292, "kl": 0.0046500712633132935, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.888172094375033e-07, "loss": 0.0002, "reward": 0.29460637643933296, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29460637643933296, "reward_after_std": 0.6816530674695969, "reward_before_mean": 0.39471880346536636, "reward_before_std": 0.6884325593709946, "reward_change_max": 9.835511445999146e-05, "reward_change_mean": -0.10011240118183196, "reward_change_min": -0.17668104264885187, "reward_change_std": 0.06958513834979385, "reward_std": 0.6816530898213387, "rewards/cosine_scaled_reward": 0.009859389916528016, "rewards/format_reward": 0.3750000037252903, "step": 82 }, { "advantage_max": 1.6047600284218788, "advantage_mean": -1.3504177553969043e-08, "advantage_min": -1.2123665064573288, "advantage_std": 0.9998186007142067, "completion_length": 2714.4791870117188, "epoch": 0.09485714285714286, "grad_norm": 0.2228856235742569, "kl": 0.0006010010838508606, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": 0.2101850677281618, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2101850677281618, "reward_after_std": 0.7800624221563339, "reward_before_mean": 0.29517440497875214, "reward_before_std": 0.7766251862049103, "reward_change_max": 7.58543610572815e-05, "reward_change_mean": -0.0849893398117274, "reward_change_min": -0.15232862625271082, "reward_change_std": 0.06034258124418557, "reward_std": 0.7800624407827854, "rewards/cosine_scaled_reward": -0.060746138100512326, "rewards/format_reward": 0.4166666679084301, "step": 83 }, { "advantage_max": 1.3094022646546364, "advantage_mean": 1.3659398279131096e-08, "advantage_min": -1.1910891830921173, "advantage_std": 0.9997829273343086, "completion_length": 3066.6875228881836, "epoch": 0.096, "grad_norm": 0.17936787009239197, "kl": 0.0003447532653808594, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.2950674742460251, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2950674742460251, "reward_after_std": 0.8434533104300499, "reward_before_mean": 0.39375803247094154, "reward_before_std": 0.8733946792781353, "reward_change_max": 8.602440357208252e-05, "reward_change_mean": -0.09869056491879746, "reward_change_min": -0.2063098233193159, "reward_change_std": 0.0874484230298549, "reward_std": 0.8434533290565014, "rewards/cosine_scaled_reward": 0.009379002032801509, "rewards/format_reward": 0.3750000074505806, "step": 84 }, { "advantage_max": 1.6304249167442322, "advantage_mean": 2.4835269951672956e-09, "advantage_min": -1.043467827141285, "advantage_std": 0.9997950419783592, "completion_length": 3137.0208740234375, "epoch": 0.09714285714285714, "grad_norm": 0.14530478417873383, "kl": 0.00024145841598510742, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": 0.005932248197495937, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.005932248197495937, "reward_after_std": 0.754243042320013, "reward_before_mean": 0.07403266115579754, "reward_before_std": 0.7538498565554619, "reward_change_max": 0.00023034214973449707, "reward_change_mean": -0.06810041260905564, "reward_change_min": -0.15122622810304165, "reward_change_std": 0.05614574020728469, "reward_std": 0.754243042320013, "rewards/cosine_scaled_reward": -0.140067002736032, "rewards/format_reward": 0.3541666753590107, "step": 85 }, { "advantage_max": 1.32743688672781, "advantage_mean": 1.2417634920325327e-08, "advantage_min": -1.3129910230636597, "advantage_std": 0.9997938498854637, "completion_length": 2772.020851135254, "epoch": 0.09828571428571428, "grad_norm": 0.19045515358448029, "kl": 0.001046299934387207, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.2901347801089287, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.2901347801089287, "reward_after_std": 0.6530134379863739, "reward_before_mean": 0.3909718096256256, "reward_before_std": 0.6593408472836018, "reward_change_max": 0.0, "reward_change_mean": -0.1008370304480195, "reward_change_min": -0.1854435782879591, "reward_change_std": 0.0712063885293901, "reward_std": 0.6530134417116642, "rewards/cosine_scaled_reward": -0.02326410636305809, "rewards/format_reward": 0.4375000074505806, "step": 86 }, { "advantage_max": 1.4831641167402267, "advantage_mean": -3.973643103449831e-08, "advantage_min": -1.1471149325370789, "advantage_std": 0.9998367726802826, "completion_length": 2631.0208740234375, "epoch": 0.09942857142857142, "grad_norm": 0.19404949247837067, "kl": 0.0006144046783447266, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "reward": 0.41755142249166965, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.41755142249166965, "reward_after_std": 0.858353029936552, "reward_before_mean": 0.5248608682304621, "reward_before_std": 0.8716229237616062, "reward_change_max": 5.677342414855957e-06, "reward_change_mean": -0.10730946669355035, "reward_change_min": -0.20732402987778187, "reward_change_std": 0.08647632226347923, "reward_std": 0.858353067189455, "rewards/cosine_scaled_reward": -0.018819569377228618, "rewards/format_reward": 0.5625000093132257, "step": 87 }, { "advantage_max": 1.4660617038607597, "advantage_mean": -7.140139923755839e-09, "advantage_min": -1.3561210632324219, "advantage_std": 0.9998756051063538, "completion_length": 2669.541732788086, "epoch": 0.10057142857142858, "grad_norm": 0.1991894394159317, "kl": 0.0010406449437141418, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.8425742251254e-07, "loss": 0.0, "reward": 0.5971436947584152, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5971436947584152, "reward_after_std": 0.9840230047702789, "reward_before_mean": 0.7174625433981419, "reward_before_std": 1.0027831830084324, "reward_change_max": 5.602836608886719e-05, "reward_change_mean": -0.12031882908195257, "reward_change_min": -0.22533655166625977, "reward_change_std": 0.09558681072667241, "reward_std": 0.9840230494737625, "rewards/cosine_scaled_reward": 0.05664793308824301, "rewards/format_reward": 0.6041666865348816, "step": 88 }, { "advantage_max": 1.3210382387042046, "advantage_mean": 1.6142924885720333e-08, "advantage_min": -1.2246809154748917, "advantage_std": 0.9998392388224602, "completion_length": 2891.8958740234375, "epoch": 0.10171428571428572, "grad_norm": 0.20064914226531982, "kl": 0.0010943412780761719, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": 0.2721131080761552, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2721131080761552, "reward_after_std": 0.9016778543591499, "reward_before_mean": 0.36736097862012684, "reward_before_std": 0.9379100240767002, "reward_change_max": 9.276717901229858e-05, "reward_change_mean": -0.09524787031114101, "reward_change_min": -0.197541574947536, "reward_change_std": 0.08959966944530606, "reward_std": 0.9016778841614723, "rewards/cosine_scaled_reward": -0.014236186631023884, "rewards/format_reward": 0.3958333432674408, "step": 89 }, { "advantage_max": 1.4915196597576141, "advantage_mean": -8.692343844707295e-09, "advantage_min": -1.0883212387561798, "advantage_std": 0.999758742749691, "completion_length": 2403.3541870117188, "epoch": 0.10285714285714286, "grad_norm": 0.2652307152748108, "kl": 0.001546025276184082, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": -0.029102535918354988, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.029102535918354988, "reward_after_std": 0.5665853600949049, "reward_before_mean": 0.041870216839015484, "reward_before_std": 0.5661425106227398, "reward_change_max": 0.00011952966451644897, "reward_change_mean": -0.07097276439890265, "reward_change_min": -0.13266034051775932, "reward_change_std": 0.05150189925916493, "reward_std": 0.5665853675454855, "rewards/cosine_scaled_reward": -0.22906489111483097, "rewards/format_reward": 0.5000000055879354, "step": 90 }, { "advantage_max": 1.3845188915729523, "advantage_mean": 8.692343955729598e-09, "advantage_min": -1.2692686691880226, "advantage_std": 0.9997860342264175, "completion_length": 3016.2083587646484, "epoch": 0.104, "grad_norm": 0.17810925841331482, "kl": 0.0007269233465194702, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": 0.19745041709393263, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19745041709393263, "reward_after_std": 0.740229532122612, "reward_before_mean": 0.28715684451162815, "reward_before_std": 0.7560277283191681, "reward_change_max": 0.0003509148955345154, "reward_change_mean": -0.08970644162036479, "reward_change_min": -0.16745029855519533, "reward_change_std": 0.07019369956105947, "reward_std": 0.7402295880019665, "rewards/cosine_scaled_reward": -0.05433825249201618, "rewards/format_reward": 0.39583334885537624, "step": 91 }, { "advantage_max": 1.495618462562561, "advantage_mean": -1.614292477469803e-08, "advantage_min": -1.1609731614589691, "advantage_std": 0.9998172298073769, "completion_length": 2539.1458892822266, "epoch": 0.10514285714285715, "grad_norm": 0.2165587693452835, "kl": 0.0017741471529006958, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": 0.377059874124825, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.377059874124825, "reward_after_std": 0.7129791099578142, "reward_before_mean": 0.4834042700531427, "reward_before_std": 0.7142429128289223, "reward_change_max": 9.88692045211792e-05, "reward_change_mean": -0.10634440556168556, "reward_change_min": -0.1946062110364437, "reward_change_std": 0.07394323078915477, "reward_std": 0.7129791136831045, "rewards/cosine_scaled_reward": -0.0603812150657177, "rewards/format_reward": 0.6041666734963655, "step": 92 }, { "advantage_max": 1.4186953604221344, "advantage_mean": 2.8871000368191346e-08, "advantage_min": -1.1192193031311035, "advantage_std": 0.9997008293867111, "completion_length": 3446.8958740234375, "epoch": 0.10628571428571429, "grad_norm": 0.17092525959014893, "kl": 0.0012556910514831543, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.4768864205107093, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": -0.4768864205107093, "reward_after_std": 0.4022520687431097, "reward_before_mean": -0.4405721053481102, "reward_before_std": 0.4141699206084013, "reward_change_max": 0.0004946738481521606, "reward_change_mean": -0.03631432238034904, "reward_change_min": -0.08117994200438261, "reward_change_std": 0.03509921010117978, "reward_std": 0.4022520836442709, "rewards/cosine_scaled_reward": -0.26195271871984005, "rewards/format_reward": 0.0833333358168602, "step": 93 }, { "advantage_max": 1.506947785615921, "advantage_mean": -1.6763807453301638e-08, "advantage_min": -1.0155241936445236, "advantage_std": 0.999764122068882, "completion_length": 2987.7291870117188, "epoch": 0.10742857142857143, "grad_norm": 0.1818607598543167, "kl": 0.0012969821691513062, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": -0.0261713950894773, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0261713950894773, "reward_after_std": 0.539334274828434, "reward_before_mean": 0.046400152146816254, "reward_before_std": 0.5371452905237675, "reward_change_max": 0.0005335435271263123, "reward_change_mean": -0.07257156854029745, "reward_change_min": -0.13221831247210503, "reward_change_std": 0.0521798743866384, "reward_std": 0.5393342785537243, "rewards/cosine_scaled_reward": -0.11221659136936069, "rewards/format_reward": 0.27083333395421505, "step": 94 }, { "advantage_max": 1.4179429858922958, "advantage_mean": 2.5456151686586992e-08, "advantage_min": -1.117660902440548, "advantage_std": 0.9998129159212112, "completion_length": 3445.9166870117188, "epoch": 0.10857142857142857, "grad_norm": 0.15445290505886078, "kl": 0.00019112974405288696, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.17061306349933147, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17061306349933147, "reward_after_std": 0.7453450746834278, "reward_before_mean": -0.11620625574141741, "reward_before_std": 0.7634413428604603, "reward_change_max": 0.00022859126329421997, "reward_change_mean": -0.05440680589526892, "reward_change_min": -0.1367551926523447, "reward_change_std": 0.05708932294510305, "reward_std": 0.745345089584589, "rewards/cosine_scaled_reward": -0.14143646706361324, "rewards/format_reward": 0.1666666716337204, "step": 95 }, { "advantage_max": 1.4602340012788773, "advantage_mean": 6.208817904251873e-10, "advantage_min": -1.1855292618274689, "advantage_std": 0.9998244643211365, "completion_length": 2690.7083587646484, "epoch": 0.10971428571428571, "grad_norm": 0.22608739137649536, "kl": 0.0011289715766906738, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.769942052400235e-07, "loss": 0.0, "reward": 0.38900233432650566, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38900233432650566, "reward_after_std": 0.7013615928590298, "reward_before_mean": 0.49687390495091677, "reward_before_std": 0.7065553367137909, "reward_change_max": 0.00010736286640167236, "reward_change_mean": -0.10787154478020966, "reward_change_min": -0.19875967130064964, "reward_change_std": 0.07827508868649602, "reward_std": 0.7013616152107716, "rewards/cosine_scaled_reward": 0.02968693384900689, "rewards/format_reward": 0.4375000074505806, "step": 96 }, { "advantage_max": 1.4047540798783302, "advantage_mean": -4.346173199110126e-09, "advantage_min": -1.2560274973511696, "advantage_std": 0.9997810050845146, "completion_length": 2811.1458892822266, "epoch": 0.11085714285714286, "grad_norm": 0.16837795078754425, "kl": 0.001363903284072876, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.759921670520634e-07, "loss": 0.0001, "reward": 0.3484401609748602, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3484401609748602, "reward_after_std": 0.5126909576356411, "reward_before_mean": 0.4589174911379814, "reward_before_std": 0.5126852281391621, "reward_change_max": 0.00011277198791503906, "reward_change_mean": -0.1104773310944438, "reward_change_min": -0.1841278411448002, "reward_change_std": 0.07085376582108438, "reward_std": 0.5126909799873829, "rewards/cosine_scaled_reward": -0.010124601423740387, "rewards/format_reward": 0.47916668094694614, "step": 97 }, { "advantage_max": 1.5295169353485107, "advantage_mean": 3.725290742551124e-09, "advantage_min": -1.2989156991243362, "advantage_std": 0.9997530430555344, "completion_length": 2568.416717529297, "epoch": 0.112, "grad_norm": 0.17439162731170654, "kl": 0.0003739595413208008, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": 0.26688409969210625, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26688409969210625, "reward_after_std": 0.45362649485468864, "reward_before_mean": 0.3693796396255493, "reward_before_std": 0.438557505607605, "reward_change_max": 0.0001835152506828308, "reward_change_mean": -0.10249554133042693, "reward_change_min": -0.16221447475254536, "reward_change_std": 0.06244846456684172, "reward_std": 0.45362651348114014, "rewards/cosine_scaled_reward": -0.0965601853094995, "rewards/format_reward": 0.5625000074505806, "step": 98 }, { "advantage_max": 1.59602090716362, "advantage_mean": 9.002784850942191e-09, "advantage_min": -1.0146455764770508, "advantage_std": 0.9997052848339081, "completion_length": 2898.1458587646484, "epoch": 0.11314285714285714, "grad_norm": 0.20550180971622467, "kl": 0.0007301568984985352, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": -0.011777647770941257, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.011777647770941257, "reward_after_std": 0.5849963650107384, "reward_before_mean": 0.059426740277558565, "reward_before_std": 0.5733112944290042, "reward_change_max": 7.230788469314575e-05, "reward_change_mean": -0.07120441918959841, "reward_change_min": -0.11866667028516531, "reward_change_std": 0.04729985597077757, "reward_std": 0.5849963836371899, "rewards/cosine_scaled_reward": -0.10570329218171537, "rewards/format_reward": 0.2708333395421505, "step": 99 }, { "advantage_max": 1.4574847668409348, "advantage_mean": 2.7318797668485217e-08, "advantage_min": -1.1436650529503822, "advantage_std": 0.9998161122202873, "completion_length": 2717.0625610351562, "epoch": 0.11428571428571428, "grad_norm": 0.19857947528362274, "kl": 0.0009911060333251953, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.3121153700631112, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3121153700631112, "reward_after_std": 0.7690652385354042, "reward_before_mean": 0.41096168756484985, "reward_before_std": 0.7780533917248249, "reward_change_max": 6.656348705291748e-05, "reward_change_mean": -0.09884630842134356, "reward_change_min": -0.1944613279774785, "reward_change_std": 0.07431896426714957, "reward_std": 0.7690652459859848, "rewards/cosine_scaled_reward": -0.023685835301876068, "rewards/format_reward": 0.45833334885537624, "step": 100 }, { "advantage_max": 1.4642290025949478, "advantage_mean": 9.934107758624577e-09, "advantage_min": -1.23203906416893, "advantage_std": 0.9997976124286652, "completion_length": 2641.625, "epoch": 0.11542857142857142, "grad_norm": 0.20073696970939636, "kl": 0.0007115602493286133, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": 0.13412395305931568, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.13412395305931568, "reward_after_std": 0.6262147203087807, "reward_before_mean": 0.21931741666048765, "reward_before_std": 0.6254003196954727, "reward_change_max": 0.0007505491375923157, "reward_change_mean": -0.08519347733817995, "reward_change_min": -0.15299372747540474, "reward_change_std": 0.0630116555839777, "reward_std": 0.6262147352099419, "rewards/cosine_scaled_reward": -0.09867463074624538, "rewards/format_reward": 0.41666667722165585, "step": 101 }, { "advantage_max": 1.5872334390878677, "advantage_mean": -1.8936891610366047e-08, "advantage_min": -1.1092576533555984, "advantage_std": 0.9997419193387032, "completion_length": 2135.000045776367, "epoch": 0.11657142857142858, "grad_norm": 0.2481817603111267, "kl": 0.001399993896484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": 0.40515281772240996, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.40515281772240996, "reward_after_std": 0.7024665623903275, "reward_before_mean": 0.512839687988162, "reward_before_std": 0.6963064633309841, "reward_change_max": 0.00012967735528945923, "reward_change_mean": -0.10768683551577851, "reward_change_min": -0.18208745121955872, "reward_change_std": 0.07141236204188317, "reward_std": 0.7024666033685207, "rewards/cosine_scaled_reward": -0.09774684254080057, "rewards/format_reward": 0.7083333395421505, "step": 102 }, { "advantage_max": 1.360763557255268, "advantage_mean": 9.623669194880335e-09, "advantage_min": -1.057634711265564, "advantage_std": 0.999780036509037, "completion_length": 2951.833354949951, "epoch": 0.11771428571428572, "grad_norm": 0.2257830947637558, "kl": 0.001177072525024414, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.695457105469804e-07, "loss": 0.0, "reward": 0.3502454627305269, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3502454627305269, "reward_after_std": 0.704563008621335, "reward_before_mean": 0.4569867327809334, "reward_before_std": 0.72186528891325, "reward_change_max": 0.00033936649560928345, "reward_change_mean": -0.10674124653451145, "reward_change_min": -0.20691397693008184, "reward_change_std": 0.08697378146462142, "reward_std": 0.704563032835722, "rewards/cosine_scaled_reward": -0.0006733201444149017, "rewards/format_reward": 0.4583333395421505, "step": 103 }, { "advantage_max": 1.2722968012094498, "advantage_mean": -2.110997954218874e-08, "advantage_min": -1.3262568935751915, "advantage_std": 0.9997180476784706, "completion_length": 2898.4166870117188, "epoch": 0.11885714285714286, "grad_norm": 0.22004500031471252, "kl": 0.0033478736877441406, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.683994186497132e-07, "loss": 0.0001, "reward": -0.08824241906404495, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08824241906404495, "reward_after_std": 0.4880238436162472, "reward_before_mean": -0.017093989998102188, "reward_before_std": 0.5020418781787157, "reward_change_max": 0.0, "reward_change_mean": -0.07114846212789416, "reward_change_min": -0.13775192759931087, "reward_change_std": 0.056608869694173336, "reward_std": 0.4880238547921181, "rewards/cosine_scaled_reward": -0.16479699313640594, "rewards/format_reward": 0.3125, "step": 104 }, { "advantage_max": 1.2704541832208633, "advantage_mean": 2.980232283178452e-08, "advantage_min": -1.264593780040741, "advantage_std": 0.9998151138424873, "completion_length": 2905.125030517578, "epoch": 0.12, "grad_norm": 0.20489485561847687, "kl": 0.0008729100227355957, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.08965863287448883, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.08965863287448883, "reward_after_std": 0.7559684477746487, "reward_before_mean": 0.16935227066278458, "reward_before_std": 0.7788311205804348, "reward_change_max": 0.0003646537661552429, "reward_change_mean": -0.07969362242147326, "reward_change_min": -0.16236806381493807, "reward_change_std": 0.06786630826536566, "reward_std": 0.755968451499939, "rewards/cosine_scaled_reward": -0.08199054421857, "rewards/format_reward": 0.3333333395421505, "step": 105 }, { "advantage_max": 1.5852757394313812, "advantage_mean": -3.911554924407312e-08, "advantage_min": -1.0061208456754684, "advantage_std": 0.9997430816292763, "completion_length": 2090.75008392334, "epoch": 0.12114285714285715, "grad_norm": 0.2063012570142746, "kl": 0.0014823079109191895, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.9065777286887169, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.9065777286887169, "reward_after_std": 0.7232242524623871, "reward_before_mean": 1.0624435134232044, "reward_before_std": 0.7090621958486736, "reward_change_max": 0.0, "reward_change_mean": -0.15586584899574518, "reward_change_min": -0.2597513496875763, "reward_change_std": 0.10303916921839118, "reward_std": 0.7232242971658707, "rewards/cosine_scaled_reward": 0.19788843393325806, "rewards/format_reward": 0.6666666772216558, "step": 106 }, { "advantage_max": 1.5181275755167007, "advantage_mean": -2.4369608844776458e-08, "advantage_min": -1.2018900960683823, "advantage_std": 0.9997393265366554, "completion_length": 2681.9584045410156, "epoch": 0.12228571428571429, "grad_norm": 0.22282981872558594, "kl": 0.0009338855743408203, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.24523488990962505, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24523488990962505, "reward_after_std": 0.5135970376431942, "reward_before_mean": 0.34461949206888676, "reward_before_std": 0.5058441665023565, "reward_change_max": 0.00038911402225494385, "reward_change_mean": -0.09938463289290667, "reward_change_min": -0.17489725723862648, "reward_change_std": 0.06601686554495245, "reward_std": 0.5135970450937748, "rewards/cosine_scaled_reward": -0.09852358978241682, "rewards/format_reward": 0.5416666734963655, "step": 107 }, { "advantage_max": 1.462399698793888, "advantage_mean": 1.6763807397790487e-08, "advantage_min": -1.1984619572758675, "advantage_std": 0.9997933208942413, "completion_length": 2448.0625381469727, "epoch": 0.12342857142857143, "grad_norm": 0.20489199459552765, "kl": 0.0010381042957305908, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "reward": 0.3353926707059145, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3353926707059145, "reward_after_std": 0.7036140821874142, "reward_before_mean": 0.4386548884212971, "reward_before_std": 0.7110070791095495, "reward_change_max": 2.4124979972839355e-05, "reward_change_mean": -0.1032621799968183, "reward_change_min": -0.19217858277261257, "reward_change_std": 0.07401760248467326, "reward_std": 0.7036141231656075, "rewards/cosine_scaled_reward": -0.041089228354394436, "rewards/format_reward": 0.5208333432674408, "step": 108 }, { "advantage_max": 1.6706776022911072, "advantage_mean": -2.297262446937509e-08, "advantage_min": -0.9789133369922638, "advantage_std": 0.9997392222285271, "completion_length": 2940.6458587646484, "epoch": 0.12457142857142857, "grad_norm": 0.18210139870643616, "kl": 0.0003904104232788086, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": 0.035868662409484386, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.035868662409484386, "reward_after_std": 0.4906224813312292, "reward_before_mean": 0.11451574973762035, "reward_before_std": 0.47176924906671047, "reward_change_max": 2.3439526557922363e-05, "reward_change_mean": -0.07864708849228919, "reward_change_min": -0.1277305269613862, "reward_change_std": 0.05187747371383011, "reward_std": 0.49062250182032585, "rewards/cosine_scaled_reward": -0.09899212668460677, "rewards/format_reward": 0.31250000186264515, "step": 109 }, { "advantage_max": 1.6114717870950699, "advantage_mean": -5.029142124968189e-08, "advantage_min": -1.1079509481787682, "advantage_std": 0.9998557791113853, "completion_length": 2539.500072479248, "epoch": 0.12571428571428572, "grad_norm": 0.24719710648059845, "kl": 0.0007413476705551147, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": 0.21266168262809515, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21266168262809515, "reward_after_std": 0.9514952898025513, "reward_before_mean": 0.29369947547093034, "reward_before_std": 0.9561396837234497, "reward_change_max": 0.0003879815340042114, "reward_change_mean": -0.08103783521801233, "reward_change_min": -0.1750015551224351, "reward_change_std": 0.071824872167781, "reward_std": 0.9514953121542931, "rewards/cosine_scaled_reward": -0.11356692761182785, "rewards/format_reward": 0.520833345130086, "step": 110 }, { "advantage_max": 1.3271641284227371, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -1.1762759760022163, "advantage_std": 0.9998204931616783, "completion_length": 2878.6250762939453, "epoch": 0.12685714285714286, "grad_norm": 0.1850922554731369, "kl": 0.001230478286743164, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.598076473627796e-07, "loss": 0.0, "reward": 0.26679439563304186, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26679439563304186, "reward_after_std": 0.8421963788568974, "reward_before_mean": 0.36285027489066124, "reward_before_std": 0.8735892362892628, "reward_change_max": 8.895248174667358e-05, "reward_change_mean": -0.0960558783262968, "reward_change_min": -0.20756731741130352, "reward_change_std": 0.08794838469475508, "reward_std": 0.84219641238451, "rewards/cosine_scaled_reward": -0.04774153791368008, "rewards/format_reward": 0.45833334140479565, "step": 111 }, { "advantage_max": 1.4141745269298553, "advantage_mean": 5.743156217263845e-08, "advantage_min": -1.172879695892334, "advantage_std": 0.9997918605804443, "completion_length": 3140.3334045410156, "epoch": 0.128, "grad_norm": 0.16904281079769135, "kl": 0.0007680952548980713, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": 0.37912358343601227, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37912358343601227, "reward_after_std": 0.7805259078741074, "reward_before_mean": 0.4845715146511793, "reward_before_std": 0.7915312610566616, "reward_change_max": 0.00028561800718307495, "reward_change_mean": -0.10544790513813496, "reward_change_min": -0.22313793655484915, "reward_change_std": 0.08740153908729553, "reward_std": 0.7805259451270103, "rewards/cosine_scaled_reward": 0.002702411264181137, "rewards/format_reward": 0.47916666977107525, "step": 112 }, { "advantage_max": 1.441021591424942, "advantage_mean": 8.692345065952622e-09, "advantage_min": -1.151847779750824, "advantage_std": 0.9997981712222099, "completion_length": 2436.9167289733887, "epoch": 0.12914285714285714, "grad_norm": 0.2858113944530487, "kl": 0.001512289047241211, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.18071626406162977, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18071626406162977, "reward_after_std": 0.6407562829554081, "reward_before_mean": 0.2715048464015126, "reward_before_std": 0.6499359104782343, "reward_change_max": 7.875263690948486e-05, "reward_change_mean": -0.09078857069835067, "reward_change_min": -0.17216388322412968, "reward_change_std": 0.06758308736607432, "reward_std": 0.6407563146203756, "rewards/cosine_scaled_reward": -0.13508092612028122, "rewards/format_reward": 0.5416666846722364, "step": 113 }, { "advantage_max": 1.5190544202923775, "advantage_mean": 2.7939677682553565e-08, "advantage_min": -1.1916342675685883, "advantage_std": 0.9997197538614273, "completion_length": 2483.9792289733887, "epoch": 0.13028571428571428, "grad_norm": 0.20862361788749695, "kl": 0.0018885135650634766, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": 0.12357044592499733, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12357044592499733, "reward_after_std": 0.5326991295441985, "reward_before_mean": 0.21228991076350212, "reward_before_std": 0.5345789343118668, "reward_change_max": 2.765655517578125e-05, "reward_change_mean": -0.0887194707756862, "reward_change_min": -0.1558008911088109, "reward_change_std": 0.06081947742495686, "reward_std": 0.5326991518959403, "rewards/cosine_scaled_reward": -0.185521719744429, "rewards/format_reward": 0.5833333488553762, "step": 114 }, { "advantage_max": 1.3991148322820663, "advantage_mean": -1.4280280069556284e-08, "advantage_min": -1.182186022400856, "advantage_std": 0.9997821226716042, "completion_length": 2882.6875, "epoch": 0.13142857142857142, "grad_norm": 0.1963007003068924, "kl": 0.0019685029983520508, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": 0.06682339310646057, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06682339310646057, "reward_after_std": 0.6091874223202467, "reward_before_mean": 0.14801817759871483, "reward_before_std": 0.6182383522391319, "reward_change_max": 0.0005025044083595276, "reward_change_mean": -0.08119479194283485, "reward_change_min": -0.16018072236329317, "reward_change_std": 0.0620639375410974, "reward_std": 0.6091874409466982, "rewards/cosine_scaled_reward": -0.08224091539159417, "rewards/format_reward": 0.31250000186264515, "step": 115 }, { "advantage_max": 1.428558573126793, "advantage_mean": 3.228585032655218e-08, "advantage_min": -1.140651598572731, "advantage_std": 0.9997088760137558, "completion_length": 3213.5416717529297, "epoch": 0.13257142857142856, "grad_norm": 0.18319779634475708, "kl": 0.0013089179992675781, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.28241080418229103, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.28241080418229103, "reward_after_std": 0.42994930408895016, "reward_before_mean": -0.22914370521903038, "reward_before_std": 0.43542555905878544, "reward_change_max": 0.00030046701431274414, "reward_change_mean": -0.053267103619873524, "reward_change_min": -0.11103585828095675, "reward_change_std": 0.04199374059680849, "reward_std": 0.4299493171274662, "rewards/cosine_scaled_reward": -0.17707185074687004, "rewards/format_reward": 0.125, "step": 116 }, { "advantage_max": 1.6159285753965378, "advantage_mean": 4.284083932049043e-08, "advantage_min": -1.0283942744135857, "advantage_std": 0.9997640401124954, "completion_length": 3145.166717529297, "epoch": 0.1337142857142857, "grad_norm": 0.1719525307416916, "kl": 0.0014429092407226562, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.07115666568279266, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.07115666568279266, "reward_after_std": 0.711862625554204, "reward_before_mean": -0.00940685998648405, "reward_before_std": 0.7143301498144865, "reward_change_max": 0.00015719234943389893, "reward_change_mean": -0.06174980604555458, "reward_change_min": -0.13204221613705158, "reward_change_std": 0.049385225865989923, "reward_std": 0.7118626423180103, "rewards/cosine_scaled_reward": -0.16095343511551619, "rewards/format_reward": 0.3125000074505806, "step": 117 }, { "advantage_max": 1.6019388288259506, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.964250460267067, "advantage_std": 0.9998467639088631, "completion_length": 3069.104217529297, "epoch": 0.13485714285714287, "grad_norm": 0.1549428105354309, "kl": 0.0012319087982177734, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.502373679810839e-07, "loss": 0.0, "reward": 0.4462295286357403, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4462295286357403, "reward_after_std": 0.9829271957278252, "reward_before_mean": 0.5496396627277136, "reward_before_std": 0.9876304157078266, "reward_change_max": 0.00024300813674926758, "reward_change_mean": -0.103410127107054, "reward_change_min": -0.20373185630887747, "reward_change_std": 0.08672826108522713, "reward_std": 0.982927218079567, "rewards/cosine_scaled_reward": 0.0560698164626956, "rewards/format_reward": 0.43750000186264515, "step": 118 }, { "advantage_max": 1.2626687735319138, "advantage_mean": -2.700835521896039e-08, "advantage_min": -1.3746841996908188, "advantage_std": 0.9998002350330353, "completion_length": 2507.187545776367, "epoch": 0.136, "grad_norm": 0.20802848041057587, "kl": 0.0029039382934570312, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.487916106540465e-07, "loss": 0.0001, "reward": 0.3486227598041296, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3486227598041296, "reward_after_std": 0.6806661561131477, "reward_before_mean": 0.4562889579683542, "reward_before_std": 0.6996967010200024, "reward_change_max": 0.00010123103857040405, "reward_change_mean": -0.10766624030657113, "reward_change_min": -0.197446602396667, "reward_change_std": 0.08037910354323685, "reward_std": 0.6806661747395992, "rewards/cosine_scaled_reward": -0.06352218613028526, "rewards/format_reward": 0.5833333507180214, "step": 119 }, { "advantage_max": 1.5879197269678116, "advantage_mean": -4.842877543431712e-08, "advantage_min": -1.06450717151165, "advantage_std": 0.9998263493180275, "completion_length": 2468.0833702087402, "epoch": 0.13714285714285715, "grad_norm": 0.2512477934360504, "kl": 0.003391742706298828, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": 0.3673525620251894, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3673525620251894, "reward_after_std": 0.7729743756353855, "reward_before_mean": 0.4686034247279167, "reward_before_std": 0.7684299945831299, "reward_change_max": 0.00028771162033081055, "reward_change_mean": -0.1012508855201304, "reward_change_min": -0.1895952159538865, "reward_change_std": 0.07404675986617804, "reward_std": 0.7729743979871273, "rewards/cosine_scaled_reward": -0.015698293107561767, "rewards/format_reward": 0.5000000074505806, "step": 120 }, { "advantage_max": 1.5347543805837631, "advantage_mean": -4.2219957530065244e-08, "advantage_min": -1.1780244708061218, "advantage_std": 0.9997963011264801, "completion_length": 1739.1875305175781, "epoch": 0.1382857142857143, "grad_norm": 0.20562413334846497, "kl": 0.0033533573150634766, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.6855917517095804, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6855917517095804, "reward_after_std": 0.636465273797512, "reward_before_mean": 0.8219538982957602, "reward_before_std": 0.624314408749342, "reward_change_max": 6.859749555587769e-05, "reward_change_mean": -0.1363621219061315, "reward_change_min": -0.22026861924678087, "reward_change_std": 0.08315270929597318, "reward_std": 0.636465273797512, "rewards/cosine_scaled_reward": 0.004726927087176591, "rewards/format_reward": 0.8125, "step": 121 }, { "advantage_max": 1.4387590885162354, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.0619560778141022, "advantage_std": 0.9998273774981499, "completion_length": 2961.5208740234375, "epoch": 0.13942857142857143, "grad_norm": 0.1977798044681549, "kl": 0.001184701919555664, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.443380060197385e-07, "loss": 0.0, "reward": 0.3571953661739826, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3571953661739826, "reward_after_std": 0.952107772231102, "reward_before_mean": 0.4578540176153183, "reward_before_std": 0.9842019788920879, "reward_change_max": 0.0, "reward_change_mean": -0.10065865784417838, "reward_change_min": -0.22830870375037193, "reward_change_std": 0.09368588705547154, "reward_std": 0.9521077759563923, "rewards/cosine_scaled_reward": 0.01017700880765915, "rewards/format_reward": 0.43750000558793545, "step": 122 }, { "advantage_max": 1.7321224063634872, "advantage_mean": -1.2262414239572195e-07, "advantage_min": -0.9814025685191154, "advantage_std": 0.9997626096010208, "completion_length": 2672.5833892822266, "epoch": 0.14057142857142857, "grad_norm": 0.18867647647857666, "kl": 0.0012271404266357422, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.428149347714143e-07, "loss": 0.0, "reward": 0.12601416371762753, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12601416371762753, "reward_after_std": 0.5985247995704412, "reward_before_mean": 0.2089059054851532, "reward_before_std": 0.584993964061141, "reward_change_max": 0.000914938747882843, "reward_change_mean": -0.08289175282698125, "reward_change_min": -0.13586501125246286, "reward_change_std": 0.054501404985785484, "reward_std": 0.598524821922183, "rewards/cosine_scaled_reward": -0.11429706169292331, "rewards/format_reward": 0.4375000037252903, "step": 123 }, { "advantage_max": 1.4781872406601906, "advantage_mean": -2.483527050678447e-09, "advantage_min": -1.183127485215664, "advantage_std": 0.9998154491186142, "completion_length": 2473.4792251586914, "epoch": 0.1417142857142857, "grad_norm": 0.23775885999202728, "kl": 0.006164073944091797, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.412727182773486e-07, "loss": 0.0002, "reward": 0.46917978674173355, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46917978674173355, "reward_after_std": 0.8176695536822081, "reward_before_mean": 0.5814000591635704, "reward_before_std": 0.8219790123403072, "reward_change_max": 0.0, "reward_change_mean": -0.11222026916220784, "reward_change_min": -0.19535045605152845, "reward_change_std": 0.08150730514898896, "reward_std": 0.8176696002483368, "rewards/cosine_scaled_reward": 0.04070002248045057, "rewards/format_reward": 0.5000000074505806, "step": 124 }, { "advantage_max": 1.4708463251590729, "advantage_mean": 3.632158163124899e-08, "advantage_min": -1.1529624238610268, "advantage_std": 0.9997900947928429, "completion_length": 2675.0625, "epoch": 0.14285714285714285, "grad_norm": 0.1962793618440628, "kl": 0.0011947154998779297, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "reward": 0.34733692556619644, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34733692556619644, "reward_after_std": 0.6260436102747917, "reward_before_mean": 0.4519122443161905, "reward_before_std": 0.6228771880269051, "reward_change_max": 0.00011279433965682983, "reward_change_mean": -0.10457530617713928, "reward_change_min": -0.17521720752120018, "reward_change_std": 0.06829811399802566, "reward_std": 0.6260436214506626, "rewards/cosine_scaled_reward": 0.04887278733076528, "rewards/format_reward": 0.35416666977107525, "step": 125 }, { "advantage_max": 1.6396605372428894, "advantage_mean": 2.3593506592867186e-08, "advantage_min": -0.9779276698827744, "advantage_std": 0.9998251125216484, "completion_length": 2967.4583740234375, "epoch": 0.144, "grad_norm": 0.1698831021785736, "kl": 0.0008451938629150391, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.381311511432658e-07, "loss": 0.0, "reward": 0.2825273647904396, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.2825273647904396, "reward_after_std": 0.7569136507809162, "reward_before_mean": 0.37666072975844145, "reward_before_std": 0.7486708983778954, "reward_change_max": 0.00029872357845306396, "reward_change_mean": -0.09413336007855833, "reward_change_min": -0.17554103955626488, "reward_change_std": 0.06924102012999356, "reward_std": 0.7569136843085289, "rewards/cosine_scaled_reward": -0.030419636983424425, "rewards/format_reward": 0.43750000558793545, "step": 126 }, { "advantage_max": 1.590319201350212, "advantage_mean": -2.2351742345882997e-08, "advantage_min": -1.1233162581920624, "advantage_std": 0.999773882329464, "completion_length": 3154.3333892822266, "epoch": 0.14514285714285713, "grad_norm": 0.172366663813591, "kl": 0.0011754035949707031, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.36531953618799e-07, "loss": 0.0, "reward": -0.15957457711920142, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.15957457711920142, "reward_after_std": 0.5303029231727123, "reward_before_mean": -0.09974119672551751, "reward_before_std": 0.5254558436572552, "reward_change_max": 0.0, "reward_change_mean": -0.05983339576050639, "reward_change_min": -0.10755488555878401, "reward_change_std": 0.04164266283623874, "reward_std": 0.5303029306232929, "rewards/cosine_scaled_reward": -0.2165372660383582, "rewards/format_reward": 0.33333334140479565, "step": 127 }, { "advantage_max": 1.3794664293527603, "advantage_mean": -3.3527613574335646e-08, "advantage_min": -1.1035713329911232, "advantage_std": 0.9998351410031319, "completion_length": 2929.9375076293945, "epoch": 0.1462857142857143, "grad_norm": 0.1915324479341507, "kl": 0.0020612478256225586, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.5117706246674061, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5117706246674061, "reward_after_std": 0.8629733137786388, "reward_before_mean": 0.62900335714221, "reward_before_std": 0.8841073550283909, "reward_change_max": 0.00022004544734954834, "reward_change_mean": -0.11723275459371507, "reward_change_min": -0.22613342199474573, "reward_change_std": 0.09391661314293742, "reward_std": 0.8629733547568321, "rewards/cosine_scaled_reward": 0.09575167298316956, "rewards/format_reward": 0.4375000037252903, "step": 128 }, { "advantage_max": 1.6466744989156723, "advantage_mean": 4.253039909141165e-08, "advantage_min": -0.9315560981631279, "advantage_std": 0.9997737854719162, "completion_length": 3490.229217529297, "epoch": 0.14742857142857144, "grad_norm": 0.16802391409873962, "kl": 0.0023403167724609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.21124888956546783, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.21124888956546783, "reward_after_std": 0.849559772759676, "reward_before_mean": -0.1677091233432293, "reward_before_std": 0.8529805261641741, "reward_change_max": 0.000441625714302063, "reward_change_mean": -0.04353977448772639, "reward_change_min": -0.11069826781749725, "reward_change_std": 0.04446559911593795, "reward_std": 0.849559810012579, "rewards/cosine_scaled_reward": -0.15677122166380286, "rewards/format_reward": 0.1458333358168602, "step": 129 }, { "advantage_max": 1.3871336728334427, "advantage_mean": -1.241763691872677e-09, "advantage_min": -1.234124794602394, "advantage_std": 0.9997271597385406, "completion_length": 3101.0416717529297, "epoch": 0.14857142857142858, "grad_norm": 0.19376447796821594, "kl": 0.002029895782470703, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.15444228425621986, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.15444228425621986, "reward_after_std": 0.44813977740705013, "reward_before_mean": -0.08898514322936535, "reward_before_std": 0.4583962671458721, "reward_change_max": 0.0003493502736091614, "reward_change_mean": -0.0654571489430964, "reward_change_min": -0.1214989572763443, "reward_change_std": 0.048809306579642, "reward_std": 0.4481397867202759, "rewards/cosine_scaled_reward": -0.13824257254600525, "rewards/format_reward": 0.1875, "step": 130 }, { "advantage_max": 1.4599091708660126, "advantage_mean": -1.9247333615801665e-08, "advantage_min": -1.2544859647750854, "advantage_std": 0.999752089381218, "completion_length": 2915.7916679382324, "epoch": 0.14971428571428572, "grad_norm": 0.18682855367660522, "kl": 0.003175973892211914, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.5291292034089565, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.5291292034089565, "reward_after_std": 0.6431342400610447, "reward_before_mean": 0.6541959419846535, "reward_before_std": 0.6478817011229694, "reward_change_max": 0.0002513080835342407, "reward_change_mean": -0.12506670271977782, "reward_change_min": -0.20734062790870667, "reward_change_std": 0.08662228705361485, "reward_std": 0.6431342735886574, "rewards/cosine_scaled_reward": 0.13959795609116554, "rewards/format_reward": 0.3750000074505806, "step": 131 }, { "advantage_max": 1.4901214316487312, "advantage_mean": 9.002783629696864e-09, "advantage_min": -1.1353254616260529, "advantage_std": 0.9997986853122711, "completion_length": 2639.0208435058594, "epoch": 0.15085714285714286, "grad_norm": 0.1656983196735382, "kl": 0.0013974905014038086, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.3720488026738167, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3720488026738167, "reward_after_std": 0.8538518510758877, "reward_before_mean": 0.4742213059216738, "reward_before_std": 0.8687582034617662, "reward_change_max": 0.0, "reward_change_mean": -0.1021725102327764, "reward_change_min": -0.19822395592927933, "reward_change_std": 0.07776921393815428, "reward_std": 0.8538518510758877, "rewards/cosine_scaled_reward": 0.01836064923554659, "rewards/format_reward": 0.43750000558793545, "step": 132 }, { "advantage_max": 1.3228430151939392, "advantage_mean": 2.7318796114172983e-08, "advantage_min": -1.146996609866619, "advantage_std": 0.9996031150221825, "completion_length": 3313.8958435058594, "epoch": 0.152, "grad_norm": 0.207058846950531, "kl": 0.0017418861389160156, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.23692141473293304, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.23692141473293304, "reward_after_std": 0.5058922655880451, "reward_before_mean": -0.18047994375228882, "reward_before_std": 0.5220023482106626, "reward_change_max": 0.00026736408472061157, "reward_change_mean": -0.05644145607948303, "reward_change_min": -0.1346017699688673, "reward_change_std": 0.0541907181032002, "reward_std": 0.5058922655880451, "rewards/cosine_scaled_reward": -0.18398998258635402, "rewards/format_reward": 0.1875000074505806, "step": 133 }, { "advantage_max": 1.4404646754264832, "advantage_mean": -8.133550766231679e-08, "advantage_min": -1.1880767047405243, "advantage_std": 0.9997904896736145, "completion_length": 2402.2291946411133, "epoch": 0.15314285714285714, "grad_norm": 0.2038743793964386, "kl": 0.0018963813781738281, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": 0.526889817789197, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.526889817789197, "reward_after_std": 0.7365444861352444, "reward_before_mean": 0.6467488976195455, "reward_before_std": 0.7403203975409269, "reward_change_max": 0.00013075023889541626, "reward_change_mean": -0.11985907377675176, "reward_change_min": -0.21406998671591282, "reward_change_std": 0.08232975355349481, "reward_std": 0.7365445084869862, "rewards/cosine_scaled_reward": 0.04212443716824055, "rewards/format_reward": 0.5625000055879354, "step": 134 }, { "advantage_max": 1.5359413474798203, "advantage_mean": -1.0399769101443468e-07, "advantage_min": -0.9698811173439026, "advantage_std": 0.9997820109128952, "completion_length": 2010.0208892822266, "epoch": 0.15428571428571428, "grad_norm": 0.2191932201385498, "kl": 0.0039234161376953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.230669076497687e-07, "loss": 0.0002, "reward": 0.9419772960245609, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9419772960245609, "reward_after_std": 0.7658766210079193, "reward_before_mean": 1.1002132706344128, "reward_before_std": 0.7517257547006011, "reward_change_max": 0.0, "reward_change_mean": -0.15823592338711023, "reward_change_min": -0.2634607693180442, "reward_change_std": 0.10368040250614285, "reward_std": 0.7658766359090805, "rewards/cosine_scaled_reward": 0.2271899450570345, "rewards/format_reward": 0.6458333414047956, "step": 135 }, { "advantage_max": 1.3615388423204422, "advantage_mean": -7.450580152834618e-09, "advantage_min": -1.1105039417743683, "advantage_std": 0.9997578710317612, "completion_length": 2758.9791870117188, "epoch": 0.15542857142857142, "grad_norm": 0.2017042189836502, "kl": 0.0016570091247558594, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": 0.3180756554938853, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3180756554938853, "reward_after_std": 0.9733518315479159, "reward_before_mean": 0.41393173237884184, "reward_before_std": 1.0054605212062597, "reward_change_max": 0.0001665651798248291, "reward_change_mean": -0.09585604723542929, "reward_change_min": -0.2182399481534958, "reward_change_std": 0.08857251331210136, "reward_std": 0.9733518492430449, "rewards/cosine_scaled_reward": 0.00904920045286417, "rewards/format_reward": 0.39583333395421505, "step": 136 }, { "advantage_max": 1.5188078880310059, "advantage_mean": 1.3659397168908072e-08, "advantage_min": -0.969260111451149, "advantage_std": 0.9996920749545097, "completion_length": 3099.6875228881836, "epoch": 0.15657142857142858, "grad_norm": 0.1994733065366745, "kl": 0.0014867782592773438, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.09599984437227249, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.09599984437227249, "reward_after_std": 0.6841760762035847, "reward_before_mean": -0.033508097752928734, "reward_before_std": 0.6963941073045135, "reward_change_max": 0.0004359111189842224, "reward_change_mean": -0.06249175767879933, "reward_change_min": -0.14749382436275482, "reward_change_std": 0.05822044319938868, "reward_std": 0.684176079928875, "rewards/cosine_scaled_reward": -0.15217071864753962, "rewards/format_reward": 0.2708333358168602, "step": 137 }, { "advantage_max": 1.5097112655639648, "advantage_mean": 3.91155504098073e-08, "advantage_min": -1.1656879857182503, "advantage_std": 0.9996949210762978, "completion_length": 2763.395866394043, "epoch": 0.15771428571428572, "grad_norm": 0.18249079585075378, "kl": 0.004346370697021484, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.177152042508077e-07, "loss": 0.0002, "reward": 0.09496973222121596, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09496973222121596, "reward_after_std": 0.4213468311354518, "reward_before_mean": 0.1828093589283526, "reward_before_std": 0.4119232380762696, "reward_change_max": 0.00017911195755004883, "reward_change_mean": -0.0878396132029593, "reward_change_min": -0.14762252569198608, "reward_change_std": 0.05701114097610116, "reward_std": 0.4213468497619033, "rewards/cosine_scaled_reward": -0.14817864634096622, "rewards/format_reward": 0.4791666716337204, "step": 138 }, { "advantage_max": 1.5847613364458084, "advantage_mean": -2.173085378309736e-09, "advantage_min": -1.175045009702444, "advantage_std": 0.9997278079390526, "completion_length": 3105.812545776367, "epoch": 0.15885714285714286, "grad_norm": 0.20554403960704803, "kl": 0.0026154518127441406, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": 0.07944915629923344, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07944915629923344, "reward_after_std": 0.7219101591035724, "reward_before_mean": 0.1550278328359127, "reward_before_std": 0.719237182289362, "reward_change_max": 0.00016336888074874878, "reward_change_mean": -0.07557868165895343, "reward_change_min": -0.14963595662266016, "reward_change_std": 0.05737433675676584, "reward_std": 0.7219101591035724, "rewards/cosine_scaled_reward": -0.12040275533217937, "rewards/format_reward": 0.39583333767950535, "step": 139 }, { "advantage_max": 1.6046061217784882, "advantage_mean": 6.084640968850863e-08, "advantage_min": -1.058633465319872, "advantage_std": 0.999708391726017, "completion_length": 2978.166748046875, "epoch": 0.16, "grad_norm": 0.1818619817495346, "kl": 0.004107475280761719, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": 0.10122170485556126, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10122170485556126, "reward_after_std": 0.6357193235307932, "reward_before_mean": 0.18152375193312764, "reward_before_std": 0.623972998932004, "reward_change_max": 0.0006061270833015442, "reward_change_mean": -0.08030204870738089, "reward_change_min": -0.13752434495836496, "reward_change_std": 0.05715995456557721, "reward_std": 0.6357193402945995, "rewards/cosine_scaled_reward": -0.06548812706023455, "rewards/format_reward": 0.3125000037252903, "step": 140 }, { "advantage_max": 1.4406469464302063, "advantage_mean": 6.643434380393387e-08, "advantage_min": -1.0366877242922783, "advantage_std": 0.9997889995574951, "completion_length": 2887.916732788086, "epoch": 0.16114285714285714, "grad_norm": 0.18227314949035645, "kl": 0.0030608177185058594, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.122022088101613e-07, "loss": 0.0001, "reward": 0.023926494643092155, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.023926494643092155, "reward_after_std": 0.7154865898191929, "reward_before_mean": 0.09898238629102707, "reward_before_std": 0.736889086663723, "reward_change_max": 0.000539436936378479, "reward_change_mean": -0.07505585625767708, "reward_change_min": -0.17021092772483826, "reward_change_std": 0.06994302081875503, "reward_std": 0.7154866233468056, "rewards/cosine_scaled_reward": -0.19009215137339197, "rewards/format_reward": 0.479166679084301, "step": 141 }, { "advantage_max": 1.3935022801160812, "advantage_mean": -9.934107536579972e-09, "advantage_min": -1.0459297895431519, "advantage_std": 0.9998486042022705, "completion_length": 2663.416732788086, "epoch": 0.16228571428571428, "grad_norm": 0.19613024592399597, "kl": 0.0020918846130371094, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.2961191050708294, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2961191050708294, "reward_after_std": 0.9062751308083534, "reward_before_mean": 0.3927744999527931, "reward_before_std": 0.939884040504694, "reward_change_max": 0.00010180473327636719, "reward_change_mean": -0.09665541374124587, "reward_change_min": -0.22028039954602718, "reward_change_std": 0.09074757865164429, "reward_std": 0.9062751643359661, "rewards/cosine_scaled_reward": -0.07444609270896763, "rewards/format_reward": 0.5416666772216558, "step": 142 }, { "advantage_max": 1.5322048366069794, "advantage_mean": -1.3504177776013648e-08, "advantage_min": -1.0548894479870796, "advantage_std": 0.9998358264565468, "completion_length": 2548.041732788086, "epoch": 0.16342857142857142, "grad_norm": 0.30651625990867615, "kl": 0.00348663330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.084384631108882e-07, "loss": 0.0001, "reward": 0.1695709004998207, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1695709004998207, "reward_after_std": 0.8127848580479622, "reward_before_mean": 0.2530234828591347, "reward_before_std": 0.8218814432621002, "reward_change_max": 0.0, "reward_change_mean": -0.08345259842462838, "reward_change_min": -0.16000903863459826, "reward_change_std": 0.06607106560841203, "reward_std": 0.8127848766744137, "rewards/cosine_scaled_reward": -0.12348826136440039, "rewards/format_reward": 0.5000000093132257, "step": 143 }, { "advantage_max": 1.6216581761837006, "advantage_mean": 3.7252856355252106e-09, "advantage_min": -1.0678609758615494, "advantage_std": 0.9997515752911568, "completion_length": 2930.3125228881836, "epoch": 0.16457142857142856, "grad_norm": 0.20165316760540009, "kl": 0.0022115707397460938, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": 0.2244191411882639, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2244191411882639, "reward_after_std": 0.8113269321620464, "reward_before_mean": 0.310532383620739, "reward_before_std": 0.8061074055731297, "reward_change_max": 0.0006739497184753418, "reward_change_mean": -0.08611326036043465, "reward_change_min": -0.172080148011446, "reward_change_std": 0.06757508893497288, "reward_std": 0.8113269321620464, "rewards/cosine_scaled_reward": -0.03223381540738046, "rewards/format_reward": 0.37500000558793545, "step": 144 }, { "advantage_max": 1.495003655552864, "advantage_mean": -4.346171922353648e-08, "advantage_min": -1.0665920972824097, "advantage_std": 0.999808594584465, "completion_length": 1915.729232788086, "epoch": 0.1657142857142857, "grad_norm": 0.30377116799354553, "kl": 0.002331256866455078, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.046048391230247e-07, "loss": 0.0001, "reward": 0.47825442533940077, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.47825442533940077, "reward_after_std": 0.7022902369499207, "reward_before_mean": 0.5941783878952265, "reward_before_std": 0.7010281160473824, "reward_change_max": 0.0, "reward_change_mean": -0.11592395044863224, "reward_change_min": -0.2171989856287837, "reward_change_std": 0.08219394693151116, "reward_std": 0.7022902630269527, "rewards/cosine_scaled_reward": -0.04666082002222538, "rewards/format_reward": 0.6875000074505806, "step": 145 }, { "advantage_max": 1.4068877398967743, "advantage_mean": 2.980232360894064e-08, "advantage_min": -1.1271022856235504, "advantage_std": 0.9998052194714546, "completion_length": 2556.812530517578, "epoch": 0.16685714285714287, "grad_norm": 0.18746472895145416, "kl": 0.0015310049057006836, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.08086108416318893, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08086108416318893, "reward_after_std": 0.665203008800745, "reward_before_mean": 0.1612489800900221, "reward_before_std": 0.6762508265674114, "reward_change_max": 0.00020448118448257446, "reward_change_mean": -0.08038788754492998, "reward_change_min": -0.17192814219743013, "reward_change_std": 0.06585183460265398, "reward_std": 0.6652030423283577, "rewards/cosine_scaled_reward": -0.20062552206218243, "rewards/format_reward": 0.5625000018626451, "step": 146 }, { "advantage_max": 1.5742152035236359, "advantage_mean": 6.829699250587851e-09, "advantage_min": -1.0765742659568787, "advantage_std": 0.9998117387294769, "completion_length": 2837.437530517578, "epoch": 0.168, "grad_norm": 0.19602230191230774, "kl": 0.00308990478515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.007020842191634e-07, "loss": 0.0001, "reward": 0.08351327944546938, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08351327944546938, "reward_after_std": 0.6948803104460239, "reward_before_mean": 0.16174614802002907, "reward_before_std": 0.6967041678726673, "reward_change_max": 0.0001440197229385376, "reward_change_mean": -0.07823285344056785, "reward_change_min": -0.1446439679712057, "reward_change_std": 0.05933955032378435, "reward_std": 0.694880336523056, "rewards/cosine_scaled_reward": -0.10662694118218496, "rewards/format_reward": 0.3750000037252903, "step": 147 }, { "advantage_max": 1.5195300206542015, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.1114639192819595, "advantage_std": 0.9998055621981621, "completion_length": 2250.354202270508, "epoch": 0.16914285714285715, "grad_norm": 0.16835594177246094, "kl": 0.0018110275268554688, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.31845202576369047, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31845202576369047, "reward_after_std": 0.6970034800469875, "reward_before_mean": 0.4189865104854107, "reward_before_std": 0.6898063011467457, "reward_change_max": 0.0010981261730194092, "reward_change_mean": -0.10053447051905096, "reward_change_min": -0.1857745312154293, "reward_change_std": 0.07377739227376878, "reward_std": 0.6970034874975681, "rewards/cosine_scaled_reward": -0.10300674941390753, "rewards/format_reward": 0.6250000111758709, "step": 148 }, { "advantage_max": 1.4426256641745567, "advantage_mean": 1.924733339375706e-08, "advantage_min": -1.2441855445504189, "advantage_std": 0.9998246803879738, "completion_length": 2690.6458740234375, "epoch": 0.1702857142857143, "grad_norm": 0.3846557140350342, "kl": 0.016859054565429688, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.967309592491052e-07, "loss": 0.0007, "reward": 0.3900892809033394, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3900892809033394, "reward_after_std": 0.8488783277571201, "reward_before_mean": 0.49406828731298447, "reward_before_std": 0.8630441017448902, "reward_change_max": 0.00035718828439712524, "reward_change_mean": -0.10397897334769368, "reward_change_min": -0.1900622034445405, "reward_change_std": 0.07942482200451195, "reward_std": 0.8488783352077007, "rewards/cosine_scaled_reward": -0.013382526114583015, "rewards/format_reward": 0.5208333414047956, "step": 149 }, { "advantage_max": 1.6954808682203293, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -0.899493508040905, "advantage_std": 0.9998697191476822, "completion_length": 2483.479217529297, "epoch": 0.17142857142857143, "grad_norm": 0.224741131067276, "kl": 0.0034275054931640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.9471999940354e-07, "loss": 0.0001, "reward": 0.13289955770596862, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13289955770596862, "reward_after_std": 0.9514123201370239, "reward_before_mean": 0.20388731081038713, "reward_before_std": 0.9427561610937119, "reward_change_max": 0.0, "reward_change_mean": -0.07098775426857173, "reward_change_min": -0.14540079329162836, "reward_change_std": 0.057504348922520876, "reward_std": 0.9514123611152172, "rewards/cosine_scaled_reward": -0.12722301567555405, "rewards/format_reward": 0.4583333358168602, "step": 150 }, { "advantage_max": 1.5135483890771866, "advantage_mean": -8.816520657983773e-08, "advantage_min": -1.2416688278317451, "advantage_std": 0.9997973516583443, "completion_length": 2449.729232788086, "epoch": 0.17257142857142857, "grad_norm": 0.2216656506061554, "kl": 0.0034198760986328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.926922383915315e-07, "loss": 0.0001, "reward": 0.48375364703679224, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.48375364703679224, "reward_after_std": 0.6876551508903503, "reward_before_mean": 0.5997938713990152, "reward_before_std": 0.6790660806000233, "reward_change_max": 8.161365985870361e-05, "reward_change_mean": -0.11604023166000843, "reward_change_min": -0.18250472843647003, "reward_change_std": 0.0721652910578996, "reward_std": 0.6876551546156406, "rewards/cosine_scaled_reward": 0.01864692009985447, "rewards/format_reward": 0.5625000204890966, "step": 151 }, { "advantage_max": 1.4979790449142456, "advantage_mean": 5.960464721788128e-08, "advantage_min": -1.2011219523847103, "advantage_std": 0.9997016414999962, "completion_length": 2874.708366394043, "epoch": 0.1737142857142857, "grad_norm": 0.23708555102348328, "kl": 0.0017731189727783203, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.906477750432903e-07, "loss": 0.0001, "reward": -0.11927625350654125, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11927625350654125, "reward_after_std": 0.4810872804373503, "reward_before_mean": -0.0537937730550766, "reward_before_std": 0.476759847253561, "reward_change_max": 0.00021364539861679077, "reward_change_mean": -0.06548247905448079, "reward_change_min": -0.110728794708848, "reward_change_std": 0.043877444695681334, "reward_std": 0.48108728788793087, "rewards/cosine_scaled_reward": -0.19356355350464582, "rewards/format_reward": 0.33333333395421505, "step": 152 }, { "advantage_max": 1.5236635357141495, "advantage_mean": 2.6697914878859308e-08, "advantage_min": -1.2076719664037228, "advantage_std": 0.9997353553771973, "completion_length": 2614.6459045410156, "epoch": 0.17485714285714285, "grad_norm": 0.33291175961494446, "kl": 0.003040790557861328, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.88586709003076e-07, "loss": 0.0001, "reward": 0.16083110310137272, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.16083110310137272, "reward_after_std": 0.5439409669488668, "reward_before_mean": 0.25147375743836164, "reward_before_std": 0.539504618383944, "reward_change_max": 0.0001697242259979248, "reward_change_mean": -0.09064263617619872, "reward_change_min": -0.15238108951598406, "reward_change_std": 0.06364887952804565, "reward_std": 0.5439409743994474, "rewards/cosine_scaled_reward": -0.11384646594524384, "rewards/format_reward": 0.4791666679084301, "step": 153 }, { "advantage_max": 1.6322798505425453, "advantage_mean": -1.9868215073159945e-08, "advantage_min": -0.8856546506285667, "advantage_std": 0.9998943731188774, "completion_length": 3126.916717529297, "epoch": 0.176, "grad_norm": 0.15531423687934875, "kl": 0.0019125938415527344, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.2853868268430233, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.2853868268430233, "reward_after_std": 1.2025733813643456, "reward_before_mean": 0.3669017466454534, "reward_before_std": 1.2222107723355293, "reward_change_max": 0.0006478503346443176, "reward_change_mean": -0.081514913123101, "reward_change_min": -0.20529971737414598, "reward_change_std": 0.08442414319142699, "reward_std": 1.2025734297931194, "rewards/cosine_scaled_reward": -0.024882478785002604, "rewards/format_reward": 0.4166666753590107, "step": 154 }, { "advantage_max": 1.6839442551136017, "advantage_mean": -4.221995830722136e-08, "advantage_min": -0.941412091255188, "advantage_std": 0.9998479187488556, "completion_length": 2498.583381652832, "epoch": 0.17714285714285713, "grad_norm": 0.19921159744262695, "kl": 0.0031003952026367188, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.569856112357229, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.569856112357229, "reward_after_std": 0.9291363656520844, "reward_before_mean": 0.6841814294457436, "reward_before_std": 0.9151368550956249, "reward_change_max": 0.0002962574362754822, "reward_change_mean": -0.11432529683224857, "reward_change_min": -0.20955283753573895, "reward_change_std": 0.08066396252252162, "reward_std": 0.9291363768279552, "rewards/cosine_scaled_reward": 0.07125736703164876, "rewards/format_reward": 0.5416666679084301, "step": 155 }, { "advantage_max": 1.5021150261163712, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -1.1296715438365936, "advantage_std": 0.9998017847537994, "completion_length": 2816.125030517578, "epoch": 0.1782857142857143, "grad_norm": 0.19361698627471924, "kl": 0.0018634796142578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.07529625482857227, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07529625482857227, "reward_after_std": 0.6703125648200512, "reward_before_mean": 0.15437587723135948, "reward_before_std": 0.6760777495801449, "reward_change_max": 0.0002835988998413086, "reward_change_mean": -0.07907962403260171, "reward_change_min": -0.15298352297395468, "reward_change_std": 0.060570935369469225, "reward_std": 0.6703126020729542, "rewards/cosine_scaled_reward": -0.08947873779106885, "rewards/format_reward": 0.3333333358168602, "step": 156 }, { "advantage_max": 1.5404149293899536, "advantage_mean": 8.863086731203396e-08, "advantage_min": -1.1716954857110977, "advantage_std": 0.9997392222285271, "completion_length": 2796.541702270508, "epoch": 0.17942857142857144, "grad_norm": 0.28218579292297363, "kl": 0.004803657531738281, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.06934672966599464, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06934672966599464, "reward_after_std": 0.5129354447126389, "reward_before_mean": 0.001629834994673729, "reward_before_std": 0.5202076155692339, "reward_change_max": 6.977468729019165e-05, "reward_change_mean": -0.07097655383404344, "reward_change_min": -0.1306041106581688, "reward_change_std": 0.05284939787816256, "reward_std": 0.5129354521632195, "rewards/cosine_scaled_reward": -0.22835176065564156, "rewards/format_reward": 0.4583333469927311, "step": 157 }, { "advantage_max": 1.551323488354683, "advantage_mean": -5.463759245039057e-08, "advantage_min": -0.998228020966053, "advantage_std": 0.9998158514499664, "completion_length": 3241.2083740234375, "epoch": 0.18057142857142858, "grad_norm": 0.1757160723209381, "kl": 0.0034437179565429688, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.780358823396352e-07, "loss": 0.0001, "reward": 0.4231220823712647, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4231220823712647, "reward_after_std": 0.6513553634285927, "reward_before_mean": 0.533676290884614, "reward_before_std": 0.6418947353959084, "reward_change_max": 0.00046910345554351807, "reward_change_mean": -0.11055420152842999, "reward_change_min": -0.19882715586572886, "reward_change_std": 0.07370695215649903, "reward_std": 0.651355367153883, "rewards/cosine_scaled_reward": 0.11058812821283937, "rewards/format_reward": 0.31250000558793545, "step": 158 }, { "advantage_max": 1.4679991006851196, "advantage_mean": -2.2351742678949904e-08, "advantage_min": -1.1628785654902458, "advantage_std": 0.9997387602925301, "completion_length": 2613.8958740234375, "epoch": 0.18171428571428572, "grad_norm": 0.1862615942955017, "kl": 0.003566741943359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.758773376468604e-07, "loss": 0.0001, "reward": -0.05008651316165924, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.05008651316165924, "reward_after_std": 0.4757378753274679, "reward_before_mean": 0.023678398691117764, "reward_before_std": 0.48022647947072983, "reward_change_max": 0.0001963600516319275, "reward_change_mean": -0.07376493467018008, "reward_change_min": -0.13229443226009607, "reward_change_std": 0.05242071067914367, "reward_std": 0.4757378753274679, "rewards/cosine_scaled_reward": -0.22774413786828518, "rewards/format_reward": 0.4791666679084301, "step": 159 }, { "advantage_max": 1.4685984998941422, "advantage_mean": -1.1796752907855534e-08, "advantage_min": -1.1256564185023308, "advantage_std": 0.9998102709650993, "completion_length": 2535.125015258789, "epoch": 0.18285714285714286, "grad_norm": 0.2220413088798523, "kl": 0.005443572998046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": 0.42259710282087326, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.42259710282087326, "reward_after_std": 0.904817771166563, "reward_before_mean": 0.5270836362615228, "reward_before_std": 0.9138357825577259, "reward_change_max": 7.965415716171265e-05, "reward_change_mean": -0.10448656626977026, "reward_change_min": -0.21647041756659746, "reward_change_std": 0.08518222416751087, "reward_std": 0.9048177935183048, "rewards/cosine_scaled_reward": 0.06562515255063772, "rewards/format_reward": 0.39583333395421505, "step": 160 }, { "advantage_max": 1.5048878341913223, "advantage_mean": -1.9247333005179e-08, "advantage_min": -1.1224373206496239, "advantage_std": 0.9997854977846146, "completion_length": 2536.5625610351562, "epoch": 0.184, "grad_norm": 0.21401101350784302, "kl": 0.0053272247314453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.715127058347614e-07, "loss": 0.0002, "reward": 0.34695685049518943, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34695685049518943, "reward_after_std": 0.7117992416024208, "reward_before_mean": 0.44934904761612415, "reward_before_std": 0.7109068520367146, "reward_change_max": 3.3311545848846436e-05, "reward_change_mean": -0.10239221714437008, "reward_change_min": -0.17907678615301847, "reward_change_std": 0.06959949876181781, "reward_std": 0.7117992583662271, "rewards/cosine_scaled_reward": -0.04615880874916911, "rewards/format_reward": 0.5416666697710752, "step": 161 }, { "advantage_max": 1.2772861272096634, "advantage_mean": -2.483526606589237e-09, "advantage_min": -1.1813317835330963, "advantage_std": 0.9998279586434364, "completion_length": 2731.312515258789, "epoch": 0.18514285714285714, "grad_norm": 0.2270742654800415, "kl": 0.006060600280761719, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.693068314414344e-07, "loss": 0.0002, "reward": 0.3163473308086395, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3163473308086395, "reward_after_std": 0.8020492419600487, "reward_before_mean": 0.41752319410443306, "reward_before_std": 0.827314168214798, "reward_change_max": 0.0001702532172203064, "reward_change_mean": -0.10117586515843868, "reward_change_min": -0.2129524489864707, "reward_change_std": 0.08441391820088029, "reward_std": 0.8020492419600487, "rewards/cosine_scaled_reward": -0.009988403879106045, "rewards/format_reward": 0.43750000558793545, "step": 162 }, { "advantage_max": 1.4550439938902855, "advantage_mean": 3.104407619858307e-09, "advantage_min": -1.16783557087183, "advantage_std": 0.9997671395540237, "completion_length": 2460.4583892822266, "epoch": 0.18628571428571428, "grad_norm": 0.20131473243236542, "kl": 0.0035390853881835938, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.670853944836176e-07, "loss": 0.0001, "reward": 0.6401930401916616, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6401930401916616, "reward_after_std": 0.7289017662405968, "reward_before_mean": 0.7708904361352324, "reward_before_std": 0.7250148011371493, "reward_change_max": 0.0, "reward_change_mean": -0.13069738494232297, "reward_change_min": -0.21607584971934557, "reward_change_std": 0.0865078882779926, "reward_std": 0.7289017718285322, "rewards/cosine_scaled_reward": 0.09377853712067008, "rewards/format_reward": 0.5833333488553762, "step": 163 }, { "advantage_max": 1.3210031017661095, "advantage_mean": -5.774200284580644e-08, "advantage_min": -1.2639915123581886, "advantage_std": 0.9998083263635635, "completion_length": 2332.500045776367, "epoch": 0.18742857142857142, "grad_norm": 0.20200245082378387, "kl": 0.004815101623535156, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.545483585447073, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.545483585447073, "reward_after_std": 0.7864016853272915, "reward_before_mean": 0.6684325840324163, "reward_before_std": 0.8049891255795956, "reward_change_max": 0.00083199143409729, "reward_change_mean": -0.12294902792200446, "reward_change_min": -0.23326233215630054, "reward_change_std": 0.09470712952315807, "reward_std": 0.7864017356187105, "rewards/cosine_scaled_reward": 0.042549606412649155, "rewards/format_reward": 0.5833333414047956, "step": 164 }, { "advantage_max": 1.3517840281128883, "advantage_mean": -3.4148494032493204e-09, "advantage_min": -1.3420864343643188, "advantage_std": 0.9997818022966385, "completion_length": 2552.541679382324, "epoch": 0.18857142857142858, "grad_norm": 0.22521643340587616, "kl": 0.004207611083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": -0.1310715600848198, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1310715600848198, "reward_after_std": 0.4917597956955433, "reward_before_mean": -0.06424580817110837, "reward_before_std": 0.5039008930325508, "reward_change_max": 7.886439561843872e-05, "reward_change_mean": -0.0668257491197437, "reward_change_min": -0.13053180649876595, "reward_change_std": 0.053368333261460066, "reward_std": 0.4917598068714142, "rewards/cosine_scaled_reward": -0.250872902572155, "rewards/format_reward": 0.4375000074505806, "step": 165 }, { "advantage_max": 1.5099513083696365, "advantage_mean": 1.9247334170913177e-08, "advantage_min": -1.0428318604826927, "advantage_std": 0.9998095035552979, "completion_length": 2697.6458740234375, "epoch": 0.18971428571428572, "grad_norm": 0.24739572405815125, "kl": 0.0027518272399902344, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": 0.15346253104507923, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15346253104507923, "reward_after_std": 0.7710051350295544, "reward_before_mean": 0.23662841320037842, "reward_before_std": 0.7812362983822823, "reward_change_max": 0.00016464293003082275, "reward_change_mean": -0.08316585887223482, "reward_change_min": -0.1772454548627138, "reward_change_std": 0.07039047335274518, "reward_std": 0.7710051573812962, "rewards/cosine_scaled_reward": -0.10043580364435911, "rewards/format_reward": 0.4375, "step": 166 }, { "advantage_max": 1.4335933923721313, "advantage_mean": -1.5522043650406658e-09, "advantage_min": -1.2036648765206337, "advantage_std": 0.9998134821653366, "completion_length": 2456.541732788086, "epoch": 0.19085714285714286, "grad_norm": 0.2020367830991745, "kl": 0.0030541419982910156, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.580461976679099e-07, "loss": 0.0001, "reward": 0.34286654088646173, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.34286654088646173, "reward_after_std": 0.8823619969189167, "reward_before_mean": 0.4425388155505061, "reward_before_std": 0.903926894068718, "reward_change_max": 0.00046034157276153564, "reward_change_mean": -0.09967226395383477, "reward_change_min": -0.22274313494563103, "reward_change_std": 0.08939659083262086, "reward_std": 0.882362000644207, "rewards/cosine_scaled_reward": -0.09123059129342437, "rewards/format_reward": 0.6250000111758709, "step": 167 }, { "advantage_max": 1.4103640839457512, "advantage_mean": -2.173086061096896e-08, "advantage_min": -1.219421647489071, "advantage_std": 0.9998368471860886, "completion_length": 2891.375, "epoch": 0.192, "grad_norm": 0.18209148943424225, "kl": 0.0035066604614257812, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.557485869176825e-07, "loss": 0.0001, "reward": 0.23698728531599045, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.23698728531599045, "reward_after_std": 0.8624007292091846, "reward_before_mean": 0.326487647369504, "reward_before_std": 0.8801949247717857, "reward_change_max": 0.0002350136637687683, "reward_change_mean": -0.08950036205351353, "reward_change_min": -0.19595735147595406, "reward_change_std": 0.0768729702103883, "reward_std": 0.8624007403850555, "rewards/cosine_scaled_reward": -0.07633952237665653, "rewards/format_reward": 0.47916668094694614, "step": 168 }, { "advantage_max": 1.5285159349441528, "advantage_mean": 3.725290464995368e-08, "advantage_min": -1.2007123529911041, "advantage_std": 0.9997627809643745, "completion_length": 1906.6250457763672, "epoch": 0.19314285714285714, "grad_norm": 0.4720163941383362, "kl": 0.0038690567016601562, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 1.2066361154429615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.2066361154429615, "reward_after_std": 0.6004377212375402, "reward_before_mean": 1.3932382222265005, "reward_before_std": 0.5733307562768459, "reward_change_max": 0.00018994510173797607, "reward_change_mean": -0.18660208210349083, "reward_change_min": -0.27627282589673996, "reward_change_std": 0.11074877297505736, "reward_std": 0.6004377249628305, "rewards/cosine_scaled_reward": 0.3007857669144869, "rewards/format_reward": 0.7916666697710752, "step": 169 }, { "advantage_max": 1.613743469119072, "advantage_mean": 5.3395831089986245e-08, "advantage_min": -1.0112006813287735, "advantage_std": 0.999798871576786, "completion_length": 2178.145896911621, "epoch": 0.19428571428571428, "grad_norm": 0.2254769206047058, "kl": 0.003116130828857422, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.511087728614862e-07, "loss": 0.0001, "reward": 0.48212924622930586, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.48212924622930586, "reward_after_std": 0.6476732306182384, "reward_before_mean": 0.5986989340744913, "reward_before_std": 0.6297142971307039, "reward_change_max": 0.00013228505849838257, "reward_change_mean": -0.11656963010318577, "reward_change_min": -0.20530240423977375, "reward_change_std": 0.08074354752898216, "reward_std": 0.6476732343435287, "rewards/cosine_scaled_reward": 0.018099449574947357, "rewards/format_reward": 0.5625000055879354, "step": 170 }, { "advantage_max": 1.4391934275627136, "advantage_mean": -9.934107980669182e-09, "advantage_min": -1.1576652973890305, "advantage_std": 0.9997934475541115, "completion_length": 2325.0000610351562, "epoch": 0.19542857142857142, "grad_norm": 0.19575172662734985, "kl": 0.0027556419372558594, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.4846016988158226, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4846016988158226, "reward_after_std": 0.7373366430401802, "reward_before_mean": 0.6001935666427016, "reward_before_std": 0.743379769846797, "reward_change_max": 9.828805923461914e-05, "reward_change_mean": -0.11559185804799199, "reward_change_min": -0.20142329763621092, "reward_change_std": 0.07610836182720959, "reward_std": 0.7373366467654705, "rewards/cosine_scaled_reward": 0.029263429809361696, "rewards/format_reward": 0.5416666679084301, "step": 171 }, { "advantage_max": 1.5662109777331352, "advantage_mean": -9.809931267312777e-08, "advantage_min": -1.1238925158977509, "advantage_std": 0.9997661337256432, "completion_length": 2761.604179382324, "epoch": 0.19657142857142856, "grad_norm": 0.2239156812429428, "kl": 0.0050258636474609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.48637349624186754, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.48637349624186754, "reward_after_std": 0.5919462293386459, "reward_before_mean": 0.6056147422641516, "reward_before_std": 0.5790109075605869, "reward_change_max": 0.0006321147084236145, "reward_change_mean": -0.1192412911914289, "reward_change_min": -0.19068229012191296, "reward_change_std": 0.07716060453094542, "reward_std": 0.5919462330639362, "rewards/cosine_scaled_reward": 0.11530737672001123, "rewards/format_reward": 0.3750000037252903, "step": 172 }, { "advantage_max": 1.5436064153909683, "advantage_mean": -8.785476313111218e-08, "advantage_min": -0.9907987825572491, "advantage_std": 0.9997530058026314, "completion_length": 1651.7917175292969, "epoch": 0.1977142857142857, "grad_norm": 0.270830363035202, "kl": 0.004105567932128906, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.38264737790450454, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.38264737790450454, "reward_after_std": 0.6842790376394987, "reward_before_mean": 0.4890690501779318, "reward_before_std": 0.6811281414702535, "reward_change_max": 0.000122949481010437, "reward_change_mean": -0.10642170021310449, "reward_change_min": -0.1934090843424201, "reward_change_std": 0.07530650636181235, "reward_std": 0.6842790395021439, "rewards/cosine_scaled_reward": -0.1304654898121953, "rewards/format_reward": 0.75, "step": 173 }, { "advantage_max": 1.4847132563591003, "advantage_mean": 2.2817404077279946e-08, "advantage_min": -1.3459831699728966, "advantage_std": 0.9998100623488426, "completion_length": 2408.666702270508, "epoch": 0.19885714285714284, "grad_norm": 0.25999969244003296, "kl": 0.0075206756591796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.416539554784089e-07, "loss": 0.0003, "reward": 0.32197510451078415, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.32197510451078415, "reward_after_std": 0.6608857288956642, "reward_before_mean": 0.42462080453697126, "reward_before_std": 0.6630401350557804, "reward_change_max": 0.00017840415239334106, "reward_change_mean": -0.10264569194987416, "reward_change_min": -0.1863954644650221, "reward_change_std": 0.07458728924393654, "reward_std": 0.6608857549726963, "rewards/cosine_scaled_reward": -0.10018960013985634, "rewards/format_reward": 0.6250000074505806, "step": 174 }, { "advantage_max": 1.4769642427563667, "advantage_mean": 1.2852251740635978e-07, "advantage_min": -1.2436860725283623, "advantage_std": 0.9996817782521248, "completion_length": 2774.625045776367, "epoch": 0.2, "grad_norm": 0.17187584936618805, "kl": 0.0044689178466796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.5743710789829493, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5743710789829493, "reward_after_std": 0.5426803417503834, "reward_before_mean": 0.7062350884079933, "reward_before_std": 0.538358046207577, "reward_change_max": 0.0001729428768157959, "reward_change_mean": -0.1318639765959233, "reward_change_min": -0.1999199902638793, "reward_change_std": 0.08338183793239295, "reward_std": 0.5426803696900606, "rewards/cosine_scaled_reward": 0.10311753861606121, "rewards/format_reward": 0.5000000055879354, "step": 175 }, { "advantage_max": 1.4718908816576004, "advantage_mean": -3.476937759927523e-08, "advantage_min": -0.9987376481294632, "advantage_std": 0.9998609870672226, "completion_length": 2058.0625610351562, "epoch": 0.20114285714285715, "grad_norm": 0.2753830552101135, "kl": 0.0045604705810546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.6023684218525887, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.6023684218525887, "reward_after_std": 0.9104252867400646, "reward_before_mean": 0.7241084277629852, "reward_before_std": 0.9173421896994114, "reward_change_max": 2.7664005756378174e-05, "reward_change_mean": -0.12173997610807419, "reward_change_min": -0.2230681637302041, "reward_change_std": 0.09054337767884135, "reward_std": 0.9104253090918064, "rewards/cosine_scaled_reward": 0.018304186407476664, "rewards/format_reward": 0.6875000055879354, "step": 176 }, { "advantage_max": 1.3292637690901756, "advantage_mean": -4.346171089686379e-09, "advantage_min": -1.3279682248830795, "advantage_std": 0.9997624382376671, "completion_length": 2565.750045776367, "epoch": 0.2022857142857143, "grad_norm": 0.33500978350639343, "kl": 0.006360054016113281, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.344131861991828e-07, "loss": 0.0003, "reward": 0.4371657082810998, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4371657082810998, "reward_after_std": 0.706968255341053, "reward_before_mean": 0.552201054408215, "reward_before_std": 0.7253951374441385, "reward_change_max": 0.0002434253692626953, "reward_change_mean": -0.11503535695374012, "reward_change_min": -0.20339765585958958, "reward_change_std": 0.08484502066858113, "reward_std": 0.7069682665169239, "rewards/cosine_scaled_reward": -0.015566141344606876, "rewards/format_reward": 0.5833333432674408, "step": 177 }, { "advantage_max": 1.3190075904130936, "advantage_mean": -2.607703353252333e-08, "advantage_min": -1.2449935302138329, "advantage_std": 0.9998176321387291, "completion_length": 2517.604202270508, "epoch": 0.20342857142857143, "grad_norm": 0.2425314486026764, "kl": 0.009618759155273438, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.319717151140072e-07, "loss": 0.0004, "reward": 0.3113237756770104, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3113237756770104, "reward_after_std": 0.8060121387243271, "reward_before_mean": 0.4120410708710551, "reward_before_std": 0.8318094648420811, "reward_change_max": 3.4168362617492676e-05, "reward_change_mean": -0.10071731125935912, "reward_change_min": -0.2089853510260582, "reward_change_std": 0.08568191109225154, "reward_std": 0.8060121387243271, "rewards/cosine_scaled_reward": -0.0543961301445961, "rewards/format_reward": 0.5208333358168602, "step": 178 }, { "advantage_max": 1.666494145989418, "advantage_mean": 6.457170187434969e-08, "advantage_min": -0.9503717795014381, "advantage_std": 0.9996859654784203, "completion_length": 2578.2916870117188, "epoch": 0.20457142857142857, "grad_norm": 0.18489985167980194, "kl": 0.0033893585205078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.295165011252396e-07, "loss": 0.0001, "reward": -0.011646052822470665, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.011646052822470665, "reward_after_std": 0.5355609776452184, "reward_before_mean": 0.06075285002589226, "reward_before_std": 0.5213664807379246, "reward_change_max": 8.018314838409424e-05, "reward_change_mean": -0.07239888049662113, "reward_change_min": -0.12431587558239698, "reward_change_std": 0.04638653458096087, "reward_std": 0.5355609860271215, "rewards/cosine_scaled_reward": -0.18837358802556992, "rewards/format_reward": 0.4375, "step": 179 }, { "advantage_max": 1.6271196901798248, "advantage_mean": -2.173086099954702e-08, "advantage_min": -1.081467144191265, "advantage_std": 0.9998771697282791, "completion_length": 1909.5625381469727, "epoch": 0.2057142857142857, "grad_norm": 0.28603145480155945, "kl": 0.0059146881103515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.270476638965461e-07, "loss": 0.0002, "reward": 0.6225273078307509, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6225273078307509, "reward_after_std": 1.0300602950155735, "reward_before_mean": 0.7391243830788881, "reward_before_std": 1.0257667750120163, "reward_change_max": 0.00026930123567581177, "reward_change_mean": -0.11659704940393567, "reward_change_min": -0.22699564415961504, "reward_change_std": 0.08568047219887376, "reward_std": 1.0300603322684765, "rewards/cosine_scaled_reward": 0.015395501744933426, "rewards/format_reward": 0.7083333414047956, "step": 180 }, { "advantage_max": 1.4436516463756561, "advantage_mean": -4.718701129835523e-08, "advantage_min": -1.2535812556743622, "advantage_std": 0.9998086541891098, "completion_length": 2982.5625610351562, "epoch": 0.20685714285714285, "grad_norm": 0.18748371303081512, "kl": 0.0066986083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.245653237555705e-07, "loss": 0.0003, "reward": 0.33492581988684833, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33492581988684833, "reward_after_std": 0.6875773407518864, "reward_before_mean": 0.4378726640716195, "reward_before_std": 0.688941452652216, "reward_change_max": 0.0001654699444770813, "reward_change_mean": -0.10294683976098895, "reward_change_min": -0.18089472129940987, "reward_change_std": 0.07351072086021304, "reward_std": 0.6875773705542088, "rewards/cosine_scaled_reward": 0.00018630968406796455, "rewards/format_reward": 0.43750000186264515, "step": 181 }, { "advantage_max": 1.4636868089437485, "advantage_mean": -2.793967790459817e-08, "advantage_min": -1.205622598528862, "advantage_std": 0.9998370632529259, "completion_length": 2015.520866394043, "epoch": 0.208, "grad_norm": 0.18570247292518616, "kl": 0.0036478042602539062, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.6111249923706055, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6111249923706055, "reward_after_std": 0.8329652547836304, "reward_before_mean": 0.7355204597115517, "reward_before_std": 0.8338058553636074, "reward_change_max": 0.0002494007349014282, "reward_change_mean": -0.12439546594396234, "reward_change_min": -0.2062538806349039, "reward_change_std": 0.0867111561819911, "reward_std": 0.8329653143882751, "rewards/cosine_scaled_reward": 0.03442687960341573, "rewards/format_reward": 0.6666666716337204, "step": 182 }, { "advantage_max": 1.3377911821007729, "advantage_mean": -4.03573219287523e-09, "advantage_min": -1.3339354917407036, "advantage_std": 0.9998266994953156, "completion_length": 1787.0209045410156, "epoch": 0.20914285714285713, "grad_norm": 0.23595218360424042, "kl": 0.00778961181640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": 0.7993416367098689, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7993416367098689, "reward_after_std": 0.7656833119690418, "reward_before_mean": 0.9461353290826082, "reward_before_std": 0.7740000747144222, "reward_change_max": 0.0, "reward_change_mean": -0.14679364021867514, "reward_change_min": -0.25638253428041935, "reward_change_std": 0.09901249408721924, "reward_std": 0.7656833156943321, "rewards/cosine_scaled_reward": 0.06681763380765915, "rewards/format_reward": 0.812500013038516, "step": 183 }, { "advantage_max": 1.4900680482387543, "advantage_mean": -3.973643020183104e-08, "advantage_min": -0.9935832992196083, "advantage_std": 0.9997612237930298, "completion_length": 2492.583366394043, "epoch": 0.2102857142857143, "grad_norm": 0.2290632426738739, "kl": 0.005229949951171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": -0.07437216304242611, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.07437216304242611, "reward_after_std": 0.46923505514860153, "reward_before_mean": -0.003364154603332281, "reward_before_std": 0.46698611602187157, "reward_change_max": 0.00026264041662216187, "reward_change_mean": -0.07100802287459373, "reward_change_min": -0.13311471231281757, "reward_change_std": 0.05143853323534131, "reward_std": 0.46923505887389183, "rewards/cosine_scaled_reward": -0.23084875382483006, "rewards/format_reward": 0.4583333358168602, "step": 184 }, { "advantage_max": 1.5278790444135666, "advantage_mean": -4.346172866043219e-09, "advantage_min": -1.13626679033041, "advantage_std": 0.9996984899044037, "completion_length": 2207.1666870117188, "epoch": 0.21142857142857144, "grad_norm": 0.24160081148147583, "kl": 0.005153656005859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": -0.17866181893623434, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.17866181893623434, "reward_after_std": 0.42477146722376347, "reward_before_mean": -0.1162537019699812, "reward_before_std": 0.4256909815594554, "reward_change_max": 0.0010868832468986511, "reward_change_mean": -0.0624081171117723, "reward_change_min": -0.11495065130293369, "reward_change_std": 0.04580365400761366, "reward_std": 0.42477147467434406, "rewards/cosine_scaled_reward": -0.30812685564160347, "rewards/format_reward": 0.5000000111758709, "step": 185 }, { "advantage_max": 1.419815257191658, "advantage_mean": -2.2972623581196672e-08, "advantage_min": -1.1030875816941261, "advantage_std": 0.9998009353876114, "completion_length": 2772.6666870117188, "epoch": 0.21257142857142858, "grad_norm": 0.19030563533306122, "kl": 0.005794525146484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": 0.13442187756299973, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.13442187756299973, "reward_after_std": 0.6170549169182777, "reward_before_mean": 0.22131559252738953, "reward_before_std": 0.6231498755514622, "reward_change_max": 0.0005284175276756287, "reward_change_mean": -0.08689371170476079, "reward_change_min": -0.16148117743432522, "reward_change_std": 0.06356596038676798, "reward_std": 0.6170549280941486, "rewards/cosine_scaled_reward": -0.08725888282060623, "rewards/format_reward": 0.39583333395421505, "step": 186 }, { "advantage_max": 1.4794887602329254, "advantage_mean": 2.095475859498208e-08, "advantage_min": -1.2403139621019363, "advantage_std": 0.999807707965374, "completion_length": 1984.0208587646484, "epoch": 0.21371428571428572, "grad_norm": 0.22693683207035065, "kl": 0.006984710693359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.093945422764069e-07, "loss": 0.0003, "reward": 0.44079733826220036, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.44079733826220036, "reward_after_std": 0.6123870965093374, "reward_before_mean": 0.5555229228921235, "reward_before_std": 0.6088758781552315, "reward_change_max": 0.00014670193195343018, "reward_change_mean": -0.11472555063664913, "reward_change_min": -0.19294564425945282, "reward_change_std": 0.07350781094282866, "reward_std": 0.6123871095478535, "rewards/cosine_scaled_reward": -0.07640522718429565, "rewards/format_reward": 0.7083333395421505, "step": 187 }, { "advantage_max": 1.4657558798789978, "advantage_mean": 4.967053546245381e-09, "advantage_min": -1.0956667438149452, "advantage_std": 0.9997967481613159, "completion_length": 3006.062530517578, "epoch": 0.21485714285714286, "grad_norm": 0.1615750640630722, "kl": 0.0056781768798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": -0.03509191796183586, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03509191796183586, "reward_after_std": 0.7077936753630638, "reward_before_mean": 0.03107014298439026, "reward_before_std": 0.7118645720183849, "reward_change_max": 0.0002888292074203491, "reward_change_mean": -0.0661620597820729, "reward_change_min": -0.12805389054119587, "reward_change_std": 0.04991923741181381, "reward_std": 0.7077937126159668, "rewards/cosine_scaled_reward": -0.15113160910550505, "rewards/format_reward": 0.33333333395421505, "step": 188 }, { "advantage_max": 1.4838997721672058, "advantage_mean": -1.862645193639878e-08, "advantage_min": -1.0723033919930458, "advantage_std": 0.9998086094856262, "completion_length": 2195.3750228881836, "epoch": 0.216, "grad_norm": 0.2977884113788605, "kl": 0.005596160888671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.04235151541222e-07, "loss": 0.0002, "reward": 0.4038459522125777, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4038459522125777, "reward_after_std": 0.660543642938137, "reward_before_mean": 0.5137117095291615, "reward_before_std": 0.6600365824997425, "reward_change_max": 1.96993350982666e-05, "reward_change_mean": -0.10986578883603215, "reward_change_min": -0.18296963907778263, "reward_change_std": 0.07029087585397065, "reward_std": 0.660543654114008, "rewards/cosine_scaled_reward": -0.05564414896070957, "rewards/format_reward": 0.6250000037252903, "step": 189 }, { "advantage_max": 1.502543568611145, "advantage_mean": -2.5766593525489156e-08, "advantage_min": -1.2239737287163734, "advantage_std": 0.9998229667544365, "completion_length": 2273.187545776367, "epoch": 0.21714285714285714, "grad_norm": 0.1777806133031845, "kl": 0.005923271179199219, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.5966947921551764, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5966947921551764, "reward_after_std": 0.7224652580916882, "reward_before_mean": 0.7232264764606953, "reward_before_std": 0.7153876610100269, "reward_change_max": 0.00013045966625213623, "reward_change_mean": -0.12653166661038995, "reward_change_min": -0.21457305550575256, "reward_change_std": 0.08450583834201097, "reward_std": 0.722465269267559, "rewards/cosine_scaled_reward": 0.038696552932378836, "rewards/format_reward": 0.6458333376795053, "step": 190 }, { "advantage_max": 1.5981075763702393, "advantage_mean": 3.042320584345504e-08, "advantage_min": -1.092733584344387, "advantage_std": 0.9998094141483307, "completion_length": 1768.2292175292969, "epoch": 0.21828571428571428, "grad_norm": 0.22896713018417358, "kl": 0.005808830261230469, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.5460129454731941, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5460129454731941, "reward_after_std": 0.7109896428883076, "reward_before_mean": 0.6658507529646158, "reward_before_std": 0.6980491913855076, "reward_change_max": 0.00015036016702651978, "reward_change_mean": -0.11983778630383313, "reward_change_min": -0.2126011624932289, "reward_change_std": 0.0784097551368177, "reward_std": 0.7109896764159203, "rewards/cosine_scaled_reward": -0.021241309586912394, "rewards/format_reward": 0.7083333395421505, "step": 191 }, { "advantage_max": 1.5088998228311539, "advantage_mean": 1.3659398834242609e-08, "advantage_min": -1.123813882470131, "advantage_std": 0.9998009204864502, "completion_length": 2736.5625610351562, "epoch": 0.21942857142857142, "grad_norm": 0.26386189460754395, "kl": 0.0050640106201171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": 0.0696917362511158, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0696917362511158, "reward_after_std": 0.7020151615142822, "reward_before_mean": 0.1468326561152935, "reward_before_std": 0.7101566754281521, "reward_change_max": 0.0003652498126029968, "reward_change_mean": -0.0771409273147583, "reward_change_min": -0.16264318861067295, "reward_change_std": 0.06438883999362588, "reward_std": 0.7020151950418949, "rewards/cosine_scaled_reward": -0.1661670026369393, "rewards/format_reward": 0.47916667722165585, "step": 192 }, { "advantage_max": 1.4421148598194122, "advantage_mean": 2.545615107596433e-08, "advantage_min": -1.2296575456857681, "advantage_std": 0.9997586086392403, "completion_length": 2928.250030517578, "epoch": 0.22057142857142858, "grad_norm": 0.2043694406747818, "kl": 0.005222320556640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.93768694627233e-07, "loss": 0.0002, "reward": -0.035729264840483665, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.035729264840483665, "reward_after_std": 0.5215943157672882, "reward_before_mean": 0.0388828688301146, "reward_before_std": 0.5313386619091034, "reward_change_max": 0.00044645369052886963, "reward_change_mean": -0.07461211644113064, "reward_change_min": -0.13615632615983486, "reward_change_std": 0.057447490049526095, "reward_std": 0.5215943232178688, "rewards/cosine_scaled_reward": -0.1680585816502571, "rewards/format_reward": 0.3750000037252903, "step": 193 }, { "advantage_max": 1.486984170973301, "advantage_mean": -7.2022281916162e-08, "advantage_min": -1.0792049020528793, "advantage_std": 0.9998373165726662, "completion_length": 2874.937545776367, "epoch": 0.22171428571428572, "grad_norm": 0.20345580577850342, "kl": 0.006009101867675781, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.911220577405484e-07, "loss": 0.0002, "reward": 0.7174287736415863, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7174287736415863, "reward_after_std": 0.8496483601629734, "reward_before_mean": 0.8520793356001377, "reward_before_std": 0.8528431281447411, "reward_change_max": 0.0001317933201789856, "reward_change_mean": -0.1346505917608738, "reward_change_min": -0.2605106784030795, "reward_change_std": 0.10104376077651978, "reward_std": 0.8496483750641346, "rewards/cosine_scaled_reward": 0.16562299244105816, "rewards/format_reward": 0.5208333358168602, "step": 194 }, { "advantage_max": 1.2939805686473846, "advantage_mean": 8.881784197001252e-16, "advantage_min": -1.2831865474581718, "advantage_std": 0.9998500421643257, "completion_length": 2426.7083892822266, "epoch": 0.22285714285714286, "grad_norm": 0.25405922532081604, "kl": 0.0049610137939453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.884636689049422e-07, "loss": 0.0002, "reward": 0.44910696102306247, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.44910696102306247, "reward_after_std": 0.856367252767086, "reward_before_mean": 0.5602726228535175, "reward_before_std": 0.8805490881204605, "reward_change_max": 2.874433994293213e-05, "reward_change_mean": -0.11116565251722932, "reward_change_min": -0.24337132275104523, "reward_change_std": 0.09182561980560422, "reward_std": 0.8563672751188278, "rewards/cosine_scaled_reward": -0.04278036626055837, "rewards/format_reward": 0.6458333544433117, "step": 195 }, { "advantage_max": 1.2606007531285286, "advantage_mean": 1.2417634254191512e-08, "advantage_min": -1.360006719827652, "advantage_std": 0.9998254328966141, "completion_length": 3241.041717529297, "epoch": 0.224, "grad_norm": 0.15946926176548004, "kl": 0.0063762664794921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.857936576865356e-07, "loss": 0.0003, "reward": 0.1911745136603713, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1911745136603713, "reward_after_std": 0.7220529839396477, "reward_before_mean": 0.28443292406154796, "reward_before_std": 0.753384817391634, "reward_change_max": 0.0001894533634185791, "reward_change_mean": -0.09325835760682821, "reward_change_min": -0.18131935968995094, "reward_change_std": 0.07906473823823035, "reward_std": 0.7220530100166798, "rewards/cosine_scaled_reward": -0.03486689180135727, "rewards/format_reward": 0.354166679084301, "step": 196 }, { "advantage_max": 1.3448470905423164, "advantage_mean": -2.980232305382913e-08, "advantage_min": -1.2865313589572906, "advantage_std": 0.9998331740498543, "completion_length": 1494.7083740234375, "epoch": 0.22514285714285714, "grad_norm": 0.2721844017505646, "kl": 0.0056476593017578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": 0.8248235955834389, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8248235955834389, "reward_after_std": 0.8203364051878452, "reward_before_mean": 0.9715809598565102, "reward_before_std": 0.8329006358981133, "reward_change_max": 0.00021795183420181274, "reward_change_mean": -0.14675737638026476, "reward_change_min": -0.2606231663376093, "reward_change_std": 0.10089114168658853, "reward_std": 0.8203364387154579, "rewards/cosine_scaled_reward": 0.08995713107287884, "rewards/format_reward": 0.791666679084301, "step": 197 }, { "advantage_max": 1.4269006997346878, "advantage_mean": 2.4835268952472234e-08, "advantage_min": -1.2689987570047379, "advantage_std": 0.9997787326574326, "completion_length": 2175.0000381469727, "epoch": 0.22628571428571428, "grad_norm": 0.22729894518852234, "kl": 0.0071544647216796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.43701247684657574, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43701247684657574, "reward_after_std": 0.6505181044340134, "reward_before_mean": 0.5518691539764404, "reward_before_std": 0.6552633382380009, "reward_change_max": 5.833804607391357e-06, "reward_change_mean": -0.11485666548833251, "reward_change_min": -0.18947429209947586, "reward_change_std": 0.07660498935729265, "reward_std": 0.6505181305110455, "rewards/cosine_scaled_reward": -0.026148765347898006, "rewards/format_reward": 0.6041666828095913, "step": 198 }, { "advantage_max": 1.5523432940244675, "advantage_mean": -1.7384688688615313e-08, "advantage_min": -1.0372936129570007, "advantage_std": 0.9998035654425621, "completion_length": 2083.229248046875, "epoch": 0.22742857142857142, "grad_norm": 0.20395267009735107, "kl": 0.0061740875244140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.777151938545235e-07, "loss": 0.0002, "reward": 0.25621967762708664, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.25621967762708664, "reward_after_std": 0.6264994069933891, "reward_before_mean": 0.35232863295823336, "reward_before_std": 0.6190488487482071, "reward_change_max": 0.0001118779182434082, "reward_change_mean": -0.09610894275829196, "reward_change_min": -0.17422006744891405, "reward_change_std": 0.06503989174962044, "reward_std": 0.62649941816926, "rewards/cosine_scaled_reward": -0.2092523672617972, "rewards/format_reward": 0.7708333414047956, "step": 199 }, { "advantage_max": 1.3771865218877792, "advantage_mean": -4.097819494841559e-08, "advantage_min": -1.263512298464775, "advantage_std": 0.999853827059269, "completion_length": 1518.083381652832, "epoch": 0.22857142857142856, "grad_norm": 0.22084669768810272, "kl": 0.004482269287109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.75e-07, "loss": 0.0002, "reward": 0.930808313190937, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.930808313190937, "reward_after_std": 0.9711525030434132, "reward_before_mean": 1.083461206406355, "reward_before_std": 0.9862293135374784, "reward_change_max": 0.0, "reward_change_mean": -0.152652895078063, "reward_change_min": -0.27695308346301317, "reward_change_std": 0.1092049004510045, "reward_std": 0.9711525365710258, "rewards/cosine_scaled_reward": 0.08339726109988987, "rewards/format_reward": 0.916666679084301, "step": 200 }, { "advantage_max": 1.2871046587824821, "advantage_mean": -8.881784197001252e-16, "advantage_min": -1.2733745723962784, "advantage_std": 0.9998413845896721, "completion_length": 2204.479179382324, "epoch": 0.2297142857142857, "grad_norm": 0.19999176263809204, "kl": 0.00505828857421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 1.3313297554850578, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.3313297554850578, "reward_after_std": 0.8409098871052265, "reward_before_mean": 1.5278500989079475, "reward_before_std": 0.8497489392757416, "reward_change_max": 0.00014547258615493774, "reward_change_mean": -0.1965202996507287, "reward_change_min": -0.3145089754834771, "reward_change_std": 0.12920579919591546, "reward_std": 0.8409099094569683, "rewards/cosine_scaled_reward": 0.37850836105644703, "rewards/format_reward": 0.7708333432674408, "step": 201 }, { "advantage_max": 1.4420902132987976, "advantage_mean": -4.842877432409409e-08, "advantage_min": -1.206528678536415, "advantage_std": 0.9997935071587563, "completion_length": 1855.604190826416, "epoch": 0.23085714285714284, "grad_norm": 0.21113544702529907, "kl": 0.00447845458984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.695368466124296e-07, "loss": 0.0002, "reward": 0.9050917774438858, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9050917774438858, "reward_after_std": 0.6540236826986074, "reward_before_mean": 1.0633059060201049, "reward_before_std": 0.6418928802013397, "reward_change_max": 4.6059489250183105e-05, "reward_change_mean": -0.15821410715579987, "reward_change_min": -0.24650216195732355, "reward_change_std": 0.09680199483409524, "reward_std": 0.6540237125009298, "rewards/cosine_scaled_reward": 0.17748626694083214, "rewards/format_reward": 0.708333333954215, "step": 202 }, { "advantage_max": 1.6368324905633926, "advantage_mean": 1.862646592520889e-09, "advantage_min": -1.08430115878582, "advantage_std": 0.9998248592019081, "completion_length": 2852.2084350585938, "epoch": 0.232, "grad_norm": 0.18059028685092926, "kl": 0.008449554443359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.41513045597821474, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41513045597821474, "reward_after_std": 0.8612460158765316, "reward_before_mean": 0.5171395465731621, "reward_before_std": 0.8522833585739136, "reward_change_max": 7.91698694229126e-05, "reward_change_mean": -0.10200908035039902, "reward_change_min": -0.1845270236954093, "reward_change_std": 0.0716489371843636, "reward_std": 0.8612460680305958, "rewards/cosine_scaled_reward": 0.029403111548162997, "rewards/format_reward": 0.45833333767950535, "step": 203 }, { "advantage_max": 1.44488774985075, "advantage_mean": -3.104408552445648e-08, "advantage_min": -1.3494350239634514, "advantage_std": 0.9998263940215111, "completion_length": 2016.4375534057617, "epoch": 0.23314285714285715, "grad_norm": 0.2973000109195709, "kl": 0.008507728576660156, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.640308940816239e-07, "loss": 0.0003, "reward": 0.6798251471482217, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6798251471482217, "reward_after_std": 0.7647205218672752, "reward_before_mean": 0.8129912074655294, "reward_before_std": 0.7654086537659168, "reward_change_max": 0.0, "reward_change_mean": -0.13316606543958187, "reward_change_min": -0.21146363113075495, "reward_change_std": 0.0862268814817071, "reward_std": 0.764720544219017, "rewards/cosine_scaled_reward": 0.03149560187011957, "rewards/format_reward": 0.7500000186264515, "step": 204 }, { "advantage_max": 1.324913576245308, "advantage_mean": -3.849466811978175e-08, "advantage_min": -1.3274166509509087, "advantage_std": 0.9998679384589195, "completion_length": 2227.4375762939453, "epoch": 0.2342857142857143, "grad_norm": 0.20827358961105347, "kl": 0.0046520233154296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.9521224275231361, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9521224275231361, "reward_after_std": 0.9761649705469608, "reward_before_mean": 1.1085763350129128, "reward_before_std": 1.002737108618021, "reward_change_max": 0.0003203153610229492, "reward_change_mean": -0.15645390911959112, "reward_change_min": -0.27211152762174606, "reward_change_std": 0.11479099653661251, "reward_std": 0.9761650040745735, "rewards/cosine_scaled_reward": 0.18970483355224133, "rewards/format_reward": 0.729166679084301, "step": 205 }, { "advantage_max": 1.5361033529043198, "advantage_mean": -1.6142925440831846e-08, "advantage_min": -1.174651451408863, "advantage_std": 0.9998372942209244, "completion_length": 2660.250072479248, "epoch": 0.23542857142857143, "grad_norm": 0.20716576278209686, "kl": 0.0052280426025390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.584832158039378e-07, "loss": 0.0002, "reward": 0.29744825698435307, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.29744825698435307, "reward_after_std": 0.8509851843118668, "reward_before_mean": 0.39010776579380035, "reward_before_std": 0.8526091985404491, "reward_change_max": 0.0002521201968193054, "reward_change_mean": -0.09265952045097947, "reward_change_min": -0.1899539204314351, "reward_change_std": 0.07156781386584044, "reward_std": 0.8509852103888988, "rewards/cosine_scaled_reward": -0.10702944942750037, "rewards/format_reward": 0.604166679084301, "step": 206 }, { "advantage_max": 1.5442499741911888, "advantage_mean": -4.9049656836164246e-08, "advantage_min": -1.2186319679021835, "advantage_std": 0.9998384788632393, "completion_length": 2184.0834045410156, "epoch": 0.23657142857142857, "grad_norm": 0.24978166818618774, "kl": 0.006916046142578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.556940671764124e-07, "loss": 0.0003, "reward": 0.32800598815083504, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32800598815083504, "reward_after_std": 0.746575552970171, "reward_before_mean": 0.42756956070661545, "reward_before_std": 0.7494839541614056, "reward_change_max": 8.022040128707886e-05, "reward_change_mean": -0.09956359711941332, "reward_change_min": -0.17093972861766815, "reward_change_std": 0.06876332219690084, "reward_std": 0.7465755566954613, "rewards/cosine_scaled_reward": -0.14038189128041267, "rewards/format_reward": 0.7083333525806665, "step": 207 }, { "advantage_max": 1.4164851307868958, "advantage_mean": -1.3659397279930374e-08, "advantage_min": -1.213052585721016, "advantage_std": 0.9997989684343338, "completion_length": 2149.2709197998047, "epoch": 0.2377142857142857, "grad_norm": 0.2106243371963501, "kl": 0.0059680938720703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.528948933102438e-07, "loss": 0.0002, "reward": 0.5530873071402311, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5530873071402311, "reward_after_std": 0.6851087436079979, "reward_before_mean": 0.6776117645204067, "reward_before_std": 0.691135261207819, "reward_change_max": 0.0, "reward_change_mean": -0.12452444294467568, "reward_change_min": -0.22678932081907988, "reward_change_std": 0.08868333138525486, "reward_std": 0.6851087622344494, "rewards/cosine_scaled_reward": 0.01588919758796692, "rewards/format_reward": 0.6458333395421505, "step": 208 }, { "advantage_max": 1.6769540160894394, "advantage_mean": -7.82310966007671e-08, "advantage_min": -0.9029154442250729, "advantage_std": 0.9998540878295898, "completion_length": 2148.1875381469727, "epoch": 0.23885714285714285, "grad_norm": 0.2742181420326233, "kl": 0.00687408447265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.500858306332172e-07, "loss": 0.0003, "reward": 0.6304308408871293, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6304308408871293, "reward_after_std": 0.7765806205570698, "reward_before_mean": 0.7553490996360779, "reward_before_std": 0.7598440460860729, "reward_change_max": 0.00027485936880111694, "reward_change_mean": -0.12491830640647095, "reward_change_min": -0.21811598259955645, "reward_change_std": 0.0800136974430643, "reward_std": 0.7765806391835213, "rewards/cosine_scaled_reward": 0.07559122750535607, "rewards/format_reward": 0.6041666697710752, "step": 209 }, { "advantage_max": 1.2896546870470047, "advantage_mean": -1.3659397724019584e-08, "advantage_min": -1.2374619990587234, "advantage_std": 0.9997692629694939, "completion_length": 2183.437530517578, "epoch": 0.24, "grad_norm": 0.18591295182704926, "kl": 0.0058994293212890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.472670160550848e-07, "loss": 0.0002, "reward": 0.26463001780211926, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26463001780211926, "reward_after_std": 0.5814108606427908, "reward_before_mean": 0.36643529776483774, "reward_before_std": 0.5933164171874523, "reward_change_max": 0.00011651962995529175, "reward_change_mean": -0.10180524515453726, "reward_change_min": -0.1865832544863224, "reward_change_std": 0.07230387115851045, "reward_std": 0.5814108960330486, "rewards/cosine_scaled_reward": -0.12928236462175846, "rewards/format_reward": 0.6250000037252903, "step": 210 }, { "advantage_max": 1.483919121325016, "advantage_mean": -4.1599077849063804e-08, "advantage_min": -1.2655752003192902, "advantage_std": 0.9997261986136436, "completion_length": 2075.4583740234375, "epoch": 0.24114285714285713, "grad_norm": 0.19888249039649963, "kl": 0.006458282470703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.444385869608921e-07, "loss": 0.0003, "reward": 0.43373306343937656, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.43373306343937656, "reward_after_std": 0.5451980549842119, "reward_before_mean": 0.5491739911958575, "reward_before_std": 0.5315751228481531, "reward_change_max": 0.00027501583099365234, "reward_change_mean": -0.11544095072895288, "reward_change_min": -0.1886872909963131, "reward_change_std": 0.07334151910617948, "reward_std": 0.5451980568468571, "rewards/cosine_scaled_reward": -0.037913015112280846, "rewards/format_reward": 0.6250000111758709, "step": 211 }, { "advantage_max": 1.543802410364151, "advantage_mean": -6.519258100023961e-08, "advantage_min": -0.9943611100316048, "advantage_std": 0.9998081922531128, "completion_length": 1795.4375610351562, "epoch": 0.2422857142857143, "grad_norm": 0.2262740135192871, "kl": 0.00582122802734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.416006812042827e-07, "loss": 0.0002, "reward": 0.8692614883184433, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8692614883184433, "reward_after_std": 0.7634583115577698, "reward_before_mean": 1.0197798013687134, "reward_before_std": 0.752265190705657, "reward_change_max": 0.0, "reward_change_mean": -0.15051830047741532, "reward_change_min": -0.2744205743074417, "reward_change_std": 0.10307842120528221, "reward_std": 0.7634583376348019, "rewards/cosine_scaled_reward": 0.1244732104241848, "rewards/format_reward": 0.7708333376795053, "step": 212 }, { "advantage_max": 1.3446763902902603, "advantage_mean": 2.483527050678447e-09, "advantage_min": -1.2655547559261322, "advantage_std": 0.9998160228133202, "completion_length": 2237.6875076293945, "epoch": 0.24342857142857144, "grad_norm": 0.2671678960323334, "kl": 0.011074066162109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.387534371007797e-07, "loss": 0.0004, "reward": 0.5273108333349228, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5273108333349228, "reward_after_std": 0.727337971329689, "reward_before_mean": 0.6491544246673584, "reward_before_std": 0.737940214574337, "reward_change_max": 0.0, "reward_change_mean": -0.12184353871271014, "reward_change_min": -0.21076182555407286, "reward_change_std": 0.08359152404591441, "reward_std": 0.7273379862308502, "rewards/cosine_scaled_reward": 0.032910510897636414, "rewards/format_reward": 0.5833333414047956, "step": 213 }, { "advantage_max": 1.522321492433548, "advantage_mean": -1.2169282392893166e-07, "advantage_min": -1.2173484787344933, "advantage_std": 0.9997310861945152, "completion_length": 2353.854232788086, "epoch": 0.24457142857142858, "grad_norm": 0.2423325479030609, "kl": 0.00769805908203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.358969934210438e-07, "loss": 0.0003, "reward": 0.5639788303524256, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5639788303524256, "reward_after_std": 0.790889865718782, "reward_before_mean": 0.684724192135036, "reward_before_std": 0.7886148598045111, "reward_change_max": 0.00029237568378448486, "reward_change_mean": -0.12074539810419083, "reward_change_min": -0.2257308717817068, "reward_change_std": 0.08881830563768744, "reward_std": 0.790889891795814, "rewards/cosine_scaled_reward": 0.0402787746861577, "rewards/format_reward": 0.6041666753590107, "step": 214 }, { "advantage_max": 1.440918743610382, "advantage_mean": 5.587935336670569e-09, "advantage_min": -1.2240000292658806, "advantage_std": 0.9997818693518639, "completion_length": 1874.0417022705078, "epoch": 0.24571428571428572, "grad_norm": 0.2190721482038498, "kl": 0.0040130615234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.330314893841101e-07, "loss": 0.0002, "reward": 0.46307086013257504, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46307086013257504, "reward_after_std": 0.5368920117616653, "reward_before_mean": 0.5826256424188614, "reward_before_std": 0.5326369069516659, "reward_change_max": 0.00018672645092010498, "reward_change_mean": -0.11955476645380259, "reward_change_min": -0.1934266872704029, "reward_change_std": 0.07289872295223176, "reward_std": 0.5368920303881168, "rewards/cosine_scaled_reward": -0.11493718903511763, "rewards/format_reward": 0.8125000149011612, "step": 215 }, { "advantage_max": 1.4966644793748856, "advantage_mean": -5.587935592021864e-08, "advantage_min": -1.1622079610824585, "advantage_std": 0.9997863173484802, "completion_length": 1281.31254196167, "epoch": 0.24685714285714286, "grad_norm": 0.26192766427993774, "kl": 0.00527191162109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.301570646506027e-07, "loss": 0.0002, "reward": 0.8690951648168266, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8690951648168266, "reward_after_std": 0.7011812217533588, "reward_before_mean": 1.022690481506288, "reward_before_std": 0.6952761113643646, "reward_change_max": 0.0, "reward_change_mean": -0.15359534416347742, "reward_change_min": -0.2577803088352084, "reward_change_std": 0.09748534904792905, "reward_std": 0.7011812403798103, "rewards/cosine_scaled_reward": 0.07384524680674076, "rewards/format_reward": 0.8750000111758709, "step": 216 }, { "advantage_max": 1.5052600800991058, "advantage_mean": -4.221995697495373e-08, "advantage_min": -1.0247588083148003, "advantage_std": 0.999771237373352, "completion_length": 1675.0416793823242, "epoch": 0.248, "grad_norm": 0.22367313504219055, "kl": 0.0055751800537109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.27273859315928e-07, "loss": 0.0002, "reward": 0.8154791872948408, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8154791872948408, "reward_after_std": 0.7668120842427015, "reward_before_mean": 0.9590049833059311, "reward_before_std": 0.7504305392503738, "reward_change_max": 0.0, "reward_change_mean": -0.14352576108649373, "reward_change_min": -0.24628907442092896, "reward_change_std": 0.08640740357805043, "reward_std": 0.7668121261522174, "rewards/cosine_scaled_reward": 0.10450247849803418, "rewards/format_reward": 0.7500000055879354, "step": 217 }, { "advantage_max": 1.5412231981754303, "advantage_mean": 1.738468902168222e-08, "advantage_min": -0.9499373137950897, "advantage_std": 0.9997916966676712, "completion_length": 2136.4167098999023, "epoch": 0.24914285714285714, "grad_norm": 0.20119324326515198, "kl": 0.007266998291015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": 0.28319047950208187, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28319047950208187, "reward_after_std": 0.6024854965507984, "reward_before_mean": 0.3830933915451169, "reward_before_std": 0.5972005352377892, "reward_change_max": 0.00016600638628005981, "reward_change_mean": -0.09990292368456721, "reward_change_min": -0.17767559923231602, "reward_change_std": 0.06573458341881633, "reward_std": 0.6024855300784111, "rewards/cosine_scaled_reward": -0.14178664050996304, "rewards/format_reward": 0.6666666716337204, "step": 218 }, { "advantage_max": 1.5693671107292175, "advantage_mean": 2.1109978987077227e-08, "advantage_min": -1.0280758142471313, "advantage_std": 0.9998404160141945, "completion_length": 1953.8750457763672, "epoch": 0.2502857142857143, "grad_norm": 0.2856943905353546, "kl": 0.009334564208984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.214816693576234e-07, "loss": 0.0004, "reward": 0.4099986110813916, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4099986110813916, "reward_after_std": 0.7541120164096355, "reward_before_mean": 0.5179123759735376, "reward_before_std": 0.7560647651553154, "reward_change_max": 0.00018017739057540894, "reward_change_mean": -0.10791374277323484, "reward_change_min": -0.2042955392971635, "reward_change_std": 0.07851645350456238, "reward_std": 0.7541120536625385, "rewards/cosine_scaled_reward": -0.084793820977211, "rewards/format_reward": 0.6875000093132257, "step": 219 }, { "advantage_max": 1.5397311598062515, "advantage_mean": -3.197540970889534e-08, "advantage_min": -1.2701191380620003, "advantage_std": 0.9997361525893211, "completion_length": 1732.9583854675293, "epoch": 0.25142857142857145, "grad_norm": 0.2450757920742035, "kl": 0.006439208984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": 0.11072067031636834, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11072067031636834, "reward_after_std": 0.4471112582832575, "reward_before_mean": 0.1992113790474832, "reward_before_std": 0.4396040756255388, "reward_change_max": 0.0002147480845451355, "reward_change_mean": -0.08849071525037289, "reward_change_min": -0.14833337999880314, "reward_change_std": 0.05763935064896941, "reward_std": 0.4471112657338381, "rewards/cosine_scaled_reward": -0.2858109883964062, "rewards/format_reward": 0.7708333432674408, "step": 220 }, { "advantage_max": 1.6834911704063416, "advantage_mean": -7.698933590649659e-08, "advantage_min": -0.9756602942943573, "advantage_std": 0.9997707083821297, "completion_length": 1674.0416870117188, "epoch": 0.25257142857142856, "grad_norm": 0.2080233097076416, "kl": 0.0053730010986328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.156560487081051e-07, "loss": 0.0002, "reward": 0.7887346247443929, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7887346247443929, "reward_after_std": 0.6305991001427174, "reward_before_mean": 0.9336526445113122, "reward_before_std": 0.5975028732791543, "reward_change_max": 0.00025875866413116455, "reward_change_mean": -0.1449180180206895, "reward_change_min": -0.22527676168829203, "reward_change_std": 0.08657489949837327, "reward_std": 0.6305991094559431, "rewards/cosine_scaled_reward": 0.0814096424728632, "rewards/format_reward": 0.7708333395421505, "step": 221 }, { "advantage_max": 1.2725271508097649, "advantage_mean": -2.483526884144993e-08, "advantage_min": -1.186394453048706, "advantage_std": 0.9998194724321365, "completion_length": 1854.083381652832, "epoch": 0.2537142857142857, "grad_norm": 0.23417192697525024, "kl": 0.005588531494140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.127310565369415e-07, "loss": 0.0002, "reward": 0.6199398525059223, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6199398525059223, "reward_after_std": 0.8259677402675152, "reward_before_mean": 0.7486249133944511, "reward_before_std": 0.8462961055338383, "reward_change_max": 0.0, "reward_change_mean": -0.1286850469186902, "reward_change_min": -0.2560490546748042, "reward_change_std": 0.09627352794632316, "reward_std": 0.8259677961468697, "rewards/cosine_scaled_reward": 0.03056243620812893, "rewards/format_reward": 0.6875, "step": 222 }, { "advantage_max": 1.4652435034513474, "advantage_mean": -4.439304501779873e-08, "advantage_min": -1.1945854425430298, "advantage_std": 0.9997480288147926, "completion_length": 2072.9375228881836, "epoch": 0.25485714285714284, "grad_norm": 0.21853138506412506, "kl": 0.0057468414306640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.097981330836616e-07, "loss": 0.0002, "reward": 0.5158516289666295, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5158516289666295, "reward_after_std": 0.5560441594570875, "reward_before_mean": 0.6397079173475504, "reward_before_std": 0.5492154005914927, "reward_change_max": 0.0, "reward_change_mean": -0.12385630467906594, "reward_change_min": -0.20062089059501886, "reward_change_std": 0.07504534930922091, "reward_std": 0.5560441724956036, "rewards/cosine_scaled_reward": 0.017770618200302124, "rewards/format_reward": 0.6041666679084301, "step": 223 }, { "advantage_max": 1.2781447544693947, "advantage_mean": 9.002784961964494e-09, "advantage_min": -1.224984422326088, "advantage_std": 0.9998601526021957, "completion_length": 2220.416732788086, "epoch": 0.256, "grad_norm": 0.18231956660747528, "kl": 0.0063304901123046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.068574212948169e-07, "loss": 0.0003, "reward": 0.49894432350993156, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49894432350993156, "reward_after_std": 0.9219648316502571, "reward_before_mean": 0.6136458464898169, "reward_before_std": 0.9475033320486546, "reward_change_max": 0.0, "reward_change_mean": -0.11470149923115969, "reward_change_min": -0.23749073594808578, "reward_change_std": 0.09353986661881208, "reward_std": 0.9219648726284504, "rewards/cosine_scaled_reward": -0.05776042211800814, "rewards/format_reward": 0.729166679084301, "step": 224 }, { "advantage_max": 1.6483723670244217, "advantage_mean": 1.2417633921124605e-08, "advantage_min": -1.0310075506567955, "advantage_std": 0.9998345449566841, "completion_length": 2474.9792556762695, "epoch": 0.2571428571428571, "grad_norm": 0.2757589519023895, "kl": 0.011646270751953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.039090644965509e-07, "loss": 0.0005, "reward": 0.39897412806749344, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.39897412806749344, "reward_after_std": 0.7204340994358063, "reward_before_mean": 0.5055959932506084, "reward_before_std": 0.713385995477438, "reward_change_max": 5.5462121963500977e-05, "reward_change_mean": -0.10662184376269579, "reward_change_min": -0.1872426439076662, "reward_change_std": 0.07501717936247587, "reward_std": 0.7204341255128384, "rewards/cosine_scaled_reward": -0.04928534850478172, "rewards/format_reward": 0.6041666734963655, "step": 225 }, { "advantage_max": 1.368759848177433, "advantage_mean": -1.800557003495129e-08, "advantage_min": -1.1878944411873817, "advantage_std": 0.9998493194580078, "completion_length": 1795.6458892822266, "epoch": 0.2582857142857143, "grad_norm": 0.19780239462852478, "kl": 0.005168914794921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.009532063876148e-07, "loss": 0.0002, "reward": 0.9348589247092605, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9348589247092605, "reward_after_std": 0.9241700284183025, "reward_before_mean": 1.0913225067779422, "reward_before_std": 0.9455209393054247, "reward_change_max": 0.0, "reward_change_mean": -0.1564636155962944, "reward_change_min": -0.28128004260361195, "reward_change_std": 0.11181944841518998, "reward_std": 0.924170047044754, "rewards/cosine_scaled_reward": 0.13941125571727753, "rewards/format_reward": 0.8125000111758709, "step": 226 }, { "advantage_max": 1.5920889675617218, "advantage_mean": -9.934107758624577e-09, "advantage_min": -1.0377077758312225, "advantage_std": 0.9998761713504791, "completion_length": 1348.6875534057617, "epoch": 0.25942857142857145, "grad_norm": 0.26142218708992004, "kl": 0.009868621826171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.979899910323624e-07, "loss": 0.0004, "reward": 0.7289588078856468, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7289588078856468, "reward_after_std": 0.9257365316152573, "reward_before_mean": 0.8592088520526886, "reward_before_std": 0.9165628142654896, "reward_change_max": 0.0, "reward_change_mean": -0.1302500069141388, "reward_change_min": -0.24056414235383272, "reward_change_std": 0.0870923982001841, "reward_std": 0.9257365316152573, "rewards/cosine_scaled_reward": -0.028728928649798036, "rewards/format_reward": 0.916666679084301, "step": 227 }, { "advantage_max": 1.4398110508918762, "advantage_mean": 7.450580929990736e-09, "advantage_min": -1.056349277496338, "advantage_std": 0.9998444691300392, "completion_length": 1577.3125381469727, "epoch": 0.26057142857142856, "grad_norm": 0.25881561636924744, "kl": 0.006450653076171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.7870747782289982, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.7870747782289982, "reward_after_std": 0.8033650256693363, "reward_before_mean": 0.929570865817368, "reward_before_std": 0.8072608970105648, "reward_change_max": 0.0, "reward_change_mean": -0.1424960799049586, "reward_change_min": -0.25784505158662796, "reward_change_std": 0.09646610415074974, "reward_std": 0.803365059196949, "rewards/cosine_scaled_reward": 0.08978541730903089, "rewards/format_reward": 0.7500000074505806, "step": 228 }, { "advantage_max": 1.4671648442745209, "advantage_mean": -3.7563344879032456e-08, "advantage_min": -1.1452326700091362, "advantage_std": 0.9997798949480057, "completion_length": 1936.2500228881836, "epoch": 0.26171428571428573, "grad_norm": 0.25452426075935364, "kl": 0.007841110229492188, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": 0.6133114844560623, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6133114844560623, "reward_after_std": 0.6665918864309788, "reward_before_mean": 0.7417216263711452, "reward_before_std": 0.6557564614340663, "reward_change_max": 6.188452243804932e-05, "reward_change_mean": -0.12841017404571176, "reward_change_min": -0.2055542655289173, "reward_change_std": 0.08059329586103559, "reward_std": 0.66659190133214, "rewards/cosine_scaled_reward": 0.006277475506067276, "rewards/format_reward": 0.7291666753590107, "step": 229 }, { "advantage_max": 1.4177044332027435, "advantage_mean": -1.4280279403422469e-08, "advantage_min": -1.315854400396347, "advantage_std": 0.9998226910829544, "completion_length": 2070.937530517578, "epoch": 0.26285714285714284, "grad_norm": 0.23040302097797394, "kl": 0.0074100494384765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": 0.22305097430944443, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.22305097430944443, "reward_after_std": 0.6350849457085133, "reward_before_mean": 0.3175716698169708, "reward_before_std": 0.6423088535666466, "reward_change_max": 0.0005049854516983032, "reward_change_mean": -0.09452070062980056, "reward_change_min": -0.16764249559491873, "reward_change_std": 0.06898301001638174, "reward_std": 0.6350849494338036, "rewards/cosine_scaled_reward": -0.1849641827866435, "rewards/format_reward": 0.6875000111758709, "step": 230 }, { "advantage_max": 1.4866014271974564, "advantage_mean": -1.1051695258945671e-07, "advantage_min": -1.219824656844139, "advantage_std": 0.9998104348778725, "completion_length": 1799.8750686645508, "epoch": 0.264, "grad_norm": 0.21469198167324066, "kl": 0.0066547393798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.860664508377001e-07, "loss": 0.0003, "reward": 0.8363642990589142, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8363642990589142, "reward_after_std": 0.7526835240423679, "reward_before_mean": 0.9858303721994162, "reward_before_std": 0.7550496160984039, "reward_change_max": 0.0, "reward_change_mean": -0.1494661532342434, "reward_change_min": -0.2536203945055604, "reward_change_std": 0.10055593773722649, "reward_std": 0.7526835426688194, "rewards/cosine_scaled_reward": 0.0762485321611166, "rewards/format_reward": 0.8333333469927311, "step": 231 }, { "advantage_max": 1.6209463626146317, "advantage_mean": 4.967053879312289e-09, "advantage_min": -1.0295084789395332, "advantage_std": 0.9998287931084633, "completion_length": 2159.2083892822266, "epoch": 0.2651428571428571, "grad_norm": 0.24421940743923187, "kl": 0.0079345703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.83068622519821e-07, "loss": 0.0003, "reward": 0.21903796587139368, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.21903796587139368, "reward_after_std": 0.6888058856129646, "reward_before_mean": 0.3088764566928148, "reward_before_std": 0.6795613020658493, "reward_change_max": 0.0005556866526603699, "reward_change_mean": -0.0898384740576148, "reward_change_min": -0.15906614251434803, "reward_change_std": 0.061581351794302464, "reward_std": 0.6888059228658676, "rewards/cosine_scaled_reward": -0.18931178748607635, "rewards/format_reward": 0.687500013038516, "step": 232 }, { "advantage_max": 1.5372837334871292, "advantage_mean": -3.802900583327329e-08, "advantage_min": -1.0135958343744278, "advantage_std": 0.9997899383306503, "completion_length": 1570.333396911621, "epoch": 0.2662857142857143, "grad_norm": 0.267768532037735, "kl": 0.00606536865234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.800643086250121e-07, "loss": 0.0002, "reward": 0.48989987885579467, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48989987885579467, "reward_after_std": 0.6716056112200022, "reward_before_mean": 0.6066783485002816, "reward_before_std": 0.6613761279731989, "reward_change_max": 0.0, "reward_change_mean": -0.11677848640829325, "reward_change_min": -0.2160600544884801, "reward_change_std": 0.07633232930675149, "reward_std": 0.671605659648776, "rewards/cosine_scaled_reward": -0.12374416552484035, "rewards/format_reward": 0.854166679084301, "step": 233 }, { "advantage_max": 1.4362711906433105, "advantage_mean": 8.6923440667519e-09, "advantage_min": -1.2135839760303497, "advantage_std": 0.999776653945446, "completion_length": 2142.3750228881836, "epoch": 0.2674285714285714, "grad_norm": 0.2719072103500366, "kl": 0.00940704345703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.770536555792944e-07, "loss": 0.0004, "reward": 0.3730768244713545, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3730768244713545, "reward_after_std": 0.5923286378383636, "reward_before_mean": 0.4837719602510333, "reward_before_std": 0.5916496124118567, "reward_change_max": 0.0, "reward_change_mean": -0.11069511156529188, "reward_change_min": -0.1783575415611267, "reward_change_std": 0.07707143994048238, "reward_std": 0.592328667640686, "rewards/cosine_scaled_reward": -0.0914473719894886, "rewards/format_reward": 0.6666666772216558, "step": 234 }, { "advantage_max": 1.5493723526597023, "advantage_mean": -1.0679166395632933e-07, "advantage_min": -1.0974969416856766, "advantage_std": 0.9997179284691811, "completion_length": 1460.270866394043, "epoch": 0.26857142857142857, "grad_norm": 0.24478395283222198, "kl": 0.007293701171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 1.0726623684167862, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.0726623684167862, "reward_after_std": 0.5861970726400614, "reward_before_mean": 1.2478751856833696, "reward_before_std": 0.5587137271650136, "reward_change_max": 0.00025247782468795776, "reward_change_mean": -0.1752128468360752, "reward_change_min": -0.2610089210793376, "reward_change_std": 0.10384868178516626, "reward_std": 0.5861970763653517, "rewards/cosine_scaled_reward": 0.21768759936094284, "rewards/format_reward": 0.8125000074505806, "step": 235 }, { "advantage_max": 1.740464448928833, "advantage_mean": 4.440892098500626e-16, "advantage_min": -0.8375889658927917, "advantage_std": 0.9998397752642632, "completion_length": 2148.5416946411133, "epoch": 0.26971428571428574, "grad_norm": 0.19493846595287323, "kl": 0.0068416595458984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 0.37424314580857754, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.37424314580857754, "reward_after_std": 0.8406364396214485, "reward_before_mean": 0.47235390916466713, "reward_before_std": 0.8247300237417221, "reward_change_max": 0.0, "reward_change_mean": -0.098110759165138, "reward_change_min": -0.177609003148973, "reward_change_std": 0.062242650194093585, "reward_std": 0.8406364470720291, "rewards/cosine_scaled_reward": -0.09715638670604676, "rewards/format_reward": 0.6666666679084301, "step": 236 }, { "advantage_max": 1.5386316254734993, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -1.1397172287106514, "advantage_std": 0.9997890964150429, "completion_length": 1514.7500228881836, "epoch": 0.27085714285714285, "grad_norm": 0.21677693724632263, "kl": 0.00567626953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.679851303883891e-07, "loss": 0.0002, "reward": 0.7754391804337502, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7754391804337502, "reward_after_std": 0.5829839073121548, "reward_before_mean": 0.9211319833993912, "reward_before_std": 0.5614862740039825, "reward_change_max": 0.0, "reward_change_mean": -0.14569283719174564, "reward_change_min": -0.21358582749962807, "reward_change_std": 0.0806428431533277, "reward_std": 0.5829839296638966, "rewards/cosine_scaled_reward": 0.03348265402019024, "rewards/format_reward": 0.8541666716337204, "step": 237 }, { "advantage_max": 1.4392332583665848, "advantage_mean": -4.346172211011634e-08, "advantage_min": -1.210955560207367, "advantage_std": 0.9997563362121582, "completion_length": 1502.4791946411133, "epoch": 0.272, "grad_norm": 0.22297178208827972, "kl": 0.008113861083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.7160059418529272, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7160059418529272, "reward_after_std": 0.625291345641017, "reward_before_mean": 0.8567305975593626, "reward_before_std": 0.6137831769883633, "reward_change_max": 0.0, "reward_change_mean": -0.14072464779019356, "reward_change_min": -0.23825540859252214, "reward_change_std": 0.08909846004098654, "reward_std": 0.6252913642674685, "rewards/cosine_scaled_reward": -0.009134718915447593, "rewards/format_reward": 0.8750000074505806, "step": 238 }, { "advantage_max": 1.581406444311142, "advantage_mean": -2.6573738720614415e-07, "advantage_min": -0.9953296408057213, "advantage_std": 0.999721497297287, "completion_length": 1504.1458473205566, "epoch": 0.27314285714285713, "grad_norm": 0.2068232297897339, "kl": 0.0054912567138671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.619104492241847e-07, "loss": 0.0002, "reward": 1.0969355329871178, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0969355329871178, "reward_after_std": 0.6276315655559301, "reward_before_mean": 1.2723851148039103, "reward_before_std": 0.5967890359461308, "reward_change_max": 0.0002623945474624634, "reward_change_mean": -0.17544960090890527, "reward_change_min": -0.26079559326171875, "reward_change_std": 0.10241867695003748, "reward_std": 0.6276315916329622, "rewards/cosine_scaled_reward": 0.25077585806138813, "rewards/format_reward": 0.770833333954215, "step": 239 }, { "advantage_max": 1.5168597102165222, "advantage_mean": 8.692343844707295e-09, "advantage_min": -1.131408378481865, "advantage_std": 0.9997967407107353, "completion_length": 1919.458381652832, "epoch": 0.2742857142857143, "grad_norm": 0.29865968227386475, "kl": 0.00994873046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.588648530198504e-07, "loss": 0.0004, "reward": 0.2035725242458284, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2035725242458284, "reward_after_std": 0.5868874341249466, "reward_before_mean": 0.29640037566423416, "reward_before_std": 0.582292553037405, "reward_change_max": 0.00010397285223007202, "reward_change_mean": -0.09282782999798656, "reward_change_min": -0.1734831389039755, "reward_change_std": 0.06268787989392877, "reward_std": 0.5868874527513981, "rewards/cosine_scaled_reward": -0.1955498280003667, "rewards/format_reward": 0.687500013038516, "step": 240 }, { "advantage_max": 1.626302644610405, "advantage_mean": -2.235174290099451e-08, "advantage_min": -0.99515251070261, "advantage_std": 0.999805323779583, "completion_length": 2014.3333435058594, "epoch": 0.2754285714285714, "grad_norm": 0.26837101578712463, "kl": 0.01031494140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.558139508961654e-07, "loss": 0.0004, "reward": 0.06266416236758232, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06266416236758232, "reward_after_std": 0.5582391433417797, "reward_before_mean": 0.1420099101960659, "reward_before_std": 0.5503151379525661, "reward_change_max": 0.00012226402759552002, "reward_change_mean": -0.07934575085528195, "reward_change_min": -0.1459060488268733, "reward_change_std": 0.05173483118414879, "reward_std": 0.5582391582429409, "rewards/cosine_scaled_reward": -0.2727450542151928, "rewards/format_reward": 0.6875000074505806, "step": 241 }, { "advantage_max": 1.4509450048208237, "advantage_mean": 3.1044156134640843e-10, "advantage_min": -1.3363563306629658, "advantage_std": 0.9995779171586037, "completion_length": 1342.4166946411133, "epoch": 0.2765714285714286, "grad_norm": 0.2745201289653778, "kl": 0.013671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.527578915497951e-07, "loss": 0.0005, "reward": 0.4477707026526332, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4477707026526332, "reward_after_std": 0.5168483178131282, "reward_before_mean": 0.5664155303966254, "reward_before_std": 0.5090636860113591, "reward_change_max": 0.00021380186080932617, "reward_change_mean": -0.11864481214433908, "reward_change_min": -0.18819516710937023, "reward_change_std": 0.07314185099676251, "reward_std": 0.5168483252637088, "rewards/cosine_scaled_reward": -0.1751255802810192, "rewards/format_reward": 0.916666679084301, "step": 242 }, { "advantage_max": 1.4236368983983994, "advantage_mean": 3.1044091741705415e-09, "advantage_min": -1.275793395936489, "advantage_std": 0.9998544678092003, "completion_length": 1794.9584045410156, "epoch": 0.2777142857142857, "grad_norm": 0.20937462151050568, "kl": 0.00710296630859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.6848674118518829, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6848674118518829, "reward_after_std": 0.8243732713162899, "reward_before_mean": 0.8182772938162088, "reward_before_std": 0.8360154293477535, "reward_change_max": 9.438395500183105e-05, "reward_change_mean": -0.1334098584484309, "reward_change_min": -0.23731573671102524, "reward_change_std": 0.09346593916416168, "reward_std": 0.8243732936680317, "rewards/cosine_scaled_reward": 0.002888637245632708, "rewards/format_reward": 0.812500013038516, "step": 243 }, { "advantage_max": 1.5768461674451828, "advantage_mean": -3.725290742551124e-09, "advantage_min": -1.05535177141428, "advantage_std": 0.9997763559222221, "completion_length": 1677.4791984558105, "epoch": 0.27885714285714286, "grad_norm": 0.24402037262916565, "kl": 0.006748199462890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 0.6520390259101987, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6520390259101987, "reward_after_std": 0.6846922170370817, "reward_before_mean": 0.7843025382608175, "reward_before_std": 0.6760292388498783, "reward_change_max": 0.00015122443437576294, "reward_change_mean": -0.13226348999887705, "reward_change_min": -0.22804296016693115, "reward_change_std": 0.08690834417939186, "reward_std": 0.6846922319382429, "rewards/cosine_scaled_reward": -0.003682076930999756, "rewards/format_reward": 0.7916666734963655, "step": 244 }, { "advantage_max": 1.397861048579216, "advantage_mean": 9.3132264122886e-09, "advantage_min": -1.0036931559443474, "advantage_std": 0.9998713657259941, "completion_length": 2007.2916946411133, "epoch": 0.28, "grad_norm": 0.2225971817970276, "kl": 0.007488250732421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.435602608679916e-07, "loss": 0.0003, "reward": 0.7011168226599693, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7011168226599693, "reward_after_std": 1.021484598517418, "reward_before_mean": 0.8311672285199165, "reward_before_std": 1.0451684147119522, "reward_change_max": 4.5515596866607666e-05, "reward_change_mean": -0.13005035952664912, "reward_change_min": -0.26285023987293243, "reward_change_std": 0.10422047041356564, "reward_std": 1.0214846432209015, "rewards/cosine_scaled_reward": 0.0510002663359046, "rewards/format_reward": 0.7291666716337204, "step": 245 }, { "advantage_max": 1.483703851699829, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -1.0376464948058128, "advantage_std": 0.9998078644275665, "completion_length": 1695.2500381469727, "epoch": 0.28114285714285714, "grad_norm": 0.23753374814987183, "kl": 0.0087890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.404850645156841e-07, "loss": 0.0004, "reward": 0.5922103077173233, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5922103077173233, "reward_after_std": 0.6839786767959595, "reward_before_mean": 0.7197319120168686, "reward_before_std": 0.6809311471879482, "reward_change_max": 0.0, "reward_change_mean": -0.127521563321352, "reward_change_min": -0.22557256184518337, "reward_change_std": 0.08379548555240035, "reward_std": 0.6839786805212498, "rewards/cosine_scaled_reward": -0.04638407193124294, "rewards/format_reward": 0.8125, "step": 246 }, { "advantage_max": 1.560398355126381, "advantage_mean": 3.849466889693787e-08, "advantage_min": -0.9917672201991081, "advantage_std": 0.9997923448681831, "completion_length": 2328.666748046875, "epoch": 0.2822857142857143, "grad_norm": 0.2871028780937195, "kl": 0.01140594482421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.374054580489873e-07, "loss": 0.0005, "reward": 0.1551234694197774, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1551234694197774, "reward_after_std": 0.6376011576503515, "reward_before_mean": 0.24224435538053513, "reward_before_std": 0.6399637795984745, "reward_change_max": 6.29127025604248e-05, "reward_change_mean": -0.08712084114085883, "reward_change_min": -0.17339404299855232, "reward_change_std": 0.0648341947235167, "reward_std": 0.6376011855900288, "rewards/cosine_scaled_reward": -0.14971117489039898, "rewards/format_reward": 0.5416666753590107, "step": 247 }, { "advantage_max": 1.6397739797830582, "advantage_mean": 3.47693762670076e-08, "advantage_min": -1.076673448085785, "advantage_std": 0.9997544661164284, "completion_length": 1653.187515258789, "epoch": 0.2834285714285714, "grad_norm": 0.2525901794433594, "kl": 0.00815582275390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.343215915635761e-07, "loss": 0.0003, "reward": 0.8820497170090675, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8820497170090675, "reward_after_std": 0.615868715569377, "reward_before_mean": 1.0375536493957043, "reward_before_std": 0.5880707837641239, "reward_change_max": 0.0007249712944030762, "reward_change_mean": -0.15550392726436257, "reward_change_min": -0.22709692269563675, "reward_change_std": 0.09177486295811832, "reward_std": 0.6158687248826027, "rewards/cosine_scaled_reward": 0.1646101539954543, "rewards/format_reward": 0.7083333432674408, "step": 248 }, { "advantage_max": 1.4647027403116226, "advantage_mean": 1.4280280291600889e-08, "advantage_min": -1.1837932839989662, "advantage_std": 0.9998209699988365, "completion_length": 1363.208366394043, "epoch": 0.2845714285714286, "grad_norm": 0.25547295808792114, "kl": 0.0075168609619140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.31233615362752e-07, "loss": 0.0003, "reward": 1.1282103421690408, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 1.1282103421690408, "reward_after_std": 0.6968370862305164, "reward_before_mean": 1.305668581277132, "reward_before_std": 0.6800988968461752, "reward_change_max": 9.535253047943115e-05, "reward_change_mean": -0.1774582201614976, "reward_change_min": -0.27143237367272377, "reward_change_std": 0.10626805946230888, "reward_std": 0.696837093681097, "rewards/cosine_scaled_reward": 0.20491759851574898, "rewards/format_reward": 0.8958333432674408, "step": 249 }, { "advantage_max": 1.692870482802391, "advantage_mean": -2.173086016687975e-08, "advantage_min": -1.0667153745889664, "advantage_std": 0.999844454228878, "completion_length": 1385.1250381469727, "epoch": 0.2857142857142857, "grad_norm": 0.31402283906936646, "kl": 0.01108551025390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.281416799501187e-07, "loss": 0.0004, "reward": 0.6773853991180658, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6773853991180658, "reward_after_std": 0.768508791923523, "reward_before_mean": 0.8056108057498932, "reward_before_std": 0.7412783540785313, "reward_change_max": 0.0, "reward_change_mean": -0.12822541315108538, "reward_change_min": -0.1969393789768219, "reward_change_std": 0.07280616229400039, "reward_std": 0.7685088030993938, "rewards/cosine_scaled_reward": -0.04511125944554806, "rewards/format_reward": 0.8958333432674408, "step": 250 }, { "advantage_max": 1.5190437734127045, "advantage_mean": -5.463759289447978e-08, "advantage_min": -1.0659090280532837, "advantage_std": 0.9997875168919563, "completion_length": 1294.6041870117188, "epoch": 0.28685714285714287, "grad_norm": 0.3337409794330597, "kl": 0.011199951171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.25045936022246e-07, "loss": 0.0004, "reward": 0.6588802421465516, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6588802421465516, "reward_after_std": 0.7135784067213535, "reward_before_mean": 0.7902223952114582, "reward_before_std": 0.7042027357965708, "reward_change_max": 0.0, "reward_change_mean": -0.13134220987558365, "reward_change_min": -0.23708409443497658, "reward_change_std": 0.08308501448482275, "reward_std": 0.7135784365236759, "rewards/cosine_scaled_reward": -0.0528054665774107, "rewards/format_reward": 0.8958333358168602, "step": 251 }, { "advantage_max": 1.5606249049305916, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -1.11652322858572, "advantage_std": 0.9998264908790588, "completion_length": 1679.3125534057617, "epoch": 0.288, "grad_norm": 0.23594596982002258, "kl": 0.009107589721679688, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": 0.5095890890806913, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5095890890806913, "reward_after_std": 0.7146609388291836, "reward_before_mean": 0.626227805390954, "reward_before_std": 0.6994800176471472, "reward_change_max": 0.0003551766276359558, "reward_change_mean": -0.11663870047777891, "reward_change_min": -0.19144857861101627, "reward_change_std": 0.07572575053200126, "reward_std": 0.7146609574556351, "rewards/cosine_scaled_reward": -0.1035527940839529, "rewards/format_reward": 0.8333333469927311, "step": 252 }, { "advantage_max": 1.6786980479955673, "advantage_mean": 2.5766592637310737e-08, "advantage_min": -0.9841240048408508, "advantage_std": 0.9998369589447975, "completion_length": 1716.2500457763672, "epoch": 0.28914285714285715, "grad_norm": 0.27604344487190247, "kl": 0.009691238403320312, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.188436263278172e-07, "loss": 0.0004, "reward": 0.6863379459828138, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6863379459828138, "reward_after_std": 0.8101256862282753, "reward_before_mean": 0.8162229340523481, "reward_before_std": 0.7945797778666019, "reward_change_max": 4.842877388000488e-07, "reward_change_mean": -0.129884981084615, "reward_change_min": -0.22787339333444834, "reward_change_std": 0.08715493371710181, "reward_std": 0.8101256936788559, "rewards/cosine_scaled_reward": -0.008555212989449501, "rewards/format_reward": 0.8333333395421505, "step": 253 }, { "advantage_max": 1.6549670845270157, "advantage_mean": -1.3038516155639002e-08, "advantage_min": -1.0284245312213898, "advantage_std": 0.9998267441987991, "completion_length": 1541.7500381469727, "epoch": 0.29028571428571426, "grad_norm": 0.32521700859069824, "kl": 0.009090423583984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.157373628530852e-07, "loss": 0.0004, "reward": 0.600863391533494, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.600863391533494, "reward_after_std": 0.6789968274533749, "reward_before_mean": 0.7261254452168941, "reward_before_std": 0.6610924564301968, "reward_change_max": 0.0, "reward_change_mean": -0.12526203296147287, "reward_change_min": -0.20569150894880295, "reward_change_std": 0.07578376494348049, "reward_std": 0.6789968274533749, "rewards/cosine_scaled_reward": -0.05360395833849907, "rewards/format_reward": 0.8333333488553762, "step": 254 }, { "advantage_max": 1.5857353210449219, "advantage_mean": -2.4835269396561444e-08, "advantage_min": -0.9731541946530342, "advantage_std": 0.9997934550046921, "completion_length": 2176.4166870117188, "epoch": 0.2914285714285714, "grad_norm": 0.25210484862327576, "kl": 0.009918212890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.126278954320294e-07, "loss": 0.0004, "reward": 0.2443622061982751, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2443622061982751, "reward_after_std": 0.6697471439838409, "reward_before_mean": 0.3375973515212536, "reward_before_std": 0.6620416026562452, "reward_change_max": 5.364418029785156e-06, "reward_change_mean": -0.09323514788411558, "reward_change_min": -0.16571150813251734, "reward_change_std": 0.061854652129113674, "reward_std": 0.6697471439838409, "rewards/cosine_scaled_reward": -0.164534667506814, "rewards/format_reward": 0.6666666679084301, "step": 255 }, { "advantage_max": 1.2646159529685974, "advantage_mean": -1.1175872338675674e-08, "advantage_min": -1.523250088095665, "advantage_std": 0.9998194351792336, "completion_length": 1493.7500534057617, "epoch": 0.2925714285714286, "grad_norm": 0.32058438658714294, "kl": 0.009937286376953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.095153756157051e-07, "loss": 0.0004, "reward": 0.7117386423051357, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7117386423051357, "reward_after_std": 0.7076647616922855, "reward_before_mean": 0.8520172564312816, "reward_before_std": 0.7215098179876804, "reward_change_max": 0.0, "reward_change_mean": -0.14027859549969435, "reward_change_min": -0.24180615320801735, "reward_change_std": 0.09425603970885277, "reward_std": 0.7076647914946079, "rewards/cosine_scaled_reward": -0.011491380631923676, "rewards/format_reward": 0.8750000223517418, "step": 256 }, { "advantage_max": 1.4877362996339798, "advantage_mean": -4.967052102955449e-09, "advantage_min": -1.1290940716862679, "advantage_std": 0.9998562559485435, "completion_length": 2158.229263305664, "epoch": 0.2937142857142857, "grad_norm": 0.264924556016922, "kl": 0.00928497314453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.06399955103937e-07, "loss": 0.0004, "reward": 0.9049882646650076, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9049882646650076, "reward_after_std": 0.8671839684247971, "reward_before_mean": 1.0569295436143875, "reward_before_std": 0.8683264292776585, "reward_change_max": 0.0, "reward_change_mean": -0.15194128267467022, "reward_change_min": -0.26315687224268913, "reward_change_std": 0.10091545199975371, "reward_std": 0.8671840019524097, "rewards/cosine_scaled_reward": 0.15346478174615186, "rewards/format_reward": 0.7500000055879354, "step": 257 }, { "advantage_max": 1.61009082198143, "advantage_mean": -3.9736431700632124e-08, "advantage_min": -1.1207184195518494, "advantage_std": 0.9997821226716042, "completion_length": 1955.9375534057617, "epoch": 0.2948571428571429, "grad_norm": 0.2173985093832016, "kl": 0.00830078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.032817857379256e-07, "loss": 0.0003, "reward": 0.3813736569136381, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3813736569136381, "reward_after_std": 0.7218630239367485, "reward_before_mean": 0.4860916808247566, "reward_before_std": 0.7180778924375772, "reward_change_max": 0.00016684085130691528, "reward_change_mean": -0.10471805580891669, "reward_change_min": -0.18698874861001968, "reward_change_std": 0.07169946609064937, "reward_std": 0.7218630462884903, "rewards/cosine_scaled_reward": -0.10070415772497654, "rewards/format_reward": 0.6875000074505806, "step": 258 }, { "advantage_max": 1.4760468155145645, "advantage_mean": -7.078051778020011e-08, "advantage_min": -1.27405995875597, "advantage_std": 0.9997367337346077, "completion_length": 1573.7916984558105, "epoch": 0.296, "grad_norm": 0.29860401153564453, "kl": 0.010494232177734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.001610194928464e-07, "loss": 0.0004, "reward": 0.7439992446452379, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7439992446452379, "reward_after_std": 0.6591152455657721, "reward_before_mean": 0.8863520976155996, "reward_before_std": 0.6521930769085884, "reward_change_max": 0.0, "reward_change_mean": -0.14235285948961973, "reward_change_min": -0.22541704028844833, "reward_change_std": 0.09098386578261852, "reward_std": 0.659115256741643, "rewards/cosine_scaled_reward": 0.016092704609036446, "rewards/format_reward": 0.854166679084301, "step": 259 }, { "advantage_max": 1.50406713783741, "advantage_mean": -6.022552734297193e-08, "advantage_min": -1.1367171704769135, "advantage_std": 0.999772198498249, "completion_length": 1268.145851135254, "epoch": 0.29714285714285715, "grad_norm": 0.3464038372039795, "kl": 0.009735107421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.97037808470444e-07, "loss": 0.0004, "reward": 0.9734570910222828, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9734570910222828, "reward_after_std": 0.6558955330401659, "reward_before_mean": 1.1375967441126704, "reward_before_std": 0.6387905618175864, "reward_change_max": 0.0003156587481498718, "reward_change_mean": -0.16413968708366156, "reward_change_min": -0.26056714355945587, "reward_change_std": 0.10325025115162134, "reward_std": 0.6558955684304237, "rewards/cosine_scaled_reward": 0.1625483650714159, "rewards/format_reward": 0.8125000111758709, "step": 260 }, { "advantage_max": 1.6002927124500275, "advantage_mean": 6.208817460162663e-09, "advantage_min": -1.0500682145357132, "advantage_std": 0.9997867494821548, "completion_length": 2184.229179382324, "epoch": 0.29828571428571427, "grad_norm": 0.20467397570610046, "kl": 0.009990692138671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.939123048916173e-07, "loss": 0.0004, "reward": 0.2399905025959015, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2399905025959015, "reward_after_std": 0.6328884586691856, "reward_before_mean": 0.3335062563419342, "reward_before_std": 0.6211761645972729, "reward_change_max": 7.826089859008789e-05, "reward_change_mean": -0.09351573511958122, "reward_change_min": -0.16378842666745186, "reward_change_std": 0.060065632220357656, "reward_std": 0.6328884772956371, "rewards/cosine_scaled_reward": -0.1457468868829892, "rewards/format_reward": 0.6250000055879354, "step": 261 }, { "advantage_max": 1.3524987325072289, "advantage_mean": -1.6453366669111347e-08, "advantage_min": -1.279630459845066, "advantage_std": 0.9997990727424622, "completion_length": 1722.2500686645508, "epoch": 0.29942857142857143, "grad_norm": 0.32207873463630676, "kl": 0.012050628662109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.907846610890011e-07, "loss": 0.0005, "reward": 0.26394235249608755, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.26394235249608755, "reward_after_std": 0.6132156141102314, "reward_before_mean": 0.3629047591239214, "reward_before_std": 0.6177421398460865, "reward_change_max": 0.0011852085590362549, "reward_change_mean": -0.09896239778026938, "reward_change_min": -0.176782650873065, "reward_change_std": 0.06817787745967507, "reward_std": 0.6132156550884247, "rewards/cosine_scaled_reward": -0.18313096463680267, "rewards/format_reward": 0.7291666734963655, "step": 262 }, { "advantage_max": 1.4976128190755844, "advantage_mean": -1.8626453490711015e-08, "advantage_min": -1.0512003675103188, "advantage_std": 0.9997568875551224, "completion_length": 1497.9167175292969, "epoch": 0.30057142857142854, "grad_norm": 0.2446172684431076, "kl": 0.006046295166015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.87655029499542e-07, "loss": 0.0002, "reward": 0.45462223142385483, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.45462223142385483, "reward_after_std": 0.5269899610430002, "reward_before_mean": 0.5730956010520458, "reward_before_std": 0.5175444334745407, "reward_change_max": 0.0, "reward_change_mean": -0.11847336497157812, "reward_change_min": -0.20368537306785583, "reward_change_std": 0.07308742869645357, "reward_std": 0.5269899740815163, "rewards/cosine_scaled_reward": -0.1926188673824072, "rewards/format_reward": 0.9583333358168602, "step": 263 }, { "advantage_max": 1.5773601084947586, "advantage_mean": -1.4901161971003773e-08, "advantage_min": -1.1340029016137123, "advantage_std": 0.9998199939727783, "completion_length": 1489.395881652832, "epoch": 0.3017142857142857, "grad_norm": 0.24546962976455688, "kl": 0.00830841064453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.845235626570683e-07, "loss": 0.0003, "reward": 0.43727186508476734, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.43727186508476734, "reward_after_std": 0.715681679546833, "reward_before_mean": 0.5463062226772308, "reward_before_std": 0.7034594938158989, "reward_change_max": 0.0, "reward_change_mean": -0.10903437249362469, "reward_change_min": -0.18007494881749153, "reward_change_std": 0.06830112263560295, "reward_std": 0.7156816907227039, "rewards/cosine_scaled_reward": -0.15393022983334959, "rewards/format_reward": 0.8541666772216558, "step": 264 }, { "advantage_max": 1.5012609884142876, "advantage_mean": -4.967053435223079e-09, "advantage_min": -1.0871346518397331, "advantage_std": 0.9997839033603668, "completion_length": 1440.5833892822266, "epoch": 0.3028571428571429, "grad_norm": 0.25864601135253906, "kl": 0.008214950561523438, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.813904131848564e-07, "loss": 0.0003, "reward": 0.7960514797596261, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7960514797596261, "reward_after_std": 0.747637789696455, "reward_before_mean": 0.9413503212854266, "reward_before_std": 0.7453129384666681, "reward_change_max": 0.0, "reward_change_mean": -0.14529882464557886, "reward_change_min": -0.24091583769768476, "reward_change_std": 0.09140054974704981, "reward_std": 0.7476377971470356, "rewards/cosine_scaled_reward": 0.0019251517951488495, "rewards/format_reward": 0.9375000074505806, "step": 265 }, { "advantage_max": 1.3270168602466583, "advantage_mean": -2.6697914656814703e-08, "advantage_min": -1.3128254860639572, "advantage_std": 0.9998119845986366, "completion_length": 1821.958381652832, "epoch": 0.304, "grad_norm": 0.2732686698436737, "kl": 0.01064300537109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.78255733788191e-07, "loss": 0.0004, "reward": 0.5604800856672227, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5604800856672227, "reward_after_std": 0.672345083206892, "reward_before_mean": 0.6874868590384722, "reward_before_std": 0.6820431463420391, "reward_change_max": 0.0002983957529067993, "reward_change_mean": -0.12700679013505578, "reward_change_min": -0.21464761439710855, "reward_change_std": 0.08562309807166457, "reward_std": 0.6723450906574726, "rewards/cosine_scaled_reward": -0.05208991654217243, "rewards/format_reward": 0.7916666772216558, "step": 266 }, { "advantage_max": 1.5514462441205978, "advantage_mean": -4.967053990334591e-09, "advantage_min": -1.1182539835572243, "advantage_std": 0.999804675579071, "completion_length": 2261.3958892822266, "epoch": 0.30514285714285716, "grad_norm": 0.25328734517097473, "kl": 0.013637542724609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.751196772469237e-07, "loss": 0.0005, "reward": 0.20815421640872955, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20815421640872955, "reward_after_std": 0.6557923518121243, "reward_before_mean": 0.29744547605514526, "reward_before_std": 0.6443983167409897, "reward_change_max": 0.00010949373245239258, "reward_change_mean": -0.08929126942530274, "reward_change_min": -0.15922965481877327, "reward_change_std": 0.057676469441503286, "reward_std": 0.655792374163866, "rewards/cosine_scaled_reward": -0.17419393052114174, "rewards/format_reward": 0.6458333432674408, "step": 267 }, { "advantage_max": 1.5531953871250153, "advantage_mean": -1.9868215850316062e-08, "advantage_min": -0.9885692149400711, "advantage_std": 0.9998269900679588, "completion_length": 1502.3542022705078, "epoch": 0.3062857142857143, "grad_norm": 0.3057520091533661, "kl": 0.014217376708984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.71982396408026e-07, "loss": 0.0006, "reward": 0.5444807633757591, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5444807633757591, "reward_after_std": 0.8374427780508995, "reward_before_mean": 0.6610086187720299, "reward_before_std": 0.8377688638865948, "reward_change_max": 0.00028542429208755493, "reward_change_mean": -0.11652784794569016, "reward_change_min": -0.23292355239391327, "reward_change_std": 0.0859985020942986, "reward_std": 0.837442796677351, "rewards/cosine_scaled_reward": -0.09657904086634517, "rewards/format_reward": 0.8541666753590107, "step": 268 }, { "advantage_max": 1.3833930268883705, "advantage_mean": 2.980232349791834e-08, "advantage_min": -1.3209297060966492, "advantage_std": 0.9998177289962769, "completion_length": 1695.3750457763672, "epoch": 0.30742857142857144, "grad_norm": 0.2443539798259735, "kl": 0.00933074951171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.688440441781398e-07, "loss": 0.0004, "reward": 0.4807785237208009, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4807785237208009, "reward_after_std": 0.6509964242577553, "reward_before_mean": 0.5990621582604945, "reward_before_std": 0.6480329409241676, "reward_change_max": 0.0, "reward_change_mean": -0.11828359961509705, "reward_change_min": -0.2115028277039528, "reward_change_std": 0.08078650198876858, "reward_std": 0.6509964615106583, "rewards/cosine_scaled_reward": -0.07546893320977688, "rewards/format_reward": 0.7500000149011612, "step": 269 }, { "advantage_max": 1.6883545815944672, "advantage_mean": -1.9557773400791234e-08, "advantage_min": -0.9836084470152855, "advantage_std": 0.999851755797863, "completion_length": 1690.3333892822266, "epoch": 0.30857142857142855, "grad_norm": 0.21359197795391083, "kl": 0.009889602661132812, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.657047735161255e-07, "loss": 0.0004, "reward": 0.8308562897145748, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8308562897145748, "reward_after_std": 0.8897613398730755, "reward_before_mean": 0.9719878695905209, "reward_before_std": 0.8724946435540915, "reward_change_max": 0.0002977624535560608, "reward_change_mean": -0.1411315887235105, "reward_change_min": -0.23690782114863396, "reward_change_std": 0.09405350359156728, "reward_std": 0.8897613659501076, "rewards/cosine_scaled_reward": 0.04849394381744787, "rewards/format_reward": 0.8750000074505806, "step": 270 }, { "advantage_max": 1.497256375849247, "advantage_mean": -1.2728075704515618e-07, "advantage_min": -1.2109168618917465, "advantage_std": 0.999859169125557, "completion_length": 1477.8958740234375, "epoch": 0.3097142857142857, "grad_norm": 0.5373415350914001, "kl": 0.014141082763671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.625647374256061e-07, "loss": 0.0006, "reward": 1.2130113132297993, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.2130113132297993, "reward_after_std": 0.9313858449459076, "reward_before_mean": 1.3908941932022572, "reward_before_std": 0.9274521321058273, "reward_change_max": 0.0, "reward_change_mean": -0.1778829018585384, "reward_change_min": -0.2849385794252157, "reward_change_std": 0.11208993848413229, "reward_std": 0.9313858933746815, "rewards/cosine_scaled_reward": 0.268363754323218, "rewards/format_reward": 0.8541666828095913, "step": 271 }, { "advantage_max": 1.5334807932376862, "advantage_mean": -2.3903947654613233e-08, "advantage_min": -1.0556566417217255, "advantage_std": 0.9998181089758873, "completion_length": 1630.1042022705078, "epoch": 0.31085714285714283, "grad_norm": 0.2611698508262634, "kl": 0.0101165771484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.594240889475106e-07, "loss": 0.0004, "reward": 0.5528686475008726, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5528686475008726, "reward_after_std": 0.7255290150642395, "reward_before_mean": 0.6753423325717449, "reward_before_std": 0.7237700335681438, "reward_change_max": 0.00016976892948150635, "reward_change_mean": -0.12247373699210584, "reward_change_min": -0.2163546048104763, "reward_change_std": 0.08112134877592325, "reward_std": 0.7255290448665619, "rewards/cosine_scaled_reward": -0.08941216330276802, "rewards/format_reward": 0.8541666716337204, "step": 272 }, { "advantage_max": 1.5685235261917114, "advantage_mean": -2.142041988228982e-08, "advantage_min": -1.220929853618145, "advantage_std": 0.9997903630137444, "completion_length": 1454.270896911621, "epoch": 0.312, "grad_norm": 0.2637239694595337, "kl": 0.01152801513671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.562829811526154e-07, "loss": 0.0005, "reward": 0.7724597938358784, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7724597938358784, "reward_after_std": 0.6019731983542442, "reward_before_mean": 0.917079396545887, "reward_before_std": 0.5814417097717524, "reward_change_max": 0.00026979297399520874, "reward_change_mean": -0.1446195626631379, "reward_change_min": -0.21904787234961987, "reward_change_std": 0.08566631795838475, "reward_std": 0.601973220705986, "rewards/cosine_scaled_reward": 0.03145633079111576, "rewards/format_reward": 0.8541666753590107, "step": 273 }, { "advantage_max": 1.531281739473343, "advantage_mean": -5.4637593560613595e-08, "advantage_min": -0.9971391409635544, "advantage_std": 0.9998548924922943, "completion_length": 1111.5833587646484, "epoch": 0.31314285714285717, "grad_norm": 0.3014761507511139, "kl": 0.011322021484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.531415671340826e-07, "loss": 0.0005, "reward": 1.0097886063158512, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.0097886063158512, "reward_after_std": 0.8508143164217472, "reward_before_mean": 1.169247966259718, "reward_before_std": 0.8384442552924156, "reward_change_max": 0.0, "reward_change_mean": -0.1594593795016408, "reward_change_min": -0.2814331278204918, "reward_change_std": 0.09933338640257716, "reward_std": 0.8508143350481987, "rewards/cosine_scaled_reward": 0.09504064894281328, "rewards/format_reward": 0.9791666716337204, "step": 274 }, { "advantage_max": 1.46129010617733, "advantage_mean": 3.725290853573426e-09, "advantage_min": -1.2289179787039757, "advantage_std": 0.9998204484581947, "completion_length": 1571.020866394043, "epoch": 0.3142857142857143, "grad_norm": 0.18821187317371368, "kl": 0.00789642333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.5e-07, "loss": 0.0003, "reward": 1.1050619557499886, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.1050619557499886, "reward_after_std": 0.6740875914692879, "reward_before_mean": 1.2809049189090729, "reward_before_std": 0.6542658880352974, "reward_change_max": 0.0001346990466117859, "reward_change_mean": -0.1758428937755525, "reward_change_min": -0.2756085656583309, "reward_change_std": 0.10608048643916845, "reward_std": 0.6740876249969006, "rewards/cosine_scaled_reward": 0.2342024319805205, "rewards/format_reward": 0.8125000149011612, "step": 275 }, { "advantage_max": 1.4276919662952423, "advantage_mean": -9.002785084089027e-08, "advantage_min": -1.2498956099152565, "advantage_std": 0.9998070895671844, "completion_length": 1362.5208740234375, "epoch": 0.31542857142857145, "grad_norm": 0.2566883862018585, "kl": 0.01116180419921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.468584328659172e-07, "loss": 0.0004, "reward": 0.8555725496262312, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8555725496262312, "reward_after_std": 0.6251602806150913, "reward_before_mean": 1.0092867035418749, "reward_before_std": 0.6183755993843079, "reward_change_max": 0.0001868903636932373, "reward_change_mean": -0.15371418604627252, "reward_change_min": -0.24152507819235325, "reward_change_std": 0.09109621308743954, "reward_std": 0.6251602955162525, "rewards/cosine_scaled_reward": 0.0671433275565505, "rewards/format_reward": 0.8750000055879354, "step": 276 }, { "advantage_max": 1.3485763520002365, "advantage_mean": -4.47034849138106e-08, "advantage_min": -1.2358265295624733, "advantage_std": 0.9997855946421623, "completion_length": 1636.8750381469727, "epoch": 0.31657142857142856, "grad_norm": 0.38714054226875305, "kl": 0.015163421630859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.437170188473847e-07, "loss": 0.0006, "reward": 0.8174463622272015, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8174463622272015, "reward_after_std": 0.5525830537080765, "reward_before_mean": 0.969972088932991, "reward_before_std": 0.5424490142613649, "reward_change_max": 0.0, "reward_change_mean": -0.1525257029570639, "reward_change_min": -0.228757094591856, "reward_change_std": 0.08871490228921175, "reward_std": 0.5525830574333668, "rewards/cosine_scaled_reward": 0.08915269374847412, "rewards/format_reward": 0.7916666716337204, "step": 277 }, { "advantage_max": 1.5771578699350357, "advantage_mean": -5.836288130556255e-08, "advantage_min": -1.211281694471836, "advantage_std": 0.9996704533696175, "completion_length": 1442.1042022705078, "epoch": 0.3177142857142857, "grad_norm": 0.33007821440696716, "kl": 0.009634017944335938, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.405759110524894e-07, "loss": 0.0004, "reward": 0.8426887975074351, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8426887975074351, "reward_after_std": 0.4258726928383112, "reward_before_mean": 0.9998213611543179, "reward_before_std": 0.3926746714860201, "reward_change_max": 0.0, "reward_change_mean": -0.15713255293667316, "reward_change_min": -0.2242979882284999, "reward_change_std": 0.08671380672603846, "reward_std": 0.42587270215153694, "rewards/cosine_scaled_reward": 0.07282732427120209, "rewards/format_reward": 0.8541666679084301, "step": 278 }, { "advantage_max": 1.3795539736747742, "advantage_mean": -6.643434363740042e-08, "advantage_min": -1.1235552951693535, "advantage_std": 0.9998089745640755, "completion_length": 1575.3333892822266, "epoch": 0.31885714285714284, "grad_norm": 0.28037822246551514, "kl": 0.0097198486328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.37435262574394e-07, "loss": 0.0004, "reward": 0.6290068812668324, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.6290068812668324, "reward_after_std": 0.6324650943279266, "reward_before_mean": 0.7617004364728928, "reward_before_std": 0.6272139437496662, "reward_change_max": 0.0, "reward_change_mean": -0.13269356172531843, "reward_change_min": -0.22322656959295273, "reward_change_std": 0.08410935197025537, "reward_std": 0.6324650980532169, "rewards/cosine_scaled_reward": -0.06706646271049976, "rewards/format_reward": 0.8958333507180214, "step": 279 }, { "advantage_max": 1.4074894785881042, "advantage_mean": -6.332993707225398e-08, "advantage_min": -1.261429451406002, "advantage_std": 0.9998380243778229, "completion_length": 1622.937515258789, "epoch": 0.32, "grad_norm": 0.29624009132385254, "kl": 0.011272430419921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.342952264838747e-07, "loss": 0.0005, "reward": 1.2476790957152843, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.2476790957152843, "reward_after_std": 0.8106604292988777, "reward_before_mean": 1.434355091303587, "reward_before_std": 0.810054725036025, "reward_change_max": 0.0, "reward_change_mean": -0.18667601607739925, "reward_change_min": -0.30536388978362083, "reward_change_std": 0.12097588926553726, "reward_std": 0.8106604591012001, "rewards/cosine_scaled_reward": 0.2796775340102613, "rewards/format_reward": 0.875, "step": 280 }, { "advantage_max": 1.5743074342608452, "advantage_mean": 7.450580374879223e-09, "advantage_min": -1.2013270854949951, "advantage_std": 0.9998085796833038, "completion_length": 2353.2708587646484, "epoch": 0.3211428571428571, "grad_norm": 0.24173757433891296, "kl": 0.01459503173828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.311559558218603e-07, "loss": 0.0006, "reward": 0.36519142519682646, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36519142519682646, "reward_after_std": 0.7732009813189507, "reward_before_mean": 0.4665646404027939, "reward_before_std": 0.7668895106762648, "reward_change_max": 9.001791477203369e-05, "reward_change_mean": -0.10137320728972554, "reward_change_min": -0.18519249744713306, "reward_change_std": 0.07093417691066861, "reward_std": 0.7732009924948215, "rewards/cosine_scaled_reward": -0.047967685444746166, "rewards/format_reward": 0.5625000018626451, "step": 281 }, { "advantage_max": 1.4620432406663895, "advantage_mean": -2.0489098639941972e-08, "advantage_min": -1.1684705764055252, "advantage_std": 0.9997751787304878, "completion_length": 1570.333396911621, "epoch": 0.3222857142857143, "grad_norm": 0.25614941120147705, "kl": 0.01031494140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.28017603591974e-07, "loss": 0.0004, "reward": 0.73689816147089, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.73689816147089, "reward_after_std": 0.5660558789968491, "reward_before_mean": 0.8810554593801498, "reward_before_std": 0.5520604215562344, "reward_change_max": 0.0, "reward_change_mean": -0.14415732212364674, "reward_change_min": -0.23753152042627335, "reward_change_std": 0.08675737353041768, "reward_std": 0.566055903211236, "rewards/cosine_scaled_reward": 0.02386106736958027, "rewards/format_reward": 0.8333333414047956, "step": 282 }, { "advantage_max": 1.3014950826764107, "advantage_mean": -3.756334410187634e-08, "advantage_min": -1.3910346552729607, "advantage_std": 0.9998150020837784, "completion_length": 2028.458366394043, "epoch": 0.32342857142857145, "grad_norm": 0.30256274342536926, "kl": 0.01206207275390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.248803227530763e-07, "loss": 0.0005, "reward": 0.7930786944925785, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7930786944925785, "reward_after_std": 0.7125616930425167, "reward_before_mean": 0.9397026058286428, "reward_before_std": 0.7213702015578747, "reward_change_max": 0.0005670338869094849, "reward_change_mean": -0.14662390458397567, "reward_change_min": -0.24198182485997677, "reward_change_std": 0.09706047433428466, "reward_std": 0.7125617042183876, "rewards/cosine_scaled_reward": 0.09485127124935389, "rewards/format_reward": 0.7500000093132257, "step": 283 }, { "advantage_max": 1.5812103599309921, "advantage_mean": -1.490116185998147e-08, "advantage_min": -1.0278535932302475, "advantage_std": 0.9998149424791336, "completion_length": 1272.0417098999023, "epoch": 0.32457142857142857, "grad_norm": 0.2702529728412628, "kl": 0.01029205322265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.21744266211809e-07, "loss": 0.0004, "reward": 0.47102506645023823, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.47102506645023823, "reward_after_std": 0.6693096347153187, "reward_before_mean": 0.5846154130995274, "reward_before_std": 0.6555521786212921, "reward_change_max": 0.0, "reward_change_mean": -0.11359035596251488, "reward_change_min": -0.20026837661862373, "reward_change_std": 0.07205808768048882, "reward_std": 0.6693096496164799, "rewards/cosine_scaled_reward": -0.13477563112974167, "rewards/format_reward": 0.8541666716337204, "step": 284 }, { "advantage_max": 1.6508907973766327, "advantage_mean": -3.228585088166369e-08, "advantage_min": -1.0195088982582092, "advantage_std": 0.9997524991631508, "completion_length": 1204.8750305175781, "epoch": 0.32571428571428573, "grad_norm": 0.31728941202163696, "kl": 0.01241302490234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.186095868151436e-07, "loss": 0.0005, "reward": 0.7143117673695087, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7143117673695087, "reward_after_std": 0.6237810291349888, "reward_before_mean": 0.8525383183732629, "reward_before_std": 0.5971423909068108, "reward_change_max": 0.0, "reward_change_mean": -0.1382265416905284, "reward_change_min": -0.21934391558170319, "reward_change_std": 0.08337379200384021, "reward_std": 0.6237810496240854, "rewards/cosine_scaled_reward": -0.03206418454647064, "rewards/format_reward": 0.916666679084301, "step": 285 }, { "advantage_max": 1.6236571073532104, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.982652448117733, "advantage_std": 0.9998085200786591, "completion_length": 1341.4792022705078, "epoch": 0.32685714285714285, "grad_norm": 0.3013690710067749, "kl": 0.01345062255859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.154764373429315e-07, "loss": 0.0005, "reward": 0.5443090852349997, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.5443090852349997, "reward_after_std": 0.6821228787302971, "reward_before_mean": 0.6657579094171524, "reward_before_std": 0.6722039449959993, "reward_change_max": 0.0, "reward_change_mean": -0.12144879251718521, "reward_change_min": -0.21625848673284054, "reward_change_std": 0.07772636087611318, "reward_std": 0.682122889906168, "rewards/cosine_scaled_reward": -0.13587106950581074, "rewards/format_reward": 0.9375000074505806, "step": 286 }, { "advantage_max": 1.6009186208248138, "advantage_mean": 2.6077033421501028e-08, "advantage_min": -1.0167246609926224, "advantage_std": 0.9997030347585678, "completion_length": 1400.854175567627, "epoch": 0.328, "grad_norm": 0.333176851272583, "kl": 0.0135955810546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.123449705004581e-07, "loss": 0.0005, "reward": 0.6055421698838472, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6055421698838472, "reward_after_std": 0.6879148874431849, "reward_before_mean": 0.7314797258004546, "reward_before_std": 0.6727271899580956, "reward_change_max": 3.1247735023498535e-05, "reward_change_mean": -0.1259375517256558, "reward_change_min": -0.21384302619844675, "reward_change_std": 0.07986640278249979, "reward_std": 0.6879148911684752, "rewards/cosine_scaled_reward": 0.01157319126650691, "rewards/format_reward": 0.7083333358168602, "step": 287 }, { "advantage_max": 1.5019276440143585, "advantage_mean": -4.2530398758344745e-08, "advantage_min": -1.243957407772541, "advantage_std": 0.9997709915041924, "completion_length": 1464.291732788086, "epoch": 0.3291428571428571, "grad_norm": 0.24288351833820343, "kl": 0.00968170166015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.09215338910999e-07, "loss": 0.0004, "reward": 0.5588183682411909, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5588183682411909, "reward_after_std": 0.5841691084206104, "reward_before_mean": 0.6852227933704853, "reward_before_std": 0.5734246857464314, "reward_change_max": 0.0, "reward_change_mean": -0.12640444561839104, "reward_change_min": -0.19520024210214615, "reward_change_std": 0.07665029587224126, "reward_std": 0.584169115871191, "rewards/cosine_scaled_reward": -0.12613860587589443, "rewards/format_reward": 0.9375000074505806, "step": 288 }, { "advantage_max": 1.6838382929563522, "advantage_mean": -5.774200317887335e-08, "advantage_min": -1.0431829616427422, "advantage_std": 0.9997774288058281, "completion_length": 1470.9792137145996, "epoch": 0.3302857142857143, "grad_norm": 0.4302460253238678, "kl": 0.015533447265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.060876951083828e-07, "loss": 0.0006, "reward": 0.6976730767637491, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6976730767637491, "reward_after_std": 0.5832541156560183, "reward_before_mean": 0.8337808940559626, "reward_before_std": 0.5487283486872911, "reward_change_max": 0.0005718618631362915, "reward_change_mean": -0.13610781356692314, "reward_change_min": -0.2004237212240696, "reward_change_std": 0.07571555068716407, "reward_std": 0.5832541491836309, "rewards/cosine_scaled_reward": 0.00022377073764801025, "rewards/format_reward": 0.8333333414047956, "step": 289 }, { "advantage_max": 1.4299821257591248, "advantage_mean": 1.7384689798838338e-08, "advantage_min": -1.2245251014828682, "advantage_std": 0.999837800860405, "completion_length": 1135.0625457763672, "epoch": 0.3314285714285714, "grad_norm": 0.29410240054130554, "kl": 0.011016845703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.02962191529556e-07, "loss": 0.0004, "reward": 1.0313334502279758, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0313334502279758, "reward_after_std": 0.8016556054353714, "reward_before_mean": 1.1964529231190681, "reward_before_std": 0.7968671545386314, "reward_change_max": 0.0, "reward_change_mean": -0.16511950362473726, "reward_change_min": -0.2797320708632469, "reward_change_std": 0.10460527054965496, "reward_std": 0.801655612885952, "rewards/cosine_scaled_reward": 0.12947646714746952, "rewards/format_reward": 0.9375, "step": 290 }, { "advantage_max": 1.6369030177593231, "advantage_mean": -5.494803323458086e-08, "advantage_min": -1.0549881234765053, "advantage_std": 0.9998100474476814, "completion_length": 1349.6041870117188, "epoch": 0.3325714285714286, "grad_norm": 0.2570025324821472, "kl": 0.0112152099609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.998389805071536e-07, "loss": 0.0004, "reward": 0.8352435231208801, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8352435231208801, "reward_after_std": 0.7465978749096394, "reward_before_mean": 0.980657309293747, "reward_before_std": 0.7230622582137585, "reward_change_max": 0.0, "reward_change_mean": -0.14541381038725376, "reward_change_min": -0.23240702971816063, "reward_change_std": 0.08540754904970527, "reward_std": 0.7465978935360909, "rewards/cosine_scaled_reward": 0.021578645333647728, "rewards/format_reward": 0.9375, "step": 291 }, { "advantage_max": 1.4761302471160889, "advantage_mean": 9.934107703113426e-09, "advantage_min": -1.1501412615180016, "advantage_std": 0.9998109415173531, "completion_length": 1796.5000457763672, "epoch": 0.33371428571428574, "grad_norm": 0.3171732425689697, "kl": 0.015542984008789062, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.967182142620745e-07, "loss": 0.0006, "reward": 0.47690540738403797, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.47690540738403797, "reward_after_std": 0.8478581756353378, "reward_before_mean": 0.5877604689449072, "reward_before_std": 0.8521979209035635, "reward_change_max": 0.0, "reward_change_mean": -0.11085503036156297, "reward_change_min": -0.20827853865921497, "reward_change_std": 0.08406012458726764, "reward_std": 0.8478581979870796, "rewards/cosine_scaled_reward": -0.10195311531424522, "rewards/format_reward": 0.7916666716337204, "step": 292 }, { "advantage_max": 1.2476003393530846, "advantage_mean": -3.601114129114791e-08, "advantage_min": -1.526408739387989, "advantage_std": 0.9998006299138069, "completion_length": 1279.9167098999023, "epoch": 0.33485714285714285, "grad_norm": 0.35458728671073914, "kl": 0.0106201171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.93600044896063e-07, "loss": 0.0004, "reward": 0.6448268890380859, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6448268890380859, "reward_after_std": 0.5995666980743408, "reward_before_mean": 0.7823301900643855, "reward_before_std": 0.6108702011406422, "reward_change_max": 0.0, "reward_change_mean": -0.13750330917537212, "reward_change_min": -0.21467959508299828, "reward_change_std": 0.08796036243438721, "reward_std": 0.599566712975502, "rewards/cosine_scaled_reward": -0.06716826558113098, "rewards/format_reward": 0.916666679084301, "step": 293 }, { "advantage_max": 1.4925710558891296, "advantage_mean": 4.035731804297171e-09, "advantage_min": -1.0893485471606255, "advantage_std": 0.9998233541846275, "completion_length": 2047.1875457763672, "epoch": 0.336, "grad_norm": 0.31637120246887207, "kl": 0.017940521240234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.904846243842949e-07, "loss": 0.0007, "reward": 0.6296005487674847, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6296005487674847, "reward_after_std": 0.735579501837492, "reward_before_mean": 0.7591220289468765, "reward_before_std": 0.7338115274906158, "reward_change_max": 3.84598970413208e-05, "reward_change_mean": -0.1295214667916298, "reward_change_min": -0.23670313879847527, "reward_change_std": 0.08958509005606174, "reward_std": 0.7355795204639435, "rewards/cosine_scaled_reward": 0.01497767074033618, "rewards/format_reward": 0.729166679084301, "step": 294 }, { "advantage_max": 1.6160722076892853, "advantage_mean": -1.8626462594539817e-09, "advantage_min": -1.1032218933105469, "advantage_std": 0.999823197722435, "completion_length": 1671.166732788086, "epoch": 0.33714285714285713, "grad_norm": 0.31535738706588745, "kl": 0.015350341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.873721045679706e-07, "loss": 0.0006, "reward": 0.7431380706839263, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7431380706839263, "reward_after_std": 0.6490283533930779, "reward_before_mean": 0.883342670276761, "reward_before_std": 0.6273828744888306, "reward_change_max": 0.0, "reward_change_mean": -0.14020460075698793, "reward_change_min": -0.22164242342114449, "reward_change_std": 0.08250434487126768, "reward_std": 0.6490283794701099, "rewards/cosine_scaled_reward": 0.06667132629081607, "rewards/format_reward": 0.7500000093132257, "step": 295 }, { "advantage_max": 1.403956413269043, "advantage_mean": -4.656612906384083e-08, "advantage_min": -1.2638613507151604, "advantage_std": 0.9998411238193512, "completion_length": 1765.3125610351562, "epoch": 0.3382857142857143, "grad_norm": 0.2900753319263458, "kl": 0.014621734619140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.842626371469149e-07, "loss": 0.0006, "reward": 0.6206641308963299, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6206641308963299, "reward_after_std": 0.7382680289447308, "reward_before_mean": 0.7495931871235371, "reward_before_std": 0.7414855919778347, "reward_change_max": 0.0, "reward_change_mean": -0.12892907345667481, "reward_change_min": -0.22145743668079376, "reward_change_std": 0.08637001179158688, "reward_std": 0.7382680363953114, "rewards/cosine_scaled_reward": -0.04187007714062929, "rewards/format_reward": 0.8333333432674408, "step": 296 }, { "advantage_max": 1.5119258910417557, "advantage_mean": -4.2219957530065244e-08, "advantage_min": -1.21080182492733, "advantage_std": 0.9998287558555603, "completion_length": 1928.7708892822266, "epoch": 0.3394285714285714, "grad_norm": 0.2618260085582733, "kl": 0.01506805419921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.811563736721829e-07, "loss": 0.0006, "reward": 0.43504673708230257, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.43504673708230257, "reward_after_std": 0.7578357718884945, "reward_before_mean": 0.5440489100292325, "reward_before_std": 0.7604426443576813, "reward_change_max": 0.0001793503761291504, "reward_change_mean": -0.10900220740586519, "reward_change_min": -0.1914794910699129, "reward_change_std": 0.07616990571841598, "reward_std": 0.7578357979655266, "rewards/cosine_scaled_reward": -0.06130887754261494, "rewards/format_reward": 0.6666666772216558, "step": 297 }, { "advantage_max": 1.5896563529968262, "advantage_mean": -7.761025155872403e-10, "advantage_min": -1.1579310297966003, "advantage_std": 0.999824695289135, "completion_length": 1441.7083740234375, "epoch": 0.3405714285714286, "grad_norm": 0.24601784348487854, "kl": 0.01018524169921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.780534655386743e-07, "loss": 0.0004, "reward": 0.7649918240495026, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7649918240495026, "reward_after_std": 0.7487938515841961, "reward_before_mean": 0.904022048576735, "reward_before_std": 0.7324915304780006, "reward_change_max": 0.0, "reward_change_mean": -0.13903021812438965, "reward_change_min": -0.23804667592048645, "reward_change_std": 0.08871197002008557, "reward_std": 0.7487938888370991, "rewards/cosine_scaled_reward": 0.024927678401581943, "rewards/format_reward": 0.8541666716337204, "step": 298 }, { "advantage_max": 1.439231514930725, "advantage_mean": 2.9802322498717615e-08, "advantage_min": -1.1201618686318398, "advantage_std": 0.9998530372977257, "completion_length": 1864.520881652832, "epoch": 0.3417142857142857, "grad_norm": 0.4852619767189026, "kl": 0.019191741943359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.749540639777539e-07, "loss": 0.0008, "reward": 0.5580803826451302, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5580803826451302, "reward_after_std": 0.8004944212734699, "reward_before_mean": 0.6778634660877287, "reward_before_std": 0.802312109619379, "reward_change_max": 0.0004041343927383423, "reward_change_mean": -0.1197830568999052, "reward_change_min": -0.22889439016580582, "reward_change_std": 0.08444441203027964, "reward_std": 0.8004944249987602, "rewards/cosine_scaled_reward": -0.05690161604434252, "rewards/format_reward": 0.7916666716337204, "step": 299 }, { "advantage_max": 1.560862809419632, "advantage_mean": -6.208817904251873e-09, "advantage_min": -1.2231401279568672, "advantage_std": 0.9997982382774353, "completion_length": 2020.458366394043, "epoch": 0.34285714285714286, "grad_norm": 0.38628125190734863, "kl": 0.020538330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.7185832004988133e-07, "loss": 0.0008, "reward": 0.2665316807106137, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.2665316807106137, "reward_after_std": 0.7272491361945868, "reward_before_mean": 0.3611962553113699, "reward_before_std": 0.7304719872772694, "reward_change_max": 0.00027470290660858154, "reward_change_mean": -0.09466456901282072, "reward_change_min": -0.18015100061893463, "reward_change_std": 0.07119207771029323, "reward_std": 0.727249139919877, "rewards/cosine_scaled_reward": -0.13190188258886337, "rewards/format_reward": 0.625000013038516, "step": 300 }, { "advantage_max": 1.3870003148913383, "advantage_mean": -3.290673178391046e-08, "advantage_min": -1.3263009116053581, "advantage_std": 0.9997928962111473, "completion_length": 1702.6250457763672, "epoch": 0.344, "grad_norm": 0.3162795901298523, "kl": 0.021331787109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.68766384637248e-07, "loss": 0.0009, "reward": 0.550446767359972, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.550446767359972, "reward_after_std": 0.6271288879215717, "reward_before_mean": 0.675824873149395, "reward_before_std": 0.6298750899732113, "reward_change_max": 0.0, "reward_change_mean": -0.12537812907248735, "reward_change_min": -0.20914898626506329, "reward_change_std": 0.08048908645287156, "reward_std": 0.6271288879215717, "rewards/cosine_scaled_reward": -0.08917090541217476, "rewards/format_reward": 0.8541666865348816, "step": 301 }, { "advantage_max": 1.5091819912195206, "advantage_mean": -6.519258155535113e-08, "advantage_min": -1.0816588401794434, "advantage_std": 0.9997987672686577, "completion_length": 2065.7291984558105, "epoch": 0.34514285714285714, "grad_norm": 0.3135543167591095, "kl": 0.023223876953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.656784084364238e-07, "loss": 0.0009, "reward": 0.4963846392929554, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4963846392929554, "reward_after_std": 0.739615261554718, "reward_before_mean": 0.6132568791508675, "reward_before_std": 0.7420662026852369, "reward_change_max": 0.0004647970199584961, "reward_change_mean": -0.11687223275657743, "reward_change_min": -0.22242337465286255, "reward_change_std": 0.08635350060649216, "reward_std": 0.7396152671426535, "rewards/cosine_scaled_reward": 0.00454508513212204, "rewards/format_reward": 0.6041666753590107, "step": 302 }, { "advantage_max": 1.5902022868394852, "advantage_mean": -8.257727046601104e-08, "advantage_min": -1.086832880973816, "advantage_std": 0.999818466603756, "completion_length": 1145.2916793823242, "epoch": 0.3462857142857143, "grad_norm": 0.45699557662010193, "kl": 0.011653900146484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.6259454195101267e-07, "loss": 0.0005, "reward": 0.9359966441988945, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9359966441988945, "reward_after_std": 0.7046908959746361, "reward_before_mean": 1.0924376100301743, "reward_before_std": 0.6804439015686512, "reward_change_max": 0.0, "reward_change_mean": -0.15644100308418274, "reward_change_min": -0.25441403687000275, "reward_change_std": 0.09159657126292586, "reward_std": 0.7046909183263779, "rewards/cosine_scaled_reward": 0.056635468266904354, "rewards/format_reward": 0.9791666716337204, "step": 303 }, { "advantage_max": 1.500855103135109, "advantage_mean": -1.2417633477035395e-08, "advantage_min": -1.2662229239940643, "advantage_std": 0.9998027309775352, "completion_length": 1537.2916870117188, "epoch": 0.3474285714285714, "grad_norm": 0.29932647943496704, "kl": 0.0164794921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.59514935484316e-07, "loss": 0.0007, "reward": 0.5928112086839974, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5928112086839974, "reward_after_std": 0.669714767485857, "reward_before_mean": 0.7199195058783516, "reward_before_std": 0.6636870130896568, "reward_change_max": 0.0, "reward_change_mean": -0.12710829265415668, "reward_change_min": -0.19983693584799767, "reward_change_std": 0.07841200614348054, "reward_std": 0.6697147898375988, "rewards/cosine_scaled_reward": -0.05670691654086113, "rewards/format_reward": 0.8333333358168602, "step": 304 }, { "advantage_max": 1.5782776921987534, "advantage_mean": 7.4505804303903744e-09, "advantage_min": -1.0678502842783928, "advantage_std": 0.9998413845896721, "completion_length": 1363.020866394043, "epoch": 0.3485714285714286, "grad_norm": 0.3479137122631073, "kl": 0.0138092041015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.5643973913200837e-07, "loss": 0.0006, "reward": 0.5404365761205554, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5404365761205554, "reward_after_std": 0.7375712133944035, "reward_before_mean": 0.659806574229151, "reward_before_std": 0.7277458868920803, "reward_change_max": 0.00011269748210906982, "reward_change_mean": -0.11936996411532164, "reward_change_min": -0.21077474392950535, "reward_change_std": 0.07811526395380497, "reward_std": 0.7375712543725967, "rewards/cosine_scaled_reward": -0.1180134043097496, "rewards/format_reward": 0.8958333432674408, "step": 305 }, { "advantage_max": 1.4038033783435822, "advantage_mean": -3.414849530924968e-08, "advantage_min": -1.2565943449735641, "advantage_std": 0.9998504742980003, "completion_length": 1403.1875228881836, "epoch": 0.3497142857142857, "grad_norm": 0.4971572756767273, "kl": 0.037811279296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.5336910277482155e-07, "loss": 0.0015, "reward": 1.033497937489301, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.033497937489301, "reward_after_std": 0.8184244558215141, "reward_before_mean": 1.1987580558052287, "reward_before_std": 0.8222080878913403, "reward_change_max": 0.0, "reward_change_mean": -0.16526012308895588, "reward_change_min": -0.2801430709660053, "reward_change_std": 0.10898207128047943, "reward_std": 0.8184244707226753, "rewards/cosine_scaled_reward": 0.2035456746816635, "rewards/format_reward": 0.7916666753590107, "step": 306 }, { "advantage_max": 1.546189859509468, "advantage_mean": -3.849466811978175e-08, "advantage_min": -0.9951371252536774, "advantage_std": 0.999823309481144, "completion_length": 1248.8750381469727, "epoch": 0.35085714285714287, "grad_norm": 0.28933218121528625, "kl": 0.010631561279296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.503031760712397e-07, "loss": 0.0004, "reward": 0.6945029981434345, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6945029981434345, "reward_after_std": 0.8165526390075684, "reward_before_mean": 0.8262498378753662, "reward_before_std": 0.8094968199729919, "reward_change_max": 0.0, "reward_change_mean": -0.13174681551754475, "reward_change_min": -0.2479863427579403, "reward_change_std": 0.08963473793119192, "reward_std": 0.8165526390075684, "rewards/cosine_scaled_reward": -0.02437510807067156, "rewards/format_reward": 0.8750000037252903, "step": 307 }, { "advantage_max": 1.5855407267808914, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -1.0699032694101334, "advantage_std": 0.9998343363404274, "completion_length": 2460.3959197998047, "epoch": 0.352, "grad_norm": 0.26973065733909607, "kl": 0.026277542114257812, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.4724210845020494e-07, "loss": 0.0011, "reward": 0.2941260999068618, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2941260999068618, "reward_after_std": 0.710686132311821, "reward_before_mean": 0.3919203635305166, "reward_before_std": 0.7093313783407211, "reward_change_max": 0.00016189366579055786, "reward_change_mean": -0.09779428178444505, "reward_change_min": -0.17198238987475634, "reward_change_std": 0.0679063880816102, "reward_std": 0.7106861583888531, "rewards/cosine_scaled_reward": -0.10612315125763416, "rewards/format_reward": 0.604166679084301, "step": 308 }, { "advantage_max": 1.566720575094223, "advantage_mean": -2.235174206832724e-08, "advantage_min": -1.2411763966083527, "advantage_std": 0.9997689723968506, "completion_length": 2128.5000762939453, "epoch": 0.35314285714285715, "grad_norm": 0.35043805837631226, "kl": 0.02182769775390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.441860491038345e-07, "loss": 0.0009, "reward": 0.3403874337673187, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3403874337673187, "reward_after_std": 0.5281771644949913, "reward_before_mean": 0.4473396446555853, "reward_before_std": 0.5186953172087669, "reward_change_max": 0.0, "reward_change_mean": -0.1069522425532341, "reward_change_min": -0.1729181855916977, "reward_change_std": 0.06512200646102428, "reward_std": 0.5281771682202816, "rewards/cosine_scaled_reward": -0.12008016742765903, "rewards/format_reward": 0.6875000186264515, "step": 309 }, { "advantage_max": 1.3736951127648354, "advantage_mean": -4.4703484802788296e-08, "advantage_min": -1.4934279769659042, "advantage_std": 0.9997905939817429, "completion_length": 1532.770881652832, "epoch": 0.35428571428571426, "grad_norm": 0.5251758098602295, "kl": 0.027561187744140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.4113514698014953e-07, "loss": 0.0011, "reward": 0.6881696791388094, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6881696791388094, "reward_after_std": 0.5913071930408478, "reward_before_mean": 0.8264183476567268, "reward_before_std": 0.5859585925936699, "reward_change_max": 0.00013815611600875854, "reward_change_mean": -0.13824867643415928, "reward_change_min": -0.22185775637626648, "reward_change_std": 0.0831848056986928, "reward_std": 0.5913072191178799, "rewards/cosine_scaled_reward": -0.024290837347507477, "rewards/format_reward": 0.8750000074505806, "step": 310 }, { "advantage_max": 1.3643943965435028, "advantage_mean": -1.3721486946671746e-07, "advantage_min": -1.2142458334565163, "advantage_std": 0.9997890368103981, "completion_length": 1374.0000381469727, "epoch": 0.3554285714285714, "grad_norm": 0.4340468943119049, "kl": 0.015289306640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.3808955077581546e-07, "loss": 0.0006, "reward": 0.8373637902550399, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8373637902550399, "reward_after_std": 0.6764303985983133, "reward_before_mean": 0.9891079477965832, "reward_before_std": 0.6704716719686985, "reward_change_max": 0.00013653188943862915, "reward_change_mean": -0.15174414590001106, "reward_change_min": -0.2510115969926119, "reward_change_std": 0.09827401582151651, "reward_std": 0.67643041908741, "rewards/cosine_scaled_reward": 0.0362206120043993, "rewards/format_reward": 0.9166666716337204, "step": 311 }, { "advantage_max": 1.7767666429281235, "advantage_mean": -6.239861538581692e-08, "advantage_min": -0.9839174374938011, "advantage_std": 0.9997939020395279, "completion_length": 1366.6250305175781, "epoch": 0.3565714285714286, "grad_norm": 0.2758287489414215, "kl": 0.016445159912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.350494089288943e-07, "loss": 0.0007, "reward": 1.218240201473236, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.218240201473236, "reward_after_std": 0.7059788070619106, "reward_before_mean": 1.3989666923880577, "reward_before_std": 0.6575389942154288, "reward_change_max": 9.416043758392334e-05, "reward_change_mean": -0.18072648905217648, "reward_change_min": -0.27589481323957443, "reward_change_std": 0.1028918290976435, "reward_std": 0.7059788201004267, "rewards/cosine_scaled_reward": 0.282816668972373, "rewards/format_reward": 0.8333333432674408, "step": 312 }, { "advantage_max": 1.3789242804050446, "advantage_mean": -5.712111927902441e-08, "advantage_min": -1.2307011783123016, "advantage_std": 0.9997663050889969, "completion_length": 1910.125015258789, "epoch": 0.3577142857142857, "grad_norm": 0.4083651304244995, "kl": 0.025684356689453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.3201486961161093e-07, "loss": 0.001, "reward": 0.6246846728026867, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6246846728026867, "reward_after_std": 0.7261706329882145, "reward_before_mean": 0.7556617148220539, "reward_before_std": 0.7316101198084652, "reward_change_max": 2.294778823852539e-05, "reward_change_mean": -0.1309770462103188, "reward_change_min": -0.2411420177668333, "reward_change_std": 0.09405340254306793, "reward_std": 0.7261706478893757, "rewards/cosine_scaled_reward": 0.05491418088786304, "rewards/format_reward": 0.6458333432674408, "step": 313 }, { "advantage_max": 1.4088439345359802, "advantage_mean": 1.5211602422127157e-08, "advantage_min": -1.309675395488739, "advantage_std": 0.9997310861945152, "completion_length": 1676.4583549499512, "epoch": 0.3588571428571429, "grad_norm": 0.2689584493637085, "kl": 0.0266265869140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.2898608072313045e-07, "loss": 0.0011, "reward": 0.6452626027166843, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6452626027166843, "reward_after_std": 0.536408307030797, "reward_before_mean": 0.7821810264140368, "reward_before_std": 0.5282867718487978, "reward_change_max": 0.00013487786054611206, "reward_change_mean": -0.13691840320825577, "reward_change_min": -0.21895418595522642, "reward_change_std": 0.08299481403082609, "reward_std": 0.5364083256572485, "rewards/cosine_scaled_reward": 0.03692383784800768, "rewards/format_reward": 0.7083333432674408, "step": 314 }, { "advantage_max": 1.4416030943393707, "advantage_mean": 4.967053879312289e-09, "advantage_min": -1.1885623559355736, "advantage_std": 0.9998395070433617, "completion_length": 2093.750045776367, "epoch": 0.36, "grad_norm": 0.5383113026618958, "kl": 0.047454833984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.2596318988235037e-07, "loss": 0.0019, "reward": 0.42393129877746105, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.42393129877746105, "reward_after_std": 0.8001789897680283, "reward_before_mean": 0.5337961576879025, "reward_before_std": 0.8166777454316616, "reward_change_max": 1.0460615158081055e-05, "reward_change_mean": -0.10986482561565936, "reward_change_min": -0.2027928214520216, "reward_change_std": 0.08343219291418791, "reward_std": 0.8001790381968021, "rewards/cosine_scaled_reward": -0.06643527187407017, "rewards/format_reward": 0.6666666809469461, "step": 315 }, { "advantage_max": 1.5823724269866943, "advantage_mean": -1.0865429667106241e-09, "advantage_min": -1.2184803411364555, "advantage_std": 0.9997778832912445, "completion_length": 2214.354248046875, "epoch": 0.36114285714285715, "grad_norm": 0.6452406048774719, "kl": 0.03017425537109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.2294634442070553e-07, "loss": 0.0012, "reward": 0.042976333759725094, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.042976333759725094, "reward_after_std": 0.5025542117655277, "reward_before_mean": 0.12296128878369927, "reward_before_std": 0.4968912973999977, "reward_change_max": 0.0006083101034164429, "reward_change_mean": -0.07998494571074843, "reward_change_min": -0.14237389434129, "reward_change_std": 0.05578152043744922, "reward_std": 0.5025542229413986, "rewards/cosine_scaled_reward": -0.29268604703247547, "rewards/format_reward": 0.7083333469927311, "step": 316 }, { "advantage_max": 1.4967581778764725, "advantage_mean": -7.32640422773656e-08, "advantage_min": -1.2174543887376785, "advantage_std": 0.9997687339782715, "completion_length": 2063.270881652832, "epoch": 0.36228571428571427, "grad_norm": 0.46099042892456055, "kl": 0.03884124755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1993569137498776e-07, "loss": 0.0016, "reward": 0.45239776093512774, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.45239776093512774, "reward_after_std": 0.6172072049230337, "reward_before_mean": 0.5681051621213555, "reward_before_std": 0.6142678093165159, "reward_change_max": 0.00023806840181350708, "reward_change_mean": -0.11570744588971138, "reward_change_min": -0.2016570856794715, "reward_change_std": 0.08009989093989134, "reward_std": 0.6172072291374207, "rewards/cosine_scaled_reward": 0.013219265267252922, "rewards/format_reward": 0.5416666753590107, "step": 317 }, { "advantage_max": 1.567556545138359, "advantage_mean": -1.9868215073159945e-08, "advantage_min": -1.2147565111517906, "advantage_std": 0.9997292533516884, "completion_length": 1573.3750457763672, "epoch": 0.36342857142857143, "grad_norm": 0.33571857213974, "kl": 0.038578033447265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1693137748017915e-07, "loss": 0.0015, "reward": 0.3722683619707823, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3722683619707823, "reward_after_std": 0.486986355856061, "reward_before_mean": 0.48412251146510243, "reward_before_std": 0.4724613279104233, "reward_change_max": 0.00047060102224349976, "reward_change_mean": -0.11185414809733629, "reward_change_min": -0.17265755124390125, "reward_change_std": 0.06754804449155927, "reward_std": 0.486986368894577, "rewards/cosine_scaled_reward": -0.1641887491568923, "rewards/format_reward": 0.8125000055879354, "step": 318 }, { "advantage_max": 1.5715395361185074, "advantage_mean": -8.071462553882469e-09, "advantage_min": -1.107521429657936, "advantage_std": 0.9997441917657852, "completion_length": 1662.8125457763672, "epoch": 0.36457142857142855, "grad_norm": 0.44830116629600525, "kl": 0.024005889892578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1393354916230005e-07, "loss": 0.001, "reward": 0.41969322599470615, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41969322599470615, "reward_after_std": 0.5853044390678406, "reward_before_mean": 0.5322350189089775, "reward_before_std": 0.5741870794445276, "reward_change_max": 0.0, "reward_change_mean": -0.1125417877919972, "reward_change_min": -0.1796498317271471, "reward_change_std": 0.07052160147577524, "reward_std": 0.5853044539690018, "rewards/cosine_scaled_reward": -0.15054917754605412, "rewards/format_reward": 0.8333333488553762, "step": 319 }, { "advantage_max": 1.5654927790164948, "advantage_mean": -3.0423205343854676e-08, "advantage_min": -0.9983152002096176, "advantage_std": 0.9998209252953529, "completion_length": 1109.3333587646484, "epoch": 0.3657142857142857, "grad_norm": 0.4801042079925537, "kl": 0.0272216796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1094235253127374e-07, "loss": 0.0011, "reward": 0.6591540115623502, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6591540115623502, "reward_after_std": 0.7177729271352291, "reward_before_mean": 0.7893197052180767, "reward_before_std": 0.7012372482568026, "reward_change_max": 0.0, "reward_change_mean": -0.13016566913574934, "reward_change_min": -0.22104015946388245, "reward_change_std": 0.07897943677380681, "reward_std": 0.7177729494869709, "rewards/cosine_scaled_reward": -0.06367350154323503, "rewards/format_reward": 0.9166666716337204, "step": 320 }, { "advantage_max": 1.5770218819379807, "advantage_mean": -7.698933512934047e-08, "advantage_min": -1.113427273929119, "advantage_std": 0.9998316392302513, "completion_length": 1142.2292022705078, "epoch": 0.3668571428571429, "grad_norm": 0.3772043287754059, "kl": 0.019496917724609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.079579333738039e-07, "loss": 0.0008, "reward": 1.0956639312207699, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.0956639312207699, "reward_after_std": 0.7559323236346245, "reward_before_mean": 1.2653162218630314, "reward_before_std": 0.7302691843360662, "reward_change_max": 0.0, "reward_change_mean": -0.16965226829051971, "reward_change_min": -0.264015831053257, "reward_change_std": 0.09789514960721135, "reward_std": 0.755932342261076, "rewards/cosine_scaled_reward": 0.15349140530452132, "rewards/format_reward": 0.9583333358168602, "step": 321 }, { "advantage_max": 1.6221220940351486, "advantage_mean": -2.6077033199456423e-08, "advantage_min": -0.9467347487807274, "advantage_std": 0.9998131617903709, "completion_length": 1825.3333740234375, "epoch": 0.368, "grad_norm": 0.6722792983055115, "kl": 0.0529022216796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.0498043714627006e-07, "loss": 0.0021, "reward": 0.15624785982072353, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15624785982072353, "reward_after_std": 0.6913126781582832, "reward_before_mean": 0.24026533402502537, "reward_before_std": 0.6873844414949417, "reward_change_max": 0.00028805434703826904, "reward_change_mean": -0.08401747269090265, "reward_change_min": -0.1541888639330864, "reward_change_std": 0.06128338072448969, "reward_std": 0.6913126930594444, "rewards/cosine_scaled_reward": -0.18195067904889584, "rewards/format_reward": 0.6041666679084301, "step": 322 }, { "advantage_max": 1.512984074652195, "advantage_mean": -2.2662183907229405e-08, "advantage_min": -1.2236190289258957, "advantage_std": 0.999837301671505, "completion_length": 1798.4166946411133, "epoch": 0.36914285714285716, "grad_norm": 0.6877778172492981, "kl": 0.04708099365234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.020100089676376e-07, "loss": 0.0019, "reward": 0.39301418559625745, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.39301418559625745, "reward_after_std": 0.8095175884664059, "reward_before_mean": 0.49585794657468796, "reward_before_std": 0.8129806108772755, "reward_change_max": 8.770078420639038e-05, "reward_change_mean": -0.10284376982599497, "reward_change_min": -0.17867287807166576, "reward_change_std": 0.0713571673259139, "reward_std": 0.8095176108181477, "rewards/cosine_scaled_reward": -0.03332102671265602, "rewards/format_reward": 0.5625000111758709, "step": 323 }, { "advantage_max": 1.4784416109323502, "advantage_mean": 3.725290298461914e-09, "advantage_min": -1.0888723582029343, "advantage_std": 0.9997969344258308, "completion_length": 1833.7500381469727, "epoch": 0.3702857142857143, "grad_norm": 0.6865483522415161, "kl": 0.05321502685546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.9904679361238526e-07, "loss": 0.0021, "reward": 0.2840481363236904, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.2840481363236904, "reward_after_std": 0.6864129733294249, "reward_before_mean": 0.38250038865953684, "reward_before_std": 0.6952138058841228, "reward_change_max": 2.1405518054962158e-05, "reward_change_mean": -0.09845226665493101, "reward_change_min": -0.20211569592356682, "reward_change_std": 0.074963403865695, "reward_std": 0.68641297519207, "rewards/cosine_scaled_reward": -0.1316664731130004, "rewards/format_reward": 0.6458333488553762, "step": 324 }, { "advantage_max": 1.582233265042305, "advantage_mean": -4.159907507350624e-08, "advantage_min": -0.9999347180128098, "advantage_std": 0.9998515471816063, "completion_length": 2004.9167022705078, "epoch": 0.37142857142857144, "grad_norm": 0.4835790693759918, "kl": 0.04077911376953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.9609093550344907e-07, "loss": 0.0016, "reward": 0.5837226863950491, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5837226863950491, "reward_after_std": 0.8696581162512302, "reward_before_mean": 0.7024334259331226, "reward_before_std": 0.8600073345005512, "reward_change_max": 0.00013503432273864746, "reward_change_mean": -0.11871073208749294, "reward_change_min": -0.22101427800953388, "reward_change_std": 0.08400990348309278, "reward_std": 0.8696581199765205, "rewards/cosine_scaled_reward": -0.013366644561756402, "rewards/format_reward": 0.7291666753590107, "step": 325 }, { "advantage_max": 1.466815024614334, "advantage_mean": -9.934107980669182e-09, "advantage_min": -1.0336529538035393, "advantage_std": 0.9998352974653244, "completion_length": 1691.750057220459, "epoch": 0.37257142857142855, "grad_norm": 0.4159682095050812, "kl": 0.050548553466796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.931425787051832e-07, "loss": 0.002, "reward": 0.5287162624299526, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5287162624299526, "reward_after_std": 0.7326546274125576, "reward_before_mean": 0.6483225747942924, "reward_before_std": 0.7302737832069397, "reward_change_max": 0.0, "reward_change_mean": -0.11960631795227528, "reward_change_min": -0.2105935337021947, "reward_change_std": 0.08047133404761553, "reward_std": 0.7326546646654606, "rewards/cosine_scaled_reward": -0.0404220474883914, "rewards/format_reward": 0.7291666734963655, "step": 326 }, { "advantage_max": 1.6500040888786316, "advantage_mean": -7.140140018124796e-08, "advantage_min": -0.9951028749346733, "advantage_std": 0.999831348657608, "completion_length": 1675.0000457763672, "epoch": 0.3737142857142857, "grad_norm": 0.4650628864765167, "kl": 0.031185150146484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.902018669163384e-07, "loss": 0.0012, "reward": 0.8487563850358129, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.8487563850358129, "reward_after_std": 0.6994316130876541, "reward_before_mean": 0.9964957498013973, "reward_before_std": 0.6714913509786129, "reward_change_max": 0.0004935041069984436, "reward_change_mean": -0.1477393419481814, "reward_change_min": -0.24240724183619022, "reward_change_std": 0.09056165255606174, "reward_std": 0.6994316205382347, "rewards/cosine_scaled_reward": 0.1128311650827527, "rewards/format_reward": 0.7708333376795053, "step": 327 }, { "advantage_max": 1.593785047531128, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -1.053187295794487, "advantage_std": 0.9997126534581184, "completion_length": 1876.0208740234375, "epoch": 0.37485714285714283, "grad_norm": 0.43940746784210205, "kl": 0.047298431396484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.872689434630585e-07, "loss": 0.0019, "reward": 0.23065691691590473, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23065691691590473, "reward_after_std": 0.6557473633438349, "reward_before_mean": 0.3228796450421214, "reward_before_std": 0.6465305294841528, "reward_change_max": 0.0001626908779144287, "reward_change_mean": -0.09222274320200086, "reward_change_min": -0.16623501293361187, "reward_change_std": 0.06370436307042837, "reward_std": 0.6557473940774798, "rewards/cosine_scaled_reward": -0.19272685050964355, "rewards/format_reward": 0.7083333414047956, "step": 328 }, { "advantage_max": 1.5264313220977783, "advantage_mean": -1.5149514409618092e-07, "advantage_min": -1.2123200222849846, "advantage_std": 0.9997549876570702, "completion_length": 1171.4375495910645, "epoch": 0.376, "grad_norm": 0.5207266211509705, "kl": 0.03118896484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.843439512918949e-07, "loss": 0.0012, "reward": 0.9414299409836531, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9414299409836531, "reward_after_std": 0.6094411462545395, "reward_before_mean": 1.1017081029713154, "reward_before_std": 0.5816697841510177, "reward_change_max": 0.0, "reward_change_mean": -0.16027818620204926, "reward_change_min": -0.2311301939189434, "reward_change_std": 0.09064106363803148, "reward_std": 0.6094411574304104, "rewards/cosine_scaled_reward": 0.08210405427962542, "rewards/format_reward": 0.9375000149011612, "step": 329 }, { "advantage_max": 1.5140583962202072, "advantage_mean": -3.476937660007451e-08, "advantage_min": -1.1497740894556046, "advantage_std": 0.9998085647821426, "completion_length": 1699.7292022705078, "epoch": 0.37714285714285717, "grad_norm": 0.9815267324447632, "kl": 0.0795135498046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.8142703296283953e-07, "loss": 0.0032, "reward": 0.28797438461333513, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28797438461333513, "reward_after_std": 0.6291838064789772, "reward_before_mean": 0.38602944649755955, "reward_before_std": 0.6264833547174931, "reward_change_max": 9.696930646896362e-05, "reward_change_mean": -0.09805506188422441, "reward_change_min": -0.17832140903919935, "reward_change_std": 0.06699541071429849, "reward_std": 0.6291838251054287, "rewards/cosine_scaled_reward": -0.17156862188130617, "rewards/format_reward": 0.7291666734963655, "step": 330 }, { "advantage_max": 1.549956038594246, "advantage_mean": -6.208817682207268e-09, "advantage_min": -1.2178971469402313, "advantage_std": 0.9997905716300011, "completion_length": 1860.1250686645508, "epoch": 0.3782857142857143, "grad_norm": 0.8947551250457764, "kl": 0.05895233154296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.785183306423767e-07, "loss": 0.0024, "reward": 0.31997561175376177, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31997561175376177, "reward_after_std": 0.5985409691929817, "reward_before_mean": 0.4238438168540597, "reward_before_std": 0.5951807573437691, "reward_change_max": 0.0001405850052833557, "reward_change_mean": -0.10386821650899947, "reward_change_min": -0.1779593052342534, "reward_change_std": 0.06808420806191862, "reward_std": 0.5985409840941429, "rewards/cosine_scaled_reward": -0.07974475575610995, "rewards/format_reward": 0.583333345130086, "step": 331 }, { "advantage_max": 1.4827670902013779, "advantage_mean": -4.967053213178474e-09, "advantage_min": -1.1617141589522362, "advantage_std": 0.9997115060687065, "completion_length": 1583.0000457763672, "epoch": 0.37942857142857145, "grad_norm": 0.36066368222236633, "kl": 0.03369140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.7561798609655373e-07, "loss": 0.0013, "reward": 0.3906476739794016, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3906476739794016, "reward_after_std": 0.47099397890269756, "reward_before_mean": 0.5055399723351002, "reward_before_std": 0.46263338066637516, "reward_change_max": 0.0, "reward_change_mean": -0.11489230440929532, "reward_change_min": -0.1840682066977024, "reward_change_std": 0.06766448728740215, "reward_std": 0.4709939956665039, "rewards/cosine_scaled_reward": -0.12223003013059497, "rewards/format_reward": 0.75, "step": 332 }, { "advantage_max": 1.4400066137313843, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -1.3157860115170479, "advantage_std": 0.9998525753617287, "completion_length": 1373.9375457763672, "epoch": 0.38057142857142856, "grad_norm": 0.4405366778373718, "kl": 0.037105560302734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.72726140684072e-07, "loss": 0.0015, "reward": 0.878742154687643, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.878742154687643, "reward_after_std": 0.8494612164795399, "reward_before_mean": 1.027712881565094, "reward_before_std": 0.8498156182467937, "reward_change_max": 0.0, "reward_change_mean": -0.14897070918232203, "reward_change_min": -0.2575899437069893, "reward_change_std": 0.09805800672620535, "reward_std": 0.8494612462818623, "rewards/cosine_scaled_reward": 0.06593976262956858, "rewards/format_reward": 0.8958333507180214, "step": 333 }, { "advantage_max": 1.479415774345398, "advantage_mean": -7.45058070794613e-09, "advantage_min": -1.1129272356629372, "advantage_std": 0.999847337603569, "completion_length": 2343.5833892822266, "epoch": 0.38171428571428573, "grad_norm": 0.7468224763870239, "kl": 0.11907958984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.6984293534939737e-07, "loss": 0.0048, "reward": 0.07790927402675152, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07790927402675152, "reward_after_std": 0.8249085582792759, "reward_before_mean": 0.15207926771836355, "reward_before_std": 0.8387485817074776, "reward_change_max": 0.00037413090467453003, "reward_change_mean": -0.07416999898850918, "reward_change_min": -0.16692013293504715, "reward_change_std": 0.06690044444985688, "reward_std": 0.8249085918068886, "rewards/cosine_scaled_reward": -0.18437703466042876, "rewards/format_reward": 0.5208333488553762, "step": 334 }, { "advantage_max": 1.5890333950519562, "advantage_mean": -6.581346334577631e-08, "advantage_min": -1.1691535487771034, "advantage_std": 0.9998458698391914, "completion_length": 1481.020866394043, "epoch": 0.38285714285714284, "grad_norm": 0.3421805202960968, "kl": 0.036113739013671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.6696851061588994e-07, "loss": 0.0014, "reward": 0.7999673548620194, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7999673548620194, "reward_after_std": 0.7872883416712284, "reward_before_mean": 0.9402303099632263, "reward_before_std": 0.7693048864603043, "reward_change_max": 0.00010892003774642944, "reward_change_mean": -0.1402629679068923, "reward_change_min": -0.21821013279259205, "reward_change_std": 0.08301450358703732, "reward_std": 0.7872883789241314, "rewards/cosine_scaled_reward": 0.043031807988882065, "rewards/format_reward": 0.8541666772216558, "step": 335 }, { "advantage_max": 1.5050934553146362, "advantage_mean": -1.8936892387522164e-08, "advantage_min": -1.2547463476657867, "advantage_std": 0.999822124838829, "completion_length": 1533.645896911621, "epoch": 0.384, "grad_norm": 0.4388027489185333, "kl": 0.04041290283203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.641030065789562e-07, "loss": 0.0016, "reward": 0.6022516712546349, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6022516712546349, "reward_after_std": 0.6543654501438141, "reward_before_mean": 0.7301745153963566, "reward_before_std": 0.6454097218811512, "reward_change_max": 0.000157281756401062, "reward_change_mean": -0.12792286463081837, "reward_change_min": -0.20177920907735825, "reward_change_std": 0.07864083210006356, "reward_std": 0.6543654501438141, "rewards/cosine_scaled_reward": 0.03175393491983414, "rewards/format_reward": 0.6666666772216558, "step": 336 }, { "advantage_max": 1.5357903391122818, "advantage_mean": -3.97364304793868e-08, "advantage_min": -1.2102145925164223, "advantage_std": 0.9998451396822929, "completion_length": 1482.979232788086, "epoch": 0.3851428571428571, "grad_norm": 0.6808515787124634, "kl": 0.035221099853515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.612465628992203e-07, "loss": 0.0014, "reward": 0.8413368645124137, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8413368645124137, "reward_after_std": 0.8354975320398808, "reward_before_mean": 0.9865454584360123, "reward_before_std": 0.8282735012471676, "reward_change_max": 0.0, "reward_change_mean": -0.1452085990458727, "reward_change_min": -0.25506412237882614, "reward_change_std": 0.09339386876672506, "reward_std": 0.8354975394904613, "rewards/cosine_scaled_reward": 0.0349393846699968, "rewards/format_reward": 0.9166666865348816, "step": 337 }, { "advantage_max": 1.5026657208800316, "advantage_mean": -5.4637592117323663e-08, "advantage_min": -1.128111731261015, "advantage_std": 0.999842643737793, "completion_length": 1445.9792251586914, "epoch": 0.3862857142857143, "grad_norm": 0.4162408709526062, "kl": 0.04589080810546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.5839931879571725e-07, "loss": 0.0018, "reward": 0.8082408686168492, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8082408686168492, "reward_after_std": 0.8124595545232296, "reward_before_mean": 0.9513399479910731, "reward_before_std": 0.8106804341077805, "reward_change_max": 0.0, "reward_change_mean": -0.14309907238930464, "reward_change_min": -0.23549943324178457, "reward_change_std": 0.09040046157315373, "reward_std": 0.8124595619738102, "rewards/cosine_scaled_reward": 0.04858662304468453, "rewards/format_reward": 0.8541666772216558, "step": 338 }, { "advantage_max": 1.317950114607811, "advantage_mean": 8.381903254806033e-09, "advantage_min": -1.4605398029088974, "advantage_std": 0.9998449608683586, "completion_length": 1954.3125457763672, "epoch": 0.38742857142857146, "grad_norm": 1.3577280044555664, "kl": 0.08642578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.555614130391079e-07, "loss": 0.0035, "reward": 0.30176051147282124, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30176051147282124, "reward_after_std": 0.6594284176826477, "reward_before_mean": 0.4035015068948269, "reward_before_std": 0.674022451043129, "reward_change_max": 0.0011976435780525208, "reward_change_mean": -0.10174098331481218, "reward_change_min": -0.18010628037154675, "reward_change_std": 0.0762557522393763, "reward_std": 0.6594284400343895, "rewards/cosine_scaled_reward": -0.110749252140522, "rewards/format_reward": 0.6250000167638063, "step": 339 }, { "advantage_max": 1.4834815636277199, "advantage_mean": -4.594524882772788e-08, "advantage_min": -1.2522487416863441, "advantage_std": 0.9998084381222725, "completion_length": 1579.2500381469727, "epoch": 0.38857142857142857, "grad_norm": 0.33722105622291565, "kl": 0.04709053039550781, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.5273298394491515e-07, "loss": 0.0019, "reward": 0.6650699935853481, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6650699935853481, "reward_after_std": 0.6854303106665611, "reward_before_mean": 0.7988633252680302, "reward_before_std": 0.6824867390096188, "reward_change_max": 8.215010166168213e-05, "reward_change_mean": -0.133793359156698, "reward_change_min": -0.21632769331336021, "reward_change_std": 0.08351973094977438, "reward_std": 0.6854303181171417, "rewards/cosine_scaled_reward": -0.017234998289495707, "rewards/format_reward": 0.8333333488553762, "step": 340 }, { "advantage_max": 1.689264178276062, "advantage_mean": -2.7877590941249863e-07, "advantage_min": -1.024775207042694, "advantage_std": 0.9997997581958771, "completion_length": 1468.0417022705078, "epoch": 0.38971428571428574, "grad_norm": 0.68641597032547, "kl": 0.0506744384765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4991416936678276e-07, "loss": 0.002, "reward": 0.6998622994869947, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6998622994869947, "reward_after_std": 0.736074797809124, "reward_before_mean": 0.8322503371164203, "reward_before_std": 0.70997529104352, "reward_change_max": 0.0, "reward_change_mean": -0.13238807581365108, "reward_change_min": -0.21726097911596298, "reward_change_std": 0.08524594688788056, "reward_std": 0.7360748127102852, "rewards/cosine_scaled_reward": 0.04112516465829685, "rewards/format_reward": 0.7500000037252903, "step": 341 }, { "advantage_max": 1.5366226136684418, "advantage_mean": -3.911554946611773e-08, "advantage_min": -1.165475644171238, "advantage_std": 0.9998475164175034, "completion_length": 1879.416748046875, "epoch": 0.39085714285714285, "grad_norm": 0.9024969339370728, "kl": 0.09264373779296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.471051066897562e-07, "loss": 0.0037, "reward": 0.7040500938892365, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7040500938892365, "reward_after_std": 0.9079513140022755, "reward_before_mean": 0.8336313590407372, "reward_before_std": 0.9082721434533596, "reward_change_max": 0.0, "reward_change_mean": -0.12958126701414585, "reward_change_min": -0.227171890437603, "reward_change_std": 0.08963154442608356, "reward_std": 0.907951358705759, "rewards/cosine_scaled_reward": 0.010565669741481543, "rewards/format_reward": 0.8125000186264515, "step": 342 }, { "advantage_max": 1.412947177886963, "advantage_mean": -2.017865641246175e-08, "advantage_min": -1.2453822493553162, "advantage_std": 0.9998679906129837, "completion_length": 1775.9792022705078, "epoch": 0.392, "grad_norm": 0.6765278577804565, "kl": 0.08543014526367188, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4430593282358777e-07, "loss": 0.0034, "reward": 0.6520730927586555, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6520730927586555, "reward_after_std": 0.9671961665153503, "reward_before_mean": 0.776901314035058, "reward_before_std": 0.9851310290396214, "reward_change_max": 0.0001538395881652832, "reward_change_mean": -0.12482821242883801, "reward_change_min": -0.24192855693399906, "reward_change_std": 0.09555056085810065, "reward_std": 0.9671961963176727, "rewards/cosine_scaled_reward": 0.0342839767690748, "rewards/format_reward": 0.708333345130086, "step": 343 }, { "advantage_max": 1.5145802944898605, "advantage_mean": -1.2728075615697776e-07, "advantage_min": -1.318326160311699, "advantage_std": 0.9997060596942902, "completion_length": 1550.6875534057617, "epoch": 0.3931428571428571, "grad_norm": 0.8243290781974792, "kl": 0.084259033203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4151678419606233e-07, "loss": 0.0034, "reward": 1.1622946355491877, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.1622946355491877, "reward_after_std": 0.5587964607402682, "reward_before_mean": 1.3463120497763157, "reward_before_std": 0.5346446477342397, "reward_change_max": 0.0005112588405609131, "reward_change_mean": -0.1840174519456923, "reward_change_min": -0.2756340950727463, "reward_change_std": 0.10637355549260974, "reward_std": 0.5587964681908488, "rewards/cosine_scaled_reward": 0.24607268278487027, "rewards/format_reward": 0.8541666753590107, "step": 344 }, { "advantage_max": 1.540076158940792, "advantage_mean": -5.215406517766752e-08, "advantage_min": -1.1613540425896645, "advantage_std": 0.9998455420136452, "completion_length": 1671.3750305175781, "epoch": 0.3942857142857143, "grad_norm": 1.7589654922485352, "kl": 0.1017913818359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.387377967463493e-07, "loss": 0.0041, "reward": 0.5623795920982957, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5623795920982957, "reward_after_std": 0.8140008598566055, "reward_before_mean": 0.6826059645973146, "reward_before_std": 0.8165422268211842, "reward_change_max": 0.0010628923773765564, "reward_change_mean": -0.12022638134658337, "reward_change_min": -0.21904029790312052, "reward_change_std": 0.08774518640711904, "reward_std": 0.8140008747577667, "rewards/cosine_scaled_reward": 0.04963629972189665, "rewards/format_reward": 0.5833333414047956, "step": 345 }, { "advantage_max": 1.645007699728012, "advantage_mean": -5.0912302596017867e-08, "advantage_min": -1.0988484546542168, "advantage_std": 0.9998019188642502, "completion_length": 1430.3959045410156, "epoch": 0.3954285714285714, "grad_norm": 0.32801398634910583, "kl": 0.046825408935546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.359691059183761e-07, "loss": 0.0019, "reward": 0.5887374058365822, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5887374058365822, "reward_after_std": 0.7088357359170914, "reward_before_mean": 0.7109762877225876, "reward_before_std": 0.6876601781696081, "reward_change_max": 0.0, "reward_change_mean": -0.12223889189772308, "reward_change_min": -0.17916064709424973, "reward_change_std": 0.06939612235873938, "reward_std": 0.7088357619941235, "rewards/cosine_scaled_reward": -0.09242854062176775, "rewards/format_reward": 0.895833333954215, "step": 346 }, { "advantage_max": 1.5532027930021286, "advantage_mean": -1.179675318541129e-08, "advantage_min": -1.0655813068151474, "advantage_std": 0.9997884854674339, "completion_length": 1528.8750457763672, "epoch": 0.3965714285714286, "grad_norm": 0.578608512878418, "kl": 0.04645538330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.3321084665422803e-07, "loss": 0.0019, "reward": 0.30343328788876534, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30343328788876534, "reward_after_std": 0.5521973073482513, "reward_before_mean": 0.4068300393410027, "reward_before_std": 0.5445283949375153, "reward_change_max": 0.0, "reward_change_mean": -0.10339675471186638, "reward_change_min": -0.1844671368598938, "reward_change_std": 0.06681121652945876, "reward_std": 0.5521973147988319, "rewards/cosine_scaled_reward": -0.22366831824183464, "rewards/format_reward": 0.8541666865348816, "step": 347 }, { "advantage_max": 1.4550811648368835, "advantage_mean": -3.7563345905988754e-08, "advantage_min": -1.2732831984758377, "advantage_std": 0.9997940734028816, "completion_length": 1711.5000228881836, "epoch": 0.3977142857142857, "grad_norm": 0.9890735149383545, "kl": 0.12113571166992188, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.3046315338757026e-07, "loss": 0.0048, "reward": 0.6786003398301546, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.6786003398301546, "reward_after_std": 0.6397302523255348, "reward_before_mean": 0.8155205333605409, "reward_before_std": 0.6368211433291435, "reward_change_max": 0.00017334520816802979, "reward_change_mean": -0.13692022114992142, "reward_change_min": -0.22533964831382036, "reward_change_std": 0.0878034750930965, "reward_std": 0.6397302746772766, "rewards/cosine_scaled_reward": 0.011926926672458649, "rewards/format_reward": 0.791666679084301, "step": 348 }, { "advantage_max": 1.5810499042272568, "advantage_mean": -3.973643136756522e-08, "advantage_min": -1.0555768236517906, "advantage_std": 0.9997776970267296, "completion_length": 1204.1667022705078, "epoch": 0.39885714285714285, "grad_norm": 0.6008383631706238, "kl": 0.03905487060546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.2772616003709616e-07, "loss": 0.0016, "reward": 0.6463835099712014, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6463835099712014, "reward_after_std": 0.5708205699920654, "reward_before_mean": 0.7810069844126701, "reward_before_std": 0.5535554438829422, "reward_change_max": 0.0, "reward_change_mean": -0.13462348422035575, "reward_change_min": -0.2155060712248087, "reward_change_std": 0.07849409175105393, "reward_std": 0.5708205848932266, "rewards/cosine_scaled_reward": -0.0053298622369766235, "rewards/format_reward": 0.7916666716337204, "step": 349 }, { "advantage_max": 1.4653938859701157, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -1.2062378972768784, "advantage_std": 0.9998128265142441, "completion_length": 917.1250305175781, "epoch": 0.4, "grad_norm": 0.537183403968811, "kl": 0.012424468994140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.250000000000001e-07, "loss": 0.0005, "reward": 0.6380548775196075, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6380548775196075, "reward_after_std": 0.6736834794282913, "reward_before_mean": 0.7693752646446228, "reward_before_std": 0.6650245450437069, "reward_change_max": 0.0, "reward_change_mean": -0.13132039550691843, "reward_change_min": -0.22087561339139938, "reward_change_std": 0.08057826245203614, "reward_std": 0.6736834980547428, "rewards/cosine_scaled_reward": -0.09447903372347355, "rewards/format_reward": 0.9583333432674408, "step": 350 }, { "advantage_max": 1.5197671800851822, "advantage_mean": -4.532436648219118e-08, "advantage_min": -1.1577540412545204, "advantage_std": 0.9998496472835541, "completion_length": 1553.4792022705078, "epoch": 0.40114285714285713, "grad_norm": 0.8147112727165222, "kl": 0.12945556640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.222848061454764e-07, "loss": 0.0052, "reward": 0.7770597245544195, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7770597245544195, "reward_after_std": 0.7373732291162014, "reward_before_mean": 0.9183787778019905, "reward_before_std": 0.7273126542568207, "reward_change_max": 0.0005416646599769592, "reward_change_mean": -0.14131904486566782, "reward_change_min": -0.23502979520708323, "reward_change_std": 0.08941868646070361, "reward_std": 0.737373273819685, "rewards/cosine_scaled_reward": 0.06335602421313524, "rewards/format_reward": 0.7916666828095913, "step": 351 }, { "advantage_max": 1.569848746061325, "advantage_mean": -4.190951918836561e-09, "advantage_min": -1.2060598954558372, "advantage_std": 0.9997753575444221, "completion_length": 1646.0834045410156, "epoch": 0.4022857142857143, "grad_norm": 0.8164442181587219, "kl": 0.11374664306640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.195807108082429e-07, "loss": 0.0046, "reward": 0.5288133807480335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5288133807480335, "reward_after_std": 0.610162977129221, "reward_before_mean": 0.6497844681143761, "reward_before_std": 0.5933005921542645, "reward_change_max": 0.0, "reward_change_mean": -0.12097106548026204, "reward_change_min": -0.1816613031551242, "reward_change_std": 0.06907373643480241, "reward_std": 0.6101629845798016, "rewards/cosine_scaled_reward": -0.029274450847879052, "rewards/format_reward": 0.708333333954215, "step": 352 }, { "advantage_max": 1.3838004171848297, "advantage_mean": -4.718701124284408e-08, "advantage_min": -1.350169561803341, "advantage_std": 0.9998253807425499, "completion_length": 1370.3750381469727, "epoch": 0.4034285714285714, "grad_norm": 1.0508337020874023, "kl": 0.0648651123046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.168878457820915e-07, "loss": 0.0026, "reward": 0.8240749211981893, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8240749211981893, "reward_after_std": 0.718726497143507, "reward_before_mean": 0.9740468331146985, "reward_before_std": 0.7258542701601982, "reward_change_max": 0.0, "reward_change_mean": -0.14997189305722713, "reward_change_min": -0.24729277566075325, "reward_change_std": 0.0964922783896327, "reward_std": 0.7187265120446682, "rewards/cosine_scaled_reward": 0.08077336475253105, "rewards/format_reward": 0.8125000111758709, "step": 353 }, { "advantage_max": 1.486832708120346, "advantage_mean": -1.9868215184182247e-08, "advantage_min": -1.33545982837677, "advantage_std": 0.9997362196445465, "completion_length": 1081.6250305175781, "epoch": 0.4045714285714286, "grad_norm": 0.6051336526870728, "kl": 0.052875518798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.142063423134644e-07, "loss": 0.0021, "reward": 0.933193551376462, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.933193551376462, "reward_after_std": 0.5013358984142542, "reward_before_mean": 1.097759174183011, "reward_before_std": 0.48550001345574856, "reward_change_max": 0.0, "reward_change_mean": -0.16456563211977482, "reward_change_min": -0.23568181600421667, "reward_change_std": 0.09174512466415763, "reward_std": 0.5013359021395445, "rewards/cosine_scaled_reward": 0.06971290893852711, "rewards/format_reward": 0.9583333358168602, "step": 354 }, { "advantage_max": 1.3949600085616112, "advantage_mean": -4.842877388000488e-08, "advantage_min": -1.2098092809319496, "advantage_std": 0.9998485893011093, "completion_length": 1034.3125190734863, "epoch": 0.4057142857142857, "grad_norm": 0.4866308271884918, "kl": 0.037677764892578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.115363310950578e-07, "loss": 0.0015, "reward": 0.8702779617160559, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8702779617160559, "reward_after_std": 0.8237923942506313, "reward_before_mean": 1.020679783076048, "reward_before_std": 0.8269104994833469, "reward_change_max": 0.00039453059434890747, "reward_change_mean": -0.1504018036648631, "reward_change_min": -0.28018930554389954, "reward_change_std": 0.10402650013566017, "reward_std": 0.8237924389541149, "rewards/cosine_scaled_reward": 0.08325653476640582, "rewards/format_reward": 0.8541666716337204, "step": 355 }, { "advantage_max": 1.4376270696520805, "advantage_mean": -2.2351742789972207e-08, "advantage_min": -1.1074972301721573, "advantage_std": 0.9998405128717422, "completion_length": 1922.1875381469727, "epoch": 0.40685714285714286, "grad_norm": 0.8787837028503418, "kl": 0.18633270263671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.0887794225945143e-07, "loss": 0.0075, "reward": 0.5590815953910351, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5590815953910351, "reward_after_std": 0.7449811734259129, "reward_before_mean": 0.6816292963922024, "reward_before_std": 0.7479454576969147, "reward_change_max": 0.00029243528842926025, "reward_change_mean": -0.12254770565778017, "reward_change_min": -0.23000308498740196, "reward_change_std": 0.08476777747273445, "reward_std": 0.7449811846017838, "rewards/cosine_scaled_reward": -0.08626868622377515, "rewards/format_reward": 0.8541666865348816, "step": 356 }, { "advantage_max": 1.421733245253563, "advantage_mean": -1.3038516599728212e-08, "advantage_min": -1.3238706812262535, "advantage_std": 0.9998385459184647, "completion_length": 2226.3125610351562, "epoch": 0.408, "grad_norm": 2.134793519973755, "kl": 0.2264862060546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.062313053727671e-07, "loss": 0.0091, "reward": 0.31191481556743383, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31191481556743383, "reward_after_std": 0.698531374335289, "reward_before_mean": 0.4118203781545162, "reward_before_std": 0.7032448500394821, "reward_change_max": 0.0, "reward_change_mean": -0.09990557003766298, "reward_change_min": -0.17775661405175924, "reward_change_std": 0.06979941623285413, "reward_std": 0.6985313966870308, "rewards/cosine_scaled_reward": -0.13783981930464506, "rewards/format_reward": 0.6875000186264515, "step": 357 }, { "advantage_max": 1.2833376079797745, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -1.2507511153817177, "advantage_std": 0.9998571276664734, "completion_length": 1489.083351135254, "epoch": 0.40914285714285714, "grad_norm": 0.5106679201126099, "kl": 0.06841659545898438, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.0359654942835247e-07, "loss": 0.0027, "reward": 1.0083662807010114, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0083662807010114, "reward_after_std": 0.8467534109950066, "reward_before_mean": 1.171830676496029, "reward_before_std": 0.8582195043563843, "reward_change_max": 0.0, "reward_change_mean": -0.16346431523561478, "reward_change_min": -0.27867283672094345, "reward_change_std": 0.10701682418584824, "reward_std": 0.846753440797329, "rewards/cosine_scaled_reward": 0.16924862004816532, "rewards/format_reward": 0.833333333954215, "step": 358 }, { "advantage_max": 1.3780936226248741, "advantage_mean": -4.594524882772788e-08, "advantage_min": -1.2478744611144066, "advantage_std": 0.9997677430510521, "completion_length": 1134.2292175292969, "epoch": 0.4102857142857143, "grad_norm": 0.6363142132759094, "kl": 0.10742950439453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.0097380284049523e-07, "loss": 0.0043, "reward": 0.6243336275219917, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6243336275219917, "reward_after_std": 0.5924880225211382, "reward_before_mean": 0.7577880509197712, "reward_before_std": 0.590686340816319, "reward_change_max": 0.0007399767637252808, "reward_change_mean": -0.13345444854348898, "reward_change_min": -0.21509629674255848, "reward_change_std": 0.08265585312619805, "reward_std": 0.5924880467355251, "rewards/cosine_scaled_reward": -0.0690226498991251, "rewards/format_reward": 0.8958333432674408, "step": 359 }, { "advantage_max": 1.388232484459877, "advantage_mean": -6.519258266557415e-08, "advantage_min": -1.3220400288701057, "advantage_std": 0.9998097345232964, "completion_length": 1587.1250610351562, "epoch": 0.4114285714285714, "grad_norm": 0.7408303022384644, "kl": 0.2151947021484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.9836319343816397e-07, "loss": 0.0086, "reward": 0.841188732534647, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.841188732534647, "reward_after_std": 0.7126155123114586, "reward_before_mean": 0.992447454482317, "reward_before_std": 0.7154055722057819, "reward_change_max": 0.0, "reward_change_mean": -0.15125871915370226, "reward_change_min": -0.24961007386446, "reward_change_std": 0.09662879165261984, "reward_std": 0.7126155272126198, "rewards/cosine_scaled_reward": 0.04830702394247055, "rewards/format_reward": 0.8958333507180214, "step": 360 }, { "advantage_max": 1.5855596661567688, "advantage_mean": 4.2840838099245104e-08, "advantage_min": -1.1460353285074234, "advantage_std": 0.9997694715857506, "completion_length": 1471.395866394043, "epoch": 0.4125714285714286, "grad_norm": 1.182603359222412, "kl": 0.10492706298828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.9576484845877793e-07, "loss": 0.0042, "reward": 0.470302056055516, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.470302056055516, "reward_after_std": 0.6256013102829456, "reward_before_mean": 0.5854224106296897, "reward_before_std": 0.6100865183398128, "reward_change_max": 0.0, "reward_change_mean": -0.11512034106999636, "reward_change_min": -0.19347173534333706, "reward_change_std": 0.07023542281240225, "reward_std": 0.6256013140082359, "rewards/cosine_scaled_reward": -0.13437213900033385, "rewards/format_reward": 0.854166679084301, "step": 361 }, { "advantage_max": 1.4198757410049438, "advantage_mean": -6.953875464343895e-08, "advantage_min": -1.3393580988049507, "advantage_std": 0.9997484311461449, "completion_length": 925.854190826416, "epoch": 0.4137142857142857, "grad_norm": 0.6270393133163452, "kl": 0.06605148315429688, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.931788945420058e-07, "loss": 0.0026, "reward": 0.8872093297541142, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8872093297541142, "reward_after_std": 0.4866991452872753, "reward_before_mean": 1.0480562169104815, "reward_before_std": 0.46841985266655684, "reward_change_max": 0.0, "reward_change_mean": -0.1608469020575285, "reward_change_min": -0.2332429401576519, "reward_change_std": 0.09089976316317916, "reward_std": 0.4866991676390171, "rewards/cosine_scaled_reward": 0.04486143495887518, "rewards/format_reward": 0.9583333358168602, "step": 362 }, { "advantage_max": 1.4218028262257576, "advantage_mean": 8.19563861220729e-08, "advantage_min": -1.1774882376194, "advantage_std": 0.9997980892658234, "completion_length": 1123.0416984558105, "epoch": 0.41485714285714287, "grad_norm": 0.6843811869621277, "kl": 0.14236831665039062, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.9060545772359305e-07, "loss": 0.0057, "reward": 1.0501264370977879, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.0501264370977879, "reward_after_std": 0.6898505575954914, "reward_before_mean": 1.2204857654869556, "reward_before_std": 0.6796518871560693, "reward_change_max": 0.0, "reward_change_mean": -0.17035925202071667, "reward_change_min": -0.26173054426908493, "reward_change_std": 0.10411297017708421, "reward_std": 0.689850565046072, "rewards/cosine_scaled_reward": 0.19357617758214474, "rewards/format_reward": 0.8333333358168602, "step": 363 }, { "advantage_max": 1.5182277262210846, "advantage_mean": -2.5456150964942026e-08, "advantage_min": -1.3673394322395325, "advantage_std": 0.9997450858354568, "completion_length": 1234.833381652832, "epoch": 0.416, "grad_norm": 1.0001448392868042, "kl": 0.08144378662109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.8804466342921987e-07, "loss": 0.0033, "reward": 0.3523000096902251, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3523000096902251, "reward_after_std": 0.48937233351171017, "reward_before_mean": 0.46050266548991203, "reward_before_std": 0.47428031265735626, "reward_change_max": 0.0, "reward_change_mean": -0.10820264089852571, "reward_change_min": -0.1646072268486023, "reward_change_std": 0.062361706513911486, "reward_std": 0.4893723502755165, "rewards/cosine_scaled_reward": -0.217665349598974, "rewards/format_reward": 0.8958333432674408, "step": 364 }, { "advantage_max": 1.3571320548653603, "advantage_mean": 4.65661231796588e-09, "advantage_min": -1.4831850975751877, "advantage_std": 0.9996974319219589, "completion_length": 2165.5833740234375, "epoch": 0.41714285714285715, "grad_norm": 1.301094889640808, "kl": 0.2874908447265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.854966364683872e-07, "loss": 0.0115, "reward": 0.5533080464228988, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5533080464228988, "reward_after_std": 0.5401174901053309, "reward_before_mean": 0.6822809707373381, "reward_before_std": 0.5410689422860742, "reward_change_max": 0.00028949230909347534, "reward_change_mean": -0.12897292617708445, "reward_change_min": -0.20345229096710682, "reward_change_std": 0.08068801555782557, "reward_std": 0.5401174938306212, "rewards/cosine_scaled_reward": 0.03905716352164745, "rewards/format_reward": 0.6041666734963655, "step": 365 }, { "advantage_max": 1.5208699703216553, "advantage_mean": -4.8428774879205605e-08, "advantage_min": -1.1703914254903793, "advantage_std": 0.9997816234827042, "completion_length": 1370.395881652832, "epoch": 0.41828571428571426, "grad_norm": 0.7431749105453491, "kl": 0.10540771484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.829615010283344e-07, "loss": 0.0042, "reward": 1.0022234451025724, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0022234451025724, "reward_after_std": 0.6891144718974829, "reward_before_mean": 1.1668678969144821, "reward_before_std": 0.6747738681733608, "reward_change_max": 0.0, "reward_change_mean": -0.1646444108337164, "reward_change_min": -0.25490029994398355, "reward_change_std": 0.09806211944669485, "reward_std": 0.6891144774854183, "rewards/cosine_scaled_reward": 0.12510060099884868, "rewards/format_reward": 0.9166666679084301, "step": 366 }, { "advantage_max": 1.5256575047969818, "advantage_mean": -4.718701029915451e-08, "advantage_min": -1.1822673827409744, "advantage_std": 0.9997625052928925, "completion_length": 1623.8958892822266, "epoch": 0.41942857142857143, "grad_norm": 0.8612035512924194, "kl": 0.1331787109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.8043938066798645e-07, "loss": 0.0053, "reward": 0.6489149909466505, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6489149909466505, "reward_after_std": 0.704158004373312, "reward_before_mean": 0.7802309468388557, "reward_before_std": 0.6997333746403456, "reward_change_max": 0.0003588870167732239, "reward_change_mean": -0.13131593633443117, "reward_change_min": -0.22332393191754818, "reward_change_std": 0.08702679723501205, "reward_std": 0.7041580304503441, "rewards/cosine_scaled_reward": -0.057801210321485996, "rewards/format_reward": 0.8958333432674408, "step": 367 }, { "advantage_max": 1.5013212859630585, "advantage_mean": -1.8626449271863521e-09, "advantage_min": -1.1598549410700798, "advantage_std": 0.9998278617858887, "completion_length": 2047.5625534057617, "epoch": 0.4205714285714286, "grad_norm": 1.596835970878601, "kl": 0.2228240966796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.7793039831193133e-07, "loss": 0.0089, "reward": 0.2928560241125524, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2928560241125524, "reward_after_std": 0.7190740033984184, "reward_before_mean": 0.3913656147196889, "reward_before_std": 0.7281620763242245, "reward_change_max": 0.0003599002957344055, "reward_change_mean": -0.09850956918671727, "reward_change_min": -0.1961950259283185, "reward_change_std": 0.07544383313506842, "reward_std": 0.7190740182995796, "rewards/cosine_scaled_reward": -0.12723387405276299, "rewards/format_reward": 0.6458333414047956, "step": 368 }, { "advantage_max": 1.5033908188343048, "advantage_mean": -3.042320473323201e-08, "advantage_min": -1.068381130695343, "advantage_std": 0.9998282641172409, "completion_length": 1654.1875534057617, "epoch": 0.4217142857142857, "grad_norm": 1.0307308435440063, "kl": 0.21329116821289062, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.7543467624442956e-07, "loss": 0.0085, "reward": 0.639374952763319, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.639374952763319, "reward_after_std": 0.68315190076828, "reward_before_mean": 0.7693626917898655, "reward_before_std": 0.6706436909735203, "reward_change_max": 0.0, "reward_change_mean": -0.12998773623257875, "reward_change_min": -0.2221611039713025, "reward_change_std": 0.07870964519679546, "reward_std": 0.6831519119441509, "rewards/cosine_scaled_reward": -0.03198532899841666, "rewards/format_reward": 0.8333333469927311, "step": 369 }, { "advantage_max": 1.6313235014677048, "advantage_mean": 4.9049655004296255e-08, "advantage_min": -1.0117665193974972, "advantage_std": 0.9997687488794327, "completion_length": 2152.3750610351562, "epoch": 0.4228571428571429, "grad_norm": 0.996197521686554, "kl": 0.40521240234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.729523361034538e-07, "loss": 0.0162, "reward": 0.4512446033768356, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4512446033768356, "reward_after_std": 0.591382460668683, "reward_before_mean": 0.5645157024264336, "reward_before_std": 0.5668165199458599, "reward_change_max": 0.0011430829763412476, "reward_change_mean": -0.11327109299600124, "reward_change_min": -0.18050487712025642, "reward_change_std": 0.07153899781405926, "reward_std": 0.5913824774324894, "rewards/cosine_scaled_reward": -0.030242161825299263, "rewards/format_reward": 0.6250000055879354, "step": 370 }, { "advantage_max": 1.6017784476280212, "advantage_mean": -2.607703397661254e-08, "advantage_min": -0.9585048705339432, "advantage_std": 0.999807633459568, "completion_length": 1053.56254196167, "epoch": 0.424, "grad_norm": 1.5119962692260742, "kl": 0.134368896484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.7048349887476037e-07, "loss": 0.0054, "reward": 0.8266900572925806, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8266900572925806, "reward_after_std": 0.6163216028362513, "reward_before_mean": 0.975990392267704, "reward_before_std": 0.5858824178576469, "reward_change_max": 0.0008676201105117798, "reward_change_mean": -0.14930032240226865, "reward_change_min": -0.23938406724482775, "reward_change_std": 0.09261591965332627, "reward_std": 0.6163216270506382, "rewards/cosine_scaled_reward": 0.11299517937004566, "rewards/format_reward": 0.7500000111758709, "step": 371 }, { "advantage_max": 1.4052574709057808, "advantage_mean": 3.6011141624214815e-08, "advantage_min": -1.2968919053673744, "advantage_std": 0.9998442903161049, "completion_length": 1930.7292175292969, "epoch": 0.42514285714285716, "grad_norm": 1.3718596696853638, "kl": 0.19745254516601562, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.6802828488599294e-07, "loss": 0.0079, "reward": 0.7759344661608338, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7759344661608338, "reward_after_std": 0.8030894659459591, "reward_before_mean": 0.9179776012897491, "reward_before_std": 0.8054710291326046, "reward_change_max": 0.00045900046825408936, "reward_change_mean": -0.1420431211590767, "reward_change_min": -0.24235581792891026, "reward_change_std": 0.09905825974419713, "reward_std": 0.8030894808471203, "rewards/cosine_scaled_reward": 0.05273880437016487, "rewards/format_reward": 0.8125000223517418, "step": 372 }, { "advantage_max": 1.5664848685264587, "advantage_mean": -6.20881684954e-09, "advantage_min": -1.2124197706580162, "advantage_std": 0.9998014271259308, "completion_length": 1053.2500381469727, "epoch": 0.42628571428571427, "grad_norm": 0.9296903610229492, "kl": 0.08166122436523438, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.655868138008171e-07, "loss": 0.0033, "reward": 0.5102668823674321, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5102668823674321, "reward_after_std": 0.7223041206598282, "reward_before_mean": 0.6259184032678604, "reward_before_std": 0.7110856045037508, "reward_change_max": 0.0005139410495758057, "reward_change_mean": -0.11565148271620274, "reward_change_min": -0.18873510602861643, "reward_change_std": 0.07335183955729008, "reward_std": 0.72230414301157, "rewards/cosine_scaled_reward": -0.14537415117956698, "rewards/format_reward": 0.916666679084301, "step": 373 }, { "advantage_max": 1.3690541833639145, "advantage_mean": -4.035731082652205e-09, "advantage_min": -1.3715066015720367, "advantage_std": 0.9997926205396652, "completion_length": 1368.4167251586914, "epoch": 0.42742857142857144, "grad_norm": 0.9108665585517883, "kl": 0.12201690673828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.631592046130896e-07, "loss": 0.0049, "reward": 0.7745124213397503, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7745124213397503, "reward_after_std": 0.5470051765441895, "reward_before_mean": 0.9227103255689144, "reward_before_std": 0.5317458175122738, "reward_change_max": 0.0, "reward_change_mean": -0.14819789212197065, "reward_change_min": -0.22565262019634247, "reward_change_std": 0.08558421535417438, "reward_std": 0.547005195170641, "rewards/cosine_scaled_reward": 0.023855158127844334, "rewards/format_reward": 0.8750000223517418, "step": 374 }, { "advantage_max": 1.5265842229127884, "advantage_mean": -2.23517424569053e-08, "advantage_min": -1.228366658091545, "advantage_std": 0.9998601377010345, "completion_length": 1652.5625686645508, "epoch": 0.42857142857142855, "grad_norm": 0.9376990795135498, "kl": 0.26873016357421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.6074557564105724e-07, "loss": 0.0107, "reward": 0.812188274692744, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.812188274692744, "reward_after_std": 0.9057653173804283, "reward_before_mean": 0.9515043869614601, "reward_before_std": 0.9038424082100391, "reward_change_max": 0.0001263841986656189, "reward_change_mean": -0.13931614579632878, "reward_change_min": -0.24343886598944664, "reward_change_std": 0.09589880565181375, "reward_std": 0.9057653844356537, "rewards/cosine_scaled_reward": 0.0799188744276762, "rewards/format_reward": 0.7916666753590107, "step": 375 }, { "advantage_max": 1.5179670602083206, "advantage_mean": -2.421438682898014e-08, "advantage_min": -1.1654746755957603, "advantage_std": 0.9998091086745262, "completion_length": 1489.7292404174805, "epoch": 0.4297142857142857, "grad_norm": 1.5551291704177856, "kl": 0.2132110595703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.583460445215911e-07, "loss": 0.0085, "reward": 0.5052896784618497, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5052896784618497, "reward_after_std": 0.6814988665282726, "reward_before_mean": 0.6225324124097824, "reward_before_std": 0.6745848506689072, "reward_change_max": 0.0, "reward_change_mean": -0.1172427274286747, "reward_change_min": -0.19756066799163818, "reward_change_std": 0.07500209799036384, "reward_std": 0.6814988739788532, "rewards/cosine_scaled_reward": -0.11581713845953345, "rewards/format_reward": 0.854166679084301, "step": 376 }, { "advantage_max": 1.4136180728673935, "advantage_mean": -2.3593505260599557e-08, "advantage_min": -1.2113258317112923, "advantage_std": 0.999827466905117, "completion_length": 1939.833381652832, "epoch": 0.4308571428571429, "grad_norm": 0.8605461120605469, "kl": 0.288116455078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.5596072820445254e-07, "loss": 0.0115, "reward": 0.47748872451484203, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.47748872451484203, "reward_after_std": 0.7632182575762272, "reward_before_mean": 0.5919480286538601, "reward_before_std": 0.7724727466702461, "reward_change_max": 0.0, "reward_change_mean": -0.11445930134505033, "reward_change_min": -0.2100103199481964, "reward_change_std": 0.08319698181003332, "reward_std": 0.763218279927969, "rewards/cosine_scaled_reward": -0.11027600057423115, "rewards/format_reward": 0.8125000149011612, "step": 377 }, { "advantage_max": 1.4481362104415894, "advantage_mean": -6.581346156941947e-08, "advantage_min": -1.1869488134980202, "advantage_std": 0.9998489990830421, "completion_length": 1121.0208950042725, "epoch": 0.432, "grad_norm": 0.6732988953590393, "kl": 0.09349822998046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.5358974294659373e-07, "loss": 0.0037, "reward": 0.9872011113911867, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9872011113911867, "reward_after_std": 0.7485641278326511, "reward_before_mean": 1.1486159078776836, "reward_before_std": 0.7412164583802223, "reward_change_max": 0.0, "reward_change_mean": -0.16141479928046465, "reward_change_min": -0.25539009273052216, "reward_change_std": 0.09596162987872958, "reward_std": 0.7485641501843929, "rewards/cosine_scaled_reward": 0.0951412720605731, "rewards/format_reward": 0.9583333358168602, "step": 378 }, { "advantage_max": 1.4938328862190247, "advantage_mean": -2.980232349791834e-08, "advantage_min": -1.1671525463461876, "advantage_std": 0.9998154565691948, "completion_length": 1591.1458740234375, "epoch": 0.43314285714285716, "grad_norm": 1.0055720806121826, "kl": 0.21600341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.512332043064913e-07, "loss": 0.0086, "reward": 0.607740237377584, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.607740237377584, "reward_after_std": 0.7770608048886061, "reward_before_mean": 0.7329375743865967, "reward_before_std": 0.775208655744791, "reward_change_max": 0.0, "reward_change_mean": -0.12519735004752874, "reward_change_min": -0.21551300026476383, "reward_change_std": 0.0834552114829421, "reward_std": 0.7770608123391867, "rewards/cosine_scaled_reward": -0.07103121210820973, "rewards/format_reward": 0.8750000074505806, "step": 379 }, { "advantage_max": 1.4799382463097572, "advantage_mean": -3.414849480964932e-08, "advantage_min": -1.0861377716064453, "advantage_std": 0.9998506456613541, "completion_length": 1408.06254196167, "epoch": 0.4342857142857143, "grad_norm": 1.4439901113510132, "kl": 0.22924041748046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.488912271385139e-07, "loss": 0.0092, "reward": 0.6337316166609526, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6337316166609526, "reward_after_std": 0.8558752126991749, "reward_before_mean": 0.7605880023911595, "reward_before_std": 0.8635857813060284, "reward_change_max": 0.0, "reward_change_mean": -0.12685640575364232, "reward_change_min": -0.2282596305012703, "reward_change_std": 0.0902928994037211, "reward_std": 0.8558752313256264, "rewards/cosine_scaled_reward": -0.015539344982244074, "rewards/format_reward": 0.7916666753590107, "step": 380 }, { "advantage_max": 1.7427802830934525, "advantage_mean": 5.5879355587151736e-09, "advantage_min": -1.088472604751587, "advantage_std": 0.9997963383793831, "completion_length": 1819.583351135254, "epoch": 0.43542857142857144, "grad_norm": 1.249358892440796, "kl": 0.3687019348144531, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.465639255873246e-07, "loss": 0.0147, "reward": 0.2271743305027485, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2271743305027485, "reward_after_std": 0.5816497430205345, "reward_before_mean": 0.31924774509388953, "reward_before_std": 0.561051607131958, "reward_change_max": 0.0, "reward_change_mean": -0.09207342192530632, "reward_change_min": -0.14389674551784992, "reward_change_std": 0.05652611888945103, "reward_std": 0.5816497728228569, "rewards/cosine_scaled_reward": -0.2049594670534134, "rewards/format_reward": 0.7291666809469461, "step": 381 }, { "advantage_max": 1.4776546210050583, "advantage_mean": -3.197540932031728e-08, "advantage_min": -1.2696739807724953, "advantage_std": 0.9998131394386292, "completion_length": 1192.9583587646484, "epoch": 0.43657142857142855, "grad_norm": 1.0161633491516113, "kl": 0.13286590576171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.4425141308231765e-07, "loss": 0.0053, "reward": 0.49811657425016165, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49811657425016165, "reward_after_std": 0.7143443040549755, "reward_before_mean": 0.6148605179041624, "reward_before_std": 0.7126270364969969, "reward_change_max": 2.530217170715332e-05, "reward_change_mean": -0.1167439166456461, "reward_change_min": -0.2098684161901474, "reward_change_std": 0.07956612063571811, "reward_std": 0.7143443375825882, "rewards/cosine_scaled_reward": -0.14048643223941326, "rewards/format_reward": 0.895833358168602, "step": 382 }, { "advantage_max": 1.3780869543552399, "advantage_mean": -1.2107194136135035e-08, "advantage_min": -1.2136836722493172, "advantage_std": 0.9998874962329865, "completion_length": 1577.4375610351562, "epoch": 0.4377142857142857, "grad_norm": 1.369926929473877, "kl": 0.3840484619140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.4195380233209006e-07, "loss": 0.0154, "reward": 0.6665458576753736, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6665458576753736, "reward_after_std": 1.086703211069107, "reward_before_mean": 0.7915002275258303, "reward_before_std": 1.1197119541466236, "reward_change_max": 0.00027097761631011963, "reward_change_mean": -0.12495436519384384, "reward_change_min": -0.2746748309582472, "reward_change_std": 0.10923281265422702, "reward_std": 1.0867032408714294, "rewards/cosine_scaled_reward": -8.322112262248993e-05, "rewards/format_reward": 0.7916666865348816, "step": 383 }, { "advantage_max": 1.5250187814235687, "advantage_mean": -1.1734665017471002e-07, "advantage_min": -1.0902344584465027, "advantage_std": 0.9998218566179276, "completion_length": 1203.895851135254, "epoch": 0.43885714285714283, "grad_norm": 1.4417327642440796, "kl": 0.1812744140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.3967120531894857e-07, "loss": 0.0072, "reward": 1.1576847899705172, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 1.1576847899705172, "reward_after_std": 0.8532119113951921, "reward_before_mean": 1.3330178875476122, "reward_before_std": 0.8452331945300102, "reward_change_max": 7.525086402893066e-05, "reward_change_mean": -0.17533311154693365, "reward_change_min": -0.29297889675945044, "reward_change_std": 0.11483225552365184, "reward_std": 0.853211922571063, "rewards/cosine_scaled_reward": 0.2602589353919029, "rewards/format_reward": 0.8125000149011612, "step": 384 }, { "advantage_max": 1.4220309108495712, "advantage_mean": -1.459072063170197e-08, "advantage_min": -1.322306603193283, "advantage_std": 0.9997835829854012, "completion_length": 1766.1250457763672, "epoch": 0.44, "grad_norm": 1.2814421653747559, "kl": 0.4825325012207031, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.374037332934512e-07, "loss": 0.0193, "reward": 0.3343192981556058, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3343192981556058, "reward_after_std": 0.6680205948650837, "reward_before_mean": 0.43759361281991005, "reward_before_std": 0.6733908774331212, "reward_change_max": 0.0005120784044265747, "reward_change_mean": -0.10327431745827198, "reward_change_min": -0.19344800151884556, "reward_change_std": 0.07480754144489765, "reward_std": 0.6680206172168255, "rewards/cosine_scaled_reward": -0.06245320290327072, "rewards/format_reward": 0.5625000167638063, "step": 385 }, { "advantage_max": 1.5401915460824966, "advantage_mean": -3.539025822396624e-08, "advantage_min": -1.2550265565514565, "advantage_std": 0.9997994750738144, "completion_length": 1499.6875305175781, "epoch": 0.44114285714285717, "grad_norm": 1.1877615451812744, "kl": 0.3754119873046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.3515149676898552e-07, "loss": 0.015, "reward": 0.7220457578077912, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.7220457578077912, "reward_after_std": 0.5384745597839355, "reward_before_mean": 0.863738858141005, "reward_before_std": 0.5172322764992714, "reward_change_max": 0.0008039996027946472, "reward_change_mean": -0.14169306913390756, "reward_change_min": -0.20913443807512522, "reward_change_std": 0.08123286440968513, "reward_std": 0.5384745635092258, "rewards/cosine_scaled_reward": 0.025619419291615486, "rewards/format_reward": 0.8125000149011612, "step": 386 }, { "advantage_max": 1.525049164891243, "advantage_mean": -4.3461719556603384e-08, "advantage_min": -1.3049319833517075, "advantage_std": 0.9997809082269669, "completion_length": 2077.104232788086, "epoch": 0.4422857142857143, "grad_norm": 2.191981554031372, "kl": 0.593994140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.3291460551638237e-07, "loss": 0.0238, "reward": 0.6428085435181856, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6428085435181856, "reward_after_std": 0.6132344920188189, "reward_before_mean": 0.7758152484893799, "reward_before_std": 0.6024291254580021, "reward_change_max": 0.0003018230199813843, "reward_change_mean": -0.1330067147500813, "reward_change_min": -0.20926320180296898, "reward_change_std": 0.08395129209384322, "reward_std": 0.6132345125079155, "rewards/cosine_scaled_reward": -0.007925715297460556, "rewards/format_reward": 0.7916666753590107, "step": 387 }, { "advantage_max": 1.5071382969617844, "advantage_mean": -1.2440917529499274e-07, "advantage_min": -1.265833929181099, "advantage_std": 0.9998080208897591, "completion_length": 1300.2292175292969, "epoch": 0.44342857142857145, "grad_norm": 1.908517837524414, "kl": 0.2214508056640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.306931685585657e-07, "loss": 0.0089, "reward": 0.8852858282625675, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8852858282625675, "reward_after_std": 0.7093720734119415, "reward_before_mean": 1.0393500942736864, "reward_before_std": 0.6999770719558001, "reward_change_max": 0.00048132985830307007, "reward_change_mean": -0.15406430745497346, "reward_change_min": -0.25052541866898537, "reward_change_std": 0.09875405393540859, "reward_std": 0.7093720883131027, "rewards/cosine_scaled_reward": 0.11342504154890776, "rewards/format_reward": 0.8125000111758709, "step": 388 }, { "advantage_max": 1.4574102386832237, "advantage_mean": -4.967053768289986e-08, "advantage_min": -1.299857720732689, "advantage_std": 0.9997845217585564, "completion_length": 1413.4167022705078, "epoch": 0.44457142857142856, "grad_norm": 0.9911835193634033, "kl": 0.2678680419921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2848729416523859e-07, "loss": 0.0107, "reward": 0.7586075998842716, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7586075998842716, "reward_after_std": 0.6198668666183949, "reward_before_mean": 0.9024220667779446, "reward_before_std": 0.6068241856992245, "reward_change_max": 0.0, "reward_change_mean": -0.14381443057209253, "reward_change_min": -0.2356892367824912, "reward_change_std": 0.08788485545665026, "reward_std": 0.619866881519556, "rewards/cosine_scaled_reward": 0.003294333815574646, "rewards/format_reward": 0.8958333432674408, "step": 389 }, { "advantage_max": 1.5046132057905197, "advantage_mean": -3.166496836959354e-08, "advantage_min": -1.114329144358635, "advantage_std": 0.9997953996062279, "completion_length": 1732.0000228881836, "epoch": 0.44571428571428573, "grad_norm": 1.1819705963134766, "kl": 0.4037322998046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2629708984760706e-07, "loss": 0.0162, "reward": 0.6063512277323753, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6063512277323753, "reward_after_std": 0.8176131937652826, "reward_before_mean": 0.7317489488050342, "reward_before_std": 0.8221844676882029, "reward_change_max": 0.0, "reward_change_mean": -0.12539772223681211, "reward_change_min": -0.24319585226476192, "reward_change_std": 0.09066474437713623, "reward_std": 0.8176132310181856, "rewards/cosine_scaled_reward": -0.009125546552240849, "rewards/format_reward": 0.7500000074505806, "step": 390 }, { "advantage_max": 1.3619165793061256, "advantage_mean": -2.793967834868738e-08, "advantage_min": -1.2452645674347878, "advantage_std": 0.9998195543885231, "completion_length": 1382.3125267028809, "epoch": 0.44685714285714284, "grad_norm": 2.8021371364593506, "kl": 0.3901214599609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2412266235313973e-07, "loss": 0.0156, "reward": 0.8029541606083512, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8029541606083512, "reward_after_std": 0.6770164184272289, "reward_before_mean": 0.9506847970187664, "reward_before_std": 0.6766212470829487, "reward_change_max": 0.00014794617891311646, "reward_change_mean": -0.14773060707375407, "reward_change_min": -0.24860516004264355, "reward_change_std": 0.09467540634796023, "reward_std": 0.6770164258778095, "rewards/cosine_scaled_reward": 0.058675711043179035, "rewards/format_reward": 0.8333333469927311, "step": 391 }, { "advantage_max": 1.5857295244932175, "advantage_mean": -1.5522043150806297e-08, "advantage_min": -1.0785433277487755, "advantage_std": 0.9998374804854393, "completion_length": 1728.770881652832, "epoch": 0.448, "grad_norm": 1.6562546491622925, "kl": 0.5024871826171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2196411766036487e-07, "loss": 0.0201, "reward": 0.4962415201589465, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4962415201589465, "reward_after_std": 0.8928264938294888, "reward_before_mean": 0.6067917384207249, "reward_before_std": 0.893587950617075, "reward_change_max": 0.0, "reward_change_mean": -0.11055022478103638, "reward_change_min": -0.2252865731716156, "reward_change_std": 0.08553655026480556, "reward_std": 0.89282650873065, "rewards/cosine_scaled_reward": -0.07160413172096014, "rewards/format_reward": 0.7500000093132257, "step": 392 }, { "advantage_max": 1.5273478254675865, "advantage_mean": -4.967054101356894e-09, "advantage_min": -1.0835141614079475, "advantage_std": 0.9998887106776237, "completion_length": 1534.2291946411133, "epoch": 0.4491428571428571, "grad_norm": 1.2979336977005005, "kl": 0.332183837890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1982156097370557e-07, "loss": 0.0133, "reward": 0.658613370731473, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.658613370731473, "reward_after_std": 1.0538778081536293, "reward_before_mean": 0.7801401242613792, "reward_before_std": 1.0620463229715824, "reward_change_max": 0.0003866031765937805, "reward_change_mean": -0.12152672559022903, "reward_change_min": -0.24229469522833824, "reward_change_std": 0.09234014619141817, "reward_std": 1.053877830505371, "rewards/cosine_scaled_reward": -0.016179951839148998, "rewards/format_reward": 0.812500013038516, "step": 393 }, { "advantage_max": 1.4706476479768753, "advantage_mean": -7.326404349861093e-08, "advantage_min": -1.205642156302929, "advantage_std": 0.9997804909944534, "completion_length": 1600.2500457763672, "epoch": 0.4502857142857143, "grad_norm": 3.4596385955810547, "kl": 0.54315185546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1769509671835223e-07, "loss": 0.0217, "reward": 0.4269302450120449, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4269302450120449, "reward_after_std": 0.5508731566369534, "reward_before_mean": 0.5413588918745518, "reward_before_std": 0.5420444570481777, "reward_change_max": 0.0, "reward_change_mean": -0.11442865990102291, "reward_change_min": -0.1859685741364956, "reward_change_std": 0.07011770131066442, "reward_std": 0.5508731603622437, "rewards/cosine_scaled_reward": -0.1564038973301649, "rewards/format_reward": 0.854166679084301, "step": 394 }, { "advantage_max": 1.6219586357474327, "advantage_mean": -6.891787274199146e-08, "advantage_min": -1.0342840030789375, "advantage_std": 0.9997628480195999, "completion_length": 1293.2500267028809, "epoch": 0.4514285714285714, "grad_norm": 1.1021445989608765, "kl": 0.4151763916015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1558482853517253e-07, "loss": 0.0166, "reward": 0.6994353365153074, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6994353365153074, "reward_after_std": 0.5848857667297125, "reward_before_mean": 0.837128933519125, "reward_before_std": 0.5620684530586004, "reward_change_max": 0.0002937912940979004, "reward_change_mean": -0.13769357604905963, "reward_change_min": -0.2083145957440138, "reward_change_std": 0.07981055462732911, "reward_std": 0.5848857890814543, "rewards/cosine_scaled_reward": 0.022731118835508823, "rewards/format_reward": 0.7916666679084301, "step": 395 }, { "advantage_max": 1.5212388187646866, "advantage_mean": -1.5211602200082552e-08, "advantage_min": -1.202341765165329, "advantage_std": 0.9998112320899963, "completion_length": 1216.1875228881836, "epoch": 0.45257142857142857, "grad_norm": 1.7865386009216309, "kl": 0.31902313232421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.134908592756607e-07, "loss": 0.0128, "reward": 0.5041277073323727, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5041277073323727, "reward_after_std": 0.7280652057379484, "reward_before_mean": 0.6195306107401848, "reward_before_std": 0.724288634955883, "reward_change_max": 0.0, "reward_change_mean": -0.11540291737765074, "reward_change_min": -0.19210434705018997, "reward_change_std": 0.07592118624597788, "reward_std": 0.728065237402916, "rewards/cosine_scaled_reward": -0.09648469707462937, "rewards/format_reward": 0.812500013038516, "step": 396 }, { "advantage_max": 1.4690123051404953, "advantage_mean": -1.7384688910659918e-08, "advantage_min": -1.2216744720935822, "advantage_std": 0.9997849240899086, "completion_length": 1188.9791870117188, "epoch": 0.45371428571428574, "grad_norm": 1.075379729270935, "kl": 0.24041748046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1141329099692406e-07, "loss": 0.0096, "reward": 0.5569867407903075, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5569867407903075, "reward_after_std": 0.7179490067064762, "reward_before_mean": 0.6796721828486625, "reward_before_std": 0.716059971600771, "reward_change_max": 0.0, "reward_change_mean": -0.12268543615937233, "reward_change_min": -0.2074470091611147, "reward_change_std": 0.08350208308547735, "reward_std": 0.7179490327835083, "rewards/cosine_scaled_reward": -0.06641392130404711, "rewards/format_reward": 0.8125000186264515, "step": 397 }, { "advantage_max": 1.5344518646597862, "advantage_mean": -3.57006997298015e-08, "advantage_min": -1.1795164123177528, "advantage_std": 0.999800331890583, "completion_length": 1327.895881652832, "epoch": 0.45485714285714285, "grad_norm": 1.5244557857513428, "kl": 0.3427886962890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0935222495670968e-07, "loss": 0.0137, "reward": 0.5405859863385558, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5405859863385558, "reward_after_std": 0.6886147819459438, "reward_before_mean": 0.6625741459429264, "reward_before_std": 0.6864035166800022, "reward_change_max": 0.0, "reward_change_mean": -0.12198818568140268, "reward_change_min": -0.20752749498933554, "reward_change_std": 0.07999280700460076, "reward_std": 0.6886148191988468, "rewards/cosine_scaled_reward": -0.07496293634176254, "rewards/format_reward": 0.812500013038516, "step": 398 }, { "advantage_max": 1.5416178330779076, "advantage_mean": -3.539025855703315e-08, "advantage_min": -1.1947909593582153, "advantage_std": 0.9998286813497543, "completion_length": 1322.5417175292969, "epoch": 0.456, "grad_norm": 1.22747802734375, "kl": 0.246826171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0730776160846853e-07, "loss": 0.0099, "reward": 0.8436704650521278, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8436704650521278, "reward_after_std": 0.8051801025867462, "reward_before_mean": 0.9887518286705017, "reward_before_std": 0.7946115285158157, "reward_change_max": 0.0, "reward_change_mean": -0.1450813477858901, "reward_change_min": -0.23790498822927475, "reward_change_std": 0.09142806520685554, "reward_std": 0.8051801361143589, "rewards/cosine_scaled_reward": 0.05687588080763817, "rewards/format_reward": 0.8750000149011612, "step": 399 }, { "advantage_max": 1.418625384569168, "advantage_mean": -9.685755153476805e-08, "advantage_min": -1.2240911647677422, "advantage_std": 0.999837763607502, "completion_length": 1084.7291793823242, "epoch": 0.45714285714285713, "grad_norm": 1.2166296243667603, "kl": 0.27674102783203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0528000059645995e-07, "loss": 0.0111, "reward": 1.2985866218805313, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 1.2985866218805313, "reward_after_std": 0.8396326303482056, "reward_before_mean": 1.488419085741043, "reward_before_std": 0.8349230848252773, "reward_change_max": 0.00015439093112945557, "reward_change_mean": -0.18983250577002764, "reward_change_min": -0.311465822160244, "reward_change_std": 0.11947321891784668, "reward_std": 0.8396326526999474, "rewards/cosine_scaled_reward": 0.2962928842753172, "rewards/format_reward": 0.8958333358168602, "step": 400 }, { "advantage_max": 1.3777280449867249, "advantage_mean": -4.4082603789519226e-08, "advantage_min": -1.185884103178978, "advantage_std": 0.9997954741120338, "completion_length": 1630.770866394043, "epoch": 0.4582857142857143, "grad_norm": 1.5643917322158813, "kl": 0.403472900390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.032690407508949e-07, "loss": 0.0161, "reward": 0.5990715604275465, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5990715604275465, "reward_after_std": 0.5103095322847366, "reward_before_mean": 0.7318368963897228, "reward_before_std": 0.5031833313405514, "reward_change_max": 0.0, "reward_change_mean": -0.1327653443440795, "reward_change_min": -0.21006182581186295, "reward_change_std": 0.07824637182056904, "reward_std": 0.5103095509111881, "rewards/cosine_scaled_reward": -0.040331561118364334, "rewards/format_reward": 0.8125000149011612, "step": 401 }, { "advantage_max": 1.3117186725139618, "advantage_mean": -9.93410786964688e-09, "advantage_min": -1.4394584074616432, "advantage_std": 0.99981340020895, "completion_length": 1447.3333740234375, "epoch": 0.4594285714285714, "grad_norm": 1.5800583362579346, "kl": 0.4939117431640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0127498008311922e-07, "loss": 0.0198, "reward": 0.7458519488573074, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7458519488573074, "reward_after_std": 0.7214295417070389, "reward_before_mean": 0.8878723345696926, "reward_before_std": 0.7308622431010008, "reward_change_max": 0.0, "reward_change_mean": -0.14202034566551447, "reward_change_min": -0.22765328735113144, "reward_change_std": 0.0920207086019218, "reward_std": 0.7214295528829098, "rewards/cosine_scaled_reward": 0.006436141207814217, "rewards/format_reward": 0.8750000223517418, "step": 402 }, { "advantage_max": 1.5777545720338821, "advantage_mean": -1.4901161582425715e-08, "advantage_min": -1.0918622389435768, "advantage_std": 0.9997143223881721, "completion_length": 1011.1458587646484, "epoch": 0.4605714285714286, "grad_norm": 2.2372660636901855, "kl": 0.34453582763671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.9929791578083655e-07, "loss": 0.0138, "reward": 0.7979833465069532, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7979833465069532, "reward_after_std": 0.46419939398765564, "reward_before_mean": 0.9484133645892143, "reward_before_std": 0.43025317415595055, "reward_change_max": 0.0, "reward_change_mean": -0.15042999852448702, "reward_change_min": -0.22080524545162916, "reward_change_std": 0.08285338198766112, "reward_std": 0.46419941633939743, "rewards/cosine_scaled_reward": 0.0783733231946826, "rewards/format_reward": 0.7916666697710752, "step": 403 }, { "advantage_max": 1.499540537595749, "advantage_mean": -1.204510566843453e-07, "advantage_min": -1.2422676607966423, "advantage_std": 0.9997927024960518, "completion_length": 1443.4375228881836, "epoch": 0.4617142857142857, "grad_norm": 1.9905834197998047, "kl": 0.4494171142578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.9733794420337213e-07, "loss": 0.018, "reward": 0.7095265840180218, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7095265840180218, "reward_after_std": 0.5653872638940811, "reward_before_mean": 0.8499099458567798, "reward_before_std": 0.5469077229499817, "reward_change_max": 0.0, "reward_change_mean": -0.1403833832591772, "reward_change_min": -0.212770015001297, "reward_change_std": 0.0808732183650136, "reward_std": 0.5653872825205326, "rewards/cosine_scaled_reward": -0.022961702197790146, "rewards/format_reward": 0.8958333432674408, "step": 404 }, { "advantage_max": 1.7194660305976868, "advantage_mean": 4.470348680118974e-08, "advantage_min": -1.0948041006922722, "advantage_std": 0.9997475519776344, "completion_length": 1167.2917251586914, "epoch": 0.46285714285714286, "grad_norm": 1.041972041130066, "kl": 0.289215087890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.9539516087697517e-07, "loss": 0.0116, "reward": 1.2832879004999995, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.2832879004999995, "reward_after_std": 0.5691489465534687, "reward_before_mean": 1.4747139997780323, "reward_before_std": 0.5159460082650185, "reward_change_max": 0.0, "reward_change_mean": -0.19142606016248465, "reward_change_min": -0.26474354043602943, "reward_change_std": 0.10462368186563253, "reward_std": 0.5691489800810814, "rewards/cosine_scaled_reward": 0.27902365755289793, "rewards/format_reward": 0.916666679084301, "step": 405 }, { "advantage_max": 1.6274562031030655, "advantage_mean": -3.2906732005955064e-08, "advantage_min": -1.0492018535733223, "advantage_std": 0.9997759088873863, "completion_length": 1456.5833740234375, "epoch": 0.464, "grad_norm": 2.3249237537384033, "kl": 0.54071044921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.934696604901642e-07, "loss": 0.0216, "reward": 0.8334343023598194, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8334343023598194, "reward_after_std": 0.7299204599112272, "reward_before_mean": 0.9799941023811698, "reward_before_std": 0.7119660619646311, "reward_change_max": 0.0, "reward_change_mean": -0.14655979629606009, "reward_change_min": -0.24078886583447456, "reward_change_std": 0.0892445114441216, "reward_std": 0.7299204748123884, "rewards/cosine_scaled_reward": 0.010830356506630778, "rewards/format_reward": 0.9583333358168602, "step": 406 }, { "advantage_max": 1.3725356981158257, "advantage_mean": -2.2351741679749182e-08, "advantage_min": -1.2306954599916935, "advantage_std": 0.9997261166572571, "completion_length": 1250.5625381469727, "epoch": 0.46514285714285714, "grad_norm": 1.371690034866333, "kl": 0.2197418212890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.915615368891117e-07, "loss": 0.0088, "reward": 0.752561591565609, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.752561591565609, "reward_after_std": 0.566648356616497, "reward_before_mean": 0.9003201862797141, "reward_before_std": 0.5630356483161449, "reward_change_max": 0.0, "reward_change_mean": -0.1477585742250085, "reward_change_min": -0.22929508332163095, "reward_change_std": 0.09294451726600528, "reward_std": 0.5666483640670776, "rewards/cosine_scaled_reward": 0.002243412658572197, "rewards/format_reward": 0.8958333358168602, "step": 407 }, { "advantage_max": 1.6451401710510254, "advantage_mean": -1.5770395678238458e-07, "advantage_min": -0.9748510047793388, "advantage_std": 0.9998021051287651, "completion_length": 1450.2709045410156, "epoch": 0.4662857142857143, "grad_norm": 1.4156643152236938, "kl": 0.18245697021484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8967088307307e-07, "loss": 0.0073, "reward": 0.9860383477061987, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9860383477061987, "reward_after_std": 0.7557676881551743, "reward_before_mean": 1.1457962021231651, "reward_before_std": 0.7287969561293721, "reward_change_max": 0.0, "reward_change_mean": -0.1597578590735793, "reward_change_min": -0.262321799993515, "reward_change_std": 0.09481826471164823, "reward_std": 0.7557677067816257, "rewards/cosine_scaled_reward": 0.10414808837231249, "rewards/format_reward": 0.9375000074505806, "step": 408 }, { "advantage_max": 1.4862465560436249, "advantage_mean": -9.31322596819939e-09, "advantage_min": -1.1640625074505806, "advantage_std": 0.999810591340065, "completion_length": 1856.8333740234375, "epoch": 0.4674285714285714, "grad_norm": 1.2075356245040894, "kl": 0.50775146484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8779779118983867e-07, "loss": 0.0203, "reward": 0.48998264502733946, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.48998264502733946, "reward_after_std": 0.6932985447347164, "reward_before_mean": 0.6081876456737518, "reward_before_std": 0.6996648348867893, "reward_change_max": 0.000249423086643219, "reward_change_mean": -0.11820495565189049, "reward_change_min": -0.21144821494817734, "reward_change_std": 0.08027452556416392, "reward_std": 0.6932985782623291, "rewards/cosine_scaled_reward": -0.09173952601850033, "rewards/format_reward": 0.791666679084301, "step": 409 }, { "advantage_max": 1.6873383074998856, "advantage_mean": -1.2417640249395845e-09, "advantage_min": -1.0137062221765518, "advantage_std": 0.9998016655445099, "completion_length": 1704.4792213439941, "epoch": 0.4685714285714286, "grad_norm": 1.8969690799713135, "kl": 0.7142829895019531, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8594235253127372e-07, "loss": 0.0287, "reward": 0.48481374606490135, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.48481374606490135, "reward_after_std": 0.7214395180344582, "reward_before_mean": 0.5968669969588518, "reward_before_std": 0.7014450505375862, "reward_change_max": 0.00011499971151351929, "reward_change_mean": -0.11205322481691837, "reward_change_min": -0.17780397459864616, "reward_change_std": 0.06741911824792624, "reward_std": 0.7214395329356194, "rewards/cosine_scaled_reward": -0.09739985689520836, "rewards/format_reward": 0.7916666734963655, "step": 410 }, { "advantage_max": 1.344476506114006, "advantage_mean": 6.208816794028849e-10, "advantage_min": -1.2045889720320702, "advantage_std": 0.999833382666111, "completion_length": 1973.6458740234375, "epoch": 0.4697142857142857, "grad_norm": 1.6440776586532593, "kl": 0.85833740234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8410465752883758e-07, "loss": 0.0343, "reward": 0.42943368293344975, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.42943368293344975, "reward_after_std": 0.8023342192173004, "reward_before_mean": 0.5399559205397964, "reward_before_std": 0.821831464767456, "reward_change_max": 0.0005313009023666382, "reward_change_mean": -0.11052223108708858, "reward_change_min": -0.21459924895316362, "reward_change_std": 0.08706922875717282, "reward_std": 0.8023342490196228, "rewards/cosine_scaled_reward": -0.06335536949336529, "rewards/format_reward": 0.6666666734963655, "step": 411 }, { "advantage_max": 1.4021871536970139, "advantage_mean": -6.270905494876189e-08, "advantage_min": -1.19430410861969, "advantage_std": 0.9998315647244453, "completion_length": 1140.3958702087402, "epoch": 0.47085714285714286, "grad_norm": 1.5791558027267456, "kl": 0.285064697265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.822847957491922e-07, "loss": 0.0114, "reward": 0.7805769965052605, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7805769965052605, "reward_after_std": 0.8468297980725765, "reward_before_mean": 0.9220419060438871, "reward_before_std": 0.8579179160296917, "reward_change_max": 0.0002397671341896057, "reward_change_mean": -0.14146495051681995, "reward_change_min": -0.25670984014868736, "reward_change_std": 0.09965648734942079, "reward_std": 0.8468298017978668, "rewards/cosine_scaled_reward": 0.0235209371894598, "rewards/format_reward": 0.8750000149011612, "step": 412 }, { "advantage_max": 1.4463470578193665, "advantage_mean": -3.352761379638025e-08, "advantage_min": -1.2316881269216537, "advantage_std": 0.9998175576329231, "completion_length": 1210.2500610351562, "epoch": 0.472, "grad_norm": 1.284919261932373, "kl": 0.293426513671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.804828558898332e-07, "loss": 0.0117, "reward": 0.8780468343757093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8780468343757093, "reward_after_std": 0.7943502962589264, "reward_before_mean": 1.0290323235094547, "reward_before_std": 0.7974189501255751, "reward_change_max": 0.0, "reward_change_mean": -0.15098547749221325, "reward_change_min": -0.2520767832174897, "reward_change_std": 0.09872942883521318, "reward_std": 0.7943503148853779, "rewards/cosine_scaled_reward": 0.06659948639571667, "rewards/format_reward": 0.8958333358168602, "step": 413 }, { "advantage_max": 1.4396483451128006, "advantage_mean": -2.2351742789972207e-08, "advantage_min": -1.1945699751377106, "advantage_std": 0.9998373538255692, "completion_length": 1927.7083892822266, "epoch": 0.47314285714285714, "grad_norm": 1.1197619438171387, "kl": 0.7069091796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7869892577476722e-07, "loss": 0.0283, "reward": 0.33350180089473724, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.33350180089473724, "reward_after_std": 0.7306559979915619, "reward_before_mean": 0.43494257144629955, "reward_before_std": 0.7363609932363033, "reward_change_max": 0.00020420551300048828, "reward_change_mean": -0.10144078405573964, "reward_change_min": -0.20094910357147455, "reward_change_std": 0.07442115899175406, "reward_std": 0.7306560054421425, "rewards/cosine_scaled_reward": -0.14711205288767815, "rewards/format_reward": 0.7291666772216558, "step": 414 }, { "advantage_max": 1.3788573667407036, "advantage_mean": -3.601114040296949e-08, "advantage_min": -1.2387224435806274, "advantage_std": 0.9998515993356705, "completion_length": 1650.0000381469727, "epoch": 0.4742857142857143, "grad_norm": 1.4316720962524414, "kl": 0.5947723388671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7693309235023127e-07, "loss": 0.0238, "reward": 0.5648692059330642, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5648692059330642, "reward_after_std": 0.8455432876944542, "reward_before_mean": 0.685730435885489, "reward_before_std": 0.862141527235508, "reward_change_max": 0.0007596015930175781, "reward_change_mean": -0.12086124438792467, "reward_change_min": -0.23698932025581598, "reward_change_std": 0.09399270685389638, "reward_std": 0.8455432951450348, "rewards/cosine_scaled_reward": -0.04255145916249603, "rewards/format_reward": 0.7708333469927311, "step": 415 }, { "advantage_max": 1.5565531551837921, "advantage_mean": -3.725290476097598e-08, "advantage_min": -1.1521182730793953, "advantage_std": 0.9998418241739273, "completion_length": 1293.5000457763672, "epoch": 0.4754285714285714, "grad_norm": 1.0963540077209473, "kl": 0.36377716064453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7518544168045524e-07, "loss": 0.0146, "reward": 1.0165877528488636, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0165877528488636, "reward_after_std": 0.7492444217205048, "reward_before_mean": 1.1790525019168854, "reward_before_std": 0.7278725281357765, "reward_change_max": 0.0, "reward_change_mean": -0.16246474720537663, "reward_change_min": -0.24140873830765486, "reward_change_std": 0.09044545330107212, "reward_std": 0.7492444440722466, "rewards/cosine_scaled_reward": 0.11035957233980298, "rewards/format_reward": 0.9583333432674408, "step": 416 }, { "advantage_max": 1.604439303278923, "advantage_mean": -3.539025844601085e-08, "advantage_min": -1.240592211484909, "advantage_std": 0.9998293966054916, "completion_length": 1556.0000610351562, "epoch": 0.4765714285714286, "grad_norm": 2.053286075592041, "kl": 0.60235595703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7345605894346726e-07, "loss": 0.0241, "reward": 0.44515037967357785, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.44515037967357785, "reward_after_std": 0.6873351112008095, "reward_before_mean": 0.5556222386658192, "reward_before_std": 0.6735187582671642, "reward_change_max": 0.0002534613013267517, "reward_change_mean": -0.11047186609357595, "reward_change_min": -0.18285326100885868, "reward_change_std": 0.0725367316044867, "reward_std": 0.6873351410031319, "rewards/cosine_scaled_reward": -0.09718889463692904, "rewards/format_reward": 0.7500000186264515, "step": 417 }, { "advantage_max": 1.4074752032756805, "advantage_mean": -8.257727068805565e-08, "advantage_min": -1.2240310907363892, "advantage_std": 0.9998078942298889, "completion_length": 1065.7708587646484, "epoch": 0.4777142857142857, "grad_norm": 0.9967370629310608, "kl": 0.18988037109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7174502842694212e-07, "loss": 0.0076, "reward": 1.1236931383609772, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.1236931383609772, "reward_after_std": 0.6934684608131647, "reward_before_mean": 1.3006681762635708, "reward_before_std": 0.6769684087485075, "reward_change_max": 0.00027079880237579346, "reward_change_mean": -0.17697507236152887, "reward_change_min": -0.2605667933821678, "reward_change_std": 0.10411691945046186, "reward_std": 0.6934684999287128, "rewards/cosine_scaled_reward": 0.18158408568706363, "rewards/format_reward": 0.9375000149011612, "step": 418 }, { "advantage_max": 1.5458858013153076, "advantage_mean": 2.483526828633842e-09, "advantage_min": -1.1178877651691437, "advantage_std": 0.9998831227421761, "completion_length": 1536.9375534057617, "epoch": 0.47885714285714287, "grad_norm": 2.0971181392669678, "kl": 0.45917510986328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7005243352409333e-07, "loss": 0.0184, "reward": 0.719907971099019, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.719907971099019, "reward_after_std": 0.9108130037784576, "reward_before_mean": 0.8503690185025334, "reward_before_std": 0.909957580268383, "reward_change_max": 0.00023179501295089722, "reward_change_mean": -0.13046098314225674, "reward_change_min": -0.23345648124814034, "reward_change_std": 0.08942006062716246, "reward_std": 0.9108130559325218, "rewards/cosine_scaled_reward": 0.008517796639353037, "rewards/format_reward": 0.8333333507180214, "step": 419 }, { "advantage_max": 1.655995175242424, "advantage_mean": -3.6011140958081e-08, "advantage_min": -1.1085584685206413, "advantage_std": 0.9997412338852882, "completion_length": 1058.2916870117188, "epoch": 0.48, "grad_norm": 2.4459757804870605, "kl": 0.248291015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6837835672960831e-07, "loss": 0.0099, "reward": 0.5104020063299686, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5104020063299686, "reward_after_std": 0.5309116821736097, "reward_before_mean": 0.632997702807188, "reward_before_std": 0.5120272561907768, "reward_change_max": 0.0, "reward_change_mean": -0.12259569112211466, "reward_change_min": -0.18111898750066757, "reward_change_std": 0.07065618922933936, "reward_std": 0.5309117008000612, "rewards/cosine_scaled_reward": -0.1105844946578145, "rewards/format_reward": 0.8541666865348816, "step": 420 }, { "advantage_max": 1.58281809091568, "advantage_mean": -1.2728075760026769e-08, "advantage_min": -1.1530317813158035, "advantage_std": 0.9998201727867126, "completion_length": 1308.645881652832, "epoch": 0.48114285714285715, "grad_norm": 1.251358985900879, "kl": 0.3505859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6672287963562852e-07, "loss": 0.014, "reward": 0.489888122305274, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.489888122305274, "reward_after_std": 0.712477371096611, "reward_before_mean": 0.6046459935605526, "reward_before_std": 0.7005138099193573, "reward_change_max": 0.00022549182176589966, "reward_change_mean": -0.11475785728543997, "reward_change_min": -0.20619327947497368, "reward_change_std": 0.07444826699793339, "reward_std": 0.712477408349514, "rewards/cosine_scaled_reward": -0.16642701055388898, "rewards/format_reward": 0.9375000149011612, "step": 421 }, { "advantage_max": 1.1890934333205223, "advantage_mean": -1.30385160446167e-08, "advantage_min": -1.4458886981010437, "advantage_std": 0.9998152554035187, "completion_length": 1730.9375762939453, "epoch": 0.48228571428571426, "grad_norm": 1.2435977458953857, "kl": 0.5941848754882812, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6508608292777203e-07, "loss": 0.0237, "reward": 0.5873144883662462, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5873144883662462, "reward_after_std": 0.6254005320370197, "reward_before_mean": 0.7183939876267686, "reward_before_std": 0.6377482563257217, "reward_change_max": 0.00011101365089416504, "reward_change_mean": -0.13107949635013938, "reward_change_min": -0.2110171616077423, "reward_change_std": 0.08531484520062804, "reward_std": 0.6254005543887615, "rewards/cosine_scaled_reward": -0.057469683699309826, "rewards/format_reward": 0.8333333469927311, "step": 422 }, { "advantage_max": 1.4273897409439087, "advantage_mean": -3.8494666954047574e-08, "advantage_min": -1.2456609457731247, "advantage_std": 0.9997662082314491, "completion_length": 1455.6667098999023, "epoch": 0.48342857142857143, "grad_norm": 1.3013380765914917, "kl": 0.4249420166015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6346804638120098e-07, "loss": 0.017, "reward": 0.41419703885912895, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41419703885912895, "reward_after_std": 0.6243961397558451, "reward_before_mean": 0.5267076604068279, "reward_before_std": 0.6270595081150532, "reward_change_max": 0.0, "reward_change_mean": -0.11251062992960215, "reward_change_min": -0.19462671875953674, "reward_change_std": 0.0770272184163332, "reward_std": 0.6243961472064257, "rewards/cosine_scaled_reward": -0.1324795256368816, "rewards/format_reward": 0.7916666772216558, "step": 423 }, { "advantage_max": 1.5881786197423935, "advantage_mean": -3.725290464995368e-08, "advantage_min": -0.9874609559774399, "advantage_std": 0.9997924491763115, "completion_length": 1607.1458740234375, "epoch": 0.4845714285714286, "grad_norm": 2.638868570327759, "kl": 0.5507659912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6186884885673413e-07, "loss": 0.022, "reward": 0.493446989916265, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.493446989916265, "reward_after_std": 0.5820804536342621, "reward_before_mean": 0.611056812107563, "reward_before_std": 0.5580542217940092, "reward_change_max": 0.0, "reward_change_mean": -0.11760981846600771, "reward_change_min": -0.18333129212260246, "reward_change_std": 0.06667398847639561, "reward_std": 0.5820804722607136, "rewards/cosine_scaled_reward": -0.16322161629796028, "rewards/format_reward": 0.9375000149011612, "step": 424 }, { "advantage_max": 1.5448092222213745, "advantage_mean": -1.297642824305001e-07, "advantage_min": -1.1348483115434647, "advantage_std": 0.9997741878032684, "completion_length": 1215.4791793823242, "epoch": 0.4857142857142857, "grad_norm": 2.198212146759033, "kl": 0.18170928955078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6028856829700258e-07, "loss": 0.0073, "reward": 1.1097919731400907, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.1097919731400907, "reward_after_std": 0.7644475474953651, "reward_before_mean": 1.2826090063899755, "reward_before_std": 0.7507690298371017, "reward_change_max": 0.00022091716527938843, "reward_change_mean": -0.1728170160204172, "reward_change_min": -0.28230510652065277, "reward_change_std": 0.11318794079124928, "reward_std": 0.764447558671236, "rewards/cosine_scaled_reward": 0.18297115061432123, "rewards/format_reward": 0.916666679084301, "step": 425 }, { "advantage_max": 1.6671266108751297, "advantage_mean": -1.0461857269383756e-07, "advantage_min": -0.9880219921469688, "advantage_std": 0.9998507276177406, "completion_length": 1172.0000381469727, "epoch": 0.4868571428571429, "grad_norm": 1.7160371541976929, "kl": 0.5969390869140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5872728172265146e-07, "loss": 0.0239, "reward": 0.8034339547157288, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8034339547157288, "reward_after_std": 0.8283376954495907, "reward_before_mean": 0.9418845884501934, "reward_before_std": 0.8046383187174797, "reward_change_max": 0.00014852732419967651, "reward_change_mean": -0.13845067285001278, "reward_change_min": -0.2270393744111061, "reward_change_std": 0.08508762950077653, "reward_std": 0.8283377438783646, "rewards/cosine_scaled_reward": -0.008224384859204292, "rewards/format_reward": 0.9583333432674408, "step": 426 }, { "advantage_max": 1.3611183911561966, "advantage_mean": -2.359350548264416e-08, "advantage_min": -1.2368653267621994, "advantage_std": 0.9998747855424881, "completion_length": 1647.8541870117188, "epoch": 0.488, "grad_norm": 1.7571823596954346, "kl": 0.385345458984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5718506522858572e-07, "loss": 0.0154, "reward": 0.8747087176889181, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8747087176889181, "reward_after_std": 0.9652072787284851, "reward_before_mean": 1.0225733071565628, "reward_before_std": 0.9864769503474236, "reward_change_max": 0.0, "reward_change_mean": -0.14786457549780607, "reward_change_min": -0.2746960259974003, "reward_change_std": 0.1091885594651103, "reward_std": 0.9652072936296463, "rewards/cosine_scaled_reward": 0.10503664053976536, "rewards/format_reward": 0.8125000074505806, "step": 427 }, { "advantage_max": 1.4560476392507553, "advantage_mean": -8.6923440667519e-09, "advantage_min": -1.1091465428471565, "advantage_std": 0.999860942363739, "completion_length": 1579.708396911621, "epoch": 0.48914285714285716, "grad_norm": 1.801839828491211, "kl": 0.40151214599609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5566199398026147e-07, "loss": 0.016, "reward": 0.5794356926344335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5794356926344335, "reward_after_std": 0.8744179606437683, "reward_before_mean": 0.699925497174263, "reward_before_std": 0.8827432319521904, "reward_change_max": 0.00021196156740188599, "reward_change_mean": -0.12048980919644237, "reward_change_min": -0.23152614384889603, "reward_change_std": 0.08844160987064242, "reward_std": 0.8744179755449295, "rewards/cosine_scaled_reward": -0.05628725979477167, "rewards/format_reward": 0.8125000074505806, "step": 428 }, { "advantage_max": 1.6187241524457932, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -1.0603943690657616, "advantage_std": 0.9998475760221481, "completion_length": 1263.1042098999023, "epoch": 0.49028571428571427, "grad_norm": 1.4531282186508179, "kl": 0.6030960083007812, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5415814221002265e-07, "loss": 0.0241, "reward": 0.43697307258844376, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43697307258844376, "reward_after_std": 0.7528307847678661, "reward_before_mean": 0.5441529117524624, "reward_before_std": 0.7386700659990311, "reward_change_max": 0.0006353110074996948, "reward_change_mean": -0.10717981401830912, "reward_change_min": -0.18293942417949438, "reward_change_std": 0.06950774369761348, "reward_std": 0.7528308033943176, "rewards/cosine_scaled_reward": -0.16542356554418802, "rewards/format_reward": 0.8750000074505806, "step": 429 }, { "advantage_max": 1.566932499408722, "advantage_mean": 1.0865429667106241e-08, "advantage_min": -1.0978027358651161, "advantage_std": 0.9998347610235214, "completion_length": 1255.0417098999023, "epoch": 0.49142857142857144, "grad_norm": 1.7678227424621582, "kl": 0.28025054931640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5267358321348285e-07, "loss": 0.0112, "reward": 0.6415118533186615, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6415118533186615, "reward_after_std": 0.7025103121995926, "reward_before_mean": 0.7709160801023245, "reward_before_std": 0.6886178329586983, "reward_change_max": 0.00012315809726715088, "reward_change_mean": -0.12940420676022768, "reward_change_min": -0.21598459593951702, "reward_change_std": 0.08058440638706088, "reward_std": 0.7025103233754635, "rewards/cosine_scaled_reward": -0.06245864322409034, "rewards/format_reward": 0.8958333507180214, "step": 430 }, { "advantage_max": 1.4055536314845085, "advantage_mean": 3.725291186640334e-09, "advantage_min": -1.2864673808217049, "advantage_std": 0.9997454509139061, "completion_length": 1462.4375457763672, "epoch": 0.49257142857142855, "grad_norm": 1.5079882144927979, "kl": 0.5954742431640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5120838934595337e-07, "loss": 0.0238, "reward": 0.5773325273767114, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5773325273767114, "reward_after_std": 0.5111439414322376, "reward_before_mean": 0.7075983798131347, "reward_before_std": 0.5024440437555313, "reward_change_max": 0.0, "reward_change_mean": -0.1302658156491816, "reward_change_min": -0.2057191450148821, "reward_change_std": 0.07728143502026796, "reward_std": 0.5111439451575279, "rewards/cosine_scaled_reward": -0.10453415662050247, "rewards/format_reward": 0.9166666716337204, "step": 431 }, { "advantage_max": 1.557257518172264, "advantage_mean": -2.7318796336217588e-08, "advantage_min": -0.9859335571527481, "advantage_std": 0.9998309835791588, "completion_length": 1686.395851135254, "epoch": 0.4937142857142857, "grad_norm": 2.2990598678588867, "kl": 0.598297119140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4976263201891613e-07, "loss": 0.0239, "reward": 0.37111999094486237, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.37111999094486237, "reward_after_std": 0.7255898527801037, "reward_before_mean": 0.47415912989526987, "reward_before_std": 0.7179553136229515, "reward_change_max": 0.0, "reward_change_mean": -0.10303913801908493, "reward_change_min": -0.1894255429506302, "reward_change_std": 0.06987693347036839, "reward_std": 0.7255898788571358, "rewards/cosine_scaled_reward": -0.15875378297641873, "rewards/format_reward": 0.7916666753590107, "step": 432 }, { "advantage_max": 1.3101054728031158, "advantage_mean": -5.587935669737476e-09, "advantage_min": -1.4408425688743591, "advantage_std": 0.9997941181063652, "completion_length": 1531.2917251586914, "epoch": 0.4948571428571429, "grad_norm": 1.3603363037109375, "kl": 0.4079132080078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.483363816965435e-07, "loss": 0.0163, "reward": 0.7492245864123106, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7492245864123106, "reward_after_std": 0.6005501635372639, "reward_before_mean": 0.894620930776, "reward_before_std": 0.6026262082159519, "reward_change_max": 2.2485852241516113e-05, "reward_change_mean": -0.14539633970707655, "reward_change_min": -0.23189252614974976, "reward_change_std": 0.09261250263080001, "reward_std": 0.6005501784384251, "rewards/cosine_scaled_reward": 0.030643776757642627, "rewards/format_reward": 0.8333333507180214, "step": 433 }, { "advantage_max": 1.482622116804123, "advantage_mean": -3.321717245707845e-08, "advantage_min": -1.2409061938524246, "advantage_std": 0.9998137354850769, "completion_length": 1580.020881652832, "epoch": 0.496, "grad_norm": 2.1808040142059326, "kl": 0.7606887817382812, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.469297078922642e-07, "loss": 0.0305, "reward": 0.13291472848504782, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13291472848504782, "reward_after_std": 0.6066357120871544, "reward_before_mean": 0.21792132034897804, "reward_before_std": 0.6074311584234238, "reward_change_max": 0.0010739341378211975, "reward_change_mean": -0.08500660490244627, "reward_change_min": -0.1500966176390648, "reward_change_std": 0.06016435939818621, "reward_std": 0.6066357530653477, "rewards/cosine_scaled_reward": -0.25562268076464534, "rewards/format_reward": 0.729166679084301, "step": 434 }, { "advantage_max": 1.4640971571207047, "advantage_mean": -1.3038516710750514e-08, "advantage_min": -1.2968028336763382, "advantage_std": 0.9997572973370552, "completion_length": 1197.9792098999023, "epoch": 0.49714285714285716, "grad_norm": 1.8218203783035278, "kl": 0.5258941650390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4554267916537495e-07, "loss": 0.0211, "reward": 0.5289339208975434, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5289339208975434, "reward_after_std": 0.48837145417928696, "reward_before_mean": 0.6544436899712309, "reward_before_std": 0.47141289338469505, "reward_change_max": 0.00042301416397094727, "reward_change_mean": -0.1255097622051835, "reward_change_min": -0.19645695015788078, "reward_change_std": 0.07207289850339293, "reward_std": 0.48837146535515785, "rewards/cosine_scaled_reward": -0.1311115063726902, "rewards/format_reward": 0.916666679084301, "step": 435 }, { "advantage_max": 1.34403195977211, "advantage_mean": -3.4769377377230626e-08, "advantage_min": -1.3322007581591606, "advantage_std": 0.9998322278261185, "completion_length": 1186.458366394043, "epoch": 0.4982857142857143, "grad_norm": 1.2338637113571167, "kl": 0.35684967041015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4417536311769885e-07, "loss": 0.0143, "reward": 1.0391714964061975, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.0391714964061975, "reward_after_std": 0.7189607694745064, "reward_before_mean": 1.2088293116539717, "reward_before_std": 0.7188755720853806, "reward_change_max": 0.0, "reward_change_mean": -0.169657819904387, "reward_change_min": -0.25869670882821083, "reward_change_std": 0.10130301676690578, "reward_std": 0.7189607881009579, "rewards/cosine_scaled_reward": 0.1460813172161579, "rewards/format_reward": 0.9166666679084301, "step": 436 }, { "advantage_max": 1.476868376135826, "advantage_mean": -2.6077032533322608e-08, "advantage_min": -1.2403504475951195, "advantage_std": 0.9998257905244827, "completion_length": 1359.9583740234375, "epoch": 0.49942857142857144, "grad_norm": 1.9391827583312988, "kl": 0.3791656494140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4282782639029128e-07, "loss": 0.0152, "reward": 0.6853760741651058, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6853760741651058, "reward_after_std": 0.6799805872142315, "reward_before_mean": 0.8209675564430654, "reward_before_std": 0.6743724048137665, "reward_change_max": 0.0, "reward_change_mean": -0.1355915078893304, "reward_change_min": -0.2195442169904709, "reward_change_std": 0.0846588434651494, "reward_std": 0.679980605840683, "rewards/cosine_scaled_reward": -0.05826622620224953, "rewards/format_reward": 0.9375000074505806, "step": 437 }, { "advantage_max": 1.4305167347192764, "advantage_mean": -4.967053990334591e-09, "advantage_min": -1.2116082832217216, "advantage_std": 0.9997850134968758, "completion_length": 1912.6667098999023, "epoch": 0.5005714285714286, "grad_norm": 1.7599682807922363, "kl": 0.93927001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4150013466019114e-07, "loss": 0.0376, "reward": 0.30255572497844696, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30255572497844696, "reward_after_std": 0.5124889835715294, "reward_before_mean": 0.4072955325245857, "reward_before_std": 0.5075165517628193, "reward_change_max": 0.0, "reward_change_mean": -0.10473981127142906, "reward_change_min": -0.17432630248367786, "reward_change_std": 0.06469799065962434, "reward_std": 0.5124890096485615, "rewards/cosine_scaled_reward": -0.2130189104937017, "rewards/format_reward": 0.8333333432674408, "step": 438 }, { "advantage_max": 1.3590333685278893, "advantage_mean": -1.8626451603331873e-08, "advantage_min": -1.317535161972046, "advantage_std": 0.9997846111655235, "completion_length": 1555.5625534057617, "epoch": 0.5017142857142857, "grad_norm": 2.5428316593170166, "kl": 0.6335601806640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4019235263722034e-07, "loss": 0.0253, "reward": 0.33497130312025547, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.33497130312025547, "reward_after_std": 0.5517369508743286, "reward_before_mean": 0.44286380242556334, "reward_before_std": 0.5564365647733212, "reward_change_max": 0.0, "reward_change_mean": -0.10789249558001757, "reward_change_min": -0.18508377857506275, "reward_change_std": 0.07193143153563142, "reward_std": 0.5517369657754898, "rewards/cosine_scaled_reward": -0.16398477833718061, "rewards/format_reward": 0.7708333469927311, "step": 439 }, { "advantage_max": 1.687167003750801, "advantage_mean": -3.1664968702660445e-08, "advantage_min": -0.9845371693372726, "advantage_std": 0.9997920244932175, "completion_length": 1395.2708702087402, "epoch": 0.5028571428571429, "grad_norm": 1.3478094339370728, "kl": 0.426025390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3890454406082956e-07, "loss": 0.017, "reward": 0.4049868443980813, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4049868443980813, "reward_after_std": 0.5812089741230011, "reward_before_mean": 0.5134525969624519, "reward_before_std": 0.5556198842823505, "reward_change_max": 0.0, "reward_change_mean": -0.1084657683968544, "reward_change_min": -0.17195551469922066, "reward_change_std": 0.06114753941074014, "reward_std": 0.5812089964747429, "rewards/cosine_scaled_reward": -0.19119038060307503, "rewards/format_reward": 0.8958333507180214, "step": 440 }, { "advantage_max": 1.3728258907794952, "advantage_mean": -3.166496831408239e-08, "advantage_min": -1.2612786442041397, "advantage_std": 0.9998279735445976, "completion_length": 1365.6041946411133, "epoch": 0.504, "grad_norm": 1.260138750076294, "kl": 0.362945556640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3763677169699217e-07, "loss": 0.0145, "reward": 0.8448034885077504, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8448034885077504, "reward_after_std": 0.7627089060842991, "reward_before_mean": 0.9935674387961626, "reward_before_std": 0.7690016403794289, "reward_change_max": 8.557736873626709e-05, "reward_change_mean": -0.1487639732658863, "reward_change_min": -0.24890049546957016, "reward_change_std": 0.0970719731412828, "reward_std": 0.7627089321613312, "rewards/cosine_scaled_reward": 0.048867044039070606, "rewards/format_reward": 0.8958333432674408, "step": 441 }, { "advantage_max": 1.6671375334262848, "advantage_mean": -4.346172155500483e-08, "advantage_min": -1.0221184343099594, "advantage_std": 0.9998374804854393, "completion_length": 1317.5000457763672, "epoch": 0.5051428571428571, "grad_norm": 1.947296142578125, "kl": 0.659881591796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3638909733514452e-07, "loss": 0.0264, "reward": 0.846515204757452, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.846515204757452, "reward_after_std": 0.7835588194429874, "reward_before_mean": 0.9904348067939281, "reward_before_std": 0.7543639913201332, "reward_change_max": 9.492039680480957e-06, "reward_change_mean": -0.1439195815473795, "reward_change_min": -0.2259671613574028, "reward_change_std": 0.0858937781304121, "reward_std": 0.7835588529706001, "rewards/cosine_scaled_reward": 0.07855071779340506, "rewards/format_reward": 0.8333333432674408, "step": 442 }, { "advantage_max": 1.4112332686781883, "advantage_mean": -4.967053890414519e-08, "advantage_min": -1.2800021320581436, "advantage_std": 0.9997948706150055, "completion_length": 1655.5625839233398, "epoch": 0.5062857142857143, "grad_norm": 1.9380803108215332, "kl": 0.643463134765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.351615817851748e-07, "loss": 0.0257, "reward": 0.4038530308753252, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4038530308753252, "reward_after_std": 0.5651445239782333, "reward_before_mean": 0.516314348205924, "reward_before_std": 0.5649879388511181, "reward_change_max": 0.0004429370164871216, "reward_change_mean": -0.11246132105588913, "reward_change_min": -0.18150242697447538, "reward_change_std": 0.07103017391636968, "reward_std": 0.5651445314288139, "rewards/cosine_scaled_reward": -0.10642617009580135, "rewards/format_reward": 0.7291666697710752, "step": 443 }, { "advantage_max": 1.4937669187784195, "advantage_mean": -3.663202252646158e-08, "advantage_min": -1.1797254905104637, "advantage_std": 0.9998287782073021, "completion_length": 1368.9791870117188, "epoch": 0.5074285714285715, "grad_norm": 1.5908373594284058, "kl": 0.542572021484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3395428487445914e-07, "loss": 0.0217, "reward": 0.5830896962434053, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5830896962434053, "reward_after_std": 0.8080550767481327, "reward_before_mean": 0.7053059078752995, "reward_before_std": 0.810644131153822, "reward_change_max": 0.0, "reward_change_mean": -0.12221621721982956, "reward_change_min": -0.22589535266160965, "reward_change_std": 0.08656603936105967, "reward_std": 0.8080550953745842, "rewards/cosine_scaled_reward": -0.10568038653582335, "rewards/format_reward": 0.9166666865348816, "step": 444 }, { "advantage_max": 1.6786562949419022, "advantage_mean": 9.934107758624577e-09, "advantage_min": -1.1170127242803574, "advantage_std": 0.9997536465525627, "completion_length": 1343.583366394043, "epoch": 0.5085714285714286, "grad_norm": 1.6728626489639282, "kl": 0.6574554443359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3276726544494571e-07, "loss": 0.0263, "reward": 0.4528891518712044, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4528891518712044, "reward_after_std": 0.5791403837502003, "reward_before_mean": 0.5661626718938351, "reward_before_std": 0.5518585778772831, "reward_change_max": 0.0, "reward_change_mean": -0.11327349953353405, "reward_change_min": -0.16920770704746246, "reward_change_std": 0.06325740413740277, "reward_std": 0.5791404116898775, "rewards/cosine_scaled_reward": -0.17525201058015227, "rewards/format_reward": 0.916666679084301, "step": 445 }, { "advantage_max": 1.405263438820839, "advantage_mean": -1.862645060413115e-08, "advantage_min": -1.3909134268760681, "advantage_std": 0.9997911751270294, "completion_length": 1314.9167022705078, "epoch": 0.5097142857142857, "grad_norm": 2.1797311305999756, "kl": 0.2393798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.316005813502869e-07, "loss": 0.0096, "reward": 0.8286987226456404, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8286987226456404, "reward_after_std": 0.6055082138627768, "reward_before_mean": 0.9798233285546303, "reward_before_std": 0.5961595419794321, "reward_change_max": 0.0, "reward_change_mean": -0.151124594733119, "reward_change_min": -0.23226046934723854, "reward_change_std": 0.09119729977101088, "reward_std": 0.605508241802454, "rewards/cosine_scaled_reward": 0.031578321009874344, "rewards/format_reward": 0.916666679084301, "step": 446 }, { "advantage_max": 1.4877047389745712, "advantage_mean": -2.1109979653211042e-08, "advantage_min": -1.2679708823561668, "advantage_std": 0.9997538030147552, "completion_length": 1493.2292098999023, "epoch": 0.5108571428571429, "grad_norm": 1.5136258602142334, "kl": 0.736480712890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3045428945301953e-07, "loss": 0.0295, "reward": 0.4689008966088295, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4689008966088295, "reward_after_std": 0.5583977196365595, "reward_before_mean": 0.5876877517439425, "reward_before_std": 0.5523215904831886, "reward_change_max": 0.0, "reward_change_mean": -0.11878686537966132, "reward_change_min": -0.20156334061175585, "reward_change_std": 0.07434120122343302, "reward_std": 0.5583977401256561, "rewards/cosine_scaled_reward": -0.10198946483433247, "rewards/format_reward": 0.791666679084301, "step": 447 }, { "advantage_max": 1.3867352455854416, "advantage_mean": -5.712112005618053e-08, "advantage_min": -1.2888628989458084, "advantage_std": 0.9998233914375305, "completion_length": 1300.645866394043, "epoch": 0.512, "grad_norm": 2.749136447906494, "kl": 0.6655731201171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2932844562179352e-07, "loss": 0.0266, "reward": 0.5472166938707232, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5472166938707232, "reward_after_std": 0.6558889821171761, "reward_before_mean": 0.6711777672171593, "reward_before_std": 0.6643517129123211, "reward_change_max": 0.00017968565225601196, "reward_change_mean": -0.1239611036144197, "reward_change_min": -0.21109597571194172, "reward_change_std": 0.08364540711045265, "reward_std": 0.6558890230953693, "rewards/cosine_scaled_reward": -0.060244444757699966, "rewards/format_reward": 0.791666679084301, "step": 448 }, { "advantage_max": 1.5808791145682335, "advantage_mean": -3.60111408470587e-08, "advantage_min": -1.0195664539933205, "advantage_std": 0.9997886493802071, "completion_length": 1297.833381652832, "epoch": 0.5131428571428571, "grad_norm": 3.942514419555664, "kl": 0.5755157470703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2822310472864885e-07, "loss": 0.023, "reward": 0.3039223924279213, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3039223924279213, "reward_after_std": 0.5791955068707466, "reward_before_mean": 0.4055069088935852, "reward_before_std": 0.5674984790384769, "reward_change_max": 0.0, "reward_change_mean": -0.10158453835174441, "reward_change_min": -0.17059927806258202, "reward_change_std": 0.06457435572519898, "reward_std": 0.5791955254971981, "rewards/cosine_scaled_reward": -0.2555798841640353, "rewards/format_reward": 0.9166666716337204, "step": 449 }, { "advantage_max": 1.3982073590159416, "advantage_mean": -8.381903204845997e-08, "advantage_min": -1.2664097175002098, "advantage_std": 0.9997754022479057, "completion_length": 1329.541732788086, "epoch": 0.5142857142857142, "grad_norm": 1.6348826885223389, "kl": 0.3032073974609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2713832064634125e-07, "loss": 0.0121, "reward": 0.43079722626134753, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43079722626134753, "reward_after_std": 0.5070368982851505, "reward_before_mean": 0.548844444565475, "reward_before_std": 0.5014333333820105, "reward_change_max": 0.00021466612815856934, "reward_change_mean": -0.11804721876978874, "reward_change_min": -0.1936736386269331, "reward_change_std": 0.07638330943882465, "reward_std": 0.5070369057357311, "rewards/cosine_scaled_reward": -0.15266112051904202, "rewards/format_reward": 0.8541666828095913, "step": 450 }, { "advantage_max": 1.440886214375496, "advantage_mean": -3.47693762670076e-08, "advantage_min": -1.1266870200634003, "advantage_std": 0.9997949972748756, "completion_length": 1200.8542098999023, "epoch": 0.5154285714285715, "grad_norm": 1.7086747884750366, "kl": 0.6575164794921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.260741462457165e-07, "loss": 0.0263, "reward": 0.6295475661754608, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6295475661754608, "reward_after_std": 0.7362482752650976, "reward_before_mean": 0.7578316442668438, "reward_before_std": 0.7389062829315662, "reward_change_max": 0.00040249526500701904, "reward_change_mean": -0.12828407809138298, "reward_change_min": -0.24284372478723526, "reward_change_std": 0.08936772076413035, "reward_std": 0.7362482752650976, "rewards/cosine_scaled_reward": -0.06900085625238717, "rewards/format_reward": 0.8958333395421505, "step": 451 }, { "advantage_max": 1.4972828030586243, "advantage_mean": -4.221995908437748e-08, "advantage_min": -1.2125985845923424, "advantage_std": 0.9998395889997482, "completion_length": 1394.4375610351562, "epoch": 0.5165714285714286, "grad_norm": 1.2934503555297852, "kl": 0.5374298095703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2503063339313356e-07, "loss": 0.0215, "reward": 0.8485817462205887, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8485817462205887, "reward_after_std": 0.748363334685564, "reward_before_mean": 0.9961145296692848, "reward_before_std": 0.7355900332331657, "reward_change_max": 0.0, "reward_change_mean": -0.14753276854753494, "reward_change_min": -0.23141445498913527, "reward_change_std": 0.09052707394585013, "reward_std": 0.7483633458614349, "rewards/cosine_scaled_reward": 0.06055724306497723, "rewards/format_reward": 0.8750000074505806, "step": 452 }, { "advantage_max": 1.4784182906150818, "advantage_mean": -2.3593506592867186e-08, "advantage_min": -1.2552871480584145, "advantage_std": 0.999831311404705, "completion_length": 1348.562557220459, "epoch": 0.5177142857142857, "grad_norm": 1.3329824209213257, "kl": 0.463165283203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2400783294793668e-07, "loss": 0.0185, "reward": 0.6962817385792732, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6962817385792732, "reward_after_std": 0.6942555904388428, "reward_before_mean": 0.8324069250375032, "reward_before_std": 0.6887706816196442, "reward_change_max": 0.00016242265701293945, "reward_change_mean": -0.1361251873895526, "reward_change_min": -0.22889462485909462, "reward_change_std": 0.08675140561535954, "reward_std": 0.6942556016147137, "rewards/cosine_scaled_reward": 0.0203701239079237, "rewards/format_reward": 0.7916666772216558, "step": 453 }, { "advantage_max": 1.4930087327957153, "advantage_mean": -3.4458937342440876e-08, "advantage_min": -1.2544832825660706, "advantage_std": 0.9997552409768105, "completion_length": 1382.6875228881836, "epoch": 0.5188571428571429, "grad_norm": 1.1521000862121582, "kl": 0.7465667724609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2300579475997657e-07, "loss": 0.0298, "reward": 0.500964343547821, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.500964343547821, "reward_after_std": 0.6572593171149492, "reward_before_mean": 0.6187019534409046, "reward_before_std": 0.6519017405807972, "reward_change_max": 9.223073720932007e-05, "reward_change_mean": -0.1177376201376319, "reward_change_min": -0.19812804460525513, "reward_change_std": 0.07715894654393196, "reward_std": 0.6572593450546265, "rewards/cosine_scaled_reward": -0.09689902793616056, "rewards/format_reward": 0.8125000149011612, "step": 454 }, { "advantage_max": 1.551740899682045, "advantage_mean": -1.8626451714354175e-08, "advantage_min": -1.0669294819235802, "advantage_std": 0.9997464343905449, "completion_length": 1280.770866394043, "epoch": 0.52, "grad_norm": 1.89170503616333, "kl": 0.398681640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.220245676671809e-07, "loss": 0.016, "reward": 0.3647182397544384, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3647182397544384, "reward_after_std": 0.4574992284178734, "reward_before_mean": 0.4750360809266567, "reward_before_std": 0.4399758540093899, "reward_change_max": 0.0, "reward_change_mean": -0.1103178346529603, "reward_change_min": -0.173393864184618, "reward_change_std": 0.06261738482862711, "reward_std": 0.4574992470443249, "rewards/cosine_scaled_reward": -0.22081530094146729, "rewards/format_reward": 0.9166666716337204, "step": 455 }, { "advantage_max": 1.487000197172165, "advantage_mean": -4.563480771047068e-08, "advantage_min": -1.0528950244188309, "advantage_std": 0.9998614117503166, "completion_length": 1337.0417137145996, "epoch": 0.5211428571428571, "grad_norm": 6.231215953826904, "kl": 0.7177734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2106419949317388e-07, "loss": 0.0287, "reward": 0.5764753445982933, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5764753445982933, "reward_after_std": 0.8477189987897873, "reward_before_mean": 0.6952435150742531, "reward_before_std": 0.8480991721153259, "reward_change_max": 0.00027041882276535034, "reward_change_mean": -0.11876817606389523, "reward_change_min": -0.22616205736994743, "reward_change_std": 0.08259849390015006, "reward_std": 0.8477190397679806, "rewards/cosine_scaled_reward": -0.08987825782969594, "rewards/format_reward": 0.8750000223517418, "step": 456 }, { "advantage_max": 1.4224179536104202, "advantage_mean": -3.725289521305797e-09, "advantage_min": -1.3244177401065826, "advantage_std": 0.9997441843152046, "completion_length": 1460.5000381469727, "epoch": 0.5222857142857142, "grad_norm": 2.2744693756103516, "kl": 0.912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2012473704494537e-07, "loss": 0.0365, "reward": 0.576944915112108, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.576944915112108, "reward_after_std": 0.5542113147675991, "reward_before_mean": 0.7066980539821088, "reward_before_std": 0.5536979511380196, "reward_change_max": 0.0001699402928352356, "reward_change_mean": -0.12975311558693647, "reward_change_min": -0.2050950825214386, "reward_change_std": 0.0803024135529995, "reward_std": 0.5542113371193409, "rewards/cosine_scaled_reward": -0.07373432070016861, "rewards/format_reward": 0.8541666865348816, "step": 457 }, { "advantage_max": 1.5208216905593872, "advantage_mean": -3.601114051399179e-08, "advantage_min": -1.1582210585474968, "advantage_std": 0.9998268038034439, "completion_length": 1149.7708587646484, "epoch": 0.5234285714285715, "grad_norm": 1.9077669382095337, "kl": 0.43023681640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1920622611056974e-07, "loss": 0.0172, "reward": 0.7393535878509283, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7393535878509283, "reward_after_std": 0.6870045587420464, "reward_before_mean": 0.8786043375730515, "reward_before_std": 0.6741609685122967, "reward_change_max": 0.0, "reward_change_mean": -0.13925075996667147, "reward_change_min": -0.22290120273828506, "reward_change_std": 0.08484920859336853, "reward_std": 0.6870045736432076, "rewards/cosine_scaled_reward": -0.039864509366452694, "rewards/format_reward": 0.9583333432674408, "step": 458 }, { "advantage_max": 1.2947088852524757, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -1.2270531356334686, "advantage_std": 0.999851182103157, "completion_length": 1194.8958740234375, "epoch": 0.5245714285714286, "grad_norm": 2.4065475463867188, "kl": 0.529296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1830871145697412e-07, "loss": 0.0212, "reward": 0.8759551551192999, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8759551551192999, "reward_after_std": 0.869237381964922, "reward_before_mean": 1.0280643533915281, "reward_before_std": 0.8942184932529926, "reward_change_max": 0.0, "reward_change_mean": -0.15210918057709932, "reward_change_min": -0.2711542509496212, "reward_change_std": 0.10925045888870955, "reward_std": 0.8692374229431152, "rewards/cosine_scaled_reward": 0.08694883063435555, "rewards/format_reward": 0.854166679084301, "step": 459 }, { "advantage_max": 1.47978987544775, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -1.13826222717762, "advantage_std": 0.9998352378606796, "completion_length": 1906.5417175292969, "epoch": 0.5257142857142857, "grad_norm": 2.8608667850494385, "kl": 1.1624984741210938, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1743223682775649e-07, "loss": 0.0465, "reward": 0.4057202450931072, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4057202450931072, "reward_after_std": 0.7728376425802708, "reward_before_mean": 0.511981688439846, "reward_before_std": 0.7764072492718697, "reward_change_max": 0.0001369267702102661, "reward_change_mean": -0.10626146895810962, "reward_change_min": -0.19404671341180801, "reward_change_std": 0.07556618331000209, "reward_std": 0.772837657481432, "rewards/cosine_scaled_reward": -0.12942582089453936, "rewards/format_reward": 0.7708333469927311, "step": 460 }, { "advantage_max": 1.5734613537788391, "advantage_mean": -2.7318796114172983e-08, "advantage_min": -1.161512367427349, "advantage_std": 0.9997789859771729, "completion_length": 1528.0209045410156, "epoch": 0.5268571428571428, "grad_norm": 1.8937029838562012, "kl": 0.5341567993164062, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1657684494105386e-07, "loss": 0.0214, "reward": 0.5294137634336948, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5294137634336948, "reward_after_std": 0.6099908128380775, "reward_before_mean": 0.6501117488369346, "reward_before_std": 0.592260580509901, "reward_change_max": 0.0, "reward_change_mean": -0.1206979900598526, "reward_change_min": -0.19131910055875778, "reward_change_std": 0.07167271664366126, "reward_std": 0.6099908147007227, "rewards/cosine_scaled_reward": -0.11244414187967777, "rewards/format_reward": 0.8750000149011612, "step": 461 }, { "advantage_max": 1.5587438941001892, "advantage_mean": -5.587935503204022e-09, "advantage_min": -1.1386344656348228, "advantage_std": 0.999779962003231, "completion_length": 1313.5000228881836, "epoch": 0.528, "grad_norm": 2.6109707355499268, "kl": 0.7901763916015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1574257748745986e-07, "loss": 0.0317, "reward": 0.2571336994878948, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2571336994878948, "reward_after_std": 0.6080882269889116, "reward_before_mean": 0.35165482480078936, "reward_before_std": 0.5977811366319656, "reward_change_max": 0.00030838698148727417, "reward_change_mean": -0.094521121121943, "reward_change_min": -0.15098145883530378, "reward_change_std": 0.059791785664856434, "reward_std": 0.6080882381647825, "rewards/cosine_scaled_reward": -0.2304226029664278, "rewards/format_reward": 0.8125000149011612, "step": 462 }, { "advantage_max": 1.3307348042726517, "advantage_mean": -2.3903946239078877e-08, "advantage_min": -1.3292298913002014, "advantage_std": 0.9998722821474075, "completion_length": 1411.1458740234375, "epoch": 0.5291428571428571, "grad_norm": 1.875565528869629, "kl": 0.419036865234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1492947512799328e-07, "loss": 0.0168, "reward": 0.7753808298148215, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7753808298148215, "reward_after_std": 0.9464625902473927, "reward_before_mean": 0.9129773788154125, "reward_before_std": 0.9668588750064373, "reward_change_max": 0.000978812575340271, "reward_change_mean": -0.13759654574096203, "reward_change_min": -0.27825625985860825, "reward_change_std": 0.10437601897865534, "reward_std": 0.9464626125991344, "rewards/cosine_scaled_reward": 0.02940535603556782, "rewards/format_reward": 0.8541666865348816, "step": 463 }, { "advantage_max": 1.618887484073639, "advantage_mean": -2.5766591477127676e-07, "advantage_min": -1.086423322558403, "advantage_std": 0.9997170269489288, "completion_length": 958.895866394043, "epoch": 0.5302857142857142, "grad_norm": 1.9415994882583618, "kl": 0.5035934448242188, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1413757749211602e-07, "loss": 0.0202, "reward": 1.0455838665366173, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.0455838665366173, "reward_after_std": 0.3843573573976755, "reward_before_mean": 1.2220981623977423, "reward_before_std": 0.3369241552427411, "reward_change_max": 0.0, "reward_change_mean": -0.1765143796801567, "reward_change_min": -0.24797379225492477, "reward_change_std": 0.09332408988848329, "reward_std": 0.38435736298561096, "rewards/cosine_scaled_reward": 0.12146575003862381, "rewards/format_reward": 0.9791666716337204, "step": 464 }, { "advantage_max": 1.4793611317873, "advantage_mean": -3.8494666565469515e-08, "advantage_min": -1.103214792907238, "advantage_std": 0.9998826235532761, "completion_length": 1525.7083587646484, "epoch": 0.5314285714285715, "grad_norm": 1.4860286712646484, "kl": 0.501861572265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1336692317580158e-07, "loss": 0.02, "reward": 0.8048709314316511, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8048709314316511, "reward_after_std": 0.9882866255939007, "reward_before_mean": 0.9428267925977707, "reward_before_std": 0.9986936561763287, "reward_change_max": 0.0, "reward_change_mean": -0.1379559077322483, "reward_change_min": -0.25486108660697937, "reward_change_std": 0.09958967566490173, "reward_std": 0.9882866889238358, "rewards/cosine_scaled_reward": 0.013080062344670296, "rewards/format_reward": 0.916666679084301, "step": 465 }, { "advantage_max": 1.4345777779817581, "advantage_mean": -7.885197977897107e-08, "advantage_min": -1.2190702483057976, "advantage_std": 0.9998356327414513, "completion_length": 1442.8541870117188, "epoch": 0.5325714285714286, "grad_norm": 1.3499559164047241, "kl": 0.7032089233398438, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1261754973965422e-07, "loss": 0.0282, "reward": 0.836047200486064, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.836047200486064, "reward_after_std": 0.7958601415157318, "reward_before_mean": 0.9831163268536329, "reward_before_std": 0.7962427716702223, "reward_change_max": 0.0007705315947532654, "reward_change_mean": -0.14706913474947214, "reward_change_min": -0.2674461305141449, "reward_change_std": 0.10129892360419035, "reward_std": 0.795860156416893, "rewards/cosine_scaled_reward": 0.0853081488457974, "rewards/format_reward": 0.812500013038516, "step": 466 }, { "advantage_max": 1.5279236733913422, "advantage_mean": -4.967053546245381e-09, "advantage_min": -1.1415907591581345, "advantage_std": 0.9998429045081139, "completion_length": 1567.8750457763672, "epoch": 0.5337142857142857, "grad_norm": 3.2998692989349365, "kl": 0.63177490234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1188949370707787e-07, "loss": 0.0253, "reward": 0.2946251416578889, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2946251416578889, "reward_after_std": 0.7547971494495869, "reward_before_mean": 0.3902467442676425, "reward_before_std": 0.7570151165127754, "reward_change_max": 0.00019928067922592163, "reward_change_mean": -0.09562160400673747, "reward_change_min": -0.18911895900964737, "reward_change_std": 0.07319759530946612, "reward_std": 0.7547971531748772, "rewards/cosine_scaled_reward": -0.19029329670593143, "rewards/format_reward": 0.7708333432674408, "step": 467 }, { "advantage_max": 1.6056719273328781, "advantage_mean": -1.862645193639878e-08, "advantage_min": -1.0361272692680359, "advantage_std": 0.9998692721128464, "completion_length": 1681.145866394043, "epoch": 0.5348571428571428, "grad_norm": 1.96733820438385, "kl": 1.0497665405273438, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1118279056249653e-07, "loss": 0.042, "reward": 0.25632472475990653, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.25632472475990653, "reward_after_std": 0.8274875283241272, "reward_before_mean": 0.34465243108570576, "reward_before_std": 0.828721173107624, "reward_change_max": 0.0021148771047592163, "reward_change_mean": -0.08832769468426704, "reward_change_min": -0.1855623424053192, "reward_change_std": 0.06841085152700543, "reward_std": 0.8274875581264496, "rewards/cosine_scaled_reward": -0.14017378957942128, "rewards/format_reward": 0.6250000037252903, "step": 468 }, { "advantage_max": 1.6077563017606735, "advantage_mean": -1.5522043039783995e-08, "advantage_min": -1.0340360701084137, "advantage_std": 0.9998153671622276, "completion_length": 1132.208351135254, "epoch": 0.536, "grad_norm": 2.0020787715911865, "kl": 0.48426055908203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1049747474962444e-07, "loss": 0.0194, "reward": 0.5653686504811049, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5653686504811049, "reward_after_std": 0.7147700805217028, "reward_before_mean": 0.6868355348706245, "reward_before_std": 0.6977136358618736, "reward_change_max": 0.0, "reward_change_mean": -0.12146689835935831, "reward_change_min": -0.20983821526169777, "reward_change_std": 0.07952037081122398, "reward_std": 0.7147701065987349, "rewards/cosine_scaled_reward": -0.1044988944195211, "rewards/format_reward": 0.8958333507180214, "step": 469 }, { "advantage_max": 1.4791221618652344, "advantage_mean": -1.9247334059890875e-08, "advantage_min": -1.3737546727061272, "advantage_std": 0.9998335763812065, "completion_length": 1760.1250381469727, "epoch": 0.5371428571428571, "grad_norm": 2.24651837348938, "kl": 1.114990234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0983357966978745e-07, "loss": 0.0447, "reward": 0.2855025250464678, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2855025250464678, "reward_after_std": 0.675163846462965, "reward_before_mean": 0.38193464977666736, "reward_before_std": 0.6749286688864231, "reward_change_max": 0.0002874135971069336, "reward_change_mean": -0.09643213078379631, "reward_change_min": -0.16725542396306992, "reward_change_std": 0.06746510276570916, "reward_std": 0.6751638650894165, "rewards/cosine_scaled_reward": -0.15278268977999687, "rewards/format_reward": 0.6875000186264515, "step": 470 }, { "advantage_max": 1.1827488467097282, "advantage_mean": -2.545615063187512e-08, "advantage_min": -1.522478125989437, "advantage_std": 0.9998515471816063, "completion_length": 1635.645896911621, "epoch": 0.5382857142857143, "grad_norm": 2.085747718811035, "kl": 0.7410888671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0919113768029517e-07, "loss": 0.0297, "reward": 0.8258672105148435, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8258672105148435, "reward_after_std": 0.7812116891145706, "reward_before_mean": 0.9759960640221834, "reward_before_std": 0.806895449757576, "reward_change_max": 0.0, "reward_change_mean": -0.15012879762798548, "reward_change_min": -0.25330642610788345, "reward_change_std": 0.10425947420299053, "reward_std": 0.7812117375433445, "rewards/cosine_scaled_reward": 0.09216467384248972, "rewards/format_reward": 0.7916666865348816, "step": 471 }, { "advantage_max": 1.6919645369052887, "advantage_mean": -2.6077032311278003e-08, "advantage_min": -1.075069934129715, "advantage_std": 0.9997465685009956, "completion_length": 1520.1667098999023, "epoch": 0.5394285714285715, "grad_norm": 1.32899010181427, "kl": 0.627593994140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0857018009286381e-07, "loss": 0.0251, "reward": 0.35412935609929264, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35412935609929264, "reward_after_std": 0.6487985327839851, "reward_before_mean": 0.45631421357393265, "reward_before_std": 0.6299792267382145, "reward_change_max": 0.0, "reward_change_mean": -0.10218486096709967, "reward_change_min": -0.17434173543006182, "reward_change_std": 0.062425535172224045, "reward_std": 0.6487985476851463, "rewards/cosine_scaled_reward": -0.209342903457582, "rewards/format_reward": 0.8750000074505806, "step": 472 }, { "advantage_max": 1.3841595649719238, "advantage_mean": -4.23751782552273e-08, "advantage_min": -1.1973591893911362, "advantage_std": 0.9998006895184517, "completion_length": 1474.208381652832, "epoch": 0.5405714285714286, "grad_norm": 3.4161858558654785, "kl": 0.3667144775390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0797073717209013e-07, "loss": 0.0147, "reward": 0.33687769807875156, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.33687769807875156, "reward_after_std": 0.5530556403100491, "reward_before_mean": 0.44355832412838936, "reward_before_std": 0.5523912459611893, "reward_change_max": 0.0, "reward_change_mean": -0.10668064840137959, "reward_change_min": -0.1907523050904274, "reward_change_std": 0.06848618015646935, "reward_std": 0.5530556440353394, "rewards/cosine_scaled_reward": -0.19488751143217087, "rewards/format_reward": 0.8333333469927311, "step": 473 }, { "advantage_max": 1.3778210431337357, "advantage_mean": -6.705522881400583e-08, "advantage_min": -1.3139918148517609, "advantage_std": 0.9998078420758247, "completion_length": 1363.9375305175781, "epoch": 0.5417142857142857, "grad_norm": 3.2250938415527344, "kl": 0.5262603759765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0739283813397639e-07, "loss": 0.0211, "reward": 1.2284482046961784, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 1.2284482046961784, "reward_after_std": 0.7306374609470367, "reward_before_mean": 1.415264431387186, "reward_before_std": 0.7239528931677341, "reward_change_max": 0.0, "reward_change_mean": -0.18681625835597515, "reward_change_min": -0.2915050946176052, "reward_change_std": 0.11583147803321481, "reward_std": 0.7306374758481979, "rewards/cosine_scaled_reward": 0.2805488705635071, "rewards/format_reward": 0.8541666939854622, "step": 474 }, { "advantage_max": 1.5258950591087341, "advantage_mean": -2.4524827946237338e-08, "advantage_min": -1.0890448316931725, "advantage_std": 0.9997752085328102, "completion_length": 1477.833381652832, "epoch": 0.5428571428571428, "grad_norm": 1.7994341850280762, "kl": 0.28729248046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.068365111445064e-07, "loss": 0.0115, "reward": 0.5002380846999586, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5002380846999586, "reward_after_std": 0.5856530722230673, "reward_before_mean": 0.6214745305478573, "reward_before_std": 0.5763567853718996, "reward_change_max": 0.0, "reward_change_mean": -0.12123645003885031, "reward_change_min": -0.20592597592622042, "reward_change_std": 0.07472612150013447, "reward_std": 0.5856531001627445, "rewards/cosine_scaled_reward": -0.16842940403148532, "rewards/format_reward": 0.9583333432674408, "step": 475 }, { "advantage_max": 1.3964376598596573, "advantage_mean": -2.2351742123838392e-08, "advantage_min": -1.254280962049961, "advantage_std": 0.9997788667678833, "completion_length": 1635.4375610351562, "epoch": 0.544, "grad_norm": 2.187488555908203, "kl": 0.5134124755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.063017833182728e-07, "loss": 0.0205, "reward": 0.6432311162352562, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6432311162352562, "reward_after_std": 0.6332471240311861, "reward_before_mean": 0.7760857921093702, "reward_before_std": 0.6299569476395845, "reward_change_max": 0.00022538751363754272, "reward_change_mean": -0.13285462884232402, "reward_change_min": -0.21562502346932888, "reward_change_std": 0.0833498639985919, "reward_std": 0.6332471258938313, "rewards/cosine_scaled_reward": -0.07029045931994915, "rewards/format_reward": 0.9166666716337204, "step": 476 }, { "advantage_max": 1.7337168902158737, "advantage_mean": 6.2088170160734535e-09, "advantage_min": -1.0874443799257278, "advantage_std": 0.999891571700573, "completion_length": 1237.0625305175781, "epoch": 0.5451428571428572, "grad_norm": 1.326654314994812, "kl": 0.44366455078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0578868071715544e-07, "loss": 0.0178, "reward": 0.8383656330406666, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8383656330406666, "reward_after_std": 0.9872477427124977, "reward_before_mean": 0.9752098955214024, "reward_before_std": 0.9661254063248634, "reward_change_max": 0.0, "reward_change_mean": -0.13684424851089716, "reward_change_min": -0.2305159643292427, "reward_change_std": 0.08920324314385653, "reward_std": 0.9872477427124977, "rewards/cosine_scaled_reward": 0.029271604435052723, "rewards/format_reward": 0.916666679084301, "step": 477 }, { "advantage_max": 1.573157086968422, "advantage_mean": -3.0423204788743163e-08, "advantage_min": -1.1611996442079544, "advantage_std": 0.9998464584350586, "completion_length": 1965.833366394043, "epoch": 0.5462857142857143, "grad_norm": 1.792330265045166, "kl": 0.904815673828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0529722834905125e-07, "loss": 0.0362, "reward": 0.48183274059556425, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.48183274059556425, "reward_after_std": 0.8001515120267868, "reward_before_mean": 0.5936882272362709, "reward_before_std": 0.7990560345351696, "reward_change_max": 0.0, "reward_change_mean": -0.11185550084337592, "reward_change_min": -0.20309627056121826, "reward_change_std": 0.07982863765209913, "reward_std": 0.800151526927948, "rewards/cosine_scaled_reward": -0.06773922825232148, "rewards/format_reward": 0.7291666846722364, "step": 478 }, { "advantage_max": 1.4396943747997284, "advantage_mean": -9.623667751590403e-09, "advantage_min": -1.3289758563041687, "advantage_std": 0.9998451471328735, "completion_length": 1589.4792098999023, "epoch": 0.5474285714285714, "grad_norm": 1.5058865547180176, "kl": 0.708740234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0482745016665526e-07, "loss": 0.0284, "reward": 0.4269852042198181, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4269852042198181, "reward_after_std": 0.8025585748255253, "reward_before_mean": 0.5355709902942181, "reward_before_std": 0.8142498098313808, "reward_change_max": 0.0011701732873916626, "reward_change_mean": -0.10858577489852905, "reward_change_min": -0.19784590601921082, "reward_change_std": 0.08253068244084716, "reward_std": 0.8025586009025574, "rewards/cosine_scaled_reward": -0.12804784905165434, "rewards/format_reward": 0.7916666902601719, "step": 479 }, { "advantage_max": 1.5044909566640854, "advantage_mean": -2.2972623581196672e-08, "advantage_min": -1.1191659942269325, "advantage_std": 0.9997437074780464, "completion_length": 1412.8333587646484, "epoch": 0.5485714285714286, "grad_norm": 1.9218823909759521, "kl": 0.5767822265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0437936906629334e-07, "loss": 0.023, "reward": 0.40811170265078545, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.40811170265078545, "reward_after_std": 0.5391519796103239, "reward_before_mean": 0.5209373664110899, "reward_before_std": 0.5283264443278313, "reward_change_max": 0.0, "reward_change_mean": -0.11282562743872404, "reward_change_min": -0.19051403924822807, "reward_change_std": 0.07031045900657773, "reward_std": 0.5391519945114851, "rewards/cosine_scaled_reward": -0.17703134287148714, "rewards/format_reward": 0.8750000074505806, "step": 480 }, { "advantage_max": 1.4230820909142494, "advantage_mean": -2.297262435835279e-08, "advantage_min": -1.2027820497751236, "advantage_std": 0.9998055920004845, "completion_length": 1643.5833892822266, "epoch": 0.5497142857142857, "grad_norm": 1.3952871561050415, "kl": 0.5593643188476562, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0395300688680625e-07, "loss": 0.0224, "reward": 0.23358649760484695, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.23358649760484695, "reward_after_std": 0.6157410964369774, "reward_before_mean": 0.3300169431604445, "reward_before_std": 0.6253011748194695, "reward_change_max": 0.0006894916296005249, "reward_change_mean": -0.09643045393750072, "reward_change_min": -0.1772970948368311, "reward_change_std": 0.07094644149765372, "reward_std": 0.6157411076128483, "rewards/cosine_scaled_reward": -0.22040820494294167, "rewards/format_reward": 0.7708333544433117, "step": 481 }, { "advantage_max": 1.3363105058670044, "advantage_mean": -3.3527614351491764e-08, "advantage_min": -1.3465068489313126, "advantage_std": 0.9998397678136826, "completion_length": 1521.4792251586914, "epoch": 0.5508571428571428, "grad_norm": 1.705883502960205, "kl": 0.665130615234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0354838440848501e-07, "loss": 0.0266, "reward": 0.8281307835131884, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8281307835131884, "reward_after_std": 0.7850172445178032, "reward_before_mean": 0.9757171748206019, "reward_before_std": 0.7954017668962479, "reward_change_max": 0.0005815252661705017, "reward_change_mean": -0.14758640620857477, "reward_change_min": -0.26257020607590675, "reward_change_std": 0.09904332272708416, "reward_std": 0.785017266869545, "rewards/cosine_scaled_reward": 0.019108579959720373, "rewards/format_reward": 0.9375000149011612, "step": 482 }, { "advantage_max": 1.423334315419197, "advantage_mean": -8.69234451084111e-09, "advantage_min": -1.1823545172810555, "advantage_std": 0.9998378828167915, "completion_length": 1683.3125305175781, "epoch": 0.552, "grad_norm": 2.6206977367401123, "kl": 0.6625442504882812, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0316552135205837e-07, "loss": 0.0265, "reward": 0.7536204941570759, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7536204941570759, "reward_after_std": 0.8240559808909893, "reward_before_mean": 0.8923968635499477, "reward_before_std": 0.8315610997378826, "reward_change_max": 0.0, "reward_change_mean": -0.13877638336271048, "reward_change_min": -0.24121517688035965, "reward_change_std": 0.09393159532919526, "reward_std": 0.8240560032427311, "rewards/cosine_scaled_reward": 0.03994842991232872, "rewards/format_reward": 0.812500013038516, "step": 483 }, { "advantage_max": 1.4809504449367523, "advantage_mean": -4.7187011964489045e-08, "advantage_min": -1.1418846175074577, "advantage_std": 0.9997940585017204, "completion_length": 1221.270851135254, "epoch": 0.5531428571428572, "grad_norm": 1.6962131261825562, "kl": 0.384124755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0280443637773163e-07, "loss": 0.0154, "reward": 0.7981872851960361, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7981872851960361, "reward_after_std": 0.8525705002248287, "reward_before_mean": 0.9407486170530319, "reward_before_std": 0.8583914032205939, "reward_change_max": 0.00044048577547073364, "reward_change_mean": -0.14256139378994703, "reward_change_min": -0.25064817070961, "reward_change_std": 0.09835958620533347, "reward_std": 0.8525705300271511, "rewards/cosine_scaled_reward": 0.001624307595193386, "rewards/format_reward": 0.9375000149011612, "step": 484 }, { "advantage_max": 1.3821987211704254, "advantage_mean": -5.308538825188336e-08, "advantage_min": -1.3728350549936295, "advantage_std": 0.9997694715857506, "completion_length": 1498.8125610351562, "epoch": 0.5542857142857143, "grad_norm": 1.9051717519760132, "kl": 0.82757568359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0246514708427701e-07, "loss": 0.0331, "reward": 0.486922824755311, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.486922824755311, "reward_after_std": 0.6849659271538258, "reward_before_mean": 0.6047742627561092, "reward_before_std": 0.6905794739723206, "reward_change_max": 0.0, "reward_change_mean": -0.11785144358873367, "reward_change_min": -0.20198489725589752, "reward_change_std": 0.08204239793121815, "reward_std": 0.6849659346044064, "rewards/cosine_scaled_reward": -0.12469621049240232, "rewards/format_reward": 0.854166679084301, "step": 485 }, { "advantage_max": 1.7733388990163803, "advantage_mean": -5.0136199525319114e-08, "advantage_min": -0.9555219374597073, "advantage_std": 0.99978306889534, "completion_length": 884.2500305175781, "epoch": 0.5554285714285714, "grad_norm": 2.335850715637207, "kl": 0.38094329833984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0214767000817596e-07, "loss": 0.0152, "reward": 0.5196775365620852, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5196775365620852, "reward_after_std": 0.5223635025322437, "reward_before_mean": 0.6409784676507115, "reward_before_std": 0.4872880391776562, "reward_change_max": 0.0, "reward_change_mean": -0.12130093993619084, "reward_change_min": -0.1774542685598135, "reward_change_std": 0.06820766627788544, "reward_std": 0.5223635211586952, "rewards/cosine_scaled_reward": -0.13784411549568176, "rewards/format_reward": 0.916666679084301, "step": 486 }, { "advantage_max": 1.4368427097797394, "advantage_mean": -4.796311425803168e-08, "advantage_min": -1.1753825396299362, "advantage_std": 0.9998473450541496, "completion_length": 973.0625228881836, "epoch": 0.5565714285714286, "grad_norm": 1.179042100906372, "kl": 0.1561431884765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0185202062281336e-07, "loss": 0.0063, "reward": 1.0191260538995266, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0191260538995266, "reward_after_std": 0.8055990971624851, "reward_before_mean": 1.1827868521213531, "reward_before_std": 0.802632138133049, "reward_change_max": 0.0003419220447540283, "reward_change_mean": -0.1636607814580202, "reward_change_min": -0.28896985203027725, "reward_change_std": 0.10505983280017972, "reward_std": 0.8055991157889366, "rewards/cosine_scaled_reward": 0.13306006882339716, "rewards/format_reward": 0.9166666716337204, "step": 487 }, { "advantage_max": 1.3047830387949944, "advantage_mean": -7.823109782201243e-08, "advantage_min": -1.430648073554039, "advantage_std": 0.9997356534004211, "completion_length": 1126.270839691162, "epoch": 0.5577142857142857, "grad_norm": 1.4515074491500854, "kl": 0.390380859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0157821333772304e-07, "loss": 0.0156, "reward": 0.49460936337709427, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.49460936337709427, "reward_after_std": 0.45185090601444244, "reward_before_mean": 0.6208971869200468, "reward_before_std": 0.4511682763695717, "reward_change_max": 0.0, "reward_change_mean": -0.12628783658146858, "reward_change_min": -0.19421625509858131, "reward_change_std": 0.07442756928503513, "reward_std": 0.4518509153276682, "rewards/cosine_scaled_reward": -0.16871808469295502, "rewards/format_reward": 0.9583333432674408, "step": 488 }, { "advantage_max": 1.4228082448244095, "advantage_mean": -2.359350631531143e-08, "advantage_min": -1.3723071962594986, "advantage_std": 0.9997459053993225, "completion_length": 1585.041706085205, "epoch": 0.5588571428571428, "grad_norm": 1.8846523761749268, "kl": 0.680572509765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.013262614978859e-07, "loss": 0.0272, "reward": 0.3229170944541693, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3229170944541693, "reward_after_std": 0.5093741305172443, "reward_before_mean": 0.4295815769582987, "reward_before_std": 0.5072203408926725, "reward_change_max": 5.799531936645508e-05, "reward_change_mean": -0.10666447039693594, "reward_change_min": -0.17716624028980732, "reward_change_std": 0.06782869761809707, "reward_std": 0.5093741416931152, "rewards/cosine_scaled_reward": -0.20187588641420007, "rewards/format_reward": 0.8333333432674408, "step": 489 }, { "advantage_max": 1.6553240045905113, "advantage_mean": -2.545615163107584e-08, "advantage_min": -1.0602488964796066, "advantage_std": 0.9997842386364937, "completion_length": 1372.4375610351562, "epoch": 0.56, "grad_norm": 1.7037945985794067, "kl": 0.612060546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0109617738307911e-07, "loss": 0.0245, "reward": 0.5970811229199171, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5970811229199171, "reward_after_std": 0.6770407669246197, "reward_before_mean": 0.7228398490697145, "reward_before_std": 0.6587517447769642, "reward_change_max": 0.0, "reward_change_mean": -0.12575868796557188, "reward_change_min": -0.19899218156933784, "reward_change_std": 0.07600692426785827, "reward_std": 0.6770407911390066, "rewards/cosine_scaled_reward": -0.0969134415499866, "rewards/format_reward": 0.9166666716337204, "step": 490 }, { "advantage_max": 1.5155636966228485, "advantage_mean": -1.9247334392957782e-08, "advantage_min": -1.1463100016117096, "advantage_std": 0.9998896941542625, "completion_length": 1695.854232788086, "epoch": 0.5611428571428572, "grad_norm": 1.6566426753997803, "kl": 0.6379241943359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0088797220727779e-07, "loss": 0.0255, "reward": 0.7128791492432356, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7128791492432356, "reward_after_std": 0.9784063994884491, "reward_before_mean": 0.8413580115884542, "reward_before_std": 0.9818304255604744, "reward_change_max": 9.988248348236084e-05, "reward_change_mean": -0.12847886700183153, "reward_change_min": -0.25756747275590897, "reward_change_std": 0.093171376734972, "reward_std": 0.9784064367413521, "rewards/cosine_scaled_reward": -0.03765433467924595, "rewards/format_reward": 0.9166666865348816, "step": 491 }, { "advantage_max": 1.4262542724609375, "advantage_mean": -2.7939677682553565e-08, "advantage_min": -1.191407211124897, "advantage_std": 0.9998179897665977, "completion_length": 1575.583396911621, "epoch": 0.5622857142857143, "grad_norm": 3.479654550552368, "kl": 0.9051971435546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0070165611810855e-07, "loss": 0.0362, "reward": 0.5793958441354334, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5793958441354334, "reward_after_std": 0.6907056048512459, "reward_before_mean": 0.7047333084046841, "reward_before_std": 0.6915153935551643, "reward_change_max": 0.0, "reward_change_mean": -0.1253374693915248, "reward_change_min": -0.2124233189970255, "reward_change_std": 0.08227851102128625, "reward_std": 0.6907056123018265, "rewards/cosine_scaled_reward": -0.07471668440848589, "rewards/format_reward": 0.8541666753590107, "step": 492 }, { "advantage_max": 1.4624339193105698, "advantage_mean": -2.235174201281609e-08, "advantage_min": -1.2118503227829933, "advantage_std": 0.9998716413974762, "completion_length": 1361.3542175292969, "epoch": 0.5634285714285714, "grad_norm": 2.0603668689727783, "kl": 0.7406387329101562, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.005372381963547e-07, "loss": 0.0296, "reward": 0.84825224801898, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.84825224801898, "reward_after_std": 0.9701013043522835, "reward_before_mean": 0.9899251163005829, "reward_before_std": 0.9750980362296104, "reward_change_max": 0.0, "reward_change_mean": -0.14167285151779652, "reward_change_min": -0.2806529551744461, "reward_change_std": 0.10109834838658571, "reward_std": 0.9701013043522835, "rewards/cosine_scaled_reward": 0.057462539232801646, "rewards/format_reward": 0.8750000149011612, "step": 493 }, { "advantage_max": 1.5129066854715347, "advantage_mean": -3.849466811978175e-08, "advantage_min": -1.1654788628220558, "advantage_std": 0.9998171105980873, "completion_length": 1063.9791946411133, "epoch": 0.5645714285714286, "grad_norm": 1.551969289779663, "kl": 0.3380584716796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0039472645551372e-07, "loss": 0.0135, "reward": 0.7001366913318634, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.7001366913318634, "reward_after_std": 0.7168645672500134, "reward_before_mean": 0.8348819054663181, "reward_before_std": 0.7078965455293655, "reward_change_max": 0.0, "reward_change_mean": -0.13474521692842245, "reward_change_min": -0.23480108752846718, "reward_change_std": 0.08507911022752523, "reward_std": 0.7168645933270454, "rewards/cosine_scaled_reward": -0.04089239612221718, "rewards/format_reward": 0.916666679084301, "step": 494 }, { "advantage_max": 1.4147690832614899, "advantage_mean": -7.823109826610164e-08, "advantage_min": -1.255192093551159, "advantage_std": 0.9998476803302765, "completion_length": 1362.250015258789, "epoch": 0.5657142857142857, "grad_norm": 1.8394250869750977, "kl": 0.43761444091796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.002741278414069e-07, "loss": 0.0175, "reward": 0.8606002209708095, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8606002209708095, "reward_after_std": 0.8067436181008816, "reward_before_mean": 1.0086772553622723, "reward_before_std": 0.8072432391345501, "reward_change_max": 0.0, "reward_change_mean": -0.1480770716443658, "reward_change_min": -0.2526344172656536, "reward_change_std": 0.09575034817680717, "reward_std": 0.8067436292767525, "rewards/cosine_scaled_reward": 0.046005279291421175, "rewards/format_reward": 0.916666679084301, "step": 495 }, { "advantage_max": 1.4694621339440346, "advantage_mean": -6.27090555038734e-08, "advantage_min": -1.134453445672989, "advantage_std": 0.9998220652341843, "completion_length": 1408.083366394043, "epoch": 0.5668571428571428, "grad_norm": 2.118278980255127, "kl": 0.7026290893554688, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0017544823184055e-07, "loss": 0.0281, "reward": 0.821080063469708, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.821080063469708, "reward_after_std": 0.70627411454916, "reward_before_mean": 0.968472232343629, "reward_before_std": 0.7007058300077915, "reward_change_max": 0.0, "reward_change_mean": -0.14739223755896091, "reward_change_min": -0.24292783066630363, "reward_change_std": 0.09062009677290916, "reward_std": 0.7062741294503212, "rewards/cosine_scaled_reward": 0.03631945559754968, "rewards/format_reward": 0.8958333395421505, "step": 496 }, { "advantage_max": 1.4108513593673706, "advantage_mean": -1.0927518634407107e-07, "advantage_min": -1.1535490825772285, "advantage_std": 0.9997596219182014, "completion_length": 1099.5833625793457, "epoch": 0.568, "grad_norm": 1.8349206447601318, "kl": 0.5226058959960938, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0009869243631952e-07, "loss": 0.0209, "reward": 0.819844264537096, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.819844264537096, "reward_after_std": 0.4837149791419506, "reward_before_mean": 0.9735658243298531, "reward_before_std": 0.46303862147033215, "reward_change_max": 0.0, "reward_change_mean": -0.1537215718999505, "reward_change_min": -0.2222919762134552, "reward_change_std": 0.08454245794564486, "reward_std": 0.4837149903178215, "rewards/cosine_scaled_reward": -0.013217097148299217, "rewards/format_reward": 1.0, "step": 497 }, { "advantage_max": 1.4655998349189758, "advantage_mean": -3.10440866346795e-08, "advantage_min": -1.3093776553869247, "advantage_std": 0.9998064488172531, "completion_length": 1443.1875305175781, "epoch": 0.5691428571428572, "grad_norm": 1.950182557106018, "kl": 0.6890029907226562, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.000438641958131e-07, "loss": 0.0276, "reward": 0.593856418505311, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.593856418505311, "reward_after_std": 0.7217038404196501, "reward_before_mean": 0.719802012128639, "reward_before_std": 0.7204098887741566, "reward_change_max": 0.0, "reward_change_mean": -0.125945626758039, "reward_change_min": -0.20720598101615906, "reward_change_std": 0.08468122826889157, "reward_std": 0.721703888848424, "rewards/cosine_scaled_reward": -0.06718233320862055, "rewards/format_reward": 0.854166679084301, "step": 498 }, { "advantage_max": 1.6709688156843185, "advantage_mean": -3.663202252646158e-08, "advantage_min": -1.1494032144546509, "advantage_std": 0.9997295960783958, "completion_length": 1472.3958740234375, "epoch": 0.5702857142857143, "grad_norm": 1.0532234907150269, "kl": 0.49327850341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0001096618257236e-07, "loss": 0.0197, "reward": 0.5505574708804488, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5505574708804488, "reward_after_std": 0.6247312221676111, "reward_before_mean": 0.6727077215909958, "reward_before_std": 0.6021678037941456, "reward_change_max": 0.0, "reward_change_mean": -0.12215026002377272, "reward_change_min": -0.18972956016659737, "reward_change_std": 0.0723155359737575, "reward_std": 0.6247312305495143, "rewards/cosine_scaled_reward": -0.14281281549483538, "rewards/format_reward": 0.9583333432674408, "step": 499 }, { "advantage_max": 1.541480839252472, "advantage_mean": -5.587935614226325e-09, "advantage_min": -1.154192365705967, "advantage_std": 0.9998290911316872, "completion_length": 1295.6250610351562, "epoch": 0.5714285714285714, "grad_norm": 1.619492769241333, "kl": 0.4624786376953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1e-07, "loss": 0.0185, "reward": 0.6967308446764946, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6967308446764946, "reward_after_std": 0.736558023840189, "reward_before_mean": 0.8310529440641403, "reward_before_std": 0.7277755029499531, "reward_change_max": 0.0, "reward_change_mean": -0.13432206492871046, "reward_change_min": -0.2356853261590004, "reward_change_std": 0.0854858374223113, "reward_std": 0.7365580387413502, "rewards/cosine_scaled_reward": -0.053223551250994205, "rewards/format_reward": 0.9375000074505806, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0055785658547727055, "train_runtime": 58514.5956, "train_samples_per_second": 0.41, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }