diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.5284520238637924, + "advantage_mean": 5.587935225648266e-09, + "advantage_min": -1.2071861922740936, + "advantage_std": 0.9998291358351707, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.1954135149717331, + "kl": 0.0, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2e-08, + "loss": -0.0, + "reward": 0.2781674414873123, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2781674414873123, + "reward_after_std": 0.7989529222249985, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.00028071552515029907, + "reward_change_mean": -0.2115972964093089, + "reward_change_min": -0.4146200343966484, + "reward_change_std": 0.16823832830414176, + "reward_std": 0.7989529222249985, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 1.440006211400032, + "advantage_mean": 1.8005570590062803e-08, + "advantage_min": -1.196444258093834, + "advantage_std": 0.9997469857335091, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.18144740164279938, + "kl": 0.0, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4e-08, + "loss": -0.0, + "reward": 0.07961943745613098, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07961943745613098, + "reward_after_std": 0.42851265892386436, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.0006531104445457458, + "reward_change_mean": -0.19577811146155, + "reward_change_min": -0.3188221678137779, + "reward_change_std": 0.13006281899288297, + "reward_std": 0.42851267009973526, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 1.746221885085106, + "advantage_mean": 4.2219957641087547e-08, + "advantage_min": -1.0095228850841522, + "advantage_std": 0.9996554180979729, + "completion_length": 3243.9166870117188, + "epoch": 0.0034285714285714284, + "grad_norm": 0.16465424001216888, + "kl": 4.291534423828125e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": -0.3449444258585572, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3449444258585572, + "reward_after_std": 0.45981596130877733, + "reward_before_mean": -0.24710791744291782, + "reward_before_std": 0.44803297333419323, + "reward_change_max": 0.0, + "reward_change_mean": -0.09783650934696198, + "reward_change_min": -0.18044066429138184, + "reward_change_std": 0.0714771922212094, + "reward_std": 0.4598159771412611, + "rewards/cosine_scaled_reward": -0.21730396151542664, + "rewards/format_reward": 0.1875000074505806, + "step": 3 + }, + { + "advantage_max": 1.7801509350538254, + "advantage_mean": 4.346172077784871e-08, + "advantage_min": -0.9519815891981125, + "advantage_std": 0.9998303577303886, + "completion_length": 2360.3333740234375, + "epoch": 0.004571428571428572, + "grad_norm": 0.19501709938049316, + "kl": 3.4168362617492676e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": 0.1274346588179469, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1274346588179469, + "reward_after_std": 0.8090842105448246, + "reward_before_mean": 0.29423846723511815, + "reward_before_std": 0.7769927233457565, + "reward_change_max": 0.0007443279027938843, + "reward_change_mean": -0.1668038028292358, + "reward_change_min": -0.27894750237464905, + "reward_change_std": 0.10436896700412035, + "reward_std": 0.8090842328965664, + "rewards/cosine_scaled_reward": -0.14454743452370167, + "rewards/format_reward": 0.5833333414047956, + "step": 4 + }, + { + "advantage_max": 1.8142333924770355, + "advantage_mean": 2.8560559472978753e-08, + "advantage_min": -0.8820345476269722, + "advantage_std": 0.9997325539588928, + "completion_length": 3316.5833435058594, + "epoch": 0.005714285714285714, + "grad_norm": 0.29815995693206787, + "kl": 4.64441254734993e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": -0.4195337798446417, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4195337798446417, + "reward_after_std": 0.5661724396049976, + "reward_before_mean": -0.3463286105543375, + "reward_before_std": 0.5550692845135927, + "reward_change_max": 5.933642387390137e-05, + "reward_change_mean": -0.07320518855703995, + "reward_change_min": -0.15249120816588402, + "reward_change_std": 0.05859863373916596, + "reward_std": 0.5661724433302879, + "rewards/cosine_scaled_reward": -0.2564976374414982, + "rewards/format_reward": 0.16666667349636555, + "step": 5 + }, + { + "advantage_max": 1.6999807804822922, + "advantage_mean": 2.173086377510458e-09, + "advantage_min": -1.0560009852051735, + "advantage_std": 0.9997874870896339, + "completion_length": 3148.7083740234375, + "epoch": 0.006857142857142857, + "grad_norm": 0.19052845239639282, + "kl": 4.4211745262145996e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": -0.17428898997604847, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.17428898997604847, + "reward_after_std": 0.7152512725442648, + "reward_before_mean": -0.060131706297397614, + "reward_before_std": 0.7098329775035381, + "reward_change_max": 0.00010123848915100098, + "reward_change_mean": -0.1141572711057961, + "reward_change_min": -0.2221994549036026, + "reward_change_std": 0.09262027451768517, + "reward_std": 0.7152512930333614, + "rewards/cosine_scaled_reward": -0.1446491980459541, + "rewards/format_reward": 0.22916667349636555, + "step": 6 + }, + { + "advantage_max": 1.6318527460098267, + "advantage_mean": 8.692344177774203e-09, + "advantage_min": -1.0107521638274193, + "advantage_std": 0.9998287558555603, + "completion_length": 3002.0209045410156, + "epoch": 0.008, + "grad_norm": 0.1495773047208786, + "kl": 2.2746622562408447e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4e-07, + "loss": 0.0, + "reward": 0.2590144984424114, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2590144984424114, + "reward_after_std": 0.7710397019982338, + "reward_before_mean": 0.465796678327024, + "reward_before_std": 0.7602494731545448, + "reward_change_max": 0.00027186423540115356, + "reward_change_mean": -0.20678219757974148, + "reward_change_min": -0.4025501310825348, + "reward_change_std": 0.1585400952026248, + "reward_std": 0.7710397355258465, + "rewards/cosine_scaled_reward": -0.06918500177562237, + "rewards/format_reward": 0.6041666753590107, + "step": 7 + }, + { + "advantage_max": 1.5791463106870651, + "advantage_mean": -2.235174290099451e-08, + "advantage_min": -1.0492639392614365, + "advantage_std": 0.9998195469379425, + "completion_length": 2731.458366394043, + "epoch": 0.009142857142857144, + "grad_norm": 0.16950403153896332, + "kl": 2.017989754676819e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6e-07, + "loss": 0.0, + "reward": 0.33847012650221586, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33847012650221586, + "reward_after_std": 0.8393780812621117, + "reward_before_mean": 0.5573682561516762, + "reward_before_std": 0.8381953444331884, + "reward_change_max": 0.0001522451639175415, + "reward_change_mean": -0.21889818459749222, + "reward_change_min": -0.3979425001889467, + "reward_change_std": 0.17012304533272982, + "reward_std": 0.839378122240305, + "rewards/cosine_scaled_reward": 0.0807674679235788, + "rewards/format_reward": 0.39583333767950535, + "step": 8 + }, + { + "advantage_max": 1.5046225041151047, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -1.0348439291119576, + "advantage_std": 0.999826692044735, + "completion_length": 3279.3959350585938, + "epoch": 0.010285714285714285, + "grad_norm": 0.1702805757522583, + "kl": 4.1447579860687256e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": 0.04321199515834451, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04321199515834451, + "reward_after_std": 0.86030338332057, + "reward_before_mean": 0.20256559806875885, + "reward_before_std": 0.9098124578595161, + "reward_change_max": 0.0004805624485015869, + "reward_change_mean": -0.15935362223535776, + "reward_change_min": -0.4127720557153225, + "reward_change_std": 0.16414799448102713, + "reward_std": 0.8603034280240536, + "rewards/cosine_scaled_reward": -0.07580053666606545, + "rewards/format_reward": 0.3541666716337204, + "step": 9 + }, + { + "advantage_max": 1.5961240381002426, + "advantage_mean": 1.8005570145973593e-08, + "advantage_min": -0.9946450442075729, + "advantage_std": 0.999793753027916, + "completion_length": 2836.2083740234375, + "epoch": 0.011428571428571429, + "grad_norm": 0.25149303674697876, + "kl": 2.4536624550819397e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2e-07, + "loss": 0.0, + "reward": -0.18135902285575867, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.18135902285575867, + "reward_after_std": 0.6317292973399162, + "reward_before_mean": -0.05718802846968174, + "reward_before_std": 0.647784736007452, + "reward_change_max": 0.0007606744766235352, + "reward_change_mean": -0.12417101149912924, + "reward_change_min": -0.2580604609102011, + "reward_change_std": 0.10907016729470342, + "reward_std": 0.6317293085157871, + "rewards/cosine_scaled_reward": -0.17442734353244305, + "rewards/format_reward": 0.29166667349636555, + "step": 10 + }, + { + "advantage_max": 1.539865881204605, + "advantage_mean": 5.401670999383157e-08, + "advantage_min": -1.039774589240551, + "advantage_std": 0.999774768948555, + "completion_length": 3389.2083740234375, + "epoch": 0.012571428571428572, + "grad_norm": 0.18692435324192047, + "kl": 3.342330455780029e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "reward": -0.32752789044752717, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.32752789044752717, + "reward_after_std": 0.6646033525466919, + "reward_before_mean": -0.23555048741400242, + "reward_before_std": 0.6987365372478962, + "reward_change_max": 0.0015906840562820435, + "reward_change_mean": -0.09197739418596029, + "reward_change_min": -0.23515602201223373, + "reward_change_std": 0.10101438639685512, + "reward_std": 0.664603378623724, + "rewards/cosine_scaled_reward": -0.19069190602749586, + "rewards/format_reward": 0.1458333358168602, + "step": 11 + }, + { + "advantage_max": 1.5572585463523865, + "advantage_mean": -2.1109978043387656e-08, + "advantage_min": -1.1109998114407063, + "advantage_std": 0.9998391792178154, + "completion_length": 2623.2083740234375, + "epoch": 0.013714285714285714, + "grad_norm": 0.19698189198970795, + "kl": 4.7206878662109375e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.4e-07, + "loss": 0.0, + "reward": 0.22108511440455914, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22108511440455914, + "reward_after_std": 0.7672835923731327, + "reward_before_mean": 0.4219110906124115, + "reward_before_std": 0.7882883995771408, + "reward_change_max": 0.0005475208163261414, + "reward_change_mean": -0.20082599110901356, + "reward_change_min": -0.38023688457906246, + "reward_change_std": 0.15198138437699527, + "reward_std": 0.7672836184501648, + "rewards/cosine_scaled_reward": -0.07029447052627802, + "rewards/format_reward": 0.562500013038516, + "step": 12 + }, + { + "advantage_max": 1.743264377117157, + "advantage_mean": 8.071463275527435e-09, + "advantage_min": -0.9408519268035889, + "advantage_std": 0.9997067749500275, + "completion_length": 3043.0833740234375, + "epoch": 0.014857142857142857, + "grad_norm": 0.21475082635879517, + "kl": 3.6597251892089844e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": 0.07958985678851604, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07958985678851604, + "reward_after_std": 0.45133444853127, + "reward_before_mean": 0.269378375262022, + "reward_before_std": 0.4202164653688669, + "reward_change_max": 0.00046546757221221924, + "reward_change_mean": -0.18978853151202202, + "reward_change_min": -0.3052454721182585, + "reward_change_std": 0.11908666882663965, + "reward_std": 0.45133446156978607, + "rewards/cosine_scaled_reward": -0.07364415284246206, + "rewards/format_reward": 0.41666667349636555, + "step": 13 + }, + { + "advantage_max": 1.5736165642738342, + "advantage_mean": -1.490116174895917e-08, + "advantage_min": -1.0592405423521996, + "advantage_std": 0.9998423829674721, + "completion_length": 2943.6458740234375, + "epoch": 0.016, + "grad_norm": 0.1676233559846878, + "kl": 2.8867274522781372e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.8e-07, + "loss": 0.0, + "reward": 0.042353540658950806, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.042353540658950806, + "reward_after_std": 0.8170745447278023, + "reward_before_mean": 0.19811346009373665, + "reward_before_std": 0.8294630981981754, + "reward_change_max": 0.0007490590214729309, + "reward_change_mean": -0.1557599287480116, + "reward_change_min": -0.3378952704370022, + "reward_change_std": 0.1310901055112481, + "reward_std": 0.8170745633542538, + "rewards/cosine_scaled_reward": -0.08844327414408326, + "rewards/format_reward": 0.37500000931322575, + "step": 14 + }, + { + "advantage_max": 1.7620093375444412, + "advantage_mean": -6.767610916114108e-08, + "advantage_min": -0.9291465580463409, + "advantage_std": 0.9997983947396278, + "completion_length": 2803.687530517578, + "epoch": 0.017142857142857144, + "grad_norm": 0.20531098544597626, + "kl": 3.5099685192108154e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": 0.22886049561202526, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22886049561202526, + "reward_after_std": 0.6503882668912411, + "reward_before_mean": 0.43278513848781586, + "reward_before_std": 0.6143735107034445, + "reward_change_max": 0.0001234114170074463, + "reward_change_mean": -0.20392468804493546, + "reward_change_min": -0.33097884617745876, + "reward_change_std": 0.1261005480773747, + "reward_std": 0.6503882892429829, + "rewards/cosine_scaled_reward": 0.008059246152697597, + "rewards/format_reward": 0.4166666679084301, + "step": 15 + }, + { + "advantage_max": 1.6819724440574646, + "advantage_mean": 3.352761290820183e-08, + "advantage_min": -0.9239891991019249, + "advantage_std": 0.9997138306498528, + "completion_length": 3483.0625, + "epoch": 0.018285714285714287, + "grad_norm": 0.1682252585887909, + "kl": 3.56920063495636e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.2e-07, + "loss": 0.0, + "reward": -0.5018233098089695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5018233098089695, + "reward_after_std": 0.5142541136592627, + "reward_before_mean": -0.43988440558314323, + "reward_before_std": 0.524260614067316, + "reward_change_max": 0.001581817865371704, + "reward_change_mean": -0.061938912025652826, + "reward_change_min": -0.14432105235755444, + "reward_change_std": 0.059407457476481795, + "reward_std": 0.514254117384553, + "rewards/cosine_scaled_reward": -0.27202554512768984, + "rewards/format_reward": 0.10416666977107525, + "step": 16 + }, + { + "advantage_max": 1.6719516217708588, + "advantage_mean": 9.313226079221693e-09, + "advantage_min": -1.0224130228161812, + "advantage_std": 0.9998493865132332, + "completion_length": 2241.5208778381348, + "epoch": 0.019428571428571427, + "grad_norm": 0.26219215989112854, + "kl": 4.7460198402404785e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "reward": 0.26397125981748104, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.26397125981748104, + "reward_after_std": 0.8621900156140327, + "reward_before_mean": 0.4611313515342772, + "reward_before_std": 0.8517364151775837, + "reward_change_max": 0.0003229379653930664, + "reward_change_mean": -0.19716009264811873, + "reward_change_min": -0.382378987967968, + "reward_change_std": 0.14341816492378712, + "reward_std": 0.8621900603175163, + "rewards/cosine_scaled_reward": -0.06110099982470274, + "rewards/format_reward": 0.5833333358168602, + "step": 17 + }, + { + "advantage_max": 1.6789888739585876, + "advantage_mean": 3.601114006990258e-08, + "advantage_min": -1.0117030963301659, + "advantage_std": 0.9998209774494171, + "completion_length": 2828.041702270508, + "epoch": 0.02057142857142857, + "grad_norm": 0.1963949203491211, + "kl": 2.551823854446411e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": 0.038342010229825974, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.038342010229825974, + "reward_after_std": 0.7603168785572052, + "reward_before_mean": 0.1992601379752159, + "reward_before_std": 0.7735672742128372, + "reward_change_max": 0.0, + "reward_change_mean": -0.16091809794306755, + "reward_change_min": -0.3329150825738907, + "reward_change_std": 0.13252681493759155, + "reward_std": 0.7603169232606888, + "rewards/cosine_scaled_reward": -0.11911993799731135, + "rewards/format_reward": 0.4375000111758709, + "step": 18 + }, + { + "advantage_max": 1.4660861641168594, + "advantage_mean": 3.8494665677291096e-08, + "advantage_min": -1.297618992626667, + "advantage_std": 0.9997608736157417, + "completion_length": 2957.3125610351562, + "epoch": 0.021714285714285714, + "grad_norm": 0.20244932174682617, + "kl": 3.133341670036316e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "reward": 0.1908008144237101, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1908008144237101, + "reward_after_std": 0.654106542468071, + "reward_before_mean": 0.3968064859509468, + "reward_before_std": 0.6879367008805275, + "reward_change_max": 0.0, + "reward_change_mean": -0.20600564847700298, + "reward_change_min": -0.386847285553813, + "reward_change_std": 0.1555578326806426, + "reward_std": 0.6541065834462643, + "rewards/cosine_scaled_reward": 0.021319888532161713, + "rewards/format_reward": 0.3541666716337204, + "step": 19 + }, + { + "advantage_max": 1.4845254868268967, + "advantage_mean": 5.587935891782081e-09, + "advantage_min": -1.1302195489406586, + "advantage_std": 0.9997977316379547, + "completion_length": 2531.250068664551, + "epoch": 0.022857142857142857, + "grad_norm": 0.2369108349084854, + "kl": 2.1103769540786743e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.2883046194911003, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2883046194911003, + "reward_after_std": 0.7216798197478056, + "reward_before_mean": 0.5108506195247173, + "reward_before_std": 0.760659109801054, + "reward_change_max": 0.0005205795168876648, + "reward_change_mean": -0.22254599630832672, + "reward_change_min": -0.4257675837725401, + "reward_change_std": 0.1746676228940487, + "reward_std": 0.7216798420995474, + "rewards/cosine_scaled_reward": -0.025824700482189655, + "rewards/format_reward": 0.5625000093132257, + "step": 20 + }, + { + "advantage_max": 1.5942478775978088, + "advantage_mean": -1.9247333504779363e-08, + "advantage_min": -1.0359741151332855, + "advantage_std": 0.999727338552475, + "completion_length": 2936.229202270508, + "epoch": 0.024, + "grad_norm": 0.21686924993991852, + "kl": 4.1641294956207275e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "reward": 0.0217365100979805, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.0217365100979805, + "reward_after_std": 0.4158258568495512, + "reward_before_mean": 0.20216498710215092, + "reward_before_std": 0.39231848157942295, + "reward_change_max": 0.0008887350559234619, + "reward_change_mean": -0.18042848724871874, + "reward_change_min": -0.28633993305265903, + "reward_change_std": 0.11420578742399812, + "reward_std": 0.41582586988806725, + "rewards/cosine_scaled_reward": -0.08641753438860178, + "rewards/format_reward": 0.37500000558793545, + "step": 21 + }, + { + "advantage_max": 1.4737742841243744, + "advantage_mean": -3.91155481338501e-08, + "advantage_min": -1.2801623418927193, + "advantage_std": 0.9998248517513275, + "completion_length": 1926.7083587646484, + "epoch": 0.025142857142857144, + "grad_norm": 0.34481367468833923, + "kl": 2.0101666450500488e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "reward": 0.5544739328324795, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5544739328324795, + "reward_after_std": 0.6403865702450275, + "reward_before_mean": 0.8353974921628833, + "reward_before_std": 0.6390982866287231, + "reward_change_max": 0.0, + "reward_change_mean": -0.2809235444292426, + "reward_change_min": -0.43336551636457443, + "reward_change_std": 0.17704242002218962, + "reward_std": 0.6403866037726402, + "rewards/cosine_scaled_reward": 0.05311539862304926, + "rewards/format_reward": 0.729166679084301, + "step": 22 + }, + { + "advantage_max": 1.7071952670812607, + "advantage_mean": 1.7384688244526103e-08, + "advantage_min": -1.0333659648895264, + "advantage_std": 0.9998112097382545, + "completion_length": 2552.354232788086, + "epoch": 0.026285714285714287, + "grad_norm": 0.1796521246433258, + "kl": 2.888031303882599e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.6e-07, + "loss": 0.0, + "reward": 0.27095113415271044, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.27095113415271044, + "reward_after_std": 0.8931148834526539, + "reward_before_mean": 0.46421122178435326, + "reward_before_std": 0.8784515354782343, + "reward_change_max": 0.0, + "reward_change_mean": -0.19326008297502995, + "reward_change_min": -0.35773650370538235, + "reward_change_std": 0.13798850402235985, + "reward_std": 0.8931148983538151, + "rewards/cosine_scaled_reward": -0.038727725856006145, + "rewards/format_reward": 0.5416666846722364, + "step": 23 + }, + { + "advantage_max": 1.6876031756401062, + "advantage_mean": 1.8005570812107408e-08, + "advantage_min": -0.9738664329051971, + "advantage_std": 0.9998361021280289, + "completion_length": 2762.0208892822266, + "epoch": 0.027428571428571427, + "grad_norm": 0.2101898491382599, + "kl": 2.544000744819641e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.8e-07, + "loss": 0.0, + "reward": 0.345141158439219, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.345141158439219, + "reward_after_std": 0.9434406235814095, + "reward_before_mean": 0.5552112711593509, + "reward_before_std": 0.9572884701192379, + "reward_change_max": 0.0001425519585609436, + "reward_change_mean": -0.2100701043382287, + "reward_change_min": -0.41364777088165283, + "reward_change_std": 0.16276149917393923, + "reward_std": 0.943440642207861, + "rewards/cosine_scaled_reward": 0.017188958823680878, + "rewards/format_reward": 0.5208333358168602, + "step": 24 + }, + { + "advantage_max": 1.6947638243436813, + "advantage_mean": 2.9491882130860958e-08, + "advantage_min": -1.0033902749419212, + "advantage_std": 0.9997927024960518, + "completion_length": 2592.041702270508, + "epoch": 0.02857142857142857, + "grad_norm": 0.1946055144071579, + "kl": 2.8666399884968996e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": -0.019655877724289894, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.019655877724289894, + "reward_after_std": 0.6330535672605038, + "reward_before_mean": 0.13670190423727036, + "reward_before_std": 0.6341910995543003, + "reward_change_max": 0.0004054456949234009, + "reward_change_mean": -0.15635776380077004, + "reward_change_min": -0.29186590760946274, + "reward_change_std": 0.11788261751644313, + "reward_std": 0.6330535747110844, + "rewards/cosine_scaled_reward": -0.13998239114880562, + "rewards/format_reward": 0.4166666716337204, + "step": 25 + }, + { + "advantage_max": 1.4772356897592545, + "advantage_mean": 1.8936893386722886e-08, + "advantage_min": -1.2885529696941376, + "advantage_std": 0.9997257962822914, + "completion_length": 2979.750045776367, + "epoch": 0.029714285714285714, + "grad_norm": 0.15528684854507446, + "kl": 2.7121976017951965e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.2e-07, + "loss": 0.0, + "reward": 0.18640884256456047, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.18640884256456047, + "reward_after_std": 0.5655598230659962, + "reward_before_mean": 0.3976056035608053, + "reward_before_std": 0.5835991930216551, + "reward_change_max": 0.0, + "reward_change_mean": -0.211196749471128, + "reward_change_min": -0.35364590398967266, + "reward_change_std": 0.14743430353701115, + "reward_std": 0.5655598565936089, + "rewards/cosine_scaled_reward": -0.009530545212328434, + "rewards/format_reward": 0.4166666716337204, + "step": 26 + }, + { + "advantage_max": 1.5101052522659302, + "advantage_mean": -3.725290853573426e-09, + "advantage_min": -1.162248358130455, + "advantage_std": 0.9997957795858383, + "completion_length": 3105.395866394043, + "epoch": 0.030857142857142857, + "grad_norm": 0.20341047644615173, + "kl": 3.5478733479976654e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.4e-07, + "loss": 0.0, + "reward": -0.07532213162630796, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.07532213162630796, + "reward_after_std": 0.6800261065363884, + "reward_before_mean": 0.071015989407897, + "reward_before_std": 0.7142519578337669, + "reward_change_max": 0.00037226080894470215, + "reward_change_mean": -0.1463381163775921, + "reward_change_min": -0.3533896040171385, + "reward_change_std": 0.13723576348274946, + "reward_std": 0.6800261326134205, + "rewards/cosine_scaled_reward": -0.13115867972373962, + "rewards/format_reward": 0.33333333767950535, + "step": 27 + }, + { + "advantage_max": 1.573099598288536, + "advantage_mean": 3.849466723160333e-08, + "advantage_min": -1.1150267273187637, + "advantage_std": 0.9998030215501785, + "completion_length": 2894.7708740234375, + "epoch": 0.032, + "grad_norm": 0.18499167263507843, + "kl": 3.559887409210205e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.6e-07, + "loss": 0.0, + "reward": 0.16924337297677994, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.16924337297677994, + "reward_after_std": 0.8595078736543655, + "reward_before_mean": 0.3546360591426492, + "reward_before_std": 0.8972543813288212, + "reward_change_max": 0.00022292882204055786, + "reward_change_mean": -0.1853926726616919, + "reward_change_min": -0.4034426249563694, + "reward_change_std": 0.16123890411108732, + "reward_std": 0.8595079220831394, + "rewards/cosine_scaled_reward": -0.031015303684398532, + "rewards/format_reward": 0.41666667349636555, + "step": 28 + }, + { + "advantage_max": 1.6571213752031326, + "advantage_mean": 2.60770322002557e-08, + "advantage_min": -1.023493006825447, + "advantage_std": 0.9997655674815178, + "completion_length": 3431.125030517578, + "epoch": 0.03314285714285714, + "grad_norm": 0.16913940012454987, + "kl": 2.547353506088257e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.8e-07, + "loss": 0.0, + "reward": -0.41504489071667194, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.41504489071667194, + "reward_after_std": 0.5817391686141491, + "reward_before_mean": -0.3383926101960242, + "reward_before_std": 0.5976084098219872, + "reward_change_max": 0.0009746253490447998, + "reward_change_mean": -0.07665228494443, + "reward_change_min": -0.1926496997475624, + "reward_change_std": 0.07682911981828511, + "reward_std": 0.58173917979002, + "rewards/cosine_scaled_reward": -0.24211297556757927, + "rewards/format_reward": 0.1458333358168602, + "step": 29 + }, + { + "advantage_max": 1.5788090974092484, + "advantage_mean": 3.073364673866763e-08, + "advantage_min": -1.1611577719449997, + "advantage_std": 0.999844379723072, + "completion_length": 3024.166732788086, + "epoch": 0.03428571428571429, + "grad_norm": 0.1804714798927307, + "kl": 2.2821128368377686e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6e-07, + "loss": 0.0, + "reward": 0.16531530115753412, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.16531530115753412, + "reward_after_std": 0.9392066784203053, + "reward_before_mean": 0.3431825675070286, + "reward_before_std": 0.977239266037941, + "reward_change_max": 0.00046259909868240356, + "reward_change_mean": -0.1778672719374299, + "reward_change_min": -0.3380998335778713, + "reward_change_std": 0.1529026017524302, + "reward_std": 0.9392067044973373, + "rewards/cosine_scaled_reward": -0.02632538042962551, + "rewards/format_reward": 0.39583334513008595, + "step": 30 + }, + { + "advantage_max": 1.565725862979889, + "advantage_mean": 5.0291419750880806e-08, + "advantage_min": -1.1075504496693611, + "advantage_std": 0.9997828751802444, + "completion_length": 3189.791702270508, + "epoch": 0.03542857142857143, + "grad_norm": 0.17691554129123688, + "kl": 1.4291144907474518e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.2e-07, + "loss": 0.0, + "reward": -0.2586498372256756, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2586498372256756, + "reward_after_std": 0.6254929751157761, + "reward_before_mean": -0.14998156733054202, + "reward_before_std": 0.6498836800456047, + "reward_change_max": 0.0, + "reward_change_mean": -0.10866825701668859, + "reward_change_min": -0.2638713177293539, + "reward_change_std": 0.10512422723695636, + "reward_std": 0.6254929825663567, + "rewards/cosine_scaled_reward": -0.1791574526578188, + "rewards/format_reward": 0.2083333395421505, + "step": 31 + }, + { + "advantage_max": 1.6380120068788528, + "advantage_mean": 4.0046871152554786e-08, + "advantage_min": -1.063338615000248, + "advantage_std": 0.9998109042644501, + "completion_length": 3098.2708892822266, + "epoch": 0.036571428571428574, + "grad_norm": 0.1579563021659851, + "kl": 2.1046027541160583e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.4e-07, + "loss": 0.0, + "reward": 0.20158327370882034, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20158327370882034, + "reward_after_std": 0.6476054154336452, + "reward_before_mean": 0.40322505310177803, + "reward_before_std": 0.6246501244604588, + "reward_change_max": 0.0005530714988708496, + "reward_change_mean": -0.20164170674979687, + "reward_change_min": -0.3284043725579977, + "reward_change_std": 0.13300398597493768, + "reward_std": 0.6476054154336452, + "rewards/cosine_scaled_reward": 0.014112494885921478, + "rewards/format_reward": 0.3750000111758709, + "step": 32 + }, + { + "advantage_max": 1.749614492058754, + "advantage_mean": 3.10440866346795e-08, + "advantage_min": -0.8797961287200451, + "advantage_std": 0.9998220503330231, + "completion_length": 3210.875030517578, + "epoch": 0.037714285714285714, + "grad_norm": 0.18192623555660248, + "kl": 2.9414892196655273e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.6e-07, + "loss": 0.0, + "reward": -0.229713948443532, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.229713948443532, + "reward_after_std": 0.8090166114270687, + "reward_before_mean": -0.1334769814275205, + "reward_before_std": 0.8181703835725784, + "reward_change_max": 0.0003531351685523987, + "reward_change_mean": -0.09623696561902761, + "reward_change_min": -0.2311191949993372, + "reward_change_std": 0.09210817841812968, + "reward_std": 0.8090166114270687, + "rewards/cosine_scaled_reward": -0.1917384904809296, + "rewards/format_reward": 0.2500000074505806, + "step": 33 + }, + { + "advantage_max": 1.3934744000434875, + "advantage_mean": -2.0489096863585132e-08, + "advantage_min": -1.2853097319602966, + "advantage_std": 0.9998067542910576, + "completion_length": 2512.229202270508, + "epoch": 0.038857142857142854, + "grad_norm": 0.23543889820575714, + "kl": 3.3546239137649536e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "reward": 0.5275341346859932, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5275341346859932, + "reward_after_std": 0.8143299408257008, + "reward_before_mean": 0.7970656305551529, + "reward_before_std": 0.8669828977435827, + "reward_change_max": 0.0003156885504722595, + "reward_change_mean": -0.26953150518238544, + "reward_change_min": -0.4885811097919941, + "reward_change_std": 0.20852079056203365, + "reward_std": 0.8143299594521523, + "rewards/cosine_scaled_reward": 0.1276994850486517, + "rewards/format_reward": 0.541666679084301, + "step": 34 + }, + { + "advantage_max": 1.5553712397813797, + "advantage_mean": 5.0912303040107076e-08, + "advantage_min": -1.1710423156619072, + "advantage_std": 0.9997952580451965, + "completion_length": 3039.5625534057617, + "epoch": 0.04, + "grad_norm": 0.2202300876379013, + "kl": 4.0199607610702515e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7e-07, + "loss": 0.0, + "reward": -0.0422150120139122, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0422150120139122, + "reward_after_std": 0.7378262039273977, + "reward_before_mean": 0.10306231211870909, + "reward_before_std": 0.7508488856256008, + "reward_change_max": 0.0005578547716140747, + "reward_change_mean": -0.1452772947959602, + "reward_change_min": -0.27861824072897434, + "reward_change_std": 0.11854508286342025, + "reward_std": 0.7378262225538492, + "rewards/cosine_scaled_reward": -0.11513552069664001, + "rewards/format_reward": 0.33333333767950535, + "step": 35 + }, + { + "advantage_max": 1.6745474636554718, + "advantage_mean": 1.862645149230957e-08, + "advantage_min": -1.0759836360812187, + "advantage_std": 0.9997546598315239, + "completion_length": 3388.3333435058594, + "epoch": 0.04114285714285714, + "grad_norm": 0.18159471452236176, + "kl": 2.983957529067993e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.2e-07, + "loss": 0.0, + "reward": -0.2961166016757488, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2961166016757488, + "reward_after_std": 0.621714374050498, + "reward_before_mean": -0.19892606884241104, + "reward_before_std": 0.6282427292317152, + "reward_change_max": 0.0011901110410690308, + "reward_change_mean": -0.09719053143635392, + "reward_change_min": -0.20177586935460567, + "reward_change_std": 0.08723159926012158, + "reward_std": 0.6217143908143044, + "rewards/cosine_scaled_reward": -0.1932130428031087, + "rewards/format_reward": 0.18750000186264515, + "step": 36 + }, + { + "advantage_max": 1.5861639976501465, + "advantage_mean": 3.476937804336444e-08, + "advantage_min": -1.0689679086208344, + "advantage_std": 0.9997375980019569, + "completion_length": 3153.1666870117188, + "epoch": 0.04228571428571429, + "grad_norm": 0.15919217467308044, + "kl": 2.7313828468322754e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.4e-07, + "loss": 0.0, + "reward": -0.288279028609395, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.288279028609395, + "reward_after_std": 0.5106042847037315, + "reward_before_mean": -0.17917627468705177, + "reward_before_std": 0.5166680738329887, + "reward_change_max": 0.0004027709364891052, + "reward_change_mean": -0.10910274600610137, + "reward_change_min": -0.21067695692181587, + "reward_change_std": 0.08434443082660437, + "reward_std": 0.510604303330183, + "rewards/cosine_scaled_reward": -0.20417147409170866, + "rewards/format_reward": 0.22916666977107525, + "step": 37 + }, + { + "advantage_max": 1.5880178362131119, + "advantage_mean": 6.76761100493195e-08, + "advantage_min": -1.0702078267931938, + "advantage_std": 0.9997448474168777, + "completion_length": 3234.812530517578, + "epoch": 0.04342857142857143, + "grad_norm": 0.1559906005859375, + "kl": 2.4594366550445557e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "reward": -0.17546416074037552, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.17546416074037552, + "reward_after_std": 0.48496192693710327, + "reward_before_mean": -0.04148578643798828, + "reward_before_std": 0.47018149122595787, + "reward_change_max": 0.0006171911954879761, + "reward_change_mean": -0.1339783607982099, + "reward_change_min": -0.24786392971873283, + "reward_change_std": 0.09997595380991697, + "reward_std": 0.48496193811297417, + "rewards/cosine_scaled_reward": -0.11449289601296186, + "rewards/format_reward": 0.1875, + "step": 38 + }, + { + "advantage_max": 1.6684064418077469, + "advantage_mean": 1.7384688355548406e-08, + "advantage_min": -0.9282046966254711, + "advantage_std": 0.9997768849134445, + "completion_length": 2681.9792098999023, + "epoch": 0.044571428571428574, + "grad_norm": 0.21223194897174835, + "kl": 1.5147030353546143e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0, + "reward": 0.13156955409795046, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13156955409795046, + "reward_after_std": 0.6208246052265167, + "reward_before_mean": 0.32295298824647034, + "reward_before_std": 0.615071473759599, + "reward_change_max": 0.0005634576082229614, + "reward_change_mean": -0.1913834000006318, + "reward_change_min": -0.36978882923722267, + "reward_change_std": 0.14454567758366466, + "reward_std": 0.6208246275782585, + "rewards/cosine_scaled_reward": -0.06769019179046154, + "rewards/format_reward": 0.4583333358168602, + "step": 39 + }, + { + "advantage_max": 1.655065342783928, + "advantage_mean": 4.0978195503527104e-08, + "advantage_min": -1.0308792516589165, + "advantage_std": 0.9998110383749008, + "completion_length": 2748.8334197998047, + "epoch": 0.045714285714285714, + "grad_norm": 0.24321109056472778, + "kl": 2.113846130669117e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8e-07, + "loss": 0.0, + "reward": -0.0020755892619490623, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0020755892619490623, + "reward_after_std": 0.6307364739477634, + "reward_before_mean": 0.15877384413033724, + "reward_before_std": 0.633373312652111, + "reward_change_max": 0.0006052106618881226, + "reward_change_mean": -0.16084943106397986, + "reward_change_min": -0.29501060023903847, + "reward_change_std": 0.12053346633911133, + "reward_std": 0.6307364776730537, + "rewards/cosine_scaled_reward": -0.13936308398842812, + "rewards/format_reward": 0.4375000037252903, + "step": 40 + }, + { + "advantage_max": 1.7262564301490784, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.8481588140130043, + "advantage_std": 0.9998667165637016, + "completion_length": 3027.6250610351562, + "epoch": 0.046857142857142854, + "grad_norm": 0.14843006432056427, + "kl": 1.632794737815857e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "reward": 0.0690681068226695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0690681068226695, + "reward_after_std": 0.949213694781065, + "reward_before_mean": 0.21814646408893168, + "reward_before_std": 0.9525175243616104, + "reward_change_max": 0.0005423203110694885, + "reward_change_mean": -0.1490783947519958, + "reward_change_min": -0.3277065698057413, + "reward_change_std": 0.1299262880347669, + "reward_std": 0.9492137059569359, + "rewards/cosine_scaled_reward": -0.0992600962636061, + "rewards/format_reward": 0.41666666977107525, + "step": 41 + }, + { + "advantage_max": 1.6299243718385696, + "advantage_mean": 3.849466723160333e-08, + "advantage_min": -1.0378217250108719, + "advantage_std": 0.9997099861502647, + "completion_length": 2848.7083435058594, + "epoch": 0.048, + "grad_norm": 0.27955660223960876, + "kl": 3.980100154876709e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "reward": -0.3497111543547362, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3497111543547362, + "reward_after_std": 0.40947396866977215, + "reward_before_mean": -0.24614406749606133, + "reward_before_std": 0.4084125515073538, + "reward_change_max": 0.0008740201592445374, + "reward_change_mean": -0.10356708150357008, + "reward_change_min": -0.20952831394970417, + "reward_change_std": 0.08169832732528448, + "reward_std": 0.40947398729622364, + "rewards/cosine_scaled_reward": -0.26890537329018116, + "rewards/format_reward": 0.2916666679084301, + "step": 42 + }, + { + "advantage_max": 1.5146324634552002, + "advantage_mean": 2.2351742345882997e-08, + "advantage_min": -1.0660174414515495, + "advantage_std": 0.9998110681772232, + "completion_length": 3207.312530517578, + "epoch": 0.04914285714285714, + "grad_norm": 0.1813088059425354, + "kl": 3.0316412448883057e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "reward": 0.03310042805969715, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03310042805969715, + "reward_after_std": 0.6946298256516457, + "reward_before_mean": 0.2010643444955349, + "reward_before_std": 0.7252266854047775, + "reward_change_max": 0.0003236532211303711, + "reward_change_mean": -0.1679639001376927, + "reward_change_min": -0.3087310250848532, + "reward_change_std": 0.133016605861485, + "reward_std": 0.6946298368275166, + "rewards/cosine_scaled_reward": -0.024467838928103447, + "rewards/format_reward": 0.2500000037252903, + "step": 43 + }, + { + "advantage_max": 1.4735457301139832, + "advantage_mean": 3.445893648201803e-08, + "advantage_min": -1.25593750923872, + "advantage_std": 0.9997965097427368, + "completion_length": 2763.541702270508, + "epoch": 0.05028571428571429, + "grad_norm": 0.2757427990436554, + "kl": 8.969008922576904e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "reward": 0.3218963295221329, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3218963295221329, + "reward_after_std": 0.7521186731755733, + "reward_before_mean": 0.548164501786232, + "reward_before_std": 0.7862608321011066, + "reward_change_max": 0.00045564770698547363, + "reward_change_mean": -0.22626816853880882, + "reward_change_min": -0.4221660625189543, + "reward_change_std": 0.17340195435099304, + "reward_std": 0.7521186843514442, + "rewards/cosine_scaled_reward": 0.024082249961793423, + "rewards/format_reward": 0.5000000111758709, + "step": 44 + }, + { + "advantage_max": 1.4633118212223053, + "advantage_mean": 1.6142925329809543e-08, + "advantage_min": -1.1568865031003952, + "advantage_std": 0.9998212307691574, + "completion_length": 3349.7083435058594, + "epoch": 0.05142857142857143, + "grad_norm": 0.15581615269184113, + "kl": 3.790855407714844e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9e-07, + "loss": 0.0, + "reward": 0.03242434747517109, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.03242434747517109, + "reward_after_std": 0.8387883510440588, + "reward_before_mean": 0.193950076274632, + "reward_before_std": 0.9024345204234123, + "reward_change_max": 0.0004741176962852478, + "reward_change_mean": -0.16152575379237533, + "reward_change_min": -0.38652732968330383, + "reward_change_std": 0.16492859972640872, + "reward_std": 0.8387883920222521, + "rewards/cosine_scaled_reward": -0.04885828774422407, + "rewards/format_reward": 0.2916666753590107, + "step": 45 + }, + { + "advantage_max": 1.5030267983675003, + "advantage_mean": 3.9736430867964856e-08, + "advantage_min": -1.1738435924053192, + "advantage_std": 0.9997305870056152, + "completion_length": 3177.500015258789, + "epoch": 0.052571428571428575, + "grad_norm": 0.18234607577323914, + "kl": 5.453219637274742e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.2e-07, + "loss": 0.0, + "reward": -0.41419660672545433, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.41419660672545433, + "reward_after_std": 0.3787010833621025, + "reward_before_mean": -0.319430336356163, + "reward_before_std": 0.3915441706776619, + "reward_change_max": 0.00046756118535995483, + "reward_change_mean": -0.09476626757532358, + "reward_change_min": -0.18947959877550602, + "reward_change_std": 0.07930145971477032, + "reward_std": 0.3787010908126831, + "rewards/cosine_scaled_reward": -0.2430485039949417, + "rewards/format_reward": 0.1666666679084301, + "step": 46 + }, + { + "advantage_max": 1.5899191051721573, + "advantage_mean": 1.490116141589226e-08, + "advantage_min": -1.1299419030547142, + "advantage_std": 0.9998330995440483, + "completion_length": 2857.875045776367, + "epoch": 0.053714285714285714, + "grad_norm": 0.21604590117931366, + "kl": 4.843901842832565e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "reward": 0.1896874513477087, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1896874513477087, + "reward_after_std": 0.9090043418109417, + "reward_before_mean": 0.3739042170345783, + "reward_before_std": 0.9458130896091461, + "reward_change_max": 0.0004040822386741638, + "reward_change_mean": -0.18421675683930516, + "reward_change_min": -0.3901894185692072, + "reward_change_std": 0.1618319470435381, + "reward_std": 0.9090043976902962, + "rewards/cosine_scaled_reward": -0.04221456404775381, + "rewards/format_reward": 0.45833334513008595, + "step": 47 + }, + { + "advantage_max": 1.6952250599861145, + "advantage_mean": 7.450580485901526e-09, + "advantage_min": -1.0082858800888062, + "advantage_std": 0.9997942596673965, + "completion_length": 2867.062530517578, + "epoch": 0.054857142857142854, + "grad_norm": 0.18633955717086792, + "kl": 0.00017702952027320862, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.6e-07, + "loss": 0.0, + "reward": -0.06481963396072388, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06481963396072388, + "reward_after_std": 0.7094365991652012, + "reward_before_mean": 0.07328080199658871, + "reward_before_std": 0.7008146084845066, + "reward_change_max": 0.0003138333559036255, + "reward_change_mean": -0.138100431766361, + "reward_change_min": -0.26004249788820744, + "reward_change_std": 0.10102563817054033, + "reward_std": 0.709436621516943, + "rewards/cosine_scaled_reward": -0.11960960738360882, + "rewards/format_reward": 0.31250000186264515, + "step": 48 + }, + { + "advantage_max": 1.6078014522790909, + "advantage_mean": 2.048909675256283e-08, + "advantage_min": -1.0765347704291344, + "advantage_std": 0.9998094365000725, + "completion_length": 2509.7500762939453, + "epoch": 0.056, + "grad_norm": 0.2146293818950653, + "kl": 8.615851402282715e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.8e-07, + "loss": 0.0, + "reward": 0.2446850063279271, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2446850063279271, + "reward_after_std": 0.8773886524140835, + "reward_before_mean": 0.44149335473775864, + "reward_before_std": 0.8929827082902193, + "reward_change_max": 2.390146255493164e-05, + "reward_change_mean": -0.19680832419544458, + "reward_change_min": -0.4127675499767065, + "reward_change_std": 0.1626983918249607, + "reward_std": 0.877388671040535, + "rewards/cosine_scaled_reward": -0.039670001016929746, + "rewards/format_reward": 0.5208333358168602, + "step": 49 + }, + { + "advantage_max": 1.4889815598726273, + "advantage_mean": 3.011276339393021e-08, + "advantage_min": -1.19339619576931, + "advantage_std": 0.9997689872980118, + "completion_length": 2894.2500228881836, + "epoch": 0.05714285714285714, + "grad_norm": 0.16805996000766754, + "kl": 0.00012464821338653564, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 0.1484907530248165, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1484907530248165, + "reward_after_std": 0.6498783119022846, + "reward_before_mean": 0.34459424391388893, + "reward_before_std": 0.6648210007697344, + "reward_change_max": 0.0, + "reward_change_mean": -0.19610351603478193, + "reward_change_min": -0.3851831816136837, + "reward_change_std": 0.15433361660689116, + "reward_std": 0.6498783379793167, + "rewards/cosine_scaled_reward": 0.005630466155707836, + "rewards/format_reward": 0.33333334140479565, + "step": 50 + }, + { + "advantage_max": 1.5318742841482162, + "advantage_mean": 2.2351741790771484e-08, + "advantage_min": -1.0917627662420273, + "advantage_std": 0.9998071938753128, + "completion_length": 2442.750045776367, + "epoch": 0.05828571428571429, + "grad_norm": 0.22353631258010864, + "kl": 0.00023058801889419556, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0, + "reward": 0.16875093430280685, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16875093430280685, + "reward_after_std": 0.7135776728391647, + "reward_before_mean": 0.36318233981728554, + "reward_before_std": 0.7265008091926575, + "reward_change_max": 0.0001400187611579895, + "reward_change_mean": -0.1944313943386078, + "reward_change_min": -0.38128767162561417, + "reward_change_std": 0.15260417386889458, + "reward_std": 0.7135776914656162, + "rewards/cosine_scaled_reward": -0.06840883800759912, + "rewards/format_reward": 0.5000000074505806, + "step": 51 + }, + { + "advantage_max": 1.6993749290704727, + "advantage_mean": 5.650023915393376e-08, + "advantage_min": -0.9708743765950203, + "advantage_std": 0.9998618885874748, + "completion_length": 2915.1250534057617, + "epoch": 0.05942857142857143, + "grad_norm": 0.24529911577701569, + "kl": 0.0002612881362438202, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "reward": 0.2074111569672823, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2074111569672823, + "reward_after_std": 0.9998992644250393, + "reward_before_mean": 0.3853365269023925, + "reward_before_std": 1.025053035467863, + "reward_change_max": 0.0015909001231193542, + "reward_change_mean": -0.17792532336898148, + "reward_change_min": -0.35967270471155643, + "reward_change_std": 0.1540224920026958, + "reward_std": 0.9998992942273617, + "rewards/cosine_scaled_reward": -0.005248422268778086, + "rewards/format_reward": 0.3958333395421505, + "step": 52 + }, + { + "advantage_max": 1.6098654568195343, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -1.1653679832816124, + "advantage_std": 0.9998446926474571, + "completion_length": 2782.6250610351562, + "epoch": 0.060571428571428575, + "grad_norm": 0.19299021363258362, + "kl": 0.00019798800349235535, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "reward": 0.5741048266645521, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5741048266645521, + "reward_after_std": 0.8326258882880211, + "reward_before_mean": 0.8417719714343548, + "reward_before_std": 0.8171909488737583, + "reward_change_max": 0.00019788742065429688, + "reward_change_mean": -0.26766714826226234, + "reward_change_min": -0.4490860775113106, + "reward_change_std": 0.17976201511919498, + "reward_std": 0.8326258957386017, + "rewards/cosine_scaled_reward": 0.09796930849552155, + "rewards/format_reward": 0.6458333469927311, + "step": 53 + }, + { + "advantage_max": 1.4913389384746552, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -1.1815472394227982, + "advantage_std": 0.9998629316687584, + "completion_length": 2925.4584045410156, + "epoch": 0.061714285714285715, + "grad_norm": 0.18331590294837952, + "kl": 4.350394010543823e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0, + "reward": 0.42693280428647995, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.42693280428647995, + "reward_after_std": 1.0608244948089123, + "reward_before_mean": 0.6571898418478668, + "reward_before_std": 1.1342002339661121, + "reward_change_max": 0.0, + "reward_change_mean": -0.2302570380270481, + "reward_change_min": -0.4764634482562542, + "reward_change_std": 0.2096308101899922, + "reward_std": 1.060824528336525, + "rewards/cosine_scaled_reward": 0.09942827746272087, + "rewards/format_reward": 0.4583333469927311, + "step": 54 + }, + { + "advantage_max": 1.6245067715644836, + "advantage_mean": -4.3461718668424965e-09, + "advantage_min": -0.9192568957805634, + "advantage_std": 0.9998599514365196, + "completion_length": 3095.250030517578, + "epoch": 0.06285714285714286, + "grad_norm": 0.1808166652917862, + "kl": 0.00031198933720588684, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "reward": 0.10264583956450224, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.10264583956450224, + "reward_after_std": 0.989537637680769, + "reward_before_mean": 0.2625856678932905, + "reward_before_std": 1.0343751683831215, + "reward_change_max": 0.0006072595715522766, + "reward_change_mean": -0.15993982320651412, + "reward_change_min": -0.3970005866140127, + "reward_change_std": 0.16320207389071584, + "reward_std": 0.9895376712083817, + "rewards/cosine_scaled_reward": -0.04579051467590034, + "rewards/format_reward": 0.35416666977107525, + "step": 55 + }, + { + "advantage_max": 1.4185740798711777, + "advantage_mean": -1.2728075787782345e-08, + "advantage_min": -1.3164423555135727, + "advantage_std": 0.9997757226228714, + "completion_length": 2954.458396911621, + "epoch": 0.064, + "grad_norm": 0.16553562879562378, + "kl": 4.982948303222656e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "reward": 0.14053409546613693, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.14053409546613693, + "reward_after_std": 0.6105850823223591, + "reward_before_mean": 0.3413895256817341, + "reward_before_std": 0.6498310156166553, + "reward_change_max": 0.0002667754888534546, + "reward_change_mean": -0.2008554646745324, + "reward_change_min": -0.36588573828339577, + "reward_change_std": 0.15487558394670486, + "reward_std": 0.6105850897729397, + "rewards/cosine_scaled_reward": -0.027221906930208206, + "rewards/format_reward": 0.3958333432674408, + "step": 56 + }, + { + "advantage_max": 1.62399423122406, + "advantage_mean": 1.6763807009212428e-08, + "advantage_min": -0.9893188774585724, + "advantage_std": 0.9997558370232582, + "completion_length": 3263.812530517578, + "epoch": 0.06514285714285714, + "grad_norm": 0.12242446094751358, + "kl": 2.575106918811798e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0, + "reward": -0.31899499148130417, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.31899499148130417, + "reward_after_std": 0.5046761892735958, + "reward_before_mean": -0.21401306986808777, + "reward_before_std": 0.515403188765049, + "reward_change_max": 0.00018546730279922485, + "reward_change_mean": -0.10498191299848258, + "reward_change_min": -0.23341021686792374, + "reward_change_std": 0.08848186326213181, + "reward_std": 0.5046762004494667, + "rewards/cosine_scaled_reward": -0.22158987261354923, + "rewards/format_reward": 0.22916666977107525, + "step": 57 + }, + { + "advantage_max": 1.560118854045868, + "advantage_mean": 4.346172421954009e-09, + "advantage_min": -1.130972020328045, + "advantage_std": 0.9998400285840034, + "completion_length": 2412.2708892822266, + "epoch": 0.06628571428571428, + "grad_norm": 0.2044786512851715, + "kl": 0.0010882318019866943, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0, + "reward": 0.2907734867185354, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2907734867185354, + "reward_after_std": 0.8199318274855614, + "reward_before_mean": 0.501036109868437, + "reward_before_std": 0.8361069560050964, + "reward_change_max": 9.82433557510376e-05, + "reward_change_mean": -0.21026262175291777, + "reward_change_min": -0.4459004085510969, + "reward_change_std": 0.16762463841587305, + "reward_std": 0.819931834936142, + "rewards/cosine_scaled_reward": -0.07239861227571964, + "rewards/format_reward": 0.6458333395421505, + "step": 58 + }, + { + "advantage_max": 1.7214796096086502, + "advantage_mean": 6.100163174593831e-08, + "advantage_min": -1.0006466582417488, + "advantage_std": 0.9996944218873978, + "completion_length": 2856.187515258789, + "epoch": 0.06742857142857143, + "grad_norm": 0.16563907265663147, + "kl": 7.169204764068127e-05, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0, + "reward": -0.12141696130856872, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.12141696130856872, + "reward_after_std": 0.5117970556020737, + "reward_before_mean": 0.019515281077474356, + "reward_before_std": 0.486568721011281, + "reward_change_max": 0.0001515224575996399, + "reward_change_mean": -0.14093223959207535, + "reward_change_min": -0.23997912742197514, + "reward_change_std": 0.09495735168457031, + "reward_std": 0.511797059327364, + "rewards/cosine_scaled_reward": -0.13607568992301822, + "rewards/format_reward": 0.2916666679084301, + "step": 59 + }, + { + "advantage_max": 1.693726196885109, + "advantage_mean": 4.2840839042934675e-08, + "advantage_min": -1.0366674736142159, + "advantage_std": 0.9997766688466072, + "completion_length": 2948.125045776367, + "epoch": 0.06857142857142857, + "grad_norm": 0.16493219137191772, + "kl": 0.0001513250172138214, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": -0.09105870872735977, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09105870872735977, + "reward_after_std": 0.686959195882082, + "reward_before_mean": 0.04372099228203297, + "reward_before_std": 0.6810658574104309, + "reward_change_max": 0.00010024756193161011, + "reward_change_mean": -0.1347797194030136, + "reward_change_min": -0.27008931152522564, + "reward_change_std": 0.10537515860050917, + "reward_std": 0.6869592033326626, + "rewards/cosine_scaled_reward": -0.16563950292766094, + "rewards/format_reward": 0.3750000037252903, + "step": 60 + }, + { + "advantage_max": 1.3835174441337585, + "advantage_mean": 1.4280279403422469e-08, + "advantage_min": -1.3410531058907509, + "advantage_std": 0.9998080208897591, + "completion_length": 3108.3958740234375, + "epoch": 0.06971428571428571, + "grad_norm": 0.17201192677021027, + "kl": 0.00028091808781027794, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "reward": 0.23692655563354492, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23692655563354492, + "reward_after_std": 0.8451647032052279, + "reward_before_mean": 0.4443775750696659, + "reward_before_std": 0.9202493615448475, + "reward_change_max": 0.0005453750491142273, + "reward_change_mean": -0.2074510301463306, + "reward_change_min": -0.4427679292857647, + "reward_change_std": 0.1908978926949203, + "reward_std": 0.8451647274196148, + "rewards/cosine_scaled_reward": 0.01385545451194048, + "rewards/format_reward": 0.41666667722165585, + "step": 61 + }, + { + "advantage_max": 1.806217536330223, + "advantage_mean": -1.9868215961338365e-08, + "advantage_min": -0.887168750166893, + "advantage_std": 0.9998476803302765, + "completion_length": 2752.0834197998047, + "epoch": 0.07085714285714285, + "grad_norm": 0.19357971847057343, + "kl": 0.0005202442407608032, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "reward": 0.3755918840470258, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3755918840470258, + "reward_after_std": 0.8301325552165508, + "reward_before_mean": 0.5955550698563457, + "reward_before_std": 0.790971240028739, + "reward_change_max": 0.00032426416873931885, + "reward_change_mean": -0.21996318409219384, + "reward_change_min": -0.36245875246822834, + "reward_change_std": 0.13983059162274003, + "reward_std": 0.8301325589418411, + "rewards/cosine_scaled_reward": 0.01652752747759223, + "rewards/format_reward": 0.5625000093132257, + "step": 62 + }, + { + "advantage_max": 1.525961548089981, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -1.127110406756401, + "advantage_std": 0.999846026301384, + "completion_length": 2262.6458892822266, + "epoch": 0.072, + "grad_norm": 0.20876488089561462, + "kl": 0.0008518993854522705, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0, + "reward": 0.7662359848618507, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7662359848618507, + "reward_after_std": 0.909334447234869, + "reward_before_mean": 1.0756743550300598, + "reward_before_std": 0.9336301498115063, + "reward_change_max": 8.27684998512268e-05, + "reward_change_mean": -0.30943838274106383, + "reward_change_min": -0.5926293302327394, + "reward_change_std": 0.22869372786954045, + "reward_std": 0.9093344509601593, + "rewards/cosine_scaled_reward": 0.16283717821352184, + "rewards/format_reward": 0.7500000149011612, + "step": 63 + }, + { + "advantage_max": 1.5487523674964905, + "advantage_mean": 1.8005570256995895e-08, + "advantage_min": -1.0956207066774368, + "advantage_std": 0.9997992515563965, + "completion_length": 2957.4583587646484, + "epoch": 0.07314285714285715, + "grad_norm": 0.18354231119155884, + "kl": 0.000421963632106781, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "reward": 0.13231371343135834, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.13231371343135834, + "reward_after_std": 0.9308353830128908, + "reward_before_mean": 0.3047089036554098, + "reward_before_std": 0.9812476169317961, + "reward_change_max": 0.0006142035126686096, + "reward_change_mean": -0.17239517951384187, + "reward_change_min": -0.4214697778224945, + "reward_change_std": 0.17370698787271976, + "reward_std": 0.930835397914052, + "rewards/cosine_scaled_reward": -0.024728883057832718, + "rewards/format_reward": 0.3541666753590107, + "step": 64 + }, + { + "advantage_max": 1.708268865942955, + "advantage_mean": 2.8560559917067962e-08, + "advantage_min": -0.9543309882283211, + "advantage_std": 0.9997209906578064, + "completion_length": 2630.708366394043, + "epoch": 0.07428571428571429, + "grad_norm": 0.2504342198371887, + "kl": 0.0003350377082824707, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "reward": -0.05934199318289757, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05934199318289757, + "reward_after_std": 0.5288446377962828, + "reward_before_mean": 0.09470355277881026, + "reward_before_std": 0.5060996157117188, + "reward_change_max": 0.000491216778755188, + "reward_change_mean": -0.15404552780091763, + "reward_change_min": -0.25607624277472496, + "reward_change_std": 0.10440768301486969, + "reward_std": 0.5288446471095085, + "rewards/cosine_scaled_reward": -0.17139821499586105, + "rewards/format_reward": 0.43750000558793545, + "step": 65 + }, + { + "advantage_max": 1.6425568014383316, + "advantage_mean": 4.035731526741415e-09, + "advantage_min": -1.1181970834732056, + "advantage_std": 0.9997746720910072, + "completion_length": 2079.5208435058594, + "epoch": 0.07542857142857143, + "grad_norm": 0.27181532979011536, + "kl": 0.00034831464290618896, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0, + "reward": 0.4275641590356827, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4275641590356827, + "reward_after_std": 0.553373359143734, + "reward_before_mean": 0.683971457183361, + "reward_before_std": 0.5225912407040596, + "reward_change_max": 0.0009416639804840088, + "reward_change_mean": -0.2564073045505211, + "reward_change_min": -0.40475964918732643, + "reward_change_std": 0.15713666449300945, + "reward_std": 0.5533733814954758, + "rewards/cosine_scaled_reward": 0.09198573045432568, + "rewards/format_reward": 0.5, + "step": 66 + }, + { + "advantage_max": 1.5558879524469376, + "advantage_mean": 6.457170098617127e-08, + "advantage_min": -1.0571024790406227, + "advantage_std": 0.9997260123491287, + "completion_length": 3432.6875, + "epoch": 0.07657142857142857, + "grad_norm": 0.13905449211597443, + "kl": 0.0003644903190433979, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0, + "reward": -0.47604336217045784, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.47604336217045784, + "reward_after_std": 0.4485955648124218, + "reward_before_mean": -0.40127869322896004, + "reward_before_std": 0.459877897053957, + "reward_change_max": 0.0012263581156730652, + "reward_change_mean": -0.07476464239880443, + "reward_change_min": -0.16927983611822128, + "reward_change_std": 0.06955831311643124, + "reward_std": 0.4485955722630024, + "rewards/cosine_scaled_reward": -0.25272269267588854, + "rewards/format_reward": 0.1041666716337204, + "step": 67 + }, + { + "advantage_max": 1.4970561861991882, + "advantage_mean": -4.96705393482344e-09, + "advantage_min": -1.2759140655398369, + "advantage_std": 0.9997949376702309, + "completion_length": 1971.9167175292969, + "epoch": 0.07771428571428571, + "grad_norm": 0.24344401061534882, + "kl": 0.00177764892578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0001, + "reward": 0.3115036394447088, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3115036394447088, + "reward_after_std": 0.640905000269413, + "reward_before_mean": 0.5413862890563905, + "reward_before_std": 0.6520151775330305, + "reward_change_max": 0.00031547248363494873, + "reward_change_mean": -0.22988267801702023, + "reward_change_min": -0.3882032725960016, + "reward_change_std": 0.1589061007834971, + "reward_std": 0.6409050039947033, + "rewards/cosine_scaled_reward": -0.06264018453657627, + "rewards/format_reward": 0.666666679084301, + "step": 68 + }, + { + "advantage_max": 1.7017274498939514, + "advantage_mean": 5.091230503850852e-08, + "advantage_min": -0.9753537401556969, + "advantage_std": 0.999798521399498, + "completion_length": 2390.083381652832, + "epoch": 0.07885714285714286, + "grad_norm": 0.2992507219314575, + "kl": 0.0013431459665298462, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0001, + "reward": 0.018286951817572117, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.018286951817572117, + "reward_after_std": 0.757911205291748, + "reward_before_mean": 0.1728294063359499, + "reward_before_std": 0.7661503255367279, + "reward_change_max": 0.000638522207736969, + "reward_change_mean": -0.15454245172441006, + "reward_change_min": -0.3369054328650236, + "reward_change_std": 0.13101212214678526, + "reward_std": 0.7579112146049738, + "rewards/cosine_scaled_reward": -0.17400196427479386, + "rewards/format_reward": 0.5208333376795053, + "step": 69 + }, + { + "advantage_max": 1.7679117619991302, + "advantage_mean": 1.3038515822572094e-08, + "advantage_min": -0.8881788477301598, + "advantage_std": 0.999812088906765, + "completion_length": 3126.875045776367, + "epoch": 0.08, + "grad_norm": 0.2363167405128479, + "kl": 0.0009987987577915192, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "reward": -0.052413856610655785, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.052413856610655785, + "reward_after_std": 0.7765421830117702, + "reward_before_mean": 0.0839406659360975, + "reward_before_std": 0.7712229937314987, + "reward_change_max": 0.000931374728679657, + "reward_change_mean": -0.13635451719164848, + "reward_change_min": -0.29352002777159214, + "reward_change_std": 0.1165924184024334, + "reward_std": 0.7765422016382217, + "rewards/cosine_scaled_reward": -0.14552967669442296, + "rewards/format_reward": 0.37500000931322575, + "step": 70 + }, + { + "advantage_max": 1.4911705553531647, + "advantage_mean": 8.692343733684993e-09, + "advantage_min": -1.1096780076622963, + "advantage_std": 0.999790795147419, + "completion_length": 2761.750015258789, + "epoch": 0.08114285714285714, + "grad_norm": 0.20265944302082062, + "kl": 0.000859379768371582, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "reward": 0.12095781043171883, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.12095781043171883, + "reward_after_std": 0.70588543638587, + "reward_before_mean": 0.3066416233778, + "reward_before_std": 0.737454067915678, + "reward_change_max": 0.0, + "reward_change_mean": -0.18568379897624254, + "reward_change_min": -0.37347991578280926, + "reward_change_std": 0.15028795832768083, + "reward_std": 0.70588543638587, + "rewards/cosine_scaled_reward": -0.034179212525486946, + "rewards/format_reward": 0.3750000074505806, + "step": 71 + }, + { + "advantage_max": 1.6158892661333084, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -1.0985428914427757, + "advantage_std": 0.9998162090778351, + "completion_length": 3004.3334045410156, + "epoch": 0.08228571428571428, + "grad_norm": 0.2693294584751129, + "kl": 0.001021057367324829, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "reward": -0.1556640777271241, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1556640777271241, + "reward_after_std": 0.7115676589310169, + "reward_before_mean": -0.03549688681960106, + "reward_before_std": 0.7211866304278374, + "reward_change_max": 0.0001872330904006958, + "reward_change_mean": -0.12016719300299883, + "reward_change_min": -0.24851389415562153, + "reward_change_std": 0.10526996431872249, + "reward_std": 0.7115676626563072, + "rewards/cosine_scaled_reward": -0.18441510945558548, + "rewards/format_reward": 0.33333333767950535, + "step": 72 + }, + { + "advantage_max": 1.5327227264642715, + "advantage_mean": 1.241762692671955e-09, + "advantage_min": -1.1601731404662132, + "advantage_std": 0.9997736439108849, + "completion_length": 3494.604217529297, + "epoch": 0.08342857142857144, + "grad_norm": 0.1661466360092163, + "kl": 0.00032639503479003906, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0, + "reward": -0.08516270108520985, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08516270108520985, + "reward_after_std": 0.6497222892940044, + "reward_before_mean": 0.059801749885082245, + "reward_before_std": 0.6753613352775574, + "reward_change_max": 0.0002906620502471924, + "reward_change_mean": -0.1449644397944212, + "reward_change_min": -0.2802998274564743, + "reward_change_std": 0.11904488690197468, + "reward_std": 0.6497223116457462, + "rewards/cosine_scaled_reward": -0.08468246832489967, + "rewards/format_reward": 0.22916666977107525, + "step": 73 + }, + { + "advantage_max": 1.71546071767807, + "advantage_mean": 4.594524849466097e-08, + "advantage_min": -0.8454092368483543, + "advantage_std": 0.9998168796300888, + "completion_length": 3046.8125610351562, + "epoch": 0.08457142857142858, + "grad_norm": 0.18538318574428558, + "kl": 0.0007867217063903809, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0, + "reward": 0.17524679680354893, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17524679680354893, + "reward_after_std": 0.9069931507110596, + "reward_before_mean": 0.3507931437343359, + "reward_before_std": 0.8982865624129772, + "reward_change_max": 0.0001408606767654419, + "reward_change_mean": -0.1755463215522468, + "reward_change_min": -0.37799850665032864, + "reward_change_std": 0.1365469004958868, + "reward_std": 0.9069931656122208, + "rewards/cosine_scaled_reward": 0.008729891385883093, + "rewards/format_reward": 0.3333333395421505, + "step": 74 + }, + { + "advantage_max": 1.5503149926662445, + "advantage_mean": -3.104408519138957e-08, + "advantage_min": -1.2087038829922676, + "advantage_std": 0.9997450262308121, + "completion_length": 2894.8333740234375, + "epoch": 0.08571428571428572, + "grad_norm": 0.16822047531604767, + "kl": 0.0011026561260223389, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "reward": 0.23183363070711493, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23183363070711493, + "reward_after_std": 0.4754980690777302, + "reward_before_mean": 0.4545082226395607, + "reward_before_std": 0.46481961756944656, + "reward_change_max": 0.0005934983491897583, + "reward_change_mean": -0.22267461940646172, + "reward_change_min": -0.3571764323860407, + "reward_change_std": 0.13745499728247523, + "reward_std": 0.4754980690777302, + "rewards/cosine_scaled_reward": 0.029337426647543907, + "rewards/format_reward": 0.3958333358168602, + "step": 75 + }, + { + "advantage_max": 1.6219773888587952, + "advantage_mean": 5.587936668938198e-09, + "advantage_min": -1.1091139391064644, + "advantage_std": 0.9997748509049416, + "completion_length": 2880.791679382324, + "epoch": 0.08685714285714285, + "grad_norm": 0.1717507541179657, + "kl": 0.00029241712763905525, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "reward": -0.023375704884529114, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.023375704884529114, + "reward_after_std": 0.5844264999032021, + "reward_before_mean": 0.14006047509610653, + "reward_before_std": 0.6008395608514547, + "reward_change_max": 0.00044924765825271606, + "reward_change_mean": -0.16343621350824833, + "reward_change_min": -0.29554445296525955, + "reward_change_std": 0.1233857732731849, + "reward_std": 0.5844265222549438, + "rewards/cosine_scaled_reward": -0.1695530880242586, + "rewards/format_reward": 0.4791666753590107, + "step": 76 + }, + { + "advantage_max": 1.4377646893262863, + "advantage_mean": 1.9868214629070735e-08, + "advantage_min": -1.153957448899746, + "advantage_std": 0.9997784495353699, + "completion_length": 3049.250015258789, + "epoch": 0.088, + "grad_norm": 0.16906258463859558, + "kl": 0.0003077983856201172, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0, + "reward": -0.08434882014989853, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.08434882014989853, + "reward_after_std": 0.5435480587184429, + "reward_before_mean": 0.07107899896800518, + "reward_before_std": 0.5732218399643898, + "reward_change_max": 0.0005913972854614258, + "reward_change_mean": -0.15542782377451658, + "reward_change_min": -0.30807364732027054, + "reward_change_std": 0.12492271605879068, + "reward_std": 0.5435480773448944, + "rewards/cosine_scaled_reward": -0.13112716563045979, + "rewards/format_reward": 0.3333333358168602, + "step": 77 + }, + { + "advantage_max": 1.3530033379793167, + "advantage_mean": 3.352761412944716e-08, + "advantage_min": -1.1873060315847397, + "advantage_std": 0.9997893124818802, + "completion_length": 3167.5208740234375, + "epoch": 0.08914285714285715, + "grad_norm": 0.1513669639825821, + "kl": 0.00021767616271972656, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0, + "reward": 0.07496153563261032, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07496153563261032, + "reward_after_std": 0.7540915813297033, + "reward_before_mean": 0.2513010837137699, + "reward_before_std": 0.8067440576851368, + "reward_change_max": 0.0001747533679008484, + "reward_change_mean": -0.17633954668417573, + "reward_change_min": -0.3947325777262449, + "reward_change_std": 0.16072524525225163, + "reward_std": 0.7540916260331869, + "rewards/cosine_scaled_reward": -0.030599456280469894, + "rewards/format_reward": 0.3125000074505806, + "step": 78 + }, + { + "advantage_max": 1.682113841176033, + "advantage_mean": -4.097819394921487e-08, + "advantage_min": -1.0798411667346954, + "advantage_std": 0.9997269585728645, + "completion_length": 2341.895866394043, + "epoch": 0.09028571428571429, + "grad_norm": 0.2215704619884491, + "kl": 0.0007579028606414795, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "reward": 0.264179325196892, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.264179325196892, + "reward_after_std": 0.5166768245398998, + "reward_before_mean": 0.48735319514526054, + "reward_before_std": 0.4782428778707981, + "reward_change_max": 0.0010138675570487976, + "reward_change_mean": -0.22317388840019703, + "reward_change_min": -0.3387150280177593, + "reward_change_std": 0.13743248512037098, + "reward_std": 0.5166768468916416, + "rewards/cosine_scaled_reward": -0.05840673670172691, + "rewards/format_reward": 0.6041666679084301, + "step": 79 + }, + { + "advantage_max": 1.5736753195524216, + "advantage_mean": 3.2285850215529877e-08, + "advantage_min": -0.999790869653225, + "advantage_std": 0.9998230561614037, + "completion_length": 3286.5416870117188, + "epoch": 0.09142857142857143, + "grad_norm": 0.16919022798538208, + "kl": 0.0008649379014968872, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "reward": 0.005955344066023827, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.005955344066023827, + "reward_after_std": 0.8890957869589329, + "reward_before_mean": 0.15279491746332496, + "reward_before_std": 0.9225866943597794, + "reward_change_max": 4.447251558303833e-05, + "reward_change_mean": -0.14683958096429706, + "reward_change_min": -0.32787839509546757, + "reward_change_std": 0.13908664509654045, + "reward_std": 0.8890958093106747, + "rewards/cosine_scaled_reward": -0.07985254935920238, + "rewards/format_reward": 0.31250000558793545, + "step": 80 + }, + { + "advantage_max": 1.4473999440670013, + "advantage_mean": 4.159907629475157e-08, + "advantage_min": -1.2874359339475632, + "advantage_std": 0.9996728822588921, + "completion_length": 3123.4583740234375, + "epoch": 0.09257142857142857, + "grad_norm": 0.23505662381649017, + "kl": 0.0028659701347351074, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0001, + "reward": -0.13922469969838858, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.13922469969838858, + "reward_after_std": 0.3902856092900038, + "reward_before_mean": 0.013303263112902641, + "reward_before_std": 0.3994307592511177, + "reward_change_max": 0.0002063661813735962, + "reward_change_mean": -0.15252797584980726, + "reward_change_min": -0.2631298340857029, + "reward_change_std": 0.10517477197572589, + "reward_std": 0.3902856223285198, + "rewards/cosine_scaled_reward": -0.13918170426040888, + "rewards/format_reward": 0.2916666679084301, + "step": 81 + }, + { + "advantage_max": 1.5119963884353638, + "advantage_mean": -3.725290298461914e-08, + "advantage_min": -1.1589691415429115, + "advantage_std": 0.9997723922133446, + "completion_length": 2829.0208892822266, + "epoch": 0.09371428571428571, + "grad_norm": 0.2004895955324173, + "kl": 0.0015895962715148926, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0001, + "reward": 0.157549187541008, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.157549187541008, + "reward_after_std": 0.6032798625528812, + "reward_before_mean": 0.3559020347893238, + "reward_before_std": 0.6031227596104145, + "reward_change_max": 0.00014865398406982422, + "reward_change_mean": -0.1983528840355575, + "reward_change_min": -0.35573250614106655, + "reward_change_std": 0.14031734503805637, + "reward_std": 0.6032798700034618, + "rewards/cosine_scaled_reward": 0.0008676820434629917, + "rewards/format_reward": 0.3541666679084301, + "step": 82 + }, + { + "advantage_max": 1.7299436926841736, + "advantage_mean": 3.7252894102834944e-09, + "advantage_min": -0.9761759266257286, + "advantage_std": 0.9997840225696564, + "completion_length": 2715.5416946411133, + "epoch": 0.09485714285714286, + "grad_norm": 0.24527642130851746, + "kl": 0.001336120069026947, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0001, + "reward": -0.06252476340159774, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06252476340159774, + "reward_after_std": 0.6791763417422771, + "reward_before_mean": 0.0788705749437213, + "reward_before_std": 0.6742656137794256, + "reward_change_max": 0.0, + "reward_change_mean": -0.1413953397423029, + "reward_change_min": -0.27979685738682747, + "reward_change_std": 0.10586337419226766, + "reward_std": 0.6791763938963413, + "rewards/cosine_scaled_reward": -0.13764804881066084, + "rewards/format_reward": 0.3541666679084301, + "step": 83 + }, + { + "advantage_max": 1.5118530690670013, + "advantage_mean": 2.669791410170319e-08, + "advantage_min": -1.1131732016801834, + "advantage_std": 0.9997848272323608, + "completion_length": 3013.6250610351562, + "epoch": 0.096, + "grad_norm": 0.19372142851352692, + "kl": 0.0005471706390380859, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0, + "reward": 0.09955209773033857, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.09955209773033857, + "reward_after_std": 0.7380244378000498, + "reward_before_mean": 0.28240909799933434, + "reward_before_std": 0.7860660124570131, + "reward_change_max": 0.0006183013319969177, + "reward_change_mean": -0.18285699002444744, + "reward_change_min": -0.39008102752268314, + "reward_change_std": 0.15990980505011976, + "reward_std": 0.7380244564265013, + "rewards/cosine_scaled_reward": -0.056712113320827484, + "rewards/format_reward": 0.3958333395421505, + "step": 84 + }, + { + "advantage_max": 1.696160763502121, + "advantage_mean": 3.0423204844254315e-08, + "advantage_min": -0.9830008372664452, + "advantage_std": 0.9998309686779976, + "completion_length": 2940.6875610351562, + "epoch": 0.09714285714285714, + "grad_norm": 0.1844056099653244, + "kl": 0.0002780407667160034, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "reward": 0.025807244703173637, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.025807244703173637, + "reward_after_std": 0.8476546891033649, + "reward_before_mean": 0.1735303965397179, + "reward_before_std": 0.846455778926611, + "reward_change_max": 0.00022524595260620117, + "reward_change_mean": -0.14772315858863294, + "reward_change_min": -0.2896654698997736, + "reward_change_std": 0.11199493327876553, + "reward_std": 0.8476547226309776, + "rewards/cosine_scaled_reward": -0.1424014689400792, + "rewards/format_reward": 0.4583333395421505, + "step": 85 + }, + { + "advantage_max": 1.5399595648050308, + "advantage_mean": 2.98023239975187e-08, + "advantage_min": -1.1645502522587776, + "advantage_std": 0.9998061656951904, + "completion_length": 2697.1666946411133, + "epoch": 0.09828571428571428, + "grad_norm": 0.22625777125358582, + "kl": 0.0011822879314422607, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0, + "reward": 0.17234379425644875, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17234379425644875, + "reward_after_std": 0.6737720742821693, + "reward_before_mean": 0.37005191296339035, + "reward_before_std": 0.6888385564088821, + "reward_change_max": 0.00036728382110595703, + "reward_change_mean": -0.19770813873037696, + "reward_change_min": -0.3463681824505329, + "reward_change_std": 0.14334939466789365, + "reward_std": 0.6737720891833305, + "rewards/cosine_scaled_reward": -0.033724045380949974, + "rewards/format_reward": 0.4375000074505806, + "step": 86 + }, + { + "advantage_max": 1.5217802822589874, + "advantage_mean": 5.316300510926908e-09, + "advantage_min": -1.1270632445812225, + "advantage_std": 0.9998283088207245, + "completion_length": 2674.1875610351562, + "epoch": 0.09942857142857142, + "grad_norm": 0.20976316928863525, + "kl": 0.001379743218421936, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0001, + "reward": 0.2193904248997569, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2193904248997569, + "reward_after_std": 0.77983483299613, + "reward_before_mean": 0.4205781789496541, + "reward_before_std": 0.8077935613691807, + "reward_change_max": 0.0005700737237930298, + "reward_change_mean": -0.20118775311857462, + "reward_change_min": -0.38294284231960773, + "reward_change_std": 0.16150949569419026, + "reward_std": 0.7798348590731621, + "rewards/cosine_scaled_reward": -0.03971092030405998, + "rewards/format_reward": 0.5000000111758709, + "step": 87 + }, + { + "advantage_max": 1.6352401971817017, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -1.065637744963169, + "advantage_std": 0.9998790919780731, + "completion_length": 2831.937545776367, + "epoch": 0.10057142857142858, + "grad_norm": 0.2115127146244049, + "kl": 0.001442551612854004, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0001, + "reward": 0.37121852952986956, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37121852952986956, + "reward_after_std": 1.0690153501927853, + "reward_before_mean": 0.5801740437746048, + "reward_before_std": 1.098650336265564, + "reward_change_max": 0.00029071420431137085, + "reward_change_mean": -0.20895551680587232, + "reward_change_min": -0.4042000826448202, + "reward_change_std": 0.16902310587465763, + "reward_std": 1.0690153948962688, + "rewards/cosine_scaled_reward": 0.0505036786198616, + "rewards/format_reward": 0.4791666828095913, + "step": 88 + }, + { + "advantage_max": 1.4792246967554092, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -1.1064741685986519, + "advantage_std": 0.9998375922441483, + "completion_length": 3161.2709350585938, + "epoch": 0.10171428571428572, + "grad_norm": 0.1943075805902481, + "kl": 0.0011938810348510742, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "reward": 0.043435624334961176, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.043435624334961176, + "reward_after_std": 0.7929012589156628, + "reward_before_mean": 0.20786358881741762, + "reward_before_std": 0.8400050476193428, + "reward_change_max": 0.0006680861115455627, + "reward_change_mean": -0.16442797426134348, + "reward_change_min": -0.3740697056055069, + "reward_change_std": 0.1559822354465723, + "reward_std": 0.7929012849926949, + "rewards/cosine_scaled_reward": -0.07315154653042555, + "rewards/format_reward": 0.35416667349636555, + "step": 89 + }, + { + "advantage_max": 1.7707126438617706, + "advantage_mean": -1.707424912567035e-08, + "advantage_min": -0.8974402844905853, + "advantage_std": 0.9997777789831161, + "completion_length": 2409.062545776367, + "epoch": 0.10285714285714286, + "grad_norm": 0.24570851027965546, + "kl": 0.0012853443622589111, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0001, + "reward": -0.05016228114254773, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05016228114254773, + "reward_after_std": 0.6008680164813995, + "reward_before_mean": 0.0965384691953659, + "reward_before_std": 0.5710118785500526, + "reward_change_max": 0.0, + "reward_change_mean": -0.14670075592584908, + "reward_change_min": -0.2537856996059418, + "reward_change_std": 0.09590160532388836, + "reward_std": 0.6008680239319801, + "rewards/cosine_scaled_reward": -0.22256410913541913, + "rewards/format_reward": 0.5416666716337204, + "step": 90 + }, + { + "advantage_max": 1.4991742223501205, + "advantage_mean": 2.8560559584001055e-08, + "advantage_min": -1.0803657919168472, + "advantage_std": 0.9998156875371933, + "completion_length": 2925.812545776367, + "epoch": 0.104, + "grad_norm": 0.18674921989440918, + "kl": 0.000809013843536377, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "reward": -0.02295660600066185, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.02295660600066185, + "reward_after_std": 0.8140906505286694, + "reward_before_mean": 0.12554599717259407, + "reward_before_std": 0.8680821731686592, + "reward_change_max": 0.0007210969924926758, + "reward_change_mean": -0.14850260131061077, + "reward_change_min": -0.37167999893426895, + "reward_change_std": 0.1564339753240347, + "reward_std": 0.8140906654298306, + "rewards/cosine_scaled_reward": -0.13514367304742336, + "rewards/format_reward": 0.3958333358168602, + "step": 91 + }, + { + "advantage_max": 1.5517508536577225, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -1.1872400864958763, + "advantage_std": 0.9997856393456459, + "completion_length": 2539.3750381469727, + "epoch": 0.10514285714285715, + "grad_norm": 0.33332663774490356, + "kl": 0.0048322901129722595, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0002, + "reward": 0.1360047198832035, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1360047198832035, + "reward_after_std": 0.7280963324010372, + "reward_before_mean": 0.3214022181928158, + "reward_before_std": 0.7474944517016411, + "reward_change_max": 0.0, + "reward_change_mean": -0.18539749644696712, + "reward_change_min": -0.3729815445840359, + "reward_change_std": 0.14568164059892297, + "reward_std": 0.7280963659286499, + "rewards/cosine_scaled_reward": -0.08929889462888241, + "rewards/format_reward": 0.5000000074505806, + "step": 92 + }, + { + "advantage_max": 1.489373043179512, + "advantage_mean": 4.0357312269811985e-08, + "advantage_min": -1.0695101916790009, + "advantage_std": 0.9997303858399391, + "completion_length": 3477.4791870117188, + "epoch": 0.10628571428571429, + "grad_norm": 0.19943885505199432, + "kl": 0.001136481761932373, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0, + "reward": -0.5609270744025707, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5609270744025707, + "reward_after_std": 0.39100511744618416, + "reward_before_mean": -0.4983716458082199, + "reward_before_std": 0.415749229490757, + "reward_change_max": 0.0004532933235168457, + "reward_change_mean": -0.06255542347207665, + "reward_change_min": -0.15995178557932377, + "reward_change_std": 0.06816363241523504, + "reward_std": 0.39100512489676476, + "rewards/cosine_scaled_reward": -0.270019156858325, + "rewards/format_reward": 0.0416666679084301, + "step": 93 + }, + { + "advantage_max": 1.7367701828479767, + "advantage_mean": 4.1599076627818476e-08, + "advantage_min": -0.9185703918337822, + "advantage_std": 0.9997115805745125, + "completion_length": 3026.5000228881836, + "epoch": 0.10742857142857143, + "grad_norm": 0.22087015211582184, + "kl": 0.002016555517911911, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0001, + "reward": -0.1684376262128353, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1684376262128353, + "reward_after_std": 0.373518081381917, + "reward_before_mean": -0.026155206374824047, + "reward_before_std": 0.34390043281018734, + "reward_change_max": 0.0008780360221862793, + "reward_change_mean": -0.14228241797536612, + "reward_change_min": -0.23626244626939297, + "reward_change_std": 0.09374386863783002, + "reward_std": 0.37351808696985245, + "rewards/cosine_scaled_reward": -0.12766093760728836, + "rewards/format_reward": 0.2291666679084301, + "step": 94 + }, + { + "advantage_max": 1.590524211525917, + "advantage_mean": 1.2914340219438714e-07, + "advantage_min": -1.0176689475774765, + "advantage_std": 0.9993797987699509, + "completion_length": 3369.4166870117188, + "epoch": 0.10857142857142857, + "grad_norm": 0.15471990406513214, + "kl": 0.0005910694599151611, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "reward": -0.41158403269946575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.41158403269946575, + "reward_after_std": 0.4722477972973138, + "reward_before_mean": -0.32287996634840965, + "reward_before_std": 0.49264019494876266, + "reward_change_max": 0.0006460621953010559, + "reward_change_mean": -0.08870406670030206, + "reward_change_min": -0.20848171971738338, + "reward_change_std": 0.08687506098067388, + "reward_std": 0.47224780498072505, + "rewards/cosine_scaled_reward": -0.25518998503685, + "rewards/format_reward": 0.1875, + "step": 95 + }, + { + "advantage_max": 1.724830448627472, + "advantage_mean": -2.483526162500027e-09, + "advantage_min": -0.9390768259763718, + "advantage_std": 0.9998311027884483, + "completion_length": 2654.375045776367, + "epoch": 0.10971428571428571, + "grad_norm": 0.2167777717113495, + "kl": 0.001465141773223877, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0001, + "reward": 0.1042107567191124, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1042107567191124, + "reward_after_std": 0.770032986998558, + "reward_before_mean": 0.2759747635573149, + "reward_before_std": 0.7717161737382412, + "reward_change_max": 0.0006542354822158813, + "reward_change_mean": -0.17176401265896857, + "reward_change_min": -0.3010180573910475, + "reward_change_std": 0.12796866125427186, + "reward_std": 0.7700330205261707, + "rewards/cosine_scaled_reward": -0.05992929823696613, + "rewards/format_reward": 0.39583333767950535, + "step": 96 + }, + { + "advantage_max": 1.6630063951015472, + "advantage_mean": 4.03573128249235e-08, + "advantage_min": -1.0467227101325989, + "advantage_std": 0.9998231157660484, + "completion_length": 3095.3958892822266, + "epoch": 0.11085714285714286, + "grad_norm": 0.17923100292682648, + "kl": 0.0008588209748268127, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0, + "reward": 0.04914311692118645, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04914311692118645, + "reward_after_std": 0.7510417252779007, + "reward_before_mean": 0.2142169401049614, + "reward_before_std": 0.7740973867475986, + "reward_change_max": 0.0006477609276771545, + "reward_change_mean": -0.165073798969388, + "reward_change_min": -0.32045017182826996, + "reward_change_std": 0.13384919241070747, + "reward_std": 0.7510417327284813, + "rewards/cosine_scaled_reward": -0.0699748694896698, + "rewards/format_reward": 0.35416668094694614, + "step": 97 + }, + { + "advantage_max": 1.6298557221889496, + "advantage_mean": 8.692344732885715e-09, + "advantage_min": -1.0451266095042229, + "advantage_std": 0.9997927322983742, + "completion_length": 2451.7708892822266, + "epoch": 0.112, + "grad_norm": 0.17727041244506836, + "kl": 0.0005920976400375366, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0, + "reward": 0.1988861383870244, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1988861383870244, + "reward_after_std": 0.5740308053791523, + "reward_before_mean": 0.40678128972649574, + "reward_before_std": 0.5631402656435966, + "reward_change_max": 0.0012294650077819824, + "reward_change_mean": -0.2078951743314974, + "reward_change_min": -0.37305937707424164, + "reward_change_std": 0.143737900769338, + "reward_std": 0.5740308240056038, + "rewards/cosine_scaled_reward": -0.09869268629699945, + "rewards/format_reward": 0.6041666753590107, + "step": 98 + }, + { + "advantage_max": 1.5828326791524887, + "advantage_mean": -8.537124052132583e-08, + "advantage_min": -1.1269954815506935, + "advantage_std": 0.99972303211689, + "completion_length": 2781.875, + "epoch": 0.11314285714285714, + "grad_norm": 0.24164098501205444, + "kl": 0.0007622838020324707, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "reward": -0.023720188066363335, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.023720188066363335, + "reward_after_std": 0.46708662807941437, + "reward_before_mean": 0.14562718383967876, + "reward_before_std": 0.4578824061900377, + "reward_change_max": 0.00011779367923736572, + "reward_change_mean": -0.1693474086932838, + "reward_change_min": -0.3044974785298109, + "reward_change_std": 0.12167800427414477, + "reward_std": 0.46708663180470467, + "rewards/cosine_scaled_reward": -0.0834364052861929, + "rewards/format_reward": 0.31250000186264515, + "step": 99 + }, + { + "advantage_max": 1.570393443107605, + "advantage_mean": 4.035731249185659e-08, + "advantage_min": -1.19875980168581, + "advantage_std": 0.9998155683279037, + "completion_length": 2425.1041870117188, + "epoch": 0.11428571428571428, + "grad_norm": 0.18487881124019623, + "kl": 0.001100778579711914, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "reward": 0.4371534734964371, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4371534734964371, + "reward_after_std": 0.7649065852165222, + "reward_before_mean": 0.6844560205936432, + "reward_before_std": 0.7754265181720257, + "reward_change_max": 0.0013605356216430664, + "reward_change_mean": -0.24730252707377076, + "reward_change_min": -0.4665789268910885, + "reward_change_std": 0.17839907761663198, + "reward_std": 0.7649066299200058, + "rewards/cosine_scaled_reward": 0.0505613349378109, + "rewards/format_reward": 0.5833333488553762, + "step": 100 + }, + { + "advantage_max": 1.6452988535165787, + "advantage_mean": 2.2351742234860694e-08, + "advantage_min": -0.959521122276783, + "advantage_std": 0.9998143017292023, + "completion_length": 2734.4166870117188, + "epoch": 0.11542857142857142, + "grad_norm": 0.19890400767326355, + "kl": 0.0011498034000396729, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "reward": 0.10417449288070202, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.10417449288070202, + "reward_after_std": 0.6690843030810356, + "reward_before_mean": 0.28429789654910564, + "reward_before_std": 0.6643826402723789, + "reward_change_max": 0.0015150904655456543, + "reward_change_mean": -0.18012344324961305, + "reward_change_min": -0.3563338704407215, + "reward_change_std": 0.13597459299489856, + "reward_std": 0.669084332883358, + "rewards/cosine_scaled_reward": -0.07660102914087474, + "rewards/format_reward": 0.4375000074505806, + "step": 101 + }, + { + "advantage_max": 1.628109648823738, + "advantage_mean": -1.6142924885720333e-08, + "advantage_min": -1.130338653922081, + "advantage_std": 0.9998352602124214, + "completion_length": 2244.0833892822266, + "epoch": 0.11657142857142858, + "grad_norm": 0.35045328736305237, + "kl": 0.0018877387046813965, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0001, + "reward": 0.23090652655810118, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23090652655810118, + "reward_after_std": 0.8405257500708103, + "reward_before_mean": 0.426285058259964, + "reward_before_std": 0.8548384718596935, + "reward_change_max": 0.0013198107481002808, + "reward_change_mean": -0.1953785545192659, + "reward_change_min": -0.3686686437577009, + "reward_change_std": 0.1524599560070783, + "reward_std": 0.8405257761478424, + "rewards/cosine_scaled_reward": -0.10977413924410939, + "rewards/format_reward": 0.6458333432674408, + "step": 102 + }, + { + "advantage_max": 1.3960938453674316, + "advantage_mean": 2.6077032311278003e-08, + "advantage_min": -1.2053559049963951, + "advantage_std": 0.9997885152697563, + "completion_length": 2521.833396911621, + "epoch": 0.11771428571428572, + "grad_norm": 0.278216689825058, + "kl": 0.001301884651184082, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0001, + "reward": 0.060690226033329964, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.060690226033329964, + "reward_after_std": 0.6560997925698757, + "reward_before_mean": 0.24024609848856926, + "reward_before_std": 0.7025391049683094, + "reward_change_max": 0.0, + "reward_change_mean": -0.1795558724552393, + "reward_change_min": -0.3511783704161644, + "reward_change_std": 0.15032944874837995, + "reward_std": 0.6560998372733593, + "rewards/cosine_scaled_reward": -0.11946028470993042, + "rewards/format_reward": 0.47916666977107525, + "step": 103 + }, + { + "advantage_max": 1.5210340768098831, + "advantage_mean": 1.9868215073159945e-08, + "advantage_min": -1.1603004932403564, + "advantage_std": 0.9997299611568451, + "completion_length": 2693.020835876465, + "epoch": 0.11885714285714286, + "grad_norm": 0.22005805373191833, + "kl": 0.0020142793655395508, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0001, + "reward": -0.08458196744322777, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.08458196744322777, + "reward_after_std": 0.47468145191669464, + "reward_before_mean": 0.07341913506388664, + "reward_before_std": 0.4918606597930193, + "reward_change_max": 0.00020316988229751587, + "reward_change_mean": -0.15800110436975956, + "reward_change_min": -0.26773341558873653, + "reward_change_std": 0.11250392789952457, + "reward_std": 0.47468145936727524, + "rewards/cosine_scaled_reward": -0.12995710503309965, + "rewards/format_reward": 0.3333333358168602, + "step": 104 + }, + { + "advantage_max": 1.5753152966499329, + "advantage_mean": 2.4835264955669345e-09, + "advantage_min": -1.0181199088692665, + "advantage_std": 0.99983299523592, + "completion_length": 2375.0416870117188, + "epoch": 0.12, + "grad_norm": 0.21237428486347198, + "kl": 0.0008479952812194824, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "reward": 0.2439766377210617, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2439766377210617, + "reward_after_std": 0.779191005975008, + "reward_before_mean": 0.44646067079156637, + "reward_before_std": 0.785377386957407, + "reward_change_max": 0.0, + "reward_change_mean": -0.2024840395897627, + "reward_change_min": -0.4037261363118887, + "reward_change_std": 0.15447102207690477, + "reward_std": 0.7791910246014595, + "rewards/cosine_scaled_reward": -0.037186328787356615, + "rewards/format_reward": 0.5208333414047956, + "step": 105 + }, + { + "advantage_max": 1.7184691429138184, + "advantage_mean": -6.51925804451281e-08, + "advantage_min": -1.036706604063511, + "advantage_std": 0.9998025968670845, + "completion_length": 2241.833381652832, + "epoch": 0.12114285714285715, + "grad_norm": 0.1902162879705429, + "kl": 0.0011308789253234863, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0, + "reward": 0.7149951979517937, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7149951979517937, + "reward_after_std": 0.6896206475794315, + "reward_before_mean": 1.0211203414946795, + "reward_before_std": 0.6423603612929583, + "reward_change_max": 0.0, + "reward_change_mean": -0.3061251426115632, + "reward_change_min": -0.5069556701928377, + "reward_change_std": 0.19168227072805166, + "reward_std": 0.6896206885576248, + "rewards/cosine_scaled_reward": 0.17722682980820537, + "rewards/format_reward": 0.6666666734963655, + "step": 106 + }, + { + "advantage_max": 1.3700329214334488, + "advantage_mean": 3.2285850992685994e-08, + "advantage_min": -1.3256009072065353, + "advantage_std": 0.9997893422842026, + "completion_length": 2744.166717529297, + "epoch": 0.12228571428571429, + "grad_norm": 0.20612262189388275, + "kl": 0.0011415481567382812, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0, + "reward": 0.06857027532532811, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.06857027532532811, + "reward_after_std": 0.6631270665675402, + "reward_before_mean": 0.252403543330729, + "reward_before_std": 0.7201212346553802, + "reward_change_max": 0.0002407953143119812, + "reward_change_mean": -0.1838332605548203, + "reward_change_min": -0.3560641389340162, + "reward_change_std": 0.15651204530149698, + "reward_std": 0.663127088919282, + "rewards/cosine_scaled_reward": -0.10296489670872688, + "rewards/format_reward": 0.4583333507180214, + "step": 107 + }, + { + "advantage_max": 1.5325934290885925, + "advantage_mean": 6.208818126296478e-09, + "advantage_min": -1.0848430544137955, + "advantage_std": 0.9998377189040184, + "completion_length": 2616.3333740234375, + "epoch": 0.12342857142857143, + "grad_norm": 0.32614660263061523, + "kl": 0.0012401752173900604, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0, + "reward": 0.057555489242076874, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.057555489242076874, + "reward_after_std": 0.9342328980565071, + "reward_before_mean": 0.21481921151280403, + "reward_before_std": 0.9939168691635132, + "reward_change_max": 0.0006889700889587402, + "reward_change_mean": -0.15726373670622706, + "reward_change_min": -0.38562384992837906, + "reward_change_std": 0.16911761928349733, + "reward_std": 0.9342329241335392, + "rewards/cosine_scaled_reward": -0.11134038865566254, + "rewards/format_reward": 0.4375000149011612, + "step": 108 + }, + { + "advantage_max": 1.717758134007454, + "advantage_mean": 2.7008355329982692e-08, + "advantage_min": -0.9031755030155182, + "advantage_std": 0.9998158439993858, + "completion_length": 3081.729217529297, + "epoch": 0.12457142857142857, + "grad_norm": 0.18283237516880035, + "kl": 0.0006889104843139648, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "reward": 0.031152330338954926, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.031152330338954926, + "reward_after_std": 0.738107368350029, + "reward_before_mean": 0.1895277239382267, + "reward_before_std": 0.7382989898324013, + "reward_change_max": 0.0006342306733131409, + "reward_change_mean": -0.1583753984887153, + "reward_change_min": -0.3311379738152027, + "reward_change_std": 0.12449920130893588, + "reward_std": 0.7381073944270611, + "rewards/cosine_scaled_reward": -0.1031528078019619, + "rewards/format_reward": 0.3958333358168602, + "step": 109 + }, + { + "advantage_max": 1.5491592735052109, + "advantage_mean": 1.350417788703595e-08, + "advantage_min": -1.104156732559204, + "advantage_std": 0.9997999370098114, + "completion_length": 2665.6250228881836, + "epoch": 0.12571428571428572, + "grad_norm": 0.24647559225559235, + "kl": 0.0017870888113975525, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0001, + "reward": 0.06417747336672619, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06417747336672619, + "reward_after_std": 0.6984993778169155, + "reward_before_mean": 0.23563515860587358, + "reward_before_std": 0.7118737921118736, + "reward_change_max": 0.00017995387315750122, + "reward_change_mean": -0.1714576887898147, + "reward_change_min": -0.33860295079648495, + "reward_change_std": 0.13551720790565014, + "reward_std": 0.698499396443367, + "rewards/cosine_scaled_reward": -0.12176575046032667, + "rewards/format_reward": 0.4791666716337204, + "step": 110 + }, + { + "advantage_max": 1.6004591435194016, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.9542756676673889, + "advantage_std": 0.9998093247413635, + "completion_length": 2937.3750610351562, + "epoch": 0.12685714285714286, + "grad_norm": 0.21009349822998047, + "kl": 0.0018000602722167969, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0001, + "reward": -0.1473899253178388, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1473899253178388, + "reward_after_std": 0.7092999331653118, + "reward_before_mean": -0.022016488015651703, + "reward_before_std": 0.7311879619956017, + "reward_change_max": 0.001234203577041626, + "reward_change_mean": -0.12537344172596931, + "reward_change_min": -0.3080690782517195, + "reward_change_std": 0.1185176195576787, + "reward_std": 0.7092999368906021, + "rewards/cosine_scaled_reward": -0.16725825425237417, + "rewards/format_reward": 0.3125000037252903, + "step": 111 + }, + { + "advantage_max": 1.58974190056324, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -1.0684397667646408, + "advantage_std": 0.9997754022479057, + "completion_length": 2916.3333740234375, + "epoch": 0.128, + "grad_norm": 0.17282706499099731, + "kl": 0.0009534507989883423, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0, + "reward": 0.047515214420855045, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.047515214420855045, + "reward_after_std": 0.6808386445045471, + "reward_before_mean": 0.21764793386682868, + "reward_before_std": 0.6999966204166412, + "reward_change_max": 0.0, + "reward_change_mean": -0.17013270873576403, + "reward_change_min": -0.32235524989664555, + "reward_change_std": 0.1295095095410943, + "reward_std": 0.6808386482298374, + "rewards/cosine_scaled_reward": -0.0786760482005775, + "rewards/format_reward": 0.37500000558793545, + "step": 112 + }, + { + "advantage_max": 1.5703244507312775, + "advantage_mean": 1.3659398057086491e-08, + "advantage_min": -1.1096737533807755, + "advantage_std": 0.999800406396389, + "completion_length": 2577.0208892822266, + "epoch": 0.12914285714285714, + "grad_norm": 0.2460978776216507, + "kl": 0.003580331802368164, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0001, + "reward": 0.23878616420552135, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23878616420552135, + "reward_after_std": 0.7030831761658192, + "reward_before_mean": 0.44820527732372284, + "reward_before_std": 0.7209436502307653, + "reward_change_max": 0.0, + "reward_change_mean": -0.20941910333931446, + "reward_change_min": -0.39444285817444324, + "reward_change_std": 0.15796104352921247, + "reward_std": 0.703083198517561, + "rewards/cosine_scaled_reward": -0.036314038559794426, + "rewards/format_reward": 0.5208333432674408, + "step": 113 + }, + { + "advantage_max": 1.7269503027200699, + "advantage_mean": 3.725291630729544e-09, + "advantage_min": -0.9455199539661407, + "advantage_std": 0.999740943312645, + "completion_length": 2578.437511444092, + "epoch": 0.13028571428571428, + "grad_norm": 0.19760534167289734, + "kl": 0.003066539764404297, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0001, + "reward": 0.01859831716865301, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.01859831716865301, + "reward_after_std": 0.4981700386852026, + "reward_before_mean": 0.19175568968057632, + "reward_before_std": 0.4783357158303261, + "reward_change_max": 0.0004209578037261963, + "reward_change_mean": -0.1731573868310079, + "reward_change_min": -0.2967113181948662, + "reward_change_std": 0.11035828175954521, + "reward_std": 0.49817005544900894, + "rewards/cosine_scaled_reward": -0.20620550867170095, + "rewards/format_reward": 0.6041666772216558, + "step": 114 + }, + { + "advantage_max": 1.5616178661584854, + "advantage_mean": 3.5390257835388184e-08, + "advantage_min": -1.0633350536227226, + "advantage_std": 0.9997276589274406, + "completion_length": 2888.2916717529297, + "epoch": 0.13142857142857142, + "grad_norm": 0.24375756084918976, + "kl": 0.002779722213745117, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0001, + "reward": -0.1527273915708065, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1527273915708065, + "reward_after_std": 0.5216103941202164, + "reward_before_mean": -0.013282615691423416, + "reward_before_std": 0.532561769708991, + "reward_change_max": 0.0011752843856811523, + "reward_change_mean": -0.1394447716884315, + "reward_change_min": -0.2619510591030121, + "reward_change_std": 0.10499659506604075, + "reward_std": 0.5216104164719582, + "rewards/cosine_scaled_reward": -0.13164131715893745, + "rewards/format_reward": 0.25, + "step": 115 + }, + { + "advantage_max": 1.7398159205913544, + "advantage_mean": 6.208817238118058e-09, + "advantage_min": -0.8426778316497803, + "advantage_std": 0.9997789934277534, + "completion_length": 3405.0208435058594, + "epoch": 0.13257142857142856, + "grad_norm": 0.1686706840991974, + "kl": 0.0017733573913574219, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0001, + "reward": -0.3314667074009776, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3314667074009776, + "reward_after_std": 0.7026853635907173, + "reward_before_mean": -0.2483118800446391, + "reward_before_std": 0.7108087986707687, + "reward_change_max": 0.0009364485740661621, + "reward_change_mean": -0.08315482863690704, + "reward_change_min": -0.20761460438370705, + "reward_change_std": 0.08251205913256854, + "reward_std": 0.7026854008436203, + "rewards/cosine_scaled_reward": -0.18665594549383968, + "rewards/format_reward": 0.1250000037252903, + "step": 116 + }, + { + "advantage_max": 1.6006111353635788, + "advantage_mean": -2.7318796780306798e-08, + "advantage_min": -1.041724719107151, + "advantage_std": 0.999773383140564, + "completion_length": 2680.083366394043, + "epoch": 0.1337142857142857, + "grad_norm": 0.18357373774051666, + "kl": 0.0026030540466308594, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0001, + "reward": 0.012444857507944107, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.012444857507944107, + "reward_after_std": 0.6517674252390862, + "reward_before_mean": 0.17331765219569206, + "reward_before_std": 0.6458356343209743, + "reward_change_max": 0.0004078969359397888, + "reward_change_mean": -0.16087282774969935, + "reward_change_min": -0.3154468797147274, + "reward_change_std": 0.11643311567604542, + "reward_std": 0.6517674289643764, + "rewards/cosine_scaled_reward": -0.10084117203950882, + "rewards/format_reward": 0.37500000558793545, + "step": 117 + }, + { + "advantage_max": 1.4670916646718979, + "advantage_mean": 1.4280280180578586e-08, + "advantage_min": -1.1820987686514854, + "advantage_std": 0.999840185046196, + "completion_length": 2915.9166870117188, + "epoch": 0.13485714285714287, + "grad_norm": 0.17332710325717926, + "kl": 0.0017366409301757812, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0001, + "reward": 0.2379293106496334, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2379293106496334, + "reward_after_std": 0.8679434321820736, + "reward_before_mean": 0.43904879316687584, + "reward_before_std": 0.916411992162466, + "reward_change_max": 0.00032275915145874023, + "reward_change_mean": -0.20111947320401669, + "reward_change_min": -0.4205525293946266, + "reward_change_std": 0.17429276509210467, + "reward_std": 0.8679434508085251, + "rewards/cosine_scaled_reward": 0.0007743909955024719, + "rewards/format_reward": 0.4375000111758709, + "step": 118 + }, + { + "advantage_max": 1.5654396712779999, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -1.0977480113506317, + "advantage_std": 0.9998264610767365, + "completion_length": 2394.6459045410156, + "epoch": 0.136, + "grad_norm": 0.27868223190307617, + "kl": 0.0041304826736450195, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0002, + "reward": 0.29523699606215814, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.29523699606215814, + "reward_after_std": 0.744273629039526, + "reward_before_mean": 0.5135078746825457, + "reward_before_std": 0.7622634246945381, + "reward_change_max": 0.0, + "reward_change_mean": -0.21827087132260203, + "reward_change_min": -0.4341272786259651, + "reward_change_std": 0.16735805850476027, + "reward_std": 0.7442736364901066, + "rewards/cosine_scaled_reward": -0.03491273708641529, + "rewards/format_reward": 0.5833333376795053, + "step": 119 + }, + { + "advantage_max": 1.7137190848588943, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -1.0148278772830963, + "advantage_std": 0.9998286962509155, + "completion_length": 2304.145866394043, + "epoch": 0.13714285714285715, + "grad_norm": 0.2446976751089096, + "kl": 0.0027112960815429688, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0001, + "reward": 0.15172169636934996, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15172169636934996, + "reward_after_std": 0.6846943460404873, + "reward_before_mean": 0.33898145519196987, + "reward_before_std": 0.6702734418213367, + "reward_change_max": 0.0, + "reward_change_mean": -0.1872597902547568, + "reward_change_min": -0.31060235388576984, + "reward_change_std": 0.12618307769298553, + "reward_std": 0.6846943572163582, + "rewards/cosine_scaled_reward": -0.10134260216727853, + "rewards/format_reward": 0.5416666753590107, + "step": 120 + }, + { + "advantage_max": 1.691588580608368, + "advantage_mean": -5.836288297089709e-08, + "advantage_min": -1.034871518611908, + "advantage_std": 0.9998171031475067, + "completion_length": 1725.9375381469727, + "epoch": 0.1382857142857143, + "grad_norm": 0.2284398376941681, + "kl": 0.0031642913818359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0001, + "reward": 0.421614283695817, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.421614283695817, + "reward_after_std": 0.678892083466053, + "reward_before_mean": 0.6639131233096123, + "reward_before_std": 0.6397330313920975, + "reward_change_max": 0.0, + "reward_change_mean": -0.2422988973557949, + "reward_change_min": -0.4020249769091606, + "reward_change_std": 0.15020818077027798, + "reward_std": 0.6788921020925045, + "rewards/cosine_scaled_reward": -0.043043429497629404, + "rewards/format_reward": 0.7500000111758709, + "step": 121 + }, + { + "advantage_max": 1.5243453085422516, + "advantage_mean": -1.2417634476236117e-08, + "advantage_min": -1.0929933041334152, + "advantage_std": 0.999860443174839, + "completion_length": 2846.1458740234375, + "epoch": 0.13942857142857143, + "grad_norm": 0.2239234298467636, + "kl": 0.0016736984252929688, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0001, + "reward": 0.3787113861180842, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.3787113861180842, + "reward_after_std": 1.0122371204197407, + "reward_before_mean": 0.6001707105897367, + "reward_before_std": 1.072056818753481, + "reward_change_max": 0.00045921653509140015, + "reward_change_mean": -0.22145933331921697, + "reward_change_min": -0.45815131813287735, + "reward_change_std": 0.19878085469827056, + "reward_std": 1.0122371390461922, + "rewards/cosine_scaled_reward": 0.060502004344016314, + "rewards/format_reward": 0.4791666753590107, + "step": 122 + }, + { + "advantage_max": 1.5138305127620697, + "advantage_mean": 6.208817793229571e-09, + "advantage_min": -1.2074719741940498, + "advantage_std": 0.9997866898775101, + "completion_length": 2553.125045776367, + "epoch": 0.14057142857142857, + "grad_norm": 0.21360571682453156, + "kl": 0.0025411248207092285, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0001, + "reward": 0.042357919504866004, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.042357919504866004, + "reward_after_std": 0.636686947196722, + "reward_before_mean": 0.21523546800017357, + "reward_before_std": 0.661348432302475, + "reward_change_max": 0.00020268559455871582, + "reward_change_mean": -0.17287754639983177, + "reward_change_min": -0.32666971161961555, + "reward_change_std": 0.13960499968379736, + "reward_std": 0.6366869881749153, + "rewards/cosine_scaled_reward": -0.15279894787818193, + "rewards/format_reward": 0.5208333395421505, + "step": 123 + }, + { + "advantage_max": 1.5655941367149353, + "advantage_mean": -1.641456082168702e-08, + "advantage_min": -0.9814046025276184, + "advantage_std": 0.999843031167984, + "completion_length": 2003.0000381469727, + "epoch": 0.1417142857142857, + "grad_norm": 0.20767082273960114, + "kl": 0.004428386688232422, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0002, + "reward": 0.500551930628717, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.500551930628717, + "reward_after_std": 0.8857000321149826, + "reward_before_mean": 0.7559270821511745, + "reward_before_std": 0.9190114215016365, + "reward_change_max": 0.0, + "reward_change_mean": -0.2553751552477479, + "reward_change_min": -0.5128838941454887, + "reward_change_std": 0.2021732535213232, + "reward_std": 0.8857000656425953, + "rewards/cosine_scaled_reward": -0.007453134283423424, + "rewards/format_reward": 0.7708333414047956, + "step": 124 + }, + { + "advantage_max": 1.6035060584545135, + "advantage_mean": 6.705522670458208e-08, + "advantage_min": -1.1286407485604286, + "advantage_std": 0.9997504726052284, + "completion_length": 2833.958335876465, + "epoch": 0.14285714285714285, + "grad_norm": 0.15640655159950256, + "kl": 0.0021719932556152344, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0001, + "reward": 0.16567879915237427, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16567879915237427, + "reward_after_std": 0.5571850594133139, + "reward_before_mean": 0.36717021465301514, + "reward_before_std": 0.5443516932427883, + "reward_change_max": 0.0002313479781150818, + "reward_change_mean": -0.20149140153080225, + "reward_change_min": -0.3646851126104593, + "reward_change_std": 0.14025269588455558, + "reward_std": 0.5571850873529911, + "rewards/cosine_scaled_reward": 0.02733509987592697, + "rewards/format_reward": 0.3125, + "step": 125 + }, + { + "advantage_max": 1.5312575846910477, + "advantage_mean": 1.1486311707331609e-08, + "advantage_min": -1.2798721194267273, + "advantage_std": 0.9998003914952278, + "completion_length": 2863.5208892822266, + "epoch": 0.144, + "grad_norm": 0.22090090811252594, + "kl": 0.0013557672500610352, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0001, + "reward": 0.0737285241484642, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0737285241484642, + "reward_after_std": 0.7071263864636421, + "reward_before_mean": 0.25064082257449627, + "reward_before_std": 0.7428994365036488, + "reward_change_max": 0.0, + "reward_change_mean": -0.17691227421164513, + "reward_change_min": -0.35780435614287853, + "reward_change_std": 0.14994592033326626, + "reward_std": 0.707126397639513, + "rewards/cosine_scaled_reward": -0.09342960081994534, + "rewards/format_reward": 0.4375000149011612, + "step": 126 + }, + { + "advantage_max": 1.4340872317552567, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -1.183464154601097, + "advantage_std": 0.9997255057096481, + "completion_length": 3002.1041870117188, + "epoch": 0.14514285714285713, + "grad_norm": 0.16190996766090393, + "kl": 0.0020918846130371094, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0001, + "reward": -0.2233894734235946, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2233894734235946, + "reward_after_std": 0.4971660189330578, + "reward_before_mean": -0.0952535979449749, + "reward_before_std": 0.5213297717273235, + "reward_change_max": 0.0009558200836181641, + "reward_change_mean": -0.12813590315636247, + "reward_change_min": -0.26985314674675465, + "reward_change_std": 0.10562613094225526, + "reward_std": 0.4971660412847996, + "rewards/cosine_scaled_reward": -0.2142934650182724, + "rewards/format_reward": 0.3333333358168602, + "step": 127 + }, + { + "advantage_max": 1.4159886687994003, + "advantage_mean": 4.3461721332960224e-08, + "advantage_min": -1.182981289923191, + "advantage_std": 0.9998199418187141, + "completion_length": 2886.50004196167, + "epoch": 0.1462857142857143, + "grad_norm": 0.1824210286140442, + "kl": 0.002896904945373535, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0001, + "reward": 0.2526640184223652, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2526640184223652, + "reward_after_std": 0.7988780178129673, + "reward_before_mean": 0.4635799862444401, + "reward_before_std": 0.8548481166362762, + "reward_change_max": 0.0006513744592666626, + "reward_change_mean": -0.21091592963784933, + "reward_change_min": -0.4153004623949528, + "reward_change_std": 0.17933304305188358, + "reward_std": 0.7988780252635479, + "rewards/cosine_scaled_reward": 0.03387330658733845, + "rewards/format_reward": 0.3958333432674408, + "step": 128 + }, + { + "advantage_max": 1.670387864112854, + "advantage_mean": 2.0178656190417144e-08, + "advantage_min": -0.9403666146099567, + "advantage_std": 0.9997882694005966, + "completion_length": 3401.6666870117188, + "epoch": 0.14742857142857144, + "grad_norm": 0.17621037364006042, + "kl": 0.002868175506591797, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0001, + "reward": -0.26161413080990314, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.26161413080990314, + "reward_after_std": 0.6878196895122528, + "reward_before_mean": -0.1574427995365113, + "reward_before_std": 0.7175455689430237, + "reward_change_max": 0.0006723701953887939, + "reward_change_mean": -0.10417133261216804, + "reward_change_min": -0.2432860340923071, + "reward_change_std": 0.10660239483695477, + "reward_std": 0.6878196895122528, + "rewards/cosine_scaled_reward": -0.1724714022129774, + "rewards/format_reward": 0.1875000037252903, + "step": 129 + }, + { + "advantage_max": 1.6903793960809708, + "advantage_mean": 2.0489096419495922e-08, + "advantage_min": -0.9747004956007004, + "advantage_std": 0.9998284503817558, + "completion_length": 2804.6250381469727, + "epoch": 0.14857142857142858, + "grad_norm": 0.1896054595708847, + "kl": 0.001954317092895508, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0001, + "reward": -0.028209278360009193, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.028209278360009193, + "reward_after_std": 0.7870006188750267, + "reward_before_mean": 0.11392226428142749, + "reward_before_std": 0.7941737547516823, + "reward_change_max": 0.0006090477108955383, + "reward_change_mean": -0.14213155990000814, + "reward_change_min": -0.3012427128851414, + "reward_change_std": 0.1142815554048866, + "reward_std": 0.7870006375014782, + "rewards/cosine_scaled_reward": -0.10970553383231163, + "rewards/format_reward": 0.3333333358168602, + "step": 130 + }, + { + "advantage_max": 1.5626876056194305, + "advantage_mean": 5.960464544152444e-08, + "advantage_min": -1.1384482607245445, + "advantage_std": 0.9997840076684952, + "completion_length": 2861.3958740234375, + "epoch": 0.14971428571428572, + "grad_norm": 0.20207345485687256, + "kl": 0.0037682056427001953, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0002, + "reward": 0.3409400451928377, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3409400451928377, + "reward_after_std": 0.6878578290343285, + "reward_before_mean": 0.5735721625387669, + "reward_before_std": 0.6877081072889268, + "reward_change_max": 0.0010700449347496033, + "reward_change_mean": -0.23263203352689743, + "reward_change_min": -0.41999047063291073, + "reward_change_std": 0.17137008626013994, + "reward_std": 0.6878578290343285, + "rewards/cosine_scaled_reward": 0.07845271937549114, + "rewards/format_reward": 0.416666679084301, + "step": 131 + }, + { + "advantage_max": 1.629030168056488, + "advantage_mean": 3.725290742551124e-09, + "advantage_min": -1.0061270147562027, + "advantage_std": 0.9997803717851639, + "completion_length": 2558.562530517578, + "epoch": 0.15085714285714286, + "grad_norm": 0.20671804249286652, + "kl": 0.002033710479736328, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0001, + "reward": 0.1808385867625475, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1808385867625475, + "reward_after_std": 0.6454941257834435, + "reward_before_mean": 0.3788890652358532, + "reward_before_std": 0.6303308047354221, + "reward_change_max": 0.0003954395651817322, + "reward_change_mean": -0.1980504752136767, + "reward_change_min": -0.36066864989697933, + "reward_change_std": 0.14119471702724695, + "reward_std": 0.6454941593110561, + "rewards/cosine_scaled_reward": 0.0019445132929831743, + "rewards/format_reward": 0.3750000037252903, + "step": 132 + }, + { + "advantage_max": 1.4284230470657349, + "advantage_mean": 8.940697038273271e-08, + "advantage_min": -1.192399837076664, + "advantage_std": 0.9997306689620018, + "completion_length": 3046.8958587646484, + "epoch": 0.152, + "grad_norm": 0.2751116454601288, + "kl": 0.003151416778564453, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0001, + "reward": -0.26124978717416525, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.26124978717416525, + "reward_after_std": 0.4831140823662281, + "reward_before_mean": -0.13964797370135784, + "reward_before_std": 0.5154471658170223, + "reward_change_max": 0.0006668567657470703, + "reward_change_mean": -0.12160179018974304, + "reward_change_min": -0.25765814632177353, + "reward_change_std": 0.10493774805217981, + "reward_std": 0.48311409167945385, + "rewards/cosine_scaled_reward": -0.215657327324152, + "rewards/format_reward": 0.291666679084301, + "step": 133 + }, + { + "advantage_max": 1.650213047862053, + "advantage_mean": 1.3969839257610417e-08, + "advantage_min": -0.9398018196225166, + "advantage_std": 0.9998574033379555, + "completion_length": 2290.812545776367, + "epoch": 0.15314285714285714, + "grad_norm": 0.24014216661453247, + "kl": 0.0033774375915527344, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0001, + "reward": 0.2990442682057619, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2990442682057619, + "reward_after_std": 0.9332752451300621, + "reward_before_mean": 0.5012820335105062, + "reward_before_std": 0.9426852278411388, + "reward_change_max": 0.00036494433879852295, + "reward_change_mean": -0.2022377629764378, + "reward_change_min": -0.4220910370349884, + "reward_change_std": 0.1617970857769251, + "reward_std": 0.933275256305933, + "rewards/cosine_scaled_reward": -0.07227565790526569, + "rewards/format_reward": 0.6458333376795053, + "step": 134 + }, + { + "advantage_max": 1.740107610821724, + "advantage_mean": 3.104402290787789e-10, + "advantage_min": -0.9589984938502312, + "advantage_std": 0.999826468527317, + "completion_length": 2104.5834045410156, + "epoch": 0.15428571428571428, + "grad_norm": 0.2661309540271759, + "kl": 0.00379180908203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0002, + "reward": 0.9028741903603077, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9028741903603077, + "reward_after_std": 0.7912594079971313, + "reward_before_mean": 1.2412361446768045, + "reward_before_std": 0.7384288650937378, + "reward_change_max": 0.00038820505142211914, + "reward_change_mean": -0.33836194314062595, + "reward_change_min": -0.5658552143722773, + "reward_change_std": 0.2178718103095889, + "reward_std": 0.7912594079971313, + "rewards/cosine_scaled_reward": 0.30811806954443455, + "rewards/format_reward": 0.6250000093132257, + "step": 135 + }, + { + "advantage_max": 1.5855561196804047, + "advantage_mean": 1.5522043650406658e-08, + "advantage_min": -1.045980878174305, + "advantage_std": 0.9998907074332237, + "completion_length": 2559.5833892822266, + "epoch": 0.15542857142857142, + "grad_norm": 0.2847113609313965, + "kl": 0.003814697265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0002, + "reward": 0.37419047206640244, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.37419047206640244, + "reward_after_std": 1.0629618167877197, + "reward_before_mean": 0.5864962767809629, + "reward_before_std": 1.1105365753173828, + "reward_change_max": 0.0009570121765136719, + "reward_change_mean": -0.21230582473799586, + "reward_change_min": -0.4814386647194624, + "reward_change_std": 0.19272688124328852, + "reward_std": 1.0629618465900421, + "rewards/cosine_scaled_reward": 0.03283148072659969, + "rewards/format_reward": 0.5208333432674408, + "step": 136 + }, + { + "advantage_max": 1.5506631284952164, + "advantage_mean": -3.476937715518602e-08, + "advantage_min": -1.1965996623039246, + "advantage_std": 0.9997732192277908, + "completion_length": 2663.3750610351562, + "epoch": 0.15657142857142858, + "grad_norm": 0.15699128806591034, + "kl": 0.0024433135986328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0001, + "reward": 0.014663774520158768, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.014663774520158768, + "reward_after_std": 0.5894833207130432, + "reward_before_mean": 0.18573991488665342, + "reward_before_std": 0.6053555309772491, + "reward_change_max": 0.0003803074359893799, + "reward_change_mean": -0.17107615806162357, + "reward_change_min": -0.30640072375535965, + "reward_change_std": 0.12884120550006628, + "reward_std": 0.5894833244383335, + "rewards/cosine_scaled_reward": -0.1467133816331625, + "rewards/format_reward": 0.4791666753590107, + "step": 137 + }, + { + "advantage_max": 1.6773697882890701, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -1.1602972447872162, + "advantage_std": 0.9997936189174652, + "completion_length": 2176.3333892822266, + "epoch": 0.15771428571428572, + "grad_norm": 0.23005136847496033, + "kl": 0.002424955368041992, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0001, + "reward": 0.23747283313423395, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23747283313423395, + "reward_after_std": 0.6506821662187576, + "reward_before_mean": 0.4451894210651517, + "reward_before_std": 0.6293397713452578, + "reward_change_max": 0.0003636404871940613, + "reward_change_mean": -0.20771658699959517, + "reward_change_min": -0.3176217880100012, + "reward_change_std": 0.1273680031299591, + "reward_std": 0.6506821922957897, + "rewards/cosine_scaled_reward": -0.12115529365837574, + "rewards/format_reward": 0.6875000186264515, + "step": 138 + }, + { + "advantage_max": 1.499302163720131, + "advantage_mean": -5.587935536510713e-08, + "advantage_min": -1.1127532124519348, + "advantage_std": 0.9997905865311623, + "completion_length": 2995.812545776367, + "epoch": 0.15885714285714286, + "grad_norm": 0.249574676156044, + "kl": 0.0039157867431640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0002, + "reward": -0.003708356380229816, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.003708356380229816, + "reward_after_std": 0.6921847648918629, + "reward_before_mean": 0.15705988928675652, + "reward_before_std": 0.7318538166582584, + "reward_change_max": 0.000463239848613739, + "reward_change_mean": -0.16076825419440866, + "reward_change_min": -0.35990045219659805, + "reward_change_std": 0.14408531039953232, + "reward_std": 0.6921847872436047, + "rewards/cosine_scaled_reward": -0.11938672885298729, + "rewards/format_reward": 0.3958333395421505, + "step": 139 + }, + { + "advantage_max": 1.7067873626947403, + "advantage_mean": -7.450582262258365e-09, + "advantage_min": -0.869616873562336, + "advantage_std": 0.9997568354010582, + "completion_length": 2614.166732788086, + "epoch": 0.16, + "grad_norm": 0.38242799043655396, + "kl": 0.004871368408203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0002, + "reward": 0.0828480685595423, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0828480685595423, + "reward_after_std": 0.5462512150406837, + "reward_before_mean": 0.2640006002038717, + "reward_before_std": 0.5157320648431778, + "reward_change_max": 0.0004016086459159851, + "reward_change_mean": -0.18115251511335373, + "reward_change_min": -0.3285767696797848, + "reward_change_std": 0.11956191342324018, + "reward_std": 0.5462512299418449, + "rewards/cosine_scaled_reward": -0.08674971852451563, + "rewards/format_reward": 0.4375000037252903, + "step": 140 + }, + { + "advantage_max": 1.7529622614383698, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -1.000135324895382, + "advantage_std": 0.9998727887868881, + "completion_length": 2590.7500762939453, + "epoch": 0.16114285714285714, + "grad_norm": 0.2258317768573761, + "kl": 0.004827022552490234, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0002, + "reward": 0.28294834215193987, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.28294834215193987, + "reward_after_std": 0.9506829380989075, + "reward_before_mean": 0.479689369443804, + "reward_before_std": 0.9543975107371807, + "reward_change_max": 0.00037819892168045044, + "reward_change_mean": -0.1967410072684288, + "reward_change_min": -0.37072051130235195, + "reward_change_std": 0.1559813655912876, + "reward_std": 0.950682982802391, + "rewards/cosine_scaled_reward": -0.06223865784704685, + "rewards/format_reward": 0.6041666753590107, + "step": 141 + }, + { + "advantage_max": 1.5266470611095428, + "advantage_mean": 8.881784197001252e-16, + "advantage_min": -1.2423633113503456, + "advantage_std": 0.999744102358818, + "completion_length": 2659.000030517578, + "epoch": 0.16228571428571428, + "grad_norm": 0.17855477333068848, + "kl": 0.0037615299224853516, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0002, + "reward": 0.164220348931849, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.164220348931849, + "reward_after_std": 0.6876718364655972, + "reward_before_mean": 0.35929872654378414, + "reward_before_std": 0.708413734100759, + "reward_change_max": 0.0006590783596038818, + "reward_change_mean": -0.19507834874093533, + "reward_change_min": -0.37512016855180264, + "reward_change_std": 0.15412273351103067, + "reward_std": 0.6876718625426292, + "rewards/cosine_scaled_reward": -0.10160065069794655, + "rewards/format_reward": 0.5625000149011612, + "step": 142 + }, + { + "advantage_max": 1.6525219678878784, + "advantage_mean": 1.6142926662077173e-08, + "advantage_min": -0.9572406560182571, + "advantage_std": 0.9998095482587814, + "completion_length": 2580.1458740234375, + "epoch": 0.16342857142857142, + "grad_norm": 0.256779283285141, + "kl": 0.004227638244628906, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0002, + "reward": 3.49879264831543e-05, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 3.49879264831543e-05, + "reward_after_std": 0.675796166062355, + "reward_before_mean": 0.15760892163962126, + "reward_before_std": 0.6773778162896633, + "reward_change_max": 0.0, + "reward_change_mean": -0.15757392905652523, + "reward_change_min": -0.3442469611763954, + "reward_change_std": 0.12573405727744102, + "reward_std": 0.6757961846888065, + "rewards/cosine_scaled_reward": -0.18161221034824848, + "rewards/format_reward": 0.5208333432674408, + "step": 143 + }, + { + "advantage_max": 1.7686203569173813, + "advantage_mean": 4.3461721777049434e-08, + "advantage_min": -0.8464352861046791, + "advantage_std": 0.999814823269844, + "completion_length": 2639.333366394043, + "epoch": 0.16457142857142856, + "grad_norm": 0.23858557641506195, + "kl": 0.0036993026733398438, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0001, + "reward": -0.06461599344993374, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06461599344993374, + "reward_after_std": 0.7022199928760529, + "reward_before_mean": 0.07492145337164402, + "reward_before_std": 0.699074275791645, + "reward_change_max": 0.00051155686378479, + "reward_change_mean": -0.13953743898309767, + "reward_change_min": -0.28874889947474003, + "reward_change_std": 0.10882806684821844, + "reward_std": 0.7022200152277946, + "rewards/cosine_scaled_reward": -0.17087261471897364, + "rewards/format_reward": 0.4166666716337204, + "step": 144 + }, + { + "advantage_max": 1.6545881032943726, + "advantage_mean": -4.346170645597169e-09, + "advantage_min": -1.0852633118629456, + "advantage_std": 0.9997621700167656, + "completion_length": 1951.4792175292969, + "epoch": 0.1657142857142857, + "grad_norm": 0.31547772884368896, + "kl": 0.003958702087402344, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0002, + "reward": 0.5314989294856787, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5314989294856787, + "reward_after_std": 0.6146979462355375, + "reward_before_mean": 0.8071410674601793, + "reward_before_std": 0.5886675817891955, + "reward_change_max": 0.0004691779613494873, + "reward_change_mean": -0.27564213797450066, + "reward_change_min": -0.4286304134875536, + "reward_change_std": 0.1728304447606206, + "reward_std": 0.6146979611366987, + "rewards/cosine_scaled_reward": 0.07023719977587461, + "rewards/format_reward": 0.6666666734963655, + "step": 145 + }, + { + "advantage_max": 1.5834444910287857, + "advantage_mean": -6.208816794028849e-10, + "advantage_min": -1.2619915455579758, + "advantage_std": 0.9997970163822174, + "completion_length": 2003.5000610351562, + "epoch": 0.16685714285714287, + "grad_norm": 0.22241827845573425, + "kl": 0.0024411678314208984, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0001, + "reward": 0.2519833882106468, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.2519833882106468, + "reward_after_std": 0.6221999675035477, + "reward_before_mean": 0.4680800251662731, + "reward_before_std": 0.6158224176615477, + "reward_change_max": 0.00015006959438323975, + "reward_change_mean": -0.21609664289280772, + "reward_change_min": -0.3714016154408455, + "reward_change_std": 0.14444420114159584, + "reward_std": 0.6222000010311604, + "rewards/cosine_scaled_reward": -0.151376660913229, + "rewards/format_reward": 0.7708333432674408, + "step": 146 + }, + { + "advantage_max": 1.5665484219789505, + "advantage_mean": 3.7252903650752955e-08, + "advantage_min": -1.1734899654984474, + "advantage_std": 0.9998107403516769, + "completion_length": 2070.0208587646484, + "epoch": 0.168, + "grad_norm": 0.2511901259422302, + "kl": 0.0043354034423828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0002, + "reward": 0.1382943361531943, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1382943361531943, + "reward_after_std": 0.6228159815073013, + "reward_before_mean": 0.3309053098782897, + "reward_before_std": 0.629545658826828, + "reward_change_max": 0.0, + "reward_change_mean": -0.1926109748892486, + "reward_change_min": -0.34466840885579586, + "reward_change_std": 0.1377943456172943, + "reward_std": 0.6228160075843334, + "rewards/cosine_scaled_reward": -0.14704734086990356, + "rewards/format_reward": 0.6250000074505806, + "step": 147 + }, + { + "advantage_max": 1.6768000572919846, + "advantage_mean": -6.705522803684971e-08, + "advantage_min": -1.1405689418315887, + "advantage_std": 0.9997821599245071, + "completion_length": 1937.4583740234375, + "epoch": 0.16914285714285715, + "grad_norm": 0.20754331350326538, + "kl": 0.0034346580505371094, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0001, + "reward": 0.40766859240829945, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40766859240829945, + "reward_after_std": 0.6350481193512678, + "reward_before_mean": 0.650043424218893, + "reward_before_std": 0.5965264923870564, + "reward_change_max": 0.0, + "reward_change_mean": -0.24237484019249678, + "reward_change_min": -0.3769410066306591, + "reward_change_std": 0.14409376867115498, + "reward_std": 0.6350481417030096, + "rewards/cosine_scaled_reward": -0.039561630226671696, + "rewards/format_reward": 0.729166679084301, + "step": 148 + }, + { + "advantage_max": 1.5765099674463272, + "advantage_mean": -6.581346534417776e-08, + "advantage_min": -1.1912664473056793, + "advantage_std": 0.9997623637318611, + "completion_length": 2463.2500228881836, + "epoch": 0.1702857142857143, + "grad_norm": 0.20378859341144562, + "kl": 0.004099845886230469, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0002, + "reward": 0.41491406969726086, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41491406969726086, + "reward_after_std": 0.6035808995366096, + "reward_before_mean": 0.6691889259964228, + "reward_before_std": 0.5983965341001749, + "reward_change_max": 0.00039667636156082153, + "reward_change_mean": -0.2542748870328069, + "reward_change_min": -0.4188149496912956, + "reward_change_std": 0.16949533484876156, + "reward_std": 0.6035809032619, + "rewards/cosine_scaled_reward": 0.02209446392953396, + "rewards/format_reward": 0.6250000037252903, + "step": 149 + }, + { + "advantage_max": 1.5017241835594177, + "advantage_mean": -5.587935891782081e-09, + "advantage_min": -1.1115936785936356, + "advantage_std": 0.9998666793107986, + "completion_length": 2137.7083435058594, + "epoch": 0.17142857142857143, + "grad_norm": 0.2677984833717346, + "kl": 0.0045261383056640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0002, + "reward": 0.27933480869978666, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.27933480869978666, + "reward_after_std": 0.9380423426628113, + "reward_before_mean": 0.4810619349591434, + "reward_before_std": 0.9789088033139706, + "reward_change_max": 0.0010481253266334534, + "reward_change_mean": -0.20172712206840515, + "reward_change_min": -0.42329406924545765, + "reward_change_std": 0.18029167037457228, + "reward_std": 0.9380423575639725, + "rewards/cosine_scaled_reward": -0.06155238504288718, + "rewards/format_reward": 0.6041666828095913, + "step": 150 + }, + { + "advantage_max": 1.6314374655485153, + "advantage_mean": -5.0912302596017867e-08, + "advantage_min": -1.2180282175540924, + "advantage_std": 0.9997744411230087, + "completion_length": 2193.5208587646484, + "epoch": 0.17257142857142857, + "grad_norm": 0.2930396497249603, + "kl": 0.0044596195220947266, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0002, + "reward": 0.45010002702474594, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.45010002702474594, + "reward_after_std": 0.6098031923174858, + "reward_before_mean": 0.7081503644585609, + "reward_before_std": 0.5848918259143829, + "reward_change_max": 0.0, + "reward_change_mean": -0.2580503453500569, + "reward_change_min": -0.39921652898192406, + "reward_change_std": 0.16081867553293705, + "reward_std": 0.6098032146692276, + "rewards/cosine_scaled_reward": 0.020741842687129974, + "rewards/format_reward": 0.6666666716337204, + "step": 151 + }, + { + "advantage_max": 1.706085816025734, + "advantage_mean": 1.862645193639878e-08, + "advantage_min": -1.015485629439354, + "advantage_std": 0.9998258575797081, + "completion_length": 2515.2083892822266, + "epoch": 0.1737142857142857, + "grad_norm": 0.2551548182964325, + "kl": 0.0040667057037353516, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0002, + "reward": -0.012446035631000996, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.012446035631000996, + "reward_after_std": 0.7643734775483608, + "reward_before_mean": 0.1334016639739275, + "reward_before_std": 0.7614435665309429, + "reward_change_max": 0.0005860179662704468, + "reward_change_mean": -0.14584769657813013, + "reward_change_min": -0.26715877279639244, + "reward_change_std": 0.10718716867268085, + "reward_std": 0.7643734961748123, + "rewards/cosine_scaled_reward": -0.16246584057807922, + "rewards/format_reward": 0.45833334140479565, + "step": 152 + }, + { + "advantage_max": 1.5629348754882812, + "advantage_mean": 6.581346390088783e-08, + "advantage_min": -1.113683059811592, + "advantage_std": 0.9996767640113831, + "completion_length": 2569.2292098999023, + "epoch": 0.17485714285714285, + "grad_norm": 0.23137755692005157, + "kl": 0.00739288330078125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0003, + "reward": -0.18267223238945007, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.18267223238945007, + "reward_after_std": 0.5054036788642406, + "reward_before_mean": -0.05120914429426193, + "reward_before_std": 0.5079370914027095, + "reward_change_max": 0.0, + "reward_change_mean": -0.13146307598799467, + "reward_change_min": -0.25649657659232616, + "reward_change_std": 0.09824450453743339, + "reward_std": 0.5054036900401115, + "rewards/cosine_scaled_reward": -0.2547712437444716, + "rewards/format_reward": 0.4583333358168602, + "step": 153 + }, + { + "advantage_max": 1.6035524010658264, + "advantage_mean": -2.6077032755367213e-08, + "advantage_min": -1.1122926324605942, + "advantage_std": 0.9998549222946167, + "completion_length": 3082.4375610351562, + "epoch": 0.176, + "grad_norm": 0.18362846970558167, + "kl": 0.0031795501708984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0001, + "reward": 0.37665724381804466, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37665724381804466, + "reward_after_std": 0.8660299405455589, + "reward_before_mean": 0.6022710939869285, + "reward_before_std": 0.8814903013408184, + "reward_change_max": 0.00025279074907302856, + "reward_change_mean": -0.22561387112364173, + "reward_change_min": -0.4025569446384907, + "reward_change_std": 0.16582211665809155, + "reward_std": 0.8660299628973007, + "rewards/cosine_scaled_reward": 0.0719688767567277, + "rewards/format_reward": 0.4583333432674408, + "step": 154 + }, + { + "advantage_max": 1.6857571452856064, + "advantage_mean": -3.050081476274613e-08, + "advantage_min": -1.0754043608903885, + "advantage_std": 0.9998093396425247, + "completion_length": 2467.458381652832, + "epoch": 0.17714285714285713, + "grad_norm": 0.2297108769416809, + "kl": 0.00498199462890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0002, + "reward": 0.5354392826557159, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5354392826557159, + "reward_after_std": 0.6706315167248249, + "reward_before_mean": 0.8052331898361444, + "reward_before_std": 0.6433561127632856, + "reward_change_max": 0.00030928850173950195, + "reward_change_mean": -0.26979394583031535, + "reward_change_min": -0.4219784028828144, + "reward_change_std": 0.16768614412285388, + "reward_std": 0.6706315279006958, + "rewards/cosine_scaled_reward": 0.14219993818551302, + "rewards/format_reward": 0.5208333376795053, + "step": 155 + }, + { + "advantage_max": 1.4635108709335327, + "advantage_mean": -1.8626452602532595e-09, + "advantage_min": -1.2058681324124336, + "advantage_std": 0.9998436570167542, + "completion_length": 2434.500030517578, + "epoch": 0.1782857142857143, + "grad_norm": 0.1800357550382614, + "kl": 0.0037064552307128906, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0001, + "reward": 0.322545756585896, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.322545756585896, + "reward_after_std": 0.7853665836155415, + "reward_before_mean": 0.5472987722605467, + "reward_before_std": 0.8228888586163521, + "reward_change_max": 0.00011435151100158691, + "reward_change_mean": -0.22475302033126354, + "reward_change_min": -0.43423015251755714, + "reward_change_std": 0.17797227203845978, + "reward_std": 0.785366591066122, + "rewards/cosine_scaled_reward": 0.03406604006886482, + "rewards/format_reward": 0.47916667722165585, + "step": 156 + }, + { + "advantage_max": 1.6851200014352798, + "advantage_mean": 3.60111410691033e-08, + "advantage_min": -0.9669445231556892, + "advantage_std": 0.999786801636219, + "completion_length": 2656.1458587646484, + "epoch": 0.17942857142857144, + "grad_norm": 0.2275025099515915, + "kl": 0.005410194396972656, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0002, + "reward": -0.12188863917253911, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": -0.12188863917253911, + "reward_after_std": 0.5806295238435268, + "reward_before_mean": 0.01793377846479416, + "reward_before_std": 0.5874885879456997, + "reward_change_max": 0.0, + "reward_change_mean": -0.13982242881320417, + "reward_change_min": -0.283700505271554, + "reward_change_std": 0.11282584932632744, + "reward_std": 0.5806295461952686, + "rewards/cosine_scaled_reward": -0.22019978612661362, + "rewards/format_reward": 0.4583333507180214, + "step": 157 + }, + { + "advantage_max": 1.5106781423091888, + "advantage_mean": 2.483527050678447e-09, + "advantage_min": -1.2967959195375443, + "advantage_std": 0.9998632296919823, + "completion_length": 2356.0625610351562, + "epoch": 0.18057142857142858, + "grad_norm": 0.27170735597610474, + "kl": 0.0045986175537109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0002, + "reward": 0.6479261901695281, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6479261901695281, + "reward_after_std": 0.8374496065080166, + "reward_before_mean": 0.9347638841718435, + "reward_before_std": 0.8472369872033596, + "reward_change_max": 0.0009616687893867493, + "reward_change_mean": -0.2868376709520817, + "reward_change_min": -0.4877729155123234, + "reward_change_std": 0.20384064875543118, + "reward_std": 0.8374496325850487, + "rewards/cosine_scaled_reward": 0.12363193836063147, + "rewards/format_reward": 0.6875000279396772, + "step": 158 + }, + { + "advantage_max": 1.618651032447815, + "advantage_mean": -1.3038516433194758e-08, + "advantage_min": -1.1713954880833626, + "advantage_std": 0.9997406974434853, + "completion_length": 2258.854202270508, + "epoch": 0.18171428571428572, + "grad_norm": 0.19485744833946228, + "kl": 0.004391670227050781, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0002, + "reward": -0.007745785638689995, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.007745785638689995, + "reward_after_std": 0.4710584804415703, + "reward_before_mean": 0.16189392702654004, + "reward_before_std": 0.4530893340706825, + "reward_change_max": 0.0002969801425933838, + "reward_change_mean": -0.16963971918448806, + "reward_change_min": -0.2686846721917391, + "reward_change_std": 0.10416083410382271, + "reward_std": 0.4710584916174412, + "rewards/cosine_scaled_reward": -0.18988638184964657, + "rewards/format_reward": 0.5416666716337204, + "step": 159 + }, + { + "advantage_max": 1.601723164319992, + "advantage_mean": -6.2088170160734535e-09, + "advantage_min": -0.9842819944024086, + "advantage_std": 0.9998537227511406, + "completion_length": 2300.520896911621, + "epoch": 0.18285714285714286, + "grad_norm": 0.24165897071361542, + "kl": 0.0068359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0003, + "reward": 0.47148462012410164, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.47148462012410164, + "reward_after_std": 0.9986433014273643, + "reward_before_mean": 0.7081597335636616, + "reward_before_std": 1.017629697918892, + "reward_change_max": 0.0008212849497795105, + "reward_change_mean": -0.2366751218214631, + "reward_change_min": -0.5349176675081253, + "reward_change_std": 0.2049960969015956, + "reward_std": 0.9986433461308479, + "rewards/cosine_scaled_reward": 0.07282985420897603, + "rewards/format_reward": 0.5625000111758709, + "step": 160 + }, + { + "advantage_max": 1.6861842274665833, + "advantage_mean": -2.9181441152381637e-08, + "advantage_min": -0.9965731874108315, + "advantage_std": 0.9998380914330482, + "completion_length": 2000.770896911621, + "epoch": 0.184, + "grad_norm": 0.2589896023273468, + "kl": 0.004801750183105469, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0002, + "reward": 0.2427341677248478, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2427341677248478, + "reward_after_std": 0.7803987562656403, + "reward_before_mean": 0.4423756841570139, + "reward_before_std": 0.7710563689470291, + "reward_change_max": 0.0, + "reward_change_mean": -0.19964152900502086, + "reward_change_min": -0.3690079543739557, + "reward_change_std": 0.13987152371555567, + "reward_std": 0.7803987711668015, + "rewards/cosine_scaled_reward": -0.09131215792149305, + "rewards/format_reward": 0.6250000074505806, + "step": 161 + }, + { + "advantage_max": 1.6020589023828506, + "advantage_mean": -8.071462720415923e-09, + "advantage_min": -1.11840408295393, + "advantage_std": 0.9997873082756996, + "completion_length": 2577.354217529297, + "epoch": 0.18514285714285714, + "grad_norm": 0.2400018274784088, + "kl": 0.007944107055664062, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0003, + "reward": 0.1532662883400917, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.1532662883400917, + "reward_after_std": 0.7209969665855169, + "reward_before_mean": 0.34282832220196724, + "reward_before_std": 0.7363618742674589, + "reward_change_max": 0.00031294673681259155, + "reward_change_mean": -0.1895620170980692, + "reward_change_min": -0.36849910393357277, + "reward_change_std": 0.15132501488551497, + "reward_std": 0.7209969665855169, + "rewards/cosine_scaled_reward": -0.057752519845962524, + "rewards/format_reward": 0.4583333395421505, + "step": 162 + }, + { + "advantage_max": 1.6749805212020874, + "advantage_mean": -3.911555057634075e-08, + "advantage_min": -0.9457454830408096, + "advantage_std": 0.9998423382639885, + "completion_length": 2090.0625610351562, + "epoch": 0.18628571428571428, + "grad_norm": 0.2369341254234314, + "kl": 0.0051097869873046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0002, + "reward": 0.5305918380618095, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5305918380618095, + "reward_after_std": 0.8068078309297562, + "reward_before_mean": 0.7879913533106446, + "reward_before_std": 0.7713843509554863, + "reward_change_max": 0.0002727434039115906, + "reward_change_mean": -0.2573995175771415, + "reward_change_min": -0.43212768994271755, + "reward_change_std": 0.16625749971717596, + "reward_std": 0.8068078495562077, + "rewards/cosine_scaled_reward": 0.050245666585396975, + "rewards/format_reward": 0.6875000055879354, + "step": 163 + }, + { + "advantage_max": 1.692627653479576, + "advantage_mean": -4.097819339410336e-08, + "advantage_min": -1.0569606199860573, + "advantage_std": 0.9998034909367561, + "completion_length": 1950.6250381469727, + "epoch": 0.18742857142857142, + "grad_norm": 0.2518463432788849, + "kl": 0.0050716400146484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0002, + "reward": 0.5412261649034917, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5412261649034917, + "reward_after_std": 0.6402173116803169, + "reward_before_mean": 0.8132719174027443, + "reward_before_std": 0.602929200977087, + "reward_change_max": 0.0, + "reward_change_mean": -0.27204577438533306, + "reward_change_min": -0.43784534372389317, + "reward_change_std": 0.1645910618826747, + "reward_std": 0.640217337757349, + "rewards/cosine_scaled_reward": 0.0420526172965765, + "rewards/format_reward": 0.7291666753590107, + "step": 164 + }, + { + "advantage_max": 1.714088037610054, + "advantage_mean": -5.58793583627093e-09, + "advantage_min": -0.9592446982860565, + "advantage_std": 0.9998089000582695, + "completion_length": 2122.0416946411133, + "epoch": 0.18857142857142858, + "grad_norm": 0.2594444751739502, + "kl": 0.00640106201171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0003, + "reward": 0.049126192927360535, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.049126192927360535, + "reward_after_std": 0.6851355955004692, + "reward_before_mean": 0.2161144088022411, + "reward_before_std": 0.6790233179926872, + "reward_change_max": 0.0009498968720436096, + "reward_change_mean": -0.16698822565376759, + "reward_change_min": -0.32509128004312515, + "reward_change_std": 0.12738678557798266, + "reward_std": 0.685135617852211, + "rewards/cosine_scaled_reward": -0.1940261390991509, + "rewards/format_reward": 0.6041666734963655, + "step": 165 + }, + { + "advantage_max": 1.6857409626245499, + "advantage_mean": 3.5390257835388184e-08, + "advantage_min": -1.1037379801273346, + "advantage_std": 0.9998189136385918, + "completion_length": 2194.3541946411133, + "epoch": 0.18971428571428572, + "grad_norm": 0.2024780958890915, + "kl": 0.004538536071777344, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0002, + "reward": 0.2784705702215433, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2784705702215433, + "reward_after_std": 0.7903713285923004, + "reward_before_mean": 0.48560900520533323, + "reward_before_std": 0.7836416717618704, + "reward_change_max": 0.0002986118197441101, + "reward_change_mean": -0.20713840331882238, + "reward_change_min": -0.36814062111079693, + "reward_change_std": 0.14702307805418968, + "reward_std": 0.790371336042881, + "rewards/cosine_scaled_reward": -0.06969551555812359, + "rewards/format_reward": 0.6250000111758709, + "step": 166 + }, + { + "advantage_max": 1.7721141129732132, + "advantage_mean": -1.303851654421706e-08, + "advantage_min": -1.060515969991684, + "advantage_std": 0.9997974634170532, + "completion_length": 1851.4792251586914, + "epoch": 0.19085714285714286, + "grad_norm": 0.2363491803407669, + "kl": 0.0055522918701171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0002, + "reward": 0.199244512245059, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.199244512245059, + "reward_after_std": 0.771139208227396, + "reward_before_mean": 0.38737055473029613, + "reward_before_std": 0.7454683519899845, + "reward_change_max": 0.000876501202583313, + "reward_change_mean": -0.18812604527920485, + "reward_change_min": -0.33890487626194954, + "reward_change_std": 0.13139595091342926, + "reward_std": 0.7711392100900412, + "rewards/cosine_scaled_reward": -0.2021480556577444, + "rewards/format_reward": 0.7916666828095913, + "step": 167 + }, + { + "advantage_max": 1.516417846083641, + "advantage_mean": -2.0489097085629737e-08, + "advantage_min": -1.0871713608503342, + "advantage_std": 0.9998452961444855, + "completion_length": 2492.3333892822266, + "epoch": 0.192, + "grad_norm": 0.2922627925872803, + "kl": 0.0047931671142578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0002, + "reward": 0.35394715797156096, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35394715797156096, + "reward_after_std": 0.9185864347964525, + "reward_before_mean": 0.5737202242016792, + "reward_before_std": 0.9552944917231798, + "reward_change_max": 0.0001585930585861206, + "reward_change_mean": -0.21977307926863432, + "reward_change_min": -0.43790194019675255, + "reward_change_std": 0.1791673693805933, + "reward_std": 0.9185864515602589, + "rewards/cosine_scaled_reward": 0.005610108375549316, + "rewards/format_reward": 0.5625000093132257, + "step": 168 + }, + { + "advantage_max": 1.5727615356445312, + "advantage_mean": -1.1051694892572073e-07, + "advantage_min": -1.1902239173650742, + "advantage_std": 0.9998266100883484, + "completion_length": 1229.208381652832, + "epoch": 0.19314285714285714, + "grad_norm": 0.25364622473716736, + "kl": 0.005054473876953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0002, + "reward": 1.235820960253477, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.235820960253477, + "reward_after_std": 0.6339393146336079, + "reward_before_mean": 1.6583482660353184, + "reward_before_std": 0.5706211663782597, + "reward_change_max": 0.0, + "reward_change_mean": -0.42252730391919613, + "reward_change_min": -0.599956326186657, + "reward_change_std": 0.23817383870482445, + "reward_std": 0.6339393258094788, + "rewards/cosine_scaled_reward": 0.35000744462013245, + "rewards/format_reward": 0.9583333432674408, + "step": 169 + }, + { + "advantage_max": 1.722038522362709, + "advantage_mean": 1.2169281882190575e-07, + "advantage_min": -0.9152258113026619, + "advantage_std": 0.9997827783226967, + "completion_length": 2099.750026702881, + "epoch": 0.19428571428571428, + "grad_norm": 0.21759958565235138, + "kl": 0.005249977111816406, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0002, + "reward": 0.35102659091353416, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35102659091353416, + "reward_after_std": 0.647955346852541, + "reward_before_mean": 0.5838488005101681, + "reward_before_std": 0.6074068415910006, + "reward_change_max": 0.0, + "reward_change_mean": -0.23282210575416684, + "reward_change_min": -0.42266797088086605, + "reward_change_std": 0.1610524570569396, + "reward_std": 0.6479553729295731, + "rewards/cosine_scaled_reward": 0.00025770440697669983, + "rewards/format_reward": 0.5833333395421505, + "step": 170 + }, + { + "advantage_max": 1.50984887778759, + "advantage_mean": 2.8560560361157172e-08, + "advantage_min": -1.1283286213874817, + "advantage_std": 0.9997936487197876, + "completion_length": 2158.2708740234375, + "epoch": 0.19542857142857142, + "grad_norm": 0.230439692735672, + "kl": 0.0041656494140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0002, + "reward": 0.31339962780475616, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.31339962780475616, + "reward_after_std": 0.6719086579978466, + "reward_before_mean": 0.5419529378414154, + "reward_before_std": 0.7005261778831482, + "reward_change_max": 0.00029709935188293457, + "reward_change_mean": -0.22855328931473196, + "reward_change_min": -0.42273118533194065, + "reward_change_std": 0.16405913210473955, + "reward_std": 0.6719086859375238, + "rewards/cosine_scaled_reward": -0.020690208300948143, + "rewards/format_reward": 0.5833333358168602, + "step": 171 + }, + { + "advantage_max": 1.504933387041092, + "advantage_mean": -3.10440865236572e-08, + "advantage_min": -1.4171021580696106, + "advantage_std": 0.9998073801398277, + "completion_length": 2092.9166984558105, + "epoch": 0.19657142857142856, + "grad_norm": 0.2690483331680298, + "kl": 0.0072803497314453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0003, + "reward": 0.5798506364226341, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5798506364226341, + "reward_after_std": 0.6986503172665834, + "reward_before_mean": 0.8605402838438749, + "reward_before_std": 0.6976092047989368, + "reward_change_max": 0.00021129846572875977, + "reward_change_mean": -0.2806896660476923, + "reward_change_min": -0.435400377959013, + "reward_change_std": 0.1796800745651126, + "reward_std": 0.6986503265798092, + "rewards/cosine_scaled_reward": 0.10735346004366875, + "rewards/format_reward": 0.6458333432674408, + "step": 172 + }, + { + "advantage_max": 1.7530633360147476, + "advantage_mean": 1.7384688688615313e-08, + "advantage_min": -0.8142209053039551, + "advantage_std": 0.9996874928474426, + "completion_length": 1415.9583587646484, + "epoch": 0.1977142857142857, + "grad_norm": 0.32314759492874146, + "kl": 0.0056514739990234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0002, + "reward": 0.18319927039556205, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18319927039556205, + "reward_after_std": 0.6402694256976247, + "reward_before_mean": 0.3805189239792526, + "reward_before_std": 0.619793354999274, + "reward_change_max": 0.0, + "reward_change_mean": -0.19731965800747275, + "reward_change_min": -0.35540652461349964, + "reward_change_std": 0.13836060231551528, + "reward_std": 0.6402694452553988, + "rewards/cosine_scaled_reward": -0.19515720537674497, + "rewards/format_reward": 0.7708333358168602, + "step": 173 + }, + { + "advantage_max": 1.6947945654392242, + "advantage_mean": -5.5879355587151736e-09, + "advantage_min": -1.1162981167435646, + "advantage_std": 0.9998343512415886, + "completion_length": 1546.520896911621, + "epoch": 0.19885714285714284, + "grad_norm": 0.24973110854625702, + "kl": 0.006852149963378906, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0003, + "reward": 0.447128068190068, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.447128068190068, + "reward_after_std": 0.7851445488631725, + "reward_before_mean": 0.6881698596407659, + "reward_before_std": 0.7564616817981005, + "reward_change_max": 0.00030616670846939087, + "reward_change_mean": -0.24104176089167595, + "reward_change_min": -0.38989388942718506, + "reward_change_std": 0.1559823602437973, + "reward_std": 0.7851445563137531, + "rewards/cosine_scaled_reward": -0.07258176291361451, + "rewards/format_reward": 0.8333333432674408, + "step": 174 + }, + { + "advantage_max": 1.6422483325004578, + "advantage_mean": -2.561137124601487e-08, + "advantage_min": -1.2339301407337189, + "advantage_std": 0.9998004734516144, + "completion_length": 2264.791717529297, + "epoch": 0.2, + "grad_norm": 0.24177344143390656, + "kl": 0.0054874420166015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0002, + "reward": 0.5153841646388173, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5153841646388173, + "reward_after_std": 0.608320277184248, + "reward_before_mean": 0.786954928888008, + "reward_before_std": 0.5843202453106642, + "reward_change_max": 0.0006819292902946472, + "reward_change_mean": -0.2715707626193762, + "reward_change_min": -0.4168048519641161, + "reward_change_std": 0.1677942699752748, + "reward_std": 0.6083202883601189, + "rewards/cosine_scaled_reward": 0.08097745012491941, + "rewards/format_reward": 0.6250000093132257, + "step": 175 + }, + { + "advantage_max": 1.5760944485664368, + "advantage_mean": -2.8560559250934148e-08, + "advantage_min": -1.1510667353868484, + "advantage_std": 0.9998167455196381, + "completion_length": 1851.9375228881836, + "epoch": 0.20114285714285715, + "grad_norm": 0.2656446397304535, + "kl": 0.0049724578857421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0002, + "reward": 0.49231592612341046, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.49231592612341046, + "reward_after_std": 0.8085062839090824, + "reward_before_mean": 0.7481458615511656, + "reward_before_std": 0.8251261785626411, + "reward_change_max": 6.766617298126221e-05, + "reward_change_mean": -0.25582992658019066, + "reward_change_min": -0.42961229756474495, + "reward_change_std": 0.18010017089545727, + "reward_std": 0.8085063360631466, + "rewards/cosine_scaled_reward": -0.011343751102685928, + "rewards/format_reward": 0.770833345130086, + "step": 176 + }, + { + "advantage_max": 1.6836175918579102, + "advantage_mean": -5.712111839084599e-08, + "advantage_min": -1.1872343942523003, + "advantage_std": 0.9998055696487427, + "completion_length": 2073.1667098999023, + "epoch": 0.2022857142857143, + "grad_norm": 0.26497936248779297, + "kl": 0.00461578369140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0002, + "reward": 0.5681694131344557, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5681694131344557, + "reward_after_std": 0.6743853017687798, + "reward_before_mean": 0.841622298117727, + "reward_before_std": 0.6278502456843853, + "reward_change_max": 0.0, + "reward_change_mean": -0.27345291804522276, + "reward_change_min": -0.40037749521434307, + "reward_change_std": 0.15704370383173227, + "reward_std": 0.6743853129446507, + "rewards/cosine_scaled_reward": 0.04581115394830704, + "rewards/format_reward": 0.7500000093132257, + "step": 177 + }, + { + "advantage_max": 1.4844506978988647, + "advantage_mean": -8.071462553882469e-09, + "advantage_min": -1.1723910346627235, + "advantage_std": 0.999858170747757, + "completion_length": 2002.2083740234375, + "epoch": 0.20342857142857143, + "grad_norm": 0.33194902539253235, + "kl": 0.008449554443359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0003, + "reward": 0.42172466265037656, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.42172466265037656, + "reward_after_std": 0.8564918860793114, + "reward_before_mean": 0.6597280632704496, + "reward_before_std": 0.8828203082084656, + "reward_change_max": 0.0, + "reward_change_mean": -0.23800339736044407, + "reward_change_min": -0.4743681848049164, + "reward_change_std": 0.18404243979603052, + "reward_std": 0.856491930782795, + "rewards/cosine_scaled_reward": -0.0034693063935264945, + "rewards/format_reward": 0.6666666753590107, + "step": 178 + }, + { + "advantage_max": 1.6311480104923248, + "advantage_mean": -3.352761296371298e-08, + "advantage_min": -1.1216321215033531, + "advantage_std": 0.9997258484363556, + "completion_length": 2185.187530517578, + "epoch": 0.20457142857142857, + "grad_norm": 0.28444021940231323, + "kl": 0.0072917938232421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0003, + "reward": -0.058215420227497816, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.058215420227497816, + "reward_after_std": 0.4829340185970068, + "reward_before_mean": 0.10054558701813221, + "reward_before_std": 0.474730771034956, + "reward_change_max": 0.0004728659987449646, + "reward_change_mean": -0.15876100910827518, + "reward_change_min": -0.27880000323057175, + "reward_change_std": 0.11060563754290342, + "reward_std": 0.4829340223222971, + "rewards/cosine_scaled_reward": -0.24139389023184776, + "rewards/format_reward": 0.5833333395421505, + "step": 179 + }, + { + "advantage_max": 1.6618616580963135, + "advantage_mean": 6.208817238118058e-09, + "advantage_min": -1.071367308497429, + "advantage_std": 0.9998577609658241, + "completion_length": 1255.833381652832, + "epoch": 0.2057142857142857, + "grad_norm": 0.30001556873321533, + "kl": 0.0062732696533203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0003, + "reward": 1.1591643691062927, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.1591643691062927, + "reward_after_std": 0.8961893171072006, + "reward_before_mean": 1.5456229411065578, + "reward_before_std": 0.849966811016202, + "reward_change_max": 0.0, + "reward_change_mean": -0.3864585403352976, + "reward_change_min": -0.6112453117966652, + "reward_change_std": 0.23929630033671856, + "reward_std": 0.8961893357336521, + "rewards/cosine_scaled_reward": 0.3144781200680882, + "rewards/format_reward": 0.9166666716337204, + "step": 180 + }, + { + "advantage_max": 1.6392957419157028, + "advantage_mean": -6.146728892542086e-08, + "advantage_min": -1.0734733194112778, + "advantage_std": 0.9997735172510147, + "completion_length": 2378.6667098999023, + "epoch": 0.20685714285714285, + "grad_norm": 0.25959938764572144, + "kl": 0.00646209716796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0003, + "reward": 0.23480892833322287, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.23480892833322287, + "reward_after_std": 0.6075701769441366, + "reward_before_mean": 0.44690791331231594, + "reward_before_std": 0.5976003762334585, + "reward_change_max": 0.00015873461961746216, + "reward_change_mean": -0.2120990320108831, + "reward_change_min": -0.40071258693933487, + "reward_change_std": 0.15138957416638732, + "reward_std": 0.6075701992958784, + "rewards/cosine_scaled_reward": -0.0890460480004549, + "rewards/format_reward": 0.6250000074505806, + "step": 181 + }, + { + "advantage_max": 1.6973135769367218, + "advantage_mean": -1.2417631367611648e-09, + "advantage_min": -1.1010829582810402, + "advantage_std": 0.9998375773429871, + "completion_length": 1855.06254196167, + "epoch": 0.208, + "grad_norm": 0.17940671741962433, + "kl": 0.0037069320678710938, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0001, + "reward": 0.4569451562128961, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4569451562128961, + "reward_after_std": 0.7582255005836487, + "reward_before_mean": 0.7031884212046862, + "reward_before_std": 0.7348581291735172, + "reward_change_max": 0.0003945082426071167, + "reward_change_mean": -0.2462432268075645, + "reward_change_min": -0.41578804701566696, + "reward_change_std": 0.16094863507896662, + "reward_std": 0.7582255192101002, + "rewards/cosine_scaled_reward": -0.0025724750012159348, + "rewards/format_reward": 0.7083333414047956, + "step": 182 + }, + { + "advantage_max": 1.6113858073949814, + "advantage_mean": -6.332993829349931e-08, + "advantage_min": -1.0258197262883186, + "advantage_std": 0.9998429715633392, + "completion_length": 1424.5000457763672, + "epoch": 0.20914285714285713, + "grad_norm": 0.29113176465034485, + "kl": 0.008054733276367188, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0003, + "reward": 0.6616654456593096, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.6616654456593096, + "reward_after_std": 0.7637336738407612, + "reward_before_mean": 0.9540510524529964, + "reward_before_std": 0.7442143596708775, + "reward_change_max": 0.0002907067537307739, + "reward_change_mean": -0.2923855986446142, + "reward_change_min": -0.5103072412312031, + "reward_change_std": 0.19355090707540512, + "reward_std": 0.7637336924672127, + "rewards/cosine_scaled_reward": 0.04994217213243246, + "rewards/format_reward": 0.8541666716337204, + "step": 183 + }, + { + "advantage_max": 1.524808943271637, + "advantage_mean": -3.911554946611773e-08, + "advantage_min": -1.3263940066099167, + "advantage_std": 0.9998024180531502, + "completion_length": 1542.6875457763672, + "epoch": 0.2102857142857143, + "grad_norm": 0.2789245843887329, + "kl": 0.006900787353515625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0003, + "reward": 0.3295530015602708, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3295530015602708, + "reward_after_std": 0.6151285581290722, + "reward_before_mean": 0.5630174519028515, + "reward_before_std": 0.6208750866353512, + "reward_change_max": 0.0004479065537452698, + "reward_change_mean": -0.2334644440561533, + "reward_change_min": -0.39546626433730125, + "reward_change_std": 0.15795165114104748, + "reward_std": 0.6151285581290722, + "rewards/cosine_scaled_reward": -0.13515795394778252, + "rewards/format_reward": 0.8333333432674408, + "step": 184 + }, + { + "advantage_max": 1.7052653282880783, + "advantage_mean": -1.5522042928761692e-08, + "advantage_min": -0.8801075518131256, + "advantage_std": 0.9997084736824036, + "completion_length": 1614.5834045410156, + "epoch": 0.21142857142857144, + "grad_norm": 0.25438177585601807, + "kl": 0.005855560302734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0002, + "reward": 0.16461107577197254, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16461107577197254, + "reward_after_std": 0.5498490314930677, + "reward_before_mean": 0.36692870780825615, + "reward_before_std": 0.5359976505860686, + "reward_change_max": 0.0002930089831352234, + "reward_change_mean": -0.20231764344498515, + "reward_change_min": -0.3774881921708584, + "reward_change_std": 0.13948537409305573, + "reward_std": 0.5498490333557129, + "rewards/cosine_scaled_reward": -0.21236898249480873, + "rewards/format_reward": 0.7916666697710752, + "step": 185 + }, + { + "advantage_max": 1.589905396103859, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -1.0278316587209702, + "advantage_std": 0.999826580286026, + "completion_length": 1909.6667175292969, + "epoch": 0.21257142857142858, + "grad_norm": 0.20909112691879272, + "kl": 0.0068912506103515625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0003, + "reward": 0.43765153270214796, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.43765153270214796, + "reward_after_std": 0.7173998765647411, + "reward_before_mean": 0.6869608238339424, + "reward_before_std": 0.7174548655748367, + "reward_change_max": 0.00025334954261779785, + "reward_change_mean": -0.24930927250534296, + "reward_change_min": -0.4509042240679264, + "reward_change_std": 0.1700389552861452, + "reward_std": 0.7173998989164829, + "rewards/cosine_scaled_reward": 0.010147074237465858, + "rewards/format_reward": 0.6666666716337204, + "step": 186 + }, + { + "advantage_max": 1.6517557352781296, + "advantage_mean": 5.2154067287091266e-08, + "advantage_min": -1.1060415133833885, + "advantage_std": 0.9997224509716034, + "completion_length": 1636.2916870117188, + "epoch": 0.21371428571428572, + "grad_norm": 0.24228616058826447, + "kl": 0.006427764892578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0003, + "reward": 0.30055883899331093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30055883899331093, + "reward_after_std": 0.39326165057718754, + "reward_before_mean": 0.5415985980071127, + "reward_before_std": 0.35194770991802216, + "reward_change_max": 0.00021256506443023682, + "reward_change_mean": -0.24103973433375359, + "reward_change_min": -0.35694490373134613, + "reward_change_std": 0.13662862265482545, + "reward_std": 0.3932616636157036, + "rewards/cosine_scaled_reward": -0.12503403797745705, + "rewards/format_reward": 0.791666679084301, + "step": 187 + }, + { + "advantage_max": 1.55316960811615, + "advantage_mean": 6.8296996946770605e-09, + "advantage_min": -1.3121953383088112, + "advantage_std": 0.9997300058603287, + "completion_length": 2161.750045776367, + "epoch": 0.21485714285714286, + "grad_norm": 0.20883195102214813, + "kl": 0.0077056884765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0003, + "reward": 0.03851320035755634, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03851320035755634, + "reward_after_std": 0.47359400801360607, + "reward_before_mean": 0.22079682408366352, + "reward_before_std": 0.47062006406486034, + "reward_change_max": 6.236881017684937e-05, + "reward_change_mean": -0.1822836329229176, + "reward_change_min": -0.32229095324873924, + "reward_change_std": 0.12202021991834044, + "reward_std": 0.47359402664005756, + "rewards/cosine_scaled_reward": -0.1916849333792925, + "rewards/format_reward": 0.6041666753590107, + "step": 188 + }, + { + "advantage_max": 1.7091278731822968, + "advantage_mean": -2.7939677238464355e-09, + "advantage_min": -1.0550750717520714, + "advantage_std": 0.9997809454798698, + "completion_length": 1451.9792022705078, + "epoch": 0.216, + "grad_norm": 0.2504546642303467, + "kl": 0.007526397705078125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0003, + "reward": 0.36948077380657196, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.36948077380657196, + "reward_after_std": 0.6774904727935791, + "reward_before_mean": 0.6019853809848428, + "reward_before_std": 0.6514282152056694, + "reward_change_max": 0.0, + "reward_change_mean": -0.23250459227710962, + "reward_change_min": -0.39352178759872913, + "reward_change_std": 0.14997949916869402, + "reward_std": 0.6774904876947403, + "rewards/cosine_scaled_reward": -0.1260906618554145, + "rewards/format_reward": 0.8541666753590107, + "step": 189 + }, + { + "advantage_max": 1.5744300931692123, + "advantage_mean": -1.83160115962977e-08, + "advantage_min": -1.0411207303404808, + "advantage_std": 0.9998467117547989, + "completion_length": 1512.3333892822266, + "epoch": 0.21714285714285714, + "grad_norm": 0.2728053331375122, + "kl": 0.008083343505859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0003, + "reward": 0.5598836690187454, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5598836690187454, + "reward_after_std": 0.7893565334379673, + "reward_before_mean": 0.8278303791303188, + "reward_before_std": 0.7781634032726288, + "reward_change_max": 0.0, + "reward_change_mean": -0.2679467387497425, + "reward_change_min": -0.4867997542023659, + "reward_change_std": 0.1817149631679058, + "reward_std": 0.7893565557897091, + "rewards/cosine_scaled_reward": -0.013168148114345968, + "rewards/format_reward": 0.854166679084301, + "step": 190 + }, + { + "advantage_max": 1.7405344098806381, + "advantage_mean": -3.47693762670076e-08, + "advantage_min": -0.9780000820755959, + "advantage_std": 0.9997963458299637, + "completion_length": 951.4375305175781, + "epoch": 0.21828571428571428, + "grad_norm": 0.2622581124305725, + "kl": 0.0057964324951171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0002, + "reward": 0.864835481159389, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.864835481159389, + "reward_after_std": 0.8386146035045385, + "reward_before_mean": 1.192215159535408, + "reward_before_std": 0.7938100956380367, + "reward_change_max": 0.0, + "reward_change_mean": -0.32737973518669605, + "reward_change_min": -0.5288929119706154, + "reward_change_std": 0.20689752884209156, + "reward_std": 0.8386146258562803, + "rewards/cosine_scaled_reward": 0.11694091919343919, + "rewards/format_reward": 0.9583333432674408, + "step": 191 + }, + { + "advantage_max": 1.6680810749530792, + "advantage_mean": -2.297262446937509e-08, + "advantage_min": -1.0754312574863434, + "advantage_std": 0.9998167529702187, + "completion_length": 1985.270866394043, + "epoch": 0.21942857142857142, + "grad_norm": 0.23176267743110657, + "kl": 0.007137298583984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0003, + "reward": 0.240308852866292, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.240308852866292, + "reward_after_std": 0.7073502019047737, + "reward_before_mean": 0.4445103667676449, + "reward_before_std": 0.6949526704847813, + "reward_change_max": 0.0, + "reward_change_mean": -0.20420153997838497, + "reward_change_min": -0.384893910959363, + "reward_change_std": 0.1380018894560635, + "reward_std": 0.7073502317070961, + "rewards/cosine_scaled_reward": -0.12149481847882271, + "rewards/format_reward": 0.6875000186264515, + "step": 192 + }, + { + "advantage_max": 1.7411630600690842, + "advantage_mean": 3.1044145032410597e-10, + "advantage_min": -0.9968269243836403, + "advantage_std": 0.9998300522565842, + "completion_length": 2419.7084045410156, + "epoch": 0.22057142857142858, + "grad_norm": 0.20773284137248993, + "kl": 0.0078582763671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0003, + "reward": 0.06769353523850441, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06769353523850441, + "reward_after_std": 0.819042269140482, + "reward_before_mean": 0.22256720066070557, + "reward_before_std": 0.7970810793340206, + "reward_change_max": 0.0009588897228240967, + "reward_change_mean": -0.15487368637695909, + "reward_change_min": -0.27775100991129875, + "reward_change_std": 0.1097005819901824, + "reward_std": 0.8190423138439655, + "rewards/cosine_scaled_reward": -0.15954973798943684, + "rewards/format_reward": 0.5416666753590107, + "step": 193 + }, + { + "advantage_max": 1.6247856467962265, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.9195460751652718, + "advantage_std": 0.9998468160629272, + "completion_length": 2124.458366394043, + "epoch": 0.22171428571428572, + "grad_norm": 0.22982855141162872, + "kl": 0.007961273193359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0003, + "reward": 0.6637103334069252, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6637103334069252, + "reward_after_std": 0.898862686008215, + "reward_before_mean": 0.9483567178249359, + "reward_before_std": 0.8963012248277664, + "reward_change_max": 0.00025325268507003784, + "reward_change_mean": -0.2846463564783335, + "reward_change_min": -0.5547180697321892, + "reward_change_std": 0.2137763760983944, + "reward_std": 0.8988627195358276, + "rewards/cosine_scaled_reward": 0.09917834028601646, + "rewards/format_reward": 0.7500000074505806, + "step": 194 + }, + { + "advantage_max": 1.7464525401592255, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.8979768976569176, + "advantage_std": 0.9997522979974747, + "completion_length": 1192.8125076293945, + "epoch": 0.22285714285714286, + "grad_norm": 0.22150519490242004, + "kl": 0.0071315765380859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0003, + "reward": 0.5498414165340364, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5498414165340364, + "reward_after_std": 0.548849031329155, + "reward_before_mean": 0.8287271652370691, + "reward_before_std": 0.4871873203665018, + "reward_change_max": 0.0, + "reward_change_mean": -0.27888575568795204, + "reward_change_min": -0.4258766584098339, + "reward_change_std": 0.155844459310174, + "reward_std": 0.5488490350544453, + "rewards/cosine_scaled_reward": -0.07521975645795465, + "rewards/format_reward": 0.9791666716337204, + "step": 195 + }, + { + "advantage_max": 1.5998663306236267, + "advantage_mean": -5.3395830201807826e-08, + "advantage_min": -1.0904487520456314, + "advantage_std": 0.9998405128717422, + "completion_length": 2543.1875610351562, + "epoch": 0.224, + "grad_norm": 0.24054710566997528, + "kl": 0.008991241455078125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0004, + "reward": 0.3771222997456789, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3771222997456789, + "reward_after_std": 0.7970558479428291, + "reward_before_mean": 0.607517649885267, + "reward_before_std": 0.7950811851769686, + "reward_change_max": 2.668052911758423e-05, + "reward_change_mean": -0.23039534082636237, + "reward_change_min": -0.435683973133564, + "reward_change_std": 0.1760418750345707, + "reward_std": 0.7970558591187, + "rewards/cosine_scaled_reward": -0.03999119042418897, + "rewards/format_reward": 0.687500013038516, + "step": 196 + }, + { + "advantage_max": 1.7193616777658463, + "advantage_mean": -1.769512969485021e-08, + "advantage_min": -1.0334468707442284, + "advantage_std": 0.9998365789651871, + "completion_length": 1056.9375457763672, + "epoch": 0.22514285714285714, + "grad_norm": 0.2736615240573883, + "kl": 0.006931304931640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0003, + "reward": 0.6537089729681611, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6537089729681611, + "reward_after_std": 0.7846135124564171, + "reward_before_mean": 0.9389471057802439, + "reward_before_std": 0.7512768171727657, + "reward_change_max": 0.0, + "reward_change_mean": -0.28523814491927624, + "reward_change_min": -0.4611331969499588, + "reward_change_std": 0.17364253383129835, + "reward_std": 0.7846135310828686, + "rewards/cosine_scaled_reward": 0.011140207760035992, + "rewards/format_reward": 0.9166666679084301, + "step": 197 + }, + { + "advantage_max": 1.5378694832324982, + "advantage_mean": -1.986821485111534e-08, + "advantage_min": -1.0649435445666313, + "advantage_std": 0.9998499155044556, + "completion_length": 1377.5000228881836, + "epoch": 0.22628571428571428, + "grad_norm": 0.2656911313533783, + "kl": 0.0081787109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0003, + "reward": 0.7236167434602976, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7236167434602976, + "reward_after_std": 0.8226776085793972, + "reward_before_mean": 1.0276835672557354, + "reward_before_std": 0.8301276378333569, + "reward_change_max": 0.0, + "reward_change_mean": -0.30406678281724453, + "reward_change_min": -0.5162977389991283, + "reward_change_std": 0.20391633734107018, + "reward_std": 0.822677630931139, + "rewards/cosine_scaled_reward": 0.04509176965802908, + "rewards/format_reward": 0.9375000074505806, + "step": 198 + }, + { + "advantage_max": 1.7326484769582748, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.9086687192320824, + "advantage_std": 0.9997498765587807, + "completion_length": 1354.2292098999023, + "epoch": 0.22742857142857142, + "grad_norm": 0.24720071256160736, + "kl": 0.007568359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0003, + "reward": 0.24219063017517328, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24219063017517328, + "reward_after_std": 0.48400944843888283, + "reward_before_mean": 0.46279463171958923, + "reward_before_std": 0.44461123645305634, + "reward_change_max": 0.0, + "reward_change_mean": -0.22060398757457733, + "reward_change_min": -0.36466255225241184, + "reward_change_std": 0.12946867663413286, + "reward_std": 0.4840094521641731, + "rewards/cosine_scaled_reward": -0.2477693718392402, + "rewards/format_reward": 0.9583333432674408, + "step": 199 + }, + { + "advantage_max": 1.6117542684078217, + "advantage_mean": -6.208818126296478e-09, + "advantage_min": -1.0823587998747826, + "advantage_std": 0.999851405620575, + "completion_length": 1368.333396911621, + "epoch": 0.22857142857142856, + "grad_norm": 0.20899228751659393, + "kl": 0.007480621337890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.75e-07, + "loss": 0.0003, + "reward": 0.7081074807792902, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7081074807792902, + "reward_after_std": 0.8072471618652344, + "reward_before_mean": 1.0080002145841718, + "reward_before_std": 0.8053440153598785, + "reward_change_max": 0.0, + "reward_change_mean": -0.2998927291482687, + "reward_change_min": -0.5310935415327549, + "reward_change_std": 0.19609952345490456, + "reward_std": 0.8072471879422665, + "rewards/cosine_scaled_reward": 0.04566674306988716, + "rewards/format_reward": 0.9166666716337204, + "step": 200 + }, + { + "advantage_max": 1.5719723999500275, + "advantage_mean": -7.823109804405703e-08, + "advantage_min": -1.1847958788275719, + "advantage_std": 0.9998306035995483, + "completion_length": 1690.8958435058594, + "epoch": 0.2297142857142857, + "grad_norm": 0.2967695891857147, + "kl": 0.007305145263671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0003, + "reward": 0.9961107671260834, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9961107671260834, + "reward_after_std": 0.7360798977315426, + "reward_before_mean": 1.3610759600996971, + "reward_before_std": 0.7041471414268017, + "reward_change_max": 0.0, + "reward_change_mean": -0.36496521160006523, + "reward_change_min": -0.5439505577087402, + "reward_change_std": 0.220447919331491, + "reward_std": 0.7360799051821232, + "rewards/cosine_scaled_reward": 0.2742879637517035, + "rewards/format_reward": 0.8125000074505806, + "step": 201 + }, + { + "advantage_max": 1.7172722667455673, + "advantage_mean": -8.381903465748408e-08, + "advantage_min": -1.0472783595323563, + "advantage_std": 0.999752514064312, + "completion_length": 1256.3750305175781, + "epoch": 0.23085714285714284, + "grad_norm": 0.23032940924167633, + "kl": 0.00678253173828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0003, + "reward": 0.95308491459582, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.95308491459582, + "reward_after_std": 0.47053003683686256, + "reward_before_mean": 1.3243922591209412, + "reward_before_std": 0.36816588416695595, + "reward_change_max": 0.0, + "reward_change_mean": -0.37130735348910093, + "reward_change_min": -0.5054810401052237, + "reward_change_std": 0.2038074992597103, + "reward_std": 0.47053005918860435, + "rewards/cosine_scaled_reward": 0.2351127788424492, + "rewards/format_reward": 0.8541666716337204, + "step": 202 + }, + { + "advantage_max": 1.7689315378665924, + "advantage_mean": -5.587935669737476e-09, + "advantage_min": -1.1071807369589806, + "advantage_std": 0.9998086988925934, + "completion_length": 1289.8750305175781, + "epoch": 0.232, + "grad_norm": 0.2538374960422516, + "kl": 0.009387969970703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0004, + "reward": 0.6541607966646552, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6541607966646552, + "reward_after_std": 0.61843141913414, + "reward_before_mean": 0.947968315333128, + "reward_before_std": 0.5485080145299435, + "reward_change_max": 0.0002954155206680298, + "reward_change_mean": -0.29380753077566624, + "reward_change_min": -0.4340146593749523, + "reward_change_std": 0.16420075949281454, + "reward_std": 0.6184314265847206, + "rewards/cosine_scaled_reward": 0.01565080275759101, + "rewards/format_reward": 0.916666679084301, + "step": 203 + }, + { + "advantage_max": 1.4909850060939789, + "advantage_mean": -2.6077032311278003e-08, + "advantage_min": -1.2436881214380264, + "advantage_std": 0.999848447740078, + "completion_length": 1297.0417098999023, + "epoch": 0.23314285714285715, + "grad_norm": 0.35677286982536316, + "kl": 0.008518218994140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0003, + "reward": 0.6019344963133335, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6019344963133335, + "reward_after_std": 0.8923456855118275, + "reward_before_mean": 0.8748259395360947, + "reward_before_std": 0.9185019806027412, + "reward_change_max": 0.0012612491846084595, + "reward_change_mean": -0.2728914525359869, + "reward_change_min": -0.5382020473480225, + "reward_change_std": 0.20347497053444386, + "reward_std": 0.8923456855118275, + "rewards/cosine_scaled_reward": -0.010503708384931087, + "rewards/format_reward": 0.8958333432674408, + "step": 204 + }, + { + "advantage_max": 1.611358642578125, + "advantage_mean": -8.53712394111028e-08, + "advantage_min": -1.0590423047542572, + "advantage_std": 0.9998035132884979, + "completion_length": 1398.5625381469727, + "epoch": 0.2342857142857143, + "grad_norm": 0.26555773615837097, + "kl": 0.006938934326171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0003, + "reward": 0.9240311346948147, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9240311346948147, + "reward_after_std": 0.6977498419582844, + "reward_before_mean": 1.275739498436451, + "reward_before_std": 0.6501687755808234, + "reward_change_max": 0.0, + "reward_change_mean": -0.3517083413898945, + "reward_change_min": -0.565269511193037, + "reward_change_std": 0.21987238712608814, + "reward_std": 0.697749849408865, + "rewards/cosine_scaled_reward": 0.2003697256441228, + "rewards/format_reward": 0.8750000074505806, + "step": 205 + }, + { + "advantage_max": 1.7118752002716064, + "advantage_mean": 6.208816905051151e-09, + "advantage_min": -0.9464834704995155, + "advantage_std": 0.9997338354587555, + "completion_length": 2159.041702270508, + "epoch": 0.23542857142857143, + "grad_norm": 0.2459092140197754, + "kl": 0.00711822509765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0003, + "reward": -0.07530895713716745, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07530895713716745, + "reward_after_std": 0.4827368911355734, + "reward_before_mean": 0.07829463575035334, + "reward_before_std": 0.4613391198217869, + "reward_change_max": 0.0008892938494682312, + "reward_change_mean": -0.153603594051674, + "reward_change_min": -0.2621854990720749, + "reward_change_std": 0.10310475202277303, + "reward_std": 0.4827369023114443, + "rewards/cosine_scaled_reward": -0.2941860295832157, + "rewards/format_reward": 0.6666666716337204, + "step": 206 + }, + { + "advantage_max": 1.7646289765834808, + "advantage_mean": 3.1044089521259366e-09, + "advantage_min": -0.9611957967281342, + "advantage_std": 0.9998351857066154, + "completion_length": 1640.5625610351562, + "epoch": 0.23657142857142857, + "grad_norm": 0.3210693895816803, + "kl": 0.012020111083984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0005, + "reward": 0.3758093472570181, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3758093472570181, + "reward_after_std": 0.7883238419890404, + "reward_before_mean": 0.5995778944343328, + "reward_before_std": 0.7552689164876938, + "reward_change_max": 0.0, + "reward_change_mean": -0.223768537864089, + "reward_change_min": -0.38106095790863037, + "reward_change_std": 0.14635274559259415, + "reward_std": 0.7883238643407822, + "rewards/cosine_scaled_reward": -0.10646106884814799, + "rewards/format_reward": 0.8125000074505806, + "step": 207 + }, + { + "advantage_max": 1.5686787962913513, + "advantage_mean": 2.110997954218874e-08, + "advantage_min": -1.2580528557300568, + "advantage_std": 0.9997912496328354, + "completion_length": 1009.8125534057617, + "epoch": 0.2377142857142857, + "grad_norm": 0.2531468868255615, + "kl": 0.00777435302734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0003, + "reward": 0.6899872645735741, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6899872645735741, + "reward_after_std": 0.5505052581429482, + "reward_before_mean": 1.0031409859657288, + "reward_before_std": 0.5190521031618118, + "reward_change_max": 0.0, + "reward_change_mean": -0.31315369717776775, + "reward_change_min": -0.48251595720648766, + "reward_change_std": 0.1780382813885808, + "reward_std": 0.5505052730441093, + "rewards/cosine_scaled_reward": 0.011987147852778435, + "rewards/format_reward": 0.9791666716337204, + "step": 208 + }, + { + "advantage_max": 1.3751467913389206, + "advantage_mean": -2.297262435835279e-08, + "advantage_min": -1.2741581797599792, + "advantage_std": 0.9998413994908333, + "completion_length": 1126.4166946411133, + "epoch": 0.23885714285714285, + "grad_norm": 0.3193022608757019, + "kl": 0.009975433349609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0004, + "reward": 0.6218211939558387, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6218211939558387, + "reward_after_std": 0.7225112542510033, + "reward_before_mean": 0.9142632093280554, + "reward_before_std": 0.7535572983324528, + "reward_change_max": 0.000842459499835968, + "reward_change_mean": -0.2924419930204749, + "reward_change_min": -0.5019077956676483, + "reward_change_std": 0.1989587116986513, + "reward_std": 0.7225112766027451, + "rewards/cosine_scaled_reward": 0.019631581380963326, + "rewards/format_reward": 0.8750000149011612, + "step": 209 + }, + { + "advantage_max": 1.652031123638153, + "advantage_mean": -9.499490866149429e-08, + "advantage_min": -1.114754095673561, + "advantage_std": 0.9998092278838158, + "completion_length": 1526.333366394043, + "epoch": 0.24, + "grad_norm": 0.23487278819084167, + "kl": 0.006862640380859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0003, + "reward": 0.5745669873431325, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5745669873431325, + "reward_after_std": 0.6210574917495251, + "reward_before_mean": 0.8549545677378774, + "reward_before_std": 0.5745654674246907, + "reward_change_max": 2.635270357131958e-05, + "reward_change_mean": -0.2803875617682934, + "reward_change_min": -0.4581737369298935, + "reward_change_std": 0.17075308226048946, + "reward_std": 0.6210575066506863, + "rewards/cosine_scaled_reward": -0.010022742673754692, + "rewards/format_reward": 0.8750000111758709, + "step": 210 + }, + { + "advantage_max": 1.7116133570671082, + "advantage_mean": -6.581346634337848e-08, + "advantage_min": -0.9903494343161583, + "advantage_std": 0.9997931122779846, + "completion_length": 1457.3750534057617, + "epoch": 0.24114285714285713, + "grad_norm": 0.2358456403017044, + "kl": 0.00867462158203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0003, + "reward": 0.6318370220251381, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6318370220251381, + "reward_after_std": 0.576850164681673, + "reward_before_mean": 0.9263378567993641, + "reward_before_std": 0.5047197928652167, + "reward_change_max": 0.00010591000318527222, + "reward_change_mean": -0.2945008259266615, + "reward_change_min": -0.4437066949903965, + "reward_change_std": 0.17200073320418596, + "reward_std": 0.576850164681673, + "rewards/cosine_scaled_reward": 0.05691891070455313, + "rewards/format_reward": 0.8125000074505806, + "step": 211 + }, + { + "advantage_max": 1.6534480303525925, + "advantage_mean": -5.836288397009781e-08, + "advantage_min": -1.0853853449225426, + "advantage_std": 0.9998204335570335, + "completion_length": 1065.4375457763672, + "epoch": 0.2422857142857143, + "grad_norm": 0.26351648569107056, + "kl": 0.01288604736328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0005, + "reward": 0.8122778884135187, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8122778884135187, + "reward_after_std": 0.6671516038477421, + "reward_before_mean": 1.1411408353596926, + "reward_before_std": 0.620491236448288, + "reward_change_max": 0.0, + "reward_change_mean": -0.32886295951902866, + "reward_change_min": -0.5069613792002201, + "reward_change_std": 0.19332514610141516, + "reward_std": 0.6671516112983227, + "rewards/cosine_scaled_reward": 0.11223706416785717, + "rewards/format_reward": 0.9166666716337204, + "step": 212 + }, + { + "advantage_max": 1.6381594836711884, + "advantage_mean": -4.8428774657161e-08, + "advantage_min": -1.2275662645697594, + "advantage_std": 0.9998309686779976, + "completion_length": 1396.125015258789, + "epoch": 0.24342857142857144, + "grad_norm": 0.31681808829307556, + "kl": 0.011737823486328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0005, + "reward": 0.7322141584008932, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7322141584008932, + "reward_after_std": 0.7107599079608917, + "reward_before_mean": 1.039572605281137, + "reward_before_std": 0.6669244170188904, + "reward_change_max": 5.4270029067993164e-05, + "reward_change_mean": -0.30735844001173973, + "reward_change_min": -0.4567818343639374, + "reward_change_std": 0.18181548546999693, + "reward_std": 0.7107599154114723, + "rewards/cosine_scaled_reward": 0.0718696154654026, + "rewards/format_reward": 0.8958333432674408, + "step": 213 + }, + { + "advantage_max": 1.562519982457161, + "advantage_mean": -2.7318795670083773e-08, + "advantage_min": -1.0746354907751083, + "advantage_std": 0.9997971132397652, + "completion_length": 1867.5625610351562, + "epoch": 0.24457142857142858, + "grad_norm": 0.3209463059902191, + "kl": 0.01001739501953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0004, + "reward": 0.48180012218654156, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.48180012218654156, + "reward_after_std": 0.6115984097123146, + "reward_before_mean": 0.7484859389369376, + "reward_before_std": 0.6006402317434549, + "reward_change_max": 0.0003897324204444885, + "reward_change_mean": -0.2666857857257128, + "reward_change_min": -0.4256005696952343, + "reward_change_std": 0.1750076524913311, + "reward_std": 0.6115984097123146, + "rewards/cosine_scaled_reward": 0.00965961068868637, + "rewards/format_reward": 0.7291666846722364, + "step": 214 + }, + { + "advantage_max": 1.61798395216465, + "advantage_mean": -8.133550644107146e-08, + "advantage_min": -1.126285158097744, + "advantage_std": 0.9997453913092613, + "completion_length": 1641.9167098999023, + "epoch": 0.24571428571428572, + "grad_norm": 0.25225481390953064, + "kl": 0.0077953338623046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0003, + "reward": 0.3076944574713707, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3076944574713707, + "reward_after_std": 0.4532122015953064, + "reward_before_mean": 0.5455555971711874, + "reward_before_std": 0.431146040558815, + "reward_change_max": 0.0002174675464630127, + "reward_change_mean": -0.23786117020063102, + "reward_change_min": -0.3689110726118088, + "reward_change_std": 0.1408306835219264, + "reward_std": 0.4532122276723385, + "rewards/cosine_scaled_reward": -0.12305555492639542, + "rewards/format_reward": 0.791666679084301, + "step": 215 + }, + { + "advantage_max": 1.705009326338768, + "advantage_mean": -9.126961747485396e-08, + "advantage_min": -1.099523849785328, + "advantage_std": 0.9997986108064651, + "completion_length": 902.7916793823242, + "epoch": 0.24685714285714286, + "grad_norm": 0.2886558175086975, + "kl": 0.008966445922851562, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0004, + "reward": 0.8277550789935049, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8277550789935049, + "reward_after_std": 0.6353770308196545, + "reward_before_mean": 1.1602430697530508, + "reward_before_std": 0.5769399423152208, + "reward_change_max": 0.0, + "reward_change_mean": -0.33248797059059143, + "reward_change_min": -0.5088830962777138, + "reward_change_std": 0.19062372762709856, + "reward_std": 0.6353770382702351, + "rewards/cosine_scaled_reward": 0.10095484089106321, + "rewards/format_reward": 0.9583333358168602, + "step": 216 + }, + { + "advantage_max": 1.7508010566234589, + "advantage_mean": -2.607703353252333e-08, + "advantage_min": -0.8888709247112274, + "advantage_std": 0.9998551979660988, + "completion_length": 1312.458381652832, + "epoch": 0.248, + "grad_norm": 0.261865496635437, + "kl": 0.009029388427734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0004, + "reward": 0.793455844046548, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.793455844046548, + "reward_after_std": 0.8174287676811218, + "reward_before_mean": 1.1041301563382149, + "reward_before_std": 0.7625340297818184, + "reward_change_max": 0.0, + "reward_change_mean": -0.3106742948293686, + "reward_change_min": -0.5126793663948774, + "reward_change_std": 0.18933186866343021, + "reward_std": 0.8174287900328636, + "rewards/cosine_scaled_reward": 0.12498173583298922, + "rewards/format_reward": 0.8541666679084301, + "step": 217 + }, + { + "advantage_max": 1.6184561103582382, + "advantage_mean": -1.4280280402623191e-08, + "advantage_min": -1.0892015025019646, + "advantage_std": 0.9997873827815056, + "completion_length": 1377.6250305175781, + "epoch": 0.24914285714285714, + "grad_norm": 0.237302765250206, + "kl": 0.007476806640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0003, + "reward": 0.2374113779515028, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.2374113779515028, + "reward_after_std": 0.5709048807621002, + "reward_before_mean": 0.4522370882332325, + "reward_before_std": 0.5512584503740072, + "reward_change_max": 0.0008431896567344666, + "reward_change_mean": -0.2148257028311491, + "reward_change_min": -0.3685059826821089, + "reward_change_std": 0.13741043116897345, + "reward_std": 0.570904903113842, + "rewards/cosine_scaled_reward": -0.21138146799057722, + "rewards/format_reward": 0.8750000111758709, + "step": 218 + }, + { + "advantage_max": 1.6508960127830505, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -1.0674656257033348, + "advantage_std": 0.9998109415173531, + "completion_length": 1250.1666870117188, + "epoch": 0.2502857142857143, + "grad_norm": 0.30771803855895996, + "kl": 0.008953094482421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0004, + "reward": 0.7107078991830349, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.7107078991830349, + "reward_after_std": 0.6987728551030159, + "reward_before_mean": 1.0161756947636604, + "reward_before_std": 0.6660973466932774, + "reward_change_max": 0.0, + "reward_change_mean": -0.3054677518084645, + "reward_change_min": -0.48597782105207443, + "reward_change_std": 0.18221312388777733, + "reward_std": 0.6987728625535965, + "rewards/cosine_scaled_reward": 0.08100447617471218, + "rewards/format_reward": 0.8541666716337204, + "step": 219 + }, + { + "advantage_max": 1.6507864892482758, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.8715488091111183, + "advantage_std": 0.9997882917523384, + "completion_length": 1461.0208644866943, + "epoch": 0.25142857142857145, + "grad_norm": 0.30054739117622375, + "kl": 0.010099411010742188, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0004, + "reward": 0.08521711453795433, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08521711453795433, + "reward_after_std": 0.5252517238259315, + "reward_before_mean": 0.2699965760111809, + "reward_before_std": 0.5070017725229263, + "reward_change_max": 0.0002897605299949646, + "reward_change_mean": -0.18477945029735565, + "reward_change_min": -0.34365820325911045, + "reward_change_std": 0.12291183322668076, + "reward_std": 0.5252517312765121, + "rewards/cosine_scaled_reward": -0.29208505246788263, + "rewards/format_reward": 0.8541666697710752, + "step": 220 + }, + { + "advantage_max": 1.7164071947336197, + "advantage_mean": -9.93410742555767e-09, + "advantage_min": -0.881681602448225, + "advantage_std": 0.9997705519199371, + "completion_length": 1046.5833702087402, + "epoch": 0.25257142857142856, + "grad_norm": 0.26651760935783386, + "kl": 0.00786590576171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0003, + "reward": 0.6971848933026195, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6971848933026195, + "reward_after_std": 0.5582109466195107, + "reward_before_mean": 1.0091250874102116, + "reward_before_std": 0.49160440918058157, + "reward_change_max": 0.0, + "reward_change_mean": -0.31194019317626953, + "reward_change_min": -0.48183613270521164, + "reward_change_std": 0.1836419040337205, + "reward_std": 0.558210976421833, + "rewards/cosine_scaled_reward": 0.03581253904849291, + "rewards/format_reward": 0.9375000074505806, + "step": 221 + }, + { + "advantage_max": 1.6569068133831024, + "advantage_mean": -3.104408563547878e-08, + "advantage_min": -0.9789787083864212, + "advantage_std": 0.9998027682304382, + "completion_length": 1323.1875381469727, + "epoch": 0.2537142857142857, + "grad_norm": 0.28783875703811646, + "kl": 0.009412765502929688, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0004, + "reward": 0.7466136773582548, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7466136773582548, + "reward_after_std": 0.57162756472826, + "reward_before_mean": 1.0669129355810583, + "reward_before_std": 0.5085346233099699, + "reward_change_max": 0.0, + "reward_change_mean": -0.3202992007136345, + "reward_change_min": -0.5050487257540226, + "reward_change_std": 0.18606600351631641, + "reward_std": 0.5716275870800018, + "rewards/cosine_scaled_reward": 0.11678976844996214, + "rewards/format_reward": 0.8333333358168602, + "step": 222 + }, + { + "advantage_max": 1.6416790187358856, + "advantage_mean": -5.960464510845753e-08, + "advantage_min": -1.1050887554883957, + "advantage_std": 0.9997776672244072, + "completion_length": 1424.4167098999023, + "epoch": 0.25485714285714284, + "grad_norm": 0.23706035315990448, + "kl": 0.008235931396484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0003, + "reward": 0.46483216737397015, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.46483216737397015, + "reward_after_std": 0.5388064533472061, + "reward_before_mean": 0.7305796891450882, + "reward_before_std": 0.508102111518383, + "reward_change_max": 0.0, + "reward_change_mean": -0.2657475220039487, + "reward_change_min": -0.42238375917077065, + "reward_change_std": 0.16147217992693186, + "reward_std": 0.5388064719736576, + "rewards/cosine_scaled_reward": -0.0513768270611763, + "rewards/format_reward": 0.8333333432674408, + "step": 223 + }, + { + "advantage_max": 1.5843409299850464, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -1.094870388507843, + "advantage_std": 0.9998336359858513, + "completion_length": 1769.8125610351562, + "epoch": 0.256, + "grad_norm": 0.22389104962348938, + "kl": 0.00800323486328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0003, + "reward": 0.426870440132916, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.426870440132916, + "reward_after_std": 0.7609815746545792, + "reward_before_mean": 0.6692265486344695, + "reward_before_std": 0.755431704223156, + "reward_change_max": 0.0, + "reward_change_mean": -0.24235611967742443, + "reward_change_min": -0.4466691426932812, + "reward_change_std": 0.16528659500181675, + "reward_std": 0.760981597006321, + "rewards/cosine_scaled_reward": -0.07163673074683174, + "rewards/format_reward": 0.8125000074505806, + "step": 224 + }, + { + "advantage_max": 1.5787824094295502, + "advantage_mean": -1.6142924552653426e-08, + "advantage_min": -1.0859878808259964, + "advantage_std": 0.9998409599065781, + "completion_length": 1509.2916946411133, + "epoch": 0.2571428571428571, + "grad_norm": 0.31962674856185913, + "kl": 0.013401031494140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0005, + "reward": 0.4917536824941635, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4917536824941635, + "reward_after_std": 0.7545221112668514, + "reward_before_mean": 0.7470511943101883, + "reward_before_std": 0.7450950369238853, + "reward_change_max": 0.0, + "reward_change_mean": -0.2552974782884121, + "reward_change_min": -0.4263688549399376, + "reward_change_std": 0.16516633983701468, + "reward_std": 0.7545221336185932, + "rewards/cosine_scaled_reward": -0.053557755425572395, + "rewards/format_reward": 0.8541666753590107, + "step": 225 + }, + { + "advantage_max": 1.488643042743206, + "advantage_mean": -5.525847351917079e-08, + "advantage_min": -1.2618045508861542, + "advantage_std": 0.999822273850441, + "completion_length": 1414.9792098999023, + "epoch": 0.2582857142857143, + "grad_norm": 0.21461425721645355, + "kl": 0.008453369140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0003, + "reward": 0.906906258314848, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.906906258314848, + "reward_after_std": 0.7242296598851681, + "reward_before_mean": 1.2583957947790623, + "reward_before_std": 0.7252499852329493, + "reward_change_max": 0.0, + "reward_change_mean": -0.35148957930505276, + "reward_change_min": -0.5474782064557076, + "reward_change_std": 0.221489024348557, + "reward_std": 0.7242296785116196, + "rewards/cosine_scaled_reward": 0.17086457274854183, + "rewards/format_reward": 0.9166666865348816, + "step": 226 + }, + { + "advantage_max": 1.657107725739479, + "advantage_mean": 6.208814573582799e-10, + "advantage_min": -1.0568899437785149, + "advantage_std": 0.999826692044735, + "completion_length": 1037.1667022705078, + "epoch": 0.25942857142857145, + "grad_norm": 0.30343934893608093, + "kl": 0.011219024658203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0004, + "reward": 0.5653025805950165, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5653025805950165, + "reward_after_std": 0.7990770377218723, + "reward_before_mean": 0.829374760389328, + "reward_before_std": 0.7766488343477249, + "reward_change_max": 0.0, + "reward_change_mean": -0.2640721816569567, + "reward_change_min": -0.4902346208691597, + "reward_change_std": 0.1744185872375965, + "reward_std": 0.7990770451724529, + "rewards/cosine_scaled_reward": -0.07489595795050263, + "rewards/format_reward": 0.9791666716337204, + "step": 227 + }, + { + "advantage_max": 1.6345582455396652, + "advantage_mean": -1.552204320631745e-08, + "advantage_min": -1.091313198208809, + "advantage_std": 0.9997934699058533, + "completion_length": 1175.6875190734863, + "epoch": 0.26057142857142856, + "grad_norm": 0.2735670506954193, + "kl": 0.008899688720703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0004, + "reward": 0.7679368201643229, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7679368201643229, + "reward_after_std": 0.5992216616868973, + "reward_before_mean": 1.0934365428984165, + "reward_before_std": 0.5540211275219917, + "reward_change_max": 0.00032895803451538086, + "reward_change_mean": -0.3254997171461582, + "reward_change_min": -0.5017792768776417, + "reward_change_std": 0.18997229263186455, + "reward_std": 0.5992216691374779, + "rewards/cosine_scaled_reward": 0.07796826213598251, + "rewards/format_reward": 0.9375000074505806, + "step": 228 + }, + { + "advantage_max": 1.6010667532682419, + "advantage_mean": -1.1796752796833232e-08, + "advantage_min": -1.0798330828547478, + "advantage_std": 0.9998021498322487, + "completion_length": 1383.2708892822266, + "epoch": 0.26171428571428573, + "grad_norm": 0.3279111087322235, + "kl": 0.01459503173828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0006, + "reward": 0.5010130619630218, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5010130619630218, + "reward_after_std": 0.5620199963450432, + "reward_before_mean": 0.7702241754159331, + "reward_before_std": 0.5329751931130886, + "reward_change_max": 0.00018850713968276978, + "reward_change_mean": -0.2692110911011696, + "reward_change_min": -0.45008396729826927, + "reward_change_std": 0.15964961983263493, + "reward_std": 0.5620200261473656, + "rewards/cosine_scaled_reward": -0.052387919276952744, + "rewards/format_reward": 0.8750000074505806, + "step": 229 + }, + { + "advantage_max": 1.7147899568080902, + "advantage_mean": -2.1109978876054925e-08, + "advantage_min": -1.0958684533834457, + "advantage_std": 0.9997913986444473, + "completion_length": 1725.1875457763672, + "epoch": 0.26285714285714284, + "grad_norm": 0.2207302302122116, + "kl": 0.009916305541992188, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0004, + "reward": 0.09772306028753519, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09772306028753519, + "reward_after_std": 0.6152992285788059, + "reward_before_mean": 0.2763853659853339, + "reward_before_std": 0.5864697769284248, + "reward_change_max": 4.410743713378906e-05, + "reward_change_mean": -0.1786623066291213, + "reward_change_min": -0.29143037647008896, + "reward_change_std": 0.11186036374419928, + "reward_std": 0.6152992323040962, + "rewards/cosine_scaled_reward": -0.257640658528544, + "rewards/format_reward": 0.7916666846722364, + "step": 230 + }, + { + "advantage_max": 1.649023875594139, + "advantage_mean": -5.650023726655462e-08, + "advantage_min": -1.1833641976118088, + "advantage_std": 0.9998235106468201, + "completion_length": 1206.6875534057617, + "epoch": 0.264, + "grad_norm": 0.2692098915576935, + "kl": 0.0087738037109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0004, + "reward": 0.7332293977960944, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7332293977960944, + "reward_after_std": 0.6610595881938934, + "reward_before_mean": 1.045009451918304, + "reward_before_std": 0.6180670075118542, + "reward_change_max": 0.0, + "reward_change_mean": -0.3117800485342741, + "reward_change_min": -0.5020957123488188, + "reward_change_std": 0.18412381410598755, + "reward_std": 0.6610595881938934, + "rewards/cosine_scaled_reward": 0.0641713603399694, + "rewards/format_reward": 0.9166666716337204, + "step": 231 + }, + { + "advantage_max": 1.7163164764642715, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.9364407062530518, + "advantage_std": 0.9998013451695442, + "completion_length": 1682.9375305175781, + "epoch": 0.2651428571428571, + "grad_norm": 0.22369278967380524, + "kl": 0.010684967041015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0004, + "reward": 0.34763477742671967, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34763477742671967, + "reward_after_std": 0.6970238201320171, + "reward_before_mean": 0.5743373781442642, + "reward_before_std": 0.6692356579005718, + "reward_change_max": 0.00011929869651794434, + "reward_change_mean": -0.22670260351151228, + "reward_change_min": -0.4087493382394314, + "reward_change_std": 0.14438875764608383, + "reward_std": 0.6970238536596298, + "rewards/cosine_scaled_reward": -0.1399146532639861, + "rewards/format_reward": 0.8541666716337204, + "step": 232 + }, + { + "advantage_max": 1.7902098149061203, + "advantage_mean": -4.097819406023717e-08, + "advantage_min": -0.9070549011230469, + "advantage_std": 0.9997976720333099, + "completion_length": 923.9792098999023, + "epoch": 0.2662857142857143, + "grad_norm": 0.2506750226020813, + "kl": 0.006481170654296875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0003, + "reward": 0.4007737059146166, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4007737059146166, + "reward_after_std": 0.6145709976553917, + "reward_before_mean": 0.6410532779991627, + "reward_before_std": 0.5568097867071629, + "reward_change_max": 0.0, + "reward_change_mean": -0.24027959816157818, + "reward_change_min": -0.38382837176322937, + "reward_change_std": 0.13632231950759888, + "reward_std": 0.6145710311830044, + "rewards/cosine_scaled_reward": -0.17947336845099926, + "rewards/format_reward": 1.0, + "step": 233 + }, + { + "advantage_max": 1.7843565493822098, + "advantage_mean": 2.6077034309679448e-08, + "advantage_min": -0.914627306163311, + "advantage_std": 0.9997813999652863, + "completion_length": 1496.5416946411133, + "epoch": 0.2674285714285714, + "grad_norm": 0.2820890545845032, + "kl": 0.011810302734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0005, + "reward": 0.39785597764421254, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.39785597764421254, + "reward_after_std": 0.6381666585803032, + "reward_before_mean": 0.6381850223988295, + "reward_before_std": 0.5803314950317144, + "reward_change_max": 0.0, + "reward_change_mean": -0.24032901134341955, + "reward_change_min": -0.38727836683392525, + "reward_change_std": 0.145506224129349, + "reward_std": 0.6381666623055935, + "rewards/cosine_scaled_reward": -0.09757416089996696, + "rewards/format_reward": 0.8333333432674408, + "step": 234 + }, + { + "advantage_max": 1.8626025319099426, + "advantage_mean": -5.463759256141287e-08, + "advantage_min": -0.8086363673210144, + "advantage_std": 0.9998244345188141, + "completion_length": 1031.7917022705078, + "epoch": 0.26857142857142857, + "grad_norm": 0.26428404450416565, + "kl": 0.00981903076171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0004, + "reward": 0.9631862174719572, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9631862174719572, + "reward_after_std": 0.7271666266024113, + "reward_before_mean": 1.3131061717867851, + "reward_before_std": 0.6203845215495676, + "reward_change_max": 0.0, + "reward_change_mean": -0.34991992451250553, + "reward_change_min": -0.5234523415565491, + "reward_change_std": 0.19631488993763924, + "reward_std": 0.727166660130024, + "rewards/cosine_scaled_reward": 0.16696972399950027, + "rewards/format_reward": 0.9791666716337204, + "step": 235 + }, + { + "advantage_max": 1.6912491768598557, + "advantage_mean": -4.3461718668424965e-09, + "advantage_min": -1.0445576757192612, + "advantage_std": 0.9997703358530998, + "completion_length": 1854.4167022705078, + "epoch": 0.26971428571428574, + "grad_norm": 0.24529314041137695, + "kl": 0.012115478515625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0005, + "reward": 0.22261589765548706, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22261589765548706, + "reward_after_std": 0.6593184359371662, + "reward_before_mean": 0.42628917563706636, + "reward_before_std": 0.6346053350716829, + "reward_change_max": 0.0, + "reward_change_mean": -0.20367325004190207, + "reward_change_min": -0.37755058892071247, + "reward_change_std": 0.13592194765806198, + "reward_std": 0.6593184620141983, + "rewards/cosine_scaled_reward": -0.1514387633651495, + "rewards/format_reward": 0.7291666679084301, + "step": 236 + }, + { + "advantage_max": 1.6201680451631546, + "advantage_mean": -3.104408596854569e-08, + "advantage_min": -1.2312413528561592, + "advantage_std": 0.9998116791248322, + "completion_length": 1428.4791870117188, + "epoch": 0.27085714285714285, + "grad_norm": 0.21748077869415283, + "kl": 0.008556365966796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0003, + "reward": 0.6938337534666061, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6938337534666061, + "reward_after_std": 0.6575784459710121, + "reward_before_mean": 0.9981541857123375, + "reward_before_std": 0.6225257329642773, + "reward_change_max": 0.0, + "reward_change_mean": -0.30432047322392464, + "reward_change_min": -0.4677902590483427, + "reward_change_std": 0.1763472305610776, + "reward_std": 0.6575784794986248, + "rewards/cosine_scaled_reward": 0.061577089596539736, + "rewards/format_reward": 0.875, + "step": 237 + }, + { + "advantage_max": 1.7355145364999771, + "advantage_mean": -8.07146262049585e-08, + "advantage_min": -0.9608321115374565, + "advantage_std": 0.9998087286949158, + "completion_length": 1263.1042251586914, + "epoch": 0.272, + "grad_norm": 0.25904324650764465, + "kl": 0.01227569580078125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0005, + "reward": 0.7409389466047287, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7409389466047287, + "reward_after_std": 0.8139899540692568, + "reward_before_mean": 1.042520135641098, + "reward_before_std": 0.7730010617524385, + "reward_change_max": 0.000684693455696106, + "reward_change_mean": -0.3015812076628208, + "reward_change_min": -0.5144244804978371, + "reward_change_std": 0.19731996580958366, + "reward_std": 0.8139900006353855, + "rewards/cosine_scaled_reward": 0.06292672269046307, + "rewards/format_reward": 0.9166666716337204, + "step": 238 + }, + { + "advantage_max": 1.5609101951122284, + "advantage_mean": -7.047007721805443e-08, + "advantage_min": -1.255035139620304, + "advantage_std": 0.9998224526643753, + "completion_length": 1285.2292137145996, + "epoch": 0.27314285714285713, + "grad_norm": 0.2311781495809555, + "kl": 0.00722503662109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0003, + "reward": 1.021848929580301, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 1.021848929580301, + "reward_after_std": 0.7617662567645311, + "reward_before_mean": 1.3911364481318742, + "reward_before_std": 0.7432506419718266, + "reward_change_max": 0.00010582804679870605, + "reward_change_mean": -0.3692875001579523, + "reward_change_min": -0.5409754365682602, + "reward_change_std": 0.21951518952846527, + "reward_std": 0.7617662828415632, + "rewards/cosine_scaled_reward": 0.27890151739120483, + "rewards/format_reward": 0.833333333954215, + "step": 239 + }, + { + "advantage_max": 1.8245199620723724, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.8752617910504341, + "advantage_std": 0.9997757226228714, + "completion_length": 1043.7708587646484, + "epoch": 0.2742857142857143, + "grad_norm": 0.33578115701675415, + "kl": 0.010406494140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0004, + "reward": 0.2590970569290221, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2590970569290221, + "reward_after_std": 0.5167682282626629, + "reward_before_mean": 0.4774673692882061, + "reward_before_std": 0.4506372455507517, + "reward_change_max": 0.0, + "reward_change_mean": -0.21837032958865166, + "reward_change_min": -0.32546042278409004, + "reward_change_std": 0.11699494253844023, + "reward_std": 0.5167682506144047, + "rewards/cosine_scaled_reward": -0.2404329781420529, + "rewards/format_reward": 0.9583333432674408, + "step": 240 + }, + { + "advantage_max": 1.6402872800827026, + "advantage_mean": -2.4214386995513593e-08, + "advantage_min": -1.0937683582305908, + "advantage_std": 0.9998233839869499, + "completion_length": 1367.1666870117188, + "epoch": 0.2754285714285714, + "grad_norm": 0.27239882946014404, + "kl": 0.01021575927734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0004, + "reward": 0.358948964625597, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.358948964625597, + "reward_after_std": 0.6905892789363861, + "reward_before_mean": 0.5892646436113864, + "reward_before_std": 0.6633242294192314, + "reward_change_max": 0.0001598149538040161, + "reward_change_mean": -0.2303156852722168, + "reward_change_min": -0.394557923078537, + "reward_change_std": 0.14510553609579802, + "reward_std": 0.6905892826616764, + "rewards/cosine_scaled_reward": -0.17411768180318177, + "rewards/format_reward": 0.9375000074505806, + "step": 241 + }, + { + "advantage_max": 1.710006132721901, + "advantage_mean": 2.1730860000346297e-08, + "advantage_min": -0.9204581826925278, + "advantage_std": 0.9996551722288132, + "completion_length": 946.6041870117188, + "epoch": 0.2765714285714286, + "grad_norm": 0.33036869764328003, + "kl": 0.014711380004882812, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0006, + "reward": 0.4683981789276004, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4683981789276004, + "reward_after_std": 0.4530638074502349, + "reward_before_mean": 0.7394059164216742, + "reward_before_std": 0.39871928084176034, + "reward_change_max": 0.0, + "reward_change_mean": -0.2710077129304409, + "reward_change_min": -0.42335289902985096, + "reward_change_std": 0.1545707117766142, + "reward_std": 0.4530638186261058, + "rewards/cosine_scaled_reward": -0.11988039966672659, + "rewards/format_reward": 0.9791666716337204, + "step": 242 + }, + { + "advantage_max": 1.7547654956579208, + "advantage_mean": -3.7252904094842165e-08, + "advantage_min": -0.8856014385819435, + "advantage_std": 0.9998505413532257, + "completion_length": 1448.6875228881836, + "epoch": 0.2777142857142857, + "grad_norm": 0.21732625365257263, + "kl": 0.008548736572265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0003, + "reward": 0.6196297630667686, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6196297630667686, + "reward_after_std": 0.7986881732940674, + "reward_before_mean": 0.8930200412869453, + "reward_before_std": 0.7416506707668304, + "reward_change_max": 0.0, + "reward_change_mean": -0.2733902661129832, + "reward_change_min": -0.4578908830881119, + "reward_change_std": 0.16526594944298267, + "reward_std": 0.7986881770193577, + "rewards/cosine_scaled_reward": 0.009009993635118008, + "rewards/format_reward": 0.875, + "step": 243 + }, + { + "advantage_max": 1.6024657785892487, + "advantage_mean": -4.284083965355734e-08, + "advantage_min": -1.1661089807748795, + "advantage_std": 0.9998447969555855, + "completion_length": 1410.9792098999023, + "epoch": 0.27885714285714286, + "grad_norm": 0.25495943427085876, + "kl": 0.007928848266601562, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0003, + "reward": 0.8061085338704288, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8061085338704288, + "reward_after_std": 0.7023773118853569, + "reward_before_mean": 1.131754757836461, + "reward_before_std": 0.6597034148871899, + "reward_change_max": 0.0, + "reward_change_mean": -0.32564621791243553, + "reward_change_min": -0.515510767698288, + "reward_change_std": 0.19546143896877766, + "reward_std": 0.7023773454129696, + "rewards/cosine_scaled_reward": 0.08671068772673607, + "rewards/format_reward": 0.9583333432674408, + "step": 244 + }, + { + "advantage_max": 1.6460980474948883, + "advantage_mean": -2.980232349791834e-08, + "advantage_min": -1.046926312148571, + "advantage_std": 0.999882735311985, + "completion_length": 1751.062515258789, + "epoch": 0.28, + "grad_norm": 0.2102421373128891, + "kl": 0.008523941040039062, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0003, + "reward": 0.6302961353212595, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6302961353212595, + "reward_after_std": 0.992115680128336, + "reward_before_mean": 0.9001847244799137, + "reward_before_std": 0.9945877194404602, + "reward_change_max": 0.00019932538270950317, + "reward_change_mean": -0.2698885854333639, + "reward_change_min": -0.5097962245345116, + "reward_change_std": 0.19624944310635328, + "reward_std": 0.9921157024800777, + "rewards/cosine_scaled_reward": 0.04384234419558197, + "rewards/format_reward": 0.8125000055879354, + "step": 245 + }, + { + "advantage_max": 1.7886092513799667, + "advantage_mean": 2.8560559139911845e-08, + "advantage_min": -0.8694706782698631, + "advantage_std": 0.9998341947793961, + "completion_length": 1462.0000457763672, + "epoch": 0.28114285714285714, + "grad_norm": 0.23334245383739471, + "kl": 0.012065887451171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0005, + "reward": 0.4586481023579836, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4586481023579836, + "reward_after_std": 0.6987153068184853, + "reward_before_mean": 0.7044544890522957, + "reward_before_std": 0.6329842433333397, + "reward_change_max": 0.0, + "reward_change_mean": -0.24580636341124773, + "reward_change_min": -0.4013601616024971, + "reward_change_std": 0.14217450562864542, + "reward_std": 0.698715329170227, + "rewards/cosine_scaled_reward": -0.08527276385575533, + "rewards/format_reward": 0.8750000111758709, + "step": 246 + }, + { + "advantage_max": 1.689618080854416, + "advantage_mean": -1.117587078436344e-08, + "advantage_min": -1.0913282707333565, + "advantage_std": 0.9997552260756493, + "completion_length": 2003.4375305175781, + "epoch": 0.2822857142857143, + "grad_norm": 0.2768010199069977, + "kl": 0.014162063598632812, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0006, + "reward": 0.13496133871376514, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13496133871376514, + "reward_after_std": 0.5509476698935032, + "reward_before_mean": 0.33065611124038696, + "reward_before_std": 0.5430520595982671, + "reward_change_max": 0.0004723742604255676, + "reward_change_mean": -0.19569478183984756, + "reward_change_min": -0.3195792939513922, + "reward_change_std": 0.13450893759727478, + "reward_std": 0.5509477015584707, + "rewards/cosine_scaled_reward": -0.18883861787617207, + "rewards/format_reward": 0.7083333469927311, + "step": 247 + }, + { + "advantage_max": 1.5921323150396347, + "advantage_mean": -1.3348957372816272e-07, + "advantage_min": -1.2049953117966652, + "advantage_std": 0.9998230114579201, + "completion_length": 1375.2291946411133, + "epoch": 0.2834285714285714, + "grad_norm": 0.32450905442237854, + "kl": 0.012350082397460938, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0005, + "reward": 0.8880419675260782, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8880419675260782, + "reward_after_std": 0.676459863781929, + "reward_before_mean": 1.2321948036551476, + "reward_before_std": 0.6238557770848274, + "reward_change_max": 0.0, + "reward_change_mean": -0.3441528985276818, + "reward_change_min": -0.5215148255228996, + "reward_change_std": 0.2035660557448864, + "reward_std": 0.6764598675072193, + "rewards/cosine_scaled_reward": 0.22026405856013298, + "rewards/format_reward": 0.7916666716337204, + "step": 248 + }, + { + "advantage_max": 1.7209616303443909, + "advantage_mean": -2.8560559028889543e-08, + "advantage_min": -0.9680159725248814, + "advantage_std": 0.9998212978243828, + "completion_length": 1224.8750457763672, + "epoch": 0.2845714285714286, + "grad_norm": 0.26189547777175903, + "kl": 0.0107269287109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0004, + "reward": 0.8255430636927485, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8255430636927485, + "reward_after_std": 0.6577041186392307, + "reward_before_mean": 1.1544302143156528, + "reward_before_std": 0.5823190435767174, + "reward_change_max": 0.0, + "reward_change_mean": -0.3288871766999364, + "reward_change_min": -0.4670917894691229, + "reward_change_std": 0.18136184941977262, + "reward_std": 0.6577041260898113, + "rewards/cosine_scaled_reward": 0.1188817722722888, + "rewards/format_reward": 0.9166666679084301, + "step": 249 + }, + { + "advantage_max": 1.5140611678361893, + "advantage_mean": -1.8005570368018198e-08, + "advantage_min": -1.203395776450634, + "advantage_std": 0.9998101890087128, + "completion_length": 1255.0208473205566, + "epoch": 0.2857142857142857, + "grad_norm": 0.3561646342277527, + "kl": 0.0129852294921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0005, + "reward": 0.5807215161621571, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5807215161621571, + "reward_after_std": 0.762598255649209, + "reward_before_mean": 0.8581455871462822, + "reward_before_std": 0.7728062570095062, + "reward_change_max": 0.0, + "reward_change_mean": -0.27742408588528633, + "reward_change_min": -0.4606732130050659, + "reward_change_std": 0.18659934867173433, + "reward_std": 0.7625982705503702, + "rewards/cosine_scaled_reward": -0.01884387107565999, + "rewards/format_reward": 0.8958333507180214, + "step": 250 + }, + { + "advantage_max": 1.5809199213981628, + "advantage_mean": -8.692343844707295e-09, + "advantage_min": -1.2370038107037544, + "advantage_std": 0.9998120293021202, + "completion_length": 1102.0833740234375, + "epoch": 0.28685714285714287, + "grad_norm": 0.28108564019203186, + "kl": 0.010408401489257812, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0004, + "reward": 0.4316780879162252, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4316780879162252, + "reward_after_std": 0.6830427572131157, + "reward_before_mean": 0.6816601119935513, + "reward_before_std": 0.6788753867149353, + "reward_change_max": 0.0003069266676902771, + "reward_change_mean": -0.2499820338562131, + "reward_change_min": -0.40494656190276146, + "reward_change_std": 0.16739745903760195, + "reward_std": 0.6830427646636963, + "rewards/cosine_scaled_reward": -0.08625328214839101, + "rewards/format_reward": 0.8541666753590107, + "step": 251 + }, + { + "advantage_max": 1.7180158644914627, + "advantage_mean": -4.967053657267684e-09, + "advantage_min": -0.973113164305687, + "advantage_std": 0.9998108670115471, + "completion_length": 1398.2917137145996, + "epoch": 0.288, + "grad_norm": 0.26431670784950256, + "kl": 0.010639190673828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0004, + "reward": 0.4735152288340032, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4735152288340032, + "reward_after_std": 0.6365889646112919, + "reward_before_mean": 0.7305772360414267, + "reward_before_std": 0.5807833485305309, + "reward_change_max": 0.0, + "reward_change_mean": -0.2570619937032461, + "reward_change_min": -0.3992270193994045, + "reward_change_std": 0.15293935127556324, + "reward_std": 0.6365889683365822, + "rewards/cosine_scaled_reward": -0.07221140991896391, + "rewards/format_reward": 0.8750000037252903, + "step": 252 + }, + { + "advantage_max": 1.6413754224777222, + "advantage_mean": 1.862645149230957e-08, + "advantage_min": -0.9406393691897392, + "advantage_std": 0.9998555779457092, + "completion_length": 1747.3333854675293, + "epoch": 0.28914285714285715, + "grad_norm": 0.3222593665122986, + "kl": 0.022695541381835938, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0009, + "reward": 0.41165209421887994, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41165209421887994, + "reward_after_std": 0.8608526214957237, + "reward_before_mean": 0.6432545175775886, + "reward_before_std": 0.8611173704266548, + "reward_change_max": 0.0003280565142631531, + "reward_change_mean": -0.23160241451114416, + "reward_change_min": -0.45648399367928505, + "reward_change_std": 0.16804132983088493, + "reward_std": 0.8608526289463043, + "rewards/cosine_scaled_reward": -0.07420608215034008, + "rewards/format_reward": 0.7916666753590107, + "step": 253 + }, + { + "advantage_max": 1.7923977673053741, + "advantage_mean": -5.4327151444155675e-08, + "advantage_min": -0.9530624970793724, + "advantage_std": 0.9998274743556976, + "completion_length": 1410.0833740234375, + "epoch": 0.29028571428571426, + "grad_norm": 0.3107492923736572, + "kl": 0.013109207153320312, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0005, + "reward": 0.5809852974489331, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5809852974489331, + "reward_after_std": 0.7450582385063171, + "reward_before_mean": 0.8495601508766413, + "reward_before_std": 0.6797804534435272, + "reward_change_max": 0.00035993754863739014, + "reward_change_mean": -0.2685748729854822, + "reward_change_min": -0.42683035880327225, + "reward_change_std": 0.15862839203327894, + "reward_std": 0.7450582459568977, + "rewards/cosine_scaled_reward": -0.03355327108874917, + "rewards/format_reward": 0.9166666716337204, + "step": 254 + }, + { + "advantage_max": 1.7443495839834213, + "advantage_mean": 9.93410742555767e-09, + "advantage_min": -0.9862060695886612, + "advantage_std": 0.9997766390442848, + "completion_length": 1702.7917175292969, + "epoch": 0.2914285714285714, + "grad_norm": 0.26845040917396545, + "kl": 0.015506744384765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0006, + "reward": 0.07089572306722403, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07089572306722403, + "reward_after_std": 0.5136217400431633, + "reward_before_mean": 0.2524457387626171, + "reward_before_std": 0.4779893010854721, + "reward_change_max": 0.0, + "reward_change_mean": -0.1815500007942319, + "reward_change_min": -0.29097018018364906, + "reward_change_std": 0.10849831020459533, + "reward_std": 0.5136217400431633, + "rewards/cosine_scaled_reward": -0.24877713713794947, + "rewards/format_reward": 0.7500000055879354, + "step": 255 + }, + { + "advantage_max": 1.5460007637739182, + "advantage_mean": -9.62366669687853e-09, + "advantage_min": -1.2941532135009766, + "advantage_std": 0.9998073950409889, + "completion_length": 1409.7917022705078, + "epoch": 0.2925714285714286, + "grad_norm": 0.2874889075756073, + "kl": 0.01189422607421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0005, + "reward": 0.634954672306776, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.634954672306776, + "reward_after_std": 0.7295485325157642, + "reward_before_mean": 0.9263447821140289, + "reward_before_std": 0.7339564729481936, + "reward_change_max": 0.0031501948833465576, + "reward_change_mean": -0.2913900911808014, + "reward_change_min": -0.48522017523646355, + "reward_change_std": 0.19647251721471548, + "reward_std": 0.7295485362410545, + "rewards/cosine_scaled_reward": -0.005577614530920982, + "rewards/format_reward": 0.9375000149011612, + "step": 256 + }, + { + "advantage_max": 1.5703508257865906, + "advantage_mean": -1.6608585839961165e-08, + "advantage_min": -1.2059873640537262, + "advantage_std": 0.9998743832111359, + "completion_length": 1937.0417251586914, + "epoch": 0.2937142857142857, + "grad_norm": 0.35278812050819397, + "kl": 0.012939453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0005, + "reward": 0.694405922666192, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.694405922666192, + "reward_after_std": 0.9290311932563782, + "reward_before_mean": 0.9840117041021585, + "reward_before_std": 0.9439280852675438, + "reward_change_max": 0.0, + "reward_change_mean": -0.28960578329861164, + "reward_change_min": -0.5085005983710289, + "reward_change_std": 0.20747256092727184, + "reward_std": 0.9290312454104424, + "rewards/cosine_scaled_reward": 0.07533917389810085, + "rewards/format_reward": 0.8333333358168602, + "step": 257 + }, + { + "advantage_max": 1.7613529562950134, + "advantage_mean": -2.110997909809953e-08, + "advantage_min": -0.9053919687867165, + "advantage_std": 0.9998543411493301, + "completion_length": 1757.7500305175781, + "epoch": 0.2948571428571429, + "grad_norm": 0.25500622391700745, + "kl": 0.01361083984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0005, + "reward": 0.39868373051285744, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.39868373051285744, + "reward_after_std": 0.8846242874860764, + "reward_before_mean": 0.6204931288957596, + "reward_before_std": 0.8531522713601589, + "reward_change_max": 0.00040918588638305664, + "reward_change_mean": -0.22180940210819244, + "reward_change_min": -0.38025897182524204, + "reward_change_std": 0.14975974522531033, + "reward_std": 0.8846242912113667, + "rewards/cosine_scaled_reward": -0.0960034430027008, + "rewards/format_reward": 0.8125000055879354, + "step": 258 + }, + { + "advantage_max": 1.6192731708288193, + "advantage_mean": -5.5879355587151736e-09, + "advantage_min": -1.0627397671341896, + "advantage_std": 0.999801829457283, + "completion_length": 1482.5625610351562, + "epoch": 0.296, + "grad_norm": 0.3482678234577179, + "kl": 0.014739990234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0006, + "reward": 0.7368679717183113, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7368679717183113, + "reward_after_std": 0.6360752396285534, + "reward_before_mean": 1.0532444640994072, + "reward_before_std": 0.5891787149012089, + "reward_change_max": 0.0006304755806922913, + "reward_change_mean": -0.3163764523342252, + "reward_change_min": -0.5091553628444672, + "reward_change_std": 0.1979432748630643, + "reward_std": 0.6360752433538437, + "rewards/cosine_scaled_reward": 0.0787055566906929, + "rewards/format_reward": 0.8958333358168602, + "step": 259 + }, + { + "advantage_max": 1.8113128542900085, + "advantage_mean": -1.8626450382086546e-08, + "advantage_min": -0.8748406283557415, + "advantage_std": 0.9997461587190628, + "completion_length": 1027.979211807251, + "epoch": 0.29714285714285715, + "grad_norm": 0.3253263831138611, + "kl": 0.008440017700195312, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0003, + "reward": 0.8471368737518787, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8471368737518787, + "reward_after_std": 0.558805363252759, + "reward_before_mean": 1.1868980564177036, + "reward_before_std": 0.4591397875919938, + "reward_change_max": 2.481788396835327e-05, + "reward_change_mean": -0.33976118452847004, + "reward_change_min": -0.5015393383800983, + "reward_change_std": 0.19328110944479704, + "reward_std": 0.5588053781539202, + "rewards/cosine_scaled_reward": 0.1142823500558734, + "rewards/format_reward": 0.9583333358168602, + "step": 260 + }, + { + "advantage_max": 1.7223718613386154, + "advantage_mean": -8.84756468089165e-09, + "advantage_min": -1.0590002834796906, + "advantage_std": 0.9997850880026817, + "completion_length": 1965.2500686645508, + "epoch": 0.29828571428571427, + "grad_norm": 0.268317312002182, + "kl": 0.0142059326171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0006, + "reward": 0.20198887400329113, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.20198887400329113, + "reward_after_std": 0.5538261011242867, + "reward_before_mean": 0.4085303219035268, + "reward_before_std": 0.5178539101034403, + "reward_change_max": 0.0, + "reward_change_mean": -0.20654146187007427, + "reward_change_min": -0.3276657313108444, + "reward_change_std": 0.12507850490510464, + "reward_std": 0.5538261160254478, + "rewards/cosine_scaled_reward": -0.12906817917246372, + "rewards/format_reward": 0.6666666679084301, + "step": 261 + }, + { + "advantage_max": 1.5404580384492874, + "advantage_mean": 3.104409063148239e-09, + "advantage_min": -1.210778832435608, + "advantage_std": 0.9997341260313988, + "completion_length": 1730.5208587646484, + "epoch": 0.29942857142857143, + "grad_norm": 0.41677936911582947, + "kl": 0.019683837890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0008, + "reward": 0.08706101775169373, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.08706101775169373, + "reward_after_std": 0.4480132460594177, + "reward_before_mean": 0.27937868889421225, + "reward_before_std": 0.4376929756253958, + "reward_change_max": 0.0, + "reward_change_mean": -0.19231767486780882, + "reward_change_min": -0.3273830972611904, + "reward_change_std": 0.11917602550238371, + "reward_std": 0.4480132479220629, + "rewards/cosine_scaled_reward": -0.2248940011486411, + "rewards/format_reward": 0.7291666716337204, + "step": 262 + }, + { + "advantage_max": 1.6942091435194016, + "advantage_mean": -1.6142925329809543e-08, + "advantage_min": -0.979148268699646, + "advantage_std": 0.9997653514146805, + "completion_length": 1288.770866394043, + "epoch": 0.30057142857142854, + "grad_norm": 0.22912034392356873, + "kl": 0.00693511962890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0003, + "reward": 0.42538353987038136, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.42538353987038136, + "reward_after_std": 0.518636416643858, + "reward_before_mean": 0.681599510833621, + "reward_before_std": 0.4740534070879221, + "reward_change_max": 0.0, + "reward_change_mean": -0.25621596723794937, + "reward_change_min": -0.3982585147023201, + "reward_change_std": 0.14761138334870338, + "reward_std": 0.5186364278197289, + "rewards/cosine_scaled_reward": -0.12795027159154415, + "rewards/format_reward": 0.9375000074505806, + "step": 263 + }, + { + "advantage_max": 1.5913608968257904, + "advantage_mean": -2.2351742678949904e-08, + "advantage_min": -1.1664466261863708, + "advantage_std": 0.9998384416103363, + "completion_length": 1332.9583587646484, + "epoch": 0.3017142857142857, + "grad_norm": 0.28963515162467957, + "kl": 0.010570526123046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0004, + "reward": 0.6335734352469444, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6335734352469444, + "reward_after_std": 0.740757841616869, + "reward_before_mean": 0.9181164689362049, + "reward_before_std": 0.7137619107961655, + "reward_change_max": 0.0, + "reward_change_mean": -0.28454304300248623, + "reward_change_min": -0.4633651450276375, + "reward_change_std": 0.17281420156359673, + "reward_std": 0.7407578490674496, + "rewards/cosine_scaled_reward": -0.009691774845123291, + "rewards/format_reward": 0.9375000149011612, + "step": 264 + }, + { + "advantage_max": 1.5019582211971283, + "advantage_mean": -4.23751784772719e-08, + "advantage_min": -1.2765265554189682, + "advantage_std": 0.9998071640729904, + "completion_length": 1453.1667175292969, + "epoch": 0.3028571428571429, + "grad_norm": 0.3727766275405884, + "kl": 0.0173187255859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0007, + "reward": 0.6399871921166778, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6399871921166778, + "reward_after_std": 0.7324034199118614, + "reward_before_mean": 0.934943444095552, + "reward_before_std": 0.7509109638631344, + "reward_change_max": 0.0006519109010696411, + "reward_change_mean": -0.2949562631547451, + "reward_change_min": -0.5017080642282963, + "reward_change_std": 0.20068126823753119, + "reward_std": 0.7324034459888935, + "rewards/cosine_scaled_reward": -0.0012782979756593704, + "rewards/format_reward": 0.9375000074505806, + "step": 265 + }, + { + "advantage_max": 1.6453713923692703, + "advantage_mean": -4.0667754053203e-08, + "advantage_min": -1.1235066056251526, + "advantage_std": 0.9998007044196129, + "completion_length": 1395.9792251586914, + "epoch": 0.304, + "grad_norm": 0.28933483362197876, + "kl": 0.01245880126953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0005, + "reward": 0.4439257560297847, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4439257560297847, + "reward_after_std": 0.5643942356109619, + "reward_before_mean": 0.703898387029767, + "reward_before_std": 0.5410022251307964, + "reward_change_max": 0.000517331063747406, + "reward_change_mean": -0.2599726375192404, + "reward_change_min": -0.40890974923968315, + "reward_change_std": 0.15803362615406513, + "reward_std": 0.5643942579627037, + "rewards/cosine_scaled_reward": -0.09596748650074005, + "rewards/format_reward": 0.8958333432674408, + "step": 266 + }, + { + "advantage_max": 1.6140058189630508, + "advantage_mean": 1.6653345369377348e-16, + "advantage_min": -1.124117873609066, + "advantage_std": 0.9997650906443596, + "completion_length": 1893.9792022705078, + "epoch": 0.30514285714285716, + "grad_norm": 0.32439637184143066, + "kl": 0.02964019775390625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0012, + "reward": 0.19872340001165867, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.19872340001165867, + "reward_after_std": 0.5362277999520302, + "reward_before_mean": 0.4080928210169077, + "reward_before_std": 0.5116072688251734, + "reward_change_max": 0.0001492425799369812, + "reward_change_mean": -0.20936942659318447, + "reward_change_min": -0.3517877943813801, + "reward_change_std": 0.13033864740282297, + "reward_std": 0.5362278260290623, + "rewards/cosine_scaled_reward": -0.16053693334106356, + "rewards/format_reward": 0.7291666716337204, + "step": 267 + }, + { + "advantage_max": 1.544311910867691, + "advantage_mean": -1.552204320631745e-08, + "advantage_min": -1.1754939407110214, + "advantage_std": 0.9998464211821556, + "completion_length": 1315.9167251586914, + "epoch": 0.3062857142857143, + "grad_norm": 0.5348718762397766, + "kl": 0.02091217041015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0008, + "reward": 0.7074840739369392, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7074840739369392, + "reward_after_std": 0.8638963103294373, + "reward_before_mean": 1.0027257055044174, + "reward_before_std": 0.8752267155796289, + "reward_change_max": 0.0, + "reward_change_mean": -0.29524165019392967, + "reward_change_min": -0.5318833738565445, + "reward_change_std": 0.20659902412444353, + "reward_std": 0.8638963364064693, + "rewards/cosine_scaled_reward": 0.05344618018716574, + "rewards/format_reward": 0.8958333507180214, + "step": 268 + }, + { + "advantage_max": 1.7572058737277985, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -1.0555767640471458, + "advantage_std": 0.9997468441724777, + "completion_length": 1388.0625190734863, + "epoch": 0.30742857142857144, + "grad_norm": 0.3705489933490753, + "kl": 0.013675689697265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0005, + "reward": 0.2904005544260144, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2904005544260144, + "reward_after_std": 0.6595730949193239, + "reward_before_mean": 0.5057724388316274, + "reward_before_std": 0.6225594077259302, + "reward_change_max": 0.0, + "reward_change_mean": -0.2153718858025968, + "reward_change_min": -0.31822727248072624, + "reward_change_std": 0.12597669241949916, + "reward_std": 0.6595731098204851, + "rewards/cosine_scaled_reward": -0.16378045734018087, + "rewards/format_reward": 0.8333333488553762, + "step": 269 + }, + { + "advantage_max": 1.6698594987392426, + "advantage_mean": -5.339582942465171e-08, + "advantage_min": -1.1226786375045776, + "advantage_std": 0.9998543411493301, + "completion_length": 1540.2083587646484, + "epoch": 0.30857142857142855, + "grad_norm": 0.2806699872016907, + "kl": 0.014301300048828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0006, + "reward": 0.8029788322746754, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8029788322746754, + "reward_after_std": 0.9154071733355522, + "reward_before_mean": 1.1118664667010307, + "reward_before_std": 0.9013979975134134, + "reward_change_max": 0.0001629069447517395, + "reward_change_mean": -0.30888766795396805, + "reward_change_min": -0.5338790118694305, + "reward_change_std": 0.20814104191958904, + "reward_std": 0.9154071845114231, + "rewards/cosine_scaled_reward": 0.10801657056435943, + "rewards/format_reward": 0.8958333507180214, + "step": 270 + }, + { + "advantage_max": 1.6810975968837738, + "advantage_mean": -6.332993707225398e-08, + "advantage_min": -1.1149731278419495, + "advantage_std": 0.9998246654868126, + "completion_length": 1321.1250267028809, + "epoch": 0.3097142857142857, + "grad_norm": 0.2993323802947998, + "kl": 0.020111083984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0008, + "reward": 0.8111615749076009, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8111615749076009, + "reward_after_std": 0.6996302008628845, + "reward_before_mean": 1.1366247907280922, + "reward_before_std": 0.6638240143656731, + "reward_change_max": 4.407763481140137e-05, + "reward_change_mean": -0.3254632391035557, + "reward_change_min": -0.4952653609216213, + "reward_change_std": 0.19443896505981684, + "reward_std": 0.6996302269399166, + "rewards/cosine_scaled_reward": 0.13081239815801382, + "rewards/format_reward": 0.8750000055879354, + "step": 271 + }, + { + "advantage_max": 1.5797275602817535, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -1.1289891824126244, + "advantage_std": 0.9998372495174408, + "completion_length": 1758.8750305175781, + "epoch": 0.31085714285714283, + "grad_norm": 0.41097596287727356, + "kl": 0.01924896240234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0008, + "reward": 0.3049433889100328, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3049433889100328, + "reward_after_std": 0.7466313242912292, + "reward_before_mean": 0.5239668264985085, + "reward_before_std": 0.7606498152017593, + "reward_change_max": 0.0008180141448974609, + "reward_change_mean": -0.219023450743407, + "reward_change_min": -0.42113321274518967, + "reward_change_std": 0.16112497728317976, + "reward_std": 0.7466313354671001, + "rewards/cosine_scaled_reward": -0.11301657650619745, + "rewards/format_reward": 0.7500000149011612, + "step": 272 + }, + { + "advantage_max": 1.7094184756278992, + "advantage_mean": -6.70552275927605e-08, + "advantage_min": -1.0091482996940613, + "advantage_std": 0.9998253583908081, + "completion_length": 1344.6875228881836, + "epoch": 0.312, + "grad_norm": 0.27919331192970276, + "kl": 0.016841888427734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0007, + "reward": 0.6978060295805335, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6978060295805335, + "reward_after_std": 0.7023932486772537, + "reward_before_mean": 0.9987507201731205, + "reward_before_std": 0.6539960987865925, + "reward_change_max": 0.0, + "reward_change_mean": -0.3009446784853935, + "reward_change_min": -0.4641147553920746, + "reward_change_std": 0.17846697755157948, + "reward_std": 0.7023932598531246, + "rewards/cosine_scaled_reward": 0.06187533074989915, + "rewards/format_reward": 0.8750000055879354, + "step": 273 + }, + { + "advantage_max": 1.5880873054265976, + "advantage_mean": -1.8626452158443385e-08, + "advantage_min": -1.1842782869935036, + "advantage_std": 0.9998295158147812, + "completion_length": 1051.2292098999023, + "epoch": 0.31314285714285717, + "grad_norm": 0.3718518614768982, + "kl": 0.01607513427734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0006, + "reward": 0.7379360813647509, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7379360813647509, + "reward_after_std": 0.7393338270485401, + "reward_before_mean": 1.045187957584858, + "reward_before_std": 0.7175006531178951, + "reward_change_max": 0.0, + "reward_change_mean": -0.3072518855333328, + "reward_change_min": -0.5100837834179401, + "reward_change_std": 0.1890790481120348, + "reward_std": 0.7393338270485401, + "rewards/cosine_scaled_reward": 0.053843963891267776, + "rewards/format_reward": 0.9375000149011612, + "step": 274 + }, + { + "advantage_max": 1.491246446967125, + "advantage_mean": 6.829698917520943e-09, + "advantage_min": -1.1501619592308998, + "advantage_std": 0.9998340085148811, + "completion_length": 1448.645866394043, + "epoch": 0.3142857142857143, + "grad_norm": 0.34003862738609314, + "kl": 0.022502899169921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.5e-07, + "loss": 0.0009, + "reward": 0.7442736756056547, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7442736756056547, + "reward_after_std": 0.8110382054001093, + "reward_before_mean": 1.0544355604797602, + "reward_before_std": 0.8261386286467314, + "reward_change_max": 0.000140458345413208, + "reward_change_mean": -0.3101618802174926, + "reward_change_min": -0.5264877546578646, + "reward_change_std": 0.20748563203960657, + "reward_std": 0.8110382370650768, + "rewards/cosine_scaled_reward": 0.11055111582390964, + "rewards/format_reward": 0.8333333414047956, + "step": 275 + }, + { + "advantage_max": 1.6783827245235443, + "advantage_mean": -2.8560560139112567e-08, + "advantage_min": -1.0959996059536934, + "advantage_std": 0.9998864680528641, + "completion_length": 1484.4375381469727, + "epoch": 0.31542857142857145, + "grad_norm": 0.35709822177886963, + "kl": 0.02801513671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0011, + "reward": 0.609470259398222, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.609470259398222, + "reward_after_std": 1.0163805298507214, + "reward_before_mean": 0.867477897554636, + "reward_before_std": 1.0112642236053944, + "reward_change_max": 0.0029065459966659546, + "reward_change_mean": -0.2580076390877366, + "reward_change_min": -0.474948413670063, + "reward_change_std": 0.1875058664008975, + "reward_std": 1.0163805782794952, + "rewards/cosine_scaled_reward": 0.05873893201351166, + "rewards/format_reward": 0.7500000204890966, + "step": 276 + }, + { + "advantage_max": 1.6457444429397583, + "advantage_mean": 6.8296987509874896e-09, + "advantage_min": -0.9661366939544678, + "advantage_std": 0.9998582229018211, + "completion_length": 1450.3333587646484, + "epoch": 0.31657142857142856, + "grad_norm": 0.7636739611625671, + "kl": 0.02922821044921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0012, + "reward": 0.437352629378438, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.437352629378438, + "reward_after_std": 0.8048081770539284, + "reward_before_mean": 0.6762080068292562, + "reward_before_std": 0.7981316186487675, + "reward_change_max": 0.00011983513832092285, + "reward_change_mean": -0.23885535076260567, + "reward_change_min": -0.46324050053954124, + "reward_change_std": 0.16672399919480085, + "reward_std": 0.8048081956803799, + "rewards/cosine_scaled_reward": -0.08897935040295124, + "rewards/format_reward": 0.8541666716337204, + "step": 277 + }, + { + "advantage_max": 1.712010532617569, + "advantage_mean": -1.7074247238291207e-08, + "advantage_min": -1.0471594706177711, + "advantage_std": 0.9997750818729401, + "completion_length": 1293.1250495910645, + "epoch": 0.3177142857142857, + "grad_norm": 0.46666115522384644, + "kl": 0.0188751220703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0008, + "reward": 0.8984961975365877, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.8984961975365877, + "reward_after_std": 0.550977161154151, + "reward_before_mean": 1.2493544705212116, + "reward_before_std": 0.46790555119514465, + "reward_change_max": 0.0, + "reward_change_mean": -0.3508582729846239, + "reward_change_min": -0.5096213817596436, + "reward_change_std": 0.1945639243349433, + "reward_std": 0.5509771760553122, + "rewards/cosine_scaled_reward": 0.16634388361126184, + "rewards/format_reward": 0.9166666716337204, + "step": 278 + }, + { + "advantage_max": 1.630677729845047, + "advantage_mean": -1.8160790427046436e-08, + "advantage_min": -1.0016423761844635, + "advantage_std": 0.9998414814472198, + "completion_length": 1697.3750305175781, + "epoch": 0.31885714285714284, + "grad_norm": 0.3881392478942871, + "kl": 0.02802276611328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0011, + "reward": 0.5066275419667363, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5066275419667363, + "reward_after_std": 0.7542143501341343, + "reward_before_mean": 0.7644317261874676, + "reward_before_std": 0.7297962121665478, + "reward_change_max": 0.0014480352401733398, + "reward_change_mean": -0.25780418422073126, + "reward_change_min": -0.4481316953897476, + "reward_change_std": 0.16544347070157528, + "reward_std": 0.7542143575847149, + "rewards/cosine_scaled_reward": -0.044867485761642456, + "rewards/format_reward": 0.854166679084301, + "step": 279 + }, + { + "advantage_max": 1.6418682932853699, + "advantage_mean": -6.8296996946770605e-09, + "advantage_min": -1.1618360579013824, + "advantage_std": 0.9998872727155685, + "completion_length": 1712.520896911621, + "epoch": 0.32, + "grad_norm": 0.4829684793949127, + "kl": 0.03343963623046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0013, + "reward": 0.8820310495793819, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8820310495793819, + "reward_after_std": 0.9785490781068802, + "reward_before_mean": 1.200088301091455, + "reward_before_std": 0.9552184268832207, + "reward_change_max": 0.00046744197607040405, + "reward_change_mean": -0.31805719900876284, + "reward_change_min": -0.519169632345438, + "reward_change_std": 0.20428536739200354, + "reward_std": 0.9785491079092026, + "rewards/cosine_scaled_reward": 0.1833774563856423, + "rewards/format_reward": 0.8333333414047956, + "step": 280 + }, + { + "advantage_max": 1.6659259349107742, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.8772311583161354, + "advantage_std": 0.9998160675168037, + "completion_length": 2214.104232788086, + "epoch": 0.3211428571428571, + "grad_norm": 0.2734784185886383, + "kl": 0.04001617431640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0016, + "reward": 0.018884988501667976, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.018884988501667976, + "reward_after_std": 0.6570006497204304, + "reward_before_mean": 0.1808290034532547, + "reward_before_std": 0.6510476768016815, + "reward_change_max": 0.0, + "reward_change_mean": -0.16194400051608682, + "reward_change_min": -0.3184575643390417, + "reward_change_std": 0.1197090744972229, + "reward_std": 0.6570006608963013, + "rewards/cosine_scaled_reward": -0.2116688375826925, + "rewards/format_reward": 0.6041666697710752, + "step": 281 + }, + { + "advantage_max": 1.646447241306305, + "advantage_mean": -1.8471231655325937e-08, + "advantage_min": -1.2341381013393402, + "advantage_std": 0.9998238310217857, + "completion_length": 1421.708366394043, + "epoch": 0.3222857142857143, + "grad_norm": 0.40240851044654846, + "kl": 0.026641845703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0011, + "reward": 0.5995541553274961, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5995541553274961, + "reward_after_std": 0.6374145857989788, + "reward_before_mean": 0.8840107689611614, + "reward_before_std": 0.5970096457749605, + "reward_change_max": 0.0002270340919494629, + "reward_change_mean": -0.2844566013664007, + "reward_change_min": -0.4448739532381296, + "reward_change_std": 0.17153234407305717, + "reward_std": 0.6374146081507206, + "rewards/cosine_scaled_reward": 0.025338694918900728, + "rewards/format_reward": 0.8333333432674408, + "step": 282 + }, + { + "advantage_max": 1.608649656176567, + "advantage_mean": -2.9802322165650708e-08, + "advantage_min": -1.0181390345096588, + "advantage_std": 0.9998424425721169, + "completion_length": 1923.958381652832, + "epoch": 0.32342857142857145, + "grad_norm": 0.4259525537490845, + "kl": 0.03574371337890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0014, + "reward": 0.5078537920489907, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5078537920489907, + "reward_after_std": 0.8294965587556362, + "reward_before_mean": 0.7620069230906665, + "reward_before_std": 0.8187153935432434, + "reward_change_max": 0.0005861744284629822, + "reward_change_mean": -0.25415316317230463, + "reward_change_min": -0.5123951118439436, + "reward_change_std": 0.189014982432127, + "reward_std": 0.8294965997338295, + "rewards/cosine_scaled_reward": 0.03725345712155104, + "rewards/format_reward": 0.6875000186264515, + "step": 283 + }, + { + "advantage_max": 1.6817447692155838, + "advantage_mean": -3.073364585048921e-08, + "advantage_min": -1.0475571602582932, + "advantage_std": 0.9998420029878616, + "completion_length": 1284.7917175292969, + "epoch": 0.32457142857142857, + "grad_norm": 0.47929516434669495, + "kl": 0.02382659912109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.21744266211809e-07, + "loss": 0.001, + "reward": 0.4919102769345045, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4919102769345045, + "reward_after_std": 0.7489107400178909, + "reward_before_mean": 0.7442134652919776, + "reward_before_std": 0.7232272922992706, + "reward_change_max": 0.0010956302285194397, + "reward_change_mean": -0.25230319052934647, + "reward_change_min": -0.43385446071624756, + "reward_change_std": 0.16482721455395222, + "reward_std": 0.7489107698202133, + "rewards/cosine_scaled_reward": -0.07580995094031096, + "rewards/format_reward": 0.8958333395421505, + "step": 284 + }, + { + "advantage_max": 1.7304245829582214, + "advantage_mean": 9.002785406053704e-09, + "advantage_min": -1.0338216125965118, + "advantage_std": 0.99979517608881, + "completion_length": 1267.0000228881836, + "epoch": 0.32571428571428573, + "grad_norm": 0.3164341449737549, + "kl": 0.045284271240234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0018, + "reward": 0.4816157463937998, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4816157463937998, + "reward_after_std": 0.6492583490908146, + "reward_before_mean": 0.7395215413998812, + "reward_before_std": 0.6058793980628252, + "reward_change_max": 0.0, + "reward_change_mean": -0.2579057849943638, + "reward_change_min": -0.39217982813715935, + "reward_change_std": 0.14816969074308872, + "reward_std": 0.6492583639919758, + "rewards/cosine_scaled_reward": -0.08857256267219782, + "rewards/format_reward": 0.9166666716337204, + "step": 285 + }, + { + "advantage_max": 1.5697939693927765, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -1.0931710600852966, + "advantage_std": 0.9998166635632515, + "completion_length": 1508.7291793823242, + "epoch": 0.32685714285714285, + "grad_norm": 0.3677847385406494, + "kl": 0.036373138427734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0015, + "reward": 0.5672296602278948, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5672296602278948, + "reward_after_std": 0.738838504999876, + "reward_before_mean": 0.8405419194605201, + "reward_before_std": 0.7304916121065617, + "reward_change_max": 0.0, + "reward_change_mean": -0.27331228740513325, + "reward_change_min": -0.4712460860610008, + "reward_change_std": 0.17583474051207304, + "reward_std": 0.7388385497033596, + "rewards/cosine_scaled_reward": 0.014020954258739948, + "rewards/format_reward": 0.8125000074505806, + "step": 286 + }, + { + "advantage_max": 1.6180351376533508, + "advantage_mean": -2.4835272727230517e-09, + "advantage_min": -1.0623664110898972, + "advantage_std": 0.9997753202915192, + "completion_length": 1579.354206085205, + "epoch": 0.328, + "grad_norm": 0.49448496103286743, + "kl": 0.052417755126953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0021, + "reward": 0.44745656475424767, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.44745656475424767, + "reward_after_std": 0.5133853312581778, + "reward_before_mean": 0.7085697203874588, + "reward_before_std": 0.4728548899292946, + "reward_change_max": 0.0006938055157661438, + "reward_change_mean": -0.2611131672747433, + "reward_change_min": -0.40582090616226196, + "reward_change_std": 0.15666163619607687, + "reward_std": 0.5133853498846292, + "rewards/cosine_scaled_reward": -0.010298481676727533, + "rewards/format_reward": 0.7291666697710752, + "step": 287 + }, + { + "advantage_max": 1.648850455880165, + "advantage_mean": -2.7008355940605355e-08, + "advantage_min": -1.0658856928348541, + "advantage_std": 0.99979068338871, + "completion_length": 1750.9375534057617, + "epoch": 0.3291428571428571, + "grad_norm": 0.46777138113975525, + "kl": 0.062206268310546875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0025, + "reward": 0.3870700172847137, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3870700172847137, + "reward_after_std": 0.5809841006994247, + "reward_before_mean": 0.6324769873172045, + "reward_before_std": 0.5547001287341118, + "reward_change_max": 0.0, + "reward_change_mean": -0.24540697038173676, + "reward_change_min": -0.4253687206655741, + "reward_change_std": 0.15523213241249323, + "reward_std": 0.5809841156005859, + "rewards/cosine_scaled_reward": -0.06917817890644073, + "rewards/format_reward": 0.7708333469927311, + "step": 288 + }, + { + "advantage_max": 1.75955268740654, + "advantage_mean": -3.9736431700632124e-08, + "advantage_min": -1.0853909105062485, + "advantage_std": 0.9997709766030312, + "completion_length": 1411.5000686645508, + "epoch": 0.3302857142857143, + "grad_norm": 0.4917508363723755, + "kl": 0.028003692626953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0011, + "reward": 0.4485252061858773, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4485252061858773, + "reward_after_std": 0.5251850299537182, + "reward_before_mean": 0.7066366064827889, + "reward_before_std": 0.47138636000454426, + "reward_change_max": 0.0006059929728507996, + "reward_change_mean": -0.2581114200875163, + "reward_change_min": -0.38431514613330364, + "reward_change_std": 0.15068622399121523, + "reward_std": 0.5251850336790085, + "rewards/cosine_scaled_reward": -0.05293171480298042, + "rewards/format_reward": 0.8125000037252903, + "step": 289 + }, + { + "advantage_max": 1.7379557341337204, + "advantage_mean": -4.2840839042934675e-08, + "advantage_min": -1.0089939087629318, + "advantage_std": 0.9998431578278542, + "completion_length": 1222.3542022705078, + "epoch": 0.3314285714285714, + "grad_norm": 0.495219886302948, + "kl": 0.0377655029296875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0015, + "reward": 0.746434886008501, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.746434886008501, + "reward_after_std": 0.7607799246907234, + "reward_before_mean": 1.0512062585912645, + "reward_before_std": 0.6999847833067179, + "reward_change_max": 0.0002563074231147766, + "reward_change_mean": -0.30477137491106987, + "reward_change_min": -0.48462648317217827, + "reward_change_std": 0.18312491476535797, + "reward_std": 0.7607799656689167, + "rewards/cosine_scaled_reward": 0.07768644799944013, + "rewards/format_reward": 0.8958333395421505, + "step": 290 + }, + { + "advantage_max": 1.5778974145650864, + "advantage_mean": -1.0554989438027462e-08, + "advantage_min": -1.2307512015104294, + "advantage_std": 0.9998789802193642, + "completion_length": 2183.125045776367, + "epoch": 0.3325714285714286, + "grad_norm": 0.4300452172756195, + "kl": 0.081756591796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0033, + "reward": 0.637520014308393, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.637520014308393, + "reward_after_std": 0.9906865693628788, + "reward_before_mean": 0.9098777715116739, + "reward_before_std": 1.0078083127737045, + "reward_change_max": 0.0, + "reward_change_mean": -0.272357739508152, + "reward_change_min": -0.48343963362276554, + "reward_change_std": 0.1993736457079649, + "reward_std": 0.9906866066157818, + "rewards/cosine_scaled_reward": 0.04868887457996607, + "rewards/format_reward": 0.8125000298023224, + "step": 291 + }, + { + "advantage_max": 1.7553779780864716, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -1.0212047845125198, + "advantage_std": 0.9998128712177277, + "completion_length": 2034.020896911621, + "epoch": 0.33371428571428574, + "grad_norm": 0.37723788619041443, + "kl": 0.08984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0036, + "reward": 0.4268048144876957, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4268048144876957, + "reward_after_std": 0.5782118141651154, + "reward_before_mean": 0.6775916237384081, + "reward_before_std": 0.5195513293147087, + "reward_change_max": 0.0, + "reward_change_mean": -0.25078682228922844, + "reward_change_min": -0.37887974828481674, + "reward_change_std": 0.1440643798559904, + "reward_std": 0.5782118327915668, + "rewards/cosine_scaled_reward": -0.05703752930276096, + "rewards/format_reward": 0.7916666772216558, + "step": 292 + }, + { + "advantage_max": 1.6476120948791504, + "advantage_mean": -4.035731265839004e-08, + "advantage_min": -1.119403451681137, + "advantage_std": 0.9997932389378548, + "completion_length": 1621.3125534057617, + "epoch": 0.33485714285714285, + "grad_norm": 0.4459417760372162, + "kl": 0.08220672607421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0033, + "reward": 0.4751888904720545, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4751888904720545, + "reward_after_std": 0.5447135083377361, + "reward_before_mean": 0.7424647515872493, + "reward_before_std": 0.5116943549364805, + "reward_change_max": 7.440149784088135e-05, + "reward_change_mean": -0.2672758949920535, + "reward_change_min": -0.42255673184990883, + "reward_change_std": 0.1610535578802228, + "reward_std": 0.5447135455906391, + "rewards/cosine_scaled_reward": -0.05585093982517719, + "rewards/format_reward": 0.8541666753590107, + "step": 293 + }, + { + "advantage_max": 1.5670886784791946, + "advantage_mean": 3.4769376933141416e-08, + "advantage_min": -1.1001440435647964, + "advantage_std": 0.9998358115553856, + "completion_length": 1629.1250457763672, + "epoch": 0.336, + "grad_norm": 0.7223255038261414, + "kl": 0.04345703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0017, + "reward": 0.3658771354239434, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3658771354239434, + "reward_after_std": 0.7138045094907284, + "reward_before_mean": 0.5983532564714551, + "reward_before_std": 0.7150961086153984, + "reward_change_max": 0.0004177391529083252, + "reward_change_mean": -0.2324760644696653, + "reward_change_min": -0.43696724623441696, + "reward_change_std": 0.16605904418975115, + "reward_std": 0.713804516941309, + "rewards/cosine_scaled_reward": -0.054990069940686226, + "rewards/format_reward": 0.7083333507180214, + "step": 294 + }, + { + "advantage_max": 1.5662293583154678, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -1.1451702490448952, + "advantage_std": 0.9998404234647751, + "completion_length": 1902.7917251586914, + "epoch": 0.33714285714285713, + "grad_norm": 0.9967929124832153, + "kl": 0.07762527465820312, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0031, + "reward": 0.2228870950639248, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2228870950639248, + "reward_after_std": 0.8324792645871639, + "reward_before_mean": 0.4196047708392143, + "reward_before_std": 0.8619285393506289, + "reward_change_max": 0.0006353557109832764, + "reward_change_mean": -0.19671765249222517, + "reward_change_min": -0.4279610961675644, + "reward_change_std": 0.16788294166326523, + "reward_std": 0.8324792832136154, + "rewards/cosine_scaled_reward": -0.11311429599300027, + "rewards/format_reward": 0.6458333488553762, + "step": 295 + }, + { + "advantage_max": 1.6409805715084076, + "advantage_mean": -2.0799538841265175e-08, + "advantage_min": -1.136774018406868, + "advantage_std": 0.9998015239834785, + "completion_length": 2284.4584045410156, + "epoch": 0.3382857142857143, + "grad_norm": 0.6031951904296875, + "kl": 0.11769866943359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0047, + "reward": 0.18271854612976313, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18271854612976313, + "reward_after_std": 0.7190746963024139, + "reward_before_mean": 0.3754655672237277, + "reward_before_std": 0.7232628418132663, + "reward_change_max": 0.002197861671447754, + "reward_change_mean": -0.19274703226983547, + "reward_change_min": -0.3422145713120699, + "reward_change_std": 0.14056797232478857, + "reward_std": 0.7190746963024139, + "rewards/cosine_scaled_reward": -0.15601721964776516, + "rewards/format_reward": 0.6875000149011612, + "step": 296 + }, + { + "advantage_max": 1.6504765450954437, + "advantage_mean": -5.5879357807597785e-09, + "advantage_min": -1.0491151213645935, + "advantage_std": 0.9998462647199631, + "completion_length": 2423.666732788086, + "epoch": 0.3394285714285714, + "grad_norm": 0.9855747222900391, + "kl": 0.11363983154296875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0045, + "reward": 0.12774308491498232, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12774308491498232, + "reward_after_std": 0.6702112555503845, + "reward_before_mean": 0.309725821018219, + "reward_before_std": 0.6645154766738415, + "reward_change_max": 0.0006378963589668274, + "reward_change_mean": -0.1819827202707529, + "reward_change_min": -0.3592865318059921, + "reward_change_std": 0.13777123484760523, + "reward_std": 0.6702112555503845, + "rewards/cosine_scaled_reward": -0.09513710625469685, + "rewards/format_reward": 0.5000000074505806, + "step": 297 + }, + { + "advantage_max": 1.6699796468019485, + "advantage_mean": -9.31322596819939e-09, + "advantage_min": -1.0168022587895393, + "advantage_std": 0.9997926652431488, + "completion_length": 1597.0416793823242, + "epoch": 0.3405714285714286, + "grad_norm": 0.5280109643936157, + "kl": 0.06150054931640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0025, + "reward": 0.32745575811713934, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.32745575811713934, + "reward_after_std": 0.5900943968445063, + "reward_before_mean": 0.5592165207490325, + "reward_before_std": 0.5651387330144644, + "reward_change_max": 0.00022490322589874268, + "reward_change_mean": -0.23176079522818327, + "reward_change_min": -0.3906538709998131, + "reward_change_std": 0.1459486922249198, + "reward_std": 0.590094406157732, + "rewards/cosine_scaled_reward": -0.14747506566345692, + "rewards/format_reward": 0.8541666753590107, + "step": 298 + }, + { + "advantage_max": 1.556631863117218, + "advantage_mean": -6.208817182606907e-09, + "advantage_min": -1.1320670247077942, + "advantage_std": 0.999798409640789, + "completion_length": 1924.3125610351562, + "epoch": 0.3417142857142857, + "grad_norm": 0.8472985625267029, + "kl": 0.09732818603515625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0039, + "reward": 0.2577241810504347, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2577241810504347, + "reward_after_std": 0.6427079103887081, + "reward_before_mean": 0.4723318573087454, + "reward_before_std": 0.6371937114745378, + "reward_change_max": 0.00032889842987060547, + "reward_change_mean": -0.21460766345262527, + "reward_change_min": -0.360564760863781, + "reward_change_std": 0.14255648292601109, + "reward_std": 0.642707921564579, + "rewards/cosine_scaled_reward": -0.17008409556001425, + "rewards/format_reward": 0.8125000074505806, + "step": 299 + }, + { + "advantage_max": 1.5028170943260193, + "advantage_mean": -1.8626452158443385e-08, + "advantage_min": -1.2105746865272522, + "advantage_std": 0.9998276457190514, + "completion_length": 1971.0417022705078, + "epoch": 0.34285714285714286, + "grad_norm": 1.1485674381256104, + "kl": 0.09633636474609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0039, + "reward": 0.38906230591237545, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.38906230591237545, + "reward_after_std": 0.7436303533613682, + "reward_before_mean": 0.6281124204397202, + "reward_before_std": 0.7673552930355072, + "reward_change_max": 0.00047088414430618286, + "reward_change_mean": -0.23905012011528015, + "reward_change_min": -0.43147959001362324, + "reward_change_std": 0.17824300099164248, + "reward_std": 0.7436303719878197, + "rewards/cosine_scaled_reward": -0.029693802818655968, + "rewards/format_reward": 0.6875000111758709, + "step": 300 + }, + { + "advantage_max": 1.639797881245613, + "advantage_mean": -3.16649689802162e-08, + "advantage_min": -1.1045557409524918, + "advantage_std": 0.9997733682394028, + "completion_length": 1297.81254196167, + "epoch": 0.344, + "grad_norm": 0.7328884601593018, + "kl": 0.05051422119140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.68766384637248e-07, + "loss": 0.002, + "reward": 0.22625624496868113, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.22625624496868113, + "reward_after_std": 0.5197729840874672, + "reward_before_mean": 0.44249421264976263, + "reward_before_std": 0.4990711696445942, + "reward_change_max": 0.0, + "reward_change_mean": -0.21623797714710236, + "reward_change_min": -0.3751807101070881, + "reward_change_std": 0.1369111454114318, + "reward_std": 0.5197729952633381, + "rewards/cosine_scaled_reward": -0.2370862402021885, + "rewards/format_reward": 0.9166666865348816, + "step": 301 + }, + { + "advantage_max": 1.6990403681993484, + "advantage_mean": -5.5879356919419365e-08, + "advantage_min": -1.0498097091913223, + "advantage_std": 0.9998420625925064, + "completion_length": 1804.833351135254, + "epoch": 0.34514285714285714, + "grad_norm": 0.9516170024871826, + "kl": 0.1143646240234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0046, + "reward": 0.43568436801433563, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.43568436801433563, + "reward_after_std": 0.7954321466386318, + "reward_before_mean": 0.6727348929271102, + "reward_before_std": 0.7747104242444038, + "reward_change_max": 0.0005867332220077515, + "reward_change_mean": -0.23705054307356477, + "reward_change_min": -0.408731022849679, + "reward_change_std": 0.1665506912395358, + "reward_std": 0.795432161539793, + "rewards/cosine_scaled_reward": 0.013450777158141136, + "rewards/format_reward": 0.6458333469927311, + "step": 302 + }, + { + "advantage_max": 1.6118300408124924, + "advantage_mean": -1.614292521878724e-08, + "advantage_min": -1.0991563871502876, + "advantage_std": 0.9997985139489174, + "completion_length": 1132.7916946411133, + "epoch": 0.3462857142857143, + "grad_norm": 0.6698048114776611, + "kl": 0.0547027587890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0022, + "reward": 0.415790211642161, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.415790211642161, + "reward_after_std": 0.6736731752753258, + "reward_before_mean": 0.6621344964951277, + "reward_before_std": 0.6673592794686556, + "reward_change_max": 0.00024134665727615356, + "reward_change_mean": -0.2463442850857973, + "reward_change_min": -0.4515748545527458, + "reward_change_std": 0.16697289608418941, + "reward_std": 0.673673190176487, + "rewards/cosine_scaled_reward": -0.12726609222590923, + "rewards/format_reward": 0.9166666716337204, + "step": 303 + }, + { + "advantage_max": 1.573637142777443, + "advantage_mean": -9.934107536579972e-09, + "advantage_min": -1.1766441836953163, + "advantage_std": 0.9997837841510773, + "completion_length": 1487.0208587646484, + "epoch": 0.3474285714285714, + "grad_norm": 0.5351270437240601, + "kl": 0.10761260986328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0043, + "reward": 0.37789439666084945, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37789439666084945, + "reward_after_std": 0.6138196587562561, + "reward_before_mean": 0.6197326518595219, + "reward_before_std": 0.6024635452777147, + "reward_change_max": 0.0, + "reward_change_mean": -0.24183825589716434, + "reward_change_min": -0.39685399271547794, + "reward_change_std": 0.15235036052763462, + "reward_std": 0.613819669932127, + "rewards/cosine_scaled_reward": -0.10680035129189491, + "rewards/format_reward": 0.8333333358168602, + "step": 304 + }, + { + "advantage_max": 1.5553628653287888, + "advantage_mean": -6.208817682207268e-09, + "advantage_min": -1.1495722085237503, + "advantage_std": 0.9997849240899086, + "completion_length": 1641.2917022705078, + "epoch": 0.3485714285714286, + "grad_norm": 0.998511016368866, + "kl": 0.0987548828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0039, + "reward": 0.25423115864396095, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.25423115864396095, + "reward_after_std": 0.6495640091598034, + "reward_before_mean": 0.4696862930431962, + "reward_before_std": 0.657315599732101, + "reward_change_max": 0.00010583549737930298, + "reward_change_mean": -0.21545512787997723, + "reward_change_min": -0.38556710444390774, + "reward_change_std": 0.15349662397056818, + "reward_std": 0.6495640445500612, + "rewards/cosine_scaled_reward": -0.1714068679139018, + "rewards/format_reward": 0.8125000111758709, + "step": 305 + }, + { + "advantage_max": 1.7128386795520782, + "advantage_mean": 1.179675312990014e-08, + "advantage_min": -0.9719715788960457, + "advantage_std": 0.9997956454753876, + "completion_length": 1563.2084045410156, + "epoch": 0.3497142857142857, + "grad_norm": 1.1378474235534668, + "kl": 0.13939666748046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0056, + "reward": 0.47938904957845807, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.47938904957845807, + "reward_after_std": 0.7479777634143829, + "reward_before_mean": 0.7320452854037285, + "reward_before_std": 0.723258962854743, + "reward_change_max": 0.0, + "reward_change_mean": -0.25265623442828655, + "reward_change_min": -0.4746490456163883, + "reward_change_std": 0.18077436927706003, + "reward_std": 0.747977789491415, + "rewards/cosine_scaled_reward": -0.029810683365212753, + "rewards/format_reward": 0.791666679084301, + "step": 306 + }, + { + "advantage_max": 1.6958726346492767, + "advantage_mean": 1.4280280180578586e-08, + "advantage_min": -1.1393985226750374, + "advantage_std": 0.9998360052704811, + "completion_length": 1480.583381652832, + "epoch": 0.35085714285714287, + "grad_norm": 1.4653286933898926, + "kl": 0.10507965087890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0042, + "reward": 0.4037772142328322, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4037772142328322, + "reward_after_std": 0.7926469035446644, + "reward_before_mean": 0.6346738813444972, + "reward_before_std": 0.774210948497057, + "reward_change_max": 0.0005128830671310425, + "reward_change_mean": -0.23089664988219738, + "reward_change_min": -0.4010511841624975, + "reward_change_std": 0.15531510580331087, + "reward_std": 0.7926469184458256, + "rewards/cosine_scaled_reward": -0.08891306724399328, + "rewards/format_reward": 0.8125000111758709, + "step": 307 + }, + { + "advantage_max": 1.5332411974668503, + "advantage_mean": -3.942599025030802e-08, + "advantage_min": -1.291197545826435, + "advantage_std": 0.9998232498764992, + "completion_length": 1984.4375305175781, + "epoch": 0.352, + "grad_norm": 0.6880925893783569, + "kl": 0.13637542724609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0055, + "reward": 0.4717167126946151, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4717167126946151, + "reward_after_std": 0.6849931702017784, + "reward_before_mean": 0.7313460372388363, + "reward_before_std": 0.6948912851512432, + "reward_change_max": 3.4242868423461914e-05, + "reward_change_mean": -0.25962932175025344, + "reward_change_min": -0.4542670212686062, + "reward_change_std": 0.17313092714175582, + "reward_std": 0.6849932186305523, + "rewards/cosine_scaled_reward": -0.009326999075710773, + "rewards/format_reward": 0.7500000074505806, + "step": 308 + }, + { + "advantage_max": 1.5543510168790817, + "advantage_mean": 4.967053990334591e-09, + "advantage_min": -1.1442887485027313, + "advantage_std": 0.9998382106423378, + "completion_length": 1430.1041870117188, + "epoch": 0.35314285714285715, + "grad_norm": 1.7190380096435547, + "kl": 0.08464813232421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0034, + "reward": 0.4691953402943909, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4691953402943909, + "reward_after_std": 0.7070669531822205, + "reward_before_mean": 0.7250326937064528, + "reward_before_std": 0.7026441767811775, + "reward_change_max": 0.0, + "reward_change_mean": -0.25583732686936855, + "reward_change_min": -0.46527867391705513, + "reward_change_std": 0.16947638988494873, + "reward_std": 0.707066971808672, + "rewards/cosine_scaled_reward": -0.09581700339913368, + "rewards/format_reward": 0.916666679084301, + "step": 309 + }, + { + "advantage_max": 1.6268791556358337, + "advantage_mean": -4.128863528851667e-08, + "advantage_min": -1.1320114061236382, + "advantage_std": 0.9997777566313744, + "completion_length": 1413.3542404174805, + "epoch": 0.35428571428571426, + "grad_norm": 0.781093180179596, + "kl": 0.0948944091796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0038, + "reward": 0.34962170582730323, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.34962170582730323, + "reward_after_std": 0.5699597038328648, + "reward_before_mean": 0.587611180730164, + "reward_before_std": 0.5539778638631105, + "reward_change_max": 0.0003587454557418823, + "reward_change_mean": -0.2379895057529211, + "reward_change_min": -0.3951511085033417, + "reward_change_std": 0.14933669101446867, + "reward_std": 0.5699597336351871, + "rewards/cosine_scaled_reward": -0.13327774591743946, + "rewards/format_reward": 0.854166679084301, + "step": 310 + }, + { + "advantage_max": 1.5723091959953308, + "advantage_mean": -1.490116185998147e-08, + "advantage_min": -1.1205863133072853, + "advantage_std": 0.9998145774006844, + "completion_length": 1074.2708587646484, + "epoch": 0.3554285714285714, + "grad_norm": 0.6670510768890381, + "kl": 0.03679656982421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0015, + "reward": 0.7845532577484846, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7845532577484846, + "reward_after_std": 0.7538176812231541, + "reward_before_mean": 1.105867974460125, + "reward_before_std": 0.7525460831820965, + "reward_change_max": 0.0, + "reward_change_mean": -0.3213147222995758, + "reward_change_min": -0.5401858016848564, + "reward_change_std": 0.206632686778903, + "reward_std": 0.7538177222013474, + "rewards/cosine_scaled_reward": 0.05293398164212704, + "rewards/format_reward": 1.0, + "step": 311 + }, + { + "advantage_max": 1.810782939195633, + "advantage_mean": -5.184362417143262e-08, + "advantage_min": -0.994841955602169, + "advantage_std": 0.9998369365930557, + "completion_length": 1114.3333549499512, + "epoch": 0.3565714285714286, + "grad_norm": 0.7077937126159668, + "kl": 0.08843231201171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0035, + "reward": 0.9057276744861156, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9057276744861156, + "reward_after_std": 0.7223017476499081, + "reward_before_mean": 1.2431134916841984, + "reward_before_std": 0.6302335932850838, + "reward_change_max": 0.0, + "reward_change_mean": -0.337385768070817, + "reward_change_min": -0.482417494058609, + "reward_change_std": 0.18416978046298027, + "reward_std": 0.7223017923533916, + "rewards/cosine_scaled_reward": 0.19447338953614235, + "rewards/format_reward": 0.8541666865348816, + "step": 312 + }, + { + "advantage_max": 1.7264662384986877, + "advantage_mean": -8.071462631598081e-08, + "advantage_min": -0.8811491578817368, + "advantage_std": 0.9998324140906334, + "completion_length": 1650.3958892822266, + "epoch": 0.3577142857142857, + "grad_norm": 0.9196247458457947, + "kl": 0.14367294311523438, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0057, + "reward": 0.6459367610514164, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6459367610514164, + "reward_after_std": 0.8145084977149963, + "reward_before_mean": 0.9274978432804346, + "reward_before_std": 0.783172954339534, + "reward_change_max": 0.00030690431594848633, + "reward_change_mean": -0.2815611115656793, + "reward_change_min": -0.5300815589725971, + "reward_change_std": 0.18957517808303237, + "reward_std": 0.8145085163414478, + "rewards/cosine_scaled_reward": 0.07833224721252918, + "rewards/format_reward": 0.7708333395421505, + "step": 313 + }, + { + "advantage_max": 1.3890406340360641, + "advantage_mean": -3.011276428210863e-08, + "advantage_min": -1.3965424448251724, + "advantage_std": 0.9998088404536247, + "completion_length": 1131.375015258789, + "epoch": 0.3588571428571429, + "grad_norm": 0.7419488430023193, + "kl": 0.0739593505859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.003, + "reward": 0.8312315121293068, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8312315121293068, + "reward_after_std": 0.6450635753571987, + "reward_before_mean": 1.1716954428702593, + "reward_before_std": 0.6489796601235867, + "reward_change_max": 0.0005232319235801697, + "reward_change_mean": -0.3404639083892107, + "reward_change_min": -0.5209429115056992, + "reward_change_std": 0.2113346103578806, + "reward_std": 0.6450635828077793, + "rewards/cosine_scaled_reward": 0.11709769815206528, + "rewards/format_reward": 0.9375000074505806, + "step": 314 + }, + { + "advantage_max": 1.6860020756721497, + "advantage_mean": -5.587935891782081e-09, + "advantage_min": -0.9125220403075218, + "advantage_std": 0.9997944608330727, + "completion_length": 1653.0000305175781, + "epoch": 0.36, + "grad_norm": 1.6936687231063843, + "kl": 0.25146484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0101, + "reward": 0.4668788071721792, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4668788071721792, + "reward_after_std": 0.6091582365334034, + "reward_before_mean": 0.7244241368025541, + "reward_before_std": 0.5609946362674236, + "reward_change_max": 0.0, + "reward_change_mean": -0.2575453044846654, + "reward_change_min": -0.42978838086128235, + "reward_change_std": 0.15476837567985058, + "reward_std": 0.6091582626104355, + "rewards/cosine_scaled_reward": -0.0336212863549008, + "rewards/format_reward": 0.7916666772216558, + "step": 315 + }, + { + "advantage_max": 1.6541273891925812, + "advantage_mean": -1.676380706472358e-08, + "advantage_min": -1.09672212600708, + "advantage_std": 0.9998091906309128, + "completion_length": 1717.6250305175781, + "epoch": 0.36114285714285715, + "grad_norm": 1.417986273765564, + "kl": 0.1870574951171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0075, + "reward": 0.10566316498443484, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10566316498443484, + "reward_after_std": 0.5733887776732445, + "reward_before_mean": 0.29063704796135426, + "reward_before_std": 0.5616068728268147, + "reward_change_max": 0.0, + "reward_change_mean": -0.1849738946184516, + "reward_change_min": -0.3238295055925846, + "reward_change_std": 0.12428497988730669, + "reward_std": 0.5733887813985348, + "rewards/cosine_scaled_reward": -0.2296814899891615, + "rewards/format_reward": 0.7500000204890966, + "step": 316 + }, + { + "advantage_max": 1.5123141556978226, + "advantage_mean": -2.9802322498717615e-08, + "advantage_min": -1.3178307265043259, + "advantage_std": 0.9998439252376556, + "completion_length": 1461.2292251586914, + "epoch": 0.36228571428571427, + "grad_norm": 1.5687153339385986, + "kl": 0.1681976318359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0067, + "reward": 0.4049488212913275, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4049488212913275, + "reward_after_std": 0.7206962667405605, + "reward_before_mean": 0.6449471823871136, + "reward_before_std": 0.7243769988417625, + "reward_change_max": 0.0008472129702568054, + "reward_change_mean": -0.23999838065356016, + "reward_change_min": -0.43049536645412445, + "reward_change_std": 0.1642217980697751, + "reward_std": 0.7206962741911411, + "rewards/cosine_scaled_reward": -0.052526420913636684, + "rewards/format_reward": 0.7500000260770321, + "step": 317 + }, + { + "advantage_max": 1.5850563496351242, + "advantage_mean": -2.4835269507583746e-08, + "advantage_min": -1.1503583490848541, + "advantage_std": 0.999801941215992, + "completion_length": 1027.145851135254, + "epoch": 0.36342857142857143, + "grad_norm": 2.1071932315826416, + "kl": 0.11089324951171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0044, + "reward": 0.49303111620247364, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.49303111620247364, + "reward_after_std": 0.6135988608002663, + "reward_before_mean": 0.7602047026157379, + "reward_before_std": 0.6004465334117413, + "reward_change_max": 0.0, + "reward_change_mean": -0.26717356964945793, + "reward_change_min": -0.44262557849287987, + "reward_change_std": 0.16659097839146852, + "reward_std": 0.613598894327879, + "rewards/cosine_scaled_reward": -0.09906433057039976, + "rewards/format_reward": 0.9583333432674408, + "step": 318 + }, + { + "advantage_max": 1.7661184072494507, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.9878272712230682, + "advantage_std": 0.9998244121670723, + "completion_length": 1393.3750305175781, + "epoch": 0.36457142857142855, + "grad_norm": 1.081502914428711, + "kl": 0.16871261596679688, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0067, + "reward": 0.17947366731823422, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17947366731823422, + "reward_after_std": 0.6435183212161064, + "reward_before_mean": 0.3727762456983328, + "reward_before_std": 0.6059022732079029, + "reward_change_max": 0.0006429404020309448, + "reward_change_mean": -0.1933025810867548, + "reward_change_min": -0.3257717378437519, + "reward_change_std": 0.12049250770360231, + "reward_std": 0.643518328666687, + "rewards/cosine_scaled_reward": -0.23027855902910233, + "rewards/format_reward": 0.8333333469927311, + "step": 319 + }, + { + "advantage_max": 1.7249931246042252, + "advantage_mean": 1.5211601978037947e-08, + "advantage_min": -1.0006217509508133, + "advantage_std": 0.9998496472835541, + "completion_length": 947.458366394043, + "epoch": 0.3657142857142857, + "grad_norm": 1.0906606912612915, + "kl": 0.09429550170898438, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0038, + "reward": 0.6802430953830481, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6802430953830481, + "reward_after_std": 0.8332295939326286, + "reward_before_mean": 0.9649859443306923, + "reward_before_std": 0.7885765507817268, + "reward_change_max": 0.0, + "reward_change_mean": -0.2847428247332573, + "reward_change_min": -0.46728481724858284, + "reward_change_std": 0.16841666772961617, + "reward_std": 0.8332296200096607, + "rewards/cosine_scaled_reward": 0.0033262865617871284, + "rewards/format_reward": 0.9583333432674408, + "step": 320 + }, + { + "advantage_max": 1.7146756947040558, + "advantage_mean": -5.091230237397326e-08, + "advantage_min": -1.0093270689249039, + "advantage_std": 0.999833919107914, + "completion_length": 1098.9166946411133, + "epoch": 0.3668571428571429, + "grad_norm": 1.029427170753479, + "kl": 0.12979888916015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0052, + "reward": 0.8162387441843748, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8162387441843748, + "reward_after_std": 0.72203304246068, + "reward_before_mean": 1.1376890763640404, + "reward_before_std": 0.6585705541074276, + "reward_change_max": 0.0, + "reward_change_mean": -0.3214503303170204, + "reward_change_min": -0.48446333035826683, + "reward_change_std": 0.1824331246316433, + "reward_std": 0.7220330536365509, + "rewards/cosine_scaled_reward": 0.08967785281129181, + "rewards/format_reward": 0.9583333432674408, + "step": 321 + }, + { + "advantage_max": 1.614561453461647, + "advantage_mean": -1.179675312990014e-08, + "advantage_min": -1.0946208611130714, + "advantage_std": 0.9998006448149681, + "completion_length": 1206.5625267028809, + "epoch": 0.368, + "grad_norm": 2.3137221336364746, + "kl": 0.16611480712890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0066, + "reward": 0.2254452295601368, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2254452295601368, + "reward_after_std": 0.6338132582604885, + "reward_before_mean": 0.4344533123075962, + "reward_before_std": 0.6367870792746544, + "reward_change_max": 0.0006910189986228943, + "reward_change_mean": -0.2090080864727497, + "reward_change_min": -0.37567378394305706, + "reward_change_std": 0.1448584054596722, + "reward_std": 0.6338132806122303, + "rewards/cosine_scaled_reward": -0.14735668897628784, + "rewards/format_reward": 0.7291666809469461, + "step": 322 + }, + { + "advantage_max": 1.5460123121738434, + "advantage_mean": -1.3659398556686853e-08, + "advantage_min": -1.2054516822099686, + "advantage_std": 0.9997941702604294, + "completion_length": 1176.0208778381348, + "epoch": 0.36914285714285716, + "grad_norm": 1.450031042098999, + "kl": 0.116943359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0047, + "reward": 0.5555176772177219, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5555176772177219, + "reward_after_std": 0.6632481273263693, + "reward_before_mean": 0.8349560154601932, + "reward_before_std": 0.6657208111137152, + "reward_change_max": 0.0, + "reward_change_mean": -0.2794383354485035, + "reward_change_min": -0.4561638943850994, + "reward_change_std": 0.17854246776551008, + "reward_std": 0.6632481273263693, + "rewards/cosine_scaled_reward": -0.040855332277715206, + "rewards/format_reward": 0.916666679084301, + "step": 323 + }, + { + "advantage_max": 1.6410552561283112, + "advantage_mean": -2.2351742678949904e-08, + "advantage_min": -1.1239653453230858, + "advantage_std": 0.9997935220599174, + "completion_length": 1479.3125534057617, + "epoch": 0.3702857142857143, + "grad_norm": 1.3957271575927734, + "kl": 0.29621124267578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0118, + "reward": 0.36106334580108523, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36106334580108523, + "reward_after_std": 0.5908253006637096, + "reward_before_mean": 0.5981873874552548, + "reward_before_std": 0.5554264299571514, + "reward_change_max": 0.0005471184849739075, + "reward_change_mean": -0.2371240258216858, + "reward_change_min": -0.3761534318327904, + "reward_change_std": 0.14011078514158726, + "reward_std": 0.5908253267407417, + "rewards/cosine_scaled_reward": -0.16965633165091276, + "rewards/format_reward": 0.9375000074505806, + "step": 324 + }, + { + "advantage_max": 1.6871405392885208, + "advantage_mean": -7.605801100041276e-09, + "advantage_min": -1.0215219408273697, + "advantage_std": 0.9998608082532883, + "completion_length": 1574.1667137145996, + "epoch": 0.37142857142857144, + "grad_norm": 1.4152541160583496, + "kl": 0.2861785888671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0114, + "reward": 0.6319072768092155, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6319072768092155, + "reward_after_std": 0.8270172514021397, + "reward_before_mean": 0.9088499108329415, + "reward_before_std": 0.797621414065361, + "reward_change_max": 0.0, + "reward_change_mean": -0.2769426228478551, + "reward_change_min": -0.46800280176103115, + "reward_change_std": 0.17760974913835526, + "reward_std": 0.8270172588527203, + "rewards/cosine_scaled_reward": 0.04817493752489099, + "rewards/format_reward": 0.812500013038516, + "step": 325 + }, + { + "advantage_max": 1.5468260794878006, + "advantage_mean": -1.2417638028949796e-09, + "advantage_min": -1.231099657714367, + "advantage_std": 0.9998420029878616, + "completion_length": 1152.2500343322754, + "epoch": 0.37257142857142855, + "grad_norm": 1.4574198722839355, + "kl": 0.19783782958984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0079, + "reward": 0.6057899557054043, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6057899557054043, + "reward_after_std": 0.7603934742510319, + "reward_before_mean": 0.8865679651498795, + "reward_before_std": 0.7577711865305901, + "reward_change_max": 0.0, + "reward_change_mean": -0.2807780094444752, + "reward_change_min": -0.47002286091446877, + "reward_change_std": 0.18309276923537254, + "reward_std": 0.7603935040533543, + "rewards/cosine_scaled_reward": 0.005783975124359131, + "rewards/format_reward": 0.8750000149011612, + "step": 326 + }, + { + "advantage_max": 1.6103992611169815, + "advantage_mean": -8.257726913374341e-08, + "advantage_min": -1.2234643921256065, + "advantage_std": 0.9998258948326111, + "completion_length": 1431.8958892822266, + "epoch": 0.3737142857142857, + "grad_norm": 0.8417478799819946, + "kl": 0.12567138671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.902018669163384e-07, + "loss": 0.005, + "reward": 0.8357805621344596, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8357805621344596, + "reward_after_std": 0.6931793317198753, + "reward_before_mean": 1.1680885925889015, + "reward_before_std": 0.6554881166666746, + "reward_change_max": 0.0007588863372802734, + "reward_change_mean": -0.3323080986738205, + "reward_change_min": -0.52970290184021, + "reward_change_std": 0.20208781119436026, + "reward_std": 0.6931793540716171, + "rewards/cosine_scaled_reward": 0.14654429350048304, + "rewards/format_reward": 0.8750000149011612, + "step": 327 + }, + { + "advantage_max": 1.691834032535553, + "advantage_mean": 1.2417635808503746e-09, + "advantage_min": -1.0485369563102722, + "advantage_std": 0.9998220056295395, + "completion_length": 1368.833381652832, + "epoch": 0.37485714285714283, + "grad_norm": 1.4414052963256836, + "kl": 0.21993255615234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0088, + "reward": 0.17786777764558792, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17786777764558792, + "reward_after_std": 0.6675361059606075, + "reward_before_mean": 0.3704228848218918, + "reward_before_std": 0.6458631716668606, + "reward_change_max": 0.0, + "reward_change_mean": -0.19255511928349733, + "reward_change_min": -0.3381800428032875, + "reward_change_std": 0.12796216271817684, + "reward_std": 0.6675361357629299, + "rewards/cosine_scaled_reward": -0.24187190178781748, + "rewards/format_reward": 0.8541666828095913, + "step": 328 + }, + { + "advantage_max": 1.7343352884054184, + "advantage_mean": -2.421438682898014e-08, + "advantage_min": -0.9665001779794693, + "advantage_std": 0.9998097270727158, + "completion_length": 1006.5833587646484, + "epoch": 0.376, + "grad_norm": 0.9054838418960571, + "kl": 0.19991302490234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.843439512918949e-07, + "loss": 0.008, + "reward": 0.7864454248920083, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7864454248920083, + "reward_after_std": 0.6544234752655029, + "reward_before_mean": 1.1081649232655764, + "reward_before_std": 0.5919336788356304, + "reward_change_max": 0.0, + "reward_change_mean": -0.32171949837356806, + "reward_change_min": -0.48736608400940895, + "reward_change_std": 0.18318084720522165, + "reward_std": 0.6544234827160835, + "rewards/cosine_scaled_reward": 0.09574911929666996, + "rewards/format_reward": 0.9166666679084301, + "step": 329 + }, + { + "advantage_max": 1.7217664271593094, + "advantage_mean": -4.594524871670558e-08, + "advantage_min": -1.0755222663283348, + "advantage_std": 0.9998010918498039, + "completion_length": 1190.7500267028809, + "epoch": 0.37714285714285717, + "grad_norm": 0.9707238078117371, + "kl": 0.2688751220703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0108, + "reward": 0.5854407958686352, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5854407958686352, + "reward_after_std": 0.737566763535142, + "reward_before_mean": 0.8584175645373762, + "reward_before_std": 0.6991724539548159, + "reward_change_max": 0.00036550313234329224, + "reward_change_mean": -0.2729767709970474, + "reward_change_min": -0.45406655967235565, + "reward_change_std": 0.1680940967053175, + "reward_std": 0.7375667933374643, + "rewards/cosine_scaled_reward": -0.018707887269556522, + "rewards/format_reward": 0.8958333432674408, + "step": 330 + }, + { + "advantage_max": 1.7191912680864334, + "advantage_mean": -3.1044086745701804e-08, + "advantage_min": -0.953365832567215, + "advantage_std": 0.9997701942920685, + "completion_length": 1443.9583587646484, + "epoch": 0.3782857142857143, + "grad_norm": 3.559812545776367, + "kl": 0.39825439453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0159, + "reward": 0.3996207695454359, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3996207695454359, + "reward_after_std": 0.5887615773826838, + "reward_before_mean": 0.6428104415535927, + "reward_before_std": 0.5397970164194703, + "reward_change_max": 0.00024040043354034424, + "reward_change_mean": -0.24318967200815678, + "reward_change_min": -0.3880366124212742, + "reward_change_std": 0.1508668838068843, + "reward_std": 0.5887615997344255, + "rewards/cosine_scaled_reward": -0.06401145167183131, + "rewards/format_reward": 0.7708333395421505, + "step": 331 + }, + { + "advantage_max": 1.5935689955949783, + "advantage_mean": -4.1599075351062e-08, + "advantage_min": -1.270665518939495, + "advantage_std": 0.9997954964637756, + "completion_length": 1409.208381652832, + "epoch": 0.37942857142857145, + "grad_norm": 1.2800945043563843, + "kl": 0.22563934326171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.009, + "reward": 0.4048396535217762, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4048396535217762, + "reward_after_std": 0.5992461815476418, + "reward_before_mean": 0.6520956940948963, + "reward_before_std": 0.5757904518395662, + "reward_change_max": 0.0027850568294525146, + "reward_change_mean": -0.24725607503205538, + "reward_change_min": -0.390565924346447, + "reward_change_std": 0.15339871495962143, + "reward_std": 0.5992461927235126, + "rewards/cosine_scaled_reward": -0.10103549575433135, + "rewards/format_reward": 0.8541666865348816, + "step": 332 + }, + { + "advantage_max": 1.7609997540712357, + "advantage_mean": -1.490116141589226e-08, + "advantage_min": -0.9781611263751984, + "advantage_std": 0.9997869431972504, + "completion_length": 1225.0416946411133, + "epoch": 0.38057142857142856, + "grad_norm": 1.0518617630004883, + "kl": 0.15213775634765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0061, + "reward": 0.26022319309413433, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26022319309413433, + "reward_after_std": 0.560534905642271, + "reward_before_mean": 0.47590574622154236, + "reward_before_std": 0.5113978497684002, + "reward_change_max": 0.0, + "reward_change_mean": -0.21568256057798862, + "reward_change_min": -0.33122558146715164, + "reward_change_std": 0.12318441737443209, + "reward_std": 0.5605349130928516, + "rewards/cosine_scaled_reward": -0.23079713946208358, + "rewards/format_reward": 0.9375000074505806, + "step": 333 + }, + { + "advantage_max": 1.7300200462341309, + "advantage_mean": -1.1175870895385742e-08, + "advantage_min": -0.892622597515583, + "advantage_std": 0.9998339638113976, + "completion_length": 1481.0000610351562, + "epoch": 0.38171428571428573, + "grad_norm": 1.437591791152954, + "kl": 0.3504638671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.014, + "reward": 0.2755582988029346, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2755582988029346, + "reward_after_std": 0.7258717641234398, + "reward_before_mean": 0.48202365916222334, + "reward_before_std": 0.6885374151170254, + "reward_change_max": 0.0, + "reward_change_mean": -0.2064653616398573, + "reward_change_min": -0.34851996414363384, + "reward_change_std": 0.12769698351621628, + "reward_std": 0.7258717827498913, + "rewards/cosine_scaled_reward": -0.17565484810620546, + "rewards/format_reward": 0.8333333395421505, + "step": 334 + }, + { + "advantage_max": 1.738677054643631, + "advantage_mean": -2.6697913879658586e-08, + "advantage_min": -1.0176256000995636, + "advantage_std": 0.9998103380203247, + "completion_length": 1122.1667098999023, + "epoch": 0.38285714285714284, + "grad_norm": 0.8629187345504761, + "kl": 0.16303253173828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0065, + "reward": 0.8083305526524782, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8083305526524782, + "reward_after_std": 0.732852453365922, + "reward_before_mean": 1.1283803507685661, + "reward_before_std": 0.6743199601769447, + "reward_change_max": 0.0, + "reward_change_mean": -0.3200498167425394, + "reward_change_min": -0.4992302544414997, + "reward_change_std": 0.1873677847906947, + "reward_std": 0.7328524719923735, + "rewards/cosine_scaled_reward": 0.10585683188401163, + "rewards/format_reward": 0.9166666716337204, + "step": 335 + }, + { + "advantage_max": 1.71261827647686, + "advantage_mean": -3.6942462977584967e-08, + "advantage_min": -1.093318596482277, + "advantage_std": 0.9997923597693443, + "completion_length": 1373.6250610351562, + "epoch": 0.384, + "grad_norm": 1.218349575996399, + "kl": 0.28626251220703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0115, + "reward": 0.5922711892053485, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5922711892053485, + "reward_after_std": 0.5163947194814682, + "reward_before_mean": 0.8827753812074661, + "reward_before_std": 0.45310843363404274, + "reward_change_max": 0.0, + "reward_change_mean": -0.29050417989492416, + "reward_change_min": -0.4375855065882206, + "reward_change_std": 0.16001543402671814, + "reward_std": 0.5163947381079197, + "rewards/cosine_scaled_reward": -0.016945652663707733, + "rewards/format_reward": 0.9166666865348816, + "step": 336 + }, + { + "advantage_max": 1.6926042586565018, + "advantage_mean": -4.346172066682641e-08, + "advantage_min": -1.014223888516426, + "advantage_std": 0.9998311400413513, + "completion_length": 1264.2917022705078, + "epoch": 0.3851428571428571, + "grad_norm": 0.7854025363922119, + "kl": 0.1107635498046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0044, + "reward": 0.5707871560007334, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5707871560007334, + "reward_after_std": 0.7214007787406445, + "reward_before_mean": 0.8423281982541084, + "reward_before_std": 0.6791776567697525, + "reward_change_max": 0.0, + "reward_change_mean": -0.2715410515666008, + "reward_change_min": -0.43035536259412766, + "reward_change_std": 0.15988810174167156, + "reward_std": 0.7214007899165154, + "rewards/cosine_scaled_reward": -0.06841926136985421, + "rewards/format_reward": 0.9791666716337204, + "step": 337 + }, + { + "advantage_max": 1.6522819548845291, + "advantage_mean": -1.9868215406226852e-08, + "advantage_min": -1.112328127026558, + "advantage_std": 0.999812588095665, + "completion_length": 1322.7916870117188, + "epoch": 0.3862857142857143, + "grad_norm": 1.565803050994873, + "kl": 0.5242538452148438, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.021, + "reward": 0.5619577057659626, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5619577057659626, + "reward_after_std": 0.6290275603532791, + "reward_before_mean": 0.8397875525988638, + "reward_before_std": 0.5936907455325127, + "reward_change_max": 0.000512540340423584, + "reward_change_mean": -0.2778298445045948, + "reward_change_min": -0.47006095573306084, + "reward_change_std": 0.17123521026223898, + "reward_std": 0.6290275789797306, + "rewards/cosine_scaled_reward": -0.01760623953305185, + "rewards/format_reward": 0.8750000055879354, + "step": 338 + }, + { + "advantage_max": 1.6528789550065994, + "advantage_mean": 8.6923440667519e-09, + "advantage_min": -1.210326187312603, + "advantage_std": 0.9997606724500656, + "completion_length": 1296.5417022705078, + "epoch": 0.38742857142857146, + "grad_norm": 2.363879919052124, + "kl": 0.2260894775390625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.555614130391079e-07, + "loss": 0.009, + "reward": 0.23270575946662575, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23270575946662575, + "reward_after_std": 0.5161370243877172, + "reward_before_mean": 0.4495003125630319, + "reward_before_std": 0.49957178719341755, + "reward_change_max": 0.0010571181774139404, + "reward_change_mean": -0.21679451875388622, + "reward_change_min": -0.3489368353039026, + "reward_change_std": 0.13622549921274185, + "reward_std": 0.516137033700943, + "rewards/cosine_scaled_reward": -0.1710832081735134, + "rewards/format_reward": 0.7916666865348816, + "step": 339 + }, + { + "advantage_max": 1.7887818366289139, + "advantage_mean": -5.3395829757718616e-08, + "advantage_min": -0.9258808940649033, + "advantage_std": 0.9998010918498039, + "completion_length": 1079.9583587646484, + "epoch": 0.38857142857142857, + "grad_norm": 1.0839471817016602, + "kl": 0.2021484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0081, + "reward": 0.5940607134252787, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5940607134252787, + "reward_after_std": 0.594856072217226, + "reward_before_mean": 0.875343382358551, + "reward_before_std": 0.5153414569795132, + "reward_change_max": 0.0, + "reward_change_mean": -0.2812826782464981, + "reward_change_min": -0.41136982291936874, + "reward_change_std": 0.14915307890623808, + "reward_std": 0.5948560945689678, + "rewards/cosine_scaled_reward": -0.041494992794469, + "rewards/format_reward": 0.9583333432674408, + "step": 340 + }, + { + "advantage_max": 1.7246081233024597, + "advantage_mean": -1.2759119849548028e-07, + "advantage_min": -1.0545568354427814, + "advantage_std": 0.9998086541891098, + "completion_length": 1165.0625381469727, + "epoch": 0.38971428571428574, + "grad_norm": 1.0827921628952026, + "kl": 0.13689804077148438, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0055, + "reward": 0.9295502845197916, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9295502845197916, + "reward_after_std": 0.6908070221543312, + "reward_before_mean": 1.2785332389175892, + "reward_before_std": 0.616323871538043, + "reward_change_max": 0.0, + "reward_change_mean": -0.34898300282657146, + "reward_change_min": -0.5290452390909195, + "reward_change_std": 0.19935437012463808, + "reward_std": 0.6908070258796215, + "rewards/cosine_scaled_reward": 0.16009995341300964, + "rewards/format_reward": 0.9583333432674408, + "step": 341 + }, + { + "advantage_max": 1.455557405948639, + "advantage_mean": -3.725290431688677e-08, + "advantage_min": -1.3320115879178047, + "advantage_std": 0.9998503029346466, + "completion_length": 1276.145881652832, + "epoch": 0.39085714285714285, + "grad_norm": 2.228071451187134, + "kl": 0.3672637939453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0147, + "reward": 0.7400090312585235, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7400090312585235, + "reward_after_std": 0.8655583336949348, + "reward_before_mean": 1.0439782813191414, + "reward_before_std": 0.8965267464518547, + "reward_change_max": 0.0, + "reward_change_mean": -0.3039692733436823, + "reward_change_min": -0.5514669045805931, + "reward_change_std": 0.2145256232470274, + "reward_std": 0.8655583411455154, + "rewards/cosine_scaled_reward": 0.08448914252221584, + "rewards/format_reward": 0.8750000223517418, + "step": 342 + }, + { + "advantage_max": 1.601967141032219, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -1.1046533659100533, + "advantage_std": 0.9997711554169655, + "completion_length": 1441.2500457763672, + "epoch": 0.392, + "grad_norm": 3.3537521362304688, + "kl": 0.3649749755859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0146, + "reward": 0.6907957500079647, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6907957500079647, + "reward_after_std": 0.7117171566933393, + "reward_before_mean": 0.9908101595938206, + "reward_before_std": 0.7002871641889215, + "reward_change_max": 0.0, + "reward_change_mean": -0.3000143878161907, + "reward_change_min": -0.5172241926193237, + "reward_change_std": 0.1907307654619217, + "reward_std": 0.7117171976715326, + "rewards/cosine_scaled_reward": 0.047488420736044645, + "rewards/format_reward": 0.8958333507180214, + "step": 343 + }, + { + "advantage_max": 1.589540719985962, + "advantage_mean": -5.836288163862946e-08, + "advantage_min": -1.3002796024084091, + "advantage_std": 0.9998637288808823, + "completion_length": 1252.43754196167, + "epoch": 0.3931428571428571, + "grad_norm": 2.5770859718322754, + "kl": 0.473236083984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0189, + "reward": 0.9171247731428593, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9171247731428593, + "reward_after_std": 0.8720968402922153, + "reward_before_mean": 1.2546650283038616, + "reward_before_std": 0.8525521457195282, + "reward_change_max": 0.002484917640686035, + "reward_change_mean": -0.3375402484089136, + "reward_change_min": -0.531851053237915, + "reward_change_std": 0.220105716958642, + "reward_std": 0.872096873819828, + "rewards/cosine_scaled_reward": 0.20024916948750615, + "rewards/format_reward": 0.8541666865348816, + "step": 344 + }, + { + "advantage_max": 1.6149013713002205, + "advantage_mean": -9.158006020193454e-09, + "advantage_min": -1.1341613978147507, + "advantage_std": 0.9997781962156296, + "completion_length": 1171.8542022705078, + "epoch": 0.3942857142857143, + "grad_norm": 2.487800121307373, + "kl": 0.27524566650390625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.387377967463493e-07, + "loss": 0.011, + "reward": 0.6368197742849588, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6368197742849588, + "reward_after_std": 0.5455946698784828, + "reward_before_mean": 0.9370312727987766, + "reward_before_std": 0.5042991433292627, + "reward_change_max": 0.0010842680931091309, + "reward_change_mean": -0.3002114836126566, + "reward_change_min": -0.45550261810421944, + "reward_change_std": 0.17128814291208982, + "reward_std": 0.5455946773290634, + "rewards/cosine_scaled_reward": -0.021067719906568527, + "rewards/format_reward": 0.9791666716337204, + "step": 345 + }, + { + "advantage_max": 1.6020869314670563, + "advantage_mean": -3.849466778671484e-08, + "advantage_min": -1.1575617864727974, + "advantage_std": 0.9998195618391037, + "completion_length": 1161.1458587646484, + "epoch": 0.3954285714285714, + "grad_norm": 2.9014134407043457, + "kl": 0.1278076171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0051, + "reward": 0.6770103015005589, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6770103015005589, + "reward_after_std": 0.6890826895833015, + "reward_before_mean": 0.9757176786661148, + "reward_before_std": 0.6647419556975365, + "reward_change_max": 0.0, + "reward_change_mean": -0.29870735108852386, + "reward_change_min": -0.487264234572649, + "reward_change_std": 0.1784799639135599, + "reward_std": 0.6890827268362045, + "rewards/cosine_scaled_reward": -0.012141183018684387, + "rewards/format_reward": 1.0, + "step": 346 + }, + { + "advantage_max": 1.712681457400322, + "advantage_mean": -2.110997865401032e-08, + "advantage_min": -1.050035186111927, + "advantage_std": 0.9997965469956398, + "completion_length": 1166.6458740234375, + "epoch": 0.3965714285714286, + "grad_norm": 2.8097753524780273, + "kl": 0.1442718505859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0058, + "reward": 0.36580729484558105, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.36580729484558105, + "reward_after_std": 0.5895061790943146, + "reward_before_mean": 0.6037146374583244, + "reward_before_std": 0.5479989349842072, + "reward_change_max": 0.0, + "reward_change_mean": -0.23790733329951763, + "reward_change_min": -0.38576383143663406, + "reward_change_std": 0.14080576319247484, + "reward_std": 0.5895062014460564, + "rewards/cosine_scaled_reward": -0.18772602826356888, + "rewards/format_reward": 0.9791666716337204, + "step": 347 + }, + { + "advantage_max": 1.4286227524280548, + "advantage_mean": -2.002343674201157e-08, + "advantage_min": -1.3194576650857925, + "advantage_std": 0.9998432993888855, + "completion_length": 952.458366394043, + "epoch": 0.3977142857142857, + "grad_norm": 1.2307054996490479, + "kl": 0.1024932861328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0041, + "reward": 0.5614768331870437, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5614768331870437, + "reward_after_std": 0.7371022440493107, + "reward_before_mean": 0.8400985077023506, + "reward_before_std": 0.7679982222616673, + "reward_change_max": 0.0005593299865722656, + "reward_change_mean": -0.2786216828972101, + "reward_change_min": -0.4986792951822281, + "reward_change_std": 0.19410818628966808, + "reward_std": 0.7371022514998913, + "rewards/cosine_scaled_reward": -0.027867418713867664, + "rewards/format_reward": 0.8958333507180214, + "step": 348 + }, + { + "advantage_max": 1.4817928969860077, + "advantage_mean": -2.421438782818086e-08, + "advantage_min": -1.2049047872424126, + "advantage_std": 0.9998388364911079, + "completion_length": 980.7500305175781, + "epoch": 0.39885714285714285, + "grad_norm": 2.26857590675354, + "kl": 0.26554107666015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0106, + "reward": 0.5565350241959095, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.5565350241959095, + "reward_after_std": 0.7860203348100185, + "reward_before_mean": 0.8280746899545193, + "reward_before_std": 0.8170614093542099, + "reward_change_max": 0.0019630417227745056, + "reward_change_mean": -0.27153966948390007, + "reward_change_min": -0.49386022612452507, + "reward_change_std": 0.20041564013808966, + "reward_std": 0.7860203571617603, + "rewards/cosine_scaled_reward": -0.013046002015471458, + "rewards/format_reward": 0.8541666716337204, + "step": 349 + }, + { + "advantage_max": 1.8311565965414047, + "advantage_mean": -2.2972624191819335e-08, + "advantage_min": -0.8188979849219322, + "advantage_std": 0.9998144879937172, + "completion_length": 1051.6250343322754, + "epoch": 0.4, + "grad_norm": 1.1339340209960938, + "kl": 0.22963714599609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0092, + "reward": 0.36821131221950054, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.36821131221950054, + "reward_after_std": 0.7234288677573204, + "reward_before_mean": 0.5922360196709633, + "reward_before_std": 0.6629289668053389, + "reward_change_max": 0.0, + "reward_change_mean": -0.2240247093141079, + "reward_change_min": -0.35875629261136055, + "reward_change_std": 0.13018367905169725, + "reward_std": 0.7234288677573204, + "rewards/cosine_scaled_reward": -0.18304866866674274, + "rewards/format_reward": 0.9583333432674408, + "step": 350 + }, + { + "advantage_max": 1.7308290600776672, + "advantage_mean": -1.8626451797620902e-08, + "advantage_min": -1.074808619916439, + "advantage_std": 0.9998176246881485, + "completion_length": 998.5416870117188, + "epoch": 0.40114285714285713, + "grad_norm": 1.4578030109405518, + "kl": 0.062774658203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0025, + "reward": 0.5936012240126729, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5936012240126729, + "reward_after_std": 0.6550269387662411, + "reward_before_mean": 0.87417808547616, + "reward_before_std": 0.6078615933656693, + "reward_change_max": 0.0, + "reward_change_mean": -0.280576853081584, + "reward_change_min": -0.4290633983910084, + "reward_change_std": 0.16038669738918543, + "reward_std": 0.6550269685685635, + "rewards/cosine_scaled_reward": -0.042077645659446716, + "rewards/format_reward": 0.9583333358168602, + "step": 351 + }, + { + "advantage_max": 1.8318933993577957, + "advantage_mean": -5.215406784220278e-08, + "advantage_min": -1.0214067623019218, + "advantage_std": 0.9997903853654861, + "completion_length": 1247.5000457763672, + "epoch": 0.4022857142857143, + "grad_norm": 1.924621343612671, + "kl": 0.40142059326171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0161, + "reward": 0.5140209225937724, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5140209225937724, + "reward_after_std": 0.6413620505481958, + "reward_before_mean": 0.7743731364607811, + "reward_before_std": 0.5711536388844252, + "reward_change_max": 0.00038998574018478394, + "reward_change_mean": -0.26035225205123425, + "reward_change_min": -0.37987200915813446, + "reward_change_std": 0.14672334119677544, + "reward_std": 0.6413620561361313, + "rewards/cosine_scaled_reward": -0.06073010340332985, + "rewards/format_reward": 0.8958333507180214, + "step": 352 + }, + { + "advantage_max": 1.6363515406847, + "advantage_mean": -9.9341087578253e-09, + "advantage_min": -1.1301787421107292, + "advantage_std": 0.999807745218277, + "completion_length": 1015.1041831970215, + "epoch": 0.4034285714285714, + "grad_norm": 1.1657310724258423, + "kl": 0.24164581298828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0097, + "reward": 0.8914534251671284, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8914534251671284, + "reward_after_std": 0.7140428274869919, + "reward_before_mean": 1.233622845262289, + "reward_before_std": 0.6822979040443897, + "reward_change_max": 0.0, + "reward_change_mean": -0.34216937981545925, + "reward_change_min": -0.5395658630877733, + "reward_change_std": 0.20784274209290743, + "reward_std": 0.7140428423881531, + "rewards/cosine_scaled_reward": 0.14806138863787055, + "rewards/format_reward": 0.9375000074505806, + "step": 353 + }, + { + "advantage_max": 1.599271759390831, + "advantage_mean": -2.7939677682553565e-08, + "advantage_min": -1.200988955795765, + "advantage_std": 0.999802254140377, + "completion_length": 941.0416946411133, + "epoch": 0.4045714285714286, + "grad_norm": 0.8728342056274414, + "kl": 0.1999359130859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.142063423134644e-07, + "loss": 0.008, + "reward": 0.8483670018613338, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8483670018613338, + "reward_after_std": 0.6414902880787849, + "reward_before_mean": 1.1879227459430695, + "reward_before_std": 0.6066556219011545, + "reward_change_max": 0.0, + "reward_change_mean": -0.3395557254552841, + "reward_change_min": -0.5009829849004745, + "reward_change_std": 0.19546702224761248, + "reward_std": 0.6414902955293655, + "rewards/cosine_scaled_reward": 0.11479469854384661, + "rewards/format_reward": 0.9583333432674408, + "step": 354 + }, + { + "advantage_max": 1.5588997304439545, + "advantage_mean": -6.084640968850863e-08, + "advantage_min": -1.1517152562737465, + "advantage_std": 0.9998615756630898, + "completion_length": 1016.4167022705078, + "epoch": 0.4057142857142857, + "grad_norm": 1.469150185585022, + "kl": 0.19208145141601562, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0077, + "reward": 0.7298020347952843, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7298020347952843, + "reward_after_std": 0.8792191408574581, + "reward_before_mean": 1.028229609131813, + "reward_before_std": 0.8823234438896179, + "reward_change_max": 0.000792287290096283, + "reward_change_mean": -0.2984275911003351, + "reward_change_min": -0.5347305983304977, + "reward_change_std": 0.2032134924083948, + "reward_std": 0.8792191408574581, + "rewards/cosine_scaled_reward": 0.045364788733422756, + "rewards/format_reward": 0.9375000149011612, + "step": 355 + }, + { + "advantage_max": 1.5244620889425278, + "advantage_mean": -5.2774948189338033e-08, + "advantage_min": -1.2598972916603088, + "advantage_std": 0.9998257234692574, + "completion_length": 1121.3125381469727, + "epoch": 0.40685714285714286, + "grad_norm": 1.8578813076019287, + "kl": 0.23694610595703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0095, + "reward": 0.5678749307990074, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5678749307990074, + "reward_after_std": 0.7130421288311481, + "reward_before_mean": 0.8438384272158146, + "reward_before_std": 0.7152368873357773, + "reward_change_max": 0.0011678412556648254, + "reward_change_mean": -0.2759635243564844, + "reward_change_min": -0.45209793746471405, + "reward_change_std": 0.1840755846351385, + "reward_std": 0.7130421474575996, + "rewards/cosine_scaled_reward": -0.025997468270361423, + "rewards/format_reward": 0.895833358168602, + "step": 356 + }, + { + "advantage_max": 1.785775288939476, + "advantage_mean": -2.1730860499946658e-08, + "advantage_min": -0.836478516459465, + "advantage_std": 0.9998077526688576, + "completion_length": 1165.31254196167, + "epoch": 0.408, + "grad_norm": 4.218529224395752, + "kl": 0.4105377197265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0164, + "reward": 0.36710093077272177, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36710093077272177, + "reward_after_std": 0.7088046334683895, + "reward_before_mean": 0.5877696983516216, + "reward_before_std": 0.6624849736690521, + "reward_change_max": 0.0, + "reward_change_mean": -0.22066876105964184, + "reward_change_min": -0.383340559899807, + "reward_change_std": 0.13388633634895086, + "reward_std": 0.7088046558201313, + "rewards/cosine_scaled_reward": -0.15403182711452246, + "rewards/format_reward": 0.8958333432674408, + "step": 357 + }, + { + "advantage_max": 1.6324420422315598, + "advantage_mean": -5.587936335871291e-09, + "advantage_min": -1.0728551223874092, + "advantage_std": 0.999869205057621, + "completion_length": 1239.2500305175781, + "epoch": 0.40914285714285714, + "grad_norm": 0.9532815217971802, + "kl": 0.367156982421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0146, + "reward": 0.8309711366891861, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8309711366891861, + "reward_after_std": 0.9139232225716114, + "reward_before_mean": 1.1447288244962692, + "reward_before_std": 0.8961115628480911, + "reward_change_max": 0.0, + "reward_change_mean": -0.3137576524168253, + "reward_change_min": -0.5409190393984318, + "reward_change_std": 0.2029905915260315, + "reward_std": 0.9139232337474823, + "rewards/cosine_scaled_reward": 0.10361439734697342, + "rewards/format_reward": 0.9375000074505806, + "step": 358 + }, + { + "advantage_max": 1.668090134859085, + "advantage_mean": -4.672134945593598e-08, + "advantage_min": -1.0278353244066238, + "advantage_std": 0.9997305795550346, + "completion_length": 831.1875190734863, + "epoch": 0.4102857142857143, + "grad_norm": 1.9846396446228027, + "kl": 0.22521209716796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.009, + "reward": 0.43807821813970804, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.43807821813970804, + "reward_after_std": 0.48141663894057274, + "reward_before_mean": 0.6997129991650581, + "reward_before_std": 0.4477991936728358, + "reward_change_max": 0.0, + "reward_change_mean": -0.2616347875446081, + "reward_change_min": -0.4088161289691925, + "reward_change_std": 0.15432436391711235, + "reward_std": 0.48141664266586304, + "rewards/cosine_scaled_reward": -0.10847685020416975, + "rewards/format_reward": 0.9166666716337204, + "step": 359 + }, + { + "advantage_max": 1.7154224514961243, + "advantage_mean": -7.698933801592034e-08, + "advantage_min": -1.068584568798542, + "advantage_std": 0.9998128190636635, + "completion_length": 1000.2083587646484, + "epoch": 0.4114285714285714, + "grad_norm": 2.706932783126831, + "kl": 0.156585693359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0063, + "reward": 0.8180207312107086, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8180207312107086, + "reward_after_std": 0.666923388838768, + "reward_before_mean": 1.1441688016057014, + "reward_before_std": 0.5980559233576059, + "reward_change_max": 0.0, + "reward_change_mean": -0.3261481188237667, + "reward_change_min": -0.4748628959059715, + "reward_change_std": 0.18042928539216518, + "reward_std": 0.6669234037399292, + "rewards/cosine_scaled_reward": 0.09291772660799325, + "rewards/format_reward": 0.9583333432674408, + "step": 360 + }, + { + "advantage_max": 1.8375728726387024, + "advantage_mean": -5.8362883192941695e-08, + "advantage_min": -0.8818403705954552, + "advantage_std": 0.999719150364399, + "completion_length": 1167.6667175292969, + "epoch": 0.4125714285714286, + "grad_norm": 0.9807596206665039, + "kl": 0.40717315673828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0163, + "reward": 0.4029839560389519, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4029839560389519, + "reward_after_std": 0.505818497389555, + "reward_before_mean": 0.6519249677658081, + "reward_before_std": 0.4368733561132103, + "reward_change_max": 0.0, + "reward_change_mean": -0.24894102104008198, + "reward_change_min": -0.3820135109126568, + "reward_change_std": 0.14100301824510098, + "reward_std": 0.5058185234665871, + "rewards/cosine_scaled_reward": -0.13237086776643991, + "rewards/format_reward": 0.916666679084301, + "step": 361 + }, + { + "advantage_max": 1.8133844584226608, + "advantage_mean": -6.2088184593633855e-09, + "advantage_min": -0.8550900742411613, + "advantage_std": 0.9996546134352684, + "completion_length": 876.8958473205566, + "epoch": 0.4137142857142857, + "grad_norm": 1.415966272354126, + "kl": 0.10638427734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0043, + "reward": 0.7734895506873727, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7734895506873727, + "reward_after_std": 0.3775805290788412, + "reward_before_mean": 1.110964479856193, + "reward_before_std": 0.27502059168182313, + "reward_change_max": 0.0, + "reward_change_mean": -0.3374749179929495, + "reward_change_min": -0.4654871243983507, + "reward_change_std": 0.17572191823273897, + "reward_std": 0.3775805290788412, + "rewards/cosine_scaled_reward": 0.05548222362995148, + "rewards/format_reward": 1.0, + "step": 362 + }, + { + "advantage_max": 1.6137598305940628, + "advantage_mean": -3.3527614018424856e-08, + "advantage_min": -1.2129024267196655, + "advantage_std": 0.9997744932770729, + "completion_length": 814.0625305175781, + "epoch": 0.41485714285714287, + "grad_norm": 0.6677997708320618, + "kl": 0.09397125244140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0038, + "reward": 1.0306510236114264, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 1.0306510236114264, + "reward_after_std": 0.5626971535384655, + "reward_before_mean": 1.4122940003871918, + "reward_before_std": 0.5126284081488848, + "reward_change_max": 0.0, + "reward_change_mean": -0.3816429302096367, + "reward_change_min": -0.5517849400639534, + "reward_change_std": 0.20960881654173136, + "reward_std": 0.5626971572637558, + "rewards/cosine_scaled_reward": 0.22698031552135944, + "rewards/format_reward": 0.9583333358168602, + "step": 363 + }, + { + "advantage_max": 1.8092490285634995, + "advantage_mean": -8.69234451084111e-09, + "advantage_min": -0.888951875269413, + "advantage_std": 0.999782919883728, + "completion_length": 1011.770866394043, + "epoch": 0.416, + "grad_norm": 1.6947314739227295, + "kl": 0.155548095703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0062, + "reward": 0.23180574737489223, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23180574737489223, + "reward_after_std": 0.510407954454422, + "reward_before_mean": 0.44417205080389977, + "reward_before_std": 0.45651115104556084, + "reward_change_max": 0.0, + "reward_change_mean": -0.21236632205545902, + "reward_change_min": -0.32998935878276825, + "reward_change_std": 0.12107465602457523, + "reward_std": 0.5104079619050026, + "rewards/cosine_scaled_reward": -0.2570806494913995, + "rewards/format_reward": 0.9583333432674408, + "step": 364 + }, + { + "advantage_max": 1.7235835492610931, + "advantage_mean": -3.13545276409144e-08, + "advantage_min": -0.9894550256431103, + "advantage_std": 0.9998277202248573, + "completion_length": 1275.5000305175781, + "epoch": 0.41714285714285715, + "grad_norm": 1.446303129196167, + "kl": 0.24599456787109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0099, + "reward": 0.379636493511498, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.379636493511498, + "reward_after_std": 0.7502324618399143, + "reward_before_mean": 0.6058452092111111, + "reward_before_std": 0.7191748432815075, + "reward_change_max": 0.0, + "reward_change_mean": -0.22620872594416142, + "reward_change_min": -0.3859753981232643, + "reward_change_std": 0.14396012295037508, + "reward_std": 0.7502324692904949, + "rewards/cosine_scaled_reward": -0.12416074390057474, + "rewards/format_reward": 0.854166679084301, + "step": 365 + }, + { + "advantage_max": 1.7427651286125183, + "advantage_mean": -1.2417634476236117e-08, + "advantage_min": -1.0082111209630966, + "advantage_std": 0.9998277053236961, + "completion_length": 1200.9583549499512, + "epoch": 0.41828571428571426, + "grad_norm": 1.0100181102752686, + "kl": 0.10961151123046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0044, + "reward": 0.7537405379116535, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7537405379116535, + "reward_after_std": 0.7664575092494488, + "reward_before_mean": 1.0614567548036575, + "reward_before_std": 0.7194897159934044, + "reward_change_max": 0.0, + "reward_change_mean": -0.307716216892004, + "reward_change_min": -0.49389464780688286, + "reward_change_std": 0.18914083298295736, + "reward_std": 0.7664575390517712, + "rewards/cosine_scaled_reward": 0.051561687141656876, + "rewards/format_reward": 0.9583333432674408, + "step": 366 + }, + { + "advantage_max": 1.588751271367073, + "advantage_mean": -1.2728075482471013e-08, + "advantage_min": -1.3483324870467186, + "advantage_std": 0.9998136162757874, + "completion_length": 1194.437515258789, + "epoch": 0.41942857142857143, + "grad_norm": 1.5068424940109253, + "kl": 0.2128753662109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0085, + "reward": 0.5264971938449889, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5264971938449889, + "reward_after_std": 0.6666704639792442, + "reward_before_mean": 0.7939775194972754, + "reward_before_std": 0.6534105539321899, + "reward_change_max": 0.0004104003310203552, + "reward_change_mean": -0.2674803026020527, + "reward_change_min": -0.4358235076069832, + "reward_change_std": 0.16966222040355206, + "reward_std": 0.6666704788804054, + "rewards/cosine_scaled_reward": -0.06134458933956921, + "rewards/format_reward": 0.916666679084301, + "step": 367 + }, + { + "advantage_max": 1.6325557231903076, + "advantage_mean": -1.986821529520455e-08, + "advantage_min": -1.0944968909025192, + "advantage_std": 0.9997764453291893, + "completion_length": 1412.7500305175781, + "epoch": 0.4205714285714286, + "grad_norm": 2.15203595161438, + "kl": 0.304656982421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.0122, + "reward": 0.43621888384222984, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.43621888384222984, + "reward_after_std": 0.6182968616485596, + "reward_before_mean": 0.6909672629553825, + "reward_before_std": 0.604456739500165, + "reward_change_max": 0.0, + "reward_change_mean": -0.25474837608635426, + "reward_change_min": -0.44062450528144836, + "reward_change_std": 0.16634196415543556, + "reward_std": 0.6182968728244305, + "rewards/cosine_scaled_reward": -0.10243305005133152, + "rewards/format_reward": 0.8958333395421505, + "step": 368 + }, + { + "advantage_max": 1.6553238332271576, + "advantage_mean": -1.3659398168108794e-08, + "advantage_min": -1.0655308365821838, + "advantage_std": 0.9998332932591438, + "completion_length": 1120.020851135254, + "epoch": 0.4217142857142857, + "grad_norm": 2.682713747024536, + "kl": 0.19134521484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0077, + "reward": 0.5608847080729902, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5608847080729902, + "reward_after_std": 0.8303621709346771, + "reward_before_mean": 0.8241658713668585, + "reward_before_std": 0.8261894173920155, + "reward_change_max": 0.0, + "reward_change_mean": -0.2632811777293682, + "reward_change_min": -0.5107766017317772, + "reward_change_std": 0.18701652251183987, + "reward_std": 0.8303622044622898, + "rewards/cosine_scaled_reward": -0.04625041130930185, + "rewards/format_reward": 0.9166666865348816, + "step": 369 + }, + { + "advantage_max": 1.606068804860115, + "advantage_mean": -4.842877510125021e-08, + "advantage_min": -1.1060052961111069, + "advantage_std": 0.999760165810585, + "completion_length": 1001.6042098999023, + "epoch": 0.4228571428571429, + "grad_norm": 0.9525280594825745, + "kl": 0.124969482421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.729523361034538e-07, + "loss": 0.005, + "reward": 0.5136999785900116, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5136999785900116, + "reward_after_std": 0.4509398341178894, + "reward_before_mean": 0.7930864812806249, + "reward_before_std": 0.39981068670749664, + "reward_change_max": 0.0006158947944641113, + "reward_change_mean": -0.2793865194544196, + "reward_change_min": -0.411048436537385, + "reward_change_std": 0.16224909853190184, + "reward_std": 0.45093984156847, + "rewards/cosine_scaled_reward": -0.07220677100121975, + "rewards/format_reward": 0.9375000074505806, + "step": 370 + }, + { + "advantage_max": 1.7884211093187332, + "advantage_mean": -1.1424224100053948e-07, + "advantage_min": -0.9556703455746174, + "advantage_std": 0.9997562393546104, + "completion_length": 696.0625228881836, + "epoch": 0.424, + "grad_norm": 0.7482513189315796, + "kl": 0.0223236083984375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0009, + "reward": 0.89418915938586, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.89418915938586, + "reward_after_std": 0.5914383120834827, + "reward_before_mean": 1.2429718682542443, + "reward_before_std": 0.5024533607065678, + "reward_change_max": 0.0, + "reward_change_mean": -0.3487827442586422, + "reward_change_min": -0.48125503212213516, + "reward_change_std": 0.19065604731440544, + "reward_std": 0.5914383307099342, + "rewards/cosine_scaled_reward": 0.14231925923377275, + "rewards/format_reward": 0.9583333358168602, + "step": 371 + }, + { + "advantage_max": 1.7135415375232697, + "advantage_mean": -4.842877521227251e-08, + "advantage_min": -1.1480904445052147, + "advantage_std": 0.9998008906841278, + "completion_length": 1243.895881652832, + "epoch": 0.42514285714285716, + "grad_norm": 2.1823599338531494, + "kl": 0.28174591064453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0113, + "reward": 0.6811677659861743, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6811677659861743, + "reward_after_std": 0.5767421014606953, + "reward_before_mean": 0.9846199788153172, + "reward_before_std": 0.5272019132971764, + "reward_change_max": 0.0, + "reward_change_mean": -0.3034522123634815, + "reward_change_min": -0.4524131715297699, + "reward_change_std": 0.17368095833808184, + "reward_std": 0.5767421163618565, + "rewards/cosine_scaled_reward": 0.07564329542219639, + "rewards/format_reward": 0.8333333507180214, + "step": 372 + }, + { + "advantage_max": 1.6011265963315964, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -1.062184102833271, + "advantage_std": 0.9997665360569954, + "completion_length": 776.4792022705078, + "epoch": 0.42628571428571427, + "grad_norm": 1.846110463142395, + "kl": 0.09600067138671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0038, + "reward": 0.2838655477389693, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2838655477389693, + "reward_after_std": 0.5549126155674458, + "reward_before_mean": 0.5090842507779598, + "reward_before_std": 0.5350898541510105, + "reward_change_max": 0.002757057547569275, + "reward_change_mean": -0.22521871328353882, + "reward_change_min": -0.3687067572027445, + "reward_change_std": 0.1422775825485587, + "reward_std": 0.5549126267433167, + "rewards/cosine_scaled_reward": -0.22462454997003078, + "rewards/format_reward": 0.9583333432674408, + "step": 373 + }, + { + "advantage_max": 1.7018559277057648, + "advantage_mean": -7.792065803702286e-08, + "advantage_min": -1.1280000060796738, + "advantage_std": 0.9998096823692322, + "completion_length": 959.5000305175781, + "epoch": 0.42742857142857144, + "grad_norm": 1.918383240699768, + "kl": 0.103363037109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0041, + "reward": 0.5767368387896568, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5767368387896568, + "reward_after_std": 0.7010692954063416, + "reward_before_mean": 0.8509616479277611, + "reward_before_std": 0.6722024735063314, + "reward_change_max": 0.0, + "reward_change_mean": -0.27422484941780567, + "reward_change_min": -0.4227101244032383, + "reward_change_std": 0.1696557030081749, + "reward_std": 0.7010693028569221, + "rewards/cosine_scaled_reward": -0.03285252209752798, + "rewards/format_reward": 0.9166666865348816, + "step": 374 + }, + { + "advantage_max": 1.7416959404945374, + "advantage_mean": 6.208816794028849e-10, + "advantage_min": -1.097055770456791, + "advantage_std": 0.999822311103344, + "completion_length": 1183.270866394043, + "epoch": 0.42857142857142855, + "grad_norm": 1.3700175285339355, + "kl": 0.21044921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0084, + "reward": 0.7338640615344048, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7338640615344048, + "reward_after_std": 0.6855736374855042, + "reward_before_mean": 1.0383987962268293, + "reward_before_std": 0.6238086558878422, + "reward_change_max": 0.0, + "reward_change_mean": -0.30453467927873135, + "reward_change_min": -0.46512749418616295, + "reward_change_std": 0.17507017496973276, + "reward_std": 0.6855736672878265, + "rewards/cosine_scaled_reward": 0.07128270622342825, + "rewards/format_reward": 0.895833358168602, + "step": 375 + }, + { + "advantage_max": 1.6964893490076065, + "advantage_mean": -2.8250118910833066e-08, + "advantage_min": -1.028324469923973, + "advantage_std": 0.9998196437954903, + "completion_length": 1078.8750305175781, + "epoch": 0.4297142857142857, + "grad_norm": 0.6061641573905945, + "kl": 0.114166259765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0046, + "reward": 0.8502661599777639, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8502661599777639, + "reward_after_std": 0.7495008744299412, + "reward_before_mean": 1.177900506183505, + "reward_before_std": 0.690794087946415, + "reward_change_max": 0.0, + "reward_change_mean": -0.327634334564209, + "reward_change_min": -0.5012494549155235, + "reward_change_std": 0.18774799816310406, + "reward_std": 0.7495008744299412, + "rewards/cosine_scaled_reward": 0.08895024354569614, + "rewards/format_reward": 1.0, + "step": 376 + }, + { + "advantage_max": 1.652765303850174, + "advantage_mean": 3.104408563547878e-09, + "advantage_min": -1.1104619428515434, + "advantage_std": 0.9998508244752884, + "completion_length": 1177.3125190734863, + "epoch": 0.4308571428571429, + "grad_norm": 2.5280399322509766, + "kl": 0.2825164794921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0113, + "reward": 0.29627796332351863, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.29627796332351863, + "reward_after_std": 0.8333842419087887, + "reward_before_mean": 0.5023422092199326, + "reward_before_std": 0.8306805156171322, + "reward_change_max": 0.00010532140731811523, + "reward_change_mean": -0.206064248457551, + "reward_change_min": -0.41014874540269375, + "reward_change_std": 0.15475755836814642, + "reward_std": 0.8333842568099499, + "rewards/cosine_scaled_reward": -0.16549558006227016, + "rewards/format_reward": 0.8333333507180214, + "step": 377 + }, + { + "advantage_max": 1.5440286844968796, + "advantage_mean": -5.215406562175673e-08, + "advantage_min": -1.1627759784460068, + "advantage_std": 0.9998477250337601, + "completion_length": 998.8750228881836, + "epoch": 0.432, + "grad_norm": 1.106315016746521, + "kl": 0.0748138427734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.003, + "reward": 0.8236580304801464, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.8236580304801464, + "reward_after_std": 0.7726437486708164, + "reward_before_mean": 1.1485275700688362, + "reward_before_std": 0.760891504585743, + "reward_change_max": 0.0, + "reward_change_mean": -0.3248695805668831, + "reward_change_min": -0.5357452519237995, + "reward_change_std": 0.20013280678540468, + "reward_std": 0.7726437710225582, + "rewards/cosine_scaled_reward": 0.09509711805731058, + "rewards/format_reward": 0.9583333432674408, + "step": 378 + }, + { + "advantage_max": 1.7088170647621155, + "advantage_mean": 7.450580735701706e-09, + "advantage_min": -1.0716500952839851, + "advantage_std": 0.99978818744421, + "completion_length": 1143.8333625793457, + "epoch": 0.43314285714285716, + "grad_norm": 1.33283269405365, + "kl": 0.16924285888671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0068, + "reward": 0.6464580819010735, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6464580819010735, + "reward_after_std": 0.602570503950119, + "reward_before_mean": 0.9419687129557133, + "reward_before_std": 0.5499370843172073, + "reward_change_max": 0.0, + "reward_change_mean": -0.2955106142908335, + "reward_change_min": -0.45715222880244255, + "reward_change_std": 0.16575047001242638, + "reward_std": 0.6025705374777317, + "rewards/cosine_scaled_reward": -0.018598987255245447, + "rewards/format_reward": 0.9791666716337204, + "step": 379 + }, + { + "advantage_max": 1.668600931763649, + "advantage_mean": -2.7939676461308238e-08, + "advantage_min": -1.1768651977181435, + "advantage_std": 0.9997805878520012, + "completion_length": 915.5208702087402, + "epoch": 0.4342857142857143, + "grad_norm": 0.6677605509757996, + "kl": 0.12503814697265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.488912271385139e-07, + "loss": 0.005, + "reward": 0.8405767795629799, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.8405767795629799, + "reward_after_std": 0.5229048319160938, + "reward_before_mean": 1.1833235658705235, + "reward_before_std": 0.45090617425739765, + "reward_change_max": 0.0, + "reward_change_mean": -0.34274679608643055, + "reward_change_min": -0.5035507827997208, + "reward_change_std": 0.1878078691661358, + "reward_std": 0.5229048356413841, + "rewards/cosine_scaled_reward": 0.11249509919434786, + "rewards/format_reward": 0.9583333432674408, + "step": 380 + }, + { + "advantage_max": 1.6725091934204102, + "advantage_mean": -1.5366822592177698e-08, + "advantage_min": -1.1343127712607384, + "advantage_std": 0.999749131500721, + "completion_length": 1320.1875228881836, + "epoch": 0.43542857142857144, + "grad_norm": 1.8746590614318848, + "kl": 0.4720611572265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0189, + "reward": 0.1860162508673966, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.1860162508673966, + "reward_after_std": 0.5676998719573021, + "reward_before_mean": 0.38731856519007124, + "reward_before_std": 0.5420579835772514, + "reward_change_max": 0.00022460520267486572, + "reward_change_mean": -0.20130231231451035, + "reward_change_min": -0.3233092837035656, + "reward_change_std": 0.1276199435815215, + "reward_std": 0.5676998980343342, + "rewards/cosine_scaled_reward": -0.22300739493221045, + "rewards/format_reward": 0.833333358168602, + "step": 381 + }, + { + "advantage_max": 1.754372239112854, + "advantage_mean": -5.836288241578558e-08, + "advantage_min": -0.9967377930879593, + "advantage_std": 0.9997845068573952, + "completion_length": 907.1250076293945, + "epoch": 0.43657142857142855, + "grad_norm": 0.689816415309906, + "kl": 0.05706024169921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0023, + "reward": 0.4433266781270504, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4433266781270504, + "reward_after_std": 0.5864979885518551, + "reward_before_mean": 0.6966921277344227, + "reward_before_std": 0.5301778148859739, + "reward_change_max": 0.0, + "reward_change_mean": -0.253365445882082, + "reward_change_min": -0.40340840443968773, + "reward_change_std": 0.1430813828483224, + "reward_std": 0.5864980109035969, + "rewards/cosine_scaled_reward": -0.13082062639296055, + "rewards/format_reward": 0.9583333432674408, + "step": 382 + }, + { + "advantage_max": 1.7084899097681046, + "advantage_mean": -1.0927518856451712e-07, + "advantage_min": -1.0436795875430107, + "advantage_std": 0.9998108968138695, + "completion_length": 1010.9166870117188, + "epoch": 0.4377142857142857, + "grad_norm": 1.530521273612976, + "kl": 0.320953369140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0128, + "reward": 0.9637196809053421, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.9637196809053421, + "reward_after_std": 0.7095706351101398, + "reward_before_mean": 1.315572265535593, + "reward_before_std": 0.6498497929424047, + "reward_change_max": 0.0005817189812660217, + "reward_change_mean": -0.35185267589986324, + "reward_change_min": -0.5468010529875755, + "reward_change_std": 0.2091050622984767, + "reward_std": 0.7095706537365913, + "rewards/cosine_scaled_reward": 0.1786194909363985, + "rewards/format_reward": 0.9583333358168602, + "step": 383 + }, + { + "advantage_max": 1.5724465548992157, + "advantage_mean": -4.221995775210985e-08, + "advantage_min": -1.0186072289943695, + "advantage_std": 0.9998998194932938, + "completion_length": 874.6666870117188, + "epoch": 0.43885714285714283, + "grad_norm": 1.4073811769485474, + "kl": 0.1265411376953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0051, + "reward": 0.9594648890197277, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.9594648890197277, + "reward_after_std": 1.026247426867485, + "reward_before_mean": 1.2966319471597672, + "reward_before_std": 1.0357157215476036, + "reward_change_max": 0.0, + "reward_change_mean": -0.33716709539294243, + "reward_change_min": -0.6071061193943024, + "reward_change_std": 0.23107926733791828, + "reward_std": 1.0262474715709686, + "rewards/cosine_scaled_reward": 0.17956596659496427, + "rewards/format_reward": 0.9375000074505806, + "step": 384 + }, + { + "advantage_max": 1.67272287607193, + "advantage_mean": -4.159907573964006e-08, + "advantage_min": -1.1390063092112541, + "advantage_std": 0.9998363107442856, + "completion_length": 1162.333381652832, + "epoch": 0.44, + "grad_norm": 2.0045926570892334, + "kl": 0.509124755859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0203, + "reward": 0.588574624620378, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.588574624620378, + "reward_after_std": 0.8666601609438658, + "reward_before_mean": 0.8494505360722542, + "reward_before_std": 0.8376770131289959, + "reward_change_max": 0.0007246658205986023, + "reward_change_mean": -0.2608759067952633, + "reward_change_min": -0.45984548330307007, + "reward_change_std": 0.1684568226337433, + "reward_std": 0.866660175845027, + "rewards/cosine_scaled_reward": -0.04402475664392114, + "rewards/format_reward": 0.9375000149011612, + "step": 385 + }, + { + "advantage_max": 1.5228633731603622, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -1.2088345661759377, + "advantage_std": 0.999832883477211, + "completion_length": 1030.208366394043, + "epoch": 0.44114285714285717, + "grad_norm": 3.5907418727874756, + "kl": 0.333953857421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0134, + "reward": 0.5742043564096093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5742043564096093, + "reward_after_std": 0.7357494235038757, + "reward_before_mean": 0.852111779153347, + "reward_before_std": 0.7517198957502842, + "reward_change_max": 0.0, + "reward_change_mean": -0.27790740318596363, + "reward_change_min": -0.48950906842947006, + "reward_change_std": 0.1909992415457964, + "reward_std": 0.7357494346797466, + "rewards/cosine_scaled_reward": -0.01144411601126194, + "rewards/format_reward": 0.8750000298023224, + "step": 386 + }, + { + "advantage_max": 1.442567840218544, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -1.3513574451208115, + "advantage_std": 0.9998595416545868, + "completion_length": 1112.1042022705078, + "epoch": 0.4422857142857143, + "grad_norm": 2.41609263420105, + "kl": 0.5802001953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0232, + "reward": 0.5504259113222361, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5504259113222361, + "reward_after_std": 0.8110573142766953, + "reward_before_mean": 0.8169084526598454, + "reward_before_std": 0.8353900499641895, + "reward_change_max": 0.0005081519484519958, + "reward_change_mean": -0.2664825525134802, + "reward_change_min": -0.4708978869020939, + "reward_change_std": 0.18911569099873304, + "reward_std": 0.8110573403537273, + "rewards/cosine_scaled_reward": -0.008212439250200987, + "rewards/format_reward": 0.8333333507180214, + "step": 387 + }, + { + "advantage_max": 1.4479832649230957, + "advantage_mean": -3.445893592690652e-08, + "advantage_min": -1.3541993200778961, + "advantage_std": 0.9998626410961151, + "completion_length": 1048.25004196167, + "epoch": 0.44342857142857145, + "grad_norm": 2.266045331954956, + "kl": 0.32752227783203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0131, + "reward": 0.7618649862706661, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7618649862706661, + "reward_after_std": 0.8572219982743263, + "reward_before_mean": 1.070565501227975, + "reward_before_std": 0.8761973008513451, + "reward_change_max": 0.0, + "reward_change_mean": -0.3087005550041795, + "reward_change_min": -0.5235641039907932, + "reward_change_std": 0.20725560653954744, + "reward_std": 0.8572220206260681, + "rewards/cosine_scaled_reward": 0.09778274083510041, + "rewards/format_reward": 0.8750000111758709, + "step": 388 + }, + { + "advantage_max": 1.5555903911590576, + "advantage_mean": -1.179675274132208e-08, + "advantage_min": -1.23585844039917, + "advantage_std": 0.9997957497835159, + "completion_length": 992.4166946411133, + "epoch": 0.44457142857142856, + "grad_norm": 1.277256965637207, + "kl": 0.1567840576171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0063, + "reward": 0.5500368820503354, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5500368820503354, + "reward_after_std": 0.5696189440786839, + "reward_before_mean": 0.831305742263794, + "reward_before_std": 0.5486670546233654, + "reward_change_max": 0.0004959702491760254, + "reward_change_mean": -0.28126880899071693, + "reward_change_min": -0.44873275980353355, + "reward_change_std": 0.16680119093507528, + "reward_std": 0.569618958979845, + "rewards/cosine_scaled_reward": -0.05309715494513512, + "rewards/format_reward": 0.9375000074505806, + "step": 389 + }, + { + "advantage_max": 1.7426921427249908, + "advantage_mean": -2.048909719665204e-08, + "advantage_min": -1.0142326354980469, + "advantage_std": 0.9997293725609779, + "completion_length": 1207.3750228881836, + "epoch": 0.44571428571428573, + "grad_norm": 4.534472942352295, + "kl": 0.4759674072265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.019, + "reward": 0.36581041291356087, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.36581041291356087, + "reward_after_std": 0.5021828729659319, + "reward_before_mean": 0.6087434627115726, + "reward_before_std": 0.44767044857144356, + "reward_change_max": 0.0, + "reward_change_mean": -0.24293305538594723, + "reward_change_min": -0.37964949011802673, + "reward_change_std": 0.13425681181252003, + "reward_std": 0.502182874828577, + "rewards/cosine_scaled_reward": -0.17479494586586952, + "rewards/format_reward": 0.9583333358168602, + "step": 390 + }, + { + "advantage_max": 1.5873253792524338, + "advantage_mean": -6.208818459363386e-10, + "advantage_min": -1.0947215482592583, + "advantage_std": 0.9998257681727409, + "completion_length": 919.7500267028809, + "epoch": 0.44685714285714284, + "grad_norm": 2.652310848236084, + "kl": 0.55010986328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.022, + "reward": 0.6688290182501078, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6688290182501078, + "reward_after_std": 0.7176588997244835, + "reward_before_mean": 0.9627345129847527, + "reward_before_std": 0.7010093629360199, + "reward_change_max": 0.0, + "reward_change_mean": -0.293905483558774, + "reward_change_min": -0.4939264915883541, + "reward_change_std": 0.18045319989323616, + "reward_std": 0.7176589332520962, + "rewards/cosine_scaled_reward": 0.02303390298038721, + "rewards/format_reward": 0.916666679084301, + "step": 391 + }, + { + "advantage_max": 1.6615222543478012, + "advantage_mean": -7.636845200664766e-08, + "advantage_min": -1.0366918966174126, + "advantage_std": 0.9997808933258057, + "completion_length": 1095.5416984558105, + "epoch": 0.448, + "grad_norm": 2.397010326385498, + "kl": 0.3244476318359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.013, + "reward": 0.5707610095851123, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5707610095851123, + "reward_after_std": 0.5330556966364384, + "reward_before_mean": 0.8551894128322601, + "reward_before_std": 0.4827442280948162, + "reward_change_max": 0.0, + "reward_change_mean": -0.2844284549355507, + "reward_change_min": -0.4532608240842819, + "reward_change_std": 0.16107281111180782, + "reward_std": 0.5330557078123093, + "rewards/cosine_scaled_reward": -0.051571968011558056, + "rewards/format_reward": 0.9583333432674408, + "step": 392 + }, + { + "advantage_max": 1.6159837245941162, + "advantage_mean": -2.6077032422300306e-08, + "advantage_min": -1.1400106847286224, + "advantage_std": 0.9998411536216736, + "completion_length": 990.8750038146973, + "epoch": 0.4491428571428571, + "grad_norm": 2.5222160816192627, + "kl": 0.299957275390625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.012, + "reward": 0.5573246697895229, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5573246697895229, + "reward_after_std": 0.8247486017644405, + "reward_before_mean": 0.8237691428512335, + "reward_before_std": 0.8334475830197334, + "reward_change_max": 0.0019779950380325317, + "reward_change_mean": -0.26644447445869446, + "reward_change_min": -0.4715557172894478, + "reward_change_std": 0.1896289987489581, + "reward_std": 0.8247486054897308, + "rewards/cosine_scaled_reward": -0.03603211464360356, + "rewards/format_reward": 0.895833358168602, + "step": 393 + }, + { + "advantage_max": 1.7188266068696976, + "advantage_mean": -3.725290076417309e-09, + "advantage_min": -0.9942247793078423, + "advantage_std": 0.9998029246926308, + "completion_length": 1272.8958740234375, + "epoch": 0.4502857142857143, + "grad_norm": 1.3673608303070068, + "kl": 0.454193115234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0182, + "reward": 0.26626094873063266, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.26626094873063266, + "reward_after_std": 0.5838912799954414, + "reward_before_mean": 0.48196034505963326, + "reward_before_std": 0.5439142994582653, + "reward_change_max": 0.0, + "reward_change_mean": -0.21569938398897648, + "reward_change_min": -0.3452872224152088, + "reward_change_std": 0.13051257003098726, + "reward_std": 0.583891287446022, + "rewards/cosine_scaled_reward": -0.2277698372490704, + "rewards/format_reward": 0.9375000074505806, + "step": 394 + }, + { + "advantage_max": 1.6559451222419739, + "advantage_mean": -3.849466734262563e-08, + "advantage_min": -1.066048376262188, + "advantage_std": 0.9998432993888855, + "completion_length": 982.9791927337646, + "epoch": 0.4514285714285714, + "grad_norm": 4.505710601806641, + "kl": 0.381683349609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0153, + "reward": 0.6048067780211568, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6048067780211568, + "reward_after_std": 0.7917204722762108, + "reward_before_mean": 0.8776602856814861, + "reward_before_std": 0.7769417259842157, + "reward_change_max": 0.0, + "reward_change_mean": -0.2728535272181034, + "reward_change_min": -0.49752773344516754, + "reward_change_std": 0.18069400545209646, + "reward_std": 0.7917204722762108, + "rewards/cosine_scaled_reward": -0.02991985995322466, + "rewards/format_reward": 0.9375000149011612, + "step": 395 + }, + { + "advantage_max": 1.6781842708587646, + "advantage_mean": -1.7229467852430957e-08, + "advantage_min": -1.1216829270124435, + "advantage_std": 0.9997909143567085, + "completion_length": 957.6875381469727, + "epoch": 0.45257142857142857, + "grad_norm": 1.514703392982483, + "kl": 0.1158599853515625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.134908592756607e-07, + "loss": 0.0046, + "reward": 0.554951966740191, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.554951966740191, + "reward_after_std": 0.5902515798807144, + "reward_before_mean": 0.8319753324612975, + "reward_before_std": 0.5441593956202269, + "reward_change_max": 0.0, + "reward_change_mean": -0.2770233787596226, + "reward_change_min": -0.4109371602535248, + "reward_change_std": 0.1581767164170742, + "reward_std": 0.5902515836060047, + "rewards/cosine_scaled_reward": -0.052762338891625404, + "rewards/format_reward": 0.9375000074505806, + "step": 396 + }, + { + "advantage_max": 1.7707886546850204, + "advantage_mean": -5.502564409676225e-08, + "advantage_min": -0.994467705488205, + "advantage_std": 0.999783493578434, + "completion_length": 962.8958587646484, + "epoch": 0.45371428571428574, + "grad_norm": 0.738290548324585, + "kl": 0.18896484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0076, + "reward": 0.5158554278314114, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5158554278314114, + "reward_after_std": 0.6469662524759769, + "reward_before_mean": 0.7763611227273941, + "reward_before_std": 0.5864624343812466, + "reward_change_max": 0.0, + "reward_change_mean": -0.26050567254424095, + "reward_change_min": -0.38990357145667076, + "reward_change_std": 0.14337906893342733, + "reward_std": 0.6469662673771381, + "rewards/cosine_scaled_reward": -0.08056945540010929, + "rewards/format_reward": 0.9375000074505806, + "step": 397 + }, + { + "advantage_max": 1.7128700017929077, + "advantage_mean": -1.8626452269465688e-08, + "advantage_min": -0.9575743451714516, + "advantage_std": 0.9998482540249825, + "completion_length": 953.5833587646484, + "epoch": 0.45485714285714285, + "grad_norm": 3.236091136932373, + "kl": 0.448089599609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0179, + "reward": 0.478635611012578, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.478635611012578, + "reward_after_std": 0.8629732169210911, + "reward_before_mean": 0.7193904295563698, + "reward_before_std": 0.8417020551860332, + "reward_change_max": 0.0, + "reward_change_mean": -0.24075481295585632, + "reward_change_min": -0.4607015699148178, + "reward_change_std": 0.16871712915599346, + "reward_std": 0.862973265349865, + "rewards/cosine_scaled_reward": -0.0882214680314064, + "rewards/format_reward": 0.8958333507180214, + "step": 398 + }, + { + "advantage_max": 1.8339340090751648, + "advantage_mean": -5.463759156221215e-08, + "advantage_min": -0.9071780741214752, + "advantage_std": 0.9998093396425247, + "completion_length": 950.1250305175781, + "epoch": 0.456, + "grad_norm": 1.681481957435608, + "kl": 0.17317962646484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0069, + "reward": 0.768366850912571, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.768366850912571, + "reward_after_std": 0.7463151644915342, + "reward_before_mean": 1.0743264742195606, + "reward_before_std": 0.6716021299362183, + "reward_change_max": 0.0, + "reward_change_mean": -0.3059596102684736, + "reward_change_min": -0.4656844697892666, + "reward_change_std": 0.17044306732714176, + "reward_std": 0.7463151644915342, + "rewards/cosine_scaled_reward": 0.04757988639175892, + "rewards/format_reward": 0.9791666716337204, + "step": 399 + }, + { + "advantage_max": 1.6985308676958084, + "advantage_mean": -5.091230281806247e-08, + "advantage_min": -1.1028654128313065, + "advantage_std": 0.999831885099411, + "completion_length": 962.2500190734863, + "epoch": 0.45714285714285713, + "grad_norm": 1.1464098691940308, + "kl": 0.377685546875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0151, + "reward": 1.0859681889414787, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 1.0859681889414787, + "reward_after_std": 0.7348118424415588, + "reward_before_mean": 1.4632576033473015, + "reward_before_std": 0.6647907719016075, + "reward_change_max": 0.0, + "reward_change_mean": -0.37728938460350037, + "reward_change_min": -0.5801199749112129, + "reward_change_std": 0.21376279927790165, + "reward_std": 0.7348118610680103, + "rewards/cosine_scaled_reward": 0.2420454490929842, + "rewards/format_reward": 0.9791666716337204, + "step": 400 + }, + { + "advantage_max": 1.6830779165029526, + "advantage_mean": -1.9247333171712455e-08, + "advantage_min": -1.0928455740213394, + "advantage_std": 0.9998010620474815, + "completion_length": 1125.5625228881836, + "epoch": 0.4582857142857143, + "grad_norm": 2.874091386795044, + "kl": 0.520751953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0208, + "reward": 0.7291412346530706, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.7291412346530706, + "reward_after_std": 0.561864573508501, + "reward_before_mean": 1.0420525595545769, + "reward_before_std": 0.496932677924633, + "reward_change_max": 0.0, + "reward_change_mean": -0.31291132792830467, + "reward_change_min": -0.45355165749788284, + "reward_change_std": 0.17144971620291471, + "reward_std": 0.5618645772337914, + "rewards/cosine_scaled_reward": 0.07310961186885834, + "rewards/format_reward": 0.8958333432674408, + "step": 401 + }, + { + "advantage_max": 1.5561466589570045, + "advantage_mean": -1.2262414639252484e-08, + "advantage_min": -1.185271441936493, + "advantage_std": 0.9997816905379295, + "completion_length": 947.8541946411133, + "epoch": 0.4594285714285714, + "grad_norm": 2.5312564373016357, + "kl": 0.2422943115234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0097, + "reward": 0.6046357601881027, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6046357601881027, + "reward_after_std": 0.526579961180687, + "reward_before_mean": 0.9012303352355957, + "reward_before_std": 0.4946477487683296, + "reward_change_max": 0.0, + "reward_change_mean": -0.29659455455839634, + "reward_change_min": -0.4401095174252987, + "reward_change_std": 0.1744805257767439, + "reward_std": 0.5265799760818481, + "rewards/cosine_scaled_reward": -0.018134850077331066, + "rewards/format_reward": 0.9375000074505806, + "step": 402 + }, + { + "advantage_max": 1.7446418106555939, + "advantage_mean": -5.463759467083662e-08, + "advantage_min": -1.0116091333329678, + "advantage_std": 0.9997680559754372, + "completion_length": 935.0833511352539, + "epoch": 0.4605714285714286, + "grad_norm": 1.1639891862869263, + "kl": 0.3494873046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.014, + "reward": 0.7093625888228416, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7093625888228416, + "reward_after_std": 0.5025767236948013, + "reward_before_mean": 1.0228974930942059, + "reward_before_std": 0.41769439866766334, + "reward_change_max": 0.0, + "reward_change_mean": -0.3135349079966545, + "reward_change_min": -0.4433485083281994, + "reward_change_std": 0.1674872562289238, + "reward_std": 0.5025767236948013, + "rewards/cosine_scaled_reward": 0.021865406539291143, + "rewards/format_reward": 0.9791666716337204, + "step": 403 + }, + { + "advantage_max": 1.4135560542345047, + "advantage_mean": -3.601114090256985e-08, + "advantage_min": -1.425866760313511, + "advantage_std": 0.9998155757784843, + "completion_length": 1190.395851135254, + "epoch": 0.4617142857142857, + "grad_norm": 3.522855281829834, + "kl": 0.6092529296875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0244, + "reward": 0.5564087391830981, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5564087391830981, + "reward_after_std": 0.6142255328595638, + "reward_before_mean": 0.8380017122253776, + "reward_before_std": 0.6324377954006195, + "reward_change_max": 0.002075694501399994, + "reward_change_mean": -0.28159296326339245, + "reward_change_min": -0.44256654381752014, + "reward_change_std": 0.1803694237023592, + "reward_std": 0.6142255514860153, + "rewards/cosine_scaled_reward": -0.039332504384219646, + "rewards/format_reward": 0.9166666865348816, + "step": 404 + }, + { + "advantage_max": 1.6321633905172348, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -1.157719410955906, + "advantage_std": 0.999863937497139, + "completion_length": 943.4792022705078, + "epoch": 0.46285714285714286, + "grad_norm": 2.017138719558716, + "kl": 0.25444793701171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0102, + "reward": 0.8636754900217056, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.8636754900217056, + "reward_after_std": 0.8582662865519524, + "reward_before_mean": 1.1882897391915321, + "reward_before_std": 0.8196110427379608, + "reward_change_max": 0.0, + "reward_change_mean": -0.3246142454445362, + "reward_change_min": -0.5169795639812946, + "reward_change_std": 0.2024751529097557, + "reward_std": 0.8582663163542747, + "rewards/cosine_scaled_reward": 0.1253948686644435, + "rewards/format_reward": 0.9375000149011612, + "step": 405 + }, + { + "advantage_max": 1.6853253245353699, + "advantage_mean": -5.898376320701004e-08, + "advantage_min": -0.9819885492324829, + "advantage_std": 0.9998152479529381, + "completion_length": 1081.6250228881836, + "epoch": 0.464, + "grad_norm": 1.2055500745773315, + "kl": 0.2242889404296875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.934696604901642e-07, + "loss": 0.009, + "reward": 0.619488287717104, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.619488287717104, + "reward_after_std": 0.7117302119731903, + "reward_before_mean": 0.9034831523895264, + "reward_before_std": 0.6682868003845215, + "reward_change_max": 0.0012862235307693481, + "reward_change_mean": -0.28399485163390636, + "reward_change_min": -0.486436128616333, + "reward_change_std": 0.1785287642851472, + "reward_std": 0.7117302231490612, + "rewards/cosine_scaled_reward": -0.027425101026892662, + "rewards/format_reward": 0.9583333432674408, + "step": 406 + }, + { + "advantage_max": 1.6408893316984177, + "advantage_mean": 1.9868215517249155e-08, + "advantage_min": -1.091450996696949, + "advantage_std": 0.9997433796525002, + "completion_length": 952.7500305175781, + "epoch": 0.46514285714285714, + "grad_norm": 1.565846562385559, + "kl": 0.438507080078125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0176, + "reward": 0.612242775503546, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.612242775503546, + "reward_after_std": 0.621722761541605, + "reward_before_mean": 0.9034815113991499, + "reward_before_std": 0.596266932785511, + "reward_change_max": 0.0011898577213287354, + "reward_change_mean": -0.29123874474316835, + "reward_change_min": -0.4727095998823643, + "reward_change_std": 0.1861831620335579, + "reward_std": 0.6217227801680565, + "rewards/cosine_scaled_reward": -0.006592577323317528, + "rewards/format_reward": 0.9166666716337204, + "step": 407 + }, + { + "advantage_max": 1.6723837852478027, + "advantage_mean": 4.346171422753287e-09, + "advantage_min": -0.966572105884552, + "advantage_std": 0.9998452588915825, + "completion_length": 1221.7291870117188, + "epoch": 0.4662857142857143, + "grad_norm": 0.8200952410697937, + "kl": 0.225555419921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.8967088307307e-07, + "loss": 0.009, + "reward": 0.7919215075671673, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7919215075671673, + "reward_after_std": 0.7756611332297325, + "reward_before_mean": 1.1055903919041157, + "reward_before_std": 0.7286336794495583, + "reward_change_max": 0.0, + "reward_change_mean": -0.3136688694357872, + "reward_change_min": -0.5055144652724266, + "reward_change_std": 0.18916036747395992, + "reward_std": 0.7756611555814743, + "rewards/cosine_scaled_reward": 0.06321184895932674, + "rewards/format_reward": 0.9791666716337204, + "step": 408 + }, + { + "advantage_max": 1.710029736161232, + "advantage_mean": -2.4835268952472234e-08, + "advantage_min": -0.9915341734886169, + "advantage_std": 0.9998149424791336, + "completion_length": 1522.7292175292969, + "epoch": 0.4674285714285714, + "grad_norm": 3.4706716537475586, + "kl": 1.00244140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.04, + "reward": 0.482068314217031, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.482068314217031, + "reward_after_std": 0.6969355121254921, + "reward_before_mean": 0.7364165559411049, + "reward_before_std": 0.6672717742621899, + "reward_change_max": 0.0, + "reward_change_mean": -0.25434826500713825, + "reward_change_min": -0.4222247414290905, + "reward_change_std": 0.15766187477856874, + "reward_std": 0.6969355382025242, + "rewards/cosine_scaled_reward": -0.10054172901436687, + "rewards/format_reward": 0.9375000149011612, + "step": 409 + }, + { + "advantage_max": 1.6289841532707214, + "advantage_mean": 4.656615093523442e-10, + "advantage_min": -1.147033229470253, + "advantage_std": 0.9998533651232719, + "completion_length": 1149.8125228881836, + "epoch": 0.4685714285714286, + "grad_norm": 2.2747011184692383, + "kl": 0.551116943359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0221, + "reward": 0.5382617581635714, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5382617581635714, + "reward_after_std": 0.8370620533823967, + "reward_before_mean": 0.794367466121912, + "reward_before_std": 0.8302499540150166, + "reward_change_max": 0.0013188868761062622, + "reward_change_mean": -0.2561057098209858, + "reward_change_min": -0.4616791568696499, + "reward_change_std": 0.17526278086006641, + "reward_std": 0.8370620794594288, + "rewards/cosine_scaled_reward": -0.05073293065652251, + "rewards/format_reward": 0.8958333507180214, + "step": 410 + }, + { + "advantage_max": 1.6610813438892365, + "advantage_mean": -2.2972624524886243e-08, + "advantage_min": -1.0270356684923172, + "advantage_std": 0.9998531341552734, + "completion_length": 1607.1875457763672, + "epoch": 0.4697142857142857, + "grad_norm": 1.5188755989074707, + "kl": 0.5805892944335938, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0232, + "reward": 0.5008664312772453, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5008664312772453, + "reward_after_std": 0.8480726853013039, + "reward_before_mean": 0.7502545667812228, + "reward_before_std": 0.8465875536203384, + "reward_change_max": 0.0003954470157623291, + "reward_change_mean": -0.2493881806731224, + "reward_change_min": -0.45750103518366814, + "reward_change_std": 0.17382167372852564, + "reward_std": 0.8480726890265942, + "rewards/cosine_scaled_reward": -0.05195604544132948, + "rewards/format_reward": 0.8541666828095913, + "step": 411 + }, + { + "advantage_max": 1.6989116072654724, + "advantage_mean": -3.8649887429409446e-08, + "advantage_min": -1.031688578426838, + "advantage_std": 0.9998216927051544, + "completion_length": 970.8541946411133, + "epoch": 0.47085714285714286, + "grad_norm": 1.3265846967697144, + "kl": 0.204864501953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0082, + "reward": 0.5912466086447239, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5912466086447239, + "reward_after_std": 0.7558763138949871, + "reward_before_mean": 0.8657120540738106, + "reward_before_std": 0.7238161973655224, + "reward_change_max": 0.0, + "reward_change_mean": -0.2744654770940542, + "reward_change_min": -0.46801483258605003, + "reward_change_std": 0.17276459746062756, + "reward_std": 0.7558763138949871, + "rewards/cosine_scaled_reward": -0.03589397203177214, + "rewards/format_reward": 0.9375000149011612, + "step": 412 + }, + { + "advantage_max": 1.7271312475204468, + "advantage_mean": -6.146729214506763e-08, + "advantage_min": -1.0142634138464928, + "advantage_std": 0.9998214244842529, + "completion_length": 1049.8541831970215, + "epoch": 0.472, + "grad_norm": 1.6498157978057861, + "kl": 0.3886566162109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0156, + "reward": 0.8821854656562209, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8821854656562209, + "reward_after_std": 0.6554340831935406, + "reward_before_mean": 1.2208792939782143, + "reward_before_std": 0.575843945145607, + "reward_change_max": 0.0, + "reward_change_mean": -0.3386938404291868, + "reward_change_min": -0.4963124096393585, + "reward_change_std": 0.19166480377316475, + "reward_std": 0.6554341055452824, + "rewards/cosine_scaled_reward": 0.12085631024092436, + "rewards/format_reward": 0.9791666716337204, + "step": 413 + }, + { + "advantage_max": 1.741827353835106, + "advantage_mean": -2.9181441818515452e-08, + "advantage_min": -0.9508061856031418, + "advantage_std": 0.999826967716217, + "completion_length": 1378.1042175292969, + "epoch": 0.47314285714285714, + "grad_norm": 2.6625030040740967, + "kl": 0.2706451416015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0108, + "reward": 0.3892311230301857, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3892311230301857, + "reward_after_std": 0.6967345997691154, + "reward_before_mean": 0.6230743918567896, + "reward_before_std": 0.6546646431088448, + "reward_change_max": 0.0008612871170043945, + "reward_change_mean": -0.23384328000247478, + "reward_change_min": -0.3955848142504692, + "reward_change_std": 0.14179929625242949, + "reward_std": 0.6967346221208572, + "rewards/cosine_scaled_reward": -0.1572128008119762, + "rewards/format_reward": 0.9375000074505806, + "step": 414 + }, + { + "advantage_max": 1.6893931478261948, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -1.1113485097885132, + "advantage_std": 0.9998065233230591, + "completion_length": 1258.7916870117188, + "epoch": 0.4742857142857143, + "grad_norm": 26.61659049987793, + "kl": 0.7390365600585938, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0295, + "reward": 0.37299776542931795, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37299776542931795, + "reward_after_std": 0.6642138287425041, + "reward_before_mean": 0.6022981218993664, + "reward_before_std": 0.6314213052392006, + "reward_change_max": 0.0, + "reward_change_mean": -0.22930035181343555, + "reward_change_min": -0.37265395000576973, + "reward_change_std": 0.13867665268480778, + "reward_std": 0.6642138548195362, + "rewards/cosine_scaled_reward": -0.09468427952378988, + "rewards/format_reward": 0.7916666828095913, + "step": 415 + }, + { + "advantage_max": 1.690375730395317, + "advantage_mean": -2.0799537758797726e-08, + "advantage_min": -1.0659868568181992, + "advantage_std": 0.9998829066753387, + "completion_length": 1272.0208740234375, + "epoch": 0.4754285714285714, + "grad_norm": 1.6187878847122192, + "kl": 0.358184814453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0143, + "reward": 0.6220204895362258, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6220204895362258, + "reward_after_std": 1.002012237906456, + "reward_before_mean": 0.8834403976798058, + "reward_before_std": 0.9930763803422451, + "reward_change_max": 0.0028750449419021606, + "reward_change_mean": -0.2614199183881283, + "reward_change_min": -0.4997607283294201, + "reward_change_std": 0.18725214153528214, + "reward_std": 1.002012237906456, + "rewards/cosine_scaled_reward": -0.0061964658088982105, + "rewards/format_reward": 0.8958333432674408, + "step": 416 + }, + { + "advantage_max": 1.5955565720796585, + "advantage_mean": -3.8494667453647935e-08, + "advantage_min": -1.0881210714578629, + "advantage_std": 0.9998477771878242, + "completion_length": 1262.7708740234375, + "epoch": 0.4765714285714286, + "grad_norm": 1.8294861316680908, + "kl": 0.29937744140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.012, + "reward": 0.5803577015176415, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5803577015176415, + "reward_after_std": 0.7609933242201805, + "reward_before_mean": 0.8526553846895695, + "reward_before_std": 0.7512392140924931, + "reward_change_max": 0.0019479095935821533, + "reward_change_mean": -0.2722976878285408, + "reward_change_min": -0.502078078687191, + "reward_change_std": 0.1809261217713356, + "reward_std": 0.7609933689236641, + "rewards/cosine_scaled_reward": -0.05283900024369359, + "rewards/format_reward": 0.9583333432674408, + "step": 417 + }, + { + "advantage_max": 1.694640338420868, + "advantage_mean": -3.430371520174447e-08, + "advantage_min": -1.11370088160038, + "advantage_std": 0.9998883605003357, + "completion_length": 893.958366394043, + "epoch": 0.4777142857142857, + "grad_norm": 1.4542772769927979, + "kl": 0.157196044921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0063, + "reward": 1.1903546750545502, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 1.1903546750545502, + "reward_after_std": 0.9878413453698158, + "reward_before_mean": 1.5717356577515602, + "reward_before_std": 0.9362938571721315, + "reward_change_max": 0.0, + "reward_change_mean": -0.38138093799352646, + "reward_change_min": -0.5612896308302879, + "reward_change_std": 0.22556099202483892, + "reward_std": 0.9878413639962673, + "rewards/cosine_scaled_reward": 0.31711778859607875, + "rewards/format_reward": 0.9375000149011612, + "step": 418 + }, + { + "advantage_max": 1.618606060743332, + "advantage_mean": -4.4082603345430016e-08, + "advantage_min": -1.005763828754425, + "advantage_std": 0.999868668615818, + "completion_length": 1429.7083892822266, + "epoch": 0.47885714285714287, + "grad_norm": 2.259946346282959, + "kl": 0.345733642578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0139, + "reward": 0.9187582801096141, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.9187582801096141, + "reward_after_std": 0.8637475855648518, + "reward_before_mean": 1.2546281795948744, + "reward_before_std": 0.8357833307236433, + "reward_change_max": 0.00046975165605545044, + "reward_change_mean": -0.3358698934316635, + "reward_change_min": -0.5807428881525993, + "reward_change_std": 0.21599006466567516, + "reward_std": 0.8637476190924644, + "rewards/cosine_scaled_reward": 0.17939739441499114, + "rewards/format_reward": 0.8958333507180214, + "step": 419 + }, + { + "advantage_max": 1.688274398446083, + "advantage_mean": -2.5766592193221527e-08, + "advantage_min": -1.0731761306524277, + "advantage_std": 0.9998203814029694, + "completion_length": 906.9792022705078, + "epoch": 0.48, + "grad_norm": 2.0228357315063477, + "kl": 0.1790008544921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0071, + "reward": 0.39381164871156216, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.39381164871156216, + "reward_after_std": 0.7148094028234482, + "reward_before_mean": 0.6268439115956426, + "reward_before_std": 0.6882984191179276, + "reward_change_max": 0.0, + "reward_change_mean": -0.2330322489142418, + "reward_change_min": -0.40335175208747387, + "reward_change_std": 0.1487152185291052, + "reward_std": 0.7148094102740288, + "rewards/cosine_scaled_reward": -0.1449114013230428, + "rewards/format_reward": 0.9166666865348816, + "step": 420 + }, + { + "advantage_max": 1.6927991807460785, + "advantage_mean": -1.0554989104960555e-08, + "advantage_min": -1.1053090691566467, + "advantage_std": 0.9997538402676582, + "completion_length": 1260.5833702087402, + "epoch": 0.48114285714285715, + "grad_norm": 1.5715601444244385, + "kl": 0.400909423828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0161, + "reward": 0.3637576922774315, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3637576922774315, + "reward_after_std": 0.49668950214982033, + "reward_before_mean": 0.606179989874363, + "reward_before_std": 0.45100926607847214, + "reward_change_max": 0.0, + "reward_change_mean": -0.24242228642106056, + "reward_change_min": -0.3647758923470974, + "reward_change_std": 0.13684486132115126, + "reward_std": 0.4966895170509815, + "rewards/cosine_scaled_reward": -0.1760766813531518, + "rewards/format_reward": 0.9583333432674408, + "step": 421 + }, + { + "advantage_max": 1.7518580704927444, + "advantage_mean": -3.942599113848644e-08, + "advantage_min": -0.9576732888817787, + "advantage_std": 0.9998401924967766, + "completion_length": 1086.9166946411133, + "epoch": 0.48228571428571426, + "grad_norm": 1.1396833658218384, + "kl": 0.22650146484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.009, + "reward": 0.6568402461707592, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6568402461707592, + "reward_after_std": 0.7277768515050411, + "reward_before_mean": 0.9418208077549934, + "reward_before_std": 0.6685466133058071, + "reward_change_max": 0.0, + "reward_change_mean": -0.2849805485457182, + "reward_change_min": -0.44854456931352615, + "reward_change_std": 0.16233503445982933, + "reward_std": 0.7277768775820732, + "rewards/cosine_scaled_reward": -0.029089616611599922, + "rewards/format_reward": 1.0, + "step": 422 + }, + { + "advantage_max": 1.6433405727148056, + "advantage_mean": -3.2285850992685994e-08, + "advantage_min": -1.0724581107497215, + "advantage_std": 0.9997552186250687, + "completion_length": 1292.5000457763672, + "epoch": 0.48342857142857143, + "grad_norm": 1.9447253942489624, + "kl": 0.39086151123046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0157, + "reward": 0.33056771755218506, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33056771755218506, + "reward_after_std": 0.5717285163700581, + "reward_before_mean": 0.56210732832551, + "reward_before_std": 0.5475243860855699, + "reward_change_max": 0.0008535012602806091, + "reward_change_mean": -0.23153959307819605, + "reward_change_min": -0.40224830619990826, + "reward_change_std": 0.14767762832343578, + "reward_std": 0.571728527545929, + "rewards/cosine_scaled_reward": -0.1772796856239438, + "rewards/format_reward": 0.9166666716337204, + "step": 423 + }, + { + "advantage_max": 1.6328357756137848, + "advantage_mean": -4.967053657267684e-09, + "advantage_min": -1.156449869275093, + "advantage_std": 0.9997916594147682, + "completion_length": 1412.4375305175781, + "epoch": 0.4845714285714286, + "grad_norm": 3.788841485977173, + "kl": 0.7708892822265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0309, + "reward": 0.30179769173264503, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.30179769173264503, + "reward_after_std": 0.5867325812578201, + "reward_before_mean": 0.526239525526762, + "reward_before_std": 0.5687926262617111, + "reward_change_max": 0.0, + "reward_change_mean": -0.22444182448089123, + "reward_change_min": -0.3760899640619755, + "reward_change_std": 0.14267886988818645, + "reward_std": 0.5867325849831104, + "rewards/cosine_scaled_reward": -0.1847969237715006, + "rewards/format_reward": 0.8958333507180214, + "step": 424 + }, + { + "advantage_max": 1.5353441387414932, + "advantage_mean": -8.692344177774203e-08, + "advantage_min": -1.2177848778665066, + "advantage_std": 0.9998636916279793, + "completion_length": 1029.3125381469727, + "epoch": 0.4857142857142857, + "grad_norm": 1.9667916297912598, + "kl": 0.15512847900390625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0062, + "reward": 1.059104137122631, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 1.059104137122631, + "reward_after_std": 0.8383343629539013, + "reward_before_mean": 1.431776024401188, + "reward_before_std": 0.8190486012026668, + "reward_change_max": 0.0023517385125160217, + "reward_change_mean": -0.3726718984544277, + "reward_change_min": -0.5964533090591431, + "reward_change_std": 0.23735353536903858, + "reward_std": 0.8383343853056431, + "rewards/cosine_scaled_reward": 0.25755466148257256, + "rewards/format_reward": 0.9166666865348816, + "step": 425 + }, + { + "advantage_max": 1.7242664843797684, + "advantage_mean": -2.9181440985848184e-08, + "advantage_min": -1.0022350773215294, + "advantage_std": 0.9997998252511024, + "completion_length": 976.8958587646484, + "epoch": 0.4868571428571429, + "grad_norm": 1.1338958740234375, + "kl": 0.20705413818359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0083, + "reward": 0.5903048403561115, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5903048403561115, + "reward_after_std": 0.5925074368715286, + "reward_before_mean": 0.8734835237264633, + "reward_before_std": 0.5249487236142159, + "reward_change_max": 0.0, + "reward_change_mean": -0.28317867405712605, + "reward_change_min": -0.4353860914707184, + "reward_change_std": 0.15731794014573097, + "reward_std": 0.5925074480473995, + "rewards/cosine_scaled_reward": -0.052841583266854286, + "rewards/format_reward": 0.9791666716337204, + "step": 426 + }, + { + "advantage_max": 1.4864351898431778, + "advantage_mean": -3.849466678751412e-08, + "advantage_min": -1.1698247194290161, + "advantage_std": 0.9998398199677467, + "completion_length": 1417.645896911621, + "epoch": 0.488, + "grad_norm": 1.3676238059997559, + "kl": 0.2432708740234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0097, + "reward": 0.7394802048802376, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.7394802048802376, + "reward_after_std": 0.795317318290472, + "reward_before_mean": 1.04683218896389, + "reward_before_std": 0.8088573627173901, + "reward_change_max": 0.0, + "reward_change_mean": -0.3073519878089428, + "reward_change_min": -0.5503535270690918, + "reward_change_std": 0.20516410283744335, + "reward_std": 0.7953173480927944, + "rewards/cosine_scaled_reward": 0.03383274283260107, + "rewards/format_reward": 0.9791666716337204, + "step": 427 + }, + { + "advantage_max": 1.6609529703855515, + "advantage_mean": -4.718701163142214e-08, + "advantage_min": -1.1213230341672897, + "advantage_std": 0.9997992888092995, + "completion_length": 1233.4792022705078, + "epoch": 0.48914285714285716, + "grad_norm": 2.9406940937042236, + "kl": 0.372589111328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0149, + "reward": 0.36294333823025227, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36294333823025227, + "reward_after_std": 0.645927868783474, + "reward_before_mean": 0.5969743453897536, + "reward_before_std": 0.6330409869551659, + "reward_change_max": 0.002034500241279602, + "reward_change_mean": -0.23403102438896894, + "reward_change_min": -0.42131222784519196, + "reward_change_std": 0.1592898527160287, + "reward_std": 0.6459279023110867, + "rewards/cosine_scaled_reward": -0.13901282846927643, + "rewards/format_reward": 0.8750000149011612, + "step": 428 + }, + { + "advantage_max": 1.697268322110176, + "advantage_mean": 1.1486313233888268e-08, + "advantage_min": -1.1414394900202751, + "advantage_std": 0.999734528362751, + "completion_length": 794.1458511352539, + "epoch": 0.49028571428571427, + "grad_norm": 0.8047446608543396, + "kl": 0.1827239990234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0073, + "reward": 0.45462355855852365, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.45462355855852365, + "reward_after_std": 0.4094177149236202, + "reward_before_mean": 0.7234781309962273, + "reward_before_std": 0.3584022521972656, + "reward_change_max": 0.0, + "reward_change_mean": -0.268854558467865, + "reward_change_min": -0.38506653904914856, + "reward_change_std": 0.14412853959947824, + "reward_std": 0.4094177260994911, + "rewards/cosine_scaled_reward": -0.13826094195246696, + "rewards/format_reward": 1.0, + "step": 429 + }, + { + "advantage_max": 1.6596029549837112, + "advantage_mean": -1.1796752963366686e-08, + "advantage_min": -1.1022100150585175, + "advantage_std": 0.9998620226979256, + "completion_length": 1144.750015258789, + "epoch": 0.49142857142857144, + "grad_norm": 2.4741756916046143, + "kl": 0.2149505615234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0086, + "reward": 0.7748331986367702, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7748331986367702, + "reward_after_std": 0.9026194624602795, + "reward_before_mean": 1.0760859698057175, + "reward_before_std": 0.8741047717630863, + "reward_change_max": 0.0, + "reward_change_mean": -0.3012527599930763, + "reward_change_min": -0.512249581515789, + "reward_change_std": 0.19205154851078987, + "reward_std": 0.9026195108890533, + "rewards/cosine_scaled_reward": 0.05887631943915039, + "rewards/format_reward": 0.9583333432674408, + "step": 430 + }, + { + "advantage_max": 1.8094293773174286, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.9238940328359604, + "advantage_std": 0.9997945874929428, + "completion_length": 1009.0833435058594, + "epoch": 0.49257142857142855, + "grad_norm": 2.0188703536987305, + "kl": 0.3837890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0154, + "reward": 0.41635072650387883, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.41635072650387883, + "reward_after_std": 0.6224270965903997, + "reward_before_mean": 0.6578373126685619, + "reward_before_std": 0.5729903429746628, + "reward_change_max": 0.0, + "reward_change_mean": -0.24148658476769924, + "reward_change_min": -0.38924745842814445, + "reward_change_std": 0.14313186053186655, + "reward_std": 0.62242710031569, + "rewards/cosine_scaled_reward": -0.13983136042952538, + "rewards/format_reward": 0.9375000149011612, + "step": 431 + }, + { + "advantage_max": 1.641411080956459, + "advantage_mean": -1.7384689243726825e-08, + "advantage_min": -1.1451390460133553, + "advantage_std": 0.9998093023896217, + "completion_length": 1437.0208587646484, + "epoch": 0.4937142857142857, + "grad_norm": 3.1914052963256836, + "kl": 0.754730224609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0301, + "reward": 0.2787760675419122, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2787760675419122, + "reward_after_std": 0.6417630799114704, + "reward_before_mean": 0.49448305927217007, + "reward_before_std": 0.6182979214936495, + "reward_change_max": 1.4379620552062988e-06, + "reward_change_mean": -0.21570699103176594, + "reward_change_min": -0.3621121309697628, + "reward_change_std": 0.1398151321336627, + "reward_std": 0.6417631022632122, + "rewards/cosine_scaled_reward": -0.16942514950642362, + "rewards/format_reward": 0.8333333432674408, + "step": 432 + }, + { + "advantage_max": 1.5136045515537262, + "advantage_mean": -1.9868215850316062e-08, + "advantage_min": -1.2275639027357101, + "advantage_std": 0.9998220428824425, + "completion_length": 1271.7917098999023, + "epoch": 0.4948571428571429, + "grad_norm": 1.8898147344589233, + "kl": 0.28757476806640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0115, + "reward": 0.5753043964505196, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5753043964505196, + "reward_after_std": 0.7662338018417358, + "reward_before_mean": 0.8524379003793001, + "reward_before_std": 0.7830417305231094, + "reward_change_max": 0.0003195628523826599, + "reward_change_mean": -0.2771335020661354, + "reward_change_min": -0.4404204413294792, + "reward_change_std": 0.1899934383109212, + "reward_std": 0.7662338204681873, + "rewards/cosine_scaled_reward": -0.021697734715417027, + "rewards/format_reward": 0.8958333507180214, + "step": 433 + }, + { + "advantage_max": 1.7336640357971191, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -1.012288175523281, + "advantage_std": 0.9997320026159286, + "completion_length": 1327.5208435058594, + "epoch": 0.496, + "grad_norm": 1.9366543292999268, + "kl": 0.615142822265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0246, + "reward": 0.17344986740499735, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17344986740499735, + "reward_after_std": 0.40367303043603897, + "reward_before_mean": 0.3821178646758199, + "reward_before_std": 0.36095103435218334, + "reward_change_max": 0.0, + "reward_change_mean": -0.20866799354553223, + "reward_change_min": -0.3159416187554598, + "reward_change_std": 0.11741238739341497, + "reward_std": 0.40367304161190987, + "rewards/cosine_scaled_reward": -0.2672744058072567, + "rewards/format_reward": 0.9166666865348816, + "step": 434 + }, + { + "advantage_max": 1.6692968308925629, + "advantage_mean": 1.1102230246251565e-16, + "advantage_min": -1.1007107272744179, + "advantage_std": 0.9997232183814049, + "completion_length": 867.9375152587891, + "epoch": 0.49714285714285716, + "grad_norm": 1.5377548933029175, + "kl": 0.3750457763671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.015, + "reward": 0.34692182997241616, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.34692182997241616, + "reward_after_std": 0.40469590574502945, + "reward_before_mean": 0.5923015549778938, + "reward_before_std": 0.35693977400660515, + "reward_change_max": 0.0, + "reward_change_mean": -0.2453797236084938, + "reward_change_min": -0.3618598096072674, + "reward_change_std": 0.13376801926642656, + "reward_std": 0.40469592064619064, + "rewards/cosine_scaled_reward": -0.1934325685724616, + "rewards/format_reward": 0.9791666716337204, + "step": 435 + }, + { + "advantage_max": 1.5167209059000015, + "advantage_mean": -2.0644317699769488e-08, + "advantage_min": -1.2784735634922981, + "advantage_std": 0.9998432099819183, + "completion_length": 813.3333549499512, + "epoch": 0.4982857142857143, + "grad_norm": 1.6000545024871826, + "kl": 0.24234771728515625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.0097, + "reward": 0.9885508413426578, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9885508413426578, + "reward_after_std": 0.7530980035662651, + "reward_before_mean": 1.350018784403801, + "reward_before_std": 0.7326938565820456, + "reward_change_max": 0.0017473623156547546, + "reward_change_mean": -0.36146787740290165, + "reward_change_min": -0.5618167705833912, + "reward_change_std": 0.22278902772814035, + "reward_std": 0.7530980110168457, + "rewards/cosine_scaled_reward": 0.19584268890321255, + "rewards/format_reward": 0.9583333432674408, + "step": 436 + }, + { + "advantage_max": 1.662267044186592, + "advantage_mean": -4.066775316502458e-08, + "advantage_min": -1.2343645691871643, + "advantage_std": 0.9997730180621147, + "completion_length": 1176.9792022705078, + "epoch": 0.49942857142857144, + "grad_norm": 1.9178744554519653, + "kl": 0.27816009521484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0111, + "reward": 0.5513465432450175, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5513465432450175, + "reward_after_std": 0.5375252179801464, + "reward_before_mean": 0.8317223340272903, + "reward_before_std": 0.49098595418035984, + "reward_change_max": 0.0, + "reward_change_mean": -0.2803757805377245, + "reward_change_min": -0.42643988877534866, + "reward_change_std": 0.15675612725317478, + "reward_std": 0.5375252217054367, + "rewards/cosine_scaled_reward": -0.06330552324652672, + "rewards/format_reward": 0.9583333432674408, + "step": 437 + }, + { + "advantage_max": 1.5528530925512314, + "advantage_mean": -3.104408696774641e-08, + "advantage_min": -1.2435024604201317, + "advantage_std": 0.9998113289475441, + "completion_length": 1343.6250381469727, + "epoch": 0.5005714285714286, + "grad_norm": 2.4138340950012207, + "kl": 0.6477813720703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0259, + "reward": 0.4180278740823269, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4180278740823269, + "reward_after_std": 0.61670046672225, + "reward_before_mean": 0.6680997647345066, + "reward_before_std": 0.6113879233598709, + "reward_change_max": 0.0, + "reward_change_mean": -0.2500718683004379, + "reward_change_min": -0.42597054317593575, + "reward_change_std": 0.1606674799695611, + "reward_std": 0.6167004853487015, + "rewards/cosine_scaled_reward": -0.10345013532787561, + "rewards/format_reward": 0.8750000149011612, + "step": 438 + }, + { + "advantage_max": 1.8177452832460403, + "advantage_mean": -8.071462720415923e-09, + "advantage_min": -0.8807521760463715, + "advantage_std": 0.9997581467032433, + "completion_length": 908.2500228881836, + "epoch": 0.5017142857142857, + "grad_norm": 0.8225669264793396, + "kl": 0.2245941162109375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.009, + "reward": 0.27506811420107624, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.27506811420107624, + "reward_after_std": 0.5774744208902121, + "reward_before_mean": 0.49151327461004257, + "reward_before_std": 0.5247315559536219, + "reward_change_max": 0.0, + "reward_change_mean": -0.21644516475498676, + "reward_change_min": -0.3448958247900009, + "reward_change_std": 0.12561427615582943, + "reward_std": 0.5774744469672441, + "rewards/cosine_scaled_reward": -0.2438267096877098, + "rewards/format_reward": 0.9791666716337204, + "step": 439 + }, + { + "advantage_max": 1.688047081232071, + "advantage_mean": -9.31322552411018e-09, + "advantage_min": -1.1031463593244553, + "advantage_std": 0.9997521713376045, + "completion_length": 1091.2500190734863, + "epoch": 0.5028571428571429, + "grad_norm": 2.169895648956299, + "kl": 0.50408935546875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0202, + "reward": 0.2982004745863378, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2982004745863378, + "reward_after_std": 0.47428241185843945, + "reward_before_mean": 0.5280682481825352, + "reward_before_std": 0.4377642162144184, + "reward_change_max": 0.0, + "reward_change_mean": -0.2298677545040846, + "reward_change_min": -0.364645067602396, + "reward_change_std": 0.13350447546690702, + "reward_std": 0.4742824211716652, + "rewards/cosine_scaled_reward": -0.20471589546650648, + "rewards/format_reward": 0.9375000149011612, + "step": 440 + }, + { + "advantage_max": 1.6149345934391022, + "advantage_mean": -1.2417635586459141e-08, + "advantage_min": -1.1340602338314056, + "advantage_std": 0.9997862130403519, + "completion_length": 1496.5000534057617, + "epoch": 0.504, + "grad_norm": 2.867658853530884, + "kl": 0.64959716796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.026, + "reward": 0.6025845520198345, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6025845520198345, + "reward_after_std": 0.5532146245241165, + "reward_before_mean": 0.8909978433512151, + "reward_before_std": 0.5169778261333704, + "reward_change_max": 0.0, + "reward_change_mean": -0.28841328993439674, + "reward_change_min": -0.45017037354409695, + "reward_change_std": 0.16753645054996014, + "reward_std": 0.5532146394252777, + "rewards/cosine_scaled_reward": 0.00799890048801899, + "rewards/format_reward": 0.8750000149011612, + "step": 441 + }, + { + "advantage_max": 1.6888891458511353, + "advantage_mean": -8.568168063938231e-08, + "advantage_min": -1.0125275775790215, + "advantage_std": 0.9998335763812065, + "completion_length": 1133.1667022705078, + "epoch": 0.5051428571428571, + "grad_norm": 2.600919246673584, + "kl": 0.41729736328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0167, + "reward": 0.6196513641625643, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.6196513641625643, + "reward_after_std": 0.7795599326491356, + "reward_before_mean": 0.8948934227228165, + "reward_before_std": 0.7337003983557224, + "reward_change_max": 0.0004445314407348633, + "reward_change_mean": -0.2752421023324132, + "reward_change_min": -0.4447060916572809, + "reward_change_std": 0.17305165994912386, + "reward_std": 0.7795599810779095, + "rewards/cosine_scaled_reward": -0.010886628180742264, + "rewards/format_reward": 0.916666679084301, + "step": 442 + }, + { + "advantage_max": 1.57286237180233, + "advantage_mean": -3.9736430812453705e-08, + "advantage_min": -1.2943106442689896, + "advantage_std": 0.9998047053813934, + "completion_length": 1557.0208587646484, + "epoch": 0.5062857142857143, + "grad_norm": 2.567955732345581, + "kl": 0.8605499267578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0345, + "reward": 0.48115325393155217, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.48115325393155217, + "reward_after_std": 0.6308228820562363, + "reward_before_mean": 0.741462922655046, + "reward_before_std": 0.6177244447171688, + "reward_change_max": 0.0005853325128555298, + "reward_change_mean": -0.260309673845768, + "reward_change_min": -0.4191274531185627, + "reward_change_std": 0.1612487519159913, + "reward_std": 0.6308228969573975, + "rewards/cosine_scaled_reward": -0.07718522319191834, + "rewards/format_reward": 0.8958333432674408, + "step": 443 + }, + { + "advantage_max": 1.7246465533971786, + "advantage_mean": -1.9247333948868572e-08, + "advantage_min": -0.9812377840280533, + "advantage_std": 0.9998320639133453, + "completion_length": 1051.0000381469727, + "epoch": 0.5074285714285715, + "grad_norm": 2.602076768875122, + "kl": 0.476654052734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0191, + "reward": 0.5566119570285082, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5566119570285082, + "reward_after_std": 0.7091410793364048, + "reward_before_mean": 0.8225421812385321, + "reward_before_std": 0.6658905595541, + "reward_change_max": 0.0, + "reward_change_mean": -0.2659302204847336, + "reward_change_min": -0.4204677902162075, + "reward_change_std": 0.15753941144794226, + "reward_std": 0.7091410867869854, + "rewards/cosine_scaled_reward": -0.07831226149573922, + "rewards/format_reward": 0.9791666716337204, + "step": 444 + }, + { + "advantage_max": 1.588456466794014, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -1.1749219596385956, + "advantage_std": 0.9998450726270676, + "completion_length": 1186.7708435058594, + "epoch": 0.5085714285714286, + "grad_norm": 2.270045518875122, + "kl": 0.49578857421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0199, + "reward": 0.42726368457078934, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.42726368457078934, + "reward_after_std": 0.8470357619225979, + "reward_before_mean": 0.6616728538647294, + "reward_before_std": 0.8420430682599545, + "reward_change_max": 0.0025559216737747192, + "reward_change_mean": -0.2344091683626175, + "reward_change_min": -0.4344114977866411, + "reward_change_std": 0.16814141906797886, + "reward_std": 0.8470357991755009, + "rewards/cosine_scaled_reward": -0.12749691866338253, + "rewards/format_reward": 0.9166666865348816, + "step": 445 + }, + { + "advantage_max": 1.7508121132850647, + "advantage_mean": -9.313226689844356e-09, + "advantage_min": -1.0302319675683975, + "advantage_std": 0.9997759759426117, + "completion_length": 1124.0416870117188, + "epoch": 0.5097142857142857, + "grad_norm": 6.001957893371582, + "kl": 0.41686248779296875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0167, + "reward": 0.5921612880192697, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5921612880192697, + "reward_after_std": 0.5231526885181665, + "reward_before_mean": 0.8797222785651684, + "reward_before_std": 0.45472224801778793, + "reward_change_max": 0.0, + "reward_change_mean": -0.28756098076701164, + "reward_change_min": -0.42036930844187737, + "reward_change_std": 0.15772765688598156, + "reward_std": 0.5231527201831341, + "rewards/cosine_scaled_reward": -0.049722205847501755, + "rewards/format_reward": 0.9791666716337204, + "step": 446 + }, + { + "advantage_max": 1.6464007794857025, + "advantage_mean": 1.6608586214661436e-08, + "advantage_min": -1.2194004356861115, + "advantage_std": 0.9997731596231461, + "completion_length": 929.0416870117188, + "epoch": 0.5108571428571429, + "grad_norm": 4.027428150177002, + "kl": 0.464569091796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0186, + "reward": 0.49217029428109527, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.49217029428109527, + "reward_after_std": 0.4973028674721718, + "reward_before_mean": 0.7639967501163483, + "reward_before_std": 0.461477916687727, + "reward_change_max": 0.0, + "reward_change_mean": -0.2718264479190111, + "reward_change_min": -0.39409972354769707, + "reward_change_std": 0.15628806222230196, + "reward_std": 0.4973028898239136, + "rewards/cosine_scaled_reward": -0.06591829285025597, + "rewards/format_reward": 0.8958333507180214, + "step": 447 + }, + { + "advantage_max": 1.6986867785453796, + "advantage_mean": -5.5879357807597785e-08, + "advantage_min": -0.9982610195875168, + "advantage_std": 0.9998103454709053, + "completion_length": 972.4583587646484, + "epoch": 0.512, + "grad_norm": 2.053443193435669, + "kl": 0.65228271484375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0261, + "reward": 0.48171089054085314, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.48171089054085314, + "reward_after_std": 0.5779558382928371, + "reward_before_mean": 0.7415714748203754, + "reward_before_std": 0.5251903366297483, + "reward_change_max": 0.0009759292006492615, + "reward_change_mean": -0.25986063852906227, + "reward_change_min": -0.40990063920617104, + "reward_change_std": 0.15151193924248219, + "reward_std": 0.577955849468708, + "rewards/cosine_scaled_reward": -0.06671425537206233, + "rewards/format_reward": 0.8750000074505806, + "step": 448 + }, + { + "advantage_max": 1.6733713150024414, + "advantage_mean": -3.352761368535795e-08, + "advantage_min": -1.0281840190291405, + "advantage_std": 0.9997676908969879, + "completion_length": 963.9792022705078, + "epoch": 0.5131428571428571, + "grad_norm": 3.2843992710113525, + "kl": 0.435089111328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0174, + "reward": 0.3808920937590301, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3808920937590301, + "reward_after_std": 0.5548666939139366, + "reward_before_mean": 0.6236836463212967, + "reward_before_std": 0.5186882894486189, + "reward_change_max": 0.0, + "reward_change_mean": -0.2427915446460247, + "reward_change_min": -0.3859280236065388, + "reward_change_std": 0.14151952601969242, + "reward_std": 0.5548667013645172, + "rewards/cosine_scaled_reward": -0.15690819779410958, + "rewards/format_reward": 0.9375000149011612, + "step": 449 + }, + { + "advantage_max": 1.6094936579465866, + "advantage_mean": -1.5522043428362053e-08, + "advantage_min": -1.2767504006624222, + "advantage_std": 0.9997935369610786, + "completion_length": 992.1458549499512, + "epoch": 0.5142857142857142, + "grad_norm": 0.9147229790687561, + "kl": 0.232177734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0093, + "reward": 0.4996907636523247, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4996907636523247, + "reward_after_std": 0.5958952084183693, + "reward_before_mean": 0.7667517811059952, + "reward_before_std": 0.5675084516406059, + "reward_change_max": 0.0006358623504638672, + "reward_change_mean": -0.2670610249042511, + "reward_change_min": -0.40510354191064835, + "reward_change_std": 0.15649499371647835, + "reward_std": 0.5958952307701111, + "rewards/cosine_scaled_reward": -0.09579078573733568, + "rewards/format_reward": 0.9583333432674408, + "step": 450 + }, + { + "advantage_max": 1.7193011492490768, + "advantage_mean": -4.8428775767384025e-08, + "advantage_min": -1.0626923367381096, + "advantage_std": 0.9998032078146935, + "completion_length": 863.7708587646484, + "epoch": 0.5154285714285715, + "grad_norm": 2.7563045024871826, + "kl": 0.323333740234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0129, + "reward": 0.5790690593421459, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5790690593421459, + "reward_after_std": 0.6669084951281548, + "reward_before_mean": 0.8539356961846352, + "reward_before_std": 0.6240298300981522, + "reward_change_max": 0.0, + "reward_change_mean": -0.27486664801836014, + "reward_change_min": -0.42612981237471104, + "reward_change_std": 0.1603500172495842, + "reward_std": 0.666908498853445, + "rewards/cosine_scaled_reward": -0.052198843099176884, + "rewards/format_reward": 0.9583333432674408, + "step": 451 + }, + { + "advantage_max": 1.642724797129631, + "advantage_mean": -5.3395828869540196e-08, + "advantage_min": -1.1115128174424171, + "advantage_std": 0.9997927471995354, + "completion_length": 1056.7500381469727, + "epoch": 0.5165714285714286, + "grad_norm": 3.9191229343414307, + "kl": 0.5685272216796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0227, + "reward": 0.6581305470317602, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.6581305470317602, + "reward_after_std": 0.5201870016753674, + "reward_before_mean": 0.9607983604073524, + "reward_before_std": 0.4658200293779373, + "reward_change_max": 0.0, + "reward_change_mean": -0.3026678394526243, + "reward_change_min": -0.4494563788175583, + "reward_change_std": 0.16615933552384377, + "reward_std": 0.5201870128512383, + "rewards/cosine_scaled_reward": 0.011649169027805328, + "rewards/format_reward": 0.9375000149011612, + "step": 452 + }, + { + "advantage_max": 1.6432772874832153, + "advantage_mean": -4.346172111091562e-08, + "advantage_min": -1.0461668819189072, + "advantage_std": 0.9998030662536621, + "completion_length": 1105.0625305175781, + "epoch": 0.5177142857142857, + "grad_norm": 1.6952801942825317, + "kl": 0.33880615234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0135, + "reward": 0.6465382017195225, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6465382017195225, + "reward_after_std": 0.6454170644283295, + "reward_before_mean": 0.9408576935529709, + "reward_before_std": 0.6176839880645275, + "reward_change_max": 0.0, + "reward_change_mean": -0.2943194881081581, + "reward_change_min": -0.4937918931245804, + "reward_change_std": 0.17771916277706623, + "reward_std": 0.645417083054781, + "rewards/cosine_scaled_reward": -0.019154516980051994, + "rewards/format_reward": 0.9791666716337204, + "step": 453 + }, + { + "advantage_max": 1.5095160454511642, + "advantage_mean": -3.9736431256542915e-08, + "advantage_min": -1.3069396615028381, + "advantage_std": 0.99983149766922, + "completion_length": 1135.6875305175781, + "epoch": 0.5188571428571429, + "grad_norm": 3.1263599395751953, + "kl": 0.41485595703125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0166, + "reward": 0.4435289604589343, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4435289604589343, + "reward_after_std": 0.6856096424162388, + "reward_before_mean": 0.6957365237176418, + "reward_before_std": 0.6848980709910393, + "reward_change_max": 0.0010104477405548096, + "reward_change_mean": -0.25220758467912674, + "reward_change_min": -0.42572787031531334, + "reward_change_std": 0.16586639359593391, + "reward_std": 0.6856096535921097, + "rewards/cosine_scaled_reward": -0.10004841070622206, + "rewards/format_reward": 0.895833358168602, + "step": 454 + }, + { + "advantage_max": 1.5948998034000397, + "advantage_mean": -1.1796753074388988e-08, + "advantage_min": -1.2343806698918343, + "advantage_std": 0.9997700154781342, + "completion_length": 1371.770851135254, + "epoch": 0.52, + "grad_norm": 3.6875624656677246, + "kl": 0.72686767578125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.220245676671809e-07, + "loss": 0.029, + "reward": 0.27308704424649477, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.27308704424649477, + "reward_after_std": 0.4806528389453888, + "reward_before_mean": 0.4997571799904108, + "reward_before_std": 0.45495717599987984, + "reward_change_max": 0.0, + "reward_change_mean": -0.22667013481259346, + "reward_change_min": -0.3506789766252041, + "reward_change_std": 0.13179008476436138, + "reward_std": 0.48065285384655, + "rewards/cosine_scaled_reward": -0.22928809002041817, + "rewards/format_reward": 0.9583333432674408, + "step": 455 + }, + { + "advantage_max": 1.75045645236969, + "advantage_mean": -2.0489097418696645e-08, + "advantage_min": -0.8442424722015858, + "advantage_std": 0.9998552426695824, + "completion_length": 1283.333366394043, + "epoch": 0.5211428571428571, + "grad_norm": 1.8698930740356445, + "kl": 0.563690185546875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0225, + "reward": 0.4764702459797263, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4764702459797263, + "reward_after_std": 0.8666420541703701, + "reward_before_mean": 0.7162495022639632, + "reward_before_std": 0.8417605478316545, + "reward_change_max": 0.0, + "reward_change_mean": -0.2397792637348175, + "reward_change_min": -0.431858966127038, + "reward_change_std": 0.16008818428963423, + "reward_std": 0.8666420765221119, + "rewards/cosine_scaled_reward": -0.05854193802224472, + "rewards/format_reward": 0.833333358168602, + "step": 456 + }, + { + "advantage_max": 1.657249003648758, + "advantage_mean": -5.4637594004702805e-08, + "advantage_min": -1.111772559583187, + "advantage_std": 0.9997870773077011, + "completion_length": 1104.604190826416, + "epoch": 0.5222857142857142, + "grad_norm": 2.366246461868286, + "kl": 0.4698486328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0188, + "reward": 0.5776587019208819, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5776587019208819, + "reward_after_std": 0.6012655943632126, + "reward_before_mean": 0.860280629247427, + "reward_before_std": 0.5697008725255728, + "reward_change_max": 0.0, + "reward_change_mean": -0.2826219145208597, + "reward_change_min": -0.46650177612900734, + "reward_change_std": 0.1752561703324318, + "reward_std": 0.6012655980885029, + "rewards/cosine_scaled_reward": -0.007359715178608894, + "rewards/format_reward": 0.8750000149011612, + "step": 457 + }, + { + "advantage_max": 1.7160254791378975, + "advantage_mean": -1.0244548542814869e-08, + "advantage_min": -0.9474887922406197, + "advantage_std": 0.9998100101947784, + "completion_length": 939.5625305175781, + "epoch": 0.5234285714285715, + "grad_norm": 0.805923581123352, + "kl": 0.1466064453125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0059, + "reward": 0.37487271800637245, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37487271800637245, + "reward_after_std": 0.6352785937488079, + "reward_before_mean": 0.6104363277554512, + "reward_before_std": 0.5971984900534153, + "reward_change_max": 0.0, + "reward_change_mean": -0.2355636265128851, + "reward_change_min": -0.3710940182209015, + "reward_change_std": 0.1407453790307045, + "reward_std": 0.6352786086499691, + "rewards/cosine_scaled_reward": -0.18436517822556198, + "rewards/format_reward": 0.9791666716337204, + "step": 458 + }, + { + "advantage_max": 1.5805244594812393, + "advantage_mean": -3.228585032655218e-08, + "advantage_min": -1.0805974081158638, + "advantage_std": 0.9998439848423004, + "completion_length": 1056.4167022705078, + "epoch": 0.5245714285714286, + "grad_norm": 1.8266743421554565, + "kl": 0.5707550048828125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0228, + "reward": 0.5873242821544409, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5873242821544409, + "reward_after_std": 0.8205331340432167, + "reward_before_mean": 0.8586870357394218, + "reward_before_std": 0.8227408565580845, + "reward_change_max": 0.0006539598107337952, + "reward_change_mean": -0.27136277966201305, + "reward_change_min": -0.48420753702521324, + "reward_change_std": 0.18666737619787455, + "reward_std": 0.8205331601202488, + "rewards/cosine_scaled_reward": -0.04982315469533205, + "rewards/format_reward": 0.9583333432674408, + "step": 459 + }, + { + "advantage_max": 1.7686534374952316, + "advantage_mean": -2.2351742789972207e-08, + "advantage_min": -1.0162685364484787, + "advantage_std": 0.999843567609787, + "completion_length": 1311.833396911621, + "epoch": 0.5257142857142857, + "grad_norm": 1.6238162517547607, + "kl": 0.510040283203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0204, + "reward": 0.5407150648534298, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5407150648534298, + "reward_after_std": 0.8093853369355202, + "reward_before_mean": 0.7947937436401844, + "reward_before_std": 0.7604817524552345, + "reward_change_max": 0.0006181150674819946, + "reward_change_mean": -0.2540786974132061, + "reward_change_min": -0.4123247377574444, + "reward_change_std": 0.16012915410101414, + "reward_std": 0.8093853667378426, + "rewards/cosine_scaled_reward": -0.04010312771424651, + "rewards/format_reward": 0.8750000149011612, + "step": 460 + }, + { + "advantage_max": 1.6886788457632065, + "advantage_mean": 2.2662183352117893e-08, + "advantage_min": -0.9786590412259102, + "advantage_std": 0.99976596981287, + "completion_length": 1219.500015258789, + "epoch": 0.5268571428571428, + "grad_norm": 1.783687710762024, + "kl": 0.5525741577148438, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0221, + "reward": 0.4577493495307863, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4577493495307863, + "reward_after_std": 0.49723924323916435, + "reward_before_mean": 0.72218307107687, + "reward_before_std": 0.4564433563500643, + "reward_change_max": 0.0, + "reward_change_mean": -0.26443369407206774, + "reward_change_min": -0.40887835435569286, + "reward_change_std": 0.15175122302025557, + "reward_std": 0.49723926186561584, + "rewards/cosine_scaled_reward": -0.09724180959165096, + "rewards/format_reward": 0.9166666679084301, + "step": 461 + }, + { + "advantage_max": 1.8399271368980408, + "advantage_mean": -4.842877565636172e-08, + "advantage_min": -0.8609659969806671, + "advantage_std": 0.9997665509581566, + "completion_length": 1047.0000457763672, + "epoch": 0.528, + "grad_norm": 2.1252048015594482, + "kl": 0.381317138671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0153, + "reward": 0.3878229036927223, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3878229036927223, + "reward_after_std": 0.6147576775401831, + "reward_before_mean": 0.6220925338566303, + "reward_before_std": 0.5424556620419025, + "reward_change_max": 0.0, + "reward_change_mean": -0.2342696338891983, + "reward_change_min": -0.3471482917666435, + "reward_change_std": 0.1255068052560091, + "reward_std": 0.6147576849907637, + "rewards/cosine_scaled_reward": -0.18895374238491058, + "rewards/format_reward": 1.0, + "step": 462 + }, + { + "advantage_max": 1.5150933861732483, + "advantage_mean": -5.1533183498264634e-08, + "advantage_min": -1.2166873961687088, + "advantage_std": 0.9998409226536751, + "completion_length": 1437.9583892822266, + "epoch": 0.5291428571428571, + "grad_norm": 2.537919759750366, + "kl": 0.34942626953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.014, + "reward": 0.6093145990744233, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6093145990744233, + "reward_after_std": 0.7604724206030369, + "reward_before_mean": 0.8918596671428531, + "reward_before_std": 0.7663224712014198, + "reward_change_max": 0.0, + "reward_change_mean": -0.28254508040845394, + "reward_change_min": -0.49530068039894104, + "reward_change_std": 0.18866461794823408, + "reward_std": 0.760472446680069, + "rewards/cosine_scaled_reward": -0.001986853778362274, + "rewards/format_reward": 0.8958333507180214, + "step": 463 + }, + { + "advantage_max": 1.539069339632988, + "advantage_mean": -9.002784961964494e-08, + "advantage_min": -1.322128288447857, + "advantage_std": 0.9997992739081383, + "completion_length": 872.5208740234375, + "epoch": 0.5302857142857142, + "grad_norm": 1.7395652532577515, + "kl": 0.36365509033203125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0145, + "reward": 0.834553528111428, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.834553528111428, + "reward_after_std": 0.5309878177940845, + "reward_before_mean": 1.178483560681343, + "reward_before_std": 0.4831850230693817, + "reward_change_max": 0.001415453851222992, + "reward_change_mean": -0.34393005445599556, + "reward_change_min": -0.48923714458942413, + "reward_change_std": 0.19740524981170893, + "reward_std": 0.5309878475964069, + "rewards/cosine_scaled_reward": 0.12049175798892975, + "rewards/format_reward": 0.9375000149011612, + "step": 464 + }, + { + "advantage_max": 1.5249205529689789, + "advantage_mean": -3.849466712058103e-08, + "advantage_min": -1.2455387338995934, + "advantage_std": 0.999851331114769, + "completion_length": 1280.9375534057617, + "epoch": 0.5314285714285715, + "grad_norm": 2.718585968017578, + "kl": 0.674560546875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.027, + "reward": 0.4109771801158786, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4109771801158786, + "reward_after_std": 0.740178968757391, + "reward_before_mean": 0.6509956270456314, + "reward_before_std": 0.7452997379004955, + "reward_change_max": 0.0005368664860725403, + "reward_change_mean": -0.24001849628984928, + "reward_change_min": -0.4369941111654043, + "reward_change_std": 0.16760590951889753, + "reward_std": 0.7401789985597134, + "rewards/cosine_scaled_reward": -0.1015855111181736, + "rewards/format_reward": 0.8541666865348816, + "step": 465 + }, + { + "advantage_max": 1.6975940018892288, + "advantage_mean": -3.042320506629892e-08, + "advantage_min": -0.9658067002892494, + "advantage_std": 0.9998613074421883, + "completion_length": 1185.2083702087402, + "epoch": 0.5325714285714286, + "grad_norm": 1.249248743057251, + "kl": 0.25506591796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0102, + "reward": 0.799970980733633, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.799970980733633, + "reward_after_std": 0.8710318058729172, + "reward_before_mean": 1.106414571404457, + "reward_before_std": 0.8215276412665844, + "reward_change_max": 0.0, + "reward_change_mean": -0.3064435701817274, + "reward_change_min": -0.5177381709218025, + "reward_change_std": 0.1868924666196108, + "reward_std": 0.8710318133234978, + "rewards/cosine_scaled_reward": 0.06362392473965883, + "rewards/format_reward": 0.9791666716337204, + "step": 466 + }, + { + "advantage_max": 1.846468836069107, + "advantage_mean": -3.7252906315288215e-09, + "advantage_min": -0.811167448759079, + "advantage_std": 0.9998011738061905, + "completion_length": 1365.5208892822266, + "epoch": 0.5337142857142857, + "grad_norm": 1.8020029067993164, + "kl": 0.4116058349609375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0165, + "reward": 0.2844287045300007, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2844287045300007, + "reward_after_std": 0.6075214967131615, + "reward_before_mean": 0.49876469373703003, + "reward_before_std": 0.5446117036044598, + "reward_change_max": 0.0, + "reward_change_mean": -0.2143359947949648, + "reward_change_min": -0.34634244069457054, + "reward_change_std": 0.11892245709896088, + "reward_std": 0.6075215078890324, + "rewards/cosine_scaled_reward": -0.24020099081099033, + "rewards/format_reward": 0.9791666716337204, + "step": 467 + }, + { + "advantage_max": 1.6614352017641068, + "advantage_mean": -1.0554989382516311e-08, + "advantage_min": -0.9151201918721199, + "advantage_std": 0.9998412430286407, + "completion_length": 1222.1042251586914, + "epoch": 0.5348571428571428, + "grad_norm": 2.5508921146392822, + "kl": 0.562591552734375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0225, + "reward": 0.30447601340711117, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30447601340711117, + "reward_after_std": 0.7888620086014271, + "reward_before_mean": 0.5132625997066498, + "reward_before_std": 0.7809022553265095, + "reward_change_max": 0.0, + "reward_change_mean": -0.20878657698631287, + "reward_change_min": -0.42520975694060326, + "reward_change_std": 0.1491988254711032, + "reward_std": 0.7888620272278786, + "rewards/cosine_scaled_reward": -0.20170205205795355, + "rewards/format_reward": 0.9166666865348816, + "step": 468 + }, + { + "advantage_max": 1.7594375908374786, + "advantage_mean": 7.528191479921897e-09, + "advantage_min": -1.024169247597456, + "advantage_std": 0.9997720122337341, + "completion_length": 1013.3125343322754, + "epoch": 0.536, + "grad_norm": 1.44214928150177, + "kl": 0.382080078125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0152, + "reward": 0.42605833522975445, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.42605833522975445, + "reward_after_std": 0.5564350076019764, + "reward_before_mean": 0.6749656535685062, + "reward_before_std": 0.49210809590294957, + "reward_change_max": 0.0, + "reward_change_mean": -0.24890729784965515, + "reward_change_min": -0.36836569011211395, + "reward_change_std": 0.14090245869010687, + "reward_std": 0.556435015052557, + "rewards/cosine_scaled_reward": -0.12085053510963917, + "rewards/format_reward": 0.9166666716337204, + "step": 469 + }, + { + "advantage_max": 1.689832404255867, + "advantage_mean": -3.8494667453647935e-08, + "advantage_min": -0.9729603379964828, + "advantage_std": 0.9998282045125961, + "completion_length": 1316.8958587646484, + "epoch": 0.5371428571428571, + "grad_norm": 1.4733878374099731, + "kl": 0.545074462890625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0218, + "reward": 0.4230445548892021, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4230445548892021, + "reward_after_std": 0.7050646170973778, + "reward_before_mean": 0.6629515116801485, + "reward_before_std": 0.6786227710545063, + "reward_change_max": 0.0005048736929893494, + "reward_change_mean": -0.23990696109831333, + "reward_change_min": -0.3978872671723366, + "reward_change_std": 0.14986501820385456, + "reward_std": 0.7050646580755711, + "rewards/cosine_scaled_reward": -0.11644092667847872, + "rewards/format_reward": 0.8958333432674408, + "step": 470 + }, + { + "advantage_max": 1.5878158211708069, + "advantage_mean": -5.8983763762121555e-08, + "advantage_min": -1.0156637877225876, + "advantage_std": 0.9998416975140572, + "completion_length": 1333.4375381469727, + "epoch": 0.5382857142857143, + "grad_norm": 1.898854374885559, + "kl": 0.4473304748535156, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0179, + "reward": 0.9115908909589052, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.9115908909589052, + "reward_after_std": 0.8066898584365845, + "reward_before_mean": 1.250825822353363, + "reward_before_std": 0.7856942266225815, + "reward_change_max": 0.0, + "reward_change_mean": -0.33923495560884476, + "reward_change_min": -0.5825177431106567, + "reward_change_std": 0.2132352814078331, + "reward_std": 0.806689877063036, + "rewards/cosine_scaled_reward": 0.1566629009321332, + "rewards/format_reward": 0.9375000149011612, + "step": 471 + }, + { + "advantage_max": 1.7407121658325195, + "advantage_mean": -2.235174201281609e-08, + "advantage_min": -1.008700355887413, + "advantage_std": 0.999788723886013, + "completion_length": 1235.4375228881836, + "epoch": 0.5394285714285715, + "grad_norm": 2.009488344192505, + "kl": 0.3810272216796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0153, + "reward": 0.3421945869922638, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3421945869922638, + "reward_after_std": 0.5907893590629101, + "reward_before_mean": 0.573419526219368, + "reward_before_std": 0.5489665865898132, + "reward_change_max": 0.0, + "reward_change_mean": -0.23122494295239449, + "reward_change_min": -0.3739938288927078, + "reward_change_std": 0.1434150319546461, + "reward_std": 0.5907893814146519, + "rewards/cosine_scaled_reward": -0.17162358853965998, + "rewards/format_reward": 0.9166666716337204, + "step": 472 + }, + { + "advantage_max": 1.5840518921613693, + "advantage_mean": -2.3050233888266547e-08, + "advantage_min": -1.2334761917591095, + "advantage_std": 0.9997791424393654, + "completion_length": 1190.5000305175781, + "epoch": 0.5405714285714286, + "grad_norm": 1.7814481258392334, + "kl": 0.351806640625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0141, + "reward": 0.4066920541226864, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.4066920541226864, + "reward_after_std": 0.564589936286211, + "reward_before_mean": 0.6539732131641358, + "reward_before_std": 0.5371009334921837, + "reward_change_max": 0.0, + "reward_change_mean": -0.2472811546176672, + "reward_change_min": -0.39397401735186577, + "reward_change_std": 0.14508823212236166, + "reward_std": 0.5645899474620819, + "rewards/cosine_scaled_reward": -0.15218008181545883, + "rewards/format_reward": 0.9583333432674408, + "step": 473 + }, + { + "advantage_max": 1.506261795759201, + "advantage_mean": -6.829699006338785e-08, + "advantage_min": -1.307990886271, + "advantage_std": 0.9998388066887856, + "completion_length": 1047.6041984558105, + "epoch": 0.5417142857142857, + "grad_norm": 2.3977925777435303, + "kl": 0.4084930419921875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0163, + "reward": 0.994497782237886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.994497782237886, + "reward_after_std": 0.7347078956663609, + "reward_before_mean": 1.360306840389967, + "reward_before_std": 0.7280658148229122, + "reward_change_max": 0.0005192682147026062, + "reward_change_mean": -0.36580908484756947, + "reward_change_min": -0.5518549457192421, + "reward_change_std": 0.22464457992464304, + "reward_std": 0.734707910567522, + "rewards/cosine_scaled_reward": 0.2009867411106825, + "rewards/format_reward": 0.9583333432674408, + "step": 474 + }, + { + "advantage_max": 1.69073885679245, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -0.9535181820392609, + "advantage_std": 0.9998545795679092, + "completion_length": 1170.8750228881836, + "epoch": 0.5428571428571428, + "grad_norm": 1.6326664686203003, + "kl": 0.35323333740234375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0141, + "reward": 0.3926006439141929, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3926006439141929, + "reward_after_std": 0.876279816031456, + "reward_before_mean": 0.6156278997659683, + "reward_before_std": 0.8685860857367516, + "reward_change_max": 0.0, + "reward_change_mean": -0.22302727587521076, + "reward_change_min": -0.41833243519067764, + "reward_change_std": 0.15927229821681976, + "reward_std": 0.8762798272073269, + "rewards/cosine_scaled_reward": -0.09843605477362871, + "rewards/format_reward": 0.8125000186264515, + "step": 475 + }, + { + "advantage_max": 1.7651716619729996, + "advantage_mean": -1.8626451714354175e-08, + "advantage_min": -0.993249699473381, + "advantage_std": 0.9998532608151436, + "completion_length": 1432.8958587646484, + "epoch": 0.544, + "grad_norm": 3.7423794269561768, + "kl": 1.0514984130859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.063017833182728e-07, + "loss": 0.042, + "reward": 0.4363984651863575, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4363984651863575, + "reward_after_std": 0.8045771718025208, + "reward_before_mean": 0.6670037880539894, + "reward_before_std": 0.7607868574559689, + "reward_change_max": 0.002155087888240814, + "reward_change_mean": -0.2306053228676319, + "reward_change_min": -0.38322553038597107, + "reward_change_std": 0.14566559065133333, + "reward_std": 0.8045771829783916, + "rewards/cosine_scaled_reward": -0.0935814508702606, + "rewards/format_reward": 0.8541666828095913, + "step": 476 + }, + { + "advantage_max": 1.626573994755745, + "advantage_mean": -6.953875364423823e-08, + "advantage_min": -0.8963503763079643, + "advantage_std": 0.9998547807335854, + "completion_length": 926.833366394043, + "epoch": 0.5451428571428572, + "grad_norm": 1.9580730199813843, + "kl": 0.38103485107421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0152, + "reward": 0.7679003030061722, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7679003030061722, + "reward_after_std": 0.837723758071661, + "reward_before_mean": 1.074661746621132, + "reward_before_std": 0.8077759854495525, + "reward_change_max": 0.0, + "reward_change_mean": -0.3067614659667015, + "reward_change_min": -0.5569753423333168, + "reward_change_std": 0.20120880007743835, + "reward_std": 0.8377237804234028, + "rewards/cosine_scaled_reward": 0.05816417885944247, + "rewards/format_reward": 0.9583333358168602, + "step": 477 + }, + { + "advantage_max": 1.6303537040948868, + "advantage_mean": -9.313226079221693e-09, + "advantage_min": -1.1908142790198326, + "advantage_std": 0.9998129531741142, + "completion_length": 1573.2292175292969, + "epoch": 0.5462857142857143, + "grad_norm": 1.9702447652816772, + "kl": 0.5894775390625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0235, + "reward": 0.5314593832008541, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5314593832008541, + "reward_after_std": 0.5914321132004261, + "reward_before_mean": 0.8025474827736616, + "reward_before_std": 0.5585166476666927, + "reward_change_max": 0.0, + "reward_change_mean": -0.2710880674421787, + "reward_change_min": -0.42790525406599045, + "reward_change_std": 0.1580025451257825, + "reward_std": 0.5914321169257164, + "rewards/cosine_scaled_reward": -0.03622628003358841, + "rewards/format_reward": 0.8750000074505806, + "step": 478 + }, + { + "advantage_max": 1.6044540852308273, + "advantage_mean": -2.545615118698663e-08, + "advantage_min": -1.003111258149147, + "advantage_std": 0.9998004958033562, + "completion_length": 1238.7917098999023, + "epoch": 0.5474285714285714, + "grad_norm": 2.2569944858551025, + "kl": 0.76922607421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0308, + "reward": 0.34396560629829764, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.34396560629829764, + "reward_after_std": 0.650309395045042, + "reward_before_mean": 0.5750420242547989, + "reward_before_std": 0.6423451155424118, + "reward_change_max": 0.0, + "reward_change_mean": -0.23107640631496906, + "reward_change_min": -0.416254710406065, + "reward_change_std": 0.15591457672417164, + "reward_std": 0.650309432297945, + "rewards/cosine_scaled_reward": -0.16039567068219185, + "rewards/format_reward": 0.8958333432674408, + "step": 479 + }, + { + "advantage_max": 1.6508120000362396, + "advantage_mean": -1.3038516155639002e-08, + "advantage_min": -1.1292091310024261, + "advantage_std": 0.999795213341713, + "completion_length": 1310.0625228881836, + "epoch": 0.5485714285714286, + "grad_norm": 2.6421027183532715, + "kl": 0.9313812255859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0373, + "reward": 0.49241532757878304, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.49241532757878304, + "reward_after_std": 0.6414160765707493, + "reward_before_mean": 0.7527041956782341, + "reward_before_std": 0.6134338118135929, + "reward_change_max": 0.0, + "reward_change_mean": -0.260288855060935, + "reward_change_min": -0.4212849773466587, + "reward_change_std": 0.1581674963235855, + "reward_std": 0.6414160802960396, + "rewards/cosine_scaled_reward": -0.09239792544394732, + "rewards/format_reward": 0.9375000149011612, + "step": 480 + }, + { + "advantage_max": 1.6010807305574417, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -1.1426882520318031, + "advantage_std": 0.9998107254505157, + "completion_length": 1488.4583740234375, + "epoch": 0.5497142857142857, + "grad_norm": 1.4233379364013672, + "kl": 0.5397415161132812, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0216, + "reward": 0.37583464104682207, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37583464104682207, + "reward_after_std": 0.6667274013161659, + "reward_before_mean": 0.6133236847817898, + "reward_before_std": 0.6600178182125092, + "reward_change_max": 0.0, + "reward_change_mean": -0.2374890260398388, + "reward_change_min": -0.4234742745757103, + "reward_change_std": 0.16119471471756697, + "reward_std": 0.6667274124920368, + "rewards/cosine_scaled_reward": -0.12042150646448135, + "rewards/format_reward": 0.8541666865348816, + "step": 481 + }, + { + "advantage_max": 1.5314403101801872, + "advantage_mean": -3.0423204844254315e-08, + "advantage_min": -1.2384950369596481, + "advantage_std": 0.9998578727245331, + "completion_length": 1068.645851135254, + "epoch": 0.5508571428571428, + "grad_norm": 3.218018054962158, + "kl": 0.6636505126953125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0265, + "reward": 0.7987027624621987, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7987027624621987, + "reward_after_std": 0.8954098485410213, + "reward_before_mean": 1.1136324778199196, + "reward_before_std": 0.9130747206509113, + "reward_change_max": 0.0013197064399719238, + "reward_change_mean": -0.3149297498166561, + "reward_change_min": -0.5331053957343102, + "reward_change_std": 0.21649126335978508, + "reward_std": 0.8954098559916019, + "rewards/cosine_scaled_reward": 0.09848290542140603, + "rewards/format_reward": 0.9166666865348816, + "step": 482 + }, + { + "advantage_max": 1.6196284890174866, + "advantage_mean": -5.5879355587151736e-09, + "advantage_min": -1.110151432454586, + "advantage_std": 0.9998453184962273, + "completion_length": 1403.6041870117188, + "epoch": 0.552, + "grad_norm": 2.3577053546905518, + "kl": 0.647979736328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.026, + "reward": 0.6319026295095682, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6319026295095682, + "reward_after_std": 0.8189057596027851, + "reward_before_mean": 0.9091856814920902, + "reward_before_std": 0.7972288802266121, + "reward_change_max": 0.0, + "reward_change_mean": -0.27728303894400597, + "reward_change_min": -0.4613271728157997, + "reward_change_std": 0.1789004895836115, + "reward_std": 0.8189057968556881, + "rewards/cosine_scaled_reward": -0.0037405104376375675, + "rewards/format_reward": 0.916666679084301, + "step": 483 + }, + { + "advantage_max": 1.5716918855905533, + "advantage_mean": -4.96705393482344e-08, + "advantage_min": -1.2597123309969902, + "advantage_std": 0.999833457171917, + "completion_length": 1248.9375381469727, + "epoch": 0.5531428571428572, + "grad_norm": 1.623803973197937, + "kl": 0.68231201171875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0274, + "reward": 0.5417165439575911, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5417165439575911, + "reward_after_std": 0.7372452020645142, + "reward_before_mean": 0.8069544602185488, + "reward_before_std": 0.7322539314627647, + "reward_change_max": 0.0, + "reward_change_mean": -0.2652379460632801, + "reward_change_min": -0.44489338994026184, + "reward_change_std": 0.17211270052939653, + "reward_std": 0.7372452132403851, + "rewards/cosine_scaled_reward": -0.023606109898537397, + "rewards/format_reward": 0.8541666828095913, + "step": 484 + }, + { + "advantage_max": 1.7099245637655258, + "advantage_mean": -1.4280279625467074e-08, + "advantage_min": -1.1274260729551315, + "advantage_std": 0.9997733682394028, + "completion_length": 965.2708587646484, + "epoch": 0.5542857142857143, + "grad_norm": 2.46708345413208, + "kl": 0.3107147216796875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0124, + "reward": 0.5324733089655638, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5324733089655638, + "reward_after_std": 0.5767040736973286, + "reward_before_mean": 0.8058453351259232, + "reward_before_std": 0.5237753307446837, + "reward_change_max": 0.0, + "reward_change_mean": -0.27337202802300453, + "reward_change_min": -0.4182005673646927, + "reward_change_std": 0.15673528984189034, + "reward_std": 0.5767040941864252, + "rewards/cosine_scaled_reward": -0.07624401268549263, + "rewards/format_reward": 0.9583333432674408, + "step": 485 + }, + { + "advantage_max": 1.765414834022522, + "advantage_mean": 4.346172255420555e-09, + "advantage_min": -0.9041767120361328, + "advantage_std": 0.9997631907463074, + "completion_length": 862.6041793823242, + "epoch": 0.5554285714285714, + "grad_norm": 2.2586798667907715, + "kl": 0.461151123046875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0185, + "reward": 0.5346746001159772, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5346746001159772, + "reward_after_std": 0.5360549800097942, + "reward_before_mean": 0.8094351254403591, + "reward_before_std": 0.4725265856832266, + "reward_change_max": 0.0, + "reward_change_mean": -0.2747605200856924, + "reward_change_min": -0.43837128207087517, + "reward_change_std": 0.15725676529109478, + "reward_std": 0.5360549874603748, + "rewards/cosine_scaled_reward": -0.06403244659304619, + "rewards/format_reward": 0.9375000149011612, + "step": 486 + }, + { + "advantage_max": 1.5992169827222824, + "advantage_mean": -3.476937671109681e-08, + "advantage_min": -1.1532378867268562, + "advantage_std": 0.9998497292399406, + "completion_length": 1159.229206085205, + "epoch": 0.5565714285714286, + "grad_norm": 1.466489315032959, + "kl": 0.40506744384765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0162, + "reward": 0.7347278879024088, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7347278879024088, + "reward_after_std": 0.8287507370114326, + "reward_before_mean": 1.0370606668293476, + "reward_before_std": 0.8203627225011587, + "reward_change_max": 0.0031530559062957764, + "reward_change_mean": -0.30233279056847095, + "reward_change_min": -0.5047247745096684, + "reward_change_std": 0.20336042065173388, + "reward_std": 0.8287507519125938, + "rewards/cosine_scaled_reward": 0.08103030489291996, + "rewards/format_reward": 0.8750000149011612, + "step": 487 + }, + { + "advantage_max": 1.5731254816055298, + "advantage_mean": -6.8296991950766994e-09, + "advantage_min": -1.1291136145591736, + "advantage_std": 0.9997728988528252, + "completion_length": 994.7708511352539, + "epoch": 0.5577142857142857, + "grad_norm": 1.401617407798767, + "kl": 0.2291259765625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0092, + "reward": 0.3939766474068165, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.3939766474068165, + "reward_after_std": 0.541620772331953, + "reward_before_mean": 0.6443051844835281, + "reward_before_std": 0.5205097924917936, + "reward_change_max": 0.0014897137880325317, + "reward_change_mean": -0.2503285203129053, + "reward_change_min": -0.3798618447035551, + "reward_change_std": 0.1473851716145873, + "reward_std": 0.5416207909584045, + "rewards/cosine_scaled_reward": -0.16743076220154762, + "rewards/format_reward": 0.9791666716337204, + "step": 488 + }, + { + "advantage_max": 1.7112076729536057, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.9740470126271248, + "advantage_std": 0.9997779279947281, + "completion_length": 1289.8958892822266, + "epoch": 0.5588571428571428, + "grad_norm": 2.128153085708618, + "kl": 0.60748291015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0243, + "reward": 0.18788336508441716, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18788336508441716, + "reward_after_std": 0.49343743547797203, + "reward_before_mean": 0.39214257523417473, + "reward_before_std": 0.45715833082795143, + "reward_change_max": 0.0, + "reward_change_mean": -0.20425921119749546, + "reward_change_min": -0.33435640670359135, + "reward_change_std": 0.11896615382283926, + "reward_std": 0.4934374466538429, + "rewards/cosine_scaled_reward": -0.25184538774192333, + "rewards/format_reward": 0.8958333432674408, + "step": 489 + }, + { + "advantage_max": 1.7367768734693527, + "advantage_mean": -4.7187011076310625e-08, + "advantage_min": -0.9195685312151909, + "advantage_std": 0.9997827708721161, + "completion_length": 1221.7500228881836, + "epoch": 0.56, + "grad_norm": 1.6772288084030151, + "kl": 0.43460845947265625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0173, + "reward": 0.7127898004837334, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7127898004837334, + "reward_after_std": 0.6761909052729607, + "reward_before_mean": 1.0172344259917736, + "reward_before_std": 0.6181280873715878, + "reward_change_max": 0.0, + "reward_change_mean": -0.3044446799904108, + "reward_change_min": -0.4935770258307457, + "reward_change_std": 0.18400804046541452, + "reward_std": 0.6761909127235413, + "rewards/cosine_scaled_reward": 0.019033881602808833, + "rewards/format_reward": 0.9791666716337204, + "step": 490 + }, + { + "advantage_max": 1.6076852083206177, + "advantage_mean": -2.0178655746327934e-08, + "advantage_min": -1.1311817914247513, + "advantage_std": 0.999858483672142, + "completion_length": 1386.5833740234375, + "epoch": 0.5611428571428572, + "grad_norm": 1.6963690519332886, + "kl": 0.5140380859375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0206, + "reward": 0.7685029455460608, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7685029455460608, + "reward_after_std": 0.8746408671140671, + "reward_before_mean": 1.072487998753786, + "reward_before_std": 0.8700667172670364, + "reward_change_max": 0.0, + "reward_change_mean": -0.3039850238710642, + "reward_change_min": -0.5531091075390577, + "reward_change_std": 0.20315628219395876, + "reward_std": 0.874640878289938, + "rewards/cosine_scaled_reward": 0.0883273258805275, + "rewards/format_reward": 0.8958333432674408, + "step": 491 + }, + { + "advantage_max": 1.6640974879264832, + "advantage_mean": -1.8626452047421083e-08, + "advantage_min": -0.9844799563288689, + "advantage_std": 0.9997724816203117, + "completion_length": 1141.6250228881836, + "epoch": 0.5622857142857143, + "grad_norm": 2.6036548614501953, + "kl": 0.2451629638671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0098, + "reward": 0.5456169964745641, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5456169964745641, + "reward_after_std": 0.6167141497135162, + "reward_before_mean": 0.8201209753751755, + "reward_before_std": 0.5838107857853174, + "reward_change_max": 0.0, + "reward_change_mean": -0.2745039686560631, + "reward_change_min": -0.4336605127900839, + "reward_change_std": 0.16109473910182714, + "reward_std": 0.6167141608893871, + "rewards/cosine_scaled_reward": -0.08993951743468642, + "rewards/format_reward": 1.0, + "step": 492 + }, + { + "advantage_max": 1.5081126242876053, + "advantage_mean": -3.2906732116977366e-08, + "advantage_min": -1.241824135184288, + "advantage_std": 0.9998714104294777, + "completion_length": 1327.3750305175781, + "epoch": 0.5634285714285714, + "grad_norm": 3.3362510204315186, + "kl": 0.88671875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0354, + "reward": 0.7321626851335168, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7321626851335168, + "reward_after_std": 0.9542999267578125, + "reward_before_mean": 1.0276228515431285, + "reward_before_std": 0.996599406003952, + "reward_change_max": 0.0023065879940986633, + "reward_change_mean": -0.29546014219522476, + "reward_change_min": -0.5584845636039972, + "reward_change_std": 0.22464005090296268, + "reward_std": 0.9542999565601349, + "rewards/cosine_scaled_reward": 0.08672806993126869, + "rewards/format_reward": 0.8541666865348816, + "step": 493 + }, + { + "advantage_max": 1.626434475183487, + "advantage_mean": -5.743156084037082e-08, + "advantage_min": -1.069405935704708, + "advantage_std": 0.9998456314206123, + "completion_length": 977.3333740234375, + "epoch": 0.5645714285714286, + "grad_norm": 0.7659087181091309, + "kl": 0.0702667236328125, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0028, + "reward": 0.7825715020298958, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.7825715020298958, + "reward_after_std": 0.7810787223279476, + "reward_before_mean": 1.0978356748819351, + "reward_before_std": 0.7559734731912613, + "reward_change_max": 0.0, + "reward_change_mean": -0.3152642250061035, + "reward_change_min": -0.5318855568766594, + "reward_change_std": 0.19349340070039034, + "reward_std": 0.7810787446796894, + "rewards/cosine_scaled_reward": 0.05933448998257518, + "rewards/format_reward": 0.9791666716337204, + "step": 494 + }, + { + "advantage_max": 1.5209117978811264, + "advantage_mean": -6.208815683805824e-10, + "advantage_min": -1.1773782223463058, + "advantage_std": 0.9998350664973259, + "completion_length": 1327.4167022705078, + "epoch": 0.5657142857142857, + "grad_norm": 1.8314779996871948, + "kl": 0.396820068359375, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0159, + "reward": 0.61646170867607, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.61646170867607, + "reward_after_std": 0.7499135285615921, + "reward_before_mean": 0.9014103151857853, + "reward_before_std": 0.7563531119376421, + "reward_change_max": 0.0020629242062568665, + "reward_change_mean": -0.2849485781043768, + "reward_change_min": -0.5066464394330978, + "reward_change_std": 0.19107064697891474, + "reward_std": 0.7499135434627533, + "rewards/cosine_scaled_reward": -0.02846152102574706, + "rewards/format_reward": 0.9583333432674408, + "step": 495 + }, + { + "advantage_max": 1.607502669095993, + "advantage_mean": -2.7629237009385577e-08, + "advantage_min": -1.1763157099485397, + "advantage_std": 0.9998272061347961, + "completion_length": 1260.8958930969238, + "epoch": 0.5668571428571428, + "grad_norm": 2.71171236038208, + "kl": 0.671295166015625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0268, + "reward": 0.5370115237310529, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5370115237310529, + "reward_after_std": 0.7076732777059078, + "reward_before_mean": 0.8070572554133832, + "reward_before_std": 0.7081652507185936, + "reward_change_max": 0.0, + "reward_change_mean": -0.2700457442551851, + "reward_change_min": -0.4546791072934866, + "reward_change_std": 0.17802696116268635, + "reward_std": 0.7076733037829399, + "rewards/cosine_scaled_reward": 0.0181119367480278, + "rewards/format_reward": 0.7708333395421505, + "step": 496 + }, + { + "advantage_max": 1.5937969386577606, + "advantage_mean": 3.725292629930266e-09, + "advantage_min": -1.0978671796619892, + "advantage_std": 0.9997798278927803, + "completion_length": 1065.6458587646484, + "epoch": 0.568, + "grad_norm": 2.594193696975708, + "kl": 0.5436172485351562, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0218, + "reward": 0.759913792979205, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.759913792979205, + "reward_after_std": 0.6107112001627684, + "reward_before_mean": 1.0830433431547135, + "reward_before_std": 0.5669339969754219, + "reward_change_max": 0.0005127191543579102, + "reward_change_mean": -0.32312954775989056, + "reward_change_min": -0.4868850149214268, + "reward_change_std": 0.19642097689211369, + "reward_std": 0.610711220651865, + "rewards/cosine_scaled_reward": 0.0936049991287291, + "rewards/format_reward": 0.8958333507180214, + "step": 497 + }, + { + "advantage_max": 1.6812223494052887, + "advantage_mean": -8.07146260939362e-09, + "advantage_min": -1.058426357805729, + "advantage_std": 0.9998469650745392, + "completion_length": 1336.208381652832, + "epoch": 0.5691428571428572, + "grad_norm": 2.6437041759490967, + "kl": 0.59625244140625, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0239, + "reward": 0.37023794968263246, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.37023794968263246, + "reward_after_std": 0.8235311470925808, + "reward_before_mean": 0.5918802302330732, + "reward_before_std": 0.8089180812239647, + "reward_change_max": 0.0, + "reward_change_mean": -0.2216422688215971, + "reward_change_min": -0.41504093259572983, + "reward_change_std": 0.15451766457408667, + "reward_std": 0.8235311731696129, + "rewards/cosine_scaled_reward": -0.14155989978462458, + "rewards/format_reward": 0.8750000074505806, + "step": 498 + }, + { + "advantage_max": 1.6329741179943085, + "advantage_mean": -1.8626452269465688e-08, + "advantage_min": -1.2069002091884613, + "advantage_std": 0.9998198002576828, + "completion_length": 1437.1458740234375, + "epoch": 0.5702857142857143, + "grad_norm": 1.9619033336639404, + "kl": 0.6727371215820312, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.027, + "reward": 0.5796591965481639, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5796591965481639, + "reward_after_std": 0.7735571078956127, + "reward_before_mean": 0.8483046852052212, + "reward_before_std": 0.7528968974947929, + "reward_change_max": 0.0, + "reward_change_mean": -0.2686454653739929, + "reward_change_min": -0.45184313133358955, + "reward_change_std": 0.17254280857741833, + "reward_std": 0.7735571376979351, + "rewards/cosine_scaled_reward": -0.013347673695534468, + "rewards/format_reward": 0.8750000149011612, + "step": 499 + }, + { + "advantage_max": 1.7250654101371765, + "advantage_mean": -2.7318796558262193e-08, + "advantage_min": -1.0749098509550095, + "advantage_std": 0.9998650997877121, + "completion_length": 1341.2708740234375, + "epoch": 0.5714285714285714, + "grad_norm": 1.3440532684326172, + "kl": 0.57421875, + "lambda_div_used": 0.7999999999999999, + "learning_rate": 1e-07, + "loss": 0.0229, + "reward": 0.5764958932995796, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5764958932995796, + "reward_after_std": 0.8865836374461651, + "reward_before_mean": 0.8342440668493509, + "reward_before_std": 0.8536566384136677, + "reward_change_max": 0.0, + "reward_change_mean": -0.25774817913770676, + "reward_change_min": -0.44375080429017544, + "reward_change_std": 0.16682033147662878, + "reward_std": 0.8865836411714554, + "rewards/cosine_scaled_reward": -0.020377989509142935, + "rewards/format_reward": 0.8750000111758709, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.005620188481842881, + "train_runtime": 52392.8619, + "train_samples_per_second": 0.458, + "train_steps_per_second": 0.01 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}