{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.5284520238637924, "advantage_mean": 5.587935225648266e-09, "advantage_min": -1.2071861922740936, "advantage_std": 0.9998291358351707, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.1954135149717331, "kl": 0.0, "lambda_div_used": 0.7999999999999999, "learning_rate": 2e-08, "loss": -0.0, "reward": 0.2781674414873123, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2781674414873123, "reward_after_std": 0.7989529222249985, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.00028071552515029907, "reward_change_mean": -0.2115972964093089, "reward_change_min": -0.4146200343966484, "reward_change_std": 0.16823832830414176, "reward_std": 0.7989529222249985, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 1.440006211400032, "advantage_mean": 1.8005570590062803e-08, "advantage_min": -1.196444258093834, "advantage_std": 0.9997469857335091, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18144740164279938, "kl": 0.0, "lambda_div_used": 0.7999999999999999, "learning_rate": 4e-08, "loss": -0.0, "reward": 0.07961943745613098, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07961943745613098, "reward_after_std": 0.42851265892386436, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0006531104445457458, "reward_change_mean": -0.19577811146155, "reward_change_min": -0.3188221678137779, "reward_change_std": 0.13006281899288297, "reward_std": 0.42851267009973526, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 1.746221885085106, "advantage_mean": 4.2219957641087547e-08, "advantage_min": -1.0095228850841522, "advantage_std": 0.9996554180979729, "completion_length": 3243.9166870117188, "epoch": 0.0034285714285714284, "grad_norm": 0.16465424001216888, "kl": 4.291534423828125e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.3449444258585572, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3449444258585572, "reward_after_std": 0.45981596130877733, "reward_before_mean": -0.24710791744291782, "reward_before_std": 0.44803297333419323, "reward_change_max": 0.0, "reward_change_mean": -0.09783650934696198, "reward_change_min": -0.18044066429138184, "reward_change_std": 0.0714771922212094, "reward_std": 0.4598159771412611, "rewards/cosine_scaled_reward": -0.21730396151542664, "rewards/format_reward": 0.1875000074505806, "step": 3 }, { "advantage_max": 1.7801509350538254, "advantage_mean": 4.346172077784871e-08, "advantage_min": -0.9519815891981125, "advantage_std": 0.9998303577303886, "completion_length": 2360.3333740234375, "epoch": 0.004571428571428572, "grad_norm": 0.19501709938049316, "kl": 3.4168362617492676e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.1274346588179469, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1274346588179469, "reward_after_std": 0.8090842105448246, "reward_before_mean": 0.29423846723511815, "reward_before_std": 0.7769927233457565, "reward_change_max": 0.0007443279027938843, "reward_change_mean": -0.1668038028292358, "reward_change_min": -0.27894750237464905, "reward_change_std": 0.10436896700412035, "reward_std": 0.8090842328965664, "rewards/cosine_scaled_reward": -0.14454743452370167, "rewards/format_reward": 0.5833333414047956, "step": 4 }, { "advantage_max": 1.8142333924770355, "advantage_mean": 2.8560559472978753e-08, "advantage_min": -0.8820345476269722, "advantage_std": 0.9997325539588928, "completion_length": 3316.5833435058594, "epoch": 0.005714285714285714, "grad_norm": 0.29815995693206787, "kl": 4.64441254734993e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.4195337798446417, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4195337798446417, "reward_after_std": 0.5661724396049976, "reward_before_mean": -0.3463286105543375, "reward_before_std": 0.5550692845135927, "reward_change_max": 5.933642387390137e-05, "reward_change_mean": -0.07320518855703995, "reward_change_min": -0.15249120816588402, "reward_change_std": 0.05859863373916596, "reward_std": 0.5661724433302879, "rewards/cosine_scaled_reward": -0.2564976374414982, "rewards/format_reward": 0.16666667349636555, "step": 5 }, { "advantage_max": 1.6999807804822922, "advantage_mean": 2.173086377510458e-09, "advantage_min": -1.0560009852051735, "advantage_std": 0.9997874870896339, "completion_length": 3148.7083740234375, "epoch": 0.006857142857142857, "grad_norm": 0.19052845239639282, "kl": 4.4211745262145996e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.17428898997604847, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.17428898997604847, "reward_after_std": 0.7152512725442648, "reward_before_mean": -0.060131706297397614, "reward_before_std": 0.7098329775035381, "reward_change_max": 0.00010123848915100098, "reward_change_mean": -0.1141572711057961, "reward_change_min": -0.2221994549036026, "reward_change_std": 0.09262027451768517, "reward_std": 0.7152512930333614, "rewards/cosine_scaled_reward": -0.1446491980459541, "rewards/format_reward": 0.22916667349636555, "step": 6 }, { "advantage_max": 1.6318527460098267, "advantage_mean": 8.692344177774203e-09, "advantage_min": -1.0107521638274193, "advantage_std": 0.9998287558555603, "completion_length": 3002.0209045410156, "epoch": 0.008, "grad_norm": 0.1495773047208786, "kl": 2.2746622562408447e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.2590144984424114, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2590144984424114, "reward_after_std": 0.7710397019982338, "reward_before_mean": 0.465796678327024, "reward_before_std": 0.7602494731545448, "reward_change_max": 0.00027186423540115356, "reward_change_mean": -0.20678219757974148, "reward_change_min": -0.4025501310825348, "reward_change_std": 0.1585400952026248, "reward_std": 0.7710397355258465, "rewards/cosine_scaled_reward": -0.06918500177562237, "rewards/format_reward": 0.6041666753590107, "step": 7 }, { "advantage_max": 1.5791463106870651, "advantage_mean": -2.235174290099451e-08, "advantage_min": -1.0492639392614365, "advantage_std": 0.9998195469379425, "completion_length": 2731.458366394043, "epoch": 0.009142857142857144, "grad_norm": 0.16950403153896332, "kl": 2.017989754676819e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.33847012650221586, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33847012650221586, "reward_after_std": 0.8393780812621117, "reward_before_mean": 0.5573682561516762, "reward_before_std": 0.8381953444331884, "reward_change_max": 0.0001522451639175415, "reward_change_mean": -0.21889818459749222, "reward_change_min": -0.3979425001889467, "reward_change_std": 0.17012304533272982, "reward_std": 0.839378122240305, "rewards/cosine_scaled_reward": 0.0807674679235788, "rewards/format_reward": 0.39583333767950535, "step": 8 }, { "advantage_max": 1.5046225041151047, "advantage_mean": -9.934107758624577e-09, "advantage_min": -1.0348439291119576, "advantage_std": 0.999826692044735, "completion_length": 3279.3959350585938, "epoch": 0.010285714285714285, "grad_norm": 0.1702805757522583, "kl": 4.1447579860687256e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 0.04321199515834451, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04321199515834451, "reward_after_std": 0.86030338332057, "reward_before_mean": 0.20256559806875885, "reward_before_std": 0.9098124578595161, "reward_change_max": 0.0004805624485015869, "reward_change_mean": -0.15935362223535776, "reward_change_min": -0.4127720557153225, "reward_change_std": 0.16414799448102713, "reward_std": 0.8603034280240536, "rewards/cosine_scaled_reward": -0.07580053666606545, "rewards/format_reward": 0.3541666716337204, "step": 9 }, { "advantage_max": 1.5961240381002426, "advantage_mean": 1.8005570145973593e-08, "advantage_min": -0.9946450442075729, "advantage_std": 0.999793753027916, "completion_length": 2836.2083740234375, "epoch": 0.011428571428571429, "grad_norm": 0.25149303674697876, "kl": 2.4536624550819397e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2e-07, "loss": 0.0, "reward": -0.18135902285575867, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.18135902285575867, "reward_after_std": 0.6317292973399162, "reward_before_mean": -0.05718802846968174, "reward_before_std": 0.647784736007452, "reward_change_max": 0.0007606744766235352, "reward_change_mean": -0.12417101149912924, "reward_change_min": -0.2580604609102011, "reward_change_std": 0.10907016729470342, "reward_std": 0.6317293085157871, "rewards/cosine_scaled_reward": -0.17442734353244305, "rewards/format_reward": 0.29166667349636555, "step": 10 }, { "advantage_max": 1.539865881204605, "advantage_mean": 5.401670999383157e-08, "advantage_min": -1.039774589240551, "advantage_std": 0.999774768948555, "completion_length": 3389.2083740234375, "epoch": 0.012571428571428572, "grad_norm": 0.18692435324192047, "kl": 3.342330455780029e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.32752789044752717, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.32752789044752717, "reward_after_std": 0.6646033525466919, "reward_before_mean": -0.23555048741400242, "reward_before_std": 0.6987365372478962, "reward_change_max": 0.0015906840562820435, "reward_change_mean": -0.09197739418596029, "reward_change_min": -0.23515602201223373, "reward_change_std": 0.10101438639685512, "reward_std": 0.664603378623724, "rewards/cosine_scaled_reward": -0.19069190602749586, "rewards/format_reward": 0.1458333358168602, "step": 11 }, { "advantage_max": 1.5572585463523865, "advantage_mean": -2.1109978043387656e-08, "advantage_min": -1.1109998114407063, "advantage_std": 0.9998391792178154, "completion_length": 2623.2083740234375, "epoch": 0.013714285714285714, "grad_norm": 0.19698189198970795, "kl": 4.7206878662109375e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.22108511440455914, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22108511440455914, "reward_after_std": 0.7672835923731327, "reward_before_mean": 0.4219110906124115, "reward_before_std": 0.7882883995771408, "reward_change_max": 0.0005475208163261414, "reward_change_mean": -0.20082599110901356, "reward_change_min": -0.38023688457906246, "reward_change_std": 0.15198138437699527, "reward_std": 0.7672836184501648, "rewards/cosine_scaled_reward": -0.07029447052627802, "rewards/format_reward": 0.562500013038516, "step": 12 }, { "advantage_max": 1.743264377117157, "advantage_mean": 8.071463275527435e-09, "advantage_min": -0.9408519268035889, "advantage_std": 0.9997067749500275, "completion_length": 3043.0833740234375, "epoch": 0.014857142857142857, "grad_norm": 0.21475082635879517, "kl": 3.6597251892089844e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.07958985678851604, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07958985678851604, "reward_after_std": 0.45133444853127, "reward_before_mean": 0.269378375262022, "reward_before_std": 0.4202164653688669, "reward_change_max": 0.00046546757221221924, "reward_change_mean": -0.18978853151202202, "reward_change_min": -0.3052454721182585, "reward_change_std": 0.11908666882663965, "reward_std": 0.45133446156978607, "rewards/cosine_scaled_reward": -0.07364415284246206, "rewards/format_reward": 0.41666667349636555, "step": 13 }, { "advantage_max": 1.5736165642738342, "advantage_mean": -1.490116174895917e-08, "advantage_min": -1.0592405423521996, "advantage_std": 0.9998423829674721, "completion_length": 2943.6458740234375, "epoch": 0.016, "grad_norm": 0.1676233559846878, "kl": 2.8867274522781372e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.042353540658950806, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.042353540658950806, "reward_after_std": 0.8170745447278023, "reward_before_mean": 0.19811346009373665, "reward_before_std": 0.8294630981981754, "reward_change_max": 0.0007490590214729309, "reward_change_mean": -0.1557599287480116, "reward_change_min": -0.3378952704370022, "reward_change_std": 0.1310901055112481, "reward_std": 0.8170745633542538, "rewards/cosine_scaled_reward": -0.08844327414408326, "rewards/format_reward": 0.37500000931322575, "step": 14 }, { "advantage_max": 1.7620093375444412, "advantage_mean": -6.767610916114108e-08, "advantage_min": -0.9291465580463409, "advantage_std": 0.9997983947396278, "completion_length": 2803.687530517578, "epoch": 0.017142857142857144, "grad_norm": 0.20531098544597626, "kl": 3.5099685192108154e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.22886049561202526, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22886049561202526, "reward_after_std": 0.6503882668912411, "reward_before_mean": 0.43278513848781586, "reward_before_std": 0.6143735107034445, "reward_change_max": 0.0001234114170074463, "reward_change_mean": -0.20392468804493546, "reward_change_min": -0.33097884617745876, "reward_change_std": 0.1261005480773747, "reward_std": 0.6503882892429829, "rewards/cosine_scaled_reward": 0.008059246152697597, "rewards/format_reward": 0.4166666679084301, "step": 15 }, { "advantage_max": 1.6819724440574646, "advantage_mean": 3.352761290820183e-08, "advantage_min": -0.9239891991019249, "advantage_std": 0.9997138306498528, "completion_length": 3483.0625, "epoch": 0.018285714285714287, "grad_norm": 0.1682252585887909, "kl": 3.56920063495636e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.5018233098089695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5018233098089695, "reward_after_std": 0.5142541136592627, "reward_before_mean": -0.43988440558314323, "reward_before_std": 0.524260614067316, "reward_change_max": 0.001581817865371704, "reward_change_mean": -0.061938912025652826, "reward_change_min": -0.14432105235755444, "reward_change_std": 0.059407457476481795, "reward_std": 0.514254117384553, "rewards/cosine_scaled_reward": -0.27202554512768984, "rewards/format_reward": 0.10416666977107525, "step": 16 }, { "advantage_max": 1.6719516217708588, "advantage_mean": 9.313226079221693e-09, "advantage_min": -1.0224130228161812, "advantage_std": 0.9998493865132332, "completion_length": 2241.5208778381348, "epoch": 0.019428571428571427, "grad_norm": 0.26219215989112854, "kl": 4.7460198402404785e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.26397125981748104, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.26397125981748104, "reward_after_std": 0.8621900156140327, "reward_before_mean": 0.4611313515342772, "reward_before_std": 0.8517364151775837, "reward_change_max": 0.0003229379653930664, "reward_change_mean": -0.19716009264811873, "reward_change_min": -0.382378987967968, "reward_change_std": 0.14341816492378712, "reward_std": 0.8621900603175163, "rewards/cosine_scaled_reward": -0.06110099982470274, "rewards/format_reward": 0.5833333358168602, "step": 17 }, { "advantage_max": 1.6789888739585876, "advantage_mean": 3.601114006990258e-08, "advantage_min": -1.0117030963301659, "advantage_std": 0.9998209774494171, "completion_length": 2828.041702270508, "epoch": 0.02057142857142857, "grad_norm": 0.1963949203491211, "kl": 2.551823854446411e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 0.038342010229825974, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.038342010229825974, "reward_after_std": 0.7603168785572052, "reward_before_mean": 0.1992601379752159, "reward_before_std": 0.7735672742128372, "reward_change_max": 0.0, "reward_change_mean": -0.16091809794306755, "reward_change_min": -0.3329150825738907, "reward_change_std": 0.13252681493759155, "reward_std": 0.7603169232606888, "rewards/cosine_scaled_reward": -0.11911993799731135, "rewards/format_reward": 0.4375000111758709, "step": 18 }, { "advantage_max": 1.4660861641168594, "advantage_mean": 3.8494665677291096e-08, "advantage_min": -1.297618992626667, "advantage_std": 0.9997608736157417, "completion_length": 2957.3125610351562, "epoch": 0.021714285714285714, "grad_norm": 0.20244932174682617, "kl": 3.133341670036316e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.1908008144237101, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1908008144237101, "reward_after_std": 0.654106542468071, "reward_before_mean": 0.3968064859509468, "reward_before_std": 0.6879367008805275, "reward_change_max": 0.0, "reward_change_mean": -0.20600564847700298, "reward_change_min": -0.386847285553813, "reward_change_std": 0.1555578326806426, "reward_std": 0.6541065834462643, "rewards/cosine_scaled_reward": 0.021319888532161713, "rewards/format_reward": 0.3541666716337204, "step": 19 }, { "advantage_max": 1.4845254868268967, "advantage_mean": 5.587935891782081e-09, "advantage_min": -1.1302195489406586, "advantage_std": 0.9997977316379547, "completion_length": 2531.250068664551, "epoch": 0.022857142857142857, "grad_norm": 0.2369108349084854, "kl": 2.1103769540786743e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.2883046194911003, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2883046194911003, "reward_after_std": 0.7216798197478056, "reward_before_mean": 0.5108506195247173, "reward_before_std": 0.760659109801054, "reward_change_max": 0.0005205795168876648, "reward_change_mean": -0.22254599630832672, "reward_change_min": -0.4257675837725401, "reward_change_std": 0.1746676228940487, "reward_std": 0.7216798420995474, "rewards/cosine_scaled_reward": -0.025824700482189655, "rewards/format_reward": 0.5625000093132257, "step": 20 }, { "advantage_max": 1.5942478775978088, "advantage_mean": -1.9247333504779363e-08, "advantage_min": -1.0359741151332855, "advantage_std": 0.999727338552475, "completion_length": 2936.229202270508, "epoch": 0.024, "grad_norm": 0.21686924993991852, "kl": 4.1641294956207275e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.0217365100979805, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.0217365100979805, "reward_after_std": 0.4158258568495512, "reward_before_mean": 0.20216498710215092, "reward_before_std": 0.39231848157942295, "reward_change_max": 0.0008887350559234619, "reward_change_mean": -0.18042848724871874, "reward_change_min": -0.28633993305265903, "reward_change_std": 0.11420578742399812, "reward_std": 0.41582586988806725, "rewards/cosine_scaled_reward": -0.08641753438860178, "rewards/format_reward": 0.37500000558793545, "step": 21 }, { "advantage_max": 1.4737742841243744, "advantage_mean": -3.91155481338501e-08, "advantage_min": -1.2801623418927193, "advantage_std": 0.9998248517513275, "completion_length": 1926.7083587646484, "epoch": 0.025142857142857144, "grad_norm": 0.34481367468833923, "kl": 2.0101666450500488e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.5544739328324795, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5544739328324795, "reward_after_std": 0.6403865702450275, "reward_before_mean": 0.8353974921628833, "reward_before_std": 0.6390982866287231, "reward_change_max": 0.0, "reward_change_mean": -0.2809235444292426, "reward_change_min": -0.43336551636457443, "reward_change_std": 0.17704242002218962, "reward_std": 0.6403866037726402, "rewards/cosine_scaled_reward": 0.05311539862304926, "rewards/format_reward": 0.729166679084301, "step": 22 }, { "advantage_max": 1.7071952670812607, "advantage_mean": 1.7384688244526103e-08, "advantage_min": -1.0333659648895264, "advantage_std": 0.9998112097382545, "completion_length": 2552.354232788086, "epoch": 0.026285714285714287, "grad_norm": 0.1796521246433258, "kl": 2.888031303882599e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.27095113415271044, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27095113415271044, "reward_after_std": 0.8931148834526539, "reward_before_mean": 0.46421122178435326, "reward_before_std": 0.8784515354782343, "reward_change_max": 0.0, "reward_change_mean": -0.19326008297502995, "reward_change_min": -0.35773650370538235, "reward_change_std": 0.13798850402235985, "reward_std": 0.8931148983538151, "rewards/cosine_scaled_reward": -0.038727725856006145, "rewards/format_reward": 0.5416666846722364, "step": 23 }, { "advantage_max": 1.6876031756401062, "advantage_mean": 1.8005570812107408e-08, "advantage_min": -0.9738664329051971, "advantage_std": 0.9998361021280289, "completion_length": 2762.0208892822266, "epoch": 0.027428571428571427, "grad_norm": 0.2101898491382599, "kl": 2.544000744819641e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.345141158439219, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.345141158439219, "reward_after_std": 0.9434406235814095, "reward_before_mean": 0.5552112711593509, "reward_before_std": 0.9572884701192379, "reward_change_max": 0.0001425519585609436, "reward_change_mean": -0.2100701043382287, "reward_change_min": -0.41364777088165283, "reward_change_std": 0.16276149917393923, "reward_std": 0.943440642207861, "rewards/cosine_scaled_reward": 0.017188958823680878, "rewards/format_reward": 0.5208333358168602, "step": 24 }, { "advantage_max": 1.6947638243436813, "advantage_mean": 2.9491882130860958e-08, "advantage_min": -1.0033902749419212, "advantage_std": 0.9997927024960518, "completion_length": 2592.041702270508, "epoch": 0.02857142857142857, "grad_norm": 0.1946055144071579, "kl": 2.8666399884968996e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5e-07, "loss": 0.0, "reward": -0.019655877724289894, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.019655877724289894, "reward_after_std": 0.6330535672605038, "reward_before_mean": 0.13670190423727036, "reward_before_std": 0.6341910995543003, "reward_change_max": 0.0004054456949234009, "reward_change_mean": -0.15635776380077004, "reward_change_min": -0.29186590760946274, "reward_change_std": 0.11788261751644313, "reward_std": 0.6330535747110844, "rewards/cosine_scaled_reward": -0.13998239114880562, "rewards/format_reward": 0.4166666716337204, "step": 25 }, { "advantage_max": 1.4772356897592545, "advantage_mean": 1.8936893386722886e-08, "advantage_min": -1.2885529696941376, "advantage_std": 0.9997257962822914, "completion_length": 2979.750045776367, "epoch": 0.029714285714285714, "grad_norm": 0.15528684854507446, "kl": 2.7121976017951965e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.18640884256456047, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18640884256456047, "reward_after_std": 0.5655598230659962, "reward_before_mean": 0.3976056035608053, "reward_before_std": 0.5835991930216551, "reward_change_max": 0.0, "reward_change_mean": -0.211196749471128, "reward_change_min": -0.35364590398967266, "reward_change_std": 0.14743430353701115, "reward_std": 0.5655598565936089, "rewards/cosine_scaled_reward": -0.009530545212328434, "rewards/format_reward": 0.4166666716337204, "step": 26 }, { "advantage_max": 1.5101052522659302, "advantage_mean": -3.725290853573426e-09, "advantage_min": -1.162248358130455, "advantage_std": 0.9997957795858383, "completion_length": 3105.395866394043, "epoch": 0.030857142857142857, "grad_norm": 0.20341047644615173, "kl": 3.5478733479976654e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.4e-07, "loss": 0.0, "reward": -0.07532213162630796, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.07532213162630796, "reward_after_std": 0.6800261065363884, "reward_before_mean": 0.071015989407897, "reward_before_std": 0.7142519578337669, "reward_change_max": 0.00037226080894470215, "reward_change_mean": -0.1463381163775921, "reward_change_min": -0.3533896040171385, "reward_change_std": 0.13723576348274946, "reward_std": 0.6800261326134205, "rewards/cosine_scaled_reward": -0.13115867972373962, "rewards/format_reward": 0.33333333767950535, "step": 27 }, { "advantage_max": 1.573099598288536, "advantage_mean": 3.849466723160333e-08, "advantage_min": -1.1150267273187637, "advantage_std": 0.9998030215501785, "completion_length": 2894.7708740234375, "epoch": 0.032, "grad_norm": 0.18499167263507843, "kl": 3.559887409210205e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.16924337297677994, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.16924337297677994, "reward_after_std": 0.8595078736543655, "reward_before_mean": 0.3546360591426492, "reward_before_std": 0.8972543813288212, "reward_change_max": 0.00022292882204055786, "reward_change_mean": -0.1853926726616919, "reward_change_min": -0.4034426249563694, "reward_change_std": 0.16123890411108732, "reward_std": 0.8595079220831394, "rewards/cosine_scaled_reward": -0.031015303684398532, "rewards/format_reward": 0.41666667349636555, "step": 28 }, { "advantage_max": 1.6571213752031326, "advantage_mean": 2.60770322002557e-08, "advantage_min": -1.023493006825447, "advantage_std": 0.9997655674815178, "completion_length": 3431.125030517578, "epoch": 0.03314285714285714, "grad_norm": 0.16913940012454987, "kl": 2.547353506088257e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.41504489071667194, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.41504489071667194, "reward_after_std": 0.5817391686141491, "reward_before_mean": -0.3383926101960242, "reward_before_std": 0.5976084098219872, "reward_change_max": 0.0009746253490447998, "reward_change_mean": -0.07665228494443, "reward_change_min": -0.1926496997475624, "reward_change_std": 0.07682911981828511, "reward_std": 0.58173917979002, "rewards/cosine_scaled_reward": -0.24211297556757927, "rewards/format_reward": 0.1458333358168602, "step": 29 }, { "advantage_max": 1.5788090974092484, "advantage_mean": 3.073364673866763e-08, "advantage_min": -1.1611577719449997, "advantage_std": 0.999844379723072, "completion_length": 3024.166732788086, "epoch": 0.03428571428571429, "grad_norm": 0.1804714798927307, "kl": 2.2821128368377686e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.16531530115753412, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.16531530115753412, "reward_after_std": 0.9392066784203053, "reward_before_mean": 0.3431825675070286, "reward_before_std": 0.977239266037941, "reward_change_max": 0.00046259909868240356, "reward_change_mean": -0.1778672719374299, "reward_change_min": -0.3380998335778713, "reward_change_std": 0.1529026017524302, "reward_std": 0.9392067044973373, "rewards/cosine_scaled_reward": -0.02632538042962551, "rewards/format_reward": 0.39583334513008595, "step": 30 }, { "advantage_max": 1.565725862979889, "advantage_mean": 5.0291419750880806e-08, "advantage_min": -1.1075504496693611, "advantage_std": 0.9997828751802444, "completion_length": 3189.791702270508, "epoch": 0.03542857142857143, "grad_norm": 0.17691554129123688, "kl": 1.4291144907474518e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.2586498372256756, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2586498372256756, "reward_after_std": 0.6254929751157761, "reward_before_mean": -0.14998156733054202, "reward_before_std": 0.6498836800456047, "reward_change_max": 0.0, "reward_change_mean": -0.10866825701668859, "reward_change_min": -0.2638713177293539, "reward_change_std": 0.10512422723695636, "reward_std": 0.6254929825663567, "rewards/cosine_scaled_reward": -0.1791574526578188, "rewards/format_reward": 0.2083333395421505, "step": 31 }, { "advantage_max": 1.6380120068788528, "advantage_mean": 4.0046871152554786e-08, "advantage_min": -1.063338615000248, "advantage_std": 0.9998109042644501, "completion_length": 3098.2708892822266, "epoch": 0.036571428571428574, "grad_norm": 0.1579563021659851, "kl": 2.1046027541160583e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.20158327370882034, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20158327370882034, "reward_after_std": 0.6476054154336452, "reward_before_mean": 0.40322505310177803, "reward_before_std": 0.6246501244604588, "reward_change_max": 0.0005530714988708496, "reward_change_mean": -0.20164170674979687, "reward_change_min": -0.3284043725579977, "reward_change_std": 0.13300398597493768, "reward_std": 0.6476054154336452, "rewards/cosine_scaled_reward": 0.014112494885921478, "rewards/format_reward": 0.3750000111758709, "step": 32 }, { "advantage_max": 1.749614492058754, "advantage_mean": 3.10440866346795e-08, "advantage_min": -0.8797961287200451, "advantage_std": 0.9998220503330231, "completion_length": 3210.875030517578, "epoch": 0.037714285714285714, "grad_norm": 0.18192623555660248, "kl": 2.9414892196655273e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.229713948443532, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.229713948443532, "reward_after_std": 0.8090166114270687, "reward_before_mean": -0.1334769814275205, "reward_before_std": 0.8181703835725784, "reward_change_max": 0.0003531351685523987, "reward_change_mean": -0.09623696561902761, "reward_change_min": -0.2311191949993372, "reward_change_std": 0.09210817841812968, "reward_std": 0.8090166114270687, "rewards/cosine_scaled_reward": -0.1917384904809296, "rewards/format_reward": 0.2500000074505806, "step": 33 }, { "advantage_max": 1.3934744000434875, "advantage_mean": -2.0489096863585132e-08, "advantage_min": -1.2853097319602966, "advantage_std": 0.9998067542910576, "completion_length": 2512.229202270508, "epoch": 0.038857142857142854, "grad_norm": 0.23543889820575714, "kl": 3.3546239137649536e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.5275341346859932, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5275341346859932, "reward_after_std": 0.8143299408257008, "reward_before_mean": 0.7970656305551529, "reward_before_std": 0.8669828977435827, "reward_change_max": 0.0003156885504722595, "reward_change_mean": -0.26953150518238544, "reward_change_min": -0.4885811097919941, "reward_change_std": 0.20852079056203365, "reward_std": 0.8143299594521523, "rewards/cosine_scaled_reward": 0.1276994850486517, "rewards/format_reward": 0.541666679084301, "step": 34 }, { "advantage_max": 1.5553712397813797, "advantage_mean": 5.0912303040107076e-08, "advantage_min": -1.1710423156619072, "advantage_std": 0.9997952580451965, "completion_length": 3039.5625534057617, "epoch": 0.04, "grad_norm": 0.2202300876379013, "kl": 4.0199607610702515e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.0422150120139122, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0422150120139122, "reward_after_std": 0.7378262039273977, "reward_before_mean": 0.10306231211870909, "reward_before_std": 0.7508488856256008, "reward_change_max": 0.0005578547716140747, "reward_change_mean": -0.1452772947959602, "reward_change_min": -0.27861824072897434, "reward_change_std": 0.11854508286342025, "reward_std": 0.7378262225538492, "rewards/cosine_scaled_reward": -0.11513552069664001, "rewards/format_reward": 0.33333333767950535, "step": 35 }, { "advantage_max": 1.6745474636554718, "advantage_mean": 1.862645149230957e-08, "advantage_min": -1.0759836360812187, "advantage_std": 0.9997546598315239, "completion_length": 3388.3333435058594, "epoch": 0.04114285714285714, "grad_norm": 0.18159471452236176, "kl": 2.983957529067993e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.2961166016757488, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2961166016757488, "reward_after_std": 0.621714374050498, "reward_before_mean": -0.19892606884241104, "reward_before_std": 0.6282427292317152, "reward_change_max": 0.0011901110410690308, "reward_change_mean": -0.09719053143635392, "reward_change_min": -0.20177586935460567, "reward_change_std": 0.08723159926012158, "reward_std": 0.6217143908143044, "rewards/cosine_scaled_reward": -0.1932130428031087, "rewards/format_reward": 0.18750000186264515, "step": 36 }, { "advantage_max": 1.5861639976501465, "advantage_mean": 3.476937804336444e-08, "advantage_min": -1.0689679086208344, "advantage_std": 0.9997375980019569, "completion_length": 3153.1666870117188, "epoch": 0.04228571428571429, "grad_norm": 0.15919217467308044, "kl": 2.7313828468322754e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.288279028609395, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.288279028609395, "reward_after_std": 0.5106042847037315, "reward_before_mean": -0.17917627468705177, "reward_before_std": 0.5166680738329887, "reward_change_max": 0.0004027709364891052, "reward_change_mean": -0.10910274600610137, "reward_change_min": -0.21067695692181587, "reward_change_std": 0.08434443082660437, "reward_std": 0.510604303330183, "rewards/cosine_scaled_reward": -0.20417147409170866, "rewards/format_reward": 0.22916666977107525, "step": 37 }, { "advantage_max": 1.5880178362131119, "advantage_mean": 6.76761100493195e-08, "advantage_min": -1.0702078267931938, "advantage_std": 0.9997448474168777, "completion_length": 3234.812530517578, "epoch": 0.04342857142857143, "grad_norm": 0.1559906005859375, "kl": 2.4594366550445557e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.17546416074037552, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.17546416074037552, "reward_after_std": 0.48496192693710327, "reward_before_mean": -0.04148578643798828, "reward_before_std": 0.47018149122595787, "reward_change_max": 0.0006171911954879761, "reward_change_mean": -0.1339783607982099, "reward_change_min": -0.24786392971873283, "reward_change_std": 0.09997595380991697, "reward_std": 0.48496193811297417, "rewards/cosine_scaled_reward": -0.11449289601296186, "rewards/format_reward": 0.1875, "step": 38 }, { "advantage_max": 1.6684064418077469, "advantage_mean": 1.7384688355548406e-08, "advantage_min": -0.9282046966254711, "advantage_std": 0.9997768849134445, "completion_length": 2681.9792098999023, "epoch": 0.044571428571428574, "grad_norm": 0.21223194897174835, "kl": 1.5147030353546143e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.13156955409795046, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13156955409795046, "reward_after_std": 0.6208246052265167, "reward_before_mean": 0.32295298824647034, "reward_before_std": 0.615071473759599, "reward_change_max": 0.0005634576082229614, "reward_change_mean": -0.1913834000006318, "reward_change_min": -0.36978882923722267, "reward_change_std": 0.14454567758366466, "reward_std": 0.6208246275782585, "rewards/cosine_scaled_reward": -0.06769019179046154, "rewards/format_reward": 0.4583333358168602, "step": 39 }, { "advantage_max": 1.655065342783928, "advantage_mean": 4.0978195503527104e-08, "advantage_min": -1.0308792516589165, "advantage_std": 0.9998110383749008, "completion_length": 2748.8334197998047, "epoch": 0.045714285714285714, "grad_norm": 0.24321109056472778, "kl": 2.113846130669117e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8e-07, "loss": 0.0, "reward": -0.0020755892619490623, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0020755892619490623, "reward_after_std": 0.6307364739477634, "reward_before_mean": 0.15877384413033724, "reward_before_std": 0.633373312652111, "reward_change_max": 0.0006052106618881226, "reward_change_mean": -0.16084943106397986, "reward_change_min": -0.29501060023903847, "reward_change_std": 0.12053346633911133, "reward_std": 0.6307364776730537, "rewards/cosine_scaled_reward": -0.13936308398842812, "rewards/format_reward": 0.4375000037252903, "step": 40 }, { "advantage_max": 1.7262564301490784, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.8481588140130043, "advantage_std": 0.9998667165637016, "completion_length": 3027.6250610351562, "epoch": 0.046857142857142854, "grad_norm": 0.14843006432056427, "kl": 1.632794737815857e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": 0.0690681068226695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0690681068226695, "reward_after_std": 0.949213694781065, "reward_before_mean": 0.21814646408893168, "reward_before_std": 0.9525175243616104, "reward_change_max": 0.0005423203110694885, "reward_change_mean": -0.1490783947519958, "reward_change_min": -0.3277065698057413, "reward_change_std": 0.1299262880347669, "reward_std": 0.9492137059569359, "rewards/cosine_scaled_reward": -0.0992600962636061, "rewards/format_reward": 0.41666666977107525, "step": 41 }, { "advantage_max": 1.6299243718385696, "advantage_mean": 3.849466723160333e-08, "advantage_min": -1.0378217250108719, "advantage_std": 0.9997099861502647, "completion_length": 2848.7083435058594, "epoch": 0.048, "grad_norm": 0.27955660223960876, "kl": 3.980100154876709e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.3497111543547362, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3497111543547362, "reward_after_std": 0.40947396866977215, "reward_before_mean": -0.24614406749606133, "reward_before_std": 0.4084125515073538, "reward_change_max": 0.0008740201592445374, "reward_change_mean": -0.10356708150357008, "reward_change_min": -0.20952831394970417, "reward_change_std": 0.08169832732528448, "reward_std": 0.40947398729622364, "rewards/cosine_scaled_reward": -0.26890537329018116, "rewards/format_reward": 0.2916666679084301, "step": 42 }, { "advantage_max": 1.5146324634552002, "advantage_mean": 2.2351742345882997e-08, "advantage_min": -1.0660174414515495, "advantage_std": 0.9998110681772232, "completion_length": 3207.312530517578, "epoch": 0.04914285714285714, "grad_norm": 0.1813088059425354, "kl": 3.0316412448883057e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": 0.03310042805969715, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03310042805969715, "reward_after_std": 0.6946298256516457, "reward_before_mean": 0.2010643444955349, "reward_before_std": 0.7252266854047775, "reward_change_max": 0.0003236532211303711, "reward_change_mean": -0.1679639001376927, "reward_change_min": -0.3087310250848532, "reward_change_std": 0.133016605861485, "reward_std": 0.6946298368275166, "rewards/cosine_scaled_reward": -0.024467838928103447, "rewards/format_reward": 0.2500000037252903, "step": 43 }, { "advantage_max": 1.4735457301139832, "advantage_mean": 3.445893648201803e-08, "advantage_min": -1.25593750923872, "advantage_std": 0.9997965097427368, "completion_length": 2763.541702270508, "epoch": 0.05028571428571429, "grad_norm": 0.2757427990436554, "kl": 8.969008922576904e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.3218963295221329, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3218963295221329, "reward_after_std": 0.7521186731755733, "reward_before_mean": 0.548164501786232, "reward_before_std": 0.7862608321011066, "reward_change_max": 0.00045564770698547363, "reward_change_mean": -0.22626816853880882, "reward_change_min": -0.4221660625189543, "reward_change_std": 0.17340195435099304, "reward_std": 0.7521186843514442, "rewards/cosine_scaled_reward": 0.024082249961793423, "rewards/format_reward": 0.5000000111758709, "step": 44 }, { "advantage_max": 1.4633118212223053, "advantage_mean": 1.6142925329809543e-08, "advantage_min": -1.1568865031003952, "advantage_std": 0.9998212307691574, "completion_length": 3349.7083435058594, "epoch": 0.05142857142857143, "grad_norm": 0.15581615269184113, "kl": 3.790855407714844e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.03242434747517109, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.03242434747517109, "reward_after_std": 0.8387883510440588, "reward_before_mean": 0.193950076274632, "reward_before_std": 0.9024345204234123, "reward_change_max": 0.0004741176962852478, "reward_change_mean": -0.16152575379237533, "reward_change_min": -0.38652732968330383, "reward_change_std": 0.16492859972640872, "reward_std": 0.8387883920222521, "rewards/cosine_scaled_reward": -0.04885828774422407, "rewards/format_reward": 0.2916666753590107, "step": 45 }, { "advantage_max": 1.5030267983675003, "advantage_mean": 3.9736430867964856e-08, "advantage_min": -1.1738435924053192, "advantage_std": 0.9997305870056152, "completion_length": 3177.500015258789, "epoch": 0.052571428571428575, "grad_norm": 0.18234607577323914, "kl": 5.453219637274742e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.41419660672545433, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41419660672545433, "reward_after_std": 0.3787010833621025, "reward_before_mean": -0.319430336356163, "reward_before_std": 0.3915441706776619, "reward_change_max": 0.00046756118535995483, "reward_change_mean": -0.09476626757532358, "reward_change_min": -0.18947959877550602, "reward_change_std": 0.07930145971477032, "reward_std": 0.3787010908126831, "rewards/cosine_scaled_reward": -0.2430485039949417, "rewards/format_reward": 0.1666666679084301, "step": 46 }, { "advantage_max": 1.5899191051721573, "advantage_mean": 1.490116141589226e-08, "advantage_min": -1.1299419030547142, "advantage_std": 0.9998330995440483, "completion_length": 2857.875045776367, "epoch": 0.053714285714285714, "grad_norm": 0.21604590117931366, "kl": 4.843901842832565e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.1896874513477087, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1896874513477087, "reward_after_std": 0.9090043418109417, "reward_before_mean": 0.3739042170345783, "reward_before_std": 0.9458130896091461, "reward_change_max": 0.0004040822386741638, "reward_change_mean": -0.18421675683930516, "reward_change_min": -0.3901894185692072, "reward_change_std": 0.1618319470435381, "reward_std": 0.9090043976902962, "rewards/cosine_scaled_reward": -0.04221456404775381, "rewards/format_reward": 0.45833334513008595, "step": 47 }, { "advantage_max": 1.6952250599861145, "advantage_mean": 7.450580485901526e-09, "advantage_min": -1.0082858800888062, "advantage_std": 0.9997942596673965, "completion_length": 2867.062530517578, "epoch": 0.054857142857142854, "grad_norm": 0.18633955717086792, "kl": 0.00017702952027320862, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.6e-07, "loss": 0.0, "reward": -0.06481963396072388, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06481963396072388, "reward_after_std": 0.7094365991652012, "reward_before_mean": 0.07328080199658871, "reward_before_std": 0.7008146084845066, "reward_change_max": 0.0003138333559036255, "reward_change_mean": -0.138100431766361, "reward_change_min": -0.26004249788820744, "reward_change_std": 0.10102563817054033, "reward_std": 0.709436621516943, "rewards/cosine_scaled_reward": -0.11960960738360882, "rewards/format_reward": 0.31250000186264515, "step": 48 }, { "advantage_max": 1.6078014522790909, "advantage_mean": 2.048909675256283e-08, "advantage_min": -1.0765347704291344, "advantage_std": 0.9998094365000725, "completion_length": 2509.7500762939453, "epoch": 0.056, "grad_norm": 0.2146293818950653, "kl": 8.615851402282715e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.2446850063279271, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2446850063279271, "reward_after_std": 0.8773886524140835, "reward_before_mean": 0.44149335473775864, "reward_before_std": 0.8929827082902193, "reward_change_max": 2.390146255493164e-05, "reward_change_mean": -0.19680832419544458, "reward_change_min": -0.4127675499767065, "reward_change_std": 0.1626983918249607, "reward_std": 0.877388671040535, "rewards/cosine_scaled_reward": -0.039670001016929746, "rewards/format_reward": 0.5208333358168602, "step": 49 }, { "advantage_max": 1.4889815598726273, "advantage_mean": 3.011276339393021e-08, "advantage_min": -1.19339619576931, "advantage_std": 0.9997689872980118, "completion_length": 2894.2500228881836, "epoch": 0.05714285714285714, "grad_norm": 0.16805996000766754, "kl": 0.00012464821338653564, "lambda_div_used": 0.7999999999999999, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.1484907530248165, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1484907530248165, "reward_after_std": 0.6498783119022846, "reward_before_mean": 0.34459424391388893, "reward_before_std": 0.6648210007697344, "reward_change_max": 0.0, "reward_change_mean": -0.19610351603478193, "reward_change_min": -0.3851831816136837, "reward_change_std": 0.15433361660689116, "reward_std": 0.6498783379793167, "rewards/cosine_scaled_reward": 0.005630466155707836, "rewards/format_reward": 0.33333334140479565, "step": 50 }, { "advantage_max": 1.5318742841482162, "advantage_mean": 2.2351741790771484e-08, "advantage_min": -1.0917627662420273, "advantage_std": 0.9998071938753128, "completion_length": 2442.750045776367, "epoch": 0.05828571428571429, "grad_norm": 0.22353631258010864, "kl": 0.00023058801889419556, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.16875093430280685, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16875093430280685, "reward_after_std": 0.7135776728391647, "reward_before_mean": 0.36318233981728554, "reward_before_std": 0.7265008091926575, "reward_change_max": 0.0001400187611579895, "reward_change_mean": -0.1944313943386078, "reward_change_min": -0.38128767162561417, "reward_change_std": 0.15260417386889458, "reward_std": 0.7135776914656162, "rewards/cosine_scaled_reward": -0.06840883800759912, "rewards/format_reward": 0.5000000074505806, "step": 51 }, { "advantage_max": 1.6993749290704727, "advantage_mean": 5.650023915393376e-08, "advantage_min": -0.9708743765950203, "advantage_std": 0.9998618885874748, "completion_length": 2915.1250534057617, "epoch": 0.05942857142857143, "grad_norm": 0.24529911577701569, "kl": 0.0002612881362438202, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.2074111569672823, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2074111569672823, "reward_after_std": 0.9998992644250393, "reward_before_mean": 0.3853365269023925, "reward_before_std": 1.025053035467863, "reward_change_max": 0.0015909001231193542, "reward_change_mean": -0.17792532336898148, "reward_change_min": -0.35967270471155643, "reward_change_std": 0.1540224920026958, "reward_std": 0.9998992942273617, "rewards/cosine_scaled_reward": -0.005248422268778086, "rewards/format_reward": 0.3958333395421505, "step": 52 }, { "advantage_max": 1.6098654568195343, "advantage_mean": -7.45058070794613e-09, "advantage_min": -1.1653679832816124, "advantage_std": 0.9998446926474571, "completion_length": 2782.6250610351562, "epoch": 0.060571428571428575, "grad_norm": 0.19299021363258362, "kl": 0.00019798800349235535, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.5741048266645521, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5741048266645521, "reward_after_std": 0.8326258882880211, "reward_before_mean": 0.8417719714343548, "reward_before_std": 0.8171909488737583, "reward_change_max": 0.00019788742065429688, "reward_change_mean": -0.26766714826226234, "reward_change_min": -0.4490860775113106, "reward_change_std": 0.17976201511919498, "reward_std": 0.8326258957386017, "rewards/cosine_scaled_reward": 0.09796930849552155, "rewards/format_reward": 0.6458333469927311, "step": 53 }, { "advantage_max": 1.4913389384746552, "advantage_mean": 7.450580929990736e-09, "advantage_min": -1.1815472394227982, "advantage_std": 0.9998629316687584, "completion_length": 2925.4584045410156, "epoch": 0.061714285714285715, "grad_norm": 0.18331590294837952, "kl": 4.350394010543823e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.42693280428647995, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42693280428647995, "reward_after_std": 1.0608244948089123, "reward_before_mean": 0.6571898418478668, "reward_before_std": 1.1342002339661121, "reward_change_max": 0.0, "reward_change_mean": -0.2302570380270481, "reward_change_min": -0.4764634482562542, "reward_change_std": 0.2096308101899922, "reward_std": 1.060824528336525, "rewards/cosine_scaled_reward": 0.09942827746272087, "rewards/format_reward": 0.4583333469927311, "step": 54 }, { "advantage_max": 1.6245067715644836, "advantage_mean": -4.3461718668424965e-09, "advantage_min": -0.9192568957805634, "advantage_std": 0.9998599514365196, "completion_length": 3095.250030517578, "epoch": 0.06285714285714286, "grad_norm": 0.1808166652917862, "kl": 0.00031198933720588684, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.10264583956450224, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10264583956450224, "reward_after_std": 0.989537637680769, "reward_before_mean": 0.2625856678932905, "reward_before_std": 1.0343751683831215, "reward_change_max": 0.0006072595715522766, "reward_change_mean": -0.15993982320651412, "reward_change_min": -0.3970005866140127, "reward_change_std": 0.16320207389071584, "reward_std": 0.9895376712083817, "rewards/cosine_scaled_reward": -0.04579051467590034, "rewards/format_reward": 0.35416666977107525, "step": 55 }, { "advantage_max": 1.4185740798711777, "advantage_mean": -1.2728075787782345e-08, "advantage_min": -1.3164423555135727, "advantage_std": 0.9997757226228714, "completion_length": 2954.458396911621, "epoch": 0.064, "grad_norm": 0.16553562879562378, "kl": 4.982948303222656e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.14053409546613693, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.14053409546613693, "reward_after_std": 0.6105850823223591, "reward_before_mean": 0.3413895256817341, "reward_before_std": 0.6498310156166553, "reward_change_max": 0.0002667754888534546, "reward_change_mean": -0.2008554646745324, "reward_change_min": -0.36588573828339577, "reward_change_std": 0.15487558394670486, "reward_std": 0.6105850897729397, "rewards/cosine_scaled_reward": -0.027221906930208206, "rewards/format_reward": 0.3958333432674408, "step": 56 }, { "advantage_max": 1.62399423122406, "advantage_mean": 1.6763807009212428e-08, "advantage_min": -0.9893188774585724, "advantage_std": 0.9997558370232582, "completion_length": 3263.812530517578, "epoch": 0.06514285714285714, "grad_norm": 0.12242446094751358, "kl": 2.575106918811798e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.31899499148130417, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.31899499148130417, "reward_after_std": 0.5046761892735958, "reward_before_mean": -0.21401306986808777, "reward_before_std": 0.515403188765049, "reward_change_max": 0.00018546730279922485, "reward_change_mean": -0.10498191299848258, "reward_change_min": -0.23341021686792374, "reward_change_std": 0.08848186326213181, "reward_std": 0.5046762004494667, "rewards/cosine_scaled_reward": -0.22158987261354923, "rewards/format_reward": 0.22916666977107525, "step": 57 }, { "advantage_max": 1.560118854045868, "advantage_mean": 4.346172421954009e-09, "advantage_min": -1.130972020328045, "advantage_std": 0.9998400285840034, "completion_length": 2412.2708892822266, "epoch": 0.06628571428571428, "grad_norm": 0.2044786512851715, "kl": 0.0010882318019866943, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.2907734867185354, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2907734867185354, "reward_after_std": 0.8199318274855614, "reward_before_mean": 0.501036109868437, "reward_before_std": 0.8361069560050964, "reward_change_max": 9.82433557510376e-05, "reward_change_mean": -0.21026262175291777, "reward_change_min": -0.4459004085510969, "reward_change_std": 0.16762463841587305, "reward_std": 0.819931834936142, "rewards/cosine_scaled_reward": -0.07239861227571964, "rewards/format_reward": 0.6458333395421505, "step": 58 }, { "advantage_max": 1.7214796096086502, "advantage_mean": 6.100163174593831e-08, "advantage_min": -1.0006466582417488, "advantage_std": 0.9996944218873978, "completion_length": 2856.187515258789, "epoch": 0.06742857142857143, "grad_norm": 0.16563907265663147, "kl": 7.169204764068127e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": -0.12141696130856872, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.12141696130856872, "reward_after_std": 0.5117970556020737, "reward_before_mean": 0.019515281077474356, "reward_before_std": 0.486568721011281, "reward_change_max": 0.0001515224575996399, "reward_change_mean": -0.14093223959207535, "reward_change_min": -0.23997912742197514, "reward_change_std": 0.09495735168457031, "reward_std": 0.511797059327364, "rewards/cosine_scaled_reward": -0.13607568992301822, "rewards/format_reward": 0.2916666679084301, "step": 59 }, { "advantage_max": 1.693726196885109, "advantage_mean": 4.2840839042934675e-08, "advantage_min": -1.0366674736142159, "advantage_std": 0.9997766688466072, "completion_length": 2948.125045776367, "epoch": 0.06857142857142857, "grad_norm": 0.16493219137191772, "kl": 0.0001513250172138214, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.09105870872735977, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09105870872735977, "reward_after_std": 0.686959195882082, "reward_before_mean": 0.04372099228203297, "reward_before_std": 0.6810658574104309, "reward_change_max": 0.00010024756193161011, "reward_change_mean": -0.1347797194030136, "reward_change_min": -0.27008931152522564, "reward_change_std": 0.10537515860050917, "reward_std": 0.6869592033326626, "rewards/cosine_scaled_reward": -0.16563950292766094, "rewards/format_reward": 0.3750000037252903, "step": 60 }, { "advantage_max": 1.3835174441337585, "advantage_mean": 1.4280279403422469e-08, "advantage_min": -1.3410531058907509, "advantage_std": 0.9998080208897591, "completion_length": 3108.3958740234375, "epoch": 0.06971428571428571, "grad_norm": 0.17201192677021027, "kl": 0.00028091808781027794, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.23692655563354492, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23692655563354492, "reward_after_std": 0.8451647032052279, "reward_before_mean": 0.4443775750696659, "reward_before_std": 0.9202493615448475, "reward_change_max": 0.0005453750491142273, "reward_change_mean": -0.2074510301463306, "reward_change_min": -0.4427679292857647, "reward_change_std": 0.1908978926949203, "reward_std": 0.8451647274196148, "rewards/cosine_scaled_reward": 0.01385545451194048, "rewards/format_reward": 0.41666667722165585, "step": 61 }, { "advantage_max": 1.806217536330223, "advantage_mean": -1.9868215961338365e-08, "advantage_min": -0.887168750166893, "advantage_std": 0.9998476803302765, "completion_length": 2752.0834197998047, "epoch": 0.07085714285714285, "grad_norm": 0.19357971847057343, "kl": 0.0005202442407608032, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.3755918840470258, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3755918840470258, "reward_after_std": 0.8301325552165508, "reward_before_mean": 0.5955550698563457, "reward_before_std": 0.790971240028739, "reward_change_max": 0.00032426416873931885, "reward_change_mean": -0.21996318409219384, "reward_change_min": -0.36245875246822834, "reward_change_std": 0.13983059162274003, "reward_std": 0.8301325589418411, "rewards/cosine_scaled_reward": 0.01652752747759223, "rewards/format_reward": 0.5625000093132257, "step": 62 }, { "advantage_max": 1.525961548089981, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -1.127110406756401, "advantage_std": 0.999846026301384, "completion_length": 2262.6458892822266, "epoch": 0.072, "grad_norm": 0.20876488089561462, "kl": 0.0008518993854522705, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.7662359848618507, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7662359848618507, "reward_after_std": 0.909334447234869, "reward_before_mean": 1.0756743550300598, "reward_before_std": 0.9336301498115063, "reward_change_max": 8.27684998512268e-05, "reward_change_mean": -0.30943838274106383, "reward_change_min": -0.5926293302327394, "reward_change_std": 0.22869372786954045, "reward_std": 0.9093344509601593, "rewards/cosine_scaled_reward": 0.16283717821352184, "rewards/format_reward": 0.7500000149011612, "step": 63 }, { "advantage_max": 1.5487523674964905, "advantage_mean": 1.8005570256995895e-08, "advantage_min": -1.0956207066774368, "advantage_std": 0.9997992515563965, "completion_length": 2957.4583587646484, "epoch": 0.07314285714285715, "grad_norm": 0.18354231119155884, "kl": 0.000421963632106781, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.13231371343135834, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.13231371343135834, "reward_after_std": 0.9308353830128908, "reward_before_mean": 0.3047089036554098, "reward_before_std": 0.9812476169317961, "reward_change_max": 0.0006142035126686096, "reward_change_mean": -0.17239517951384187, "reward_change_min": -0.4214697778224945, "reward_change_std": 0.17370698787271976, "reward_std": 0.930835397914052, "rewards/cosine_scaled_reward": -0.024728883057832718, "rewards/format_reward": 0.3541666753590107, "step": 64 }, { "advantage_max": 1.708268865942955, "advantage_mean": 2.8560559917067962e-08, "advantage_min": -0.9543309882283211, "advantage_std": 0.9997209906578064, "completion_length": 2630.708366394043, "epoch": 0.07428571428571429, "grad_norm": 0.2504342198371887, "kl": 0.0003350377082824707, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": -0.05934199318289757, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05934199318289757, "reward_after_std": 0.5288446377962828, "reward_before_mean": 0.09470355277881026, "reward_before_std": 0.5060996157117188, "reward_change_max": 0.000491216778755188, "reward_change_mean": -0.15404552780091763, "reward_change_min": -0.25607624277472496, "reward_change_std": 0.10440768301486969, "reward_std": 0.5288446471095085, "rewards/cosine_scaled_reward": -0.17139821499586105, "rewards/format_reward": 0.43750000558793545, "step": 65 }, { "advantage_max": 1.6425568014383316, "advantage_mean": 4.035731526741415e-09, "advantage_min": -1.1181970834732056, "advantage_std": 0.9997746720910072, "completion_length": 2079.5208435058594, "epoch": 0.07542857142857143, "grad_norm": 0.27181532979011536, "kl": 0.00034831464290618896, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.4275641590356827, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4275641590356827, "reward_after_std": 0.553373359143734, "reward_before_mean": 0.683971457183361, "reward_before_std": 0.5225912407040596, "reward_change_max": 0.0009416639804840088, "reward_change_mean": -0.2564073045505211, "reward_change_min": -0.40475964918732643, "reward_change_std": 0.15713666449300945, "reward_std": 0.5533733814954758, "rewards/cosine_scaled_reward": 0.09198573045432568, "rewards/format_reward": 0.5, "step": 66 }, { "advantage_max": 1.5558879524469376, "advantage_mean": 6.457170098617127e-08, "advantage_min": -1.0571024790406227, "advantage_std": 0.9997260123491287, "completion_length": 3432.6875, "epoch": 0.07657142857142857, "grad_norm": 0.13905449211597443, "kl": 0.0003644903190433979, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.47604336217045784, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.47604336217045784, "reward_after_std": 0.4485955648124218, "reward_before_mean": -0.40127869322896004, "reward_before_std": 0.459877897053957, "reward_change_max": 0.0012263581156730652, "reward_change_mean": -0.07476464239880443, "reward_change_min": -0.16927983611822128, "reward_change_std": 0.06955831311643124, "reward_std": 0.4485955722630024, "rewards/cosine_scaled_reward": -0.25272269267588854, "rewards/format_reward": 0.1041666716337204, "step": 67 }, { "advantage_max": 1.4970561861991882, "advantage_mean": -4.96705393482344e-09, "advantage_min": -1.2759140655398369, "advantage_std": 0.9997949376702309, "completion_length": 1971.9167175292969, "epoch": 0.07771428571428571, "grad_norm": 0.24344401061534882, "kl": 0.00177764892578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": 0.3115036394447088, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3115036394447088, "reward_after_std": 0.640905000269413, "reward_before_mean": 0.5413862890563905, "reward_before_std": 0.6520151775330305, "reward_change_max": 0.00031547248363494873, "reward_change_mean": -0.22988267801702023, "reward_change_min": -0.3882032725960016, "reward_change_std": 0.1589061007834971, "reward_std": 0.6409050039947033, "rewards/cosine_scaled_reward": -0.06264018453657627, "rewards/format_reward": 0.666666679084301, "step": 68 }, { "advantage_max": 1.7017274498939514, "advantage_mean": 5.091230503850852e-08, "advantage_min": -0.9753537401556969, "advantage_std": 0.999798521399498, "completion_length": 2390.083381652832, "epoch": 0.07885714285714286, "grad_norm": 0.2992507219314575, "kl": 0.0013431459665298462, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.960469931131936e-07, "loss": 0.0001, "reward": 0.018286951817572117, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.018286951817572117, "reward_after_std": 0.757911205291748, "reward_before_mean": 0.1728294063359499, "reward_before_std": 0.7661503255367279, "reward_change_max": 0.000638522207736969, "reward_change_mean": -0.15454245172441006, "reward_change_min": -0.3369054328650236, "reward_change_std": 0.13101212214678526, "reward_std": 0.7579112146049738, "rewards/cosine_scaled_reward": -0.17400196427479386, "rewards/format_reward": 0.5208333376795053, "step": 69 }, { "advantage_max": 1.7679117619991302, "advantage_mean": 1.3038515822572094e-08, "advantage_min": -0.8881788477301598, "advantage_std": 0.999812088906765, "completion_length": 3126.875045776367, "epoch": 0.08, "grad_norm": 0.2363167405128479, "kl": 0.0009987987577915192, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": -0.052413856610655785, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.052413856610655785, "reward_after_std": 0.7765421830117702, "reward_before_mean": 0.0839406659360975, "reward_before_std": 0.7712229937314987, "reward_change_max": 0.000931374728679657, "reward_change_mean": -0.13635451719164848, "reward_change_min": -0.29352002777159214, "reward_change_std": 0.1165924184024334, "reward_std": 0.7765422016382217, "rewards/cosine_scaled_reward": -0.14552967669442296, "rewards/format_reward": 0.37500000931322575, "step": 70 }, { "advantage_max": 1.4911705553531647, "advantage_mean": 8.692343733684993e-09, "advantage_min": -1.1096780076622963, "advantage_std": 0.999790795147419, "completion_length": 2761.750015258789, "epoch": 0.08114285714285714, "grad_norm": 0.20265944302082062, "kl": 0.000859379768371582, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": 0.12095781043171883, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.12095781043171883, "reward_after_std": 0.70588543638587, "reward_before_mean": 0.3066416233778, "reward_before_std": 0.737454067915678, "reward_change_max": 0.0, "reward_change_mean": -0.18568379897624254, "reward_change_min": -0.37347991578280926, "reward_change_std": 0.15028795832768083, "reward_std": 0.70588543638587, "rewards/cosine_scaled_reward": -0.034179212525486946, "rewards/format_reward": 0.3750000074505806, "step": 71 }, { "advantage_max": 1.6158892661333084, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -1.0985428914427757, "advantage_std": 0.9998162090778351, "completion_length": 3004.3334045410156, "epoch": 0.08228571428571428, "grad_norm": 0.2693294584751129, "kl": 0.001021057367324829, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": -0.1556640777271241, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1556640777271241, "reward_after_std": 0.7115676589310169, "reward_before_mean": -0.03549688681960106, "reward_before_std": 0.7211866304278374, "reward_change_max": 0.0001872330904006958, "reward_change_mean": -0.12016719300299883, "reward_change_min": -0.24851389415562153, "reward_change_std": 0.10526996431872249, "reward_std": 0.7115676626563072, "rewards/cosine_scaled_reward": -0.18441510945558548, "rewards/format_reward": 0.33333333767950535, "step": 72 }, { "advantage_max": 1.5327227264642715, "advantage_mean": 1.241762692671955e-09, "advantage_min": -1.1601731404662132, "advantage_std": 0.9997736439108849, "completion_length": 3494.604217529297, "epoch": 0.08342857142857144, "grad_norm": 0.1661466360092163, "kl": 0.00032639503479003906, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.08516270108520985, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08516270108520985, "reward_after_std": 0.6497222892940044, "reward_before_mean": 0.059801749885082245, "reward_before_std": 0.6753613352775574, "reward_change_max": 0.0002906620502471924, "reward_change_mean": -0.1449644397944212, "reward_change_min": -0.2802998274564743, "reward_change_std": 0.11904488690197468, "reward_std": 0.6497223116457462, "rewards/cosine_scaled_reward": -0.08468246832489967, "rewards/format_reward": 0.22916666977107525, "step": 73 }, { "advantage_max": 1.71546071767807, "advantage_mean": 4.594524849466097e-08, "advantage_min": -0.8454092368483543, "advantage_std": 0.9998168796300888, "completion_length": 3046.8125610351562, "epoch": 0.08457142857142858, "grad_norm": 0.18538318574428558, "kl": 0.0007867217063903809, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": 0.17524679680354893, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17524679680354893, "reward_after_std": 0.9069931507110596, "reward_before_mean": 0.3507931437343359, "reward_before_std": 0.8982865624129772, "reward_change_max": 0.0001408606767654419, "reward_change_mean": -0.1755463215522468, "reward_change_min": -0.37799850665032864, "reward_change_std": 0.1365469004958868, "reward_std": 0.9069931656122208, "rewards/cosine_scaled_reward": 0.008729891385883093, "rewards/format_reward": 0.3333333395421505, "step": 74 }, { "advantage_max": 1.5503149926662445, "advantage_mean": -3.104408519138957e-08, "advantage_min": -1.2087038829922676, "advantage_std": 0.9997450262308121, "completion_length": 2894.8333740234375, "epoch": 0.08571428571428572, "grad_norm": 0.16822047531604767, "kl": 0.0011026561260223389, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.23183363070711493, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23183363070711493, "reward_after_std": 0.4754980690777302, "reward_before_mean": 0.4545082226395607, "reward_before_std": 0.46481961756944656, "reward_change_max": 0.0005934983491897583, "reward_change_mean": -0.22267461940646172, "reward_change_min": -0.3571764323860407, "reward_change_std": 0.13745499728247523, "reward_std": 0.4754980690777302, "rewards/cosine_scaled_reward": 0.029337426647543907, "rewards/format_reward": 0.3958333358168602, "step": 75 }, { "advantage_max": 1.6219773888587952, "advantage_mean": 5.587936668938198e-09, "advantage_min": -1.1091139391064644, "advantage_std": 0.9997748509049416, "completion_length": 2880.791679382324, "epoch": 0.08685714285714285, "grad_norm": 0.1717507541179657, "kl": 0.00029241712763905525, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.023375704884529114, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.023375704884529114, "reward_after_std": 0.5844264999032021, "reward_before_mean": 0.14006047509610653, "reward_before_std": 0.6008395608514547, "reward_change_max": 0.00044924765825271606, "reward_change_mean": -0.16343621350824833, "reward_change_min": -0.29554445296525955, "reward_change_std": 0.1233857732731849, "reward_std": 0.5844265222549438, "rewards/cosine_scaled_reward": -0.1695530880242586, "rewards/format_reward": 0.4791666753590107, "step": 76 }, { "advantage_max": 1.4377646893262863, "advantage_mean": 1.9868214629070735e-08, "advantage_min": -1.153957448899746, "advantage_std": 0.9997784495353699, "completion_length": 3049.250015258789, "epoch": 0.088, "grad_norm": 0.16906258463859558, "kl": 0.0003077983856201172, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.08434882014989853, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08434882014989853, "reward_after_std": 0.5435480587184429, "reward_before_mean": 0.07107899896800518, "reward_before_std": 0.5732218399643898, "reward_change_max": 0.0005913972854614258, "reward_change_mean": -0.15542782377451658, "reward_change_min": -0.30807364732027054, "reward_change_std": 0.12492271605879068, "reward_std": 0.5435480773448944, "rewards/cosine_scaled_reward": -0.13112716563045979, "rewards/format_reward": 0.3333333358168602, "step": 77 }, { "advantage_max": 1.3530033379793167, "advantage_mean": 3.352761412944716e-08, "advantage_min": -1.1873060315847397, "advantage_std": 0.9997893124818802, "completion_length": 3167.5208740234375, "epoch": 0.08914285714285715, "grad_norm": 0.1513669639825821, "kl": 0.00021767616271972656, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.07496153563261032, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07496153563261032, "reward_after_std": 0.7540915813297033, "reward_before_mean": 0.2513010837137699, "reward_before_std": 0.8067440576851368, "reward_change_max": 0.0001747533679008484, "reward_change_mean": -0.17633954668417573, "reward_change_min": -0.3947325777262449, "reward_change_std": 0.16072524525225163, "reward_std": 0.7540916260331869, "rewards/cosine_scaled_reward": -0.030599456280469894, "rewards/format_reward": 0.3125000074505806, "step": 78 }, { "advantage_max": 1.682113841176033, "advantage_mean": -4.097819394921487e-08, "advantage_min": -1.0798411667346954, "advantage_std": 0.9997269585728645, "completion_length": 2341.895866394043, "epoch": 0.09028571428571429, "grad_norm": 0.2215704619884491, "kl": 0.0007579028606414795, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.264179325196892, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.264179325196892, "reward_after_std": 0.5166768245398998, "reward_before_mean": 0.48735319514526054, "reward_before_std": 0.4782428778707981, "reward_change_max": 0.0010138675570487976, "reward_change_mean": -0.22317388840019703, "reward_change_min": -0.3387150280177593, "reward_change_std": 0.13743248512037098, "reward_std": 0.5166768468916416, "rewards/cosine_scaled_reward": -0.05840673670172691, "rewards/format_reward": 0.6041666679084301, "step": 79 }, { "advantage_max": 1.5736753195524216, "advantage_mean": 3.2285850215529877e-08, "advantage_min": -0.999790869653225, "advantage_std": 0.9998230561614037, "completion_length": 3286.5416870117188, "epoch": 0.09142857142857143, "grad_norm": 0.16919022798538208, "kl": 0.0008649379014968872, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": 0.005955344066023827, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.005955344066023827, "reward_after_std": 0.8890957869589329, "reward_before_mean": 0.15279491746332496, "reward_before_std": 0.9225866943597794, "reward_change_max": 4.447251558303833e-05, "reward_change_mean": -0.14683958096429706, "reward_change_min": -0.32787839509546757, "reward_change_std": 0.13908664509654045, "reward_std": 0.8890958093106747, "rewards/cosine_scaled_reward": -0.07985254935920238, "rewards/format_reward": 0.31250000558793545, "step": 80 }, { "advantage_max": 1.4473999440670013, "advantage_mean": 4.159907629475157e-08, "advantage_min": -1.2874359339475632, "advantage_std": 0.9996728822588921, "completion_length": 3123.4583740234375, "epoch": 0.09257142857142857, "grad_norm": 0.23505662381649017, "kl": 0.0028659701347351074, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.13922469969838858, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.13922469969838858, "reward_after_std": 0.3902856092900038, "reward_before_mean": 0.013303263112902641, "reward_before_std": 0.3994307592511177, "reward_change_max": 0.0002063661813735962, "reward_change_mean": -0.15252797584980726, "reward_change_min": -0.2631298340857029, "reward_change_std": 0.10517477197572589, "reward_std": 0.3902856223285198, "rewards/cosine_scaled_reward": -0.13918170426040888, "rewards/format_reward": 0.2916666679084301, "step": 81 }, { "advantage_max": 1.5119963884353638, "advantage_mean": -3.725290298461914e-08, "advantage_min": -1.1589691415429115, "advantage_std": 0.9997723922133446, "completion_length": 2829.0208892822266, "epoch": 0.09371428571428571, "grad_norm": 0.2004895955324173, "kl": 0.0015895962715148926, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.888172094375033e-07, "loss": 0.0001, "reward": 0.157549187541008, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.157549187541008, "reward_after_std": 0.6032798625528812, "reward_before_mean": 0.3559020347893238, "reward_before_std": 0.6031227596104145, "reward_change_max": 0.00014865398406982422, "reward_change_mean": -0.1983528840355575, "reward_change_min": -0.35573250614106655, "reward_change_std": 0.14031734503805637, "reward_std": 0.6032798700034618, "rewards/cosine_scaled_reward": 0.0008676820434629917, "rewards/format_reward": 0.3541666679084301, "step": 82 }, { "advantage_max": 1.7299436926841736, "advantage_mean": 3.7252894102834944e-09, "advantage_min": -0.9761759266257286, "advantage_std": 0.9997840225696564, "completion_length": 2715.5416946411133, "epoch": 0.09485714285714286, "grad_norm": 0.24527642130851746, "kl": 0.001336120069026947, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.881105062929221e-07, "loss": 0.0001, "reward": -0.06252476340159774, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06252476340159774, "reward_after_std": 0.6791763417422771, "reward_before_mean": 0.0788705749437213, "reward_before_std": 0.6742656137794256, "reward_change_max": 0.0, "reward_change_mean": -0.1413953397423029, "reward_change_min": -0.27979685738682747, "reward_change_std": 0.10586337419226766, "reward_std": 0.6791763938963413, "rewards/cosine_scaled_reward": -0.13764804881066084, "rewards/format_reward": 0.3541666679084301, "step": 83 }, { "advantage_max": 1.5118530690670013, "advantage_mean": 2.669791410170319e-08, "advantage_min": -1.1131732016801834, "advantage_std": 0.9997848272323608, "completion_length": 3013.6250610351562, "epoch": 0.096, "grad_norm": 0.19372142851352692, "kl": 0.0005471706390380859, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.09955209773033857, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.09955209773033857, "reward_after_std": 0.7380244378000498, "reward_before_mean": 0.28240909799933434, "reward_before_std": 0.7860660124570131, "reward_change_max": 0.0006183013319969177, "reward_change_mean": -0.18285699002444744, "reward_change_min": -0.39008102752268314, "reward_change_std": 0.15990980505011976, "reward_std": 0.7380244564265013, "rewards/cosine_scaled_reward": -0.056712113320827484, "rewards/format_reward": 0.3958333395421505, "step": 84 }, { "advantage_max": 1.696160763502121, "advantage_mean": 3.0423204844254315e-08, "advantage_min": -0.9830008372664452, "advantage_std": 0.9998309686779976, "completion_length": 2940.6875610351562, "epoch": 0.09714285714285714, "grad_norm": 0.1844056099653244, "kl": 0.0002780407667160034, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": 0.025807244703173637, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.025807244703173637, "reward_after_std": 0.8476546891033649, "reward_before_mean": 0.1735303965397179, "reward_before_std": 0.846455778926611, "reward_change_max": 0.00022524595260620117, "reward_change_mean": -0.14772315858863294, "reward_change_min": -0.2896654698997736, "reward_change_std": 0.11199493327876553, "reward_std": 0.8476547226309776, "rewards/cosine_scaled_reward": -0.1424014689400792, "rewards/format_reward": 0.4583333395421505, "step": 85 }, { "advantage_max": 1.5399595648050308, "advantage_mean": 2.98023239975187e-08, "advantage_min": -1.1645502522587776, "advantage_std": 0.9998061656951904, "completion_length": 2697.1666946411133, "epoch": 0.09828571428571428, "grad_norm": 0.22625777125358582, "kl": 0.0011822879314422607, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.17234379425644875, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17234379425644875, "reward_after_std": 0.6737720742821693, "reward_before_mean": 0.37005191296339035, "reward_before_std": 0.6888385564088821, "reward_change_max": 0.00036728382110595703, "reward_change_mean": -0.19770813873037696, "reward_change_min": -0.3463681824505329, "reward_change_std": 0.14334939466789365, "reward_std": 0.6737720891833305, "rewards/cosine_scaled_reward": -0.033724045380949974, "rewards/format_reward": 0.4375000074505806, "step": 86 }, { "advantage_max": 1.5217802822589874, "advantage_mean": 5.316300510926908e-09, "advantage_min": -1.1270632445812225, "advantage_std": 0.9998283088207245, "completion_length": 2674.1875610351562, "epoch": 0.09942857142857142, "grad_norm": 0.20976316928863525, "kl": 0.001379743218421936, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": 0.2193904248997569, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2193904248997569, "reward_after_std": 0.77983483299613, "reward_before_mean": 0.4205781789496541, "reward_before_std": 0.8077935613691807, "reward_change_max": 0.0005700737237930298, "reward_change_mean": -0.20118775311857462, "reward_change_min": -0.38294284231960773, "reward_change_std": 0.16150949569419026, "reward_std": 0.7798348590731621, "rewards/cosine_scaled_reward": -0.03971092030405998, "rewards/format_reward": 0.5000000111758709, "step": 87 }, { "advantage_max": 1.6352401971817017, "advantage_mean": -6.208817238118058e-09, "advantage_min": -1.065637744963169, "advantage_std": 0.9998790919780731, "completion_length": 2831.937545776367, "epoch": 0.10057142857142858, "grad_norm": 0.2115127146244049, "kl": 0.001442551612854004, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.37121852952986956, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37121852952986956, "reward_after_std": 1.0690153501927853, "reward_before_mean": 0.5801740437746048, "reward_before_std": 1.098650336265564, "reward_change_max": 0.00029071420431137085, "reward_change_mean": -0.20895551680587232, "reward_change_min": -0.4042000826448202, "reward_change_std": 0.16902310587465763, "reward_std": 1.0690153948962688, "rewards/cosine_scaled_reward": 0.0505036786198616, "rewards/format_reward": 0.4791666828095913, "step": 88 }, { "advantage_max": 1.4792246967554092, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -1.1064741685986519, "advantage_std": 0.9998375922441483, "completion_length": 3161.2709350585938, "epoch": 0.10171428571428572, "grad_norm": 0.1943075805902481, "kl": 0.0011938810348510742, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": 0.043435624334961176, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.043435624334961176, "reward_after_std": 0.7929012589156628, "reward_before_mean": 0.20786358881741762, "reward_before_std": 0.8400050476193428, "reward_change_max": 0.0006680861115455627, "reward_change_mean": -0.16442797426134348, "reward_change_min": -0.3740697056055069, "reward_change_std": 0.1559822354465723, "reward_std": 0.7929012849926949, "rewards/cosine_scaled_reward": -0.07315154653042555, "rewards/format_reward": 0.35416667349636555, "step": 89 }, { "advantage_max": 1.7707126438617706, "advantage_mean": -1.707424912567035e-08, "advantage_min": -0.8974402844905853, "advantage_std": 0.9997777789831161, "completion_length": 2409.062545776367, "epoch": 0.10285714285714286, "grad_norm": 0.24570851027965546, "kl": 0.0012853443622589111, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": -0.05016228114254773, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05016228114254773, "reward_after_std": 0.6008680164813995, "reward_before_mean": 0.0965384691953659, "reward_before_std": 0.5710118785500526, "reward_change_max": 0.0, "reward_change_mean": -0.14670075592584908, "reward_change_min": -0.2537856996059418, "reward_change_std": 0.09590160532388836, "reward_std": 0.6008680239319801, "rewards/cosine_scaled_reward": -0.22256410913541913, "rewards/format_reward": 0.5416666716337204, "step": 90 }, { "advantage_max": 1.4991742223501205, "advantage_mean": 2.8560559584001055e-08, "advantage_min": -1.0803657919168472, "advantage_std": 0.9998156875371933, "completion_length": 2925.812545776367, "epoch": 0.104, "grad_norm": 0.18674921989440918, "kl": 0.000809013843536377, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": -0.02295660600066185, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.02295660600066185, "reward_after_std": 0.8140906505286694, "reward_before_mean": 0.12554599717259407, "reward_before_std": 0.8680821731686592, "reward_change_max": 0.0007210969924926758, "reward_change_mean": -0.14850260131061077, "reward_change_min": -0.37167999893426895, "reward_change_std": 0.1564339753240347, "reward_std": 0.8140906654298306, "rewards/cosine_scaled_reward": -0.13514367304742336, "rewards/format_reward": 0.3958333358168602, "step": 91 }, { "advantage_max": 1.5517508536577225, "advantage_mean": -4.967053879312289e-09, "advantage_min": -1.1872400864958763, "advantage_std": 0.9997856393456459, "completion_length": 2539.3750381469727, "epoch": 0.10514285714285715, "grad_norm": 0.33332663774490356, "kl": 0.0048322901129722595, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.807937738894303e-07, "loss": 0.0002, "reward": 0.1360047198832035, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1360047198832035, "reward_after_std": 0.7280963324010372, "reward_before_mean": 0.3214022181928158, "reward_before_std": 0.7474944517016411, "reward_change_max": 0.0, "reward_change_mean": -0.18539749644696712, "reward_change_min": -0.3729815445840359, "reward_change_std": 0.14568164059892297, "reward_std": 0.7280963659286499, "rewards/cosine_scaled_reward": -0.08929889462888241, "rewards/format_reward": 0.5000000074505806, "step": 92 }, { "advantage_max": 1.489373043179512, "advantage_mean": 4.0357312269811985e-08, "advantage_min": -1.0695101916790009, "advantage_std": 0.9997303858399391, "completion_length": 3477.4791870117188, "epoch": 0.10628571428571429, "grad_norm": 0.19943885505199432, "kl": 0.001136481761932373, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.798752629550546e-07, "loss": 0.0, "reward": -0.5609270744025707, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5609270744025707, "reward_after_std": 0.39100511744618416, "reward_before_mean": -0.4983716458082199, "reward_before_std": 0.415749229490757, "reward_change_max": 0.0004532933235168457, "reward_change_mean": -0.06255542347207665, "reward_change_min": -0.15995178557932377, "reward_change_std": 0.06816363241523504, "reward_std": 0.39100512489676476, "rewards/cosine_scaled_reward": -0.270019156858325, "rewards/format_reward": 0.0416666679084301, "step": 93 }, { "advantage_max": 1.7367701828479767, "advantage_mean": 4.1599076627818476e-08, "advantage_min": -0.9185703918337822, "advantage_std": 0.9997115805745125, "completion_length": 3026.5000228881836, "epoch": 0.10742857142857143, "grad_norm": 0.22087015211582184, "kl": 0.002016555517911911, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": -0.1684376262128353, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1684376262128353, "reward_after_std": 0.373518081381917, "reward_before_mean": -0.026155206374824047, "reward_before_std": 0.34390043281018734, "reward_change_max": 0.0008780360221862793, "reward_change_mean": -0.14228241797536612, "reward_change_min": -0.23626244626939297, "reward_change_std": 0.09374386863783002, "reward_std": 0.37351808696985245, "rewards/cosine_scaled_reward": -0.12766093760728836, "rewards/format_reward": 0.2291666679084301, "step": 94 }, { "advantage_max": 1.590524211525917, "advantage_mean": 1.2914340219438714e-07, "advantage_min": -1.0176689475774765, "advantage_std": 0.9993797987699509, "completion_length": 3369.4166870117188, "epoch": 0.10857142857142857, "grad_norm": 0.15471990406513214, "kl": 0.0005910694599151611, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.41158403269946575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.41158403269946575, "reward_after_std": 0.4722477972973138, "reward_before_mean": -0.32287996634840965, "reward_before_std": 0.49264019494876266, "reward_change_max": 0.0006460621953010559, "reward_change_mean": -0.08870406670030206, "reward_change_min": -0.20848171971738338, "reward_change_std": 0.08687506098067388, "reward_std": 0.47224780498072505, "rewards/cosine_scaled_reward": -0.25518998503685, "rewards/format_reward": 0.1875, "step": 95 }, { "advantage_max": 1.724830448627472, "advantage_mean": -2.483526162500027e-09, "advantage_min": -0.9390768259763718, "advantage_std": 0.9998311027884483, "completion_length": 2654.375045776367, "epoch": 0.10971428571428571, "grad_norm": 0.2167777717113495, "kl": 0.001465141773223877, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": 0.1042107567191124, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1042107567191124, "reward_after_std": 0.770032986998558, "reward_before_mean": 0.2759747635573149, "reward_before_std": 0.7717161737382412, "reward_change_max": 0.0006542354822158813, "reward_change_mean": -0.17176401265896857, "reward_change_min": -0.3010180573910475, "reward_change_std": 0.12796866125427186, "reward_std": 0.7700330205261707, "rewards/cosine_scaled_reward": -0.05992929823696613, "rewards/format_reward": 0.39583333767950535, "step": 96 }, { "advantage_max": 1.6630063951015472, "advantage_mean": 4.03573128249235e-08, "advantage_min": -1.0467227101325989, "advantage_std": 0.9998231157660484, "completion_length": 3095.3958892822266, "epoch": 0.11085714285714286, "grad_norm": 0.17923100292682648, "kl": 0.0008588209748268127, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": 0.04914311692118645, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04914311692118645, "reward_after_std": 0.7510417252779007, "reward_before_mean": 0.2142169401049614, "reward_before_std": 0.7740973867475986, "reward_change_max": 0.0006477609276771545, "reward_change_mean": -0.165073798969388, "reward_change_min": -0.32045017182826996, "reward_change_std": 0.13384919241070747, "reward_std": 0.7510417327284813, "rewards/cosine_scaled_reward": -0.0699748694896698, "rewards/format_reward": 0.35416668094694614, "step": 97 }, { "advantage_max": 1.6298557221889496, "advantage_mean": 8.692344732885715e-09, "advantage_min": -1.0451266095042229, "advantage_std": 0.9997927322983742, "completion_length": 2451.7708892822266, "epoch": 0.112, "grad_norm": 0.17727041244506836, "kl": 0.0005920976400375366, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": 0.1988861383870244, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1988861383870244, "reward_after_std": 0.5740308053791523, "reward_before_mean": 0.40678128972649574, "reward_before_std": 0.5631402656435966, "reward_change_max": 0.0012294650077819824, "reward_change_mean": -0.2078951743314974, "reward_change_min": -0.37305937707424164, "reward_change_std": 0.143737900769338, "reward_std": 0.5740308240056038, "rewards/cosine_scaled_reward": -0.09869268629699945, "rewards/format_reward": 0.6041666753590107, "step": 98 }, { "advantage_max": 1.5828326791524887, "advantage_mean": -8.537124052132583e-08, "advantage_min": -1.1269954815506935, "advantage_std": 0.99972303211689, "completion_length": 2781.875, "epoch": 0.11314285714285714, "grad_norm": 0.24164098501205444, "kl": 0.0007622838020324707, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": -0.023720188066363335, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.023720188066363335, "reward_after_std": 0.46708662807941437, "reward_before_mean": 0.14562718383967876, "reward_before_std": 0.4578824061900377, "reward_change_max": 0.00011779367923736572, "reward_change_mean": -0.1693474086932838, "reward_change_min": -0.3044974785298109, "reward_change_std": 0.12167800427414477, "reward_std": 0.46708663180470467, "rewards/cosine_scaled_reward": -0.0834364052861929, "rewards/format_reward": 0.31250000186264515, "step": 99 }, { "advantage_max": 1.570393443107605, "advantage_mean": 4.035731249185659e-08, "advantage_min": -1.19875980168581, "advantage_std": 0.9998155683279037, "completion_length": 2425.1041870117188, "epoch": 0.11428571428571428, "grad_norm": 0.18487881124019623, "kl": 0.001100778579711914, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.4371534734964371, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4371534734964371, "reward_after_std": 0.7649065852165222, "reward_before_mean": 0.6844560205936432, "reward_before_std": 0.7754265181720257, "reward_change_max": 0.0013605356216430664, "reward_change_mean": -0.24730252707377076, "reward_change_min": -0.4665789268910885, "reward_change_std": 0.17839907761663198, "reward_std": 0.7649066299200058, "rewards/cosine_scaled_reward": 0.0505613349378109, "rewards/format_reward": 0.5833333488553762, "step": 100 }, { "advantage_max": 1.6452988535165787, "advantage_mean": 2.2351742234860694e-08, "advantage_min": -0.959521122276783, "advantage_std": 0.9998143017292023, "completion_length": 2734.4166870117188, "epoch": 0.11542857142857142, "grad_norm": 0.19890400767326355, "kl": 0.0011498034000396729, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": 0.10417449288070202, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10417449288070202, "reward_after_std": 0.6690843030810356, "reward_before_mean": 0.28429789654910564, "reward_before_std": 0.6643826402723789, "reward_change_max": 0.0015150904655456543, "reward_change_mean": -0.18012344324961305, "reward_change_min": -0.3563338704407215, "reward_change_std": 0.13597459299489856, "reward_std": 0.669084332883358, "rewards/cosine_scaled_reward": -0.07660102914087474, "rewards/format_reward": 0.4375000074505806, "step": 101 }, { "advantage_max": 1.628109648823738, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -1.130338653922081, "advantage_std": 0.9998352602124214, "completion_length": 2244.0833892822266, "epoch": 0.11657142857142858, "grad_norm": 0.35045328736305237, "kl": 0.0018877387046813965, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": 0.23090652655810118, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23090652655810118, "reward_after_std": 0.8405257500708103, "reward_before_mean": 0.426285058259964, "reward_before_std": 0.8548384718596935, "reward_change_max": 0.0013198107481002808, "reward_change_mean": -0.1953785545192659, "reward_change_min": -0.3686686437577009, "reward_change_std": 0.1524599560070783, "reward_std": 0.8405257761478424, "rewards/cosine_scaled_reward": -0.10977413924410939, "rewards/format_reward": 0.6458333432674408, "step": 102 }, { "advantage_max": 1.3960938453674316, "advantage_mean": 2.6077032311278003e-08, "advantage_min": -1.2053559049963951, "advantage_std": 0.9997885152697563, "completion_length": 2521.833396911621, "epoch": 0.11771428571428572, "grad_norm": 0.278216689825058, "kl": 0.001301884651184082, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.695457105469804e-07, "loss": 0.0001, "reward": 0.060690226033329964, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.060690226033329964, "reward_after_std": 0.6560997925698757, "reward_before_mean": 0.24024609848856926, "reward_before_std": 0.7025391049683094, "reward_change_max": 0.0, "reward_change_mean": -0.1795558724552393, "reward_change_min": -0.3511783704161644, "reward_change_std": 0.15032944874837995, "reward_std": 0.6560998372733593, "rewards/cosine_scaled_reward": -0.11946028470993042, "rewards/format_reward": 0.47916666977107525, "step": 103 }, { "advantage_max": 1.5210340768098831, "advantage_mean": 1.9868215073159945e-08, "advantage_min": -1.1603004932403564, "advantage_std": 0.9997299611568451, "completion_length": 2693.020835876465, "epoch": 0.11885714285714286, "grad_norm": 0.22005805373191833, "kl": 0.0020142793655395508, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.683994186497132e-07, "loss": 0.0001, "reward": -0.08458196744322777, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08458196744322777, "reward_after_std": 0.47468145191669464, "reward_before_mean": 0.07341913506388664, "reward_before_std": 0.4918606597930193, "reward_change_max": 0.00020316988229751587, "reward_change_mean": -0.15800110436975956, "reward_change_min": -0.26773341558873653, "reward_change_std": 0.11250392789952457, "reward_std": 0.47468145936727524, "rewards/cosine_scaled_reward": -0.12995710503309965, "rewards/format_reward": 0.3333333358168602, "step": 104 }, { "advantage_max": 1.5753152966499329, "advantage_mean": 2.4835264955669345e-09, "advantage_min": -1.0181199088692665, "advantage_std": 0.99983299523592, "completion_length": 2375.0416870117188, "epoch": 0.12, "grad_norm": 0.21237428486347198, "kl": 0.0008479952812194824, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.2439766377210617, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2439766377210617, "reward_after_std": 0.779191005975008, "reward_before_mean": 0.44646067079156637, "reward_before_std": 0.785377386957407, "reward_change_max": 0.0, "reward_change_mean": -0.2024840395897627, "reward_change_min": -0.4037261363118887, "reward_change_std": 0.15447102207690477, "reward_std": 0.7791910246014595, "rewards/cosine_scaled_reward": -0.037186328787356615, "rewards/format_reward": 0.5208333414047956, "step": 105 }, { "advantage_max": 1.7184691429138184, "advantage_mean": -6.51925804451281e-08, "advantage_min": -1.036706604063511, "advantage_std": 0.9998025968670845, "completion_length": 2241.833381652832, "epoch": 0.12114285714285715, "grad_norm": 0.1902162879705429, "kl": 0.0011308789253234863, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.66045715125541e-07, "loss": 0.0, "reward": 0.7149951979517937, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7149951979517937, "reward_after_std": 0.6896206475794315, "reward_before_mean": 1.0211203414946795, "reward_before_std": 0.6423603612929583, "reward_change_max": 0.0, "reward_change_mean": -0.3061251426115632, "reward_change_min": -0.5069556701928377, "reward_change_std": 0.19168227072805166, "reward_std": 0.6896206885576248, "rewards/cosine_scaled_reward": 0.17722682980820537, "rewards/format_reward": 0.6666666734963655, "step": 106 }, { "advantage_max": 1.3700329214334488, "advantage_mean": 3.2285850992685994e-08, "advantage_min": -1.3256009072065353, "advantage_std": 0.9997893422842026, "completion_length": 2744.166717529297, "epoch": 0.12228571428571429, "grad_norm": 0.20612262189388275, "kl": 0.0011415481567382812, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.06857027532532811, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.06857027532532811, "reward_after_std": 0.6631270665675402, "reward_before_mean": 0.252403543330729, "reward_before_std": 0.7201212346553802, "reward_change_max": 0.0002407953143119812, "reward_change_mean": -0.1838332605548203, "reward_change_min": -0.3560641389340162, "reward_change_std": 0.15651204530149698, "reward_std": 0.663127088919282, "rewards/cosine_scaled_reward": -0.10296489670872688, "rewards/format_reward": 0.4583333507180214, "step": 107 }, { "advantage_max": 1.5325934290885925, "advantage_mean": 6.208818126296478e-09, "advantage_min": -1.0848430544137955, "advantage_std": 0.9998377189040184, "completion_length": 2616.3333740234375, "epoch": 0.12342857142857143, "grad_norm": 0.32614660263061523, "kl": 0.0012401752173900604, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "reward": 0.057555489242076874, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.057555489242076874, "reward_after_std": 0.9342328980565071, "reward_before_mean": 0.21481921151280403, "reward_before_std": 0.9939168691635132, "reward_change_max": 0.0006889700889587402, "reward_change_mean": -0.15726373670622706, "reward_change_min": -0.38562384992837906, "reward_change_std": 0.16911761928349733, "reward_std": 0.9342329241335392, "rewards/cosine_scaled_reward": -0.11134038865566254, "rewards/format_reward": 0.4375000149011612, "step": 108 }, { "advantage_max": 1.717758134007454, "advantage_mean": 2.7008355329982692e-08, "advantage_min": -0.9031755030155182, "advantage_std": 0.9998158439993858, "completion_length": 3081.729217529297, "epoch": 0.12457142857142857, "grad_norm": 0.18283237516880035, "kl": 0.0006889104843139648, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": 0.031152330338954926, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.031152330338954926, "reward_after_std": 0.738107368350029, "reward_before_mean": 0.1895277239382267, "reward_before_std": 0.7382989898324013, "reward_change_max": 0.0006342306733131409, "reward_change_mean": -0.1583753984887153, "reward_change_min": -0.3311379738152027, "reward_change_std": 0.12449920130893588, "reward_std": 0.7381073944270611, "rewards/cosine_scaled_reward": -0.1031528078019619, "rewards/format_reward": 0.3958333358168602, "step": 109 }, { "advantage_max": 1.5491592735052109, "advantage_mean": 1.350417788703595e-08, "advantage_min": -1.104156732559204, "advantage_std": 0.9997999370098114, "completion_length": 2665.6250228881836, "epoch": 0.12571428571428572, "grad_norm": 0.24647559225559235, "kl": 0.0017870888113975525, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, "reward": 0.06417747336672619, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06417747336672619, "reward_after_std": 0.6984993778169155, "reward_before_mean": 0.23563515860587358, "reward_before_std": 0.7118737921118736, "reward_change_max": 0.00017995387315750122, "reward_change_mean": -0.1714576887898147, "reward_change_min": -0.33860295079648495, "reward_change_std": 0.13551720790565014, "reward_std": 0.698499396443367, "rewards/cosine_scaled_reward": -0.12176575046032667, "rewards/format_reward": 0.4791666716337204, "step": 110 }, { "advantage_max": 1.6004591435194016, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.9542756676673889, "advantage_std": 0.9998093247413635, "completion_length": 2937.3750610351562, "epoch": 0.12685714285714286, "grad_norm": 0.21009349822998047, "kl": 0.0018000602722167969, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": -0.1473899253178388, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1473899253178388, "reward_after_std": 0.7092999331653118, "reward_before_mean": -0.022016488015651703, "reward_before_std": 0.7311879619956017, "reward_change_max": 0.001234203577041626, "reward_change_mean": -0.12537344172596931, "reward_change_min": -0.3080690782517195, "reward_change_std": 0.1185176195576787, "reward_std": 0.7092999368906021, "rewards/cosine_scaled_reward": -0.16725825425237417, "rewards/format_reward": 0.3125000037252903, "step": 111 }, { "advantage_max": 1.58974190056324, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -1.0684397667646408, "advantage_std": 0.9997754022479057, "completion_length": 2916.3333740234375, "epoch": 0.128, "grad_norm": 0.17282706499099731, "kl": 0.0009534507989883423, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": 0.047515214420855045, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.047515214420855045, "reward_after_std": 0.6808386445045471, "reward_before_mean": 0.21764793386682868, "reward_before_std": 0.6999966204166412, "reward_change_max": 0.0, "reward_change_mean": -0.17013270873576403, "reward_change_min": -0.32235524989664555, "reward_change_std": 0.1295095095410943, "reward_std": 0.6808386482298374, "rewards/cosine_scaled_reward": -0.0786760482005775, "rewards/format_reward": 0.37500000558793545, "step": 112 }, { "advantage_max": 1.5703244507312775, "advantage_mean": 1.3659398057086491e-08, "advantage_min": -1.1096737533807755, "advantage_std": 0.999800406396389, "completion_length": 2577.0208892822266, "epoch": 0.12914285714285714, "grad_norm": 0.2460978776216507, "kl": 0.003580331802368164, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.23878616420552135, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23878616420552135, "reward_after_std": 0.7030831761658192, "reward_before_mean": 0.44820527732372284, "reward_before_std": 0.7209436502307653, "reward_change_max": 0.0, "reward_change_mean": -0.20941910333931446, "reward_change_min": -0.39444285817444324, "reward_change_std": 0.15796104352921247, "reward_std": 0.703083198517561, "rewards/cosine_scaled_reward": -0.036314038559794426, "rewards/format_reward": 0.5208333432674408, "step": 113 }, { "advantage_max": 1.7269503027200699, "advantage_mean": 3.725291630729544e-09, "advantage_min": -0.9455199539661407, "advantage_std": 0.999740943312645, "completion_length": 2578.437511444092, "epoch": 0.13028571428571428, "grad_norm": 0.19760534167289734, "kl": 0.003066539764404297, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": 0.01859831716865301, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.01859831716865301, "reward_after_std": 0.4981700386852026, "reward_before_mean": 0.19175568968057632, "reward_before_std": 0.4783357158303261, "reward_change_max": 0.0004209578037261963, "reward_change_mean": -0.1731573868310079, "reward_change_min": -0.2967113181948662, "reward_change_std": 0.11035828175954521, "reward_std": 0.49817005544900894, "rewards/cosine_scaled_reward": -0.20620550867170095, "rewards/format_reward": 0.6041666772216558, "step": 114 }, { "advantage_max": 1.5616178661584854, "advantage_mean": 3.5390257835388184e-08, "advantage_min": -1.0633350536227226, "advantage_std": 0.9997276589274406, "completion_length": 2888.2916717529297, "epoch": 0.13142857142857142, "grad_norm": 0.24375756084918976, "kl": 0.002779722213745117, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": -0.1527273915708065, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1527273915708065, "reward_after_std": 0.5216103941202164, "reward_before_mean": -0.013282615691423416, "reward_before_std": 0.532561769708991, "reward_change_max": 0.0011752843856811523, "reward_change_mean": -0.1394447716884315, "reward_change_min": -0.2619510591030121, "reward_change_std": 0.10499659506604075, "reward_std": 0.5216104164719582, "rewards/cosine_scaled_reward": -0.13164131715893745, "rewards/format_reward": 0.25, "step": 115 }, { "advantage_max": 1.7398159205913544, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.8426778316497803, "advantage_std": 0.9997789934277534, "completion_length": 3405.0208435058594, "epoch": 0.13257142857142856, "grad_norm": 0.1686706840991974, "kl": 0.0017733573913574219, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.3314667074009776, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3314667074009776, "reward_after_std": 0.7026853635907173, "reward_before_mean": -0.2483118800446391, "reward_before_std": 0.7108087986707687, "reward_change_max": 0.0009364485740661621, "reward_change_mean": -0.08315482863690704, "reward_change_min": -0.20761460438370705, "reward_change_std": 0.08251205913256854, "reward_std": 0.7026854008436203, "rewards/cosine_scaled_reward": -0.18665594549383968, "rewards/format_reward": 0.1250000037252903, "step": 116 }, { "advantage_max": 1.6006111353635788, "advantage_mean": -2.7318796780306798e-08, "advantage_min": -1.041724719107151, "advantage_std": 0.999773383140564, "completion_length": 2680.083366394043, "epoch": 0.1337142857142857, "grad_norm": 0.18357373774051666, "kl": 0.0026030540466308594, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": 0.012444857507944107, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.012444857507944107, "reward_after_std": 0.6517674252390862, "reward_before_mean": 0.17331765219569206, "reward_before_std": 0.6458356343209743, "reward_change_max": 0.0004078969359397888, "reward_change_mean": -0.16087282774969935, "reward_change_min": -0.3154468797147274, "reward_change_std": 0.11643311567604542, "reward_std": 0.6517674289643764, "rewards/cosine_scaled_reward": -0.10084117203950882, "rewards/format_reward": 0.37500000558793545, "step": 117 }, { "advantage_max": 1.4670916646718979, "advantage_mean": 1.4280280180578586e-08, "advantage_min": -1.1820987686514854, "advantage_std": 0.999840185046196, "completion_length": 2915.9166870117188, "epoch": 0.13485714285714287, "grad_norm": 0.17332710325717926, "kl": 0.0017366409301757812, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.2379293106496334, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2379293106496334, "reward_after_std": 0.8679434321820736, "reward_before_mean": 0.43904879316687584, "reward_before_std": 0.916411992162466, "reward_change_max": 0.00032275915145874023, "reward_change_mean": -0.20111947320401669, "reward_change_min": -0.4205525293946266, "reward_change_std": 0.17429276509210467, "reward_std": 0.8679434508085251, "rewards/cosine_scaled_reward": 0.0007743909955024719, "rewards/format_reward": 0.4375000111758709, "step": 118 }, { "advantage_max": 1.5654396712779999, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -1.0977480113506317, "advantage_std": 0.9998264610767365, "completion_length": 2394.6459045410156, "epoch": 0.136, "grad_norm": 0.27868223190307617, "kl": 0.0041304826736450195, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.487916106540465e-07, "loss": 0.0002, "reward": 0.29523699606215814, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.29523699606215814, "reward_after_std": 0.744273629039526, "reward_before_mean": 0.5135078746825457, "reward_before_std": 0.7622634246945381, "reward_change_max": 0.0, "reward_change_mean": -0.21827087132260203, "reward_change_min": -0.4341272786259651, "reward_change_std": 0.16735805850476027, "reward_std": 0.7442736364901066, "rewards/cosine_scaled_reward": -0.03491273708641529, "rewards/format_reward": 0.5833333376795053, "step": 119 }, { "advantage_max": 1.7137190848588943, "advantage_mean": -2.23517424569053e-08, "advantage_min": -1.0148278772830963, "advantage_std": 0.9998286962509155, "completion_length": 2304.145866394043, "epoch": 0.13714285714285715, "grad_norm": 0.2446976751089096, "kl": 0.0027112960815429688, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": 0.15172169636934996, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15172169636934996, "reward_after_std": 0.6846943460404873, "reward_before_mean": 0.33898145519196987, "reward_before_std": 0.6702734418213367, "reward_change_max": 0.0, "reward_change_mean": -0.1872597902547568, "reward_change_min": -0.31060235388576984, "reward_change_std": 0.12618307769298553, "reward_std": 0.6846943572163582, "rewards/cosine_scaled_reward": -0.10134260216727853, "rewards/format_reward": 0.5416666753590107, "step": 120 }, { "advantage_max": 1.691588580608368, "advantage_mean": -5.836288297089709e-08, "advantage_min": -1.034871518611908, "advantage_std": 0.9998171031475067, "completion_length": 1725.9375381469727, "epoch": 0.1382857142857143, "grad_norm": 0.2284398376941681, "kl": 0.0031642913818359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.421614283695817, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.421614283695817, "reward_after_std": 0.678892083466053, "reward_before_mean": 0.6639131233096123, "reward_before_std": 0.6397330313920975, "reward_change_max": 0.0, "reward_change_mean": -0.2422988973557949, "reward_change_min": -0.4020249769091606, "reward_change_std": 0.15020818077027798, "reward_std": 0.6788921020925045, "rewards/cosine_scaled_reward": -0.043043429497629404, "rewards/format_reward": 0.7500000111758709, "step": 121 }, { "advantage_max": 1.5243453085422516, "advantage_mean": -1.2417634476236117e-08, "advantage_min": -1.0929933041334152, "advantage_std": 0.999860443174839, "completion_length": 2846.1458740234375, "epoch": 0.13942857142857143, "grad_norm": 0.2239234298467636, "kl": 0.0016736984252929688, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": 0.3787113861180842, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.3787113861180842, "reward_after_std": 1.0122371204197407, "reward_before_mean": 0.6001707105897367, "reward_before_std": 1.072056818753481, "reward_change_max": 0.00045921653509140015, "reward_change_mean": -0.22145933331921697, "reward_change_min": -0.45815131813287735, "reward_change_std": 0.19878085469827056, "reward_std": 1.0122371390461922, "rewards/cosine_scaled_reward": 0.060502004344016314, "rewards/format_reward": 0.4791666753590107, "step": 122 }, { "advantage_max": 1.5138305127620697, "advantage_mean": 6.208817793229571e-09, "advantage_min": -1.2074719741940498, "advantage_std": 0.9997866898775101, "completion_length": 2553.125045776367, "epoch": 0.14057142857142857, "grad_norm": 0.21360571682453156, "kl": 0.0025411248207092285, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": 0.042357919504866004, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.042357919504866004, "reward_after_std": 0.636686947196722, "reward_before_mean": 0.21523546800017357, "reward_before_std": 0.661348432302475, "reward_change_max": 0.00020268559455871582, "reward_change_mean": -0.17287754639983177, "reward_change_min": -0.32666971161961555, "reward_change_std": 0.13960499968379736, "reward_std": 0.6366869881749153, "rewards/cosine_scaled_reward": -0.15279894787818193, "rewards/format_reward": 0.5208333395421505, "step": 123 }, { "advantage_max": 1.5655941367149353, "advantage_mean": -1.641456082168702e-08, "advantage_min": -0.9814046025276184, "advantage_std": 0.999843031167984, "completion_length": 2003.0000381469727, "epoch": 0.1417142857142857, "grad_norm": 0.20767082273960114, "kl": 0.004428386688232422, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.412727182773486e-07, "loss": 0.0002, "reward": 0.500551930628717, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.500551930628717, "reward_after_std": 0.8857000321149826, "reward_before_mean": 0.7559270821511745, "reward_before_std": 0.9190114215016365, "reward_change_max": 0.0, "reward_change_mean": -0.2553751552477479, "reward_change_min": -0.5128838941454887, "reward_change_std": 0.2021732535213232, "reward_std": 0.8857000656425953, "rewards/cosine_scaled_reward": -0.007453134283423424, "rewards/format_reward": 0.7708333414047956, "step": 124 }, { "advantage_max": 1.6035060584545135, "advantage_mean": 6.705522670458208e-08, "advantage_min": -1.1286407485604286, "advantage_std": 0.9997504726052284, "completion_length": 2833.958335876465, "epoch": 0.14285714285714285, "grad_norm": 0.15640655159950256, "kl": 0.0021719932556152344, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": 0.16567879915237427, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16567879915237427, "reward_after_std": 0.5571850594133139, "reward_before_mean": 0.36717021465301514, "reward_before_std": 0.5443516932427883, "reward_change_max": 0.0002313479781150818, "reward_change_mean": -0.20149140153080225, "reward_change_min": -0.3646851126104593, "reward_change_std": 0.14025269588455558, "reward_std": 0.5571850873529911, "rewards/cosine_scaled_reward": 0.02733509987592697, "rewards/format_reward": 0.3125, "step": 125 }, { "advantage_max": 1.5312575846910477, "advantage_mean": 1.1486311707331609e-08, "advantage_min": -1.2798721194267273, "advantage_std": 0.9998003914952278, "completion_length": 2863.5208892822266, "epoch": 0.144, "grad_norm": 0.22090090811252594, "kl": 0.0013557672500610352, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": 0.0737285241484642, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0737285241484642, "reward_after_std": 0.7071263864636421, "reward_before_mean": 0.25064082257449627, "reward_before_std": 0.7428994365036488, "reward_change_max": 0.0, "reward_change_mean": -0.17691227421164513, "reward_change_min": -0.35780435614287853, "reward_change_std": 0.14994592033326626, "reward_std": 0.707126397639513, "rewards/cosine_scaled_reward": -0.09342960081994534, "rewards/format_reward": 0.4375000149011612, "step": 126 }, { "advantage_max": 1.4340872317552567, "advantage_mean": -6.208817349140361e-09, "advantage_min": -1.183464154601097, "advantage_std": 0.9997255057096481, "completion_length": 3002.1041870117188, "epoch": 0.14514285714285713, "grad_norm": 0.16190996766090393, "kl": 0.0020918846130371094, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": -0.2233894734235946, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2233894734235946, "reward_after_std": 0.4971660189330578, "reward_before_mean": -0.0952535979449749, "reward_before_std": 0.5213297717273235, "reward_change_max": 0.0009558200836181641, "reward_change_mean": -0.12813590315636247, "reward_change_min": -0.26985314674675465, "reward_change_std": 0.10562613094225526, "reward_std": 0.4971660412847996, "rewards/cosine_scaled_reward": -0.2142934650182724, "rewards/format_reward": 0.3333333358168602, "step": 127 }, { "advantage_max": 1.4159886687994003, "advantage_mean": 4.3461721332960224e-08, "advantage_min": -1.182981289923191, "advantage_std": 0.9998199418187141, "completion_length": 2886.50004196167, "epoch": 0.1462857142857143, "grad_norm": 0.1824210286140442, "kl": 0.002896904945373535, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.2526640184223652, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2526640184223652, "reward_after_std": 0.7988780178129673, "reward_before_mean": 0.4635799862444401, "reward_before_std": 0.8548481166362762, "reward_change_max": 0.0006513744592666626, "reward_change_mean": -0.21091592963784933, "reward_change_min": -0.4153004623949528, "reward_change_std": 0.17933304305188358, "reward_std": 0.7988780252635479, "rewards/cosine_scaled_reward": 0.03387330658733845, "rewards/format_reward": 0.3958333432674408, "step": 128 }, { "advantage_max": 1.670387864112854, "advantage_mean": 2.0178656190417144e-08, "advantage_min": -0.9403666146099567, "advantage_std": 0.9997882694005966, "completion_length": 3401.6666870117188, "epoch": 0.14742857142857144, "grad_norm": 0.17621037364006042, "kl": 0.002868175506591797, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.26161413080990314, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.26161413080990314, "reward_after_std": 0.6878196895122528, "reward_before_mean": -0.1574427995365113, "reward_before_std": 0.7175455689430237, "reward_change_max": 0.0006723701953887939, "reward_change_mean": -0.10417133261216804, "reward_change_min": -0.2432860340923071, "reward_change_std": 0.10660239483695477, "reward_std": 0.6878196895122528, "rewards/cosine_scaled_reward": -0.1724714022129774, "rewards/format_reward": 0.1875000037252903, "step": 129 }, { "advantage_max": 1.6903793960809708, "advantage_mean": 2.0489096419495922e-08, "advantage_min": -0.9747004956007004, "advantage_std": 0.9998284503817558, "completion_length": 2804.6250381469727, "epoch": 0.14857142857142858, "grad_norm": 0.1896054595708847, "kl": 0.001954317092895508, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.028209278360009193, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.028209278360009193, "reward_after_std": 0.7870006188750267, "reward_before_mean": 0.11392226428142749, "reward_before_std": 0.7941737547516823, "reward_change_max": 0.0006090477108955383, "reward_change_mean": -0.14213155990000814, "reward_change_min": -0.3012427128851414, "reward_change_std": 0.1142815554048866, "reward_std": 0.7870006375014782, "rewards/cosine_scaled_reward": -0.10970553383231163, "rewards/format_reward": 0.3333333358168602, "step": 130 }, { "advantage_max": 1.5626876056194305, "advantage_mean": 5.960464544152444e-08, "advantage_min": -1.1384482607245445, "advantage_std": 0.9997840076684952, "completion_length": 2861.3958740234375, "epoch": 0.14971428571428572, "grad_norm": 0.20207345485687256, "kl": 0.0037682056427001953, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.299475664759068e-07, "loss": 0.0002, "reward": 0.3409400451928377, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3409400451928377, "reward_after_std": 0.6878578290343285, "reward_before_mean": 0.5735721625387669, "reward_before_std": 0.6877081072889268, "reward_change_max": 0.0010700449347496033, "reward_change_mean": -0.23263203352689743, "reward_change_min": -0.41999047063291073, "reward_change_std": 0.17137008626013994, "reward_std": 0.6878578290343285, "rewards/cosine_scaled_reward": 0.07845271937549114, "rewards/format_reward": 0.416666679084301, "step": 131 }, { "advantage_max": 1.629030168056488, "advantage_mean": 3.725290742551124e-09, "advantage_min": -1.0061270147562027, "advantage_std": 0.9997803717851639, "completion_length": 2558.562530517578, "epoch": 0.15085714285714286, "grad_norm": 0.20671804249286652, "kl": 0.002033710479736328, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.1808385867625475, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1808385867625475, "reward_after_std": 0.6454941257834435, "reward_before_mean": 0.3788890652358532, "reward_before_std": 0.6303308047354221, "reward_change_max": 0.0003954395651817322, "reward_change_mean": -0.1980504752136767, "reward_change_min": -0.36066864989697933, "reward_change_std": 0.14119471702724695, "reward_std": 0.6454941593110561, "rewards/cosine_scaled_reward": 0.0019445132929831743, "rewards/format_reward": 0.3750000037252903, "step": 132 }, { "advantage_max": 1.4284230470657349, "advantage_mean": 8.940697038273271e-08, "advantage_min": -1.192399837076664, "advantage_std": 0.9997306689620018, "completion_length": 3046.8958587646484, "epoch": 0.152, "grad_norm": 0.2751116454601288, "kl": 0.003151416778564453, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.26124978717416525, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.26124978717416525, "reward_after_std": 0.4831140823662281, "reward_before_mean": -0.13964797370135784, "reward_before_std": 0.5154471658170223, "reward_change_max": 0.0006668567657470703, "reward_change_mean": -0.12160179018974304, "reward_change_min": -0.25765814632177353, "reward_change_std": 0.10493774805217981, "reward_std": 0.48311409167945385, "rewards/cosine_scaled_reward": -0.215657327324152, "rewards/format_reward": 0.291666679084301, "step": 133 }, { "advantage_max": 1.650213047862053, "advantage_mean": 1.3969839257610417e-08, "advantage_min": -0.9398018196225166, "advantage_std": 0.9998574033379555, "completion_length": 2290.812545776367, "epoch": 0.15314285714285714, "grad_norm": 0.24014216661453247, "kl": 0.0033774375915527344, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": 0.2990442682057619, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2990442682057619, "reward_after_std": 0.9332752451300621, "reward_before_mean": 0.5012820335105062, "reward_before_std": 0.9426852278411388, "reward_change_max": 0.00036494433879852295, "reward_change_mean": -0.2022377629764378, "reward_change_min": -0.4220910370349884, "reward_change_std": 0.1617970857769251, "reward_std": 0.933275256305933, "rewards/cosine_scaled_reward": -0.07227565790526569, "rewards/format_reward": 0.6458333376795053, "step": 134 }, { "advantage_max": 1.740107610821724, "advantage_mean": 3.104402290787789e-10, "advantage_min": -0.9589984938502312, "advantage_std": 0.999826468527317, "completion_length": 2104.5834045410156, "epoch": 0.15428571428571428, "grad_norm": 0.2661309540271759, "kl": 0.00379180908203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.230669076497687e-07, "loss": 0.0002, "reward": 0.9028741903603077, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9028741903603077, "reward_after_std": 0.7912594079971313, "reward_before_mean": 1.2412361446768045, "reward_before_std": 0.7384288650937378, "reward_change_max": 0.00038820505142211914, "reward_change_mean": -0.33836194314062595, "reward_change_min": -0.5658552143722773, "reward_change_std": 0.2178718103095889, "reward_std": 0.7912594079971313, "rewards/cosine_scaled_reward": 0.30811806954443455, "rewards/format_reward": 0.6250000093132257, "step": 135 }, { "advantage_max": 1.5855561196804047, "advantage_mean": 1.5522043650406658e-08, "advantage_min": -1.045980878174305, "advantage_std": 0.9998907074332237, "completion_length": 2559.5833892822266, "epoch": 0.15542857142857142, "grad_norm": 0.2847113609313965, "kl": 0.003814697265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.213010742252327e-07, "loss": 0.0002, "reward": 0.37419047206640244, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.37419047206640244, "reward_after_std": 1.0629618167877197, "reward_before_mean": 0.5864962767809629, "reward_before_std": 1.1105365753173828, "reward_change_max": 0.0009570121765136719, "reward_change_mean": -0.21230582473799586, "reward_change_min": -0.4814386647194624, "reward_change_std": 0.19272688124328852, "reward_std": 1.0629618465900421, "rewards/cosine_scaled_reward": 0.03283148072659969, "rewards/format_reward": 0.5208333432674408, "step": 136 }, { "advantage_max": 1.5506631284952164, "advantage_mean": -3.476937715518602e-08, "advantage_min": -1.1965996623039246, "advantage_std": 0.9997732192277908, "completion_length": 2663.3750610351562, "epoch": 0.15657142857142858, "grad_norm": 0.15699128806591034, "kl": 0.0024433135986328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": 0.014663774520158768, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": 0.014663774520158768, "reward_after_std": 0.5894833207130432, "reward_before_mean": 0.18573991488665342, "reward_before_std": 0.6053555309772491, "reward_change_max": 0.0003803074359893799, "reward_change_mean": -0.17107615806162357, "reward_change_min": -0.30640072375535965, "reward_change_std": 0.12884120550006628, "reward_std": 0.5894833244383335, "rewards/cosine_scaled_reward": -0.1467133816331625, "rewards/format_reward": 0.4791666753590107, "step": 137 }, { "advantage_max": 1.6773697882890701, "advantage_mean": -7.450580818968433e-09, "advantage_min": -1.1602972447872162, "advantage_std": 0.9997936189174652, "completion_length": 2176.3333892822266, "epoch": 0.15771428571428572, "grad_norm": 0.23005136847496033, "kl": 0.002424955368041992, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.23747283313423395, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23747283313423395, "reward_after_std": 0.6506821662187576, "reward_before_mean": 0.4451894210651517, "reward_before_std": 0.6293397713452578, "reward_change_max": 0.0003636404871940613, "reward_change_mean": -0.20771658699959517, "reward_change_min": -0.3176217880100012, "reward_change_std": 0.1273680031299591, "reward_std": 0.6506821922957897, "rewards/cosine_scaled_reward": -0.12115529365837574, "rewards/format_reward": 0.6875000186264515, "step": 138 }, { "advantage_max": 1.499302163720131, "advantage_mean": -5.587935536510713e-08, "advantage_min": -1.1127532124519348, "advantage_std": 0.9997905865311623, "completion_length": 2995.812545776367, "epoch": 0.15885714285714286, "grad_norm": 0.249574676156044, "kl": 0.0039157867431640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.158953424711624e-07, "loss": 0.0002, "reward": -0.003708356380229816, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.003708356380229816, "reward_after_std": 0.6921847648918629, "reward_before_mean": 0.15705988928675652, "reward_before_std": 0.7318538166582584, "reward_change_max": 0.000463239848613739, "reward_change_mean": -0.16076825419440866, "reward_change_min": -0.35990045219659805, "reward_change_std": 0.14408531039953232, "reward_std": 0.6921847872436047, "rewards/cosine_scaled_reward": -0.11938672885298729, "rewards/format_reward": 0.3958333395421505, "step": 139 }, { "advantage_max": 1.7067873626947403, "advantage_mean": -7.450582262258365e-09, "advantage_min": -0.869616873562336, "advantage_std": 0.9997568354010582, "completion_length": 2614.166732788086, "epoch": 0.16, "grad_norm": 0.38242799043655396, "kl": 0.004871368408203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": 0.0828480685595423, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0828480685595423, "reward_after_std": 0.5462512150406837, "reward_before_mean": 0.2640006002038717, "reward_before_std": 0.5157320648431778, "reward_change_max": 0.0004016086459159851, "reward_change_mean": -0.18115251511335373, "reward_change_min": -0.3285767696797848, "reward_change_std": 0.11956191342324018, "reward_std": 0.5462512299418449, "rewards/cosine_scaled_reward": -0.08674971852451563, "rewards/format_reward": 0.4375000037252903, "step": 140 }, { "advantage_max": 1.7529622614383698, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -1.000135324895382, "advantage_std": 0.9998727887868881, "completion_length": 2590.7500762939453, "epoch": 0.16114285714285714, "grad_norm": 0.2258317768573761, "kl": 0.004827022552490234, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": 0.28294834215193987, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.28294834215193987, "reward_after_std": 0.9506829380989075, "reward_before_mean": 0.479689369443804, "reward_before_std": 0.9543975107371807, "reward_change_max": 0.00037819892168045044, "reward_change_mean": -0.1967410072684288, "reward_change_min": -0.37072051130235195, "reward_change_std": 0.1559813655912876, "reward_std": 0.950682982802391, "rewards/cosine_scaled_reward": -0.06223865784704685, "rewards/format_reward": 0.6041666753590107, "step": 141 }, { "advantage_max": 1.5266470611095428, "advantage_mean": 8.881784197001252e-16, "advantage_min": -1.2423633113503456, "advantage_std": 0.999744102358818, "completion_length": 2659.000030517578, "epoch": 0.16228571428571428, "grad_norm": 0.17855477333068848, "kl": 0.0037615299224853516, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.103291169269299e-07, "loss": 0.0002, "reward": 0.164220348931849, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.164220348931849, "reward_after_std": 0.6876718364655972, "reward_before_mean": 0.35929872654378414, "reward_before_std": 0.708413734100759, "reward_change_max": 0.0006590783596038818, "reward_change_mean": -0.19507834874093533, "reward_change_min": -0.37512016855180264, "reward_change_std": 0.15412273351103067, "reward_std": 0.6876718625426292, "rewards/cosine_scaled_reward": -0.10160065069794655, "rewards/format_reward": 0.5625000149011612, "step": 142 }, { "advantage_max": 1.6525219678878784, "advantage_mean": 1.6142926662077173e-08, "advantage_min": -0.9572406560182571, "advantage_std": 0.9998095482587814, "completion_length": 2580.1458740234375, "epoch": 0.16342857142857142, "grad_norm": 0.256779283285141, "kl": 0.004227638244628906, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.084384631108882e-07, "loss": 0.0002, "reward": 3.49879264831543e-05, "reward_advantage_correlation": 1.0, "reward_after_mean": 3.49879264831543e-05, "reward_after_std": 0.675796166062355, "reward_before_mean": 0.15760892163962126, "reward_before_std": 0.6773778162896633, "reward_change_max": 0.0, "reward_change_mean": -0.15757392905652523, "reward_change_min": -0.3442469611763954, "reward_change_std": 0.12573405727744102, "reward_std": 0.6757961846888065, "rewards/cosine_scaled_reward": -0.18161221034824848, "rewards/format_reward": 0.5208333432674408, "step": 143 }, { "advantage_max": 1.7686203569173813, "advantage_mean": 4.3461721777049434e-08, "advantage_min": -0.8464352861046791, "advantage_std": 0.999814823269844, "completion_length": 2639.333366394043, "epoch": 0.16457142857142856, "grad_norm": 0.23858557641506195, "kl": 0.0036993026733398438, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": -0.06461599344993374, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06461599344993374, "reward_after_std": 0.7022199928760529, "reward_before_mean": 0.07492145337164402, "reward_before_std": 0.699074275791645, "reward_change_max": 0.00051155686378479, "reward_change_mean": -0.13953743898309767, "reward_change_min": -0.28874889947474003, "reward_change_std": 0.10882806684821844, "reward_std": 0.7022200152277946, "rewards/cosine_scaled_reward": -0.17087261471897364, "rewards/format_reward": 0.4166666716337204, "step": 144 }, { "advantage_max": 1.6545881032943726, "advantage_mean": -4.346170645597169e-09, "advantage_min": -1.0852633118629456, "advantage_std": 0.9997621700167656, "completion_length": 1951.4792175292969, "epoch": 0.1657142857142857, "grad_norm": 0.31547772884368896, "kl": 0.003958702087402344, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.5314989294856787, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5314989294856787, "reward_after_std": 0.6146979462355375, "reward_before_mean": 0.8071410674601793, "reward_before_std": 0.5886675817891955, "reward_change_max": 0.0004691779613494873, "reward_change_mean": -0.27564213797450066, "reward_change_min": -0.4286304134875536, "reward_change_std": 0.1728304447606206, "reward_std": 0.6146979611366987, "rewards/cosine_scaled_reward": 0.07023719977587461, "rewards/format_reward": 0.6666666734963655, "step": 145 }, { "advantage_max": 1.5834444910287857, "advantage_mean": -6.208816794028849e-10, "advantage_min": -1.2619915455579758, "advantage_std": 0.9997970163822174, "completion_length": 2003.5000610351562, "epoch": 0.16685714285714287, "grad_norm": 0.22241827845573425, "kl": 0.0024411678314208984, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.2519833882106468, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.2519833882106468, "reward_after_std": 0.6221999675035477, "reward_before_mean": 0.4680800251662731, "reward_before_std": 0.6158224176615477, "reward_change_max": 0.00015006959438323975, "reward_change_mean": -0.21609664289280772, "reward_change_min": -0.3714016154408455, "reward_change_std": 0.14444420114159584, "reward_std": 0.6222000010311604, "rewards/cosine_scaled_reward": -0.151376660913229, "rewards/format_reward": 0.7708333432674408, "step": 146 }, { "advantage_max": 1.5665484219789505, "advantage_mean": 3.7252903650752955e-08, "advantage_min": -1.1734899654984474, "advantage_std": 0.9998107403516769, "completion_length": 2070.0208587646484, "epoch": 0.168, "grad_norm": 0.2511901259422302, "kl": 0.0043354034423828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.007020842191634e-07, "loss": 0.0002, "reward": 0.1382943361531943, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1382943361531943, "reward_after_std": 0.6228159815073013, "reward_before_mean": 0.3309053098782897, "reward_before_std": 0.629545658826828, "reward_change_max": 0.0, "reward_change_mean": -0.1926109748892486, "reward_change_min": -0.34466840885579586, "reward_change_std": 0.1377943456172943, "reward_std": 0.6228160075843334, "rewards/cosine_scaled_reward": -0.14704734086990356, "rewards/format_reward": 0.6250000074505806, "step": 147 }, { "advantage_max": 1.6768000572919846, "advantage_mean": -6.705522803684971e-08, "advantage_min": -1.1405689418315887, "advantage_std": 0.9997821599245071, "completion_length": 1937.4583740234375, "epoch": 0.16914285714285715, "grad_norm": 0.20754331350326538, "kl": 0.0034346580505371094, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.40766859240829945, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40766859240829945, "reward_after_std": 0.6350481193512678, "reward_before_mean": 0.650043424218893, "reward_before_std": 0.5965264923870564, "reward_change_max": 0.0, "reward_change_mean": -0.24237484019249678, "reward_change_min": -0.3769410066306591, "reward_change_std": 0.14409376867115498, "reward_std": 0.6350481417030096, "rewards/cosine_scaled_reward": -0.039561630226671696, "rewards/format_reward": 0.729166679084301, "step": 148 }, { "advantage_max": 1.5765099674463272, "advantage_mean": -6.581346534417776e-08, "advantage_min": -1.1912664473056793, "advantage_std": 0.9997623637318611, "completion_length": 2463.2500228881836, "epoch": 0.1702857142857143, "grad_norm": 0.20378859341144562, "kl": 0.004099845886230469, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.967309592491052e-07, "loss": 0.0002, "reward": 0.41491406969726086, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41491406969726086, "reward_after_std": 0.6035808995366096, "reward_before_mean": 0.6691889259964228, "reward_before_std": 0.5983965341001749, "reward_change_max": 0.00039667636156082153, "reward_change_mean": -0.2542748870328069, "reward_change_min": -0.4188149496912956, "reward_change_std": 0.16949533484876156, "reward_std": 0.6035809032619, "rewards/cosine_scaled_reward": 0.02209446392953396, "rewards/format_reward": 0.6250000037252903, "step": 149 }, { "advantage_max": 1.5017241835594177, "advantage_mean": -5.587935891782081e-09, "advantage_min": -1.1115936785936356, "advantage_std": 0.9998666793107986, "completion_length": 2137.7083435058594, "epoch": 0.17142857142857143, "grad_norm": 0.2677984833717346, "kl": 0.0045261383056640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.27933480869978666, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27933480869978666, "reward_after_std": 0.9380423426628113, "reward_before_mean": 0.4810619349591434, "reward_before_std": 0.9789088033139706, "reward_change_max": 0.0010481253266334534, "reward_change_mean": -0.20172712206840515, "reward_change_min": -0.42329406924545765, "reward_change_std": 0.18029167037457228, "reward_std": 0.9380423575639725, "rewards/cosine_scaled_reward": -0.06155238504288718, "rewards/format_reward": 0.6041666828095913, "step": 150 }, { "advantage_max": 1.6314374655485153, "advantage_mean": -5.0912302596017867e-08, "advantage_min": -1.2180282175540924, "advantage_std": 0.9997744411230087, "completion_length": 2193.5208587646484, "epoch": 0.17257142857142857, "grad_norm": 0.2930396497249603, "kl": 0.0044596195220947266, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.45010002702474594, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.45010002702474594, "reward_after_std": 0.6098031923174858, "reward_before_mean": 0.7081503644585609, "reward_before_std": 0.5848918259143829, "reward_change_max": 0.0, "reward_change_mean": -0.2580503453500569, "reward_change_min": -0.39921652898192406, "reward_change_std": 0.16081867553293705, "reward_std": 0.6098032146692276, "rewards/cosine_scaled_reward": 0.020741842687129974, "rewards/format_reward": 0.6666666716337204, "step": 151 }, { "advantage_max": 1.706085816025734, "advantage_mean": 1.862645193639878e-08, "advantage_min": -1.015485629439354, "advantage_std": 0.9998258575797081, "completion_length": 2515.2083892822266, "epoch": 0.1737142857142857, "grad_norm": 0.2551548182964325, "kl": 0.0040667057037353516, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": -0.012446035631000996, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.012446035631000996, "reward_after_std": 0.7643734775483608, "reward_before_mean": 0.1334016639739275, "reward_before_std": 0.7614435665309429, "reward_change_max": 0.0005860179662704468, "reward_change_mean": -0.14584769657813013, "reward_change_min": -0.26715877279639244, "reward_change_std": 0.10718716867268085, "reward_std": 0.7643734961748123, "rewards/cosine_scaled_reward": -0.16246584057807922, "rewards/format_reward": 0.45833334140479565, "step": 152 }, { "advantage_max": 1.5629348754882812, "advantage_mean": 6.581346390088783e-08, "advantage_min": -1.113683059811592, "advantage_std": 0.9996767640113831, "completion_length": 2569.2292098999023, "epoch": 0.17485714285714285, "grad_norm": 0.23137755692005157, "kl": 0.00739288330078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.88586709003076e-07, "loss": 0.0003, "reward": -0.18267223238945007, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.18267223238945007, "reward_after_std": 0.5054036788642406, "reward_before_mean": -0.05120914429426193, "reward_before_std": 0.5079370914027095, "reward_change_max": 0.0, "reward_change_mean": -0.13146307598799467, "reward_change_min": -0.25649657659232616, "reward_change_std": 0.09824450453743339, "reward_std": 0.5054036900401115, "rewards/cosine_scaled_reward": -0.2547712437444716, "rewards/format_reward": 0.4583333358168602, "step": 153 }, { "advantage_max": 1.6035524010658264, "advantage_mean": -2.6077032755367213e-08, "advantage_min": -1.1122926324605942, "advantage_std": 0.9998549222946167, "completion_length": 3082.4375610351562, "epoch": 0.176, "grad_norm": 0.18362846970558167, "kl": 0.0031795501708984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.37665724381804466, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37665724381804466, "reward_after_std": 0.8660299405455589, "reward_before_mean": 0.6022710939869285, "reward_before_std": 0.8814903013408184, "reward_change_max": 0.00025279074907302856, "reward_change_mean": -0.22561387112364173, "reward_change_min": -0.4025569446384907, "reward_change_std": 0.16582211665809155, "reward_std": 0.8660299628973007, "rewards/cosine_scaled_reward": 0.0719688767567277, "rewards/format_reward": 0.4583333432674408, "step": 154 }, { "advantage_max": 1.6857571452856064, "advantage_mean": -3.050081476274613e-08, "advantage_min": -1.0754043608903885, "advantage_std": 0.9998093396425247, "completion_length": 2467.458381652832, "epoch": 0.17714285714285713, "grad_norm": 0.2297108769416809, "kl": 0.00498199462890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.844151714648274e-07, "loss": 0.0002, "reward": 0.5354392826557159, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5354392826557159, "reward_after_std": 0.6706315167248249, "reward_before_mean": 0.8052331898361444, "reward_before_std": 0.6433561127632856, "reward_change_max": 0.00030928850173950195, "reward_change_mean": -0.26979394583031535, "reward_change_min": -0.4219784028828144, "reward_change_std": 0.16768614412285388, "reward_std": 0.6706315279006958, "rewards/cosine_scaled_reward": 0.14219993818551302, "rewards/format_reward": 0.5208333376795053, "step": 155 }, { "advantage_max": 1.4635108709335327, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -1.2058681324124336, "advantage_std": 0.9998436570167542, "completion_length": 2434.500030517578, "epoch": 0.1782857142857143, "grad_norm": 0.1800357550382614, "kl": 0.0037064552307128906, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.322545756585896, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.322545756585896, "reward_after_std": 0.7853665836155415, "reward_before_mean": 0.5472987722605467, "reward_before_std": 0.8228888586163521, "reward_change_max": 0.00011435151100158691, "reward_change_mean": -0.22475302033126354, "reward_change_min": -0.43423015251755714, "reward_change_std": 0.17797227203845978, "reward_std": 0.785366591066122, "rewards/cosine_scaled_reward": 0.03406604006886482, "rewards/format_reward": 0.47916667722165585, "step": 156 }, { "advantage_max": 1.6851200014352798, "advantage_mean": 3.60111410691033e-08, "advantage_min": -0.9669445231556892, "advantage_std": 0.999786801636219, "completion_length": 2656.1458587646484, "epoch": 0.17942857142857144, "grad_norm": 0.2275025099515915, "kl": 0.005410194396972656, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.12188863917253911, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": -0.12188863917253911, "reward_after_std": 0.5806295238435268, "reward_before_mean": 0.01793377846479416, "reward_before_std": 0.5874885879456997, "reward_change_max": 0.0, "reward_change_mean": -0.13982242881320417, "reward_change_min": -0.283700505271554, "reward_change_std": 0.11282584932632744, "reward_std": 0.5806295461952686, "rewards/cosine_scaled_reward": -0.22019978612661362, "rewards/format_reward": 0.4583333507180214, "step": 157 }, { "advantage_max": 1.5106781423091888, "advantage_mean": 2.483527050678447e-09, "advantage_min": -1.2967959195375443, "advantage_std": 0.9998632296919823, "completion_length": 2356.0625610351562, "epoch": 0.18057142857142858, "grad_norm": 0.27170735597610474, "kl": 0.0045986175537109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": 0.6479261901695281, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6479261901695281, "reward_after_std": 0.8374496065080166, "reward_before_mean": 0.9347638841718435, "reward_before_std": 0.8472369872033596, "reward_change_max": 0.0009616687893867493, "reward_change_mean": -0.2868376709520817, "reward_change_min": -0.4877729155123234, "reward_change_std": 0.20384064875543118, "reward_std": 0.8374496325850487, "rewards/cosine_scaled_reward": 0.12363193836063147, "rewards/format_reward": 0.6875000279396772, "step": 158 }, { "advantage_max": 1.618651032447815, "advantage_mean": -1.3038516433194758e-08, "advantage_min": -1.1713954880833626, "advantage_std": 0.9997406974434853, "completion_length": 2258.854202270508, "epoch": 0.18171428571428572, "grad_norm": 0.19485744833946228, "kl": 0.004391670227050781, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": -0.007745785638689995, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.007745785638689995, "reward_after_std": 0.4710584804415703, "reward_before_mean": 0.16189392702654004, "reward_before_std": 0.4530893340706825, "reward_change_max": 0.0002969801425933838, "reward_change_mean": -0.16963971918448806, "reward_change_min": -0.2686846721917391, "reward_change_std": 0.10416083410382271, "reward_std": 0.4710584916174412, "rewards/cosine_scaled_reward": -0.18988638184964657, "rewards/format_reward": 0.5416666716337204, "step": 159 }, { "advantage_max": 1.601723164319992, "advantage_mean": -6.2088170160734535e-09, "advantage_min": -0.9842819944024086, "advantage_std": 0.9998537227511406, "completion_length": 2300.520896911621, "epoch": 0.18285714285714286, "grad_norm": 0.24165897071361542, "kl": 0.0068359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.737029101523929e-07, "loss": 0.0003, "reward": 0.47148462012410164, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.47148462012410164, "reward_after_std": 0.9986433014273643, "reward_before_mean": 0.7081597335636616, "reward_before_std": 1.017629697918892, "reward_change_max": 0.0008212849497795105, "reward_change_mean": -0.2366751218214631, "reward_change_min": -0.5349176675081253, "reward_change_std": 0.2049960969015956, "reward_std": 0.9986433461308479, "rewards/cosine_scaled_reward": 0.07282985420897603, "rewards/format_reward": 0.5625000111758709, "step": 160 }, { "advantage_max": 1.6861842274665833, "advantage_mean": -2.9181441152381637e-08, "advantage_min": -0.9965731874108315, "advantage_std": 0.9998380914330482, "completion_length": 2000.770896911621, "epoch": 0.184, "grad_norm": 0.2589896023273468, "kl": 0.004801750183105469, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.715127058347614e-07, "loss": 0.0002, "reward": 0.2427341677248478, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2427341677248478, "reward_after_std": 0.7803987562656403, "reward_before_mean": 0.4423756841570139, "reward_before_std": 0.7710563689470291, "reward_change_max": 0.0, "reward_change_mean": -0.19964152900502086, "reward_change_min": -0.3690079543739557, "reward_change_std": 0.13987152371555567, "reward_std": 0.7803987711668015, "rewards/cosine_scaled_reward": -0.09131215792149305, "rewards/format_reward": 0.6250000074505806, "step": 161 }, { "advantage_max": 1.6020589023828506, "advantage_mean": -8.071462720415923e-09, "advantage_min": -1.11840408295393, "advantage_std": 0.9997873082756996, "completion_length": 2577.354217529297, "epoch": 0.18514285714285714, "grad_norm": 0.2400018274784088, "kl": 0.007944107055664062, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.693068314414344e-07, "loss": 0.0003, "reward": 0.1532662883400917, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.1532662883400917, "reward_after_std": 0.7209969665855169, "reward_before_mean": 0.34282832220196724, "reward_before_std": 0.7363618742674589, "reward_change_max": 0.00031294673681259155, "reward_change_mean": -0.1895620170980692, "reward_change_min": -0.36849910393357277, "reward_change_std": 0.15132501488551497, "reward_std": 0.7209969665855169, "rewards/cosine_scaled_reward": -0.057752519845962524, "rewards/format_reward": 0.4583333395421505, "step": 162 }, { "advantage_max": 1.6749805212020874, "advantage_mean": -3.911555057634075e-08, "advantage_min": -0.9457454830408096, "advantage_std": 0.9998423382639885, "completion_length": 2090.0625610351562, "epoch": 0.18628571428571428, "grad_norm": 0.2369341254234314, "kl": 0.0051097869873046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.5305918380618095, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5305918380618095, "reward_after_std": 0.8068078309297562, "reward_before_mean": 0.7879913533106446, "reward_before_std": 0.7713843509554863, "reward_change_max": 0.0002727434039115906, "reward_change_mean": -0.2573995175771415, "reward_change_min": -0.43212768994271755, "reward_change_std": 0.16625749971717596, "reward_std": 0.8068078495562077, "rewards/cosine_scaled_reward": 0.050245666585396975, "rewards/format_reward": 0.6875000055879354, "step": 163 }, { "advantage_max": 1.692627653479576, "advantage_mean": -4.097819339410336e-08, "advantage_min": -1.0569606199860573, "advantage_std": 0.9998034909367561, "completion_length": 1950.6250381469727, "epoch": 0.18742857142857142, "grad_norm": 0.2518463432788849, "kl": 0.0050716400146484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.5412261649034917, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5412261649034917, "reward_after_std": 0.6402173116803169, "reward_before_mean": 0.8132719174027443, "reward_before_std": 0.602929200977087, "reward_change_max": 0.0, "reward_change_mean": -0.27204577438533306, "reward_change_min": -0.43784534372389317, "reward_change_std": 0.1645910618826747, "reward_std": 0.640217337757349, "rewards/cosine_scaled_reward": 0.0420526172965765, "rewards/format_reward": 0.7291666753590107, "step": 164 }, { "advantage_max": 1.714088037610054, "advantage_mean": -5.58793583627093e-09, "advantage_min": -0.9592446982860565, "advantage_std": 0.9998089000582695, "completion_length": 2122.0416946411133, "epoch": 0.18857142857142858, "grad_norm": 0.2594444751739502, "kl": 0.00640106201171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.625962667065487e-07, "loss": 0.0003, "reward": 0.049126192927360535, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.049126192927360535, "reward_after_std": 0.6851355955004692, "reward_before_mean": 0.2161144088022411, "reward_before_std": 0.6790233179926872, "reward_change_max": 0.0009498968720436096, "reward_change_mean": -0.16698822565376759, "reward_change_min": -0.32509128004312515, "reward_change_std": 0.12738678557798266, "reward_std": 0.685135617852211, "rewards/cosine_scaled_reward": -0.1940261390991509, "rewards/format_reward": 0.6041666734963655, "step": 165 }, { "advantage_max": 1.6857409626245499, "advantage_mean": 3.5390257835388184e-08, "advantage_min": -1.1037379801273346, "advantage_std": 0.9998189136385918, "completion_length": 2194.3541946411133, "epoch": 0.18971428571428572, "grad_norm": 0.2024780958890915, "kl": 0.004538536071777344, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.603287946810513e-07, "loss": 0.0002, "reward": 0.2784705702215433, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2784705702215433, "reward_after_std": 0.7903713285923004, "reward_before_mean": 0.48560900520533323, "reward_before_std": 0.7836416717618704, "reward_change_max": 0.0002986118197441101, "reward_change_mean": -0.20713840331882238, "reward_change_min": -0.36814062111079693, "reward_change_std": 0.14702307805418968, "reward_std": 0.790371336042881, "rewards/cosine_scaled_reward": -0.06969551555812359, "rewards/format_reward": 0.6250000111758709, "step": 166 }, { "advantage_max": 1.7721141129732132, "advantage_mean": -1.303851654421706e-08, "advantage_min": -1.060515969991684, "advantage_std": 0.9997974634170532, "completion_length": 1851.4792251586914, "epoch": 0.19085714285714286, "grad_norm": 0.2363491803407669, "kl": 0.0055522918701171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.580461976679099e-07, "loss": 0.0002, "reward": 0.199244512245059, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.199244512245059, "reward_after_std": 0.771139208227396, "reward_before_mean": 0.38737055473029613, "reward_before_std": 0.7454683519899845, "reward_change_max": 0.000876501202583313, "reward_change_mean": -0.18812604527920485, "reward_change_min": -0.33890487626194954, "reward_change_std": 0.13139595091342926, "reward_std": 0.7711392100900412, "rewards/cosine_scaled_reward": -0.2021480556577444, "rewards/format_reward": 0.7916666828095913, "step": 167 }, { "advantage_max": 1.516417846083641, "advantage_mean": -2.0489097085629737e-08, "advantage_min": -1.0871713608503342, "advantage_std": 0.9998452961444855, "completion_length": 2492.3333892822266, "epoch": 0.192, "grad_norm": 0.2922627925872803, "kl": 0.0047931671142578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.557485869176825e-07, "loss": 0.0002, "reward": 0.35394715797156096, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35394715797156096, "reward_after_std": 0.9185864347964525, "reward_before_mean": 0.5737202242016792, "reward_before_std": 0.9552944917231798, "reward_change_max": 0.0001585930585861206, "reward_change_mean": -0.21977307926863432, "reward_change_min": -0.43790194019675255, "reward_change_std": 0.1791673693805933, "reward_std": 0.9185864515602589, "rewards/cosine_scaled_reward": 0.005610108375549316, "rewards/format_reward": 0.5625000093132257, "step": 168 }, { "advantage_max": 1.5727615356445312, "advantage_mean": -1.1051694892572073e-07, "advantage_min": -1.1902239173650742, "advantage_std": 0.9998266100883484, "completion_length": 1229.208381652832, "epoch": 0.19314285714285714, "grad_norm": 0.25364622473716736, "kl": 0.005054473876953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 1.235820960253477, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.235820960253477, "reward_after_std": 0.6339393146336079, "reward_before_mean": 1.6583482660353184, "reward_before_std": 0.5706211663782597, "reward_change_max": 0.0, "reward_change_mean": -0.42252730391919613, "reward_change_min": -0.599956326186657, "reward_change_std": 0.23817383870482445, "reward_std": 0.6339393258094788, "rewards/cosine_scaled_reward": 0.35000744462013245, "rewards/format_reward": 0.9583333432674408, "step": 169 }, { "advantage_max": 1.722038522362709, "advantage_mean": 1.2169281882190575e-07, "advantage_min": -0.9152258113026619, "advantage_std": 0.9997827783226967, "completion_length": 2099.750026702881, "epoch": 0.19428571428571428, "grad_norm": 0.21759958565235138, "kl": 0.005249977111816406, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.35102659091353416, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35102659091353416, "reward_after_std": 0.647955346852541, "reward_before_mean": 0.5838488005101681, "reward_before_std": 0.6074068415910006, "reward_change_max": 0.0, "reward_change_mean": -0.23282210575416684, "reward_change_min": -0.42266797088086605, "reward_change_std": 0.1610524570569396, "reward_std": 0.6479553729295731, "rewards/cosine_scaled_reward": 0.00025770440697669983, "rewards/format_reward": 0.5833333395421505, "step": 170 }, { "advantage_max": 1.50984887778759, "advantage_mean": 2.8560560361157172e-08, "advantage_min": -1.1283286213874817, "advantage_std": 0.9997936487197876, "completion_length": 2158.2708740234375, "epoch": 0.19542857142857142, "grad_norm": 0.230439692735672, "kl": 0.0041656494140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.487667956935087e-07, "loss": 0.0002, "reward": 0.31339962780475616, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.31339962780475616, "reward_after_std": 0.6719086579978466, "reward_before_mean": 0.5419529378414154, "reward_before_std": 0.7005261778831482, "reward_change_max": 0.00029709935188293457, "reward_change_mean": -0.22855328931473196, "reward_change_min": -0.42273118533194065, "reward_change_std": 0.16405913210473955, "reward_std": 0.6719086859375238, "rewards/cosine_scaled_reward": -0.020690208300948143, "rewards/format_reward": 0.5833333358168602, "step": 171 }, { "advantage_max": 1.504933387041092, "advantage_mean": -3.10440865236572e-08, "advantage_min": -1.4171021580696106, "advantage_std": 0.9998073801398277, "completion_length": 2092.9166984558105, "epoch": 0.19657142857142856, "grad_norm": 0.2690483331680298, "kl": 0.0072803497314453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.464102570534061e-07, "loss": 0.0003, "reward": 0.5798506364226341, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5798506364226341, "reward_after_std": 0.6986503172665834, "reward_before_mean": 0.8605402838438749, "reward_before_std": 0.6976092047989368, "reward_change_max": 0.00021129846572875977, "reward_change_mean": -0.2806896660476923, "reward_change_min": -0.435400377959013, "reward_change_std": 0.1796800745651126, "reward_std": 0.6986503265798092, "rewards/cosine_scaled_reward": 0.10735346004366875, "rewards/format_reward": 0.6458333432674408, "step": 172 }, { "advantage_max": 1.7530633360147476, "advantage_mean": 1.7384688688615313e-08, "advantage_min": -0.8142209053039551, "advantage_std": 0.9996874928474426, "completion_length": 1415.9583587646484, "epoch": 0.1977142857142857, "grad_norm": 0.32314759492874146, "kl": 0.0056514739990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.18319927039556205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18319927039556205, "reward_after_std": 0.6402694256976247, "reward_before_mean": 0.3805189239792526, "reward_before_std": 0.619793354999274, "reward_change_max": 0.0, "reward_change_mean": -0.19731965800747275, "reward_change_min": -0.35540652461349964, "reward_change_std": 0.13836060231551528, "reward_std": 0.6402694452553988, "rewards/cosine_scaled_reward": -0.19515720537674497, "rewards/format_reward": 0.7708333358168602, "step": 173 }, { "advantage_max": 1.6947945654392242, "advantage_mean": -5.5879355587151736e-09, "advantage_min": -1.1162981167435646, "advantage_std": 0.9998343512415886, "completion_length": 1546.520896911621, "epoch": 0.19885714285714284, "grad_norm": 0.24973110854625702, "kl": 0.006852149963378906, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.416539554784089e-07, "loss": 0.0003, "reward": 0.447128068190068, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.447128068190068, "reward_after_std": 0.7851445488631725, "reward_before_mean": 0.6881698596407659, "reward_before_std": 0.7564616817981005, "reward_change_max": 0.00030616670846939087, "reward_change_mean": -0.24104176089167595, "reward_change_min": -0.38989388942718506, "reward_change_std": 0.1559823602437973, "reward_std": 0.7851445563137531, "rewards/cosine_scaled_reward": -0.07258176291361451, "rewards/format_reward": 0.8333333432674408, "step": 174 }, { "advantage_max": 1.6422483325004578, "advantage_mean": -2.561137124601487e-08, "advantage_min": -1.2339301407337189, "advantage_std": 0.9998004734516144, "completion_length": 2264.791717529297, "epoch": 0.2, "grad_norm": 0.24177344143390656, "kl": 0.0054874420166015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.5153841646388173, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5153841646388173, "reward_after_std": 0.608320277184248, "reward_before_mean": 0.786954928888008, "reward_before_std": 0.5843202453106642, "reward_change_max": 0.0006819292902946472, "reward_change_mean": -0.2715707626193762, "reward_change_min": -0.4168048519641161, "reward_change_std": 0.1677942699752748, "reward_std": 0.6083202883601189, "rewards/cosine_scaled_reward": 0.08097745012491941, "rewards/format_reward": 0.6250000093132257, "step": 175 }, { "advantage_max": 1.5760944485664368, "advantage_mean": -2.8560559250934148e-08, "advantage_min": -1.1510667353868484, "advantage_std": 0.9998167455196381, "completion_length": 1851.9375228881836, "epoch": 0.20114285714285715, "grad_norm": 0.2656446397304535, "kl": 0.0049724578857421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.49231592612341046, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.49231592612341046, "reward_after_std": 0.8085062839090824, "reward_before_mean": 0.7481458615511656, "reward_before_std": 0.8251261785626411, "reward_change_max": 6.766617298126221e-05, "reward_change_mean": -0.25582992658019066, "reward_change_min": -0.42961229756474495, "reward_change_std": 0.18010017089545727, "reward_std": 0.8085063360631466, "rewards/cosine_scaled_reward": -0.011343751102685928, "rewards/format_reward": 0.770833345130086, "step": 176 }, { "advantage_max": 1.6836175918579102, "advantage_mean": -5.712111839084599e-08, "advantage_min": -1.1872343942523003, "advantage_std": 0.9998055696487427, "completion_length": 2073.1667098999023, "epoch": 0.2022857142857143, "grad_norm": 0.26497936248779297, "kl": 0.00461578369140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.5681694131344557, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5681694131344557, "reward_after_std": 0.6743853017687798, "reward_before_mean": 0.841622298117727, "reward_before_std": 0.6278502456843853, "reward_change_max": 0.0, "reward_change_mean": -0.27345291804522276, "reward_change_min": -0.40037749521434307, "reward_change_std": 0.15704370383173227, "reward_std": 0.6743853129446507, "rewards/cosine_scaled_reward": 0.04581115394830704, "rewards/format_reward": 0.7500000093132257, "step": 177 }, { "advantage_max": 1.4844506978988647, "advantage_mean": -8.071462553882469e-09, "advantage_min": -1.1723910346627235, "advantage_std": 0.999858170747757, "completion_length": 2002.2083740234375, "epoch": 0.20342857142857143, "grad_norm": 0.33194902539253235, "kl": 0.008449554443359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.319717151140072e-07, "loss": 0.0003, "reward": 0.42172466265037656, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.42172466265037656, "reward_after_std": 0.8564918860793114, "reward_before_mean": 0.6597280632704496, "reward_before_std": 0.8828203082084656, "reward_change_max": 0.0, "reward_change_mean": -0.23800339736044407, "reward_change_min": -0.4743681848049164, "reward_change_std": 0.18404243979603052, "reward_std": 0.856491930782795, "rewards/cosine_scaled_reward": -0.0034693063935264945, "rewards/format_reward": 0.6666666753590107, "step": 178 }, { "advantage_max": 1.6311480104923248, "advantage_mean": -3.352761296371298e-08, "advantage_min": -1.1216321215033531, "advantage_std": 0.9997258484363556, "completion_length": 2185.187530517578, "epoch": 0.20457142857142857, "grad_norm": 0.28444021940231323, "kl": 0.0072917938232421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.295165011252396e-07, "loss": 0.0003, "reward": -0.058215420227497816, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.058215420227497816, "reward_after_std": 0.4829340185970068, "reward_before_mean": 0.10054558701813221, "reward_before_std": 0.474730771034956, "reward_change_max": 0.0004728659987449646, "reward_change_mean": -0.15876100910827518, "reward_change_min": -0.27880000323057175, "reward_change_std": 0.11060563754290342, "reward_std": 0.4829340223222971, "rewards/cosine_scaled_reward": -0.24139389023184776, "rewards/format_reward": 0.5833333395421505, "step": 179 }, { "advantage_max": 1.6618616580963135, "advantage_mean": 6.208817238118058e-09, "advantage_min": -1.071367308497429, "advantage_std": 0.9998577609658241, "completion_length": 1255.833381652832, "epoch": 0.2057142857142857, "grad_norm": 0.30001556873321533, "kl": 0.0062732696533203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.270476638965461e-07, "loss": 0.0003, "reward": 1.1591643691062927, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.1591643691062927, "reward_after_std": 0.8961893171072006, "reward_before_mean": 1.5456229411065578, "reward_before_std": 0.849966811016202, "reward_change_max": 0.0, "reward_change_mean": -0.3864585403352976, "reward_change_min": -0.6112453117966652, "reward_change_std": 0.23929630033671856, "reward_std": 0.8961893357336521, "rewards/cosine_scaled_reward": 0.3144781200680882, "rewards/format_reward": 0.9166666716337204, "step": 180 }, { "advantage_max": 1.6392957419157028, "advantage_mean": -6.146728892542086e-08, "advantage_min": -1.0734733194112778, "advantage_std": 0.9997735172510147, "completion_length": 2378.6667098999023, "epoch": 0.20685714285714285, "grad_norm": 0.25959938764572144, "kl": 0.00646209716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.245653237555705e-07, "loss": 0.0003, "reward": 0.23480892833322287, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.23480892833322287, "reward_after_std": 0.6075701769441366, "reward_before_mean": 0.44690791331231594, "reward_before_std": 0.5976003762334585, "reward_change_max": 0.00015873461961746216, "reward_change_mean": -0.2120990320108831, "reward_change_min": -0.40071258693933487, "reward_change_std": 0.15138957416638732, "reward_std": 0.6075701992958784, "rewards/cosine_scaled_reward": -0.0890460480004549, "rewards/format_reward": 0.6250000074505806, "step": 181 }, { "advantage_max": 1.6973135769367218, "advantage_mean": -1.2417631367611648e-09, "advantage_min": -1.1010829582810402, "advantage_std": 0.9998375773429871, "completion_length": 1855.06254196167, "epoch": 0.208, "grad_norm": 0.17940671741962433, "kl": 0.0037069320678710938, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.4569451562128961, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4569451562128961, "reward_after_std": 0.7582255005836487, "reward_before_mean": 0.7031884212046862, "reward_before_std": 0.7348581291735172, "reward_change_max": 0.0003945082426071167, "reward_change_mean": -0.2462432268075645, "reward_change_min": -0.41578804701566696, "reward_change_std": 0.16094863507896662, "reward_std": 0.7582255192101002, "rewards/cosine_scaled_reward": -0.0025724750012159348, "rewards/format_reward": 0.7083333414047956, "step": 182 }, { "advantage_max": 1.6113858073949814, "advantage_mean": -6.332993829349931e-08, "advantage_min": -1.0258197262883186, "advantage_std": 0.9998429715633392, "completion_length": 1424.5000457763672, "epoch": 0.20914285714285713, "grad_norm": 0.29113176465034485, "kl": 0.008054733276367188, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": 0.6616654456593096, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.6616654456593096, "reward_after_std": 0.7637336738407612, "reward_before_mean": 0.9540510524529964, "reward_before_std": 0.7442143596708775, "reward_change_max": 0.0002907067537307739, "reward_change_mean": -0.2923855986446142, "reward_change_min": -0.5103072412312031, "reward_change_std": 0.19355090707540512, "reward_std": 0.7637336924672127, "rewards/cosine_scaled_reward": 0.04994217213243246, "rewards/format_reward": 0.8541666716337204, "step": 183 }, { "advantage_max": 1.524808943271637, "advantage_mean": -3.911554946611773e-08, "advantage_min": -1.3263940066099167, "advantage_std": 0.9998024180531502, "completion_length": 1542.6875457763672, "epoch": 0.2102857142857143, "grad_norm": 0.2789245843887329, "kl": 0.006900787353515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.170384989716657e-07, "loss": 0.0003, "reward": 0.3295530015602708, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3295530015602708, "reward_after_std": 0.6151285581290722, "reward_before_mean": 0.5630174519028515, "reward_before_std": 0.6208750866353512, "reward_change_max": 0.0004479065537452698, "reward_change_mean": -0.2334644440561533, "reward_change_min": -0.39546626433730125, "reward_change_std": 0.15795165114104748, "reward_std": 0.6151285581290722, "rewards/cosine_scaled_reward": -0.13515795394778252, "rewards/format_reward": 0.8333333432674408, "step": 184 }, { "advantage_max": 1.7052653282880783, "advantage_mean": -1.5522042928761692e-08, "advantage_min": -0.8801075518131256, "advantage_std": 0.9997084736824036, "completion_length": 1614.5834045410156, "epoch": 0.21142857142857144, "grad_norm": 0.25438177585601807, "kl": 0.005855560302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": 0.16461107577197254, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16461107577197254, "reward_after_std": 0.5498490314930677, "reward_before_mean": 0.36692870780825615, "reward_before_std": 0.5359976505860686, "reward_change_max": 0.0002930089831352234, "reward_change_mean": -0.20231764344498515, "reward_change_min": -0.3774881921708584, "reward_change_std": 0.13948537409305573, "reward_std": 0.5498490333557129, "rewards/cosine_scaled_reward": -0.21236898249480873, "rewards/format_reward": 0.7916666697710752, "step": 185 }, { "advantage_max": 1.589905396103859, "advantage_mean": 1.862645149230957e-09, "advantage_min": -1.0278316587209702, "advantage_std": 0.999826580286026, "completion_length": 1909.6667175292969, "epoch": 0.21257142857142858, "grad_norm": 0.20909112691879272, "kl": 0.0068912506103515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.119553365707802e-07, "loss": 0.0003, "reward": 0.43765153270214796, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43765153270214796, "reward_after_std": 0.7173998765647411, "reward_before_mean": 0.6869608238339424, "reward_before_std": 0.7174548655748367, "reward_change_max": 0.00025334954261779785, "reward_change_mean": -0.24930927250534296, "reward_change_min": -0.4509042240679264, "reward_change_std": 0.1700389552861452, "reward_std": 0.7173998989164829, "rewards/cosine_scaled_reward": 0.010147074237465858, "rewards/format_reward": 0.6666666716337204, "step": 186 }, { "advantage_max": 1.6517557352781296, "advantage_mean": 5.2154067287091266e-08, "advantage_min": -1.1060415133833885, "advantage_std": 0.9997224509716034, "completion_length": 1636.2916870117188, "epoch": 0.21371428571428572, "grad_norm": 0.24228616058826447, "kl": 0.006427764892578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.093945422764069e-07, "loss": 0.0003, "reward": 0.30055883899331093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30055883899331093, "reward_after_std": 0.39326165057718754, "reward_before_mean": 0.5415985980071127, "reward_before_std": 0.35194770991802216, "reward_change_max": 0.00021256506443023682, "reward_change_mean": -0.24103973433375359, "reward_change_min": -0.35694490373134613, "reward_change_std": 0.13662862265482545, "reward_std": 0.3932616636157036, "rewards/cosine_scaled_reward": -0.12503403797745705, "rewards/format_reward": 0.791666679084301, "step": 187 }, { "advantage_max": 1.55316960811615, "advantage_mean": 6.8296996946770605e-09, "advantage_min": -1.3121953383088112, "advantage_std": 0.9997300058603287, "completion_length": 2161.750045776367, "epoch": 0.21485714285714286, "grad_norm": 0.20883195102214813, "kl": 0.0077056884765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.068211054579943e-07, "loss": 0.0003, "reward": 0.03851320035755634, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03851320035755634, "reward_after_std": 0.47359400801360607, "reward_before_mean": 0.22079682408366352, "reward_before_std": 0.47062006406486034, "reward_change_max": 6.236881017684937e-05, "reward_change_mean": -0.1822836329229176, "reward_change_min": -0.32229095324873924, "reward_change_std": 0.12202021991834044, "reward_std": 0.47359402664005756, "rewards/cosine_scaled_reward": -0.1916849333792925, "rewards/format_reward": 0.6041666753590107, "step": 188 }, { "advantage_max": 1.7091278731822968, "advantage_mean": -2.7939677238464355e-09, "advantage_min": -1.0550750717520714, "advantage_std": 0.9997809454798698, "completion_length": 1451.9792022705078, "epoch": 0.216, "grad_norm": 0.2504546642303467, "kl": 0.007526397705078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.04235151541222e-07, "loss": 0.0003, "reward": 0.36948077380657196, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.36948077380657196, "reward_after_std": 0.6774904727935791, "reward_before_mean": 0.6019853809848428, "reward_before_std": 0.6514282152056694, "reward_change_max": 0.0, "reward_change_mean": -0.23250459227710962, "reward_change_min": -0.39352178759872913, "reward_change_std": 0.14997949916869402, "reward_std": 0.6774904876947403, "rewards/cosine_scaled_reward": -0.1260906618554145, "rewards/format_reward": 0.8541666753590107, "step": 189 }, { "advantage_max": 1.5744300931692123, "advantage_mean": -1.83160115962977e-08, "advantage_min": -1.0411207303404808, "advantage_std": 0.9998467117547989, "completion_length": 1512.3333892822266, "epoch": 0.21714285714285714, "grad_norm": 0.2728053331375122, "kl": 0.008083343505859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.01636806561836e-07, "loss": 0.0003, "reward": 0.5598836690187454, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5598836690187454, "reward_after_std": 0.7893565334379673, "reward_before_mean": 0.8278303791303188, "reward_before_std": 0.7781634032726288, "reward_change_max": 0.0, "reward_change_mean": -0.2679467387497425, "reward_change_min": -0.4867997542023659, "reward_change_std": 0.1817149631679058, "reward_std": 0.7893565557897091, "rewards/cosine_scaled_reward": -0.013168148114345968, "rewards/format_reward": 0.854166679084301, "step": 190 }, { "advantage_max": 1.7405344098806381, "advantage_mean": -3.47693762670076e-08, "advantage_min": -0.9780000820755959, "advantage_std": 0.9997963458299637, "completion_length": 951.4375305175781, "epoch": 0.21828571428571428, "grad_norm": 0.2622581124305725, "kl": 0.0057964324951171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.864835481159389, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.864835481159389, "reward_after_std": 0.8386146035045385, "reward_before_mean": 1.192215159535408, "reward_before_std": 0.7938100956380367, "reward_change_max": 0.0, "reward_change_mean": -0.32737973518669605, "reward_change_min": -0.5288929119706154, "reward_change_std": 0.20689752884209156, "reward_std": 0.8386146258562803, "rewards/cosine_scaled_reward": 0.11694091919343919, "rewards/format_reward": 0.9583333432674408, "step": 191 }, { "advantage_max": 1.6680810749530792, "advantage_mean": -2.297262446937509e-08, "advantage_min": -1.0754312574863434, "advantage_std": 0.9998167529702187, "completion_length": 1985.270866394043, "epoch": 0.21942857142857142, "grad_norm": 0.23176267743110657, "kl": 0.007137298583984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.964034505716476e-07, "loss": 0.0003, "reward": 0.240308852866292, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.240308852866292, "reward_after_std": 0.7073502019047737, "reward_before_mean": 0.4445103667676449, "reward_before_std": 0.6949526704847813, "reward_change_max": 0.0, "reward_change_mean": -0.20420153997838497, "reward_change_min": -0.384893910959363, "reward_change_std": 0.1380018894560635, "reward_std": 0.7073502317070961, "rewards/cosine_scaled_reward": -0.12149481847882271, "rewards/format_reward": 0.6875000186264515, "step": 192 }, { "advantage_max": 1.7411630600690842, "advantage_mean": 3.1044145032410597e-10, "advantage_min": -0.9968269243836403, "advantage_std": 0.9998300522565842, "completion_length": 2419.7084045410156, "epoch": 0.22057142857142858, "grad_norm": 0.20773284137248993, "kl": 0.0078582763671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.93768694627233e-07, "loss": 0.0003, "reward": 0.06769353523850441, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06769353523850441, "reward_after_std": 0.819042269140482, "reward_before_mean": 0.22256720066070557, "reward_before_std": 0.7970810793340206, "reward_change_max": 0.0009588897228240967, "reward_change_mean": -0.15487368637695909, "reward_change_min": -0.27775100991129875, "reward_change_std": 0.1097005819901824, "reward_std": 0.8190423138439655, "rewards/cosine_scaled_reward": -0.15954973798943684, "rewards/format_reward": 0.5416666753590107, "step": 193 }, { "advantage_max": 1.6247856467962265, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.9195460751652718, "advantage_std": 0.9998468160629272, "completion_length": 2124.458366394043, "epoch": 0.22171428571428572, "grad_norm": 0.22982855141162872, "kl": 0.007961273193359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.911220577405484e-07, "loss": 0.0003, "reward": 0.6637103334069252, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6637103334069252, "reward_after_std": 0.898862686008215, "reward_before_mean": 0.9483567178249359, "reward_before_std": 0.8963012248277664, "reward_change_max": 0.00025325268507003784, "reward_change_mean": -0.2846463564783335, "reward_change_min": -0.5547180697321892, "reward_change_std": 0.2137763760983944, "reward_std": 0.8988627195358276, "rewards/cosine_scaled_reward": 0.09917834028601646, "rewards/format_reward": 0.7500000074505806, "step": 194 }, { "advantage_max": 1.7464525401592255, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.8979768976569176, "advantage_std": 0.9997522979974747, "completion_length": 1192.8125076293945, "epoch": 0.22285714285714286, "grad_norm": 0.22150519490242004, "kl": 0.0071315765380859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.884636689049422e-07, "loss": 0.0003, "reward": 0.5498414165340364, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5498414165340364, "reward_after_std": 0.548849031329155, "reward_before_mean": 0.8287271652370691, "reward_before_std": 0.4871873203665018, "reward_change_max": 0.0, "reward_change_mean": -0.27888575568795204, "reward_change_min": -0.4258766584098339, "reward_change_std": 0.155844459310174, "reward_std": 0.5488490350544453, "rewards/cosine_scaled_reward": -0.07521975645795465, "rewards/format_reward": 0.9791666716337204, "step": 195 }, { "advantage_max": 1.5998663306236267, "advantage_mean": -5.3395830201807826e-08, "advantage_min": -1.0904487520456314, "advantage_std": 0.9998405128717422, "completion_length": 2543.1875610351562, "epoch": 0.224, "grad_norm": 0.24054710566997528, "kl": 0.008991241455078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.857936576865356e-07, "loss": 0.0004, "reward": 0.3771222997456789, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3771222997456789, "reward_after_std": 0.7970558479428291, "reward_before_mean": 0.607517649885267, "reward_before_std": 0.7950811851769686, "reward_change_max": 2.668052911758423e-05, "reward_change_mean": -0.23039534082636237, "reward_change_min": -0.435683973133564, "reward_change_std": 0.1760418750345707, "reward_std": 0.7970558591187, "rewards/cosine_scaled_reward": -0.03999119042418897, "rewards/format_reward": 0.687500013038516, "step": 196 }, { "advantage_max": 1.7193616777658463, "advantage_mean": -1.769512969485021e-08, "advantage_min": -1.0334468707442284, "advantage_std": 0.9998365789651871, "completion_length": 1056.9375457763672, "epoch": 0.22514285714285714, "grad_norm": 0.2736615240573883, "kl": 0.006931304931640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.831121542179086e-07, "loss": 0.0003, "reward": 0.6537089729681611, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6537089729681611, "reward_after_std": 0.7846135124564171, "reward_before_mean": 0.9389471057802439, "reward_before_std": 0.7512768171727657, "reward_change_max": 0.0, "reward_change_mean": -0.28523814491927624, "reward_change_min": -0.4611331969499588, "reward_change_std": 0.17364253383129835, "reward_std": 0.7846135310828686, "rewards/cosine_scaled_reward": 0.011140207760035992, "rewards/format_reward": 0.9166666679084301, "step": 197 }, { "advantage_max": 1.5378694832324982, "advantage_mean": -1.986821485111534e-08, "advantage_min": -1.0649435445666313, "advantage_std": 0.9998499155044556, "completion_length": 1377.5000228881836, "epoch": 0.22628571428571428, "grad_norm": 0.2656911313533783, "kl": 0.0081787109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.7236167434602976, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7236167434602976, "reward_after_std": 0.8226776085793972, "reward_before_mean": 1.0276835672557354, "reward_before_std": 0.8301276378333569, "reward_change_max": 0.0, "reward_change_mean": -0.30406678281724453, "reward_change_min": -0.5162977389991283, "reward_change_std": 0.20391633734107018, "reward_std": 0.822677630931139, "rewards/cosine_scaled_reward": 0.04509176965802908, "rewards/format_reward": 0.9375000074505806, "step": 198 }, { "advantage_max": 1.7326484769582748, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.9086687192320824, "advantage_std": 0.9997498765587807, "completion_length": 1354.2292098999023, "epoch": 0.22742857142857142, "grad_norm": 0.24720071256160736, "kl": 0.007568359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.777151938545235e-07, "loss": 0.0003, "reward": 0.24219063017517328, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24219063017517328, "reward_after_std": 0.48400944843888283, "reward_before_mean": 0.46279463171958923, "reward_before_std": 0.44461123645305634, "reward_change_max": 0.0, "reward_change_mean": -0.22060398757457733, "reward_change_min": -0.36466255225241184, "reward_change_std": 0.12946867663413286, "reward_std": 0.4840094521641731, "rewards/cosine_scaled_reward": -0.2477693718392402, "rewards/format_reward": 0.9583333432674408, "step": 199 }, { "advantage_max": 1.6117542684078217, "advantage_mean": -6.208818126296478e-09, "advantage_min": -1.0823587998747826, "advantage_std": 0.999851405620575, "completion_length": 1368.333396911621, "epoch": 0.22857142857142856, "grad_norm": 0.20899228751659393, "kl": 0.007480621337890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.75e-07, "loss": 0.0003, "reward": 0.7081074807792902, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7081074807792902, "reward_after_std": 0.8072471618652344, "reward_before_mean": 1.0080002145841718, "reward_before_std": 0.8053440153598785, "reward_change_max": 0.0, "reward_change_mean": -0.2998927291482687, "reward_change_min": -0.5310935415327549, "reward_change_std": 0.19609952345490456, "reward_std": 0.8072471879422665, "rewards/cosine_scaled_reward": 0.04566674306988716, "rewards/format_reward": 0.9166666716337204, "step": 200 }, { "advantage_max": 1.5719723999500275, "advantage_mean": -7.823109804405703e-08, "advantage_min": -1.1847958788275719, "advantage_std": 0.9998306035995483, "completion_length": 1690.8958435058594, "epoch": 0.2297142857142857, "grad_norm": 0.2967695891857147, "kl": 0.007305145263671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.72273839962904e-07, "loss": 0.0003, "reward": 0.9961107671260834, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9961107671260834, "reward_after_std": 0.7360798977315426, "reward_before_mean": 1.3610759600996971, "reward_before_std": 0.7041471414268017, "reward_change_max": 0.0, "reward_change_mean": -0.36496521160006523, "reward_change_min": -0.5439505577087402, "reward_change_std": 0.220447919331491, "reward_std": 0.7360799051821232, "rewards/cosine_scaled_reward": 0.2742879637517035, "rewards/format_reward": 0.8125000074505806, "step": 201 }, { "advantage_max": 1.7172722667455673, "advantage_mean": -8.381903465748408e-08, "advantage_min": -1.0472783595323563, "advantage_std": 0.999752514064312, "completion_length": 1256.3750305175781, "epoch": 0.23085714285714284, "grad_norm": 0.23032940924167633, "kl": 0.00678253173828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.695368466124296e-07, "loss": 0.0003, "reward": 0.95308491459582, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.95308491459582, "reward_after_std": 0.47053003683686256, "reward_before_mean": 1.3243922591209412, "reward_before_std": 0.36816588416695595, "reward_change_max": 0.0, "reward_change_mean": -0.37130735348910093, "reward_change_min": -0.5054810401052237, "reward_change_std": 0.2038074992597103, "reward_std": 0.47053005918860435, "rewards/cosine_scaled_reward": 0.2351127788424492, "rewards/format_reward": 0.8541666716337204, "step": 202 }, { "advantage_max": 1.7689315378665924, "advantage_mean": -5.587935669737476e-09, "advantage_min": -1.1071807369589806, "advantage_std": 0.9998086988925934, "completion_length": 1289.8750305175781, "epoch": 0.232, "grad_norm": 0.2538374960422516, "kl": 0.009387969970703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.667891533457718e-07, "loss": 0.0004, "reward": 0.6541607966646552, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6541607966646552, "reward_after_std": 0.61843141913414, "reward_before_mean": 0.947968315333128, "reward_before_std": 0.5485080145299435, "reward_change_max": 0.0002954155206680298, "reward_change_mean": -0.29380753077566624, "reward_change_min": -0.4340146593749523, "reward_change_std": 0.16420075949281454, "reward_std": 0.6184314265847206, "rewards/cosine_scaled_reward": 0.01565080275759101, "rewards/format_reward": 0.916666679084301, "step": 203 }, { "advantage_max": 1.4909850060939789, "advantage_mean": -2.6077032311278003e-08, "advantage_min": -1.2436881214380264, "advantage_std": 0.999848447740078, "completion_length": 1297.0417098999023, "epoch": 0.23314285714285715, "grad_norm": 0.35677286982536316, "kl": 0.008518218994140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.640308940816239e-07, "loss": 0.0003, "reward": 0.6019344963133335, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6019344963133335, "reward_after_std": 0.8923456855118275, "reward_before_mean": 0.8748259395360947, "reward_before_std": 0.9185019806027412, "reward_change_max": 0.0012612491846084595, "reward_change_mean": -0.2728914525359869, "reward_change_min": -0.5382020473480225, "reward_change_std": 0.20347497053444386, "reward_std": 0.8923456855118275, "rewards/cosine_scaled_reward": -0.010503708384931087, "rewards/format_reward": 0.8958333432674408, "step": 204 }, { "advantage_max": 1.611358642578125, "advantage_mean": -8.53712394111028e-08, "advantage_min": -1.0590423047542572, "advantage_std": 0.9998035132884979, "completion_length": 1398.5625381469727, "epoch": 0.2342857142857143, "grad_norm": 0.26555773615837097, "kl": 0.006938934326171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.612622032536507e-07, "loss": 0.0003, "reward": 0.9240311346948147, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9240311346948147, "reward_after_std": 0.6977498419582844, "reward_before_mean": 1.275739498436451, "reward_before_std": 0.6501687755808234, "reward_change_max": 0.0, "reward_change_mean": -0.3517083413898945, "reward_change_min": -0.565269511193037, "reward_change_std": 0.21987238712608814, "reward_std": 0.697749849408865, "rewards/cosine_scaled_reward": 0.2003697256441228, "rewards/format_reward": 0.8750000074505806, "step": 205 }, { "advantage_max": 1.7118752002716064, "advantage_mean": 6.208816905051151e-09, "advantage_min": -0.9464834704995155, "advantage_std": 0.9997338354587555, "completion_length": 2159.041702270508, "epoch": 0.23542857142857143, "grad_norm": 0.2459092140197754, "kl": 0.00711822509765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.584832158039378e-07, "loss": 0.0003, "reward": -0.07530895713716745, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07530895713716745, "reward_after_std": 0.4827368911355734, "reward_before_mean": 0.07829463575035334, "reward_before_std": 0.4613391198217869, "reward_change_max": 0.0008892938494682312, "reward_change_mean": -0.153603594051674, "reward_change_min": -0.2621854990720749, "reward_change_std": 0.10310475202277303, "reward_std": 0.4827369023114443, "rewards/cosine_scaled_reward": -0.2941860295832157, "rewards/format_reward": 0.6666666716337204, "step": 206 }, { "advantage_max": 1.7646289765834808, "advantage_mean": 3.1044089521259366e-09, "advantage_min": -0.9611957967281342, "advantage_std": 0.9998351857066154, "completion_length": 1640.5625610351562, "epoch": 0.23657142857142857, "grad_norm": 0.3210693895816803, "kl": 0.012020111083984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.556940671764124e-07, "loss": 0.0005, "reward": 0.3758093472570181, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3758093472570181, "reward_after_std": 0.7883238419890404, "reward_before_mean": 0.5995778944343328, "reward_before_std": 0.7552689164876938, "reward_change_max": 0.0, "reward_change_mean": -0.223768537864089, "reward_change_min": -0.38106095790863037, "reward_change_std": 0.14635274559259415, "reward_std": 0.7883238643407822, "rewards/cosine_scaled_reward": -0.10646106884814799, "rewards/format_reward": 0.8125000074505806, "step": 207 }, { "advantage_max": 1.5686787962913513, "advantage_mean": 2.110997954218874e-08, "advantage_min": -1.2580528557300568, "advantage_std": 0.9997912496328354, "completion_length": 1009.8125534057617, "epoch": 0.2377142857142857, "grad_norm": 0.2531468868255615, "kl": 0.00777435302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.528948933102438e-07, "loss": 0.0003, "reward": 0.6899872645735741, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6899872645735741, "reward_after_std": 0.5505052581429482, "reward_before_mean": 1.0031409859657288, "reward_before_std": 0.5190521031618118, "reward_change_max": 0.0, "reward_change_mean": -0.31315369717776775, "reward_change_min": -0.48251595720648766, "reward_change_std": 0.1780382813885808, "reward_std": 0.5505052730441093, "rewards/cosine_scaled_reward": 0.011987147852778435, "rewards/format_reward": 0.9791666716337204, "step": 208 }, { "advantage_max": 1.3751467913389206, "advantage_mean": -2.297262435835279e-08, "advantage_min": -1.2741581797599792, "advantage_std": 0.9998413994908333, "completion_length": 1126.4166946411133, "epoch": 0.23885714285714285, "grad_norm": 0.3193022608757019, "kl": 0.009975433349609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.500858306332172e-07, "loss": 0.0004, "reward": 0.6218211939558387, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6218211939558387, "reward_after_std": 0.7225112542510033, "reward_before_mean": 0.9142632093280554, "reward_before_std": 0.7535572983324528, "reward_change_max": 0.000842459499835968, "reward_change_mean": -0.2924419930204749, "reward_change_min": -0.5019077956676483, "reward_change_std": 0.1989587116986513, "reward_std": 0.7225112766027451, "rewards/cosine_scaled_reward": 0.019631581380963326, "rewards/format_reward": 0.8750000149011612, "step": 209 }, { "advantage_max": 1.652031123638153, "advantage_mean": -9.499490866149429e-08, "advantage_min": -1.114754095673561, "advantage_std": 0.9998092278838158, "completion_length": 1526.333366394043, "epoch": 0.24, "grad_norm": 0.23487278819084167, "kl": 0.006862640380859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.472670160550848e-07, "loss": 0.0003, "reward": 0.5745669873431325, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5745669873431325, "reward_after_std": 0.6210574917495251, "reward_before_mean": 0.8549545677378774, "reward_before_std": 0.5745654674246907, "reward_change_max": 2.635270357131958e-05, "reward_change_mean": -0.2803875617682934, "reward_change_min": -0.4581737369298935, "reward_change_std": 0.17075308226048946, "reward_std": 0.6210575066506863, "rewards/cosine_scaled_reward": -0.010022742673754692, "rewards/format_reward": 0.8750000111758709, "step": 210 }, { "advantage_max": 1.7116133570671082, "advantage_mean": -6.581346634337848e-08, "advantage_min": -0.9903494343161583, "advantage_std": 0.9997931122779846, "completion_length": 1457.3750534057617, "epoch": 0.24114285714285713, "grad_norm": 0.2358456403017044, "kl": 0.00867462158203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.444385869608921e-07, "loss": 0.0003, "reward": 0.6318370220251381, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6318370220251381, "reward_after_std": 0.576850164681673, "reward_before_mean": 0.9263378567993641, "reward_before_std": 0.5047197928652167, "reward_change_max": 0.00010591000318527222, "reward_change_mean": -0.2945008259266615, "reward_change_min": -0.4437066949903965, "reward_change_std": 0.17200073320418596, "reward_std": 0.576850164681673, "rewards/cosine_scaled_reward": 0.05691891070455313, "rewards/format_reward": 0.8125000074505806, "step": 211 }, { "advantage_max": 1.6534480303525925, "advantage_mean": -5.836288397009781e-08, "advantage_min": -1.0853853449225426, "advantage_std": 0.9998204335570335, "completion_length": 1065.4375457763672, "epoch": 0.2422857142857143, "grad_norm": 0.26351648569107056, "kl": 0.01288604736328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.416006812042827e-07, "loss": 0.0005, "reward": 0.8122778884135187, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8122778884135187, "reward_after_std": 0.6671516038477421, "reward_before_mean": 1.1411408353596926, "reward_before_std": 0.620491236448288, "reward_change_max": 0.0, "reward_change_mean": -0.32886295951902866, "reward_change_min": -0.5069613792002201, "reward_change_std": 0.19332514610141516, "reward_std": 0.6671516112983227, "rewards/cosine_scaled_reward": 0.11223706416785717, "rewards/format_reward": 0.9166666716337204, "step": 212 }, { "advantage_max": 1.6381594836711884, "advantage_mean": -4.8428774657161e-08, "advantage_min": -1.2275662645697594, "advantage_std": 0.9998309686779976, "completion_length": 1396.125015258789, "epoch": 0.24342857142857144, "grad_norm": 0.31681808829307556, "kl": 0.011737823486328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.387534371007797e-07, "loss": 0.0005, "reward": 0.7322141584008932, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7322141584008932, "reward_after_std": 0.7107599079608917, "reward_before_mean": 1.039572605281137, "reward_before_std": 0.6669244170188904, "reward_change_max": 5.4270029067993164e-05, "reward_change_mean": -0.30735844001173973, "reward_change_min": -0.4567818343639374, "reward_change_std": 0.18181548546999693, "reward_std": 0.7107599154114723, "rewards/cosine_scaled_reward": 0.0718696154654026, "rewards/format_reward": 0.8958333432674408, "step": 213 }, { "advantage_max": 1.562519982457161, "advantage_mean": -2.7318795670083773e-08, "advantage_min": -1.0746354907751083, "advantage_std": 0.9997971132397652, "completion_length": 1867.5625610351562, "epoch": 0.24457142857142858, "grad_norm": 0.3209463059902191, "kl": 0.01001739501953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.358969934210438e-07, "loss": 0.0004, "reward": 0.48180012218654156, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.48180012218654156, "reward_after_std": 0.6115984097123146, "reward_before_mean": 0.7484859389369376, "reward_before_std": 0.6006402317434549, "reward_change_max": 0.0003897324204444885, "reward_change_mean": -0.2666857857257128, "reward_change_min": -0.4256005696952343, "reward_change_std": 0.1750076524913311, "reward_std": 0.6115984097123146, "rewards/cosine_scaled_reward": 0.00965961068868637, "rewards/format_reward": 0.7291666846722364, "step": 214 }, { "advantage_max": 1.61798395216465, "advantage_mean": -8.133550644107146e-08, "advantage_min": -1.126285158097744, "advantage_std": 0.9997453913092613, "completion_length": 1641.9167098999023, "epoch": 0.24571428571428572, "grad_norm": 0.25225481390953064, "kl": 0.0077953338623046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.330314893841101e-07, "loss": 0.0003, "reward": 0.3076944574713707, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3076944574713707, "reward_after_std": 0.4532122015953064, "reward_before_mean": 0.5455555971711874, "reward_before_std": 0.431146040558815, "reward_change_max": 0.0002174675464630127, "reward_change_mean": -0.23786117020063102, "reward_change_min": -0.3689110726118088, "reward_change_std": 0.1408306835219264, "reward_std": 0.4532122276723385, "rewards/cosine_scaled_reward": -0.12305555492639542, "rewards/format_reward": 0.791666679084301, "step": 215 }, { "advantage_max": 1.705009326338768, "advantage_mean": -9.126961747485396e-08, "advantage_min": -1.099523849785328, "advantage_std": 0.9997986108064651, "completion_length": 902.7916793823242, "epoch": 0.24685714285714286, "grad_norm": 0.2886558175086975, "kl": 0.008966445922851562, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.301570646506027e-07, "loss": 0.0004, "reward": 0.8277550789935049, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8277550789935049, "reward_after_std": 0.6353770308196545, "reward_before_mean": 1.1602430697530508, "reward_before_std": 0.5769399423152208, "reward_change_max": 0.0, "reward_change_mean": -0.33248797059059143, "reward_change_min": -0.5088830962777138, "reward_change_std": 0.19062372762709856, "reward_std": 0.6353770382702351, "rewards/cosine_scaled_reward": 0.10095484089106321, "rewards/format_reward": 0.9583333358168602, "step": 216 }, { "advantage_max": 1.7508010566234589, "advantage_mean": -2.607703353252333e-08, "advantage_min": -0.8888709247112274, "advantage_std": 0.9998551979660988, "completion_length": 1312.458381652832, "epoch": 0.248, "grad_norm": 0.261865496635437, "kl": 0.009029388427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.27273859315928e-07, "loss": 0.0004, "reward": 0.793455844046548, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.793455844046548, "reward_after_std": 0.8174287676811218, "reward_before_mean": 1.1041301563382149, "reward_before_std": 0.7625340297818184, "reward_change_max": 0.0, "reward_change_mean": -0.3106742948293686, "reward_change_min": -0.5126793663948774, "reward_change_std": 0.18933186866343021, "reward_std": 0.8174287900328636, "rewards/cosine_scaled_reward": 0.12498173583298922, "rewards/format_reward": 0.8541666679084301, "step": 217 }, { "advantage_max": 1.6184561103582382, "advantage_mean": -1.4280280402623191e-08, "advantage_min": -1.0892015025019646, "advantage_std": 0.9997873827815056, "completion_length": 1377.6250305175781, "epoch": 0.24914285714285714, "grad_norm": 0.237302765250206, "kl": 0.007476806640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": 0.2374113779515028, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.2374113779515028, "reward_after_std": 0.5709048807621002, "reward_before_mean": 0.4522370882332325, "reward_before_std": 0.5512584503740072, "reward_change_max": 0.0008431896567344666, "reward_change_mean": -0.2148257028311491, "reward_change_min": -0.3685059826821089, "reward_change_std": 0.13741043116897345, "reward_std": 0.570904903113842, "rewards/cosine_scaled_reward": -0.21138146799057722, "rewards/format_reward": 0.8750000111758709, "step": 218 }, { "advantage_max": 1.6508960127830505, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -1.0674656257033348, "advantage_std": 0.9998109415173531, "completion_length": 1250.1666870117188, "epoch": 0.2502857142857143, "grad_norm": 0.30771803855895996, "kl": 0.008953094482421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.214816693576234e-07, "loss": 0.0004, "reward": 0.7107078991830349, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.7107078991830349, "reward_after_std": 0.6987728551030159, "reward_before_mean": 1.0161756947636604, "reward_before_std": 0.6660973466932774, "reward_change_max": 0.0, "reward_change_mean": -0.3054677518084645, "reward_change_min": -0.48597782105207443, "reward_change_std": 0.18221312388777733, "reward_std": 0.6987728625535965, "rewards/cosine_scaled_reward": 0.08100447617471218, "rewards/format_reward": 0.8541666716337204, "step": 219 }, { "advantage_max": 1.6507864892482758, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.8715488091111183, "advantage_std": 0.9997882917523384, "completion_length": 1461.0208644866943, "epoch": 0.25142857142857145, "grad_norm": 0.30054739117622375, "kl": 0.010099411010742188, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.185729670371604e-07, "loss": 0.0004, "reward": 0.08521711453795433, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08521711453795433, "reward_after_std": 0.5252517238259315, "reward_before_mean": 0.2699965760111809, "reward_before_std": 0.5070017725229263, "reward_change_max": 0.0002897605299949646, "reward_change_mean": -0.18477945029735565, "reward_change_min": -0.34365820325911045, "reward_change_std": 0.12291183322668076, "reward_std": 0.5252517312765121, "rewards/cosine_scaled_reward": -0.29208505246788263, "rewards/format_reward": 0.8541666697710752, "step": 220 }, { "advantage_max": 1.7164071947336197, "advantage_mean": -9.93410742555767e-09, "advantage_min": -0.881681602448225, "advantage_std": 0.9997705519199371, "completion_length": 1046.5833702087402, "epoch": 0.25257142857142856, "grad_norm": 0.26651760935783386, "kl": 0.00786590576171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.156560487081051e-07, "loss": 0.0003, "reward": 0.6971848933026195, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6971848933026195, "reward_after_std": 0.5582109466195107, "reward_before_mean": 1.0091250874102116, "reward_before_std": 0.49160440918058157, "reward_change_max": 0.0, "reward_change_mean": -0.31194019317626953, "reward_change_min": -0.48183613270521164, "reward_change_std": 0.1836419040337205, "reward_std": 0.558210976421833, "rewards/cosine_scaled_reward": 0.03581253904849291, "rewards/format_reward": 0.9375000074505806, "step": 221 }, { "advantage_max": 1.6569068133831024, "advantage_mean": -3.104408563547878e-08, "advantage_min": -0.9789787083864212, "advantage_std": 0.9998027682304382, "completion_length": 1323.1875381469727, "epoch": 0.2537142857142857, "grad_norm": 0.28783875703811646, "kl": 0.009412765502929688, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.127310565369415e-07, "loss": 0.0004, "reward": 0.7466136773582548, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7466136773582548, "reward_after_std": 0.57162756472826, "reward_before_mean": 1.0669129355810583, "reward_before_std": 0.5085346233099699, "reward_change_max": 0.0, "reward_change_mean": -0.3202992007136345, "reward_change_min": -0.5050487257540226, "reward_change_std": 0.18606600351631641, "reward_std": 0.5716275870800018, "rewards/cosine_scaled_reward": 0.11678976844996214, "rewards/format_reward": 0.8333333358168602, "step": 222 }, { "advantage_max": 1.6416790187358856, "advantage_mean": -5.960464510845753e-08, "advantage_min": -1.1050887554883957, "advantage_std": 0.9997776672244072, "completion_length": 1424.4167098999023, "epoch": 0.25485714285714284, "grad_norm": 0.23706035315990448, "kl": 0.008235931396484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.097981330836616e-07, "loss": 0.0003, "reward": 0.46483216737397015, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46483216737397015, "reward_after_std": 0.5388064533472061, "reward_before_mean": 0.7305796891450882, "reward_before_std": 0.508102111518383, "reward_change_max": 0.0, "reward_change_mean": -0.2657475220039487, "reward_change_min": -0.42238375917077065, "reward_change_std": 0.16147217992693186, "reward_std": 0.5388064719736576, "rewards/cosine_scaled_reward": -0.0513768270611763, "rewards/format_reward": 0.8333333432674408, "step": 223 }, { "advantage_max": 1.5843409299850464, "advantage_mean": -6.208817238118058e-09, "advantage_min": -1.094870388507843, "advantage_std": 0.9998336359858513, "completion_length": 1769.8125610351562, "epoch": 0.256, "grad_norm": 0.22389104962348938, "kl": 0.00800323486328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.068574212948169e-07, "loss": 0.0003, "reward": 0.426870440132916, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.426870440132916, "reward_after_std": 0.7609815746545792, "reward_before_mean": 0.6692265486344695, "reward_before_std": 0.755431704223156, "reward_change_max": 0.0, "reward_change_mean": -0.24235611967742443, "reward_change_min": -0.4466691426932812, "reward_change_std": 0.16528659500181675, "reward_std": 0.760981597006321, "rewards/cosine_scaled_reward": -0.07163673074683174, "rewards/format_reward": 0.8125000074505806, "step": 224 }, { "advantage_max": 1.5787824094295502, "advantage_mean": -1.6142924552653426e-08, "advantage_min": -1.0859878808259964, "advantage_std": 0.9998409599065781, "completion_length": 1509.2916946411133, "epoch": 0.2571428571428571, "grad_norm": 0.31962674856185913, "kl": 0.013401031494140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.039090644965509e-07, "loss": 0.0005, "reward": 0.4917536824941635, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4917536824941635, "reward_after_std": 0.7545221112668514, "reward_before_mean": 0.7470511943101883, "reward_before_std": 0.7450950369238853, "reward_change_max": 0.0, "reward_change_mean": -0.2552974782884121, "reward_change_min": -0.4263688549399376, "reward_change_std": 0.16516633983701468, "reward_std": 0.7545221336185932, "rewards/cosine_scaled_reward": -0.053557755425572395, "rewards/format_reward": 0.8541666753590107, "step": 225 }, { "advantage_max": 1.488643042743206, "advantage_mean": -5.525847351917079e-08, "advantage_min": -1.2618045508861542, "advantage_std": 0.999822273850441, "completion_length": 1414.9792098999023, "epoch": 0.2582857142857143, "grad_norm": 0.21461425721645355, "kl": 0.008453369140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.009532063876148e-07, "loss": 0.0003, "reward": 0.906906258314848, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.906906258314848, "reward_after_std": 0.7242296598851681, "reward_before_mean": 1.2583957947790623, "reward_before_std": 0.7252499852329493, "reward_change_max": 0.0, "reward_change_mean": -0.35148957930505276, "reward_change_min": -0.5474782064557076, "reward_change_std": 0.221489024348557, "reward_std": 0.7242296785116196, "rewards/cosine_scaled_reward": 0.17086457274854183, "rewards/format_reward": 0.9166666865348816, "step": 226 }, { "advantage_max": 1.657107725739479, "advantage_mean": 6.208814573582799e-10, "advantage_min": -1.0568899437785149, "advantage_std": 0.999826692044735, "completion_length": 1037.1667022705078, "epoch": 0.25942857142857145, "grad_norm": 0.30343934893608093, "kl": 0.011219024658203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.979899910323624e-07, "loss": 0.0004, "reward": 0.5653025805950165, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5653025805950165, "reward_after_std": 0.7990770377218723, "reward_before_mean": 0.829374760389328, "reward_before_std": 0.7766488343477249, "reward_change_max": 0.0, "reward_change_mean": -0.2640721816569567, "reward_change_min": -0.4902346208691597, "reward_change_std": 0.1744185872375965, "reward_std": 0.7990770451724529, "rewards/cosine_scaled_reward": -0.07489595795050263, "rewards/format_reward": 0.9791666716337204, "step": 227 }, { "advantage_max": 1.6345582455396652, "advantage_mean": -1.552204320631745e-08, "advantage_min": -1.091313198208809, "advantage_std": 0.9997934699058533, "completion_length": 1175.6875190734863, "epoch": 0.26057142857142856, "grad_norm": 0.2735670506954193, "kl": 0.008899688720703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.950195628537299e-07, "loss": 0.0004, "reward": 0.7679368201643229, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7679368201643229, "reward_after_std": 0.5992216616868973, "reward_before_mean": 1.0934365428984165, "reward_before_std": 0.5540211275219917, "reward_change_max": 0.00032895803451538086, "reward_change_mean": -0.3254997171461582, "reward_change_min": -0.5017792768776417, "reward_change_std": 0.18997229263186455, "reward_std": 0.5992216691374779, "rewards/cosine_scaled_reward": 0.07796826213598251, "rewards/format_reward": 0.9375000074505806, "step": 228 }, { "advantage_max": 1.6010667532682419, "advantage_mean": -1.1796752796833232e-08, "advantage_min": -1.0798330828547478, "advantage_std": 0.9998021498322487, "completion_length": 1383.2708892822266, "epoch": 0.26171428571428573, "grad_norm": 0.3279111087322235, "kl": 0.01459503173828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.920420666261961e-07, "loss": 0.0006, "reward": 0.5010130619630218, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5010130619630218, "reward_after_std": 0.5620199963450432, "reward_before_mean": 0.7702241754159331, "reward_before_std": 0.5329751931130886, "reward_change_max": 0.00018850713968276978, "reward_change_mean": -0.2692110911011696, "reward_change_min": -0.45008396729826927, "reward_change_std": 0.15964961983263493, "reward_std": 0.5620200261473656, "rewards/cosine_scaled_reward": -0.052387919276952744, "rewards/format_reward": 0.8750000074505806, "step": 229 }, { "advantage_max": 1.7147899568080902, "advantage_mean": -2.1109978876054925e-08, "advantage_min": -1.0958684533834457, "advantage_std": 0.9997913986444473, "completion_length": 1725.1875457763672, "epoch": 0.26285714285714284, "grad_norm": 0.2207302302122116, "kl": 0.009916305541992188, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.890576474687263e-07, "loss": 0.0004, "reward": 0.09772306028753519, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09772306028753519, "reward_after_std": 0.6152992285788059, "reward_before_mean": 0.2763853659853339, "reward_before_std": 0.5864697769284248, "reward_change_max": 4.410743713378906e-05, "reward_change_mean": -0.1786623066291213, "reward_change_min": -0.29143037647008896, "reward_change_std": 0.11186036374419928, "reward_std": 0.6152992323040962, "rewards/cosine_scaled_reward": -0.257640658528544, "rewards/format_reward": 0.7916666846722364, "step": 230 }, { "advantage_max": 1.649023875594139, "advantage_mean": -5.650023726655462e-08, "advantage_min": -1.1833641976118088, "advantage_std": 0.9998235106468201, "completion_length": 1206.6875534057617, "epoch": 0.264, "grad_norm": 0.2692098915576935, "kl": 0.0087738037109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.860664508377001e-07, "loss": 0.0004, "reward": 0.7332293977960944, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7332293977960944, "reward_after_std": 0.6610595881938934, "reward_before_mean": 1.045009451918304, "reward_before_std": 0.6180670075118542, "reward_change_max": 0.0, "reward_change_mean": -0.3117800485342741, "reward_change_min": -0.5020957123488188, "reward_change_std": 0.18412381410598755, "reward_std": 0.6610595881938934, "rewards/cosine_scaled_reward": 0.0641713603399694, "rewards/format_reward": 0.9166666716337204, "step": 231 }, { "advantage_max": 1.7163164764642715, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.9364407062530518, "advantage_std": 0.9998013451695442, "completion_length": 1682.9375305175781, "epoch": 0.2651428571428571, "grad_norm": 0.22369278967380524, "kl": 0.010684967041015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.83068622519821e-07, "loss": 0.0004, "reward": 0.34763477742671967, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34763477742671967, "reward_after_std": 0.6970238201320171, "reward_before_mean": 0.5743373781442642, "reward_before_std": 0.6692356579005718, "reward_change_max": 0.00011929869651794434, "reward_change_mean": -0.22670260351151228, "reward_change_min": -0.4087493382394314, "reward_change_std": 0.14438875764608383, "reward_std": 0.6970238536596298, "rewards/cosine_scaled_reward": -0.1399146532639861, "rewards/format_reward": 0.8541666716337204, "step": 232 }, { "advantage_max": 1.7902098149061203, "advantage_mean": -4.097819406023717e-08, "advantage_min": -0.9070549011230469, "advantage_std": 0.9997976720333099, "completion_length": 923.9792098999023, "epoch": 0.2662857142857143, "grad_norm": 0.2506750226020813, "kl": 0.006481170654296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.800643086250121e-07, "loss": 0.0003, "reward": 0.4007737059146166, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4007737059146166, "reward_after_std": 0.6145709976553917, "reward_before_mean": 0.6410532779991627, "reward_before_std": 0.5568097867071629, "reward_change_max": 0.0, "reward_change_mean": -0.24027959816157818, "reward_change_min": -0.38382837176322937, "reward_change_std": 0.13632231950759888, "reward_std": 0.6145710311830044, "rewards/cosine_scaled_reward": -0.17947336845099926, "rewards/format_reward": 1.0, "step": 233 }, { "advantage_max": 1.7843565493822098, "advantage_mean": 2.6077034309679448e-08, "advantage_min": -0.914627306163311, "advantage_std": 0.9997813999652863, "completion_length": 1496.5416946411133, "epoch": 0.2674285714285714, "grad_norm": 0.2820890545845032, "kl": 0.011810302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.770536555792944e-07, "loss": 0.0005, "reward": 0.39785597764421254, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.39785597764421254, "reward_after_std": 0.6381666585803032, "reward_before_mean": 0.6381850223988295, "reward_before_std": 0.5803314950317144, "reward_change_max": 0.0, "reward_change_mean": -0.24032901134341955, "reward_change_min": -0.38727836683392525, "reward_change_std": 0.145506224129349, "reward_std": 0.6381666623055935, "rewards/cosine_scaled_reward": -0.09757416089996696, "rewards/format_reward": 0.8333333432674408, "step": 234 }, { "advantage_max": 1.8626025319099426, "advantage_mean": -5.463759256141287e-08, "advantage_min": -0.8086363673210144, "advantage_std": 0.9998244345188141, "completion_length": 1031.7917022705078, "epoch": 0.26857142857142857, "grad_norm": 0.26428404450416565, "kl": 0.00981903076171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.740368101176495e-07, "loss": 0.0004, "reward": 0.9631862174719572, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9631862174719572, "reward_after_std": 0.7271666266024113, "reward_before_mean": 1.3131061717867851, "reward_before_std": 0.6203845215495676, "reward_change_max": 0.0, "reward_change_mean": -0.34991992451250553, "reward_change_min": -0.5234523415565491, "reward_change_std": 0.19631488993763924, "reward_std": 0.727166660130024, "rewards/cosine_scaled_reward": 0.16696972399950027, "rewards/format_reward": 0.9791666716337204, "step": 235 }, { "advantage_max": 1.6912491768598557, "advantage_mean": -4.3461718668424965e-09, "advantage_min": -1.0445576757192612, "advantage_std": 0.9997703358530998, "completion_length": 1854.4167022705078, "epoch": 0.26971428571428574, "grad_norm": 0.24529314041137695, "kl": 0.012115478515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.710139192768694e-07, "loss": 0.0005, "reward": 0.22261589765548706, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22261589765548706, "reward_after_std": 0.6593184359371662, "reward_before_mean": 0.42628917563706636, "reward_before_std": 0.6346053350716829, "reward_change_max": 0.0, "reward_change_mean": -0.20367325004190207, "reward_change_min": -0.37755058892071247, "reward_change_std": 0.13592194765806198, "reward_std": 0.6593184620141983, "rewards/cosine_scaled_reward": -0.1514387633651495, "rewards/format_reward": 0.7291666679084301, "step": 236 }, { "advantage_max": 1.6201680451631546, "advantage_mean": -3.104408596854569e-08, "advantage_min": -1.2312413528561592, "advantage_std": 0.9998116791248322, "completion_length": 1428.4791870117188, "epoch": 0.27085714285714285, "grad_norm": 0.21748077869415283, "kl": 0.008556365966796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.679851303883891e-07, "loss": 0.0003, "reward": 0.6938337534666061, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6938337534666061, "reward_after_std": 0.6575784459710121, "reward_before_mean": 0.9981541857123375, "reward_before_std": 0.6225257329642773, "reward_change_max": 0.0, "reward_change_mean": -0.30432047322392464, "reward_change_min": -0.4677902590483427, "reward_change_std": 0.1763472305610776, "reward_std": 0.6575784794986248, "rewards/cosine_scaled_reward": 0.061577089596539736, "rewards/format_reward": 0.875, "step": 237 }, { "advantage_max": 1.7355145364999771, "advantage_mean": -8.07146262049585e-08, "advantage_min": -0.9608321115374565, "advantage_std": 0.9998087286949158, "completion_length": 1263.1042251586914, "epoch": 0.272, "grad_norm": 0.25904324650764465, "kl": 0.01227569580078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.649505910711058e-07, "loss": 0.0005, "reward": 0.7409389466047287, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7409389466047287, "reward_after_std": 0.8139899540692568, "reward_before_mean": 1.042520135641098, "reward_before_std": 0.7730010617524385, "reward_change_max": 0.000684693455696106, "reward_change_mean": -0.3015812076628208, "reward_change_min": -0.5144244804978371, "reward_change_std": 0.19731996580958366, "reward_std": 0.8139900006353855, "rewards/cosine_scaled_reward": 0.06292672269046307, "rewards/format_reward": 0.9166666716337204, "step": 238 }, { "advantage_max": 1.5609101951122284, "advantage_mean": -7.047007721805443e-08, "advantage_min": -1.255035139620304, "advantage_std": 0.9998224526643753, "completion_length": 1285.2292137145996, "epoch": 0.27314285714285713, "grad_norm": 0.2311781495809555, "kl": 0.00722503662109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.619104492241847e-07, "loss": 0.0003, "reward": 1.021848929580301, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.021848929580301, "reward_after_std": 0.7617662567645311, "reward_before_mean": 1.3911364481318742, "reward_before_std": 0.7432506419718266, "reward_change_max": 0.00010582804679870605, "reward_change_mean": -0.3692875001579523, "reward_change_min": -0.5409754365682602, "reward_change_std": 0.21951518952846527, "reward_std": 0.7617662828415632, "rewards/cosine_scaled_reward": 0.27890151739120483, "rewards/format_reward": 0.833333333954215, "step": 239 }, { "advantage_max": 1.8245199620723724, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.8752617910504341, "advantage_std": 0.9997757226228714, "completion_length": 1043.7708587646484, "epoch": 0.2742857142857143, "grad_norm": 0.33578115701675415, "kl": 0.010406494140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.588648530198504e-07, "loss": 0.0004, "reward": 0.2590970569290221, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2590970569290221, "reward_after_std": 0.5167682282626629, "reward_before_mean": 0.4774673692882061, "reward_before_std": 0.4506372455507517, "reward_change_max": 0.0, "reward_change_mean": -0.21837032958865166, "reward_change_min": -0.32546042278409004, "reward_change_std": 0.11699494253844023, "reward_std": 0.5167682506144047, "rewards/cosine_scaled_reward": -0.2404329781420529, "rewards/format_reward": 0.9583333432674408, "step": 240 }, { "advantage_max": 1.6402872800827026, "advantage_mean": -2.4214386995513593e-08, "advantage_min": -1.0937683582305908, "advantage_std": 0.9998233839869499, "completion_length": 1367.1666870117188, "epoch": 0.2754285714285714, "grad_norm": 0.27239882946014404, "kl": 0.01021575927734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.558139508961654e-07, "loss": 0.0004, "reward": 0.358948964625597, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.358948964625597, "reward_after_std": 0.6905892789363861, "reward_before_mean": 0.5892646436113864, "reward_before_std": 0.6633242294192314, "reward_change_max": 0.0001598149538040161, "reward_change_mean": -0.2303156852722168, "reward_change_min": -0.394557923078537, "reward_change_std": 0.14510553609579802, "reward_std": 0.6905892826616764, "rewards/cosine_scaled_reward": -0.17411768180318177, "rewards/format_reward": 0.9375000074505806, "step": 241 }, { "advantage_max": 1.710006132721901, "advantage_mean": 2.1730860000346297e-08, "advantage_min": -0.9204581826925278, "advantage_std": 0.9996551722288132, "completion_length": 946.6041870117188, "epoch": 0.2765714285714286, "grad_norm": 0.33036869764328003, "kl": 0.014711380004882812, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.527578915497951e-07, "loss": 0.0006, "reward": 0.4683981789276004, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4683981789276004, "reward_after_std": 0.4530638074502349, "reward_before_mean": 0.7394059164216742, "reward_before_std": 0.39871928084176034, "reward_change_max": 0.0, "reward_change_mean": -0.2710077129304409, "reward_change_min": -0.42335289902985096, "reward_change_std": 0.1545707117766142, "reward_std": 0.4530638186261058, "rewards/cosine_scaled_reward": -0.11988039966672659, "rewards/format_reward": 0.9791666716337204, "step": 242 }, { "advantage_max": 1.7547654956579208, "advantage_mean": -3.7252904094842165e-08, "advantage_min": -0.8856014385819435, "advantage_std": 0.9998505413532257, "completion_length": 1448.6875228881836, "epoch": 0.2777142857142857, "grad_norm": 0.21732625365257263, "kl": 0.008548736572265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.6196297630667686, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6196297630667686, "reward_after_std": 0.7986881732940674, "reward_before_mean": 0.8930200412869453, "reward_before_std": 0.7416506707668304, "reward_change_max": 0.0, "reward_change_mean": -0.2733902661129832, "reward_change_min": -0.4578908830881119, "reward_change_std": 0.16526594944298267, "reward_std": 0.7986881770193577, "rewards/cosine_scaled_reward": 0.009009993635118008, "rewards/format_reward": 0.875, "step": 243 }, { "advantage_max": 1.6024657785892487, "advantage_mean": -4.284083965355734e-08, "advantage_min": -1.1661089807748795, "advantage_std": 0.9998447969555855, "completion_length": 1410.9792098999023, "epoch": 0.27885714285714286, "grad_norm": 0.25495943427085876, "kl": 0.007928848266601562, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 0.8061085338704288, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8061085338704288, "reward_after_std": 0.7023773118853569, "reward_before_mean": 1.131754757836461, "reward_before_std": 0.6597034148871899, "reward_change_max": 0.0, "reward_change_mean": -0.32564621791243553, "reward_change_min": -0.515510767698288, "reward_change_std": 0.19546143896877766, "reward_std": 0.7023773454129696, "rewards/cosine_scaled_reward": 0.08671068772673607, "rewards/format_reward": 0.9583333432674408, "step": 244 }, { "advantage_max": 1.6460980474948883, "advantage_mean": -2.980232349791834e-08, "advantage_min": -1.046926312148571, "advantage_std": 0.999882735311985, "completion_length": 1751.062515258789, "epoch": 0.28, "grad_norm": 0.2102421373128891, "kl": 0.008523941040039062, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.435602608679916e-07, "loss": 0.0003, "reward": 0.6302961353212595, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6302961353212595, "reward_after_std": 0.992115680128336, "reward_before_mean": 0.9001847244799137, "reward_before_std": 0.9945877194404602, "reward_change_max": 0.00019932538270950317, "reward_change_mean": -0.2698885854333639, "reward_change_min": -0.5097962245345116, "reward_change_std": 0.19624944310635328, "reward_std": 0.9921157024800777, "rewards/cosine_scaled_reward": 0.04384234419558197, "rewards/format_reward": 0.8125000055879354, "step": 245 }, { "advantage_max": 1.7886092513799667, "advantage_mean": 2.8560559139911845e-08, "advantage_min": -0.8694706782698631, "advantage_std": 0.9998341947793961, "completion_length": 1462.0000457763672, "epoch": 0.28114285714285714, "grad_norm": 0.23334245383739471, "kl": 0.012065887451171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.404850645156841e-07, "loss": 0.0005, "reward": 0.4586481023579836, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4586481023579836, "reward_after_std": 0.6987153068184853, "reward_before_mean": 0.7044544890522957, "reward_before_std": 0.6329842433333397, "reward_change_max": 0.0, "reward_change_mean": -0.24580636341124773, "reward_change_min": -0.4013601616024971, "reward_change_std": 0.14217450562864542, "reward_std": 0.698715329170227, "rewards/cosine_scaled_reward": -0.08527276385575533, "rewards/format_reward": 0.8750000111758709, "step": 246 }, { "advantage_max": 1.689618080854416, "advantage_mean": -1.117587078436344e-08, "advantage_min": -1.0913282707333565, "advantage_std": 0.9997552260756493, "completion_length": 2003.4375305175781, "epoch": 0.2822857142857143, "grad_norm": 0.2768010199069977, "kl": 0.014162063598632812, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.374054580489873e-07, "loss": 0.0006, "reward": 0.13496133871376514, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13496133871376514, "reward_after_std": 0.5509476698935032, "reward_before_mean": 0.33065611124038696, "reward_before_std": 0.5430520595982671, "reward_change_max": 0.0004723742604255676, "reward_change_mean": -0.19569478183984756, "reward_change_min": -0.3195792939513922, "reward_change_std": 0.13450893759727478, "reward_std": 0.5509477015584707, "rewards/cosine_scaled_reward": -0.18883861787617207, "rewards/format_reward": 0.7083333469927311, "step": 247 }, { "advantage_max": 1.5921323150396347, "advantage_mean": -1.3348957372816272e-07, "advantage_min": -1.2049953117966652, "advantage_std": 0.9998230114579201, "completion_length": 1375.2291946411133, "epoch": 0.2834285714285714, "grad_norm": 0.32450905442237854, "kl": 0.012350082397460938, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.343215915635761e-07, "loss": 0.0005, "reward": 0.8880419675260782, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8880419675260782, "reward_after_std": 0.676459863781929, "reward_before_mean": 1.2321948036551476, "reward_before_std": 0.6238557770848274, "reward_change_max": 0.0, "reward_change_mean": -0.3441528985276818, "reward_change_min": -0.5215148255228996, "reward_change_std": 0.2035660557448864, "reward_std": 0.6764598675072193, "rewards/cosine_scaled_reward": 0.22026405856013298, "rewards/format_reward": 0.7916666716337204, "step": 248 }, { "advantage_max": 1.7209616303443909, "advantage_mean": -2.8560559028889543e-08, "advantage_min": -0.9680159725248814, "advantage_std": 0.9998212978243828, "completion_length": 1224.8750457763672, "epoch": 0.2845714285714286, "grad_norm": 0.26189547777175903, "kl": 0.0107269287109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.31233615362752e-07, "loss": 0.0004, "reward": 0.8255430636927485, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8255430636927485, "reward_after_std": 0.6577041186392307, "reward_before_mean": 1.1544302143156528, "reward_before_std": 0.5823190435767174, "reward_change_max": 0.0, "reward_change_mean": -0.3288871766999364, "reward_change_min": -0.4670917894691229, "reward_change_std": 0.18136184941977262, "reward_std": 0.6577041260898113, "rewards/cosine_scaled_reward": 0.1188817722722888, "rewards/format_reward": 0.9166666679084301, "step": 249 }, { "advantage_max": 1.5140611678361893, "advantage_mean": -1.8005570368018198e-08, "advantage_min": -1.203395776450634, "advantage_std": 0.9998101890087128, "completion_length": 1255.0208473205566, "epoch": 0.2857142857142857, "grad_norm": 0.3561646342277527, "kl": 0.0129852294921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.281416799501187e-07, "loss": 0.0005, "reward": 0.5807215161621571, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5807215161621571, "reward_after_std": 0.762598255649209, "reward_before_mean": 0.8581455871462822, "reward_before_std": 0.7728062570095062, "reward_change_max": 0.0, "reward_change_mean": -0.27742408588528633, "reward_change_min": -0.4606732130050659, "reward_change_std": 0.18659934867173433, "reward_std": 0.7625982705503702, "rewards/cosine_scaled_reward": -0.01884387107565999, "rewards/format_reward": 0.8958333507180214, "step": 250 }, { "advantage_max": 1.5809199213981628, "advantage_mean": -8.692343844707295e-09, "advantage_min": -1.2370038107037544, "advantage_std": 0.9998120293021202, "completion_length": 1102.0833740234375, "epoch": 0.28685714285714287, "grad_norm": 0.28108564019203186, "kl": 0.010408401489257812, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.25045936022246e-07, "loss": 0.0004, "reward": 0.4316780879162252, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4316780879162252, "reward_after_std": 0.6830427572131157, "reward_before_mean": 0.6816601119935513, "reward_before_std": 0.6788753867149353, "reward_change_max": 0.0003069266676902771, "reward_change_mean": -0.2499820338562131, "reward_change_min": -0.40494656190276146, "reward_change_std": 0.16739745903760195, "reward_std": 0.6830427646636963, "rewards/cosine_scaled_reward": -0.08625328214839101, "rewards/format_reward": 0.8541666753590107, "step": 251 }, { "advantage_max": 1.7180158644914627, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.973113164305687, "advantage_std": 0.9998108670115471, "completion_length": 1398.2917137145996, "epoch": 0.288, "grad_norm": 0.26431670784950256, "kl": 0.010639190673828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": 0.4735152288340032, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4735152288340032, "reward_after_std": 0.6365889646112919, "reward_before_mean": 0.7305772360414267, "reward_before_std": 0.5807833485305309, "reward_change_max": 0.0, "reward_change_mean": -0.2570619937032461, "reward_change_min": -0.3992270193994045, "reward_change_std": 0.15293935127556324, "reward_std": 0.6365889683365822, "rewards/cosine_scaled_reward": -0.07221140991896391, "rewards/format_reward": 0.8750000037252903, "step": 252 }, { "advantage_max": 1.6413754224777222, "advantage_mean": 1.862645149230957e-08, "advantage_min": -0.9406393691897392, "advantage_std": 0.9998555779457092, "completion_length": 1747.3333854675293, "epoch": 0.28914285714285715, "grad_norm": 0.3222593665122986, "kl": 0.022695541381835938, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.188436263278172e-07, "loss": 0.0009, "reward": 0.41165209421887994, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41165209421887994, "reward_after_std": 0.8608526214957237, "reward_before_mean": 0.6432545175775886, "reward_before_std": 0.8611173704266548, "reward_change_max": 0.0003280565142631531, "reward_change_mean": -0.23160241451114416, "reward_change_min": -0.45648399367928505, "reward_change_std": 0.16804132983088493, "reward_std": 0.8608526289463043, "rewards/cosine_scaled_reward": -0.07420608215034008, "rewards/format_reward": 0.7916666753590107, "step": 253 }, { "advantage_max": 1.7923977673053741, "advantage_mean": -5.4327151444155675e-08, "advantage_min": -0.9530624970793724, "advantage_std": 0.9998274743556976, "completion_length": 1410.0833740234375, "epoch": 0.29028571428571426, "grad_norm": 0.3107492923736572, "kl": 0.013109207153320312, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.157373628530852e-07, "loss": 0.0005, "reward": 0.5809852974489331, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5809852974489331, "reward_after_std": 0.7450582385063171, "reward_before_mean": 0.8495601508766413, "reward_before_std": 0.6797804534435272, "reward_change_max": 0.00035993754863739014, "reward_change_mean": -0.2685748729854822, "reward_change_min": -0.42683035880327225, "reward_change_std": 0.15862839203327894, "reward_std": 0.7450582459568977, "rewards/cosine_scaled_reward": -0.03355327108874917, "rewards/format_reward": 0.9166666716337204, "step": 254 }, { "advantage_max": 1.7443495839834213, "advantage_mean": 9.93410742555767e-09, "advantage_min": -0.9862060695886612, "advantage_std": 0.9997766390442848, "completion_length": 1702.7917175292969, "epoch": 0.2914285714285714, "grad_norm": 0.26845040917396545, "kl": 0.015506744384765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.126278954320294e-07, "loss": 0.0006, "reward": 0.07089572306722403, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07089572306722403, "reward_after_std": 0.5136217400431633, "reward_before_mean": 0.2524457387626171, "reward_before_std": 0.4779893010854721, "reward_change_max": 0.0, "reward_change_mean": -0.1815500007942319, "reward_change_min": -0.29097018018364906, "reward_change_std": 0.10849831020459533, "reward_std": 0.5136217400431633, "rewards/cosine_scaled_reward": -0.24877713713794947, "rewards/format_reward": 0.7500000055879354, "step": 255 }, { "advantage_max": 1.5460007637739182, "advantage_mean": -9.62366669687853e-09, "advantage_min": -1.2941532135009766, "advantage_std": 0.9998073950409889, "completion_length": 1409.7917022705078, "epoch": 0.2925714285714286, "grad_norm": 0.2874889075756073, "kl": 0.01189422607421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.095153756157051e-07, "loss": 0.0005, "reward": 0.634954672306776, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.634954672306776, "reward_after_std": 0.7295485325157642, "reward_before_mean": 0.9263447821140289, "reward_before_std": 0.7339564729481936, "reward_change_max": 0.0031501948833465576, "reward_change_mean": -0.2913900911808014, "reward_change_min": -0.48522017523646355, "reward_change_std": 0.19647251721471548, "reward_std": 0.7295485362410545, "rewards/cosine_scaled_reward": -0.005577614530920982, "rewards/format_reward": 0.9375000149011612, "step": 256 }, { "advantage_max": 1.5703508257865906, "advantage_mean": -1.6608585839961165e-08, "advantage_min": -1.2059873640537262, "advantage_std": 0.9998743832111359, "completion_length": 1937.0417251586914, "epoch": 0.2937142857142857, "grad_norm": 0.35278812050819397, "kl": 0.012939453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.06399955103937e-07, "loss": 0.0005, "reward": 0.694405922666192, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.694405922666192, "reward_after_std": 0.9290311932563782, "reward_before_mean": 0.9840117041021585, "reward_before_std": 0.9439280852675438, "reward_change_max": 0.0, "reward_change_mean": -0.28960578329861164, "reward_change_min": -0.5085005983710289, "reward_change_std": 0.20747256092727184, "reward_std": 0.9290312454104424, "rewards/cosine_scaled_reward": 0.07533917389810085, "rewards/format_reward": 0.8333333358168602, "step": 257 }, { "advantage_max": 1.7613529562950134, "advantage_mean": -2.110997909809953e-08, "advantage_min": -0.9053919687867165, "advantage_std": 0.9998543411493301, "completion_length": 1757.7500305175781, "epoch": 0.2948571428571429, "grad_norm": 0.25500622391700745, "kl": 0.01361083984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.032817857379256e-07, "loss": 0.0005, "reward": 0.39868373051285744, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.39868373051285744, "reward_after_std": 0.8846242874860764, "reward_before_mean": 0.6204931288957596, "reward_before_std": 0.8531522713601589, "reward_change_max": 0.00040918588638305664, "reward_change_mean": -0.22180940210819244, "reward_change_min": -0.38025897182524204, "reward_change_std": 0.14975974522531033, "reward_std": 0.8846242912113667, "rewards/cosine_scaled_reward": -0.0960034430027008, "rewards/format_reward": 0.8125000055879354, "step": 258 }, { "advantage_max": 1.6192731708288193, "advantage_mean": -5.5879355587151736e-09, "advantage_min": -1.0627397671341896, "advantage_std": 0.999801829457283, "completion_length": 1482.5625610351562, "epoch": 0.296, "grad_norm": 0.3482678234577179, "kl": 0.014739990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.001610194928464e-07, "loss": 0.0006, "reward": 0.7368679717183113, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7368679717183113, "reward_after_std": 0.6360752396285534, "reward_before_mean": 1.0532444640994072, "reward_before_std": 0.5891787149012089, "reward_change_max": 0.0006304755806922913, "reward_change_mean": -0.3163764523342252, "reward_change_min": -0.5091553628444672, "reward_change_std": 0.1979432748630643, "reward_std": 0.6360752433538437, "rewards/cosine_scaled_reward": 0.0787055566906929, "rewards/format_reward": 0.8958333358168602, "step": 259 }, { "advantage_max": 1.8113128542900085, "advantage_mean": -1.8626450382086546e-08, "advantage_min": -0.8748406283557415, "advantage_std": 0.9997461587190628, "completion_length": 1027.979211807251, "epoch": 0.29714285714285715, "grad_norm": 0.3253263831138611, "kl": 0.008440017700195312, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.97037808470444e-07, "loss": 0.0003, "reward": 0.8471368737518787, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8471368737518787, "reward_after_std": 0.558805363252759, "reward_before_mean": 1.1868980564177036, "reward_before_std": 0.4591397875919938, "reward_change_max": 2.481788396835327e-05, "reward_change_mean": -0.33976118452847004, "reward_change_min": -0.5015393383800983, "reward_change_std": 0.19328110944479704, "reward_std": 0.5588053781539202, "rewards/cosine_scaled_reward": 0.1142823500558734, "rewards/format_reward": 0.9583333358168602, "step": 260 }, { "advantage_max": 1.7223718613386154, "advantage_mean": -8.84756468089165e-09, "advantage_min": -1.0590002834796906, "advantage_std": 0.9997850880026817, "completion_length": 1965.2500686645508, "epoch": 0.29828571428571427, "grad_norm": 0.268317312002182, "kl": 0.0142059326171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.939123048916173e-07, "loss": 0.0006, "reward": 0.20198887400329113, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.20198887400329113, "reward_after_std": 0.5538261011242867, "reward_before_mean": 0.4085303219035268, "reward_before_std": 0.5178539101034403, "reward_change_max": 0.0, "reward_change_mean": -0.20654146187007427, "reward_change_min": -0.3276657313108444, "reward_change_std": 0.12507850490510464, "reward_std": 0.5538261160254478, "rewards/cosine_scaled_reward": -0.12906817917246372, "rewards/format_reward": 0.6666666679084301, "step": 261 }, { "advantage_max": 1.5404580384492874, "advantage_mean": 3.104409063148239e-09, "advantage_min": -1.210778832435608, "advantage_std": 0.9997341260313988, "completion_length": 1730.5208587646484, "epoch": 0.29942857142857143, "grad_norm": 0.41677936911582947, "kl": 0.019683837890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.907846610890011e-07, "loss": 0.0008, "reward": 0.08706101775169373, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.08706101775169373, "reward_after_std": 0.4480132460594177, "reward_before_mean": 0.27937868889421225, "reward_before_std": 0.4376929756253958, "reward_change_max": 0.0, "reward_change_mean": -0.19231767486780882, "reward_change_min": -0.3273830972611904, "reward_change_std": 0.11917602550238371, "reward_std": 0.4480132479220629, "rewards/cosine_scaled_reward": -0.2248940011486411, "rewards/format_reward": 0.7291666716337204, "step": 262 }, { "advantage_max": 1.6942091435194016, "advantage_mean": -1.6142925329809543e-08, "advantage_min": -0.979148268699646, "advantage_std": 0.9997653514146805, "completion_length": 1288.770866394043, "epoch": 0.30057142857142854, "grad_norm": 0.22912034392356873, "kl": 0.00693511962890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.87655029499542e-07, "loss": 0.0003, "reward": 0.42538353987038136, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.42538353987038136, "reward_after_std": 0.518636416643858, "reward_before_mean": 0.681599510833621, "reward_before_std": 0.4740534070879221, "reward_change_max": 0.0, "reward_change_mean": -0.25621596723794937, "reward_change_min": -0.3982585147023201, "reward_change_std": 0.14761138334870338, "reward_std": 0.5186364278197289, "rewards/cosine_scaled_reward": -0.12795027159154415, "rewards/format_reward": 0.9375000074505806, "step": 263 }, { "advantage_max": 1.5913608968257904, "advantage_mean": -2.2351742678949904e-08, "advantage_min": -1.1664466261863708, "advantage_std": 0.9998384416103363, "completion_length": 1332.9583587646484, "epoch": 0.3017142857142857, "grad_norm": 0.28963515162467957, "kl": 0.010570526123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.845235626570683e-07, "loss": 0.0004, "reward": 0.6335734352469444, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6335734352469444, "reward_after_std": 0.740757841616869, "reward_before_mean": 0.9181164689362049, "reward_before_std": 0.7137619107961655, "reward_change_max": 0.0, "reward_change_mean": -0.28454304300248623, "reward_change_min": -0.4633651450276375, "reward_change_std": 0.17281420156359673, "reward_std": 0.7407578490674496, "rewards/cosine_scaled_reward": -0.009691774845123291, "rewards/format_reward": 0.9375000149011612, "step": 264 }, { "advantage_max": 1.5019582211971283, "advantage_mean": -4.23751784772719e-08, "advantage_min": -1.2765265554189682, "advantage_std": 0.9998071640729904, "completion_length": 1453.1667175292969, "epoch": 0.3028571428571429, "grad_norm": 0.3727766275405884, "kl": 0.0173187255859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.813904131848564e-07, "loss": 0.0007, "reward": 0.6399871921166778, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6399871921166778, "reward_after_std": 0.7324034199118614, "reward_before_mean": 0.934943444095552, "reward_before_std": 0.7509109638631344, "reward_change_max": 0.0006519109010696411, "reward_change_mean": -0.2949562631547451, "reward_change_min": -0.5017080642282963, "reward_change_std": 0.20068126823753119, "reward_std": 0.7324034459888935, "rewards/cosine_scaled_reward": -0.0012782979756593704, "rewards/format_reward": 0.9375000074505806, "step": 265 }, { "advantage_max": 1.6453713923692703, "advantage_mean": -4.0667754053203e-08, "advantage_min": -1.1235066056251526, "advantage_std": 0.9998007044196129, "completion_length": 1395.9792251586914, "epoch": 0.304, "grad_norm": 0.28933483362197876, "kl": 0.01245880126953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.78255733788191e-07, "loss": 0.0005, "reward": 0.4439257560297847, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4439257560297847, "reward_after_std": 0.5643942356109619, "reward_before_mean": 0.703898387029767, "reward_before_std": 0.5410022251307964, "reward_change_max": 0.000517331063747406, "reward_change_mean": -0.2599726375192404, "reward_change_min": -0.40890974923968315, "reward_change_std": 0.15803362615406513, "reward_std": 0.5643942579627037, "rewards/cosine_scaled_reward": -0.09596748650074005, "rewards/format_reward": 0.8958333432674408, "step": 266 }, { "advantage_max": 1.6140058189630508, "advantage_mean": 1.6653345369377348e-16, "advantage_min": -1.124117873609066, "advantage_std": 0.9997650906443596, "completion_length": 1893.9792022705078, "epoch": 0.30514285714285716, "grad_norm": 0.32439637184143066, "kl": 0.02964019775390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.751196772469237e-07, "loss": 0.0012, "reward": 0.19872340001165867, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19872340001165867, "reward_after_std": 0.5362277999520302, "reward_before_mean": 0.4080928210169077, "reward_before_std": 0.5116072688251734, "reward_change_max": 0.0001492425799369812, "reward_change_mean": -0.20936942659318447, "reward_change_min": -0.3517877943813801, "reward_change_std": 0.13033864740282297, "reward_std": 0.5362278260290623, "rewards/cosine_scaled_reward": -0.16053693334106356, "rewards/format_reward": 0.7291666716337204, "step": 267 }, { "advantage_max": 1.544311910867691, "advantage_mean": -1.552204320631745e-08, "advantage_min": -1.1754939407110214, "advantage_std": 0.9998464211821556, "completion_length": 1315.9167251586914, "epoch": 0.3062857142857143, "grad_norm": 0.5348718762397766, "kl": 0.02091217041015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.71982396408026e-07, "loss": 0.0008, "reward": 0.7074840739369392, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7074840739369392, "reward_after_std": 0.8638963103294373, "reward_before_mean": 1.0027257055044174, "reward_before_std": 0.8752267155796289, "reward_change_max": 0.0, "reward_change_mean": -0.29524165019392967, "reward_change_min": -0.5318833738565445, "reward_change_std": 0.20659902412444353, "reward_std": 0.8638963364064693, "rewards/cosine_scaled_reward": 0.05344618018716574, "rewards/format_reward": 0.8958333507180214, "step": 268 }, { "advantage_max": 1.7572058737277985, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.0555767640471458, "advantage_std": 0.9997468441724777, "completion_length": 1388.0625190734863, "epoch": 0.30742857142857144, "grad_norm": 0.3705489933490753, "kl": 0.013675689697265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.688440441781398e-07, "loss": 0.0005, "reward": 0.2904005544260144, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2904005544260144, "reward_after_std": 0.6595730949193239, "reward_before_mean": 0.5057724388316274, "reward_before_std": 0.6225594077259302, "reward_change_max": 0.0, "reward_change_mean": -0.2153718858025968, "reward_change_min": -0.31822727248072624, "reward_change_std": 0.12597669241949916, "reward_std": 0.6595731098204851, "rewards/cosine_scaled_reward": -0.16378045734018087, "rewards/format_reward": 0.8333333488553762, "step": 269 }, { "advantage_max": 1.6698594987392426, "advantage_mean": -5.339582942465171e-08, "advantage_min": -1.1226786375045776, "advantage_std": 0.9998543411493301, "completion_length": 1540.2083587646484, "epoch": 0.30857142857142855, "grad_norm": 0.2806699872016907, "kl": 0.014301300048828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.657047735161255e-07, "loss": 0.0006, "reward": 0.8029788322746754, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8029788322746754, "reward_after_std": 0.9154071733355522, "reward_before_mean": 1.1118664667010307, "reward_before_std": 0.9013979975134134, "reward_change_max": 0.0001629069447517395, "reward_change_mean": -0.30888766795396805, "reward_change_min": -0.5338790118694305, "reward_change_std": 0.20814104191958904, "reward_std": 0.9154071845114231, "rewards/cosine_scaled_reward": 0.10801657056435943, "rewards/format_reward": 0.8958333507180214, "step": 270 }, { "advantage_max": 1.6810975968837738, "advantage_mean": -6.332993707225398e-08, "advantage_min": -1.1149731278419495, "advantage_std": 0.9998246654868126, "completion_length": 1321.1250267028809, "epoch": 0.3097142857142857, "grad_norm": 0.2993323802947998, "kl": 0.020111083984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.625647374256061e-07, "loss": 0.0008, "reward": 0.8111615749076009, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8111615749076009, "reward_after_std": 0.6996302008628845, "reward_before_mean": 1.1366247907280922, "reward_before_std": 0.6638240143656731, "reward_change_max": 4.407763481140137e-05, "reward_change_mean": -0.3254632391035557, "reward_change_min": -0.4952653609216213, "reward_change_std": 0.19443896505981684, "reward_std": 0.6996302269399166, "rewards/cosine_scaled_reward": 0.13081239815801382, "rewards/format_reward": 0.8750000055879354, "step": 271 }, { "advantage_max": 1.5797275602817535, "advantage_mean": -5.587935614226325e-09, "advantage_min": -1.1289891824126244, "advantage_std": 0.9998372495174408, "completion_length": 1758.8750305175781, "epoch": 0.31085714285714283, "grad_norm": 0.41097596287727356, "kl": 0.01924896240234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.594240889475106e-07, "loss": 0.0008, "reward": 0.3049433889100328, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3049433889100328, "reward_after_std": 0.7466313242912292, "reward_before_mean": 0.5239668264985085, "reward_before_std": 0.7606498152017593, "reward_change_max": 0.0008180141448974609, "reward_change_mean": -0.219023450743407, "reward_change_min": -0.42113321274518967, "reward_change_std": 0.16112497728317976, "reward_std": 0.7466313354671001, "rewards/cosine_scaled_reward": -0.11301657650619745, "rewards/format_reward": 0.7500000149011612, "step": 272 }, { "advantage_max": 1.7094184756278992, "advantage_mean": -6.70552275927605e-08, "advantage_min": -1.0091482996940613, "advantage_std": 0.9998253583908081, "completion_length": 1344.6875228881836, "epoch": 0.312, "grad_norm": 0.27919331192970276, "kl": 0.016841888427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.562829811526154e-07, "loss": 0.0007, "reward": 0.6978060295805335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6978060295805335, "reward_after_std": 0.7023932486772537, "reward_before_mean": 0.9987507201731205, "reward_before_std": 0.6539960987865925, "reward_change_max": 0.0, "reward_change_mean": -0.3009446784853935, "reward_change_min": -0.4641147553920746, "reward_change_std": 0.17846697755157948, "reward_std": 0.7023932598531246, "rewards/cosine_scaled_reward": 0.06187533074989915, "rewards/format_reward": 0.8750000055879354, "step": 273 }, { "advantage_max": 1.5880873054265976, "advantage_mean": -1.8626452158443385e-08, "advantage_min": -1.1842782869935036, "advantage_std": 0.9998295158147812, "completion_length": 1051.2292098999023, "epoch": 0.31314285714285717, "grad_norm": 0.3718518614768982, "kl": 0.01607513427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.531415671340826e-07, "loss": 0.0006, "reward": 0.7379360813647509, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7379360813647509, "reward_after_std": 0.7393338270485401, "reward_before_mean": 1.045187957584858, "reward_before_std": 0.7175006531178951, "reward_change_max": 0.0, "reward_change_mean": -0.3072518855333328, "reward_change_min": -0.5100837834179401, "reward_change_std": 0.1890790481120348, "reward_std": 0.7393338270485401, "rewards/cosine_scaled_reward": 0.053843963891267776, "rewards/format_reward": 0.9375000149011612, "step": 274 }, { "advantage_max": 1.491246446967125, "advantage_mean": 6.829698917520943e-09, "advantage_min": -1.1501619592308998, "advantage_std": 0.9998340085148811, "completion_length": 1448.645866394043, "epoch": 0.3142857142857143, "grad_norm": 0.34003862738609314, "kl": 0.022502899169921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.5e-07, "loss": 0.0009, "reward": 0.7442736756056547, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7442736756056547, "reward_after_std": 0.8110382054001093, "reward_before_mean": 1.0544355604797602, "reward_before_std": 0.8261386286467314, "reward_change_max": 0.000140458345413208, "reward_change_mean": -0.3101618802174926, "reward_change_min": -0.5264877546578646, "reward_change_std": 0.20748563203960657, "reward_std": 0.8110382370650768, "rewards/cosine_scaled_reward": 0.11055111582390964, "rewards/format_reward": 0.8333333414047956, "step": 275 }, { "advantage_max": 1.6783827245235443, "advantage_mean": -2.8560560139112567e-08, "advantage_min": -1.0959996059536934, "advantage_std": 0.9998864680528641, "completion_length": 1484.4375381469727, "epoch": 0.31542857142857145, "grad_norm": 0.35709822177886963, "kl": 0.02801513671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.468584328659172e-07, "loss": 0.0011, "reward": 0.609470259398222, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.609470259398222, "reward_after_std": 1.0163805298507214, "reward_before_mean": 0.867477897554636, "reward_before_std": 1.0112642236053944, "reward_change_max": 0.0029065459966659546, "reward_change_mean": -0.2580076390877366, "reward_change_min": -0.474948413670063, "reward_change_std": 0.1875058664008975, "reward_std": 1.0163805782794952, "rewards/cosine_scaled_reward": 0.05873893201351166, "rewards/format_reward": 0.7500000204890966, "step": 276 }, { "advantage_max": 1.6457444429397583, "advantage_mean": 6.8296987509874896e-09, "advantage_min": -0.9661366939544678, "advantage_std": 0.9998582229018211, "completion_length": 1450.3333587646484, "epoch": 0.31657142857142856, "grad_norm": 0.7636739611625671, "kl": 0.02922821044921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.437170188473847e-07, "loss": 0.0012, "reward": 0.437352629378438, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.437352629378438, "reward_after_std": 0.8048081770539284, "reward_before_mean": 0.6762080068292562, "reward_before_std": 0.7981316186487675, "reward_change_max": 0.00011983513832092285, "reward_change_mean": -0.23885535076260567, "reward_change_min": -0.46324050053954124, "reward_change_std": 0.16672399919480085, "reward_std": 0.8048081956803799, "rewards/cosine_scaled_reward": -0.08897935040295124, "rewards/format_reward": 0.8541666716337204, "step": 277 }, { "advantage_max": 1.712010532617569, "advantage_mean": -1.7074247238291207e-08, "advantage_min": -1.0471594706177711, "advantage_std": 0.9997750818729401, "completion_length": 1293.1250495910645, "epoch": 0.3177142857142857, "grad_norm": 0.46666115522384644, "kl": 0.0188751220703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.405759110524894e-07, "loss": 0.0008, "reward": 0.8984961975365877, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8984961975365877, "reward_after_std": 0.550977161154151, "reward_before_mean": 1.2493544705212116, "reward_before_std": 0.46790555119514465, "reward_change_max": 0.0, "reward_change_mean": -0.3508582729846239, "reward_change_min": -0.5096213817596436, "reward_change_std": 0.1945639243349433, "reward_std": 0.5509771760553122, "rewards/cosine_scaled_reward": 0.16634388361126184, "rewards/format_reward": 0.9166666716337204, "step": 278 }, { "advantage_max": 1.630677729845047, "advantage_mean": -1.8160790427046436e-08, "advantage_min": -1.0016423761844635, "advantage_std": 0.9998414814472198, "completion_length": 1697.3750305175781, "epoch": 0.31885714285714284, "grad_norm": 0.3881392478942871, "kl": 0.02802276611328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.37435262574394e-07, "loss": 0.0011, "reward": 0.5066275419667363, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5066275419667363, "reward_after_std": 0.7542143501341343, "reward_before_mean": 0.7644317261874676, "reward_before_std": 0.7297962121665478, "reward_change_max": 0.0014480352401733398, "reward_change_mean": -0.25780418422073126, "reward_change_min": -0.4481316953897476, "reward_change_std": 0.16544347070157528, "reward_std": 0.7542143575847149, "rewards/cosine_scaled_reward": -0.044867485761642456, "rewards/format_reward": 0.854166679084301, "step": 279 }, { "advantage_max": 1.6418682932853699, "advantage_mean": -6.8296996946770605e-09, "advantage_min": -1.1618360579013824, "advantage_std": 0.9998872727155685, "completion_length": 1712.520896911621, "epoch": 0.32, "grad_norm": 0.4829684793949127, "kl": 0.03343963623046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.342952264838747e-07, "loss": 0.0013, "reward": 0.8820310495793819, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8820310495793819, "reward_after_std": 0.9785490781068802, "reward_before_mean": 1.200088301091455, "reward_before_std": 0.9552184268832207, "reward_change_max": 0.00046744197607040405, "reward_change_mean": -0.31805719900876284, "reward_change_min": -0.519169632345438, "reward_change_std": 0.20428536739200354, "reward_std": 0.9785491079092026, "rewards/cosine_scaled_reward": 0.1833774563856423, "rewards/format_reward": 0.8333333414047956, "step": 280 }, { "advantage_max": 1.6659259349107742, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.8772311583161354, "advantage_std": 0.9998160675168037, "completion_length": 2214.104232788086, "epoch": 0.3211428571428571, "grad_norm": 0.2734784185886383, "kl": 0.04001617431640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.311559558218603e-07, "loss": 0.0016, "reward": 0.018884988501667976, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.018884988501667976, "reward_after_std": 0.6570006497204304, "reward_before_mean": 0.1808290034532547, "reward_before_std": 0.6510476768016815, "reward_change_max": 0.0, "reward_change_mean": -0.16194400051608682, "reward_change_min": -0.3184575643390417, "reward_change_std": 0.1197090744972229, "reward_std": 0.6570006608963013, "rewards/cosine_scaled_reward": -0.2116688375826925, "rewards/format_reward": 0.6041666697710752, "step": 281 }, { "advantage_max": 1.646447241306305, "advantage_mean": -1.8471231655325937e-08, "advantage_min": -1.2341381013393402, "advantage_std": 0.9998238310217857, "completion_length": 1421.708366394043, "epoch": 0.3222857142857143, "grad_norm": 0.40240851044654846, "kl": 0.026641845703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.28017603591974e-07, "loss": 0.0011, "reward": 0.5995541553274961, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5995541553274961, "reward_after_std": 0.6374145857989788, "reward_before_mean": 0.8840107689611614, "reward_before_std": 0.5970096457749605, "reward_change_max": 0.0002270340919494629, "reward_change_mean": -0.2844566013664007, "reward_change_min": -0.4448739532381296, "reward_change_std": 0.17153234407305717, "reward_std": 0.6374146081507206, "rewards/cosine_scaled_reward": 0.025338694918900728, "rewards/format_reward": 0.8333333432674408, "step": 282 }, { "advantage_max": 1.608649656176567, "advantage_mean": -2.9802322165650708e-08, "advantage_min": -1.0181390345096588, "advantage_std": 0.9998424425721169, "completion_length": 1923.958381652832, "epoch": 0.32342857142857145, "grad_norm": 0.4259525537490845, "kl": 0.03574371337890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.248803227530763e-07, "loss": 0.0014, "reward": 0.5078537920489907, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5078537920489907, "reward_after_std": 0.8294965587556362, "reward_before_mean": 0.7620069230906665, "reward_before_std": 0.8187153935432434, "reward_change_max": 0.0005861744284629822, "reward_change_mean": -0.25415316317230463, "reward_change_min": -0.5123951118439436, "reward_change_std": 0.189014982432127, "reward_std": 0.8294965997338295, "rewards/cosine_scaled_reward": 0.03725345712155104, "rewards/format_reward": 0.6875000186264515, "step": 283 }, { "advantage_max": 1.6817447692155838, "advantage_mean": -3.073364585048921e-08, "advantage_min": -1.0475571602582932, "advantage_std": 0.9998420029878616, "completion_length": 1284.7917175292969, "epoch": 0.32457142857142857, "grad_norm": 0.47929516434669495, "kl": 0.02382659912109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.21744266211809e-07, "loss": 0.001, "reward": 0.4919102769345045, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4919102769345045, "reward_after_std": 0.7489107400178909, "reward_before_mean": 0.7442134652919776, "reward_before_std": 0.7232272922992706, "reward_change_max": 0.0010956302285194397, "reward_change_mean": -0.25230319052934647, "reward_change_min": -0.43385446071624756, "reward_change_std": 0.16482721455395222, "reward_std": 0.7489107698202133, "rewards/cosine_scaled_reward": -0.07580995094031096, "rewards/format_reward": 0.8958333395421505, "step": 284 }, { "advantage_max": 1.7304245829582214, "advantage_mean": 9.002785406053704e-09, "advantage_min": -1.0338216125965118, "advantage_std": 0.99979517608881, "completion_length": 1267.0000228881836, "epoch": 0.32571428571428573, "grad_norm": 0.3164341449737549, "kl": 0.045284271240234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.186095868151436e-07, "loss": 0.0018, "reward": 0.4816157463937998, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4816157463937998, "reward_after_std": 0.6492583490908146, "reward_before_mean": 0.7395215413998812, "reward_before_std": 0.6058793980628252, "reward_change_max": 0.0, "reward_change_mean": -0.2579057849943638, "reward_change_min": -0.39217982813715935, "reward_change_std": 0.14816969074308872, "reward_std": 0.6492583639919758, "rewards/cosine_scaled_reward": -0.08857256267219782, "rewards/format_reward": 0.9166666716337204, "step": 285 }, { "advantage_max": 1.5697939693927765, "advantage_mean": -2.23517424569053e-08, "advantage_min": -1.0931710600852966, "advantage_std": 0.9998166635632515, "completion_length": 1508.7291793823242, "epoch": 0.32685714285714285, "grad_norm": 0.3677847385406494, "kl": 0.036373138427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.154764373429315e-07, "loss": 0.0015, "reward": 0.5672296602278948, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5672296602278948, "reward_after_std": 0.738838504999876, "reward_before_mean": 0.8405419194605201, "reward_before_std": 0.7304916121065617, "reward_change_max": 0.0, "reward_change_mean": -0.27331228740513325, "reward_change_min": -0.4712460860610008, "reward_change_std": 0.17583474051207304, "reward_std": 0.7388385497033596, "rewards/cosine_scaled_reward": 0.014020954258739948, "rewards/format_reward": 0.8125000074505806, "step": 286 }, { "advantage_max": 1.6180351376533508, "advantage_mean": -2.4835272727230517e-09, "advantage_min": -1.0623664110898972, "advantage_std": 0.9997753202915192, "completion_length": 1579.354206085205, "epoch": 0.328, "grad_norm": 0.49448496103286743, "kl": 0.052417755126953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.123449705004581e-07, "loss": 0.0021, "reward": 0.44745656475424767, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.44745656475424767, "reward_after_std": 0.5133853312581778, "reward_before_mean": 0.7085697203874588, "reward_before_std": 0.4728548899292946, "reward_change_max": 0.0006938055157661438, "reward_change_mean": -0.2611131672747433, "reward_change_min": -0.40582090616226196, "reward_change_std": 0.15666163619607687, "reward_std": 0.5133853498846292, "rewards/cosine_scaled_reward": -0.010298481676727533, "rewards/format_reward": 0.7291666697710752, "step": 287 }, { "advantage_max": 1.648850455880165, "advantage_mean": -2.7008355940605355e-08, "advantage_min": -1.0658856928348541, "advantage_std": 0.99979068338871, "completion_length": 1750.9375534057617, "epoch": 0.3291428571428571, "grad_norm": 0.46777138113975525, "kl": 0.062206268310546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.09215338910999e-07, "loss": 0.0025, "reward": 0.3870700172847137, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3870700172847137, "reward_after_std": 0.5809841006994247, "reward_before_mean": 0.6324769873172045, "reward_before_std": 0.5547001287341118, "reward_change_max": 0.0, "reward_change_mean": -0.24540697038173676, "reward_change_min": -0.4253687206655741, "reward_change_std": 0.15523213241249323, "reward_std": 0.5809841156005859, "rewards/cosine_scaled_reward": -0.06917817890644073, "rewards/format_reward": 0.7708333469927311, "step": 288 }, { "advantage_max": 1.75955268740654, "advantage_mean": -3.9736431700632124e-08, "advantage_min": -1.0853909105062485, "advantage_std": 0.9997709766030312, "completion_length": 1411.5000686645508, "epoch": 0.3302857142857143, "grad_norm": 0.4917508363723755, "kl": 0.028003692626953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.060876951083828e-07, "loss": 0.0011, "reward": 0.4485252061858773, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4485252061858773, "reward_after_std": 0.5251850299537182, "reward_before_mean": 0.7066366064827889, "reward_before_std": 0.47138636000454426, "reward_change_max": 0.0006059929728507996, "reward_change_mean": -0.2581114200875163, "reward_change_min": -0.38431514613330364, "reward_change_std": 0.15068622399121523, "reward_std": 0.5251850336790085, "rewards/cosine_scaled_reward": -0.05293171480298042, "rewards/format_reward": 0.8125000037252903, "step": 289 }, { "advantage_max": 1.7379557341337204, "advantage_mean": -4.2840839042934675e-08, "advantage_min": -1.0089939087629318, "advantage_std": 0.9998431578278542, "completion_length": 1222.3542022705078, "epoch": 0.3314285714285714, "grad_norm": 0.495219886302948, "kl": 0.0377655029296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.02962191529556e-07, "loss": 0.0015, "reward": 0.746434886008501, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.746434886008501, "reward_after_std": 0.7607799246907234, "reward_before_mean": 1.0512062585912645, "reward_before_std": 0.6999847833067179, "reward_change_max": 0.0002563074231147766, "reward_change_mean": -0.30477137491106987, "reward_change_min": -0.48462648317217827, "reward_change_std": 0.18312491476535797, "reward_std": 0.7607799656689167, "rewards/cosine_scaled_reward": 0.07768644799944013, "rewards/format_reward": 0.8958333395421505, "step": 290 }, { "advantage_max": 1.5778974145650864, "advantage_mean": -1.0554989438027462e-08, "advantage_min": -1.2307512015104294, "advantage_std": 0.9998789802193642, "completion_length": 2183.125045776367, "epoch": 0.3325714285714286, "grad_norm": 0.4300452172756195, "kl": 0.081756591796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.998389805071536e-07, "loss": 0.0033, "reward": 0.637520014308393, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.637520014308393, "reward_after_std": 0.9906865693628788, "reward_before_mean": 0.9098777715116739, "reward_before_std": 1.0078083127737045, "reward_change_max": 0.0, "reward_change_mean": -0.272357739508152, "reward_change_min": -0.48343963362276554, "reward_change_std": 0.1993736457079649, "reward_std": 0.9906866066157818, "rewards/cosine_scaled_reward": 0.04868887457996607, "rewards/format_reward": 0.8125000298023224, "step": 291 }, { "advantage_max": 1.7553779780864716, "advantage_mean": -7.450581041013038e-09, "advantage_min": -1.0212047845125198, "advantage_std": 0.9998128712177277, "completion_length": 2034.020896911621, "epoch": 0.33371428571428574, "grad_norm": 0.37723788619041443, "kl": 0.08984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.967182142620745e-07, "loss": 0.0036, "reward": 0.4268048144876957, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4268048144876957, "reward_after_std": 0.5782118141651154, "reward_before_mean": 0.6775916237384081, "reward_before_std": 0.5195513293147087, "reward_change_max": 0.0, "reward_change_mean": -0.25078682228922844, "reward_change_min": -0.37887974828481674, "reward_change_std": 0.1440643798559904, "reward_std": 0.5782118327915668, "rewards/cosine_scaled_reward": -0.05703752930276096, "rewards/format_reward": 0.7916666772216558, "step": 292 }, { "advantage_max": 1.6476120948791504, "advantage_mean": -4.035731265839004e-08, "advantage_min": -1.119403451681137, "advantage_std": 0.9997932389378548, "completion_length": 1621.3125534057617, "epoch": 0.33485714285714285, "grad_norm": 0.4459417760372162, "kl": 0.08220672607421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.93600044896063e-07, "loss": 0.0033, "reward": 0.4751888904720545, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4751888904720545, "reward_after_std": 0.5447135083377361, "reward_before_mean": 0.7424647515872493, "reward_before_std": 0.5116943549364805, "reward_change_max": 7.440149784088135e-05, "reward_change_mean": -0.2672758949920535, "reward_change_min": -0.42255673184990883, "reward_change_std": 0.1610535578802228, "reward_std": 0.5447135455906391, "rewards/cosine_scaled_reward": -0.05585093982517719, "rewards/format_reward": 0.8541666753590107, "step": 293 }, { "advantage_max": 1.5670886784791946, "advantage_mean": 3.4769376933141416e-08, "advantage_min": -1.1001440435647964, "advantage_std": 0.9998358115553856, "completion_length": 1629.1250457763672, "epoch": 0.336, "grad_norm": 0.7223255038261414, "kl": 0.04345703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.904846243842949e-07, "loss": 0.0017, "reward": 0.3658771354239434, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3658771354239434, "reward_after_std": 0.7138045094907284, "reward_before_mean": 0.5983532564714551, "reward_before_std": 0.7150961086153984, "reward_change_max": 0.0004177391529083252, "reward_change_mean": -0.2324760644696653, "reward_change_min": -0.43696724623441696, "reward_change_std": 0.16605904418975115, "reward_std": 0.713804516941309, "rewards/cosine_scaled_reward": -0.054990069940686226, "rewards/format_reward": 0.7083333507180214, "step": 294 }, { "advantage_max": 1.5662293583154678, "advantage_mean": -6.208817238118058e-09, "advantage_min": -1.1451702490448952, "advantage_std": 0.9998404234647751, "completion_length": 1902.7917251586914, "epoch": 0.33714285714285713, "grad_norm": 0.9967929124832153, "kl": 0.07762527465820312, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.873721045679706e-07, "loss": 0.0031, "reward": 0.2228870950639248, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2228870950639248, "reward_after_std": 0.8324792645871639, "reward_before_mean": 0.4196047708392143, "reward_before_std": 0.8619285393506289, "reward_change_max": 0.0006353557109832764, "reward_change_mean": -0.19671765249222517, "reward_change_min": -0.4279610961675644, "reward_change_std": 0.16788294166326523, "reward_std": 0.8324792832136154, "rewards/cosine_scaled_reward": -0.11311429599300027, "rewards/format_reward": 0.6458333488553762, "step": 295 }, { "advantage_max": 1.6409805715084076, "advantage_mean": -2.0799538841265175e-08, "advantage_min": -1.136774018406868, "advantage_std": 0.9998015239834785, "completion_length": 2284.4584045410156, "epoch": 0.3382857142857143, "grad_norm": 0.6031951904296875, "kl": 0.11769866943359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.842626371469149e-07, "loss": 0.0047, "reward": 0.18271854612976313, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18271854612976313, "reward_after_std": 0.7190746963024139, "reward_before_mean": 0.3754655672237277, "reward_before_std": 0.7232628418132663, "reward_change_max": 0.002197861671447754, "reward_change_mean": -0.19274703226983547, "reward_change_min": -0.3422145713120699, "reward_change_std": 0.14056797232478857, "reward_std": 0.7190746963024139, "rewards/cosine_scaled_reward": -0.15601721964776516, "rewards/format_reward": 0.6875000149011612, "step": 296 }, { "advantage_max": 1.6504765450954437, "advantage_mean": -5.5879357807597785e-09, "advantage_min": -1.0491151213645935, "advantage_std": 0.9998462647199631, "completion_length": 2423.666732788086, "epoch": 0.3394285714285714, "grad_norm": 0.9855747222900391, "kl": 0.11363983154296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.811563736721829e-07, "loss": 0.0045, "reward": 0.12774308491498232, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12774308491498232, "reward_after_std": 0.6702112555503845, "reward_before_mean": 0.309725821018219, "reward_before_std": 0.6645154766738415, "reward_change_max": 0.0006378963589668274, "reward_change_mean": -0.1819827202707529, "reward_change_min": -0.3592865318059921, "reward_change_std": 0.13777123484760523, "reward_std": 0.6702112555503845, "rewards/cosine_scaled_reward": -0.09513710625469685, "rewards/format_reward": 0.5000000074505806, "step": 297 }, { "advantage_max": 1.6699796468019485, "advantage_mean": -9.31322596819939e-09, "advantage_min": -1.0168022587895393, "advantage_std": 0.9997926652431488, "completion_length": 1597.0416793823242, "epoch": 0.3405714285714286, "grad_norm": 0.5280109643936157, "kl": 0.06150054931640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.780534655386743e-07, "loss": 0.0025, "reward": 0.32745575811713934, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32745575811713934, "reward_after_std": 0.5900943968445063, "reward_before_mean": 0.5592165207490325, "reward_before_std": 0.5651387330144644, "reward_change_max": 0.00022490322589874268, "reward_change_mean": -0.23176079522818327, "reward_change_min": -0.3906538709998131, "reward_change_std": 0.1459486922249198, "reward_std": 0.590094406157732, "rewards/cosine_scaled_reward": -0.14747506566345692, "rewards/format_reward": 0.8541666753590107, "step": 298 }, { "advantage_max": 1.556631863117218, "advantage_mean": -6.208817182606907e-09, "advantage_min": -1.1320670247077942, "advantage_std": 0.999798409640789, "completion_length": 1924.3125610351562, "epoch": 0.3417142857142857, "grad_norm": 0.8472985625267029, "kl": 0.09732818603515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.749540639777539e-07, "loss": 0.0039, "reward": 0.2577241810504347, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2577241810504347, "reward_after_std": 0.6427079103887081, "reward_before_mean": 0.4723318573087454, "reward_before_std": 0.6371937114745378, "reward_change_max": 0.00032889842987060547, "reward_change_mean": -0.21460766345262527, "reward_change_min": -0.360564760863781, "reward_change_std": 0.14255648292601109, "reward_std": 0.642707921564579, "rewards/cosine_scaled_reward": -0.17008409556001425, "rewards/format_reward": 0.8125000074505806, "step": 299 }, { "advantage_max": 1.5028170943260193, "advantage_mean": -1.8626452158443385e-08, "advantage_min": -1.2105746865272522, "advantage_std": 0.9998276457190514, "completion_length": 1971.0417022705078, "epoch": 0.34285714285714286, "grad_norm": 1.1485674381256104, "kl": 0.09633636474609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.7185832004988133e-07, "loss": 0.0039, "reward": 0.38906230591237545, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38906230591237545, "reward_after_std": 0.7436303533613682, "reward_before_mean": 0.6281124204397202, "reward_before_std": 0.7673552930355072, "reward_change_max": 0.00047088414430618286, "reward_change_mean": -0.23905012011528015, "reward_change_min": -0.43147959001362324, "reward_change_std": 0.17824300099164248, "reward_std": 0.7436303719878197, "rewards/cosine_scaled_reward": -0.029693802818655968, "rewards/format_reward": 0.6875000111758709, "step": 300 }, { "advantage_max": 1.639797881245613, "advantage_mean": -3.16649689802162e-08, "advantage_min": -1.1045557409524918, "advantage_std": 0.9997733682394028, "completion_length": 1297.81254196167, "epoch": 0.344, "grad_norm": 0.7328884601593018, "kl": 0.05051422119140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.68766384637248e-07, "loss": 0.002, "reward": 0.22625624496868113, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.22625624496868113, "reward_after_std": 0.5197729840874672, "reward_before_mean": 0.44249421264976263, "reward_before_std": 0.4990711696445942, "reward_change_max": 0.0, "reward_change_mean": -0.21623797714710236, "reward_change_min": -0.3751807101070881, "reward_change_std": 0.1369111454114318, "reward_std": 0.5197729952633381, "rewards/cosine_scaled_reward": -0.2370862402021885, "rewards/format_reward": 0.9166666865348816, "step": 301 }, { "advantage_max": 1.6990403681993484, "advantage_mean": -5.5879356919419365e-08, "advantage_min": -1.0498097091913223, "advantage_std": 0.9998420625925064, "completion_length": 1804.833351135254, "epoch": 0.34514285714285714, "grad_norm": 0.9516170024871826, "kl": 0.1143646240234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.656784084364238e-07, "loss": 0.0046, "reward": 0.43568436801433563, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43568436801433563, "reward_after_std": 0.7954321466386318, "reward_before_mean": 0.6727348929271102, "reward_before_std": 0.7747104242444038, "reward_change_max": 0.0005867332220077515, "reward_change_mean": -0.23705054307356477, "reward_change_min": -0.408731022849679, "reward_change_std": 0.1665506912395358, "reward_std": 0.795432161539793, "rewards/cosine_scaled_reward": 0.013450777158141136, "rewards/format_reward": 0.6458333469927311, "step": 302 }, { "advantage_max": 1.6118300408124924, "advantage_mean": -1.614292521878724e-08, "advantage_min": -1.0991563871502876, "advantage_std": 0.9997985139489174, "completion_length": 1132.7916946411133, "epoch": 0.3462857142857143, "grad_norm": 0.6698048114776611, "kl": 0.0547027587890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.6259454195101267e-07, "loss": 0.0022, "reward": 0.415790211642161, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.415790211642161, "reward_after_std": 0.6736731752753258, "reward_before_mean": 0.6621344964951277, "reward_before_std": 0.6673592794686556, "reward_change_max": 0.00024134665727615356, "reward_change_mean": -0.2463442850857973, "reward_change_min": -0.4515748545527458, "reward_change_std": 0.16697289608418941, "reward_std": 0.673673190176487, "rewards/cosine_scaled_reward": -0.12726609222590923, "rewards/format_reward": 0.9166666716337204, "step": 303 }, { "advantage_max": 1.573637142777443, "advantage_mean": -9.934107536579972e-09, "advantage_min": -1.1766441836953163, "advantage_std": 0.9997837841510773, "completion_length": 1487.0208587646484, "epoch": 0.3474285714285714, "grad_norm": 0.5351270437240601, "kl": 0.10761260986328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.59514935484316e-07, "loss": 0.0043, "reward": 0.37789439666084945, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37789439666084945, "reward_after_std": 0.6138196587562561, "reward_before_mean": 0.6197326518595219, "reward_before_std": 0.6024635452777147, "reward_change_max": 0.0, "reward_change_mean": -0.24183825589716434, "reward_change_min": -0.39685399271547794, "reward_change_std": 0.15235036052763462, "reward_std": 0.613819669932127, "rewards/cosine_scaled_reward": -0.10680035129189491, "rewards/format_reward": 0.8333333358168602, "step": 304 }, { "advantage_max": 1.5553628653287888, "advantage_mean": -6.208817682207268e-09, "advantage_min": -1.1495722085237503, "advantage_std": 0.9997849240899086, "completion_length": 1641.2917022705078, "epoch": 0.3485714285714286, "grad_norm": 0.998511016368866, "kl": 0.0987548828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.5643973913200837e-07, "loss": 0.0039, "reward": 0.25423115864396095, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.25423115864396095, "reward_after_std": 0.6495640091598034, "reward_before_mean": 0.4696862930431962, "reward_before_std": 0.657315599732101, "reward_change_max": 0.00010583549737930298, "reward_change_mean": -0.21545512787997723, "reward_change_min": -0.38556710444390774, "reward_change_std": 0.15349662397056818, "reward_std": 0.6495640445500612, "rewards/cosine_scaled_reward": -0.1714068679139018, "rewards/format_reward": 0.8125000111758709, "step": 305 }, { "advantage_max": 1.7128386795520782, "advantage_mean": 1.179675312990014e-08, "advantage_min": -0.9719715788960457, "advantage_std": 0.9997956454753876, "completion_length": 1563.2084045410156, "epoch": 0.3497142857142857, "grad_norm": 1.1378474235534668, "kl": 0.13939666748046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.5336910277482155e-07, "loss": 0.0056, "reward": 0.47938904957845807, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.47938904957845807, "reward_after_std": 0.7479777634143829, "reward_before_mean": 0.7320452854037285, "reward_before_std": 0.723258962854743, "reward_change_max": 0.0, "reward_change_mean": -0.25265623442828655, "reward_change_min": -0.4746490456163883, "reward_change_std": 0.18077436927706003, "reward_std": 0.747977789491415, "rewards/cosine_scaled_reward": -0.029810683365212753, "rewards/format_reward": 0.791666679084301, "step": 306 }, { "advantage_max": 1.6958726346492767, "advantage_mean": 1.4280280180578586e-08, "advantage_min": -1.1393985226750374, "advantage_std": 0.9998360052704811, "completion_length": 1480.583381652832, "epoch": 0.35085714285714287, "grad_norm": 1.4653286933898926, "kl": 0.10507965087890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.503031760712397e-07, "loss": 0.0042, "reward": 0.4037772142328322, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4037772142328322, "reward_after_std": 0.7926469035446644, "reward_before_mean": 0.6346738813444972, "reward_before_std": 0.774210948497057, "reward_change_max": 0.0005128830671310425, "reward_change_mean": -0.23089664988219738, "reward_change_min": -0.4010511841624975, "reward_change_std": 0.15531510580331087, "reward_std": 0.7926469184458256, "rewards/cosine_scaled_reward": -0.08891306724399328, "rewards/format_reward": 0.8125000111758709, "step": 307 }, { "advantage_max": 1.5332411974668503, "advantage_mean": -3.942599025030802e-08, "advantage_min": -1.291197545826435, "advantage_std": 0.9998232498764992, "completion_length": 1984.4375305175781, "epoch": 0.352, "grad_norm": 0.6880925893783569, "kl": 0.13637542724609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.4724210845020494e-07, "loss": 0.0055, "reward": 0.4717167126946151, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4717167126946151, "reward_after_std": 0.6849931702017784, "reward_before_mean": 0.7313460372388363, "reward_before_std": 0.6948912851512432, "reward_change_max": 3.4242868423461914e-05, "reward_change_mean": -0.25962932175025344, "reward_change_min": -0.4542670212686062, "reward_change_std": 0.17313092714175582, "reward_std": 0.6849932186305523, "rewards/cosine_scaled_reward": -0.009326999075710773, "rewards/format_reward": 0.7500000074505806, "step": 308 }, { "advantage_max": 1.5543510168790817, "advantage_mean": 4.967053990334591e-09, "advantage_min": -1.1442887485027313, "advantage_std": 0.9998382106423378, "completion_length": 1430.1041870117188, "epoch": 0.35314285714285715, "grad_norm": 1.7190380096435547, "kl": 0.08464813232421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.441860491038345e-07, "loss": 0.0034, "reward": 0.4691953402943909, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4691953402943909, "reward_after_std": 0.7070669531822205, "reward_before_mean": 0.7250326937064528, "reward_before_std": 0.7026441767811775, "reward_change_max": 0.0, "reward_change_mean": -0.25583732686936855, "reward_change_min": -0.46527867391705513, "reward_change_std": 0.16947638988494873, "reward_std": 0.707066971808672, "rewards/cosine_scaled_reward": -0.09581700339913368, "rewards/format_reward": 0.916666679084301, "step": 309 }, { "advantage_max": 1.6268791556358337, "advantage_mean": -4.128863528851667e-08, "advantage_min": -1.1320114061236382, "advantage_std": 0.9997777566313744, "completion_length": 1413.3542404174805, "epoch": 0.35428571428571426, "grad_norm": 0.781093180179596, "kl": 0.0948944091796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.4113514698014953e-07, "loss": 0.0038, "reward": 0.34962170582730323, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.34962170582730323, "reward_after_std": 0.5699597038328648, "reward_before_mean": 0.587611180730164, "reward_before_std": 0.5539778638631105, "reward_change_max": 0.0003587454557418823, "reward_change_mean": -0.2379895057529211, "reward_change_min": -0.3951511085033417, "reward_change_std": 0.14933669101446867, "reward_std": 0.5699597336351871, "rewards/cosine_scaled_reward": -0.13327774591743946, "rewards/format_reward": 0.854166679084301, "step": 310 }, { "advantage_max": 1.5723091959953308, "advantage_mean": -1.490116185998147e-08, "advantage_min": -1.1205863133072853, "advantage_std": 0.9998145774006844, "completion_length": 1074.2708587646484, "epoch": 0.3554285714285714, "grad_norm": 0.6670510768890381, "kl": 0.03679656982421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.3808955077581546e-07, "loss": 0.0015, "reward": 0.7845532577484846, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7845532577484846, "reward_after_std": 0.7538176812231541, "reward_before_mean": 1.105867974460125, "reward_before_std": 0.7525460831820965, "reward_change_max": 0.0, "reward_change_mean": -0.3213147222995758, "reward_change_min": -0.5401858016848564, "reward_change_std": 0.206632686778903, "reward_std": 0.7538177222013474, "rewards/cosine_scaled_reward": 0.05293398164212704, "rewards/format_reward": 1.0, "step": 311 }, { "advantage_max": 1.810782939195633, "advantage_mean": -5.184362417143262e-08, "advantage_min": -0.994841955602169, "advantage_std": 0.9998369365930557, "completion_length": 1114.3333549499512, "epoch": 0.3565714285714286, "grad_norm": 0.7077937126159668, "kl": 0.08843231201171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.350494089288943e-07, "loss": 0.0035, "reward": 0.9057276744861156, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9057276744861156, "reward_after_std": 0.7223017476499081, "reward_before_mean": 1.2431134916841984, "reward_before_std": 0.6302335932850838, "reward_change_max": 0.0, "reward_change_mean": -0.337385768070817, "reward_change_min": -0.482417494058609, "reward_change_std": 0.18416978046298027, "reward_std": 0.7223017923533916, "rewards/cosine_scaled_reward": 0.19447338953614235, "rewards/format_reward": 0.8541666865348816, "step": 312 }, { "advantage_max": 1.7264662384986877, "advantage_mean": -8.071462631598081e-08, "advantage_min": -0.8811491578817368, "advantage_std": 0.9998324140906334, "completion_length": 1650.3958892822266, "epoch": 0.3577142857142857, "grad_norm": 0.9196247458457947, "kl": 0.14367294311523438, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.3201486961161093e-07, "loss": 0.0057, "reward": 0.6459367610514164, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6459367610514164, "reward_after_std": 0.8145084977149963, "reward_before_mean": 0.9274978432804346, "reward_before_std": 0.783172954339534, "reward_change_max": 0.00030690431594848633, "reward_change_mean": -0.2815611115656793, "reward_change_min": -0.5300815589725971, "reward_change_std": 0.18957517808303237, "reward_std": 0.8145085163414478, "rewards/cosine_scaled_reward": 0.07833224721252918, "rewards/format_reward": 0.7708333395421505, "step": 313 }, { "advantage_max": 1.3890406340360641, "advantage_mean": -3.011276428210863e-08, "advantage_min": -1.3965424448251724, "advantage_std": 0.9998088404536247, "completion_length": 1131.375015258789, "epoch": 0.3588571428571429, "grad_norm": 0.7419488430023193, "kl": 0.0739593505859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.2898608072313045e-07, "loss": 0.003, "reward": 0.8312315121293068, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8312315121293068, "reward_after_std": 0.6450635753571987, "reward_before_mean": 1.1716954428702593, "reward_before_std": 0.6489796601235867, "reward_change_max": 0.0005232319235801697, "reward_change_mean": -0.3404639083892107, "reward_change_min": -0.5209429115056992, "reward_change_std": 0.2113346103578806, "reward_std": 0.6450635828077793, "rewards/cosine_scaled_reward": 0.11709769815206528, "rewards/format_reward": 0.9375000074505806, "step": 314 }, { "advantage_max": 1.6860020756721497, "advantage_mean": -5.587935891782081e-09, "advantage_min": -0.9125220403075218, "advantage_std": 0.9997944608330727, "completion_length": 1653.0000305175781, "epoch": 0.36, "grad_norm": 1.6936687231063843, "kl": 0.25146484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.2596318988235037e-07, "loss": 0.0101, "reward": 0.4668788071721792, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4668788071721792, "reward_after_std": 0.6091582365334034, "reward_before_mean": 0.7244241368025541, "reward_before_std": 0.5609946362674236, "reward_change_max": 0.0, "reward_change_mean": -0.2575453044846654, "reward_change_min": -0.42978838086128235, "reward_change_std": 0.15476837567985058, "reward_std": 0.6091582626104355, "rewards/cosine_scaled_reward": -0.0336212863549008, "rewards/format_reward": 0.7916666772216558, "step": 315 }, { "advantage_max": 1.6541273891925812, "advantage_mean": -1.676380706472358e-08, "advantage_min": -1.09672212600708, "advantage_std": 0.9998091906309128, "completion_length": 1717.6250305175781, "epoch": 0.36114285714285715, "grad_norm": 1.417986273765564, "kl": 0.1870574951171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.2294634442070553e-07, "loss": 0.0075, "reward": 0.10566316498443484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10566316498443484, "reward_after_std": 0.5733887776732445, "reward_before_mean": 0.29063704796135426, "reward_before_std": 0.5616068728268147, "reward_change_max": 0.0, "reward_change_mean": -0.1849738946184516, "reward_change_min": -0.3238295055925846, "reward_change_std": 0.12428497988730669, "reward_std": 0.5733887813985348, "rewards/cosine_scaled_reward": -0.2296814899891615, "rewards/format_reward": 0.7500000204890966, "step": 316 }, { "advantage_max": 1.5123141556978226, "advantage_mean": -2.9802322498717615e-08, "advantage_min": -1.3178307265043259, "advantage_std": 0.9998439252376556, "completion_length": 1461.2292251586914, "epoch": 0.36228571428571427, "grad_norm": 1.5687153339385986, "kl": 0.1681976318359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1993569137498776e-07, "loss": 0.0067, "reward": 0.4049488212913275, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4049488212913275, "reward_after_std": 0.7206962667405605, "reward_before_mean": 0.6449471823871136, "reward_before_std": 0.7243769988417625, "reward_change_max": 0.0008472129702568054, "reward_change_mean": -0.23999838065356016, "reward_change_min": -0.43049536645412445, "reward_change_std": 0.1642217980697751, "reward_std": 0.7206962741911411, "rewards/cosine_scaled_reward": -0.052526420913636684, "rewards/format_reward": 0.7500000260770321, "step": 317 }, { "advantage_max": 1.5850563496351242, "advantage_mean": -2.4835269507583746e-08, "advantage_min": -1.1503583490848541, "advantage_std": 0.999801941215992, "completion_length": 1027.145851135254, "epoch": 0.36342857142857143, "grad_norm": 2.1071932315826416, "kl": 0.11089324951171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1693137748017915e-07, "loss": 0.0044, "reward": 0.49303111620247364, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.49303111620247364, "reward_after_std": 0.6135988608002663, "reward_before_mean": 0.7602047026157379, "reward_before_std": 0.6004465334117413, "reward_change_max": 0.0, "reward_change_mean": -0.26717356964945793, "reward_change_min": -0.44262557849287987, "reward_change_std": 0.16659097839146852, "reward_std": 0.613598894327879, "rewards/cosine_scaled_reward": -0.09906433057039976, "rewards/format_reward": 0.9583333432674408, "step": 318 }, { "advantage_max": 1.7661184072494507, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.9878272712230682, "advantage_std": 0.9998244121670723, "completion_length": 1393.3750305175781, "epoch": 0.36457142857142855, "grad_norm": 1.081502914428711, "kl": 0.16871261596679688, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1393354916230005e-07, "loss": 0.0067, "reward": 0.17947366731823422, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17947366731823422, "reward_after_std": 0.6435183212161064, "reward_before_mean": 0.3727762456983328, "reward_before_std": 0.6059022732079029, "reward_change_max": 0.0006429404020309448, "reward_change_mean": -0.1933025810867548, "reward_change_min": -0.3257717378437519, "reward_change_std": 0.12049250770360231, "reward_std": 0.643518328666687, "rewards/cosine_scaled_reward": -0.23027855902910233, "rewards/format_reward": 0.8333333469927311, "step": 319 }, { "advantage_max": 1.7249931246042252, "advantage_mean": 1.5211601978037947e-08, "advantage_min": -1.0006217509508133, "advantage_std": 0.9998496472835541, "completion_length": 947.458366394043, "epoch": 0.3657142857142857, "grad_norm": 1.0906606912612915, "kl": 0.09429550170898438, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1094235253127374e-07, "loss": 0.0038, "reward": 0.6802430953830481, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6802430953830481, "reward_after_std": 0.8332295939326286, "reward_before_mean": 0.9649859443306923, "reward_before_std": 0.7885765507817268, "reward_change_max": 0.0, "reward_change_mean": -0.2847428247332573, "reward_change_min": -0.46728481724858284, "reward_change_std": 0.16841666772961617, "reward_std": 0.8332296200096607, "rewards/cosine_scaled_reward": 0.0033262865617871284, "rewards/format_reward": 0.9583333432674408, "step": 320 }, { "advantage_max": 1.7146756947040558, "advantage_mean": -5.091230237397326e-08, "advantage_min": -1.0093270689249039, "advantage_std": 0.999833919107914, "completion_length": 1098.9166946411133, "epoch": 0.3668571428571429, "grad_norm": 1.029427170753479, "kl": 0.12979888916015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.079579333738039e-07, "loss": 0.0052, "reward": 0.8162387441843748, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8162387441843748, "reward_after_std": 0.72203304246068, "reward_before_mean": 1.1376890763640404, "reward_before_std": 0.6585705541074276, "reward_change_max": 0.0, "reward_change_mean": -0.3214503303170204, "reward_change_min": -0.48446333035826683, "reward_change_std": 0.1824331246316433, "reward_std": 0.7220330536365509, "rewards/cosine_scaled_reward": 0.08967785281129181, "rewards/format_reward": 0.9583333432674408, "step": 321 }, { "advantage_max": 1.614561453461647, "advantage_mean": -1.179675312990014e-08, "advantage_min": -1.0946208611130714, "advantage_std": 0.9998006448149681, "completion_length": 1206.5625267028809, "epoch": 0.368, "grad_norm": 2.3137221336364746, "kl": 0.16611480712890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.0498043714627006e-07, "loss": 0.0066, "reward": 0.2254452295601368, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2254452295601368, "reward_after_std": 0.6338132582604885, "reward_before_mean": 0.4344533123075962, "reward_before_std": 0.6367870792746544, "reward_change_max": 0.0006910189986228943, "reward_change_mean": -0.2090080864727497, "reward_change_min": -0.37567378394305706, "reward_change_std": 0.1448584054596722, "reward_std": 0.6338132806122303, "rewards/cosine_scaled_reward": -0.14735668897628784, "rewards/format_reward": 0.7291666809469461, "step": 322 }, { "advantage_max": 1.5460123121738434, "advantage_mean": -1.3659398556686853e-08, "advantage_min": -1.2054516822099686, "advantage_std": 0.9997941702604294, "completion_length": 1176.0208778381348, "epoch": 0.36914285714285716, "grad_norm": 1.450031042098999, "kl": 0.116943359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.020100089676376e-07, "loss": 0.0047, "reward": 0.5555176772177219, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5555176772177219, "reward_after_std": 0.6632481273263693, "reward_before_mean": 0.8349560154601932, "reward_before_std": 0.6657208111137152, "reward_change_max": 0.0, "reward_change_mean": -0.2794383354485035, "reward_change_min": -0.4561638943850994, "reward_change_std": 0.17854246776551008, "reward_std": 0.6632481273263693, "rewards/cosine_scaled_reward": -0.040855332277715206, "rewards/format_reward": 0.916666679084301, "step": 323 }, { "advantage_max": 1.6410552561283112, "advantage_mean": -2.2351742678949904e-08, "advantage_min": -1.1239653453230858, "advantage_std": 0.9997935220599174, "completion_length": 1479.3125534057617, "epoch": 0.3702857142857143, "grad_norm": 1.3957271575927734, "kl": 0.29621124267578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.9904679361238526e-07, "loss": 0.0118, "reward": 0.36106334580108523, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36106334580108523, "reward_after_std": 0.5908253006637096, "reward_before_mean": 0.5981873874552548, "reward_before_std": 0.5554264299571514, "reward_change_max": 0.0005471184849739075, "reward_change_mean": -0.2371240258216858, "reward_change_min": -0.3761534318327904, "reward_change_std": 0.14011078514158726, "reward_std": 0.5908253267407417, "rewards/cosine_scaled_reward": -0.16965633165091276, "rewards/format_reward": 0.9375000074505806, "step": 324 }, { "advantage_max": 1.6871405392885208, "advantage_mean": -7.605801100041276e-09, "advantage_min": -1.0215219408273697, "advantage_std": 0.9998608082532883, "completion_length": 1574.1667137145996, "epoch": 0.37142857142857144, "grad_norm": 1.4152541160583496, "kl": 0.2861785888671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.9609093550344907e-07, "loss": 0.0114, "reward": 0.6319072768092155, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6319072768092155, "reward_after_std": 0.8270172514021397, "reward_before_mean": 0.9088499108329415, "reward_before_std": 0.797621414065361, "reward_change_max": 0.0, "reward_change_mean": -0.2769426228478551, "reward_change_min": -0.46800280176103115, "reward_change_std": 0.17760974913835526, "reward_std": 0.8270172588527203, "rewards/cosine_scaled_reward": 0.04817493752489099, "rewards/format_reward": 0.812500013038516, "step": 325 }, { "advantage_max": 1.5468260794878006, "advantage_mean": -1.2417638028949796e-09, "advantage_min": -1.231099657714367, "advantage_std": 0.9998420029878616, "completion_length": 1152.2500343322754, "epoch": 0.37257142857142855, "grad_norm": 1.4574198722839355, "kl": 0.19783782958984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.931425787051832e-07, "loss": 0.0079, "reward": 0.6057899557054043, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6057899557054043, "reward_after_std": 0.7603934742510319, "reward_before_mean": 0.8865679651498795, "reward_before_std": 0.7577711865305901, "reward_change_max": 0.0, "reward_change_mean": -0.2807780094444752, "reward_change_min": -0.47002286091446877, "reward_change_std": 0.18309276923537254, "reward_std": 0.7603935040533543, "rewards/cosine_scaled_reward": 0.005783975124359131, "rewards/format_reward": 0.8750000149011612, "step": 326 }, { "advantage_max": 1.6103992611169815, "advantage_mean": -8.257726913374341e-08, "advantage_min": -1.2234643921256065, "advantage_std": 0.9998258948326111, "completion_length": 1431.8958892822266, "epoch": 0.3737142857142857, "grad_norm": 0.8417478799819946, "kl": 0.12567138671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.902018669163384e-07, "loss": 0.005, "reward": 0.8357805621344596, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8357805621344596, "reward_after_std": 0.6931793317198753, "reward_before_mean": 1.1680885925889015, "reward_before_std": 0.6554881166666746, "reward_change_max": 0.0007588863372802734, "reward_change_mean": -0.3323080986738205, "reward_change_min": -0.52970290184021, "reward_change_std": 0.20208781119436026, "reward_std": 0.6931793540716171, "rewards/cosine_scaled_reward": 0.14654429350048304, "rewards/format_reward": 0.8750000149011612, "step": 327 }, { "advantage_max": 1.691834032535553, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -1.0485369563102722, "advantage_std": 0.9998220056295395, "completion_length": 1368.833381652832, "epoch": 0.37485714285714283, "grad_norm": 1.4414052963256836, "kl": 0.21993255615234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.872689434630585e-07, "loss": 0.0088, "reward": 0.17786777764558792, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17786777764558792, "reward_after_std": 0.6675361059606075, "reward_before_mean": 0.3704228848218918, "reward_before_std": 0.6458631716668606, "reward_change_max": 0.0, "reward_change_mean": -0.19255511928349733, "reward_change_min": -0.3381800428032875, "reward_change_std": 0.12796216271817684, "reward_std": 0.6675361357629299, "rewards/cosine_scaled_reward": -0.24187190178781748, "rewards/format_reward": 0.8541666828095913, "step": 328 }, { "advantage_max": 1.7343352884054184, "advantage_mean": -2.421438682898014e-08, "advantage_min": -0.9665001779794693, "advantage_std": 0.9998097270727158, "completion_length": 1006.5833587646484, "epoch": 0.376, "grad_norm": 0.9054838418960571, "kl": 0.19991302490234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.843439512918949e-07, "loss": 0.008, "reward": 0.7864454248920083, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7864454248920083, "reward_after_std": 0.6544234752655029, "reward_before_mean": 1.1081649232655764, "reward_before_std": 0.5919336788356304, "reward_change_max": 0.0, "reward_change_mean": -0.32171949837356806, "reward_change_min": -0.48736608400940895, "reward_change_std": 0.18318084720522165, "reward_std": 0.6544234827160835, "rewards/cosine_scaled_reward": 0.09574911929666996, "rewards/format_reward": 0.9166666679084301, "step": 329 }, { "advantage_max": 1.7217664271593094, "advantage_mean": -4.594524871670558e-08, "advantage_min": -1.0755222663283348, "advantage_std": 0.9998010918498039, "completion_length": 1190.7500267028809, "epoch": 0.37714285714285717, "grad_norm": 0.9707238078117371, "kl": 0.2688751220703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.8142703296283953e-07, "loss": 0.0108, "reward": 0.5854407958686352, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5854407958686352, "reward_after_std": 0.737566763535142, "reward_before_mean": 0.8584175645373762, "reward_before_std": 0.6991724539548159, "reward_change_max": 0.00036550313234329224, "reward_change_mean": -0.2729767709970474, "reward_change_min": -0.45406655967235565, "reward_change_std": 0.1680940967053175, "reward_std": 0.7375667933374643, "rewards/cosine_scaled_reward": -0.018707887269556522, "rewards/format_reward": 0.8958333432674408, "step": 330 }, { "advantage_max": 1.7191912680864334, "advantage_mean": -3.1044086745701804e-08, "advantage_min": -0.953365832567215, "advantage_std": 0.9997701942920685, "completion_length": 1443.9583587646484, "epoch": 0.3782857142857143, "grad_norm": 3.559812545776367, "kl": 0.39825439453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.785183306423767e-07, "loss": 0.0159, "reward": 0.3996207695454359, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3996207695454359, "reward_after_std": 0.5887615773826838, "reward_before_mean": 0.6428104415535927, "reward_before_std": 0.5397970164194703, "reward_change_max": 0.00024040043354034424, "reward_change_mean": -0.24318967200815678, "reward_change_min": -0.3880366124212742, "reward_change_std": 0.1508668838068843, "reward_std": 0.5887615997344255, "rewards/cosine_scaled_reward": -0.06401145167183131, "rewards/format_reward": 0.7708333395421505, "step": 331 }, { "advantage_max": 1.5935689955949783, "advantage_mean": -4.1599075351062e-08, "advantage_min": -1.270665518939495, "advantage_std": 0.9997954964637756, "completion_length": 1409.208381652832, "epoch": 0.37942857142857145, "grad_norm": 1.2800945043563843, "kl": 0.22563934326171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.7561798609655373e-07, "loss": 0.009, "reward": 0.4048396535217762, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4048396535217762, "reward_after_std": 0.5992461815476418, "reward_before_mean": 0.6520956940948963, "reward_before_std": 0.5757904518395662, "reward_change_max": 0.0027850568294525146, "reward_change_mean": -0.24725607503205538, "reward_change_min": -0.390565924346447, "reward_change_std": 0.15339871495962143, "reward_std": 0.5992461927235126, "rewards/cosine_scaled_reward": -0.10103549575433135, "rewards/format_reward": 0.8541666865348816, "step": 332 }, { "advantage_max": 1.7609997540712357, "advantage_mean": -1.490116141589226e-08, "advantage_min": -0.9781611263751984, "advantage_std": 0.9997869431972504, "completion_length": 1225.0416946411133, "epoch": 0.38057142857142856, "grad_norm": 1.0518617630004883, "kl": 0.15213775634765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.72726140684072e-07, "loss": 0.0061, "reward": 0.26022319309413433, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26022319309413433, "reward_after_std": 0.560534905642271, "reward_before_mean": 0.47590574622154236, "reward_before_std": 0.5113978497684002, "reward_change_max": 0.0, "reward_change_mean": -0.21568256057798862, "reward_change_min": -0.33122558146715164, "reward_change_std": 0.12318441737443209, "reward_std": 0.5605349130928516, "rewards/cosine_scaled_reward": -0.23079713946208358, "rewards/format_reward": 0.9375000074505806, "step": 333 }, { "advantage_max": 1.7300200462341309, "advantage_mean": -1.1175870895385742e-08, "advantage_min": -0.892622597515583, "advantage_std": 0.9998339638113976, "completion_length": 1481.0000610351562, "epoch": 0.38171428571428573, "grad_norm": 1.437591791152954, "kl": 0.3504638671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.6984293534939737e-07, "loss": 0.014, "reward": 0.2755582988029346, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2755582988029346, "reward_after_std": 0.7258717641234398, "reward_before_mean": 0.48202365916222334, "reward_before_std": 0.6885374151170254, "reward_change_max": 0.0, "reward_change_mean": -0.2064653616398573, "reward_change_min": -0.34851996414363384, "reward_change_std": 0.12769698351621628, "reward_std": 0.7258717827498913, "rewards/cosine_scaled_reward": -0.17565484810620546, "rewards/format_reward": 0.8333333395421505, "step": 334 }, { "advantage_max": 1.738677054643631, "advantage_mean": -2.6697913879658586e-08, "advantage_min": -1.0176256000995636, "advantage_std": 0.9998103380203247, "completion_length": 1122.1667098999023, "epoch": 0.38285714285714284, "grad_norm": 0.8629187345504761, "kl": 0.16303253173828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.6696851061588994e-07, "loss": 0.0065, "reward": 0.8083305526524782, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8083305526524782, "reward_after_std": 0.732852453365922, "reward_before_mean": 1.1283803507685661, "reward_before_std": 0.6743199601769447, "reward_change_max": 0.0, "reward_change_mean": -0.3200498167425394, "reward_change_min": -0.4992302544414997, "reward_change_std": 0.1873677847906947, "reward_std": 0.7328524719923735, "rewards/cosine_scaled_reward": 0.10585683188401163, "rewards/format_reward": 0.9166666716337204, "step": 335 }, { "advantage_max": 1.71261827647686, "advantage_mean": -3.6942462977584967e-08, "advantage_min": -1.093318596482277, "advantage_std": 0.9997923597693443, "completion_length": 1373.6250610351562, "epoch": 0.384, "grad_norm": 1.218349575996399, "kl": 0.28626251220703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.641030065789562e-07, "loss": 0.0115, "reward": 0.5922711892053485, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5922711892053485, "reward_after_std": 0.5163947194814682, "reward_before_mean": 0.8827753812074661, "reward_before_std": 0.45310843363404274, "reward_change_max": 0.0, "reward_change_mean": -0.29050417989492416, "reward_change_min": -0.4375855065882206, "reward_change_std": 0.16001543402671814, "reward_std": 0.5163947381079197, "rewards/cosine_scaled_reward": -0.016945652663707733, "rewards/format_reward": 0.9166666865348816, "step": 336 }, { "advantage_max": 1.6926042586565018, "advantage_mean": -4.346172066682641e-08, "advantage_min": -1.014223888516426, "advantage_std": 0.9998311400413513, "completion_length": 1264.2917022705078, "epoch": 0.3851428571428571, "grad_norm": 0.7854025363922119, "kl": 0.1107635498046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.612465628992203e-07, "loss": 0.0044, "reward": 0.5707871560007334, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5707871560007334, "reward_after_std": 0.7214007787406445, "reward_before_mean": 0.8423281982541084, "reward_before_std": 0.6791776567697525, "reward_change_max": 0.0, "reward_change_mean": -0.2715410515666008, "reward_change_min": -0.43035536259412766, "reward_change_std": 0.15988810174167156, "reward_std": 0.7214007899165154, "rewards/cosine_scaled_reward": -0.06841926136985421, "rewards/format_reward": 0.9791666716337204, "step": 337 }, { "advantage_max": 1.6522819548845291, "advantage_mean": -1.9868215406226852e-08, "advantage_min": -1.112328127026558, "advantage_std": 0.999812588095665, "completion_length": 1322.7916870117188, "epoch": 0.3862857142857143, "grad_norm": 1.565803050994873, "kl": 0.5242538452148438, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.5839931879571725e-07, "loss": 0.021, "reward": 0.5619577057659626, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5619577057659626, "reward_after_std": 0.6290275603532791, "reward_before_mean": 0.8397875525988638, "reward_before_std": 0.5936907455325127, "reward_change_max": 0.000512540340423584, "reward_change_mean": -0.2778298445045948, "reward_change_min": -0.47006095573306084, "reward_change_std": 0.17123521026223898, "reward_std": 0.6290275789797306, "rewards/cosine_scaled_reward": -0.01760623953305185, "rewards/format_reward": 0.8750000055879354, "step": 338 }, { "advantage_max": 1.6528789550065994, "advantage_mean": 8.6923440667519e-09, "advantage_min": -1.210326187312603, "advantage_std": 0.9997606724500656, "completion_length": 1296.5417022705078, "epoch": 0.38742857142857146, "grad_norm": 2.363879919052124, "kl": 0.2260894775390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.555614130391079e-07, "loss": 0.009, "reward": 0.23270575946662575, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23270575946662575, "reward_after_std": 0.5161370243877172, "reward_before_mean": 0.4495003125630319, "reward_before_std": 0.49957178719341755, "reward_change_max": 0.0010571181774139404, "reward_change_mean": -0.21679451875388622, "reward_change_min": -0.3489368353039026, "reward_change_std": 0.13622549921274185, "reward_std": 0.516137033700943, "rewards/cosine_scaled_reward": -0.1710832081735134, "rewards/format_reward": 0.7916666865348816, "step": 339 }, { "advantage_max": 1.7887818366289139, "advantage_mean": -5.3395829757718616e-08, "advantage_min": -0.9258808940649033, "advantage_std": 0.9998010918498039, "completion_length": 1079.9583587646484, "epoch": 0.38857142857142857, "grad_norm": 1.0839471817016602, "kl": 0.2021484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.5273298394491515e-07, "loss": 0.0081, "reward": 0.5940607134252787, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5940607134252787, "reward_after_std": 0.594856072217226, "reward_before_mean": 0.875343382358551, "reward_before_std": 0.5153414569795132, "reward_change_max": 0.0, "reward_change_mean": -0.2812826782464981, "reward_change_min": -0.41136982291936874, "reward_change_std": 0.14915307890623808, "reward_std": 0.5948560945689678, "rewards/cosine_scaled_reward": -0.041494992794469, "rewards/format_reward": 0.9583333432674408, "step": 340 }, { "advantage_max": 1.7246081233024597, "advantage_mean": -1.2759119849548028e-07, "advantage_min": -1.0545568354427814, "advantage_std": 0.9998086541891098, "completion_length": 1165.0625381469727, "epoch": 0.38971428571428574, "grad_norm": 1.0827921628952026, "kl": 0.13689804077148438, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4991416936678276e-07, "loss": 0.0055, "reward": 0.9295502845197916, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9295502845197916, "reward_after_std": 0.6908070221543312, "reward_before_mean": 1.2785332389175892, "reward_before_std": 0.616323871538043, "reward_change_max": 0.0, "reward_change_mean": -0.34898300282657146, "reward_change_min": -0.5290452390909195, "reward_change_std": 0.19935437012463808, "reward_std": 0.6908070258796215, "rewards/cosine_scaled_reward": 0.16009995341300964, "rewards/format_reward": 0.9583333432674408, "step": 341 }, { "advantage_max": 1.455557405948639, "advantage_mean": -3.725290431688677e-08, "advantage_min": -1.3320115879178047, "advantage_std": 0.9998503029346466, "completion_length": 1276.145881652832, "epoch": 0.39085714285714285, "grad_norm": 2.228071451187134, "kl": 0.3672637939453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.471051066897562e-07, "loss": 0.0147, "reward": 0.7400090312585235, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7400090312585235, "reward_after_std": 0.8655583336949348, "reward_before_mean": 1.0439782813191414, "reward_before_std": 0.8965267464518547, "reward_change_max": 0.0, "reward_change_mean": -0.3039692733436823, "reward_change_min": -0.5514669045805931, "reward_change_std": 0.2145256232470274, "reward_std": 0.8655583411455154, "rewards/cosine_scaled_reward": 0.08448914252221584, "rewards/format_reward": 0.8750000223517418, "step": 342 }, { "advantage_max": 1.601967141032219, "advantage_mean": 3.725290520506519e-09, "advantage_min": -1.1046533659100533, "advantage_std": 0.9997711554169655, "completion_length": 1441.2500457763672, "epoch": 0.392, "grad_norm": 3.3537521362304688, "kl": 0.3649749755859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4430593282358777e-07, "loss": 0.0146, "reward": 0.6907957500079647, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6907957500079647, "reward_after_std": 0.7117171566933393, "reward_before_mean": 0.9908101595938206, "reward_before_std": 0.7002871641889215, "reward_change_max": 0.0, "reward_change_mean": -0.3000143878161907, "reward_change_min": -0.5172241926193237, "reward_change_std": 0.1907307654619217, "reward_std": 0.7117171976715326, "rewards/cosine_scaled_reward": 0.047488420736044645, "rewards/format_reward": 0.8958333507180214, "step": 343 }, { "advantage_max": 1.589540719985962, "advantage_mean": -5.836288163862946e-08, "advantage_min": -1.3002796024084091, "advantage_std": 0.9998637288808823, "completion_length": 1252.43754196167, "epoch": 0.3931428571428571, "grad_norm": 2.5770859718322754, "kl": 0.473236083984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4151678419606233e-07, "loss": 0.0189, "reward": 0.9171247731428593, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9171247731428593, "reward_after_std": 0.8720968402922153, "reward_before_mean": 1.2546650283038616, "reward_before_std": 0.8525521457195282, "reward_change_max": 0.002484917640686035, "reward_change_mean": -0.3375402484089136, "reward_change_min": -0.531851053237915, "reward_change_std": 0.220105716958642, "reward_std": 0.872096873819828, "rewards/cosine_scaled_reward": 0.20024916948750615, "rewards/format_reward": 0.8541666865348816, "step": 344 }, { "advantage_max": 1.6149013713002205, "advantage_mean": -9.158006020193454e-09, "advantage_min": -1.1341613978147507, "advantage_std": 0.9997781962156296, "completion_length": 1171.8542022705078, "epoch": 0.3942857142857143, "grad_norm": 2.487800121307373, "kl": 0.27524566650390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.387377967463493e-07, "loss": 0.011, "reward": 0.6368197742849588, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6368197742849588, "reward_after_std": 0.5455946698784828, "reward_before_mean": 0.9370312727987766, "reward_before_std": 0.5042991433292627, "reward_change_max": 0.0010842680931091309, "reward_change_mean": -0.3002114836126566, "reward_change_min": -0.45550261810421944, "reward_change_std": 0.17128814291208982, "reward_std": 0.5455946773290634, "rewards/cosine_scaled_reward": -0.021067719906568527, "rewards/format_reward": 0.9791666716337204, "step": 345 }, { "advantage_max": 1.6020869314670563, "advantage_mean": -3.849466778671484e-08, "advantage_min": -1.1575617864727974, "advantage_std": 0.9998195618391037, "completion_length": 1161.1458587646484, "epoch": 0.3954285714285714, "grad_norm": 2.9014134407043457, "kl": 0.1278076171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.359691059183761e-07, "loss": 0.0051, "reward": 0.6770103015005589, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6770103015005589, "reward_after_std": 0.6890826895833015, "reward_before_mean": 0.9757176786661148, "reward_before_std": 0.6647419556975365, "reward_change_max": 0.0, "reward_change_mean": -0.29870735108852386, "reward_change_min": -0.487264234572649, "reward_change_std": 0.1784799639135599, "reward_std": 0.6890827268362045, "rewards/cosine_scaled_reward": -0.012141183018684387, "rewards/format_reward": 1.0, "step": 346 }, { "advantage_max": 1.712681457400322, "advantage_mean": -2.110997865401032e-08, "advantage_min": -1.050035186111927, "advantage_std": 0.9997965469956398, "completion_length": 1166.6458740234375, "epoch": 0.3965714285714286, "grad_norm": 2.8097753524780273, "kl": 0.1442718505859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.3321084665422803e-07, "loss": 0.0058, "reward": 0.36580729484558105, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.36580729484558105, "reward_after_std": 0.5895061790943146, "reward_before_mean": 0.6037146374583244, "reward_before_std": 0.5479989349842072, "reward_change_max": 0.0, "reward_change_mean": -0.23790733329951763, "reward_change_min": -0.38576383143663406, "reward_change_std": 0.14080576319247484, "reward_std": 0.5895062014460564, "rewards/cosine_scaled_reward": -0.18772602826356888, "rewards/format_reward": 0.9791666716337204, "step": 347 }, { "advantage_max": 1.4286227524280548, "advantage_mean": -2.002343674201157e-08, "advantage_min": -1.3194576650857925, "advantage_std": 0.9998432993888855, "completion_length": 952.458366394043, "epoch": 0.3977142857142857, "grad_norm": 1.2307054996490479, "kl": 0.1024932861328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.3046315338757026e-07, "loss": 0.0041, "reward": 0.5614768331870437, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5614768331870437, "reward_after_std": 0.7371022440493107, "reward_before_mean": 0.8400985077023506, "reward_before_std": 0.7679982222616673, "reward_change_max": 0.0005593299865722656, "reward_change_mean": -0.2786216828972101, "reward_change_min": -0.4986792951822281, "reward_change_std": 0.19410818628966808, "reward_std": 0.7371022514998913, "rewards/cosine_scaled_reward": -0.027867418713867664, "rewards/format_reward": 0.8958333507180214, "step": 348 }, { "advantage_max": 1.4817928969860077, "advantage_mean": -2.421438782818086e-08, "advantage_min": -1.2049047872424126, "advantage_std": 0.9998388364911079, "completion_length": 980.7500305175781, "epoch": 0.39885714285714285, "grad_norm": 2.26857590675354, "kl": 0.26554107666015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.2772616003709616e-07, "loss": 0.0106, "reward": 0.5565350241959095, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.5565350241959095, "reward_after_std": 0.7860203348100185, "reward_before_mean": 0.8280746899545193, "reward_before_std": 0.8170614093542099, "reward_change_max": 0.0019630417227745056, "reward_change_mean": -0.27153966948390007, "reward_change_min": -0.49386022612452507, "reward_change_std": 0.20041564013808966, "reward_std": 0.7860203571617603, "rewards/cosine_scaled_reward": -0.013046002015471458, "rewards/format_reward": 0.8541666716337204, "step": 349 }, { "advantage_max": 1.8311565965414047, "advantage_mean": -2.2972624191819335e-08, "advantage_min": -0.8188979849219322, "advantage_std": 0.9998144879937172, "completion_length": 1051.6250343322754, "epoch": 0.4, "grad_norm": 1.1339340209960938, "kl": 0.22963714599609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.250000000000001e-07, "loss": 0.0092, "reward": 0.36821131221950054, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.36821131221950054, "reward_after_std": 0.7234288677573204, "reward_before_mean": 0.5922360196709633, "reward_before_std": 0.6629289668053389, "reward_change_max": 0.0, "reward_change_mean": -0.2240247093141079, "reward_change_min": -0.35875629261136055, "reward_change_std": 0.13018367905169725, "reward_std": 0.7234288677573204, "rewards/cosine_scaled_reward": -0.18304866866674274, "rewards/format_reward": 0.9583333432674408, "step": 350 }, { "advantage_max": 1.7308290600776672, "advantage_mean": -1.8626451797620902e-08, "advantage_min": -1.074808619916439, "advantage_std": 0.9998176246881485, "completion_length": 998.5416870117188, "epoch": 0.40114285714285713, "grad_norm": 1.4578030109405518, "kl": 0.062774658203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.222848061454764e-07, "loss": 0.0025, "reward": 0.5936012240126729, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5936012240126729, "reward_after_std": 0.6550269387662411, "reward_before_mean": 0.87417808547616, "reward_before_std": 0.6078615933656693, "reward_change_max": 0.0, "reward_change_mean": -0.280576853081584, "reward_change_min": -0.4290633983910084, "reward_change_std": 0.16038669738918543, "reward_std": 0.6550269685685635, "rewards/cosine_scaled_reward": -0.042077645659446716, "rewards/format_reward": 0.9583333358168602, "step": 351 }, { "advantage_max": 1.8318933993577957, "advantage_mean": -5.215406784220278e-08, "advantage_min": -1.0214067623019218, "advantage_std": 0.9997903853654861, "completion_length": 1247.5000457763672, "epoch": 0.4022857142857143, "grad_norm": 1.924621343612671, "kl": 0.40142059326171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.195807108082429e-07, "loss": 0.0161, "reward": 0.5140209225937724, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5140209225937724, "reward_after_std": 0.6413620505481958, "reward_before_mean": 0.7743731364607811, "reward_before_std": 0.5711536388844252, "reward_change_max": 0.00038998574018478394, "reward_change_mean": -0.26035225205123425, "reward_change_min": -0.37987200915813446, "reward_change_std": 0.14672334119677544, "reward_std": 0.6413620561361313, "rewards/cosine_scaled_reward": -0.06073010340332985, "rewards/format_reward": 0.8958333507180214, "step": 352 }, { "advantage_max": 1.6363515406847, "advantage_mean": -9.9341087578253e-09, "advantage_min": -1.1301787421107292, "advantage_std": 0.999807745218277, "completion_length": 1015.1041831970215, "epoch": 0.4034285714285714, "grad_norm": 1.1657310724258423, "kl": 0.24164581298828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.168878457820915e-07, "loss": 0.0097, "reward": 0.8914534251671284, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8914534251671284, "reward_after_std": 0.7140428274869919, "reward_before_mean": 1.233622845262289, "reward_before_std": 0.6822979040443897, "reward_change_max": 0.0, "reward_change_mean": -0.34216937981545925, "reward_change_min": -0.5395658630877733, "reward_change_std": 0.20784274209290743, "reward_std": 0.7140428423881531, "rewards/cosine_scaled_reward": 0.14806138863787055, "rewards/format_reward": 0.9375000074505806, "step": 353 }, { "advantage_max": 1.599271759390831, "advantage_mean": -2.7939677682553565e-08, "advantage_min": -1.200988955795765, "advantage_std": 0.999802254140377, "completion_length": 941.0416946411133, "epoch": 0.4045714285714286, "grad_norm": 0.8728342056274414, "kl": 0.1999359130859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.142063423134644e-07, "loss": 0.008, "reward": 0.8483670018613338, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8483670018613338, "reward_after_std": 0.6414902880787849, "reward_before_mean": 1.1879227459430695, "reward_before_std": 0.6066556219011545, "reward_change_max": 0.0, "reward_change_mean": -0.3395557254552841, "reward_change_min": -0.5009829849004745, "reward_change_std": 0.19546702224761248, "reward_std": 0.6414902955293655, "rewards/cosine_scaled_reward": 0.11479469854384661, "rewards/format_reward": 0.9583333432674408, "step": 354 }, { "advantage_max": 1.5588997304439545, "advantage_mean": -6.084640968850863e-08, "advantage_min": -1.1517152562737465, "advantage_std": 0.9998615756630898, "completion_length": 1016.4167022705078, "epoch": 0.4057142857142857, "grad_norm": 1.469150185585022, "kl": 0.19208145141601562, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.115363310950578e-07, "loss": 0.0077, "reward": 0.7298020347952843, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7298020347952843, "reward_after_std": 0.8792191408574581, "reward_before_mean": 1.028229609131813, "reward_before_std": 0.8823234438896179, "reward_change_max": 0.000792287290096283, "reward_change_mean": -0.2984275911003351, "reward_change_min": -0.5347305983304977, "reward_change_std": 0.2032134924083948, "reward_std": 0.8792191408574581, "rewards/cosine_scaled_reward": 0.045364788733422756, "rewards/format_reward": 0.9375000149011612, "step": 355 }, { "advantage_max": 1.5244620889425278, "advantage_mean": -5.2774948189338033e-08, "advantage_min": -1.2598972916603088, "advantage_std": 0.9998257234692574, "completion_length": 1121.3125381469727, "epoch": 0.40685714285714286, "grad_norm": 1.8578813076019287, "kl": 0.23694610595703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.0887794225945143e-07, "loss": 0.0095, "reward": 0.5678749307990074, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5678749307990074, "reward_after_std": 0.7130421288311481, "reward_before_mean": 0.8438384272158146, "reward_before_std": 0.7152368873357773, "reward_change_max": 0.0011678412556648254, "reward_change_mean": -0.2759635243564844, "reward_change_min": -0.45209793746471405, "reward_change_std": 0.1840755846351385, "reward_std": 0.7130421474575996, "rewards/cosine_scaled_reward": -0.025997468270361423, "rewards/format_reward": 0.895833358168602, "step": 356 }, { "advantage_max": 1.785775288939476, "advantage_mean": -2.1730860499946658e-08, "advantage_min": -0.836478516459465, "advantage_std": 0.9998077526688576, "completion_length": 1165.31254196167, "epoch": 0.408, "grad_norm": 4.218529224395752, "kl": 0.4105377197265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.062313053727671e-07, "loss": 0.0164, "reward": 0.36710093077272177, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36710093077272177, "reward_after_std": 0.7088046334683895, "reward_before_mean": 0.5877696983516216, "reward_before_std": 0.6624849736690521, "reward_change_max": 0.0, "reward_change_mean": -0.22066876105964184, "reward_change_min": -0.383340559899807, "reward_change_std": 0.13388633634895086, "reward_std": 0.7088046558201313, "rewards/cosine_scaled_reward": -0.15403182711452246, "rewards/format_reward": 0.8958333432674408, "step": 357 }, { "advantage_max": 1.6324420422315598, "advantage_mean": -5.587936335871291e-09, "advantage_min": -1.0728551223874092, "advantage_std": 0.999869205057621, "completion_length": 1239.2500305175781, "epoch": 0.40914285714285714, "grad_norm": 0.9532815217971802, "kl": 0.367156982421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.0359654942835247e-07, "loss": 0.0146, "reward": 0.8309711366891861, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8309711366891861, "reward_after_std": 0.9139232225716114, "reward_before_mean": 1.1447288244962692, "reward_before_std": 0.8961115628480911, "reward_change_max": 0.0, "reward_change_mean": -0.3137576524168253, "reward_change_min": -0.5409190393984318, "reward_change_std": 0.2029905915260315, "reward_std": 0.9139232337474823, "rewards/cosine_scaled_reward": 0.10361439734697342, "rewards/format_reward": 0.9375000074505806, "step": 358 }, { "advantage_max": 1.668090134859085, "advantage_mean": -4.672134945593598e-08, "advantage_min": -1.0278353244066238, "advantage_std": 0.9997305795550346, "completion_length": 831.1875190734863, "epoch": 0.4102857142857143, "grad_norm": 1.9846396446228027, "kl": 0.22521209716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.0097380284049523e-07, "loss": 0.009, "reward": 0.43807821813970804, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.43807821813970804, "reward_after_std": 0.48141663894057274, "reward_before_mean": 0.6997129991650581, "reward_before_std": 0.4477991936728358, "reward_change_max": 0.0, "reward_change_mean": -0.2616347875446081, "reward_change_min": -0.4088161289691925, "reward_change_std": 0.15432436391711235, "reward_std": 0.48141664266586304, "rewards/cosine_scaled_reward": -0.10847685020416975, "rewards/format_reward": 0.9166666716337204, "step": 359 }, { "advantage_max": 1.7154224514961243, "advantage_mean": -7.698933801592034e-08, "advantage_min": -1.068584568798542, "advantage_std": 0.9998128190636635, "completion_length": 1000.2083587646484, "epoch": 0.4114285714285714, "grad_norm": 2.706932783126831, "kl": 0.156585693359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.9836319343816397e-07, "loss": 0.0063, "reward": 0.8180207312107086, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8180207312107086, "reward_after_std": 0.666923388838768, "reward_before_mean": 1.1441688016057014, "reward_before_std": 0.5980559233576059, "reward_change_max": 0.0, "reward_change_mean": -0.3261481188237667, "reward_change_min": -0.4748628959059715, "reward_change_std": 0.18042928539216518, "reward_std": 0.6669234037399292, "rewards/cosine_scaled_reward": 0.09291772660799325, "rewards/format_reward": 0.9583333432674408, "step": 360 }, { "advantage_max": 1.8375728726387024, "advantage_mean": -5.8362883192941695e-08, "advantage_min": -0.8818403705954552, "advantage_std": 0.999719150364399, "completion_length": 1167.6667175292969, "epoch": 0.4125714285714286, "grad_norm": 0.9807596206665039, "kl": 0.40717315673828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.9576484845877793e-07, "loss": 0.0163, "reward": 0.4029839560389519, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4029839560389519, "reward_after_std": 0.505818497389555, "reward_before_mean": 0.6519249677658081, "reward_before_std": 0.4368733561132103, "reward_change_max": 0.0, "reward_change_mean": -0.24894102104008198, "reward_change_min": -0.3820135109126568, "reward_change_std": 0.14100301824510098, "reward_std": 0.5058185234665871, "rewards/cosine_scaled_reward": -0.13237086776643991, "rewards/format_reward": 0.916666679084301, "step": 361 }, { "advantage_max": 1.8133844584226608, "advantage_mean": -6.2088184593633855e-09, "advantage_min": -0.8550900742411613, "advantage_std": 0.9996546134352684, "completion_length": 876.8958473205566, "epoch": 0.4137142857142857, "grad_norm": 1.415966272354126, "kl": 0.10638427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.931788945420058e-07, "loss": 0.0043, "reward": 0.7734895506873727, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7734895506873727, "reward_after_std": 0.3775805290788412, "reward_before_mean": 1.110964479856193, "reward_before_std": 0.27502059168182313, "reward_change_max": 0.0, "reward_change_mean": -0.3374749179929495, "reward_change_min": -0.4654871243983507, "reward_change_std": 0.17572191823273897, "reward_std": 0.3775805290788412, "rewards/cosine_scaled_reward": 0.05548222362995148, "rewards/format_reward": 1.0, "step": 362 }, { "advantage_max": 1.6137598305940628, "advantage_mean": -3.3527614018424856e-08, "advantage_min": -1.2129024267196655, "advantage_std": 0.9997744932770729, "completion_length": 814.0625305175781, "epoch": 0.41485714285714287, "grad_norm": 0.6677997708320618, "kl": 0.09397125244140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.9060545772359305e-07, "loss": 0.0038, "reward": 1.0306510236114264, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 1.0306510236114264, "reward_after_std": 0.5626971535384655, "reward_before_mean": 1.4122940003871918, "reward_before_std": 0.5126284081488848, "reward_change_max": 0.0, "reward_change_mean": -0.3816429302096367, "reward_change_min": -0.5517849400639534, "reward_change_std": 0.20960881654173136, "reward_std": 0.5626971572637558, "rewards/cosine_scaled_reward": 0.22698031552135944, "rewards/format_reward": 0.9583333358168602, "step": 363 }, { "advantage_max": 1.8092490285634995, "advantage_mean": -8.69234451084111e-09, "advantage_min": -0.888951875269413, "advantage_std": 0.999782919883728, "completion_length": 1011.770866394043, "epoch": 0.416, "grad_norm": 1.6947314739227295, "kl": 0.155548095703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.8804466342921987e-07, "loss": 0.0062, "reward": 0.23180574737489223, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23180574737489223, "reward_after_std": 0.510407954454422, "reward_before_mean": 0.44417205080389977, "reward_before_std": 0.45651115104556084, "reward_change_max": 0.0, "reward_change_mean": -0.21236632205545902, "reward_change_min": -0.32998935878276825, "reward_change_std": 0.12107465602457523, "reward_std": 0.5104079619050026, "rewards/cosine_scaled_reward": -0.2570806494913995, "rewards/format_reward": 0.9583333432674408, "step": 364 }, { "advantage_max": 1.7235835492610931, "advantage_mean": -3.13545276409144e-08, "advantage_min": -0.9894550256431103, "advantage_std": 0.9998277202248573, "completion_length": 1275.5000305175781, "epoch": 0.41714285714285715, "grad_norm": 1.446303129196167, "kl": 0.24599456787109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.854966364683872e-07, "loss": 0.0099, "reward": 0.379636493511498, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.379636493511498, "reward_after_std": 0.7502324618399143, "reward_before_mean": 0.6058452092111111, "reward_before_std": 0.7191748432815075, "reward_change_max": 0.0, "reward_change_mean": -0.22620872594416142, "reward_change_min": -0.3859753981232643, "reward_change_std": 0.14396012295037508, "reward_std": 0.7502324692904949, "rewards/cosine_scaled_reward": -0.12416074390057474, "rewards/format_reward": 0.854166679084301, "step": 365 }, { "advantage_max": 1.7427651286125183, "advantage_mean": -1.2417634476236117e-08, "advantage_min": -1.0082111209630966, "advantage_std": 0.9998277053236961, "completion_length": 1200.9583549499512, "epoch": 0.41828571428571426, "grad_norm": 1.0100181102752686, "kl": 0.10961151123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.829615010283344e-07, "loss": 0.0044, "reward": 0.7537405379116535, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7537405379116535, "reward_after_std": 0.7664575092494488, "reward_before_mean": 1.0614567548036575, "reward_before_std": 0.7194897159934044, "reward_change_max": 0.0, "reward_change_mean": -0.307716216892004, "reward_change_min": -0.49389464780688286, "reward_change_std": 0.18914083298295736, "reward_std": 0.7664575390517712, "rewards/cosine_scaled_reward": 0.051561687141656876, "rewards/format_reward": 0.9583333432674408, "step": 366 }, { "advantage_max": 1.588751271367073, "advantage_mean": -1.2728075482471013e-08, "advantage_min": -1.3483324870467186, "advantage_std": 0.9998136162757874, "completion_length": 1194.437515258789, "epoch": 0.41942857142857143, "grad_norm": 1.5068424940109253, "kl": 0.2128753662109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.8043938066798645e-07, "loss": 0.0085, "reward": 0.5264971938449889, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5264971938449889, "reward_after_std": 0.6666704639792442, "reward_before_mean": 0.7939775194972754, "reward_before_std": 0.6534105539321899, "reward_change_max": 0.0004104003310203552, "reward_change_mean": -0.2674803026020527, "reward_change_min": -0.4358235076069832, "reward_change_std": 0.16966222040355206, "reward_std": 0.6666704788804054, "rewards/cosine_scaled_reward": -0.06134458933956921, "rewards/format_reward": 0.916666679084301, "step": 367 }, { "advantage_max": 1.6325557231903076, "advantage_mean": -1.986821529520455e-08, "advantage_min": -1.0944968909025192, "advantage_std": 0.9997764453291893, "completion_length": 1412.7500305175781, "epoch": 0.4205714285714286, "grad_norm": 2.15203595161438, "kl": 0.304656982421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.7793039831193133e-07, "loss": 0.0122, "reward": 0.43621888384222984, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.43621888384222984, "reward_after_std": 0.6182968616485596, "reward_before_mean": 0.6909672629553825, "reward_before_std": 0.604456739500165, "reward_change_max": 0.0, "reward_change_mean": -0.25474837608635426, "reward_change_min": -0.44062450528144836, "reward_change_std": 0.16634196415543556, "reward_std": 0.6182968728244305, "rewards/cosine_scaled_reward": -0.10243305005133152, "rewards/format_reward": 0.8958333395421505, "step": 368 }, { "advantage_max": 1.6553238332271576, "advantage_mean": -1.3659398168108794e-08, "advantage_min": -1.0655308365821838, "advantage_std": 0.9998332932591438, "completion_length": 1120.020851135254, "epoch": 0.4217142857142857, "grad_norm": 2.682713747024536, "kl": 0.19134521484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.7543467624442956e-07, "loss": 0.0077, "reward": 0.5608847080729902, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5608847080729902, "reward_after_std": 0.8303621709346771, "reward_before_mean": 0.8241658713668585, "reward_before_std": 0.8261894173920155, "reward_change_max": 0.0, "reward_change_mean": -0.2632811777293682, "reward_change_min": -0.5107766017317772, "reward_change_std": 0.18701652251183987, "reward_std": 0.8303622044622898, "rewards/cosine_scaled_reward": -0.04625041130930185, "rewards/format_reward": 0.9166666865348816, "step": 369 }, { "advantage_max": 1.606068804860115, "advantage_mean": -4.842877510125021e-08, "advantage_min": -1.1060052961111069, "advantage_std": 0.999760165810585, "completion_length": 1001.6042098999023, "epoch": 0.4228571428571429, "grad_norm": 0.9525280594825745, "kl": 0.124969482421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.729523361034538e-07, "loss": 0.005, "reward": 0.5136999785900116, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5136999785900116, "reward_after_std": 0.4509398341178894, "reward_before_mean": 0.7930864812806249, "reward_before_std": 0.39981068670749664, "reward_change_max": 0.0006158947944641113, "reward_change_mean": -0.2793865194544196, "reward_change_min": -0.411048436537385, "reward_change_std": 0.16224909853190184, "reward_std": 0.45093984156847, "rewards/cosine_scaled_reward": -0.07220677100121975, "rewards/format_reward": 0.9375000074505806, "step": 370 }, { "advantage_max": 1.7884211093187332, "advantage_mean": -1.1424224100053948e-07, "advantage_min": -0.9556703455746174, "advantage_std": 0.9997562393546104, "completion_length": 696.0625228881836, "epoch": 0.424, "grad_norm": 0.7482513189315796, "kl": 0.0223236083984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.7048349887476037e-07, "loss": 0.0009, "reward": 0.89418915938586, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.89418915938586, "reward_after_std": 0.5914383120834827, "reward_before_mean": 1.2429718682542443, "reward_before_std": 0.5024533607065678, "reward_change_max": 0.0, "reward_change_mean": -0.3487827442586422, "reward_change_min": -0.48125503212213516, "reward_change_std": 0.19065604731440544, "reward_std": 0.5914383307099342, "rewards/cosine_scaled_reward": 0.14231925923377275, "rewards/format_reward": 0.9583333358168602, "step": 371 }, { "advantage_max": 1.7135415375232697, "advantage_mean": -4.842877521227251e-08, "advantage_min": -1.1480904445052147, "advantage_std": 0.9998008906841278, "completion_length": 1243.895881652832, "epoch": 0.42514285714285716, "grad_norm": 2.1823599338531494, "kl": 0.28174591064453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.6802828488599294e-07, "loss": 0.0113, "reward": 0.6811677659861743, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6811677659861743, "reward_after_std": 0.5767421014606953, "reward_before_mean": 0.9846199788153172, "reward_before_std": 0.5272019132971764, "reward_change_max": 0.0, "reward_change_mean": -0.3034522123634815, "reward_change_min": -0.4524131715297699, "reward_change_std": 0.17368095833808184, "reward_std": 0.5767421163618565, "rewards/cosine_scaled_reward": 0.07564329542219639, "rewards/format_reward": 0.8333333507180214, "step": 372 }, { "advantage_max": 1.6011265963315964, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -1.062184102833271, "advantage_std": 0.9997665360569954, "completion_length": 776.4792022705078, "epoch": 0.42628571428571427, "grad_norm": 1.846110463142395, "kl": 0.09600067138671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.655868138008171e-07, "loss": 0.0038, "reward": 0.2838655477389693, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2838655477389693, "reward_after_std": 0.5549126155674458, "reward_before_mean": 0.5090842507779598, "reward_before_std": 0.5350898541510105, "reward_change_max": 0.002757057547569275, "reward_change_mean": -0.22521871328353882, "reward_change_min": -0.3687067572027445, "reward_change_std": 0.1422775825485587, "reward_std": 0.5549126267433167, "rewards/cosine_scaled_reward": -0.22462454997003078, "rewards/format_reward": 0.9583333432674408, "step": 373 }, { "advantage_max": 1.7018559277057648, "advantage_mean": -7.792065803702286e-08, "advantage_min": -1.1280000060796738, "advantage_std": 0.9998096823692322, "completion_length": 959.5000305175781, "epoch": 0.42742857142857144, "grad_norm": 1.918383240699768, "kl": 0.103363037109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.631592046130896e-07, "loss": 0.0041, "reward": 0.5767368387896568, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5767368387896568, "reward_after_std": 0.7010692954063416, "reward_before_mean": 0.8509616479277611, "reward_before_std": 0.6722024735063314, "reward_change_max": 0.0, "reward_change_mean": -0.27422484941780567, "reward_change_min": -0.4227101244032383, "reward_change_std": 0.1696557030081749, "reward_std": 0.7010693028569221, "rewards/cosine_scaled_reward": -0.03285252209752798, "rewards/format_reward": 0.9166666865348816, "step": 374 }, { "advantage_max": 1.7416959404945374, "advantage_mean": 6.208816794028849e-10, "advantage_min": -1.097055770456791, "advantage_std": 0.999822311103344, "completion_length": 1183.270866394043, "epoch": 0.42857142857142855, "grad_norm": 1.3700175285339355, "kl": 0.21044921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.6074557564105724e-07, "loss": 0.0084, "reward": 0.7338640615344048, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7338640615344048, "reward_after_std": 0.6855736374855042, "reward_before_mean": 1.0383987962268293, "reward_before_std": 0.6238086558878422, "reward_change_max": 0.0, "reward_change_mean": -0.30453467927873135, "reward_change_min": -0.46512749418616295, "reward_change_std": 0.17507017496973276, "reward_std": 0.6855736672878265, "rewards/cosine_scaled_reward": 0.07128270622342825, "rewards/format_reward": 0.895833358168602, "step": 375 }, { "advantage_max": 1.6964893490076065, "advantage_mean": -2.8250118910833066e-08, "advantage_min": -1.028324469923973, "advantage_std": 0.9998196437954903, "completion_length": 1078.8750305175781, "epoch": 0.4297142857142857, "grad_norm": 0.6061641573905945, "kl": 0.114166259765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.583460445215911e-07, "loss": 0.0046, "reward": 0.8502661599777639, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8502661599777639, "reward_after_std": 0.7495008744299412, "reward_before_mean": 1.177900506183505, "reward_before_std": 0.690794087946415, "reward_change_max": 0.0, "reward_change_mean": -0.327634334564209, "reward_change_min": -0.5012494549155235, "reward_change_std": 0.18774799816310406, "reward_std": 0.7495008744299412, "rewards/cosine_scaled_reward": 0.08895024354569614, "rewards/format_reward": 1.0, "step": 376 }, { "advantage_max": 1.652765303850174, "advantage_mean": 3.104408563547878e-09, "advantage_min": -1.1104619428515434, "advantage_std": 0.9998508244752884, "completion_length": 1177.3125190734863, "epoch": 0.4308571428571429, "grad_norm": 2.5280399322509766, "kl": 0.2825164794921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.5596072820445254e-07, "loss": 0.0113, "reward": 0.29627796332351863, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.29627796332351863, "reward_after_std": 0.8333842419087887, "reward_before_mean": 0.5023422092199326, "reward_before_std": 0.8306805156171322, "reward_change_max": 0.00010532140731811523, "reward_change_mean": -0.206064248457551, "reward_change_min": -0.41014874540269375, "reward_change_std": 0.15475755836814642, "reward_std": 0.8333842568099499, "rewards/cosine_scaled_reward": -0.16549558006227016, "rewards/format_reward": 0.8333333507180214, "step": 377 }, { "advantage_max": 1.5440286844968796, "advantage_mean": -5.215406562175673e-08, "advantage_min": -1.1627759784460068, "advantage_std": 0.9998477250337601, "completion_length": 998.8750228881836, "epoch": 0.432, "grad_norm": 1.106315016746521, "kl": 0.0748138427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.5358974294659373e-07, "loss": 0.003, "reward": 0.8236580304801464, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8236580304801464, "reward_after_std": 0.7726437486708164, "reward_before_mean": 1.1485275700688362, "reward_before_std": 0.760891504585743, "reward_change_max": 0.0, "reward_change_mean": -0.3248695805668831, "reward_change_min": -0.5357452519237995, "reward_change_std": 0.20013280678540468, "reward_std": 0.7726437710225582, "rewards/cosine_scaled_reward": 0.09509711805731058, "rewards/format_reward": 0.9583333432674408, "step": 378 }, { "advantage_max": 1.7088170647621155, "advantage_mean": 7.450580735701706e-09, "advantage_min": -1.0716500952839851, "advantage_std": 0.99978818744421, "completion_length": 1143.8333625793457, "epoch": 0.43314285714285716, "grad_norm": 1.33283269405365, "kl": 0.16924285888671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.512332043064913e-07, "loss": 0.0068, "reward": 0.6464580819010735, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6464580819010735, "reward_after_std": 0.602570503950119, "reward_before_mean": 0.9419687129557133, "reward_before_std": 0.5499370843172073, "reward_change_max": 0.0, "reward_change_mean": -0.2955106142908335, "reward_change_min": -0.45715222880244255, "reward_change_std": 0.16575047001242638, "reward_std": 0.6025705374777317, "rewards/cosine_scaled_reward": -0.018598987255245447, "rewards/format_reward": 0.9791666716337204, "step": 379 }, { "advantage_max": 1.668600931763649, "advantage_mean": -2.7939676461308238e-08, "advantage_min": -1.1768651977181435, "advantage_std": 0.9997805878520012, "completion_length": 915.5208702087402, "epoch": 0.4342857142857143, "grad_norm": 0.6677605509757996, "kl": 0.12503814697265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.488912271385139e-07, "loss": 0.005, "reward": 0.8405767795629799, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.8405767795629799, "reward_after_std": 0.5229048319160938, "reward_before_mean": 1.1833235658705235, "reward_before_std": 0.45090617425739765, "reward_change_max": 0.0, "reward_change_mean": -0.34274679608643055, "reward_change_min": -0.5035507827997208, "reward_change_std": 0.1878078691661358, "reward_std": 0.5229048356413841, "rewards/cosine_scaled_reward": 0.11249509919434786, "rewards/format_reward": 0.9583333432674408, "step": 380 }, { "advantage_max": 1.6725091934204102, "advantage_mean": -1.5366822592177698e-08, "advantage_min": -1.1343127712607384, "advantage_std": 0.999749131500721, "completion_length": 1320.1875228881836, "epoch": 0.43542857142857144, "grad_norm": 1.8746590614318848, "kl": 0.4720611572265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.465639255873246e-07, "loss": 0.0189, "reward": 0.1860162508673966, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.1860162508673966, "reward_after_std": 0.5676998719573021, "reward_before_mean": 0.38731856519007124, "reward_before_std": 0.5420579835772514, "reward_change_max": 0.00022460520267486572, "reward_change_mean": -0.20130231231451035, "reward_change_min": -0.3233092837035656, "reward_change_std": 0.1276199435815215, "reward_std": 0.5676998980343342, "rewards/cosine_scaled_reward": -0.22300739493221045, "rewards/format_reward": 0.833333358168602, "step": 381 }, { "advantage_max": 1.754372239112854, "advantage_mean": -5.836288241578558e-08, "advantage_min": -0.9967377930879593, "advantage_std": 0.9997845068573952, "completion_length": 907.1250076293945, "epoch": 0.43657142857142855, "grad_norm": 0.689816415309906, "kl": 0.05706024169921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.4425141308231765e-07, "loss": 0.0023, "reward": 0.4433266781270504, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4433266781270504, "reward_after_std": 0.5864979885518551, "reward_before_mean": 0.6966921277344227, "reward_before_std": 0.5301778148859739, "reward_change_max": 0.0, "reward_change_mean": -0.253365445882082, "reward_change_min": -0.40340840443968773, "reward_change_std": 0.1430813828483224, "reward_std": 0.5864980109035969, "rewards/cosine_scaled_reward": -0.13082062639296055, "rewards/format_reward": 0.9583333432674408, "step": 382 }, { "advantage_max": 1.7084899097681046, "advantage_mean": -1.0927518856451712e-07, "advantage_min": -1.0436795875430107, "advantage_std": 0.9998108968138695, "completion_length": 1010.9166870117188, "epoch": 0.4377142857142857, "grad_norm": 1.530521273612976, "kl": 0.320953369140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.4195380233209006e-07, "loss": 0.0128, "reward": 0.9637196809053421, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9637196809053421, "reward_after_std": 0.7095706351101398, "reward_before_mean": 1.315572265535593, "reward_before_std": 0.6498497929424047, "reward_change_max": 0.0005817189812660217, "reward_change_mean": -0.35185267589986324, "reward_change_min": -0.5468010529875755, "reward_change_std": 0.2091050622984767, "reward_std": 0.7095706537365913, "rewards/cosine_scaled_reward": 0.1786194909363985, "rewards/format_reward": 0.9583333358168602, "step": 383 }, { "advantage_max": 1.5724465548992157, "advantage_mean": -4.221995775210985e-08, "advantage_min": -1.0186072289943695, "advantage_std": 0.9998998194932938, "completion_length": 874.6666870117188, "epoch": 0.43885714285714283, "grad_norm": 1.4073811769485474, "kl": 0.1265411376953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.3967120531894857e-07, "loss": 0.0051, "reward": 0.9594648890197277, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9594648890197277, "reward_after_std": 1.026247426867485, "reward_before_mean": 1.2966319471597672, "reward_before_std": 1.0357157215476036, "reward_change_max": 0.0, "reward_change_mean": -0.33716709539294243, "reward_change_min": -0.6071061193943024, "reward_change_std": 0.23107926733791828, "reward_std": 1.0262474715709686, "rewards/cosine_scaled_reward": 0.17956596659496427, "rewards/format_reward": 0.9375000074505806, "step": 384 }, { "advantage_max": 1.67272287607193, "advantage_mean": -4.159907573964006e-08, "advantage_min": -1.1390063092112541, "advantage_std": 0.9998363107442856, "completion_length": 1162.333381652832, "epoch": 0.44, "grad_norm": 2.0045926570892334, "kl": 0.509124755859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.374037332934512e-07, "loss": 0.0203, "reward": 0.588574624620378, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.588574624620378, "reward_after_std": 0.8666601609438658, "reward_before_mean": 0.8494505360722542, "reward_before_std": 0.8376770131289959, "reward_change_max": 0.0007246658205986023, "reward_change_mean": -0.2608759067952633, "reward_change_min": -0.45984548330307007, "reward_change_std": 0.1684568226337433, "reward_std": 0.866660175845027, "rewards/cosine_scaled_reward": -0.04402475664392114, "rewards/format_reward": 0.9375000149011612, "step": 385 }, { "advantage_max": 1.5228633731603622, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -1.2088345661759377, "advantage_std": 0.999832883477211, "completion_length": 1030.208366394043, "epoch": 0.44114285714285717, "grad_norm": 3.5907418727874756, "kl": 0.333953857421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.3515149676898552e-07, "loss": 0.0134, "reward": 0.5742043564096093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5742043564096093, "reward_after_std": 0.7357494235038757, "reward_before_mean": 0.852111779153347, "reward_before_std": 0.7517198957502842, "reward_change_max": 0.0, "reward_change_mean": -0.27790740318596363, "reward_change_min": -0.48950906842947006, "reward_change_std": 0.1909992415457964, "reward_std": 0.7357494346797466, "rewards/cosine_scaled_reward": -0.01144411601126194, "rewards/format_reward": 0.8750000298023224, "step": 386 }, { "advantage_max": 1.442567840218544, "advantage_mean": -6.208817349140361e-09, "advantage_min": -1.3513574451208115, "advantage_std": 0.9998595416545868, "completion_length": 1112.1042022705078, "epoch": 0.4422857142857143, "grad_norm": 2.41609263420105, "kl": 0.5802001953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.3291460551638237e-07, "loss": 0.0232, "reward": 0.5504259113222361, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5504259113222361, "reward_after_std": 0.8110573142766953, "reward_before_mean": 0.8169084526598454, "reward_before_std": 0.8353900499641895, "reward_change_max": 0.0005081519484519958, "reward_change_mean": -0.2664825525134802, "reward_change_min": -0.4708978869020939, "reward_change_std": 0.18911569099873304, "reward_std": 0.8110573403537273, "rewards/cosine_scaled_reward": -0.008212439250200987, "rewards/format_reward": 0.8333333507180214, "step": 387 }, { "advantage_max": 1.4479832649230957, "advantage_mean": -3.445893592690652e-08, "advantage_min": -1.3541993200778961, "advantage_std": 0.9998626410961151, "completion_length": 1048.25004196167, "epoch": 0.44342857142857145, "grad_norm": 2.266045331954956, "kl": 0.32752227783203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.306931685585657e-07, "loss": 0.0131, "reward": 0.7618649862706661, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7618649862706661, "reward_after_std": 0.8572219982743263, "reward_before_mean": 1.070565501227975, "reward_before_std": 0.8761973008513451, "reward_change_max": 0.0, "reward_change_mean": -0.3087005550041795, "reward_change_min": -0.5235641039907932, "reward_change_std": 0.20725560653954744, "reward_std": 0.8572220206260681, "rewards/cosine_scaled_reward": 0.09778274083510041, "rewards/format_reward": 0.8750000111758709, "step": 388 }, { "advantage_max": 1.5555903911590576, "advantage_mean": -1.179675274132208e-08, "advantage_min": -1.23585844039917, "advantage_std": 0.9997957497835159, "completion_length": 992.4166946411133, "epoch": 0.44457142857142856, "grad_norm": 1.277256965637207, "kl": 0.1567840576171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2848729416523859e-07, "loss": 0.0063, "reward": 0.5500368820503354, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5500368820503354, "reward_after_std": 0.5696189440786839, "reward_before_mean": 0.831305742263794, "reward_before_std": 0.5486670546233654, "reward_change_max": 0.0004959702491760254, "reward_change_mean": -0.28126880899071693, "reward_change_min": -0.44873275980353355, "reward_change_std": 0.16680119093507528, "reward_std": 0.569618958979845, "rewards/cosine_scaled_reward": -0.05309715494513512, "rewards/format_reward": 0.9375000074505806, "step": 389 }, { "advantage_max": 1.7426921427249908, "advantage_mean": -2.048909719665204e-08, "advantage_min": -1.0142326354980469, "advantage_std": 0.9997293725609779, "completion_length": 1207.3750228881836, "epoch": 0.44571428571428573, "grad_norm": 4.534472942352295, "kl": 0.4759674072265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2629708984760706e-07, "loss": 0.019, "reward": 0.36581041291356087, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.36581041291356087, "reward_after_std": 0.5021828729659319, "reward_before_mean": 0.6087434627115726, "reward_before_std": 0.44767044857144356, "reward_change_max": 0.0, "reward_change_mean": -0.24293305538594723, "reward_change_min": -0.37964949011802673, "reward_change_std": 0.13425681181252003, "reward_std": 0.502182874828577, "rewards/cosine_scaled_reward": -0.17479494586586952, "rewards/format_reward": 0.9583333358168602, "step": 390 }, { "advantage_max": 1.5873253792524338, "advantage_mean": -6.208818459363386e-10, "advantage_min": -1.0947215482592583, "advantage_std": 0.9998257681727409, "completion_length": 919.7500267028809, "epoch": 0.44685714285714284, "grad_norm": 2.652310848236084, "kl": 0.55010986328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2412266235313973e-07, "loss": 0.022, "reward": 0.6688290182501078, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6688290182501078, "reward_after_std": 0.7176588997244835, "reward_before_mean": 0.9627345129847527, "reward_before_std": 0.7010093629360199, "reward_change_max": 0.0, "reward_change_mean": -0.293905483558774, "reward_change_min": -0.4939264915883541, "reward_change_std": 0.18045319989323616, "reward_std": 0.7176589332520962, "rewards/cosine_scaled_reward": 0.02303390298038721, "rewards/format_reward": 0.916666679084301, "step": 391 }, { "advantage_max": 1.6615222543478012, "advantage_mean": -7.636845200664766e-08, "advantage_min": -1.0366918966174126, "advantage_std": 0.9997808933258057, "completion_length": 1095.5416984558105, "epoch": 0.448, "grad_norm": 2.397010326385498, "kl": 0.3244476318359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2196411766036487e-07, "loss": 0.013, "reward": 0.5707610095851123, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5707610095851123, "reward_after_std": 0.5330556966364384, "reward_before_mean": 0.8551894128322601, "reward_before_std": 0.4827442280948162, "reward_change_max": 0.0, "reward_change_mean": -0.2844284549355507, "reward_change_min": -0.4532608240842819, "reward_change_std": 0.16107281111180782, "reward_std": 0.5330557078123093, "rewards/cosine_scaled_reward": -0.051571968011558056, "rewards/format_reward": 0.9583333432674408, "step": 392 }, { "advantage_max": 1.6159837245941162, "advantage_mean": -2.6077032422300306e-08, "advantage_min": -1.1400106847286224, "advantage_std": 0.9998411536216736, "completion_length": 990.8750038146973, "epoch": 0.4491428571428571, "grad_norm": 2.5222160816192627, "kl": 0.299957275390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1982156097370557e-07, "loss": 0.012, "reward": 0.5573246697895229, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5573246697895229, "reward_after_std": 0.8247486017644405, "reward_before_mean": 0.8237691428512335, "reward_before_std": 0.8334475830197334, "reward_change_max": 0.0019779950380325317, "reward_change_mean": -0.26644447445869446, "reward_change_min": -0.4715557172894478, "reward_change_std": 0.1896289987489581, "reward_std": 0.8247486054897308, "rewards/cosine_scaled_reward": -0.03603211464360356, "rewards/format_reward": 0.895833358168602, "step": 393 }, { "advantage_max": 1.7188266068696976, "advantage_mean": -3.725290076417309e-09, "advantage_min": -0.9942247793078423, "advantage_std": 0.9998029246926308, "completion_length": 1272.8958740234375, "epoch": 0.4502857142857143, "grad_norm": 1.3673608303070068, "kl": 0.454193115234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1769509671835223e-07, "loss": 0.0182, "reward": 0.26626094873063266, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.26626094873063266, "reward_after_std": 0.5838912799954414, "reward_before_mean": 0.48196034505963326, "reward_before_std": 0.5439142994582653, "reward_change_max": 0.0, "reward_change_mean": -0.21569938398897648, "reward_change_min": -0.3452872224152088, "reward_change_std": 0.13051257003098726, "reward_std": 0.583891287446022, "rewards/cosine_scaled_reward": -0.2277698372490704, "rewards/format_reward": 0.9375000074505806, "step": 394 }, { "advantage_max": 1.6559451222419739, "advantage_mean": -3.849466734262563e-08, "advantage_min": -1.066048376262188, "advantage_std": 0.9998432993888855, "completion_length": 982.9791927337646, "epoch": 0.4514285714285714, "grad_norm": 4.505710601806641, "kl": 0.381683349609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1558482853517253e-07, "loss": 0.0153, "reward": 0.6048067780211568, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6048067780211568, "reward_after_std": 0.7917204722762108, "reward_before_mean": 0.8776602856814861, "reward_before_std": 0.7769417259842157, "reward_change_max": 0.0, "reward_change_mean": -0.2728535272181034, "reward_change_min": -0.49752773344516754, "reward_change_std": 0.18069400545209646, "reward_std": 0.7917204722762108, "rewards/cosine_scaled_reward": -0.02991985995322466, "rewards/format_reward": 0.9375000149011612, "step": 395 }, { "advantage_max": 1.6781842708587646, "advantage_mean": -1.7229467852430957e-08, "advantage_min": -1.1216829270124435, "advantage_std": 0.9997909143567085, "completion_length": 957.6875381469727, "epoch": 0.45257142857142857, "grad_norm": 1.514703392982483, "kl": 0.1158599853515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.134908592756607e-07, "loss": 0.0046, "reward": 0.554951966740191, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.554951966740191, "reward_after_std": 0.5902515798807144, "reward_before_mean": 0.8319753324612975, "reward_before_std": 0.5441593956202269, "reward_change_max": 0.0, "reward_change_mean": -0.2770233787596226, "reward_change_min": -0.4109371602535248, "reward_change_std": 0.1581767164170742, "reward_std": 0.5902515836060047, "rewards/cosine_scaled_reward": -0.052762338891625404, "rewards/format_reward": 0.9375000074505806, "step": 396 }, { "advantage_max": 1.7707886546850204, "advantage_mean": -5.502564409676225e-08, "advantage_min": -0.994467705488205, "advantage_std": 0.999783493578434, "completion_length": 962.8958587646484, "epoch": 0.45371428571428574, "grad_norm": 0.738290548324585, "kl": 0.18896484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1141329099692406e-07, "loss": 0.0076, "reward": 0.5158554278314114, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5158554278314114, "reward_after_std": 0.6469662524759769, "reward_before_mean": 0.7763611227273941, "reward_before_std": 0.5864624343812466, "reward_change_max": 0.0, "reward_change_mean": -0.26050567254424095, "reward_change_min": -0.38990357145667076, "reward_change_std": 0.14337906893342733, "reward_std": 0.6469662673771381, "rewards/cosine_scaled_reward": -0.08056945540010929, "rewards/format_reward": 0.9375000074505806, "step": 397 }, { "advantage_max": 1.7128700017929077, "advantage_mean": -1.8626452269465688e-08, "advantage_min": -0.9575743451714516, "advantage_std": 0.9998482540249825, "completion_length": 953.5833587646484, "epoch": 0.45485714285714285, "grad_norm": 3.236091136932373, "kl": 0.448089599609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0935222495670968e-07, "loss": 0.0179, "reward": 0.478635611012578, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.478635611012578, "reward_after_std": 0.8629732169210911, "reward_before_mean": 0.7193904295563698, "reward_before_std": 0.8417020551860332, "reward_change_max": 0.0, "reward_change_mean": -0.24075481295585632, "reward_change_min": -0.4607015699148178, "reward_change_std": 0.16871712915599346, "reward_std": 0.862973265349865, "rewards/cosine_scaled_reward": -0.0882214680314064, "rewards/format_reward": 0.8958333507180214, "step": 398 }, { "advantage_max": 1.8339340090751648, "advantage_mean": -5.463759156221215e-08, "advantage_min": -0.9071780741214752, "advantage_std": 0.9998093396425247, "completion_length": 950.1250305175781, "epoch": 0.456, "grad_norm": 1.681481957435608, "kl": 0.17317962646484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0730776160846853e-07, "loss": 0.0069, "reward": 0.768366850912571, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.768366850912571, "reward_after_std": 0.7463151644915342, "reward_before_mean": 1.0743264742195606, "reward_before_std": 0.6716021299362183, "reward_change_max": 0.0, "reward_change_mean": -0.3059596102684736, "reward_change_min": -0.4656844697892666, "reward_change_std": 0.17044306732714176, "reward_std": 0.7463151644915342, "rewards/cosine_scaled_reward": 0.04757988639175892, "rewards/format_reward": 0.9791666716337204, "step": 399 }, { "advantage_max": 1.6985308676958084, "advantage_mean": -5.091230281806247e-08, "advantage_min": -1.1028654128313065, "advantage_std": 0.999831885099411, "completion_length": 962.2500190734863, "epoch": 0.45714285714285713, "grad_norm": 1.1464098691940308, "kl": 0.377685546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0528000059645995e-07, "loss": 0.0151, "reward": 1.0859681889414787, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 1.0859681889414787, "reward_after_std": 0.7348118424415588, "reward_before_mean": 1.4632576033473015, "reward_before_std": 0.6647907719016075, "reward_change_max": 0.0, "reward_change_mean": -0.37728938460350037, "reward_change_min": -0.5801199749112129, "reward_change_std": 0.21376279927790165, "reward_std": 0.7348118610680103, "rewards/cosine_scaled_reward": 0.2420454490929842, "rewards/format_reward": 0.9791666716337204, "step": 400 }, { "advantage_max": 1.6830779165029526, "advantage_mean": -1.9247333171712455e-08, "advantage_min": -1.0928455740213394, "advantage_std": 0.9998010620474815, "completion_length": 1125.5625228881836, "epoch": 0.4582857142857143, "grad_norm": 2.874091386795044, "kl": 0.520751953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.032690407508949e-07, "loss": 0.0208, "reward": 0.7291412346530706, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.7291412346530706, "reward_after_std": 0.561864573508501, "reward_before_mean": 1.0420525595545769, "reward_before_std": 0.496932677924633, "reward_change_max": 0.0, "reward_change_mean": -0.31291132792830467, "reward_change_min": -0.45355165749788284, "reward_change_std": 0.17144971620291471, "reward_std": 0.5618645772337914, "rewards/cosine_scaled_reward": 0.07310961186885834, "rewards/format_reward": 0.8958333432674408, "step": 401 }, { "advantage_max": 1.5561466589570045, "advantage_mean": -1.2262414639252484e-08, "advantage_min": -1.185271441936493, "advantage_std": 0.9997816905379295, "completion_length": 947.8541946411133, "epoch": 0.4594285714285714, "grad_norm": 2.5312564373016357, "kl": 0.2422943115234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0127498008311922e-07, "loss": 0.0097, "reward": 0.6046357601881027, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6046357601881027, "reward_after_std": 0.526579961180687, "reward_before_mean": 0.9012303352355957, "reward_before_std": 0.4946477487683296, "reward_change_max": 0.0, "reward_change_mean": -0.29659455455839634, "reward_change_min": -0.4401095174252987, "reward_change_std": 0.1744805257767439, "reward_std": 0.5265799760818481, "rewards/cosine_scaled_reward": -0.018134850077331066, "rewards/format_reward": 0.9375000074505806, "step": 402 }, { "advantage_max": 1.7446418106555939, "advantage_mean": -5.463759467083662e-08, "advantage_min": -1.0116091333329678, "advantage_std": 0.9997680559754372, "completion_length": 935.0833511352539, "epoch": 0.4605714285714286, "grad_norm": 1.1639891862869263, "kl": 0.3494873046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.9929791578083655e-07, "loss": 0.014, "reward": 0.7093625888228416, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7093625888228416, "reward_after_std": 0.5025767236948013, "reward_before_mean": 1.0228974930942059, "reward_before_std": 0.41769439866766334, "reward_change_max": 0.0, "reward_change_mean": -0.3135349079966545, "reward_change_min": -0.4433485083281994, "reward_change_std": 0.1674872562289238, "reward_std": 0.5025767236948013, "rewards/cosine_scaled_reward": 0.021865406539291143, "rewards/format_reward": 0.9791666716337204, "step": 403 }, { "advantage_max": 1.4135560542345047, "advantage_mean": -3.601114090256985e-08, "advantage_min": -1.425866760313511, "advantage_std": 0.9998155757784843, "completion_length": 1190.395851135254, "epoch": 0.4617142857142857, "grad_norm": 3.522855281829834, "kl": 0.6092529296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.9733794420337213e-07, "loss": 0.0244, "reward": 0.5564087391830981, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5564087391830981, "reward_after_std": 0.6142255328595638, "reward_before_mean": 0.8380017122253776, "reward_before_std": 0.6324377954006195, "reward_change_max": 0.002075694501399994, "reward_change_mean": -0.28159296326339245, "reward_change_min": -0.44256654381752014, "reward_change_std": 0.1803694237023592, "reward_std": 0.6142255514860153, "rewards/cosine_scaled_reward": -0.039332504384219646, "rewards/format_reward": 0.9166666865348816, "step": 404 }, { "advantage_max": 1.6321633905172348, "advantage_mean": -3.725290298461914e-09, "advantage_min": -1.157719410955906, "advantage_std": 0.999863937497139, "completion_length": 943.4792022705078, "epoch": 0.46285714285714286, "grad_norm": 2.017138719558716, "kl": 0.25444793701171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.9539516087697517e-07, "loss": 0.0102, "reward": 0.8636754900217056, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.8636754900217056, "reward_after_std": 0.8582662865519524, "reward_before_mean": 1.1882897391915321, "reward_before_std": 0.8196110427379608, "reward_change_max": 0.0, "reward_change_mean": -0.3246142454445362, "reward_change_min": -0.5169795639812946, "reward_change_std": 0.2024751529097557, "reward_std": 0.8582663163542747, "rewards/cosine_scaled_reward": 0.1253948686644435, "rewards/format_reward": 0.9375000149011612, "step": 405 }, { "advantage_max": 1.6853253245353699, "advantage_mean": -5.898376320701004e-08, "advantage_min": -0.9819885492324829, "advantage_std": 0.9998152479529381, "completion_length": 1081.6250228881836, "epoch": 0.464, "grad_norm": 1.2055500745773315, "kl": 0.2242889404296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.934696604901642e-07, "loss": 0.009, "reward": 0.619488287717104, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.619488287717104, "reward_after_std": 0.7117302119731903, "reward_before_mean": 0.9034831523895264, "reward_before_std": 0.6682868003845215, "reward_change_max": 0.0012862235307693481, "reward_change_mean": -0.28399485163390636, "reward_change_min": -0.486436128616333, "reward_change_std": 0.1785287642851472, "reward_std": 0.7117302231490612, "rewards/cosine_scaled_reward": -0.027425101026892662, "rewards/format_reward": 0.9583333432674408, "step": 406 }, { "advantage_max": 1.6408893316984177, "advantage_mean": 1.9868215517249155e-08, "advantage_min": -1.091450996696949, "advantage_std": 0.9997433796525002, "completion_length": 952.7500305175781, "epoch": 0.46514285714285714, "grad_norm": 1.565846562385559, "kl": 0.438507080078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.915615368891117e-07, "loss": 0.0176, "reward": 0.612242775503546, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.612242775503546, "reward_after_std": 0.621722761541605, "reward_before_mean": 0.9034815113991499, "reward_before_std": 0.596266932785511, "reward_change_max": 0.0011898577213287354, "reward_change_mean": -0.29123874474316835, "reward_change_min": -0.4727095998823643, "reward_change_std": 0.1861831620335579, "reward_std": 0.6217227801680565, "rewards/cosine_scaled_reward": -0.006592577323317528, "rewards/format_reward": 0.9166666716337204, "step": 407 }, { "advantage_max": 1.6723837852478027, "advantage_mean": 4.346171422753287e-09, "advantage_min": -0.966572105884552, "advantage_std": 0.9998452588915825, "completion_length": 1221.7291870117188, "epoch": 0.4662857142857143, "grad_norm": 0.8200952410697937, "kl": 0.225555419921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8967088307307e-07, "loss": 0.009, "reward": 0.7919215075671673, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7919215075671673, "reward_after_std": 0.7756611332297325, "reward_before_mean": 1.1055903919041157, "reward_before_std": 0.7286336794495583, "reward_change_max": 0.0, "reward_change_mean": -0.3136688694357872, "reward_change_min": -0.5055144652724266, "reward_change_std": 0.18916036747395992, "reward_std": 0.7756611555814743, "rewards/cosine_scaled_reward": 0.06321184895932674, "rewards/format_reward": 0.9791666716337204, "step": 408 }, { "advantage_max": 1.710029736161232, "advantage_mean": -2.4835268952472234e-08, "advantage_min": -0.9915341734886169, "advantage_std": 0.9998149424791336, "completion_length": 1522.7292175292969, "epoch": 0.4674285714285714, "grad_norm": 3.4706716537475586, "kl": 1.00244140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8779779118983867e-07, "loss": 0.04, "reward": 0.482068314217031, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.482068314217031, "reward_after_std": 0.6969355121254921, "reward_before_mean": 0.7364165559411049, "reward_before_std": 0.6672717742621899, "reward_change_max": 0.0, "reward_change_mean": -0.25434826500713825, "reward_change_min": -0.4222247414290905, "reward_change_std": 0.15766187477856874, "reward_std": 0.6969355382025242, "rewards/cosine_scaled_reward": -0.10054172901436687, "rewards/format_reward": 0.9375000149011612, "step": 409 }, { "advantage_max": 1.6289841532707214, "advantage_mean": 4.656615093523442e-10, "advantage_min": -1.147033229470253, "advantage_std": 0.9998533651232719, "completion_length": 1149.8125228881836, "epoch": 0.4685714285714286, "grad_norm": 2.2747011184692383, "kl": 0.551116943359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8594235253127372e-07, "loss": 0.0221, "reward": 0.5382617581635714, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5382617581635714, "reward_after_std": 0.8370620533823967, "reward_before_mean": 0.794367466121912, "reward_before_std": 0.8302499540150166, "reward_change_max": 0.0013188868761062622, "reward_change_mean": -0.2561057098209858, "reward_change_min": -0.4616791568696499, "reward_change_std": 0.17526278086006641, "reward_std": 0.8370620794594288, "rewards/cosine_scaled_reward": -0.05073293065652251, "rewards/format_reward": 0.8958333507180214, "step": 410 }, { "advantage_max": 1.6610813438892365, "advantage_mean": -2.2972624524886243e-08, "advantage_min": -1.0270356684923172, "advantage_std": 0.9998531341552734, "completion_length": 1607.1875457763672, "epoch": 0.4697142857142857, "grad_norm": 1.5188755989074707, "kl": 0.5805892944335938, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8410465752883758e-07, "loss": 0.0232, "reward": 0.5008664312772453, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5008664312772453, "reward_after_std": 0.8480726853013039, "reward_before_mean": 0.7502545667812228, "reward_before_std": 0.8465875536203384, "reward_change_max": 0.0003954470157623291, "reward_change_mean": -0.2493881806731224, "reward_change_min": -0.45750103518366814, "reward_change_std": 0.17382167372852564, "reward_std": 0.8480726890265942, "rewards/cosine_scaled_reward": -0.05195604544132948, "rewards/format_reward": 0.8541666828095913, "step": 411 }, { "advantage_max": 1.6989116072654724, "advantage_mean": -3.8649887429409446e-08, "advantage_min": -1.031688578426838, "advantage_std": 0.9998216927051544, "completion_length": 970.8541946411133, "epoch": 0.47085714285714286, "grad_norm": 1.3265846967697144, "kl": 0.204864501953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.822847957491922e-07, "loss": 0.0082, "reward": 0.5912466086447239, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5912466086447239, "reward_after_std": 0.7558763138949871, "reward_before_mean": 0.8657120540738106, "reward_before_std": 0.7238161973655224, "reward_change_max": 0.0, "reward_change_mean": -0.2744654770940542, "reward_change_min": -0.46801483258605003, "reward_change_std": 0.17276459746062756, "reward_std": 0.7558763138949871, "rewards/cosine_scaled_reward": -0.03589397203177214, "rewards/format_reward": 0.9375000149011612, "step": 412 }, { "advantage_max": 1.7271312475204468, "advantage_mean": -6.146729214506763e-08, "advantage_min": -1.0142634138464928, "advantage_std": 0.9998214244842529, "completion_length": 1049.8541831970215, "epoch": 0.472, "grad_norm": 1.6498157978057861, "kl": 0.3886566162109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.804828558898332e-07, "loss": 0.0156, "reward": 0.8821854656562209, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.8821854656562209, "reward_after_std": 0.6554340831935406, "reward_before_mean": 1.2208792939782143, "reward_before_std": 0.575843945145607, "reward_change_max": 0.0, "reward_change_mean": -0.3386938404291868, "reward_change_min": -0.4963124096393585, "reward_change_std": 0.19166480377316475, "reward_std": 0.6554341055452824, "rewards/cosine_scaled_reward": 0.12085631024092436, "rewards/format_reward": 0.9791666716337204, "step": 413 }, { "advantage_max": 1.741827353835106, "advantage_mean": -2.9181441818515452e-08, "advantage_min": -0.9508061856031418, "advantage_std": 0.999826967716217, "completion_length": 1378.1042175292969, "epoch": 0.47314285714285714, "grad_norm": 2.6625030040740967, "kl": 0.2706451416015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7869892577476722e-07, "loss": 0.0108, "reward": 0.3892311230301857, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3892311230301857, "reward_after_std": 0.6967345997691154, "reward_before_mean": 0.6230743918567896, "reward_before_std": 0.6546646431088448, "reward_change_max": 0.0008612871170043945, "reward_change_mean": -0.23384328000247478, "reward_change_min": -0.3955848142504692, "reward_change_std": 0.14179929625242949, "reward_std": 0.6967346221208572, "rewards/cosine_scaled_reward": -0.1572128008119762, "rewards/format_reward": 0.9375000074505806, "step": 414 }, { "advantage_max": 1.6893931478261948, "advantage_mean": -6.208817238118058e-09, "advantage_min": -1.1113485097885132, "advantage_std": 0.9998065233230591, "completion_length": 1258.7916870117188, "epoch": 0.4742857142857143, "grad_norm": 26.61659049987793, "kl": 0.7390365600585938, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7693309235023127e-07, "loss": 0.0295, "reward": 0.37299776542931795, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37299776542931795, "reward_after_std": 0.6642138287425041, "reward_before_mean": 0.6022981218993664, "reward_before_std": 0.6314213052392006, "reward_change_max": 0.0, "reward_change_mean": -0.22930035181343555, "reward_change_min": -0.37265395000576973, "reward_change_std": 0.13867665268480778, "reward_std": 0.6642138548195362, "rewards/cosine_scaled_reward": -0.09468427952378988, "rewards/format_reward": 0.7916666828095913, "step": 415 }, { "advantage_max": 1.690375730395317, "advantage_mean": -2.0799537758797726e-08, "advantage_min": -1.0659868568181992, "advantage_std": 0.9998829066753387, "completion_length": 1272.0208740234375, "epoch": 0.4754285714285714, "grad_norm": 1.6187878847122192, "kl": 0.358184814453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7518544168045524e-07, "loss": 0.0143, "reward": 0.6220204895362258, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6220204895362258, "reward_after_std": 1.002012237906456, "reward_before_mean": 0.8834403976798058, "reward_before_std": 0.9930763803422451, "reward_change_max": 0.0028750449419021606, "reward_change_mean": -0.2614199183881283, "reward_change_min": -0.4997607283294201, "reward_change_std": 0.18725214153528214, "reward_std": 1.002012237906456, "rewards/cosine_scaled_reward": -0.0061964658088982105, "rewards/format_reward": 0.8958333432674408, "step": 416 }, { "advantage_max": 1.5955565720796585, "advantage_mean": -3.8494667453647935e-08, "advantage_min": -1.0881210714578629, "advantage_std": 0.9998477771878242, "completion_length": 1262.7708740234375, "epoch": 0.4765714285714286, "grad_norm": 1.8294861316680908, "kl": 0.29937744140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7345605894346726e-07, "loss": 0.012, "reward": 0.5803577015176415, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5803577015176415, "reward_after_std": 0.7609933242201805, "reward_before_mean": 0.8526553846895695, "reward_before_std": 0.7512392140924931, "reward_change_max": 0.0019479095935821533, "reward_change_mean": -0.2722976878285408, "reward_change_min": -0.502078078687191, "reward_change_std": 0.1809261217713356, "reward_std": 0.7609933689236641, "rewards/cosine_scaled_reward": -0.05283900024369359, "rewards/format_reward": 0.9583333432674408, "step": 417 }, { "advantage_max": 1.694640338420868, "advantage_mean": -3.430371520174447e-08, "advantage_min": -1.11370088160038, "advantage_std": 0.9998883605003357, "completion_length": 893.958366394043, "epoch": 0.4777142857142857, "grad_norm": 1.4542772769927979, "kl": 0.157196044921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7174502842694212e-07, "loss": 0.0063, "reward": 1.1903546750545502, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.1903546750545502, "reward_after_std": 0.9878413453698158, "reward_before_mean": 1.5717356577515602, "reward_before_std": 0.9362938571721315, "reward_change_max": 0.0, "reward_change_mean": -0.38138093799352646, "reward_change_min": -0.5612896308302879, "reward_change_std": 0.22556099202483892, "reward_std": 0.9878413639962673, "rewards/cosine_scaled_reward": 0.31711778859607875, "rewards/format_reward": 0.9375000149011612, "step": 418 }, { "advantage_max": 1.618606060743332, "advantage_mean": -4.4082603345430016e-08, "advantage_min": -1.005763828754425, "advantage_std": 0.999868668615818, "completion_length": 1429.7083892822266, "epoch": 0.47885714285714287, "grad_norm": 2.259946346282959, "kl": 0.345733642578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7005243352409333e-07, "loss": 0.0139, "reward": 0.9187582801096141, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.9187582801096141, "reward_after_std": 0.8637475855648518, "reward_before_mean": 1.2546281795948744, "reward_before_std": 0.8357833307236433, "reward_change_max": 0.00046975165605545044, "reward_change_mean": -0.3358698934316635, "reward_change_min": -0.5807428881525993, "reward_change_std": 0.21599006466567516, "reward_std": 0.8637476190924644, "rewards/cosine_scaled_reward": 0.17939739441499114, "rewards/format_reward": 0.8958333507180214, "step": 419 }, { "advantage_max": 1.688274398446083, "advantage_mean": -2.5766592193221527e-08, "advantage_min": -1.0731761306524277, "advantage_std": 0.9998203814029694, "completion_length": 906.9792022705078, "epoch": 0.48, "grad_norm": 2.0228357315063477, "kl": 0.1790008544921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6837835672960831e-07, "loss": 0.0071, "reward": 0.39381164871156216, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.39381164871156216, "reward_after_std": 0.7148094028234482, "reward_before_mean": 0.6268439115956426, "reward_before_std": 0.6882984191179276, "reward_change_max": 0.0, "reward_change_mean": -0.2330322489142418, "reward_change_min": -0.40335175208747387, "reward_change_std": 0.1487152185291052, "reward_std": 0.7148094102740288, "rewards/cosine_scaled_reward": -0.1449114013230428, "rewards/format_reward": 0.9166666865348816, "step": 420 }, { "advantage_max": 1.6927991807460785, "advantage_mean": -1.0554989104960555e-08, "advantage_min": -1.1053090691566467, "advantage_std": 0.9997538402676582, "completion_length": 1260.5833702087402, "epoch": 0.48114285714285715, "grad_norm": 1.5715601444244385, "kl": 0.400909423828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6672287963562852e-07, "loss": 0.0161, "reward": 0.3637576922774315, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3637576922774315, "reward_after_std": 0.49668950214982033, "reward_before_mean": 0.606179989874363, "reward_before_std": 0.45100926607847214, "reward_change_max": 0.0, "reward_change_mean": -0.24242228642106056, "reward_change_min": -0.3647758923470974, "reward_change_std": 0.13684486132115126, "reward_std": 0.4966895170509815, "rewards/cosine_scaled_reward": -0.1760766813531518, "rewards/format_reward": 0.9583333432674408, "step": 421 }, { "advantage_max": 1.7518580704927444, "advantage_mean": -3.942599113848644e-08, "advantage_min": -0.9576732888817787, "advantage_std": 0.9998401924967766, "completion_length": 1086.9166946411133, "epoch": 0.48228571428571426, "grad_norm": 1.1396833658218384, "kl": 0.22650146484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6508608292777203e-07, "loss": 0.009, "reward": 0.6568402461707592, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6568402461707592, "reward_after_std": 0.7277768515050411, "reward_before_mean": 0.9418208077549934, "reward_before_std": 0.6685466133058071, "reward_change_max": 0.0, "reward_change_mean": -0.2849805485457182, "reward_change_min": -0.44854456931352615, "reward_change_std": 0.16233503445982933, "reward_std": 0.7277768775820732, "rewards/cosine_scaled_reward": -0.029089616611599922, "rewards/format_reward": 1.0, "step": 422 }, { "advantage_max": 1.6433405727148056, "advantage_mean": -3.2285850992685994e-08, "advantage_min": -1.0724581107497215, "advantage_std": 0.9997552186250687, "completion_length": 1292.5000457763672, "epoch": 0.48342857142857143, "grad_norm": 1.9447253942489624, "kl": 0.39086151123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6346804638120098e-07, "loss": 0.0157, "reward": 0.33056771755218506, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33056771755218506, "reward_after_std": 0.5717285163700581, "reward_before_mean": 0.56210732832551, "reward_before_std": 0.5475243860855699, "reward_change_max": 0.0008535012602806091, "reward_change_mean": -0.23153959307819605, "reward_change_min": -0.40224830619990826, "reward_change_std": 0.14767762832343578, "reward_std": 0.571728527545929, "rewards/cosine_scaled_reward": -0.1772796856239438, "rewards/format_reward": 0.9166666716337204, "step": 423 }, { "advantage_max": 1.6328357756137848, "advantage_mean": -4.967053657267684e-09, "advantage_min": -1.156449869275093, "advantage_std": 0.9997916594147682, "completion_length": 1412.4375305175781, "epoch": 0.4845714285714286, "grad_norm": 3.788841485977173, "kl": 0.7708892822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6186884885673413e-07, "loss": 0.0309, "reward": 0.30179769173264503, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30179769173264503, "reward_after_std": 0.5867325812578201, "reward_before_mean": 0.526239525526762, "reward_before_std": 0.5687926262617111, "reward_change_max": 0.0, "reward_change_mean": -0.22444182448089123, "reward_change_min": -0.3760899640619755, "reward_change_std": 0.14267886988818645, "reward_std": 0.5867325849831104, "rewards/cosine_scaled_reward": -0.1847969237715006, "rewards/format_reward": 0.8958333507180214, "step": 424 }, { "advantage_max": 1.5353441387414932, "advantage_mean": -8.692344177774203e-08, "advantage_min": -1.2177848778665066, "advantage_std": 0.9998636916279793, "completion_length": 1029.3125381469727, "epoch": 0.4857142857142857, "grad_norm": 1.9667916297912598, "kl": 0.15512847900390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6028856829700258e-07, "loss": 0.0062, "reward": 1.059104137122631, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.059104137122631, "reward_after_std": 0.8383343629539013, "reward_before_mean": 1.431776024401188, "reward_before_std": 0.8190486012026668, "reward_change_max": 0.0023517385125160217, "reward_change_mean": -0.3726718984544277, "reward_change_min": -0.5964533090591431, "reward_change_std": 0.23735353536903858, "reward_std": 0.8383343853056431, "rewards/cosine_scaled_reward": 0.25755466148257256, "rewards/format_reward": 0.9166666865348816, "step": 425 }, { "advantage_max": 1.7242664843797684, "advantage_mean": -2.9181440985848184e-08, "advantage_min": -1.0022350773215294, "advantage_std": 0.9997998252511024, "completion_length": 976.8958587646484, "epoch": 0.4868571428571429, "grad_norm": 1.1338958740234375, "kl": 0.20705413818359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5872728172265146e-07, "loss": 0.0083, "reward": 0.5903048403561115, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5903048403561115, "reward_after_std": 0.5925074368715286, "reward_before_mean": 0.8734835237264633, "reward_before_std": 0.5249487236142159, "reward_change_max": 0.0, "reward_change_mean": -0.28317867405712605, "reward_change_min": -0.4353860914707184, "reward_change_std": 0.15731794014573097, "reward_std": 0.5925074480473995, "rewards/cosine_scaled_reward": -0.052841583266854286, "rewards/format_reward": 0.9791666716337204, "step": 426 }, { "advantage_max": 1.4864351898431778, "advantage_mean": -3.849466678751412e-08, "advantage_min": -1.1698247194290161, "advantage_std": 0.9998398199677467, "completion_length": 1417.645896911621, "epoch": 0.488, "grad_norm": 1.3676238059997559, "kl": 0.2432708740234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5718506522858572e-07, "loss": 0.0097, "reward": 0.7394802048802376, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.7394802048802376, "reward_after_std": 0.795317318290472, "reward_before_mean": 1.04683218896389, "reward_before_std": 0.8088573627173901, "reward_change_max": 0.0, "reward_change_mean": -0.3073519878089428, "reward_change_min": -0.5503535270690918, "reward_change_std": 0.20516410283744335, "reward_std": 0.7953173480927944, "rewards/cosine_scaled_reward": 0.03383274283260107, "rewards/format_reward": 0.9791666716337204, "step": 427 }, { "advantage_max": 1.6609529703855515, "advantage_mean": -4.718701163142214e-08, "advantage_min": -1.1213230341672897, "advantage_std": 0.9997992888092995, "completion_length": 1233.4792022705078, "epoch": 0.48914285714285716, "grad_norm": 2.9406940937042236, "kl": 0.372589111328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5566199398026147e-07, "loss": 0.0149, "reward": 0.36294333823025227, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36294333823025227, "reward_after_std": 0.645927868783474, "reward_before_mean": 0.5969743453897536, "reward_before_std": 0.6330409869551659, "reward_change_max": 0.002034500241279602, "reward_change_mean": -0.23403102438896894, "reward_change_min": -0.42131222784519196, "reward_change_std": 0.1592898527160287, "reward_std": 0.6459279023110867, "rewards/cosine_scaled_reward": -0.13901282846927643, "rewards/format_reward": 0.8750000149011612, "step": 428 }, { "advantage_max": 1.697268322110176, "advantage_mean": 1.1486313233888268e-08, "advantage_min": -1.1414394900202751, "advantage_std": 0.999734528362751, "completion_length": 794.1458511352539, "epoch": 0.49028571428571427, "grad_norm": 0.8047446608543396, "kl": 0.1827239990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5415814221002265e-07, "loss": 0.0073, "reward": 0.45462355855852365, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.45462355855852365, "reward_after_std": 0.4094177149236202, "reward_before_mean": 0.7234781309962273, "reward_before_std": 0.3584022521972656, "reward_change_max": 0.0, "reward_change_mean": -0.268854558467865, "reward_change_min": -0.38506653904914856, "reward_change_std": 0.14412853959947824, "reward_std": 0.4094177260994911, "rewards/cosine_scaled_reward": -0.13826094195246696, "rewards/format_reward": 1.0, "step": 429 }, { "advantage_max": 1.6596029549837112, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -1.1022100150585175, "advantage_std": 0.9998620226979256, "completion_length": 1144.750015258789, "epoch": 0.49142857142857144, "grad_norm": 2.4741756916046143, "kl": 0.2149505615234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5267358321348285e-07, "loss": 0.0086, "reward": 0.7748331986367702, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7748331986367702, "reward_after_std": 0.9026194624602795, "reward_before_mean": 1.0760859698057175, "reward_before_std": 0.8741047717630863, "reward_change_max": 0.0, "reward_change_mean": -0.3012527599930763, "reward_change_min": -0.512249581515789, "reward_change_std": 0.19205154851078987, "reward_std": 0.9026195108890533, "rewards/cosine_scaled_reward": 0.05887631943915039, "rewards/format_reward": 0.9583333432674408, "step": 430 }, { "advantage_max": 1.8094293773174286, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.9238940328359604, "advantage_std": 0.9997945874929428, "completion_length": 1009.0833435058594, "epoch": 0.49257142857142855, "grad_norm": 2.0188703536987305, "kl": 0.3837890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5120838934595337e-07, "loss": 0.0154, "reward": 0.41635072650387883, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41635072650387883, "reward_after_std": 0.6224270965903997, "reward_before_mean": 0.6578373126685619, "reward_before_std": 0.5729903429746628, "reward_change_max": 0.0, "reward_change_mean": -0.24148658476769924, "reward_change_min": -0.38924745842814445, "reward_change_std": 0.14313186053186655, "reward_std": 0.62242710031569, "rewards/cosine_scaled_reward": -0.13983136042952538, "rewards/format_reward": 0.9375000149011612, "step": 431 }, { "advantage_max": 1.641411080956459, "advantage_mean": -1.7384689243726825e-08, "advantage_min": -1.1451390460133553, "advantage_std": 0.9998093023896217, "completion_length": 1437.0208587646484, "epoch": 0.4937142857142857, "grad_norm": 3.1914052963256836, "kl": 0.754730224609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4976263201891613e-07, "loss": 0.0301, "reward": 0.2787760675419122, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2787760675419122, "reward_after_std": 0.6417630799114704, "reward_before_mean": 0.49448305927217007, "reward_before_std": 0.6182979214936495, "reward_change_max": 1.4379620552062988e-06, "reward_change_mean": -0.21570699103176594, "reward_change_min": -0.3621121309697628, "reward_change_std": 0.1398151321336627, "reward_std": 0.6417631022632122, "rewards/cosine_scaled_reward": -0.16942514950642362, "rewards/format_reward": 0.8333333432674408, "step": 432 }, { "advantage_max": 1.5136045515537262, "advantage_mean": -1.9868215850316062e-08, "advantage_min": -1.2275639027357101, "advantage_std": 0.9998220428824425, "completion_length": 1271.7917098999023, "epoch": 0.4948571428571429, "grad_norm": 1.8898147344589233, "kl": 0.28757476806640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.483363816965435e-07, "loss": 0.0115, "reward": 0.5753043964505196, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5753043964505196, "reward_after_std": 0.7662338018417358, "reward_before_mean": 0.8524379003793001, "reward_before_std": 0.7830417305231094, "reward_change_max": 0.0003195628523826599, "reward_change_mean": -0.2771335020661354, "reward_change_min": -0.4404204413294792, "reward_change_std": 0.1899934383109212, "reward_std": 0.7662338204681873, "rewards/cosine_scaled_reward": -0.021697734715417027, "rewards/format_reward": 0.8958333507180214, "step": 433 }, { "advantage_max": 1.7336640357971191, "advantage_mean": 1.862645149230957e-09, "advantage_min": -1.012288175523281, "advantage_std": 0.9997320026159286, "completion_length": 1327.5208435058594, "epoch": 0.496, "grad_norm": 1.9366543292999268, "kl": 0.615142822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.469297078922642e-07, "loss": 0.0246, "reward": 0.17344986740499735, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17344986740499735, "reward_after_std": 0.40367303043603897, "reward_before_mean": 0.3821178646758199, "reward_before_std": 0.36095103435218334, "reward_change_max": 0.0, "reward_change_mean": -0.20866799354553223, "reward_change_min": -0.3159416187554598, "reward_change_std": 0.11741238739341497, "reward_std": 0.40367304161190987, "rewards/cosine_scaled_reward": -0.2672744058072567, "rewards/format_reward": 0.9166666865348816, "step": 434 }, { "advantage_max": 1.6692968308925629, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -1.1007107272744179, "advantage_std": 0.9997232183814049, "completion_length": 867.9375152587891, "epoch": 0.49714285714285716, "grad_norm": 1.5377548933029175, "kl": 0.3750457763671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4554267916537495e-07, "loss": 0.015, "reward": 0.34692182997241616, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.34692182997241616, "reward_after_std": 0.40469590574502945, "reward_before_mean": 0.5923015549778938, "reward_before_std": 0.35693977400660515, "reward_change_max": 0.0, "reward_change_mean": -0.2453797236084938, "reward_change_min": -0.3618598096072674, "reward_change_std": 0.13376801926642656, "reward_std": 0.40469592064619064, "rewards/cosine_scaled_reward": -0.1934325685724616, "rewards/format_reward": 0.9791666716337204, "step": 435 }, { "advantage_max": 1.5167209059000015, "advantage_mean": -2.0644317699769488e-08, "advantage_min": -1.2784735634922981, "advantage_std": 0.9998432099819183, "completion_length": 813.3333549499512, "epoch": 0.4982857142857143, "grad_norm": 1.6000545024871826, "kl": 0.24234771728515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4417536311769885e-07, "loss": 0.0097, "reward": 0.9885508413426578, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9885508413426578, "reward_after_std": 0.7530980035662651, "reward_before_mean": 1.350018784403801, "reward_before_std": 0.7326938565820456, "reward_change_max": 0.0017473623156547546, "reward_change_mean": -0.36146787740290165, "reward_change_min": -0.5618167705833912, "reward_change_std": 0.22278902772814035, "reward_std": 0.7530980110168457, "rewards/cosine_scaled_reward": 0.19584268890321255, "rewards/format_reward": 0.9583333432674408, "step": 436 }, { "advantage_max": 1.662267044186592, "advantage_mean": -4.066775316502458e-08, "advantage_min": -1.2343645691871643, "advantage_std": 0.9997730180621147, "completion_length": 1176.9792022705078, "epoch": 0.49942857142857144, "grad_norm": 1.9178744554519653, "kl": 0.27816009521484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4282782639029128e-07, "loss": 0.0111, "reward": 0.5513465432450175, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5513465432450175, "reward_after_std": 0.5375252179801464, "reward_before_mean": 0.8317223340272903, "reward_before_std": 0.49098595418035984, "reward_change_max": 0.0, "reward_change_mean": -0.2803757805377245, "reward_change_min": -0.42643988877534866, "reward_change_std": 0.15675612725317478, "reward_std": 0.5375252217054367, "rewards/cosine_scaled_reward": -0.06330552324652672, "rewards/format_reward": 0.9583333432674408, "step": 437 }, { "advantage_max": 1.5528530925512314, "advantage_mean": -3.104408696774641e-08, "advantage_min": -1.2435024604201317, "advantage_std": 0.9998113289475441, "completion_length": 1343.6250381469727, "epoch": 0.5005714285714286, "grad_norm": 2.4138340950012207, "kl": 0.6477813720703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4150013466019114e-07, "loss": 0.0259, "reward": 0.4180278740823269, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4180278740823269, "reward_after_std": 0.61670046672225, "reward_before_mean": 0.6680997647345066, "reward_before_std": 0.6113879233598709, "reward_change_max": 0.0, "reward_change_mean": -0.2500718683004379, "reward_change_min": -0.42597054317593575, "reward_change_std": 0.1606674799695611, "reward_std": 0.6167004853487015, "rewards/cosine_scaled_reward": -0.10345013532787561, "rewards/format_reward": 0.8750000149011612, "step": 438 }, { "advantage_max": 1.8177452832460403, "advantage_mean": -8.071462720415923e-09, "advantage_min": -0.8807521760463715, "advantage_std": 0.9997581467032433, "completion_length": 908.2500228881836, "epoch": 0.5017142857142857, "grad_norm": 0.8225669264793396, "kl": 0.2245941162109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4019235263722034e-07, "loss": 0.009, "reward": 0.27506811420107624, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27506811420107624, "reward_after_std": 0.5774744208902121, "reward_before_mean": 0.49151327461004257, "reward_before_std": 0.5247315559536219, "reward_change_max": 0.0, "reward_change_mean": -0.21644516475498676, "reward_change_min": -0.3448958247900009, "reward_change_std": 0.12561427615582943, "reward_std": 0.5774744469672441, "rewards/cosine_scaled_reward": -0.2438267096877098, "rewards/format_reward": 0.9791666716337204, "step": 439 }, { "advantage_max": 1.688047081232071, "advantage_mean": -9.31322552411018e-09, "advantage_min": -1.1031463593244553, "advantage_std": 0.9997521713376045, "completion_length": 1091.2500190734863, "epoch": 0.5028571428571429, "grad_norm": 2.169895648956299, "kl": 0.50408935546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3890454406082956e-07, "loss": 0.0202, "reward": 0.2982004745863378, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2982004745863378, "reward_after_std": 0.47428241185843945, "reward_before_mean": 0.5280682481825352, "reward_before_std": 0.4377642162144184, "reward_change_max": 0.0, "reward_change_mean": -0.2298677545040846, "reward_change_min": -0.364645067602396, "reward_change_std": 0.13350447546690702, "reward_std": 0.4742824211716652, "rewards/cosine_scaled_reward": -0.20471589546650648, "rewards/format_reward": 0.9375000149011612, "step": 440 }, { "advantage_max": 1.6149345934391022, "advantage_mean": -1.2417635586459141e-08, "advantage_min": -1.1340602338314056, "advantage_std": 0.9997862130403519, "completion_length": 1496.5000534057617, "epoch": 0.504, "grad_norm": 2.867658853530884, "kl": 0.64959716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3763677169699217e-07, "loss": 0.026, "reward": 0.6025845520198345, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6025845520198345, "reward_after_std": 0.5532146245241165, "reward_before_mean": 0.8909978433512151, "reward_before_std": 0.5169778261333704, "reward_change_max": 0.0, "reward_change_mean": -0.28841328993439674, "reward_change_min": -0.45017037354409695, "reward_change_std": 0.16753645054996014, "reward_std": 0.5532146394252777, "rewards/cosine_scaled_reward": 0.00799890048801899, "rewards/format_reward": 0.8750000149011612, "step": 441 }, { "advantage_max": 1.6888891458511353, "advantage_mean": -8.568168063938231e-08, "advantage_min": -1.0125275775790215, "advantage_std": 0.9998335763812065, "completion_length": 1133.1667022705078, "epoch": 0.5051428571428571, "grad_norm": 2.600919246673584, "kl": 0.41729736328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3638909733514452e-07, "loss": 0.0167, "reward": 0.6196513641625643, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.6196513641625643, "reward_after_std": 0.7795599326491356, "reward_before_mean": 0.8948934227228165, "reward_before_std": 0.7337003983557224, "reward_change_max": 0.0004445314407348633, "reward_change_mean": -0.2752421023324132, "reward_change_min": -0.4447060916572809, "reward_change_std": 0.17305165994912386, "reward_std": 0.7795599810779095, "rewards/cosine_scaled_reward": -0.010886628180742264, "rewards/format_reward": 0.916666679084301, "step": 442 }, { "advantage_max": 1.57286237180233, "advantage_mean": -3.9736430812453705e-08, "advantage_min": -1.2943106442689896, "advantage_std": 0.9998047053813934, "completion_length": 1557.0208587646484, "epoch": 0.5062857142857143, "grad_norm": 2.567955732345581, "kl": 0.8605499267578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.351615817851748e-07, "loss": 0.0345, "reward": 0.48115325393155217, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.48115325393155217, "reward_after_std": 0.6308228820562363, "reward_before_mean": 0.741462922655046, "reward_before_std": 0.6177244447171688, "reward_change_max": 0.0005853325128555298, "reward_change_mean": -0.260309673845768, "reward_change_min": -0.4191274531185627, "reward_change_std": 0.1612487519159913, "reward_std": 0.6308228969573975, "rewards/cosine_scaled_reward": -0.07718522319191834, "rewards/format_reward": 0.8958333432674408, "step": 443 }, { "advantage_max": 1.7246465533971786, "advantage_mean": -1.9247333948868572e-08, "advantage_min": -0.9812377840280533, "advantage_std": 0.9998320639133453, "completion_length": 1051.0000381469727, "epoch": 0.5074285714285715, "grad_norm": 2.602076768875122, "kl": 0.476654052734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3395428487445914e-07, "loss": 0.0191, "reward": 0.5566119570285082, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5566119570285082, "reward_after_std": 0.7091410793364048, "reward_before_mean": 0.8225421812385321, "reward_before_std": 0.6658905595541, "reward_change_max": 0.0, "reward_change_mean": -0.2659302204847336, "reward_change_min": -0.4204677902162075, "reward_change_std": 0.15753941144794226, "reward_std": 0.7091410867869854, "rewards/cosine_scaled_reward": -0.07831226149573922, "rewards/format_reward": 0.9791666716337204, "step": 444 }, { "advantage_max": 1.588456466794014, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -1.1749219596385956, "advantage_std": 0.9998450726270676, "completion_length": 1186.7708435058594, "epoch": 0.5085714285714286, "grad_norm": 2.270045518875122, "kl": 0.49578857421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3276726544494571e-07, "loss": 0.0199, "reward": 0.42726368457078934, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.42726368457078934, "reward_after_std": 0.8470357619225979, "reward_before_mean": 0.6616728538647294, "reward_before_std": 0.8420430682599545, "reward_change_max": 0.0025559216737747192, "reward_change_mean": -0.2344091683626175, "reward_change_min": -0.4344114977866411, "reward_change_std": 0.16814141906797886, "reward_std": 0.8470357991755009, "rewards/cosine_scaled_reward": -0.12749691866338253, "rewards/format_reward": 0.9166666865348816, "step": 445 }, { "advantage_max": 1.7508121132850647, "advantage_mean": -9.313226689844356e-09, "advantage_min": -1.0302319675683975, "advantage_std": 0.9997759759426117, "completion_length": 1124.0416870117188, "epoch": 0.5097142857142857, "grad_norm": 6.001957893371582, "kl": 0.41686248779296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.316005813502869e-07, "loss": 0.0167, "reward": 0.5921612880192697, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5921612880192697, "reward_after_std": 0.5231526885181665, "reward_before_mean": 0.8797222785651684, "reward_before_std": 0.45472224801778793, "reward_change_max": 0.0, "reward_change_mean": -0.28756098076701164, "reward_change_min": -0.42036930844187737, "reward_change_std": 0.15772765688598156, "reward_std": 0.5231527201831341, "rewards/cosine_scaled_reward": -0.049722205847501755, "rewards/format_reward": 0.9791666716337204, "step": 446 }, { "advantage_max": 1.6464007794857025, "advantage_mean": 1.6608586214661436e-08, "advantage_min": -1.2194004356861115, "advantage_std": 0.9997731596231461, "completion_length": 929.0416870117188, "epoch": 0.5108571428571429, "grad_norm": 4.027428150177002, "kl": 0.464569091796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3045428945301953e-07, "loss": 0.0186, "reward": 0.49217029428109527, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.49217029428109527, "reward_after_std": 0.4973028674721718, "reward_before_mean": 0.7639967501163483, "reward_before_std": 0.461477916687727, "reward_change_max": 0.0, "reward_change_mean": -0.2718264479190111, "reward_change_min": -0.39409972354769707, "reward_change_std": 0.15628806222230196, "reward_std": 0.4973028898239136, "rewards/cosine_scaled_reward": -0.06591829285025597, "rewards/format_reward": 0.8958333507180214, "step": 447 }, { "advantage_max": 1.6986867785453796, "advantage_mean": -5.5879357807597785e-08, "advantage_min": -0.9982610195875168, "advantage_std": 0.9998103454709053, "completion_length": 972.4583587646484, "epoch": 0.512, "grad_norm": 2.053443193435669, "kl": 0.65228271484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2932844562179352e-07, "loss": 0.0261, "reward": 0.48171089054085314, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.48171089054085314, "reward_after_std": 0.5779558382928371, "reward_before_mean": 0.7415714748203754, "reward_before_std": 0.5251903366297483, "reward_change_max": 0.0009759292006492615, "reward_change_mean": -0.25986063852906227, "reward_change_min": -0.40990063920617104, "reward_change_std": 0.15151193924248219, "reward_std": 0.577955849468708, "rewards/cosine_scaled_reward": -0.06671425537206233, "rewards/format_reward": 0.8750000074505806, "step": 448 }, { "advantage_max": 1.6733713150024414, "advantage_mean": -3.352761368535795e-08, "advantage_min": -1.0281840190291405, "advantage_std": 0.9997676908969879, "completion_length": 963.9792022705078, "epoch": 0.5131428571428571, "grad_norm": 3.2843992710113525, "kl": 0.435089111328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2822310472864885e-07, "loss": 0.0174, "reward": 0.3808920937590301, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3808920937590301, "reward_after_std": 0.5548666939139366, "reward_before_mean": 0.6236836463212967, "reward_before_std": 0.5186882894486189, "reward_change_max": 0.0, "reward_change_mean": -0.2427915446460247, "reward_change_min": -0.3859280236065388, "reward_change_std": 0.14151952601969242, "reward_std": 0.5548667013645172, "rewards/cosine_scaled_reward": -0.15690819779410958, "rewards/format_reward": 0.9375000149011612, "step": 449 }, { "advantage_max": 1.6094936579465866, "advantage_mean": -1.5522043428362053e-08, "advantage_min": -1.2767504006624222, "advantage_std": 0.9997935369610786, "completion_length": 992.1458549499512, "epoch": 0.5142857142857142, "grad_norm": 0.9147229790687561, "kl": 0.232177734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2713832064634125e-07, "loss": 0.0093, "reward": 0.4996907636523247, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4996907636523247, "reward_after_std": 0.5958952084183693, "reward_before_mean": 0.7667517811059952, "reward_before_std": 0.5675084516406059, "reward_change_max": 0.0006358623504638672, "reward_change_mean": -0.2670610249042511, "reward_change_min": -0.40510354191064835, "reward_change_std": 0.15649499371647835, "reward_std": 0.5958952307701111, "rewards/cosine_scaled_reward": -0.09579078573733568, "rewards/format_reward": 0.9583333432674408, "step": 450 }, { "advantage_max": 1.7193011492490768, "advantage_mean": -4.8428775767384025e-08, "advantage_min": -1.0626923367381096, "advantage_std": 0.9998032078146935, "completion_length": 863.7708587646484, "epoch": 0.5154285714285715, "grad_norm": 2.7563045024871826, "kl": 0.323333740234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.260741462457165e-07, "loss": 0.0129, "reward": 0.5790690593421459, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5790690593421459, "reward_after_std": 0.6669084951281548, "reward_before_mean": 0.8539356961846352, "reward_before_std": 0.6240298300981522, "reward_change_max": 0.0, "reward_change_mean": -0.27486664801836014, "reward_change_min": -0.42612981237471104, "reward_change_std": 0.1603500172495842, "reward_std": 0.666908498853445, "rewards/cosine_scaled_reward": -0.052198843099176884, "rewards/format_reward": 0.9583333432674408, "step": 451 }, { "advantage_max": 1.642724797129631, "advantage_mean": -5.3395828869540196e-08, "advantage_min": -1.1115128174424171, "advantage_std": 0.9997927471995354, "completion_length": 1056.7500381469727, "epoch": 0.5165714285714286, "grad_norm": 3.9191229343414307, "kl": 0.5685272216796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2503063339313356e-07, "loss": 0.0227, "reward": 0.6581305470317602, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.6581305470317602, "reward_after_std": 0.5201870016753674, "reward_before_mean": 0.9607983604073524, "reward_before_std": 0.4658200293779373, "reward_change_max": 0.0, "reward_change_mean": -0.3026678394526243, "reward_change_min": -0.4494563788175583, "reward_change_std": 0.16615933552384377, "reward_std": 0.5201870128512383, "rewards/cosine_scaled_reward": 0.011649169027805328, "rewards/format_reward": 0.9375000149011612, "step": 452 }, { "advantage_max": 1.6432772874832153, "advantage_mean": -4.346172111091562e-08, "advantage_min": -1.0461668819189072, "advantage_std": 0.9998030662536621, "completion_length": 1105.0625305175781, "epoch": 0.5177142857142857, "grad_norm": 1.6952801942825317, "kl": 0.33880615234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2400783294793668e-07, "loss": 0.0135, "reward": 0.6465382017195225, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6465382017195225, "reward_after_std": 0.6454170644283295, "reward_before_mean": 0.9408576935529709, "reward_before_std": 0.6176839880645275, "reward_change_max": 0.0, "reward_change_mean": -0.2943194881081581, "reward_change_min": -0.4937918931245804, "reward_change_std": 0.17771916277706623, "reward_std": 0.645417083054781, "rewards/cosine_scaled_reward": -0.019154516980051994, "rewards/format_reward": 0.9791666716337204, "step": 453 }, { "advantage_max": 1.5095160454511642, "advantage_mean": -3.9736431256542915e-08, "advantage_min": -1.3069396615028381, "advantage_std": 0.99983149766922, "completion_length": 1135.6875305175781, "epoch": 0.5188571428571429, "grad_norm": 3.1263599395751953, "kl": 0.41485595703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2300579475997657e-07, "loss": 0.0166, "reward": 0.4435289604589343, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4435289604589343, "reward_after_std": 0.6856096424162388, "reward_before_mean": 0.6957365237176418, "reward_before_std": 0.6848980709910393, "reward_change_max": 0.0010104477405548096, "reward_change_mean": -0.25220758467912674, "reward_change_min": -0.42572787031531334, "reward_change_std": 0.16586639359593391, "reward_std": 0.6856096535921097, "rewards/cosine_scaled_reward": -0.10004841070622206, "rewards/format_reward": 0.895833358168602, "step": 454 }, { "advantage_max": 1.5948998034000397, "advantage_mean": -1.1796753074388988e-08, "advantage_min": -1.2343806698918343, "advantage_std": 0.9997700154781342, "completion_length": 1371.770851135254, "epoch": 0.52, "grad_norm": 3.6875624656677246, "kl": 0.72686767578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.220245676671809e-07, "loss": 0.029, "reward": 0.27308704424649477, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27308704424649477, "reward_after_std": 0.4806528389453888, "reward_before_mean": 0.4997571799904108, "reward_before_std": 0.45495717599987984, "reward_change_max": 0.0, "reward_change_mean": -0.22667013481259346, "reward_change_min": -0.3506789766252041, "reward_change_std": 0.13179008476436138, "reward_std": 0.48065285384655, "rewards/cosine_scaled_reward": -0.22928809002041817, "rewards/format_reward": 0.9583333432674408, "step": 455 }, { "advantage_max": 1.75045645236969, "advantage_mean": -2.0489097418696645e-08, "advantage_min": -0.8442424722015858, "advantage_std": 0.9998552426695824, "completion_length": 1283.333366394043, "epoch": 0.5211428571428571, "grad_norm": 1.8698930740356445, "kl": 0.563690185546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2106419949317388e-07, "loss": 0.0225, "reward": 0.4764702459797263, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4764702459797263, "reward_after_std": 0.8666420541703701, "reward_before_mean": 0.7162495022639632, "reward_before_std": 0.8417605478316545, "reward_change_max": 0.0, "reward_change_mean": -0.2397792637348175, "reward_change_min": -0.431858966127038, "reward_change_std": 0.16008818428963423, "reward_std": 0.8666420765221119, "rewards/cosine_scaled_reward": -0.05854193802224472, "rewards/format_reward": 0.833333358168602, "step": 456 }, { "advantage_max": 1.657249003648758, "advantage_mean": -5.4637594004702805e-08, "advantage_min": -1.111772559583187, "advantage_std": 0.9997870773077011, "completion_length": 1104.604190826416, "epoch": 0.5222857142857142, "grad_norm": 2.366246461868286, "kl": 0.4698486328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2012473704494537e-07, "loss": 0.0188, "reward": 0.5776587019208819, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5776587019208819, "reward_after_std": 0.6012655943632126, "reward_before_mean": 0.860280629247427, "reward_before_std": 0.5697008725255728, "reward_change_max": 0.0, "reward_change_mean": -0.2826219145208597, "reward_change_min": -0.46650177612900734, "reward_change_std": 0.1752561703324318, "reward_std": 0.6012655980885029, "rewards/cosine_scaled_reward": -0.007359715178608894, "rewards/format_reward": 0.8750000149011612, "step": 457 }, { "advantage_max": 1.7160254791378975, "advantage_mean": -1.0244548542814869e-08, "advantage_min": -0.9474887922406197, "advantage_std": 0.9998100101947784, "completion_length": 939.5625305175781, "epoch": 0.5234285714285715, "grad_norm": 0.805923581123352, "kl": 0.1466064453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1920622611056974e-07, "loss": 0.0059, "reward": 0.37487271800637245, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37487271800637245, "reward_after_std": 0.6352785937488079, "reward_before_mean": 0.6104363277554512, "reward_before_std": 0.5971984900534153, "reward_change_max": 0.0, "reward_change_mean": -0.2355636265128851, "reward_change_min": -0.3710940182209015, "reward_change_std": 0.1407453790307045, "reward_std": 0.6352786086499691, "rewards/cosine_scaled_reward": -0.18436517822556198, "rewards/format_reward": 0.9791666716337204, "step": 458 }, { "advantage_max": 1.5805244594812393, "advantage_mean": -3.228585032655218e-08, "advantage_min": -1.0805974081158638, "advantage_std": 0.9998439848423004, "completion_length": 1056.4167022705078, "epoch": 0.5245714285714286, "grad_norm": 1.8266743421554565, "kl": 0.5707550048828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1830871145697412e-07, "loss": 0.0228, "reward": 0.5873242821544409, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5873242821544409, "reward_after_std": 0.8205331340432167, "reward_before_mean": 0.8586870357394218, "reward_before_std": 0.8227408565580845, "reward_change_max": 0.0006539598107337952, "reward_change_mean": -0.27136277966201305, "reward_change_min": -0.48420753702521324, "reward_change_std": 0.18666737619787455, "reward_std": 0.8205331601202488, "rewards/cosine_scaled_reward": -0.04982315469533205, "rewards/format_reward": 0.9583333432674408, "step": 459 }, { "advantage_max": 1.7686534374952316, "advantage_mean": -2.2351742789972207e-08, "advantage_min": -1.0162685364484787, "advantage_std": 0.999843567609787, "completion_length": 1311.833396911621, "epoch": 0.5257142857142857, "grad_norm": 1.6238162517547607, "kl": 0.510040283203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1743223682775649e-07, "loss": 0.0204, "reward": 0.5407150648534298, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5407150648534298, "reward_after_std": 0.8093853369355202, "reward_before_mean": 0.7947937436401844, "reward_before_std": 0.7604817524552345, "reward_change_max": 0.0006181150674819946, "reward_change_mean": -0.2540786974132061, "reward_change_min": -0.4123247377574444, "reward_change_std": 0.16012915410101414, "reward_std": 0.8093853667378426, "rewards/cosine_scaled_reward": -0.04010312771424651, "rewards/format_reward": 0.8750000149011612, "step": 460 }, { "advantage_max": 1.6886788457632065, "advantage_mean": 2.2662183352117893e-08, "advantage_min": -0.9786590412259102, "advantage_std": 0.99976596981287, "completion_length": 1219.500015258789, "epoch": 0.5268571428571428, "grad_norm": 1.783687710762024, "kl": 0.5525741577148438, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1657684494105386e-07, "loss": 0.0221, "reward": 0.4577493495307863, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4577493495307863, "reward_after_std": 0.49723924323916435, "reward_before_mean": 0.72218307107687, "reward_before_std": 0.4564433563500643, "reward_change_max": 0.0, "reward_change_mean": -0.26443369407206774, "reward_change_min": -0.40887835435569286, "reward_change_std": 0.15175122302025557, "reward_std": 0.49723926186561584, "rewards/cosine_scaled_reward": -0.09724180959165096, "rewards/format_reward": 0.9166666679084301, "step": 461 }, { "advantage_max": 1.8399271368980408, "advantage_mean": -4.842877565636172e-08, "advantage_min": -0.8609659969806671, "advantage_std": 0.9997665509581566, "completion_length": 1047.0000457763672, "epoch": 0.528, "grad_norm": 2.1252048015594482, "kl": 0.381317138671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1574257748745986e-07, "loss": 0.0153, "reward": 0.3878229036927223, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3878229036927223, "reward_after_std": 0.6147576775401831, "reward_before_mean": 0.6220925338566303, "reward_before_std": 0.5424556620419025, "reward_change_max": 0.0, "reward_change_mean": -0.2342696338891983, "reward_change_min": -0.3471482917666435, "reward_change_std": 0.1255068052560091, "reward_std": 0.6147576849907637, "rewards/cosine_scaled_reward": -0.18895374238491058, "rewards/format_reward": 1.0, "step": 462 }, { "advantage_max": 1.5150933861732483, "advantage_mean": -5.1533183498264634e-08, "advantage_min": -1.2166873961687088, "advantage_std": 0.9998409226536751, "completion_length": 1437.9583892822266, "epoch": 0.5291428571428571, "grad_norm": 2.537919759750366, "kl": 0.34942626953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1492947512799328e-07, "loss": 0.014, "reward": 0.6093145990744233, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6093145990744233, "reward_after_std": 0.7604724206030369, "reward_before_mean": 0.8918596671428531, "reward_before_std": 0.7663224712014198, "reward_change_max": 0.0, "reward_change_mean": -0.28254508040845394, "reward_change_min": -0.49530068039894104, "reward_change_std": 0.18866461794823408, "reward_std": 0.760472446680069, "rewards/cosine_scaled_reward": -0.001986853778362274, "rewards/format_reward": 0.8958333507180214, "step": 463 }, { "advantage_max": 1.539069339632988, "advantage_mean": -9.002784961964494e-08, "advantage_min": -1.322128288447857, "advantage_std": 0.9997992739081383, "completion_length": 872.5208740234375, "epoch": 0.5302857142857142, "grad_norm": 1.7395652532577515, "kl": 0.36365509033203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1413757749211602e-07, "loss": 0.0145, "reward": 0.834553528111428, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.834553528111428, "reward_after_std": 0.5309878177940845, "reward_before_mean": 1.178483560681343, "reward_before_std": 0.4831850230693817, "reward_change_max": 0.001415453851222992, "reward_change_mean": -0.34393005445599556, "reward_change_min": -0.48923714458942413, "reward_change_std": 0.19740524981170893, "reward_std": 0.5309878475964069, "rewards/cosine_scaled_reward": 0.12049175798892975, "rewards/format_reward": 0.9375000149011612, "step": 464 }, { "advantage_max": 1.5249205529689789, "advantage_mean": -3.849466712058103e-08, "advantage_min": -1.2455387338995934, "advantage_std": 0.999851331114769, "completion_length": 1280.9375534057617, "epoch": 0.5314285714285715, "grad_norm": 2.718585968017578, "kl": 0.674560546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1336692317580158e-07, "loss": 0.027, "reward": 0.4109771801158786, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4109771801158786, "reward_after_std": 0.740178968757391, "reward_before_mean": 0.6509956270456314, "reward_before_std": 0.7452997379004955, "reward_change_max": 0.0005368664860725403, "reward_change_mean": -0.24001849628984928, "reward_change_min": -0.4369941111654043, "reward_change_std": 0.16760590951889753, "reward_std": 0.7401789985597134, "rewards/cosine_scaled_reward": -0.1015855111181736, "rewards/format_reward": 0.8541666865348816, "step": 465 }, { "advantage_max": 1.6975940018892288, "advantage_mean": -3.042320506629892e-08, "advantage_min": -0.9658067002892494, "advantage_std": 0.9998613074421883, "completion_length": 1185.2083702087402, "epoch": 0.5325714285714286, "grad_norm": 1.249248743057251, "kl": 0.25506591796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1261754973965422e-07, "loss": 0.0102, "reward": 0.799970980733633, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.799970980733633, "reward_after_std": 0.8710318058729172, "reward_before_mean": 1.106414571404457, "reward_before_std": 0.8215276412665844, "reward_change_max": 0.0, "reward_change_mean": -0.3064435701817274, "reward_change_min": -0.5177381709218025, "reward_change_std": 0.1868924666196108, "reward_std": 0.8710318133234978, "rewards/cosine_scaled_reward": 0.06362392473965883, "rewards/format_reward": 0.9791666716337204, "step": 466 }, { "advantage_max": 1.846468836069107, "advantage_mean": -3.7252906315288215e-09, "advantage_min": -0.811167448759079, "advantage_std": 0.9998011738061905, "completion_length": 1365.5208892822266, "epoch": 0.5337142857142857, "grad_norm": 1.8020029067993164, "kl": 0.4116058349609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1188949370707787e-07, "loss": 0.0165, "reward": 0.2844287045300007, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2844287045300007, "reward_after_std": 0.6075214967131615, "reward_before_mean": 0.49876469373703003, "reward_before_std": 0.5446117036044598, "reward_change_max": 0.0, "reward_change_mean": -0.2143359947949648, "reward_change_min": -0.34634244069457054, "reward_change_std": 0.11892245709896088, "reward_std": 0.6075215078890324, "rewards/cosine_scaled_reward": -0.24020099081099033, "rewards/format_reward": 0.9791666716337204, "step": 467 }, { "advantage_max": 1.6614352017641068, "advantage_mean": -1.0554989382516311e-08, "advantage_min": -0.9151201918721199, "advantage_std": 0.9998412430286407, "completion_length": 1222.1042251586914, "epoch": 0.5348571428571428, "grad_norm": 2.5508921146392822, "kl": 0.562591552734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1118279056249653e-07, "loss": 0.0225, "reward": 0.30447601340711117, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30447601340711117, "reward_after_std": 0.7888620086014271, "reward_before_mean": 0.5132625997066498, "reward_before_std": 0.7809022553265095, "reward_change_max": 0.0, "reward_change_mean": -0.20878657698631287, "reward_change_min": -0.42520975694060326, "reward_change_std": 0.1491988254711032, "reward_std": 0.7888620272278786, "rewards/cosine_scaled_reward": -0.20170205205795355, "rewards/format_reward": 0.9166666865348816, "step": 468 }, { "advantage_max": 1.7594375908374786, "advantage_mean": 7.528191479921897e-09, "advantage_min": -1.024169247597456, "advantage_std": 0.9997720122337341, "completion_length": 1013.3125343322754, "epoch": 0.536, "grad_norm": 1.44214928150177, "kl": 0.382080078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1049747474962444e-07, "loss": 0.0152, "reward": 0.42605833522975445, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42605833522975445, "reward_after_std": 0.5564350076019764, "reward_before_mean": 0.6749656535685062, "reward_before_std": 0.49210809590294957, "reward_change_max": 0.0, "reward_change_mean": -0.24890729784965515, "reward_change_min": -0.36836569011211395, "reward_change_std": 0.14090245869010687, "reward_std": 0.556435015052557, "rewards/cosine_scaled_reward": -0.12085053510963917, "rewards/format_reward": 0.9166666716337204, "step": 469 }, { "advantage_max": 1.689832404255867, "advantage_mean": -3.8494667453647935e-08, "advantage_min": -0.9729603379964828, "advantage_std": 0.9998282045125961, "completion_length": 1316.8958587646484, "epoch": 0.5371428571428571, "grad_norm": 1.4733878374099731, "kl": 0.545074462890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0983357966978745e-07, "loss": 0.0218, "reward": 0.4230445548892021, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4230445548892021, "reward_after_std": 0.7050646170973778, "reward_before_mean": 0.6629515116801485, "reward_before_std": 0.6786227710545063, "reward_change_max": 0.0005048736929893494, "reward_change_mean": -0.23990696109831333, "reward_change_min": -0.3978872671723366, "reward_change_std": 0.14986501820385456, "reward_std": 0.7050646580755711, "rewards/cosine_scaled_reward": -0.11644092667847872, "rewards/format_reward": 0.8958333432674408, "step": 470 }, { "advantage_max": 1.5878158211708069, "advantage_mean": -5.8983763762121555e-08, "advantage_min": -1.0156637877225876, "advantage_std": 0.9998416975140572, "completion_length": 1333.4375381469727, "epoch": 0.5382857142857143, "grad_norm": 1.898854374885559, "kl": 0.4473304748535156, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0919113768029517e-07, "loss": 0.0179, "reward": 0.9115908909589052, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.9115908909589052, "reward_after_std": 0.8066898584365845, "reward_before_mean": 1.250825822353363, "reward_before_std": 0.7856942266225815, "reward_change_max": 0.0, "reward_change_mean": -0.33923495560884476, "reward_change_min": -0.5825177431106567, "reward_change_std": 0.2132352814078331, "reward_std": 0.806689877063036, "rewards/cosine_scaled_reward": 0.1566629009321332, "rewards/format_reward": 0.9375000149011612, "step": 471 }, { "advantage_max": 1.7407121658325195, "advantage_mean": -2.235174201281609e-08, "advantage_min": -1.008700355887413, "advantage_std": 0.999788723886013, "completion_length": 1235.4375228881836, "epoch": 0.5394285714285715, "grad_norm": 2.009488344192505, "kl": 0.3810272216796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0857018009286381e-07, "loss": 0.0153, "reward": 0.3421945869922638, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3421945869922638, "reward_after_std": 0.5907893590629101, "reward_before_mean": 0.573419526219368, "reward_before_std": 0.5489665865898132, "reward_change_max": 0.0, "reward_change_mean": -0.23122494295239449, "reward_change_min": -0.3739938288927078, "reward_change_std": 0.1434150319546461, "reward_std": 0.5907893814146519, "rewards/cosine_scaled_reward": -0.17162358853965998, "rewards/format_reward": 0.9166666716337204, "step": 472 }, { "advantage_max": 1.5840518921613693, "advantage_mean": -2.3050233888266547e-08, "advantage_min": -1.2334761917591095, "advantage_std": 0.9997791424393654, "completion_length": 1190.5000305175781, "epoch": 0.5405714285714286, "grad_norm": 1.7814481258392334, "kl": 0.351806640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0797073717209013e-07, "loss": 0.0141, "reward": 0.4066920541226864, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.4066920541226864, "reward_after_std": 0.564589936286211, "reward_before_mean": 0.6539732131641358, "reward_before_std": 0.5371009334921837, "reward_change_max": 0.0, "reward_change_mean": -0.2472811546176672, "reward_change_min": -0.39397401735186577, "reward_change_std": 0.14508823212236166, "reward_std": 0.5645899474620819, "rewards/cosine_scaled_reward": -0.15218008181545883, "rewards/format_reward": 0.9583333432674408, "step": 473 }, { "advantage_max": 1.506261795759201, "advantage_mean": -6.829699006338785e-08, "advantage_min": -1.307990886271, "advantage_std": 0.9998388066887856, "completion_length": 1047.6041984558105, "epoch": 0.5417142857142857, "grad_norm": 2.3977925777435303, "kl": 0.4084930419921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0739283813397639e-07, "loss": 0.0163, "reward": 0.994497782237886, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.994497782237886, "reward_after_std": 0.7347078956663609, "reward_before_mean": 1.360306840389967, "reward_before_std": 0.7280658148229122, "reward_change_max": 0.0005192682147026062, "reward_change_mean": -0.36580908484756947, "reward_change_min": -0.5518549457192421, "reward_change_std": 0.22464457992464304, "reward_std": 0.734707910567522, "rewards/cosine_scaled_reward": 0.2009867411106825, "rewards/format_reward": 0.9583333432674408, "step": 474 }, { "advantage_max": 1.69073885679245, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -0.9535181820392609, "advantage_std": 0.9998545795679092, "completion_length": 1170.8750228881836, "epoch": 0.5428571428571428, "grad_norm": 1.6326664686203003, "kl": 0.35323333740234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.068365111445064e-07, "loss": 0.0141, "reward": 0.3926006439141929, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.3926006439141929, "reward_after_std": 0.876279816031456, "reward_before_mean": 0.6156278997659683, "reward_before_std": 0.8685860857367516, "reward_change_max": 0.0, "reward_change_mean": -0.22302727587521076, "reward_change_min": -0.41833243519067764, "reward_change_std": 0.15927229821681976, "reward_std": 0.8762798272073269, "rewards/cosine_scaled_reward": -0.09843605477362871, "rewards/format_reward": 0.8125000186264515, "step": 475 }, { "advantage_max": 1.7651716619729996, "advantage_mean": -1.8626451714354175e-08, "advantage_min": -0.993249699473381, "advantage_std": 0.9998532608151436, "completion_length": 1432.8958587646484, "epoch": 0.544, "grad_norm": 3.7423794269561768, "kl": 1.0514984130859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.063017833182728e-07, "loss": 0.042, "reward": 0.4363984651863575, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4363984651863575, "reward_after_std": 0.8045771718025208, "reward_before_mean": 0.6670037880539894, "reward_before_std": 0.7607868574559689, "reward_change_max": 0.002155087888240814, "reward_change_mean": -0.2306053228676319, "reward_change_min": -0.38322553038597107, "reward_change_std": 0.14566559065133333, "reward_std": 0.8045771829783916, "rewards/cosine_scaled_reward": -0.0935814508702606, "rewards/format_reward": 0.8541666828095913, "step": 476 }, { "advantage_max": 1.626573994755745, "advantage_mean": -6.953875364423823e-08, "advantage_min": -0.8963503763079643, "advantage_std": 0.9998547807335854, "completion_length": 926.833366394043, "epoch": 0.5451428571428572, "grad_norm": 1.9580730199813843, "kl": 0.38103485107421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0578868071715544e-07, "loss": 0.0152, "reward": 0.7679003030061722, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7679003030061722, "reward_after_std": 0.837723758071661, "reward_before_mean": 1.074661746621132, "reward_before_std": 0.8077759854495525, "reward_change_max": 0.0, "reward_change_mean": -0.3067614659667015, "reward_change_min": -0.5569753423333168, "reward_change_std": 0.20120880007743835, "reward_std": 0.8377237804234028, "rewards/cosine_scaled_reward": 0.05816417885944247, "rewards/format_reward": 0.9583333358168602, "step": 477 }, { "advantage_max": 1.6303537040948868, "advantage_mean": -9.313226079221693e-09, "advantage_min": -1.1908142790198326, "advantage_std": 0.9998129531741142, "completion_length": 1573.2292175292969, "epoch": 0.5462857142857143, "grad_norm": 1.9702447652816772, "kl": 0.5894775390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0529722834905125e-07, "loss": 0.0235, "reward": 0.5314593832008541, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5314593832008541, "reward_after_std": 0.5914321132004261, "reward_before_mean": 0.8025474827736616, "reward_before_std": 0.5585166476666927, "reward_change_max": 0.0, "reward_change_mean": -0.2710880674421787, "reward_change_min": -0.42790525406599045, "reward_change_std": 0.1580025451257825, "reward_std": 0.5914321169257164, "rewards/cosine_scaled_reward": -0.03622628003358841, "rewards/format_reward": 0.8750000074505806, "step": 478 }, { "advantage_max": 1.6044540852308273, "advantage_mean": -2.545615118698663e-08, "advantage_min": -1.003111258149147, "advantage_std": 0.9998004958033562, "completion_length": 1238.7917098999023, "epoch": 0.5474285714285714, "grad_norm": 2.2569944858551025, "kl": 0.76922607421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0482745016665526e-07, "loss": 0.0308, "reward": 0.34396560629829764, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.34396560629829764, "reward_after_std": 0.650309395045042, "reward_before_mean": 0.5750420242547989, "reward_before_std": 0.6423451155424118, "reward_change_max": 0.0, "reward_change_mean": -0.23107640631496906, "reward_change_min": -0.416254710406065, "reward_change_std": 0.15591457672417164, "reward_std": 0.650309432297945, "rewards/cosine_scaled_reward": -0.16039567068219185, "rewards/format_reward": 0.8958333432674408, "step": 479 }, { "advantage_max": 1.6508120000362396, "advantage_mean": -1.3038516155639002e-08, "advantage_min": -1.1292091310024261, "advantage_std": 0.999795213341713, "completion_length": 1310.0625228881836, "epoch": 0.5485714285714286, "grad_norm": 2.6421027183532715, "kl": 0.9313812255859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0437936906629334e-07, "loss": 0.0373, "reward": 0.49241532757878304, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.49241532757878304, "reward_after_std": 0.6414160765707493, "reward_before_mean": 0.7527041956782341, "reward_before_std": 0.6134338118135929, "reward_change_max": 0.0, "reward_change_mean": -0.260288855060935, "reward_change_min": -0.4212849773466587, "reward_change_std": 0.1581674963235855, "reward_std": 0.6414160802960396, "rewards/cosine_scaled_reward": -0.09239792544394732, "rewards/format_reward": 0.9375000149011612, "step": 480 }, { "advantage_max": 1.6010807305574417, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -1.1426882520318031, "advantage_std": 0.9998107254505157, "completion_length": 1488.4583740234375, "epoch": 0.5497142857142857, "grad_norm": 1.4233379364013672, "kl": 0.5397415161132812, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0395300688680625e-07, "loss": 0.0216, "reward": 0.37583464104682207, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37583464104682207, "reward_after_std": 0.6667274013161659, "reward_before_mean": 0.6133236847817898, "reward_before_std": 0.6600178182125092, "reward_change_max": 0.0, "reward_change_mean": -0.2374890260398388, "reward_change_min": -0.4234742745757103, "reward_change_std": 0.16119471471756697, "reward_std": 0.6667274124920368, "rewards/cosine_scaled_reward": -0.12042150646448135, "rewards/format_reward": 0.8541666865348816, "step": 481 }, { "advantage_max": 1.5314403101801872, "advantage_mean": -3.0423204844254315e-08, "advantage_min": -1.2384950369596481, "advantage_std": 0.9998578727245331, "completion_length": 1068.645851135254, "epoch": 0.5508571428571428, "grad_norm": 3.218018054962158, "kl": 0.6636505126953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0354838440848501e-07, "loss": 0.0265, "reward": 0.7987027624621987, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7987027624621987, "reward_after_std": 0.8954098485410213, "reward_before_mean": 1.1136324778199196, "reward_before_std": 0.9130747206509113, "reward_change_max": 0.0013197064399719238, "reward_change_mean": -0.3149297498166561, "reward_change_min": -0.5331053957343102, "reward_change_std": 0.21649126335978508, "reward_std": 0.8954098559916019, "rewards/cosine_scaled_reward": 0.09848290542140603, "rewards/format_reward": 0.9166666865348816, "step": 482 }, { "advantage_max": 1.6196284890174866, "advantage_mean": -5.5879355587151736e-09, "advantage_min": -1.110151432454586, "advantage_std": 0.9998453184962273, "completion_length": 1403.6041870117188, "epoch": 0.552, "grad_norm": 2.3577053546905518, "kl": 0.647979736328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0316552135205837e-07, "loss": 0.026, "reward": 0.6319026295095682, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6319026295095682, "reward_after_std": 0.8189057596027851, "reward_before_mean": 0.9091856814920902, "reward_before_std": 0.7972288802266121, "reward_change_max": 0.0, "reward_change_mean": -0.27728303894400597, "reward_change_min": -0.4613271728157997, "reward_change_std": 0.1789004895836115, "reward_std": 0.8189057968556881, "rewards/cosine_scaled_reward": -0.0037405104376375675, "rewards/format_reward": 0.916666679084301, "step": 483 }, { "advantage_max": 1.5716918855905533, "advantage_mean": -4.96705393482344e-08, "advantage_min": -1.2597123309969902, "advantage_std": 0.999833457171917, "completion_length": 1248.9375381469727, "epoch": 0.5531428571428572, "grad_norm": 1.623803973197937, "kl": 0.68231201171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0280443637773163e-07, "loss": 0.0274, "reward": 0.5417165439575911, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5417165439575911, "reward_after_std": 0.7372452020645142, "reward_before_mean": 0.8069544602185488, "reward_before_std": 0.7322539314627647, "reward_change_max": 0.0, "reward_change_mean": -0.2652379460632801, "reward_change_min": -0.44489338994026184, "reward_change_std": 0.17211270052939653, "reward_std": 0.7372452132403851, "rewards/cosine_scaled_reward": -0.023606109898537397, "rewards/format_reward": 0.8541666828095913, "step": 484 }, { "advantage_max": 1.7099245637655258, "advantage_mean": -1.4280279625467074e-08, "advantage_min": -1.1274260729551315, "advantage_std": 0.9997733682394028, "completion_length": 965.2708587646484, "epoch": 0.5542857142857143, "grad_norm": 2.46708345413208, "kl": 0.3107147216796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0246514708427701e-07, "loss": 0.0124, "reward": 0.5324733089655638, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5324733089655638, "reward_after_std": 0.5767040736973286, "reward_before_mean": 0.8058453351259232, "reward_before_std": 0.5237753307446837, "reward_change_max": 0.0, "reward_change_mean": -0.27337202802300453, "reward_change_min": -0.4182005673646927, "reward_change_std": 0.15673528984189034, "reward_std": 0.5767040941864252, "rewards/cosine_scaled_reward": -0.07624401268549263, "rewards/format_reward": 0.9583333432674408, "step": 485 }, { "advantage_max": 1.765414834022522, "advantage_mean": 4.346172255420555e-09, "advantage_min": -0.9041767120361328, "advantage_std": 0.9997631907463074, "completion_length": 862.6041793823242, "epoch": 0.5554285714285714, "grad_norm": 2.2586798667907715, "kl": 0.461151123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0214767000817596e-07, "loss": 0.0185, "reward": 0.5346746001159772, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5346746001159772, "reward_after_std": 0.5360549800097942, "reward_before_mean": 0.8094351254403591, "reward_before_std": 0.4725265856832266, "reward_change_max": 0.0, "reward_change_mean": -0.2747605200856924, "reward_change_min": -0.43837128207087517, "reward_change_std": 0.15725676529109478, "reward_std": 0.5360549874603748, "rewards/cosine_scaled_reward": -0.06403244659304619, "rewards/format_reward": 0.9375000149011612, "step": 486 }, { "advantage_max": 1.5992169827222824, "advantage_mean": -3.476937671109681e-08, "advantage_min": -1.1532378867268562, "advantage_std": 0.9998497292399406, "completion_length": 1159.229206085205, "epoch": 0.5565714285714286, "grad_norm": 1.466489315032959, "kl": 0.40506744384765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0185202062281336e-07, "loss": 0.0162, "reward": 0.7347278879024088, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7347278879024088, "reward_after_std": 0.8287507370114326, "reward_before_mean": 1.0370606668293476, "reward_before_std": 0.8203627225011587, "reward_change_max": 0.0031530559062957764, "reward_change_mean": -0.30233279056847095, "reward_change_min": -0.5047247745096684, "reward_change_std": 0.20336042065173388, "reward_std": 0.8287507519125938, "rewards/cosine_scaled_reward": 0.08103030489291996, "rewards/format_reward": 0.8750000149011612, "step": 487 }, { "advantage_max": 1.5731254816055298, "advantage_mean": -6.8296991950766994e-09, "advantage_min": -1.1291136145591736, "advantage_std": 0.9997728988528252, "completion_length": 994.7708511352539, "epoch": 0.5577142857142857, "grad_norm": 1.401617407798767, "kl": 0.2291259765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0157821333772304e-07, "loss": 0.0092, "reward": 0.3939766474068165, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.3939766474068165, "reward_after_std": 0.541620772331953, "reward_before_mean": 0.6443051844835281, "reward_before_std": 0.5205097924917936, "reward_change_max": 0.0014897137880325317, "reward_change_mean": -0.2503285203129053, "reward_change_min": -0.3798618447035551, "reward_change_std": 0.1473851716145873, "reward_std": 0.5416207909584045, "rewards/cosine_scaled_reward": -0.16743076220154762, "rewards/format_reward": 0.9791666716337204, "step": 488 }, { "advantage_max": 1.7112076729536057, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.9740470126271248, "advantage_std": 0.9997779279947281, "completion_length": 1289.8958892822266, "epoch": 0.5588571428571428, "grad_norm": 2.128153085708618, "kl": 0.60748291015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.013262614978859e-07, "loss": 0.0243, "reward": 0.18788336508441716, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18788336508441716, "reward_after_std": 0.49343743547797203, "reward_before_mean": 0.39214257523417473, "reward_before_std": 0.45715833082795143, "reward_change_max": 0.0, "reward_change_mean": -0.20425921119749546, "reward_change_min": -0.33435640670359135, "reward_change_std": 0.11896615382283926, "reward_std": 0.4934374466538429, "rewards/cosine_scaled_reward": -0.25184538774192333, "rewards/format_reward": 0.8958333432674408, "step": 489 }, { "advantage_max": 1.7367768734693527, "advantage_mean": -4.7187011076310625e-08, "advantage_min": -0.9195685312151909, "advantage_std": 0.9997827708721161, "completion_length": 1221.7500228881836, "epoch": 0.56, "grad_norm": 1.6772288084030151, "kl": 0.43460845947265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0109617738307911e-07, "loss": 0.0173, "reward": 0.7127898004837334, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7127898004837334, "reward_after_std": 0.6761909052729607, "reward_before_mean": 1.0172344259917736, "reward_before_std": 0.6181280873715878, "reward_change_max": 0.0, "reward_change_mean": -0.3044446799904108, "reward_change_min": -0.4935770258307457, "reward_change_std": 0.18400804046541452, "reward_std": 0.6761909127235413, "rewards/cosine_scaled_reward": 0.019033881602808833, "rewards/format_reward": 0.9791666716337204, "step": 490 }, { "advantage_max": 1.6076852083206177, "advantage_mean": -2.0178655746327934e-08, "advantage_min": -1.1311817914247513, "advantage_std": 0.999858483672142, "completion_length": 1386.5833740234375, "epoch": 0.5611428571428572, "grad_norm": 1.6963690519332886, "kl": 0.5140380859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0088797220727779e-07, "loss": 0.0206, "reward": 0.7685029455460608, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7685029455460608, "reward_after_std": 0.8746408671140671, "reward_before_mean": 1.072487998753786, "reward_before_std": 0.8700667172670364, "reward_change_max": 0.0, "reward_change_mean": -0.3039850238710642, "reward_change_min": -0.5531091075390577, "reward_change_std": 0.20315628219395876, "reward_std": 0.874640878289938, "rewards/cosine_scaled_reward": 0.0883273258805275, "rewards/format_reward": 0.8958333432674408, "step": 491 }, { "advantage_max": 1.6640974879264832, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -0.9844799563288689, "advantage_std": 0.9997724816203117, "completion_length": 1141.6250228881836, "epoch": 0.5622857142857143, "grad_norm": 2.6036548614501953, "kl": 0.2451629638671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0070165611810855e-07, "loss": 0.0098, "reward": 0.5456169964745641, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5456169964745641, "reward_after_std": 0.6167141497135162, "reward_before_mean": 0.8201209753751755, "reward_before_std": 0.5838107857853174, "reward_change_max": 0.0, "reward_change_mean": -0.2745039686560631, "reward_change_min": -0.4336605127900839, "reward_change_std": 0.16109473910182714, "reward_std": 0.6167141608893871, "rewards/cosine_scaled_reward": -0.08993951743468642, "rewards/format_reward": 1.0, "step": 492 }, { "advantage_max": 1.5081126242876053, "advantage_mean": -3.2906732116977366e-08, "advantage_min": -1.241824135184288, "advantage_std": 0.9998714104294777, "completion_length": 1327.3750305175781, "epoch": 0.5634285714285714, "grad_norm": 3.3362510204315186, "kl": 0.88671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.005372381963547e-07, "loss": 0.0354, "reward": 0.7321626851335168, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7321626851335168, "reward_after_std": 0.9542999267578125, "reward_before_mean": 1.0276228515431285, "reward_before_std": 0.996599406003952, "reward_change_max": 0.0023065879940986633, "reward_change_mean": -0.29546014219522476, "reward_change_min": -0.5584845636039972, "reward_change_std": 0.22464005090296268, "reward_std": 0.9542999565601349, "rewards/cosine_scaled_reward": 0.08672806993126869, "rewards/format_reward": 0.8541666865348816, "step": 493 }, { "advantage_max": 1.626434475183487, "advantage_mean": -5.743156084037082e-08, "advantage_min": -1.069405935704708, "advantage_std": 0.9998456314206123, "completion_length": 977.3333740234375, "epoch": 0.5645714285714286, "grad_norm": 0.7659087181091309, "kl": 0.0702667236328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0039472645551372e-07, "loss": 0.0028, "reward": 0.7825715020298958, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7825715020298958, "reward_after_std": 0.7810787223279476, "reward_before_mean": 1.0978356748819351, "reward_before_std": 0.7559734731912613, "reward_change_max": 0.0, "reward_change_mean": -0.3152642250061035, "reward_change_min": -0.5318855568766594, "reward_change_std": 0.19349340070039034, "reward_std": 0.7810787446796894, "rewards/cosine_scaled_reward": 0.05933448998257518, "rewards/format_reward": 0.9791666716337204, "step": 494 }, { "advantage_max": 1.5209117978811264, "advantage_mean": -6.208815683805824e-10, "advantage_min": -1.1773782223463058, "advantage_std": 0.9998350664973259, "completion_length": 1327.4167022705078, "epoch": 0.5657142857142857, "grad_norm": 1.8314779996871948, "kl": 0.396820068359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.002741278414069e-07, "loss": 0.0159, "reward": 0.61646170867607, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.61646170867607, "reward_after_std": 0.7499135285615921, "reward_before_mean": 0.9014103151857853, "reward_before_std": 0.7563531119376421, "reward_change_max": 0.0020629242062568665, "reward_change_mean": -0.2849485781043768, "reward_change_min": -0.5066464394330978, "reward_change_std": 0.19107064697891474, "reward_std": 0.7499135434627533, "rewards/cosine_scaled_reward": -0.02846152102574706, "rewards/format_reward": 0.9583333432674408, "step": 495 }, { "advantage_max": 1.607502669095993, "advantage_mean": -2.7629237009385577e-08, "advantage_min": -1.1763157099485397, "advantage_std": 0.9998272061347961, "completion_length": 1260.8958930969238, "epoch": 0.5668571428571428, "grad_norm": 2.71171236038208, "kl": 0.671295166015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0017544823184055e-07, "loss": 0.0268, "reward": 0.5370115237310529, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5370115237310529, "reward_after_std": 0.7076732777059078, "reward_before_mean": 0.8070572554133832, "reward_before_std": 0.7081652507185936, "reward_change_max": 0.0, "reward_change_mean": -0.2700457442551851, "reward_change_min": -0.4546791072934866, "reward_change_std": 0.17802696116268635, "reward_std": 0.7076733037829399, "rewards/cosine_scaled_reward": 0.0181119367480278, "rewards/format_reward": 0.7708333395421505, "step": 496 }, { "advantage_max": 1.5937969386577606, "advantage_mean": 3.725292629930266e-09, "advantage_min": -1.0978671796619892, "advantage_std": 0.9997798278927803, "completion_length": 1065.6458587646484, "epoch": 0.568, "grad_norm": 2.594193696975708, "kl": 0.5436172485351562, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0009869243631952e-07, "loss": 0.0218, "reward": 0.759913792979205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.759913792979205, "reward_after_std": 0.6107112001627684, "reward_before_mean": 1.0830433431547135, "reward_before_std": 0.5669339969754219, "reward_change_max": 0.0005127191543579102, "reward_change_mean": -0.32312954775989056, "reward_change_min": -0.4868850149214268, "reward_change_std": 0.19642097689211369, "reward_std": 0.610711220651865, "rewards/cosine_scaled_reward": 0.0936049991287291, "rewards/format_reward": 0.8958333507180214, "step": 497 }, { "advantage_max": 1.6812223494052887, "advantage_mean": -8.07146260939362e-09, "advantage_min": -1.058426357805729, "advantage_std": 0.9998469650745392, "completion_length": 1336.208381652832, "epoch": 0.5691428571428572, "grad_norm": 2.6437041759490967, "kl": 0.59625244140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.000438641958131e-07, "loss": 0.0239, "reward": 0.37023794968263246, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.37023794968263246, "reward_after_std": 0.8235311470925808, "reward_before_mean": 0.5918802302330732, "reward_before_std": 0.8089180812239647, "reward_change_max": 0.0, "reward_change_mean": -0.2216422688215971, "reward_change_min": -0.41504093259572983, "reward_change_std": 0.15451766457408667, "reward_std": 0.8235311731696129, "rewards/cosine_scaled_reward": -0.14155989978462458, "rewards/format_reward": 0.8750000074505806, "step": 498 }, { "advantage_max": 1.6329741179943085, "advantage_mean": -1.8626452269465688e-08, "advantage_min": -1.2069002091884613, "advantage_std": 0.9998198002576828, "completion_length": 1437.1458740234375, "epoch": 0.5702857142857143, "grad_norm": 1.9619033336639404, "kl": 0.6727371215820312, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0001096618257236e-07, "loss": 0.027, "reward": 0.5796591965481639, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5796591965481639, "reward_after_std": 0.7735571078956127, "reward_before_mean": 0.8483046852052212, "reward_before_std": 0.7528968974947929, "reward_change_max": 0.0, "reward_change_mean": -0.2686454653739929, "reward_change_min": -0.45184313133358955, "reward_change_std": 0.17254280857741833, "reward_std": 0.7735571376979351, "rewards/cosine_scaled_reward": -0.013347673695534468, "rewards/format_reward": 0.8750000149011612, "step": 499 }, { "advantage_max": 1.7250654101371765, "advantage_mean": -2.7318796558262193e-08, "advantage_min": -1.0749098509550095, "advantage_std": 0.9998650997877121, "completion_length": 1341.2708740234375, "epoch": 0.5714285714285714, "grad_norm": 1.3440532684326172, "kl": 0.57421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1e-07, "loss": 0.0229, "reward": 0.5764958932995796, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5764958932995796, "reward_after_std": 0.8865836374461651, "reward_before_mean": 0.8342440668493509, "reward_before_std": 0.8536566384136677, "reward_change_max": 0.0, "reward_change_mean": -0.25774817913770676, "reward_change_min": -0.44375080429017544, "reward_change_std": 0.16682033147662878, "reward_std": 0.8865836411714554, "rewards/cosine_scaled_reward": -0.020377989509142935, "rewards/format_reward": 0.8750000111758709, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.005620188481842881, "train_runtime": 52392.8619, "train_samples_per_second": 0.458, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }