diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.4294447675347328, + "advantage_mean": 1.8626452602532595e-09, + "advantage_min": -0.8205335959792137, + "advantage_std": 0.805392861366272, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.1222195252776146, + "kl": 0.0, + "lambda_div_used": 0.6, + "learning_rate": 2e-08, + "loss": 0.0641, + "reward": 0.06657012924551964, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06657012924551964, + "reward_after_std": 0.805392861366272, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.0005614385008811951, + "reward_change_mean": -0.4231945872306824, + "reward_change_min": -0.8292400389909744, + "reward_change_std": 0.33647667057812214, + "reward_std": 0.8053928762674332, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 0.8193490244448185, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.4622782990336418, + "advantage_std": 0.4655082952231169, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.0620713047683239, + "kl": 0.0, + "lambda_div_used": 0.6, + "learning_rate": 4e-08, + "loss": 0.0241, + "reward": -0.11615866981446743, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11615866981446743, + "reward_after_std": 0.4655082933604717, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.0013062208890914917, + "reward_change_mean": -0.39155622851103544, + "reward_change_min": -0.6376443430781364, + "reward_change_std": 0.26012564916163683, + "reward_std": 0.46550831012427807, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 1.1013623401522636, + "advantage_mean": 6.208817460162663e-09, + "advantage_min": -0.48275332152843475, + "advantage_std": 0.5911364443600178, + "completion_length": 3411.6666870117188, + "epoch": 0.0034285714285714284, + "grad_norm": 0.10215908288955688, + "kl": 4.32431697845459e-05, + "lambda_div_used": 0.6, + "learning_rate": 6e-08, + "loss": -0.0046, + "reward": -0.4395183250308037, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4395183250308037, + "reward_after_std": 0.5911364406347275, + "reward_before_mean": -0.2608025949448347, + "reward_before_std": 0.6000149585306644, + "reward_change_max": 0.0014719441533088684, + "reward_change_mean": -0.17871572636067867, + "reward_change_min": -0.3646555207669735, + "reward_change_std": 0.16313384287059307, + "reward_std": 0.5911364704370499, + "rewards/cosine_scaled_reward": -0.21373463701456785, + "rewards/format_reward": 0.16666667349636555, + "step": 3 + }, + { + "advantage_max": 1.765574298799038, + "advantage_mean": -1.4280279569955923e-08, + "advantage_min": -0.7085130885243416, + "advantage_std": 0.9218316338956356, + "completion_length": 2263.8333892822266, + "epoch": 0.004571428571428572, + "grad_norm": 0.1338474303483963, + "kl": 4.819035530090332e-05, + "lambda_div_used": 0.6, + "learning_rate": 8e-08, + "loss": 0.0427, + "reward": 0.06497885985299945, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06497885985299945, + "reward_after_std": 0.9218316301703453, + "reward_before_mean": 0.4491391107439995, + "reward_before_std": 0.858651926741004, + "reward_change_max": 0.0004479065537452698, + "reward_change_mean": -0.384160247631371, + "reward_change_min": -0.755715049803257, + "reward_change_std": 0.284343633800745, + "reward_std": 0.9218316525220871, + "rewards/cosine_scaled_reward": -0.09834712743759155, + "rewards/format_reward": 0.6458333414047956, + "step": 4 + }, + { + "advantage_max": 1.026982732117176, + "advantage_mean": 7.450580596923828e-09, + "advantage_min": -0.5032768584787846, + "advantage_std": 0.5551957823336124, + "completion_length": 3247.812530517578, + "epoch": 0.005714285714285714, + "grad_norm": 0.105912946164608, + "kl": 4.468671977519989e-05, + "lambda_div_used": 0.6, + "learning_rate": 1e-07, + "loss": 0.0086, + "reward": -0.41966925258748233, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.41966925258748233, + "reward_after_std": 0.5551957786083221, + "reward_before_mean": -0.22130390163511038, + "reward_before_std": 0.5660610608756542, + "reward_change_max": 0.0005140230059623718, + "reward_change_mean": -0.19836535304784775, + "reward_change_min": -0.41412150859832764, + "reward_change_std": 0.1705097910016775, + "reward_std": 0.5551958009600639, + "rewards/cosine_scaled_reward": -0.2669019568711519, + "rewards/format_reward": 0.31250001303851604, + "step": 5 + }, + { + "advantage_max": 1.353376865386963, + "advantage_mean": 2.545615063187512e-08, + "advantage_min": -0.5026888102293015, + "advantage_std": 0.6921062879264355, + "completion_length": 3013.5834045410156, + "epoch": 0.006857142857142857, + "grad_norm": 0.14181621372699738, + "kl": 4.999339580535889e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.2e-07, + "loss": 0.0561, + "reward": -0.324100736528635, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.324100736528635, + "reward_after_std": 0.6921062991023064, + "reward_before_mean": -0.1107131689786911, + "reward_before_std": 0.6496838089078665, + "reward_change_max": 0.0008455067873001099, + "reward_change_mean": -0.21338756661862135, + "reward_change_min": -0.3878238834440708, + "reward_change_std": 0.15879615675657988, + "reward_std": 0.6921063400804996, + "rewards/cosine_scaled_reward": -0.1907732579857111, + "rewards/format_reward": 0.27083333767950535, + "step": 6 + }, + { + "advantage_max": 1.4423074126243591, + "advantage_mean": 2.5766591998932498e-08, + "advantage_min": -0.8147472143173218, + "advantage_std": 0.8471020106226206, + "completion_length": 3183.5000915527344, + "epoch": 0.008, + "grad_norm": 0.15475912392139435, + "kl": 2.7105212211608887e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.4e-07, + "loss": 0.0486, + "reward": -0.029259571339935064, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.029259571339935064, + "reward_after_std": 0.8471020236611366, + "reward_before_mean": 0.34296554513275623, + "reward_before_std": 0.9372109081596136, + "reward_change_max": 0.0007103309035301208, + "reward_change_mean": -0.3722251045401208, + "reward_change_min": -0.8584106490015984, + "reward_change_std": 0.3660194616531953, + "reward_std": 0.8471020497381687, + "rewards/cosine_scaled_reward": -0.0993505665101111, + "rewards/format_reward": 0.5416666828095913, + "step": 7 + }, + { + "advantage_max": 1.623491793870926, + "advantage_mean": -1.8626449826975033e-09, + "advantage_min": -0.9304697960615158, + "advantage_std": 0.9237196668982506, + "completion_length": 2693.1667098999023, + "epoch": 0.009142857142857144, + "grad_norm": 0.15012045204639435, + "kl": 2.0734965801239014e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.6e-07, + "loss": 0.0278, + "reward": 0.279265059158206, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.279265059158206, + "reward_after_std": 0.9237196668982506, + "reward_before_mean": 0.8001202214509249, + "reward_before_std": 0.9343188181519508, + "reward_change_max": 0.0, + "reward_change_mean": -0.5208551697432995, + "reward_change_min": -0.9651905745267868, + "reward_change_std": 0.40238416008651257, + "reward_std": 0.9237196817994118, + "rewards/cosine_scaled_reward": 0.160476787481457, + "rewards/format_reward": 0.47916668094694614, + "step": 8 + }, + { + "advantage_max": 1.3903833664953709, + "advantage_mean": -5.898376481683343e-09, + "advantage_min": -0.7002911232411861, + "advantage_std": 0.7612915430217981, + "completion_length": 3116.7708740234375, + "epoch": 0.010285714285714285, + "grad_norm": 0.1323755532503128, + "kl": 3.580749034881592e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.8e-07, + "loss": 0.047, + "reward": -0.07552929408848286, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07552929408848286, + "reward_after_std": 0.7612915430217981, + "reward_before_mean": 0.2753769364207983, + "reward_before_std": 0.7574359718710184, + "reward_change_max": 0.0, + "reward_change_mean": -0.35090621933341026, + "reward_change_min": -0.6658867225050926, + "reward_change_std": 0.27499448135495186, + "reward_std": 0.7612915616482496, + "rewards/cosine_scaled_reward": -0.028978207614272833, + "rewards/format_reward": 0.3333333395421505, + "step": 9 + }, + { + "advantage_max": 1.6468712911009789, + "advantage_mean": 1.8626452269465688e-08, + "advantage_min": -0.6253786683082581, + "advantage_std": 0.8856963291764259, + "completion_length": 2866.791702270508, + "epoch": 0.011428571428571429, + "grad_norm": 0.13161250948905945, + "kl": 2.7856789529323578e-05, + "lambda_div_used": 0.6, + "learning_rate": 2e-07, + "loss": 0.0504, + "reward": -0.18678228557109833, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18678228557109833, + "reward_after_std": 0.8856963291764259, + "reward_before_mean": 0.07037698850035667, + "reward_before_std": 0.9093780852854252, + "reward_change_max": 0.00042964518070220947, + "reward_change_mean": -0.25715925730764866, + "reward_change_min": -0.6563235446810722, + "reward_change_std": 0.25602648686617613, + "reward_std": 0.8856963478028774, + "rewards/cosine_scaled_reward": -0.1314781814289745, + "rewards/format_reward": 0.33333334140479565, + "step": 10 + }, + { + "advantage_max": 1.1373428963124752, + "advantage_mean": 2.6387472956690416e-08, + "advantage_min": -0.5235589370131493, + "advantage_std": 0.6247740015387535, + "completion_length": 3366.125, + "epoch": 0.012571428571428572, + "grad_norm": 0.10979026556015015, + "kl": 3.491342067718506e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0204, + "reward": -0.3875628258101642, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3875628258101642, + "reward_after_std": 0.6247740034013987, + "reward_before_mean": -0.1830587424337864, + "reward_before_std": 0.653161009773612, + "reward_change_max": 0.0008605122566223145, + "reward_change_mean": -0.2045040603261441, + "reward_change_min": -0.4535864554345608, + "reward_change_std": 0.19186571810860187, + "reward_std": 0.6247740127146244, + "rewards/cosine_scaled_reward": -0.17486270423978567, + "rewards/format_reward": 0.16666666977107525, + "step": 11 + }, + { + "advantage_max": 1.6733285710215569, + "advantage_mean": -6.208817127095756e-09, + "advantage_min": -0.7657397910952568, + "advantage_std": 0.9073885902762413, + "completion_length": 2669.312515258789, + "epoch": 0.013714285714285714, + "grad_norm": 0.11108776926994324, + "kl": 4.035234451293945e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.4e-07, + "loss": 0.0246, + "reward": 0.05015108606312424, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05015108606312424, + "reward_after_std": 0.9073885828256607, + "reward_before_mean": 0.43837890587747097, + "reward_before_std": 0.901796817779541, + "reward_change_max": 0.001746349036693573, + "reward_change_mean": -0.38822780828922987, + "reward_change_min": -0.7632462754845619, + "reward_change_std": 0.29466503486037254, + "reward_std": 0.9073885828256607, + "rewards/cosine_scaled_reward": -0.06206055777147412, + "rewards/format_reward": 0.562500013038516, + "step": 12 + }, + { + "advantage_max": 1.2397056221961975, + "advantage_mean": 1.0554989993138975e-08, + "advantage_min": -0.5433732494711876, + "advantage_std": 0.6575523428618908, + "completion_length": 2878.437530517578, + "epoch": 0.014857142857142857, + "grad_norm": 0.09044164419174194, + "kl": 3.176182508468628e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.6e-07, + "loss": 0.0186, + "reward": -0.06514125317335129, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06514125317335129, + "reward_after_std": 0.6575523391366005, + "reward_before_mean": 0.3063340336084366, + "reward_before_std": 0.5916530340909958, + "reward_change_max": 0.0, + "reward_change_mean": -0.3714753072708845, + "reward_change_min": -0.6489017568528652, + "reward_change_std": 0.25239738542586565, + "reward_std": 0.657552357763052, + "rewards/cosine_scaled_reward": -0.06558297201991081, + "rewards/format_reward": 0.4375000074505806, + "step": 13 + }, + { + "advantage_max": 1.3778835982084274, + "advantage_mean": 2.4835269951672956e-09, + "advantage_min": -0.6107060834765434, + "advantage_std": 0.7284672744572163, + "completion_length": 3030.0833740234375, + "epoch": 0.016, + "grad_norm": 0.12247473001480103, + "kl": 3.055110573768616e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.8e-07, + "loss": 0.0256, + "reward": -0.21163499914109707, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21163499914109707, + "reward_after_std": 0.7284672446548939, + "reward_before_mean": 0.06481372565031052, + "reward_before_std": 0.7108139060437679, + "reward_change_max": 0.0006881281733512878, + "reward_change_mean": -0.2764487350359559, + "reward_change_min": -0.5137332789599895, + "reward_change_std": 0.20831829216331244, + "reward_std": 0.7284672893583775, + "rewards/cosine_scaled_reward": -0.12384314276278019, + "rewards/format_reward": 0.3125000074505806, + "step": 14 + }, + { + "advantage_max": 0.962163083255291, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.540690753608942, + "advantage_std": 0.5459639094769955, + "completion_length": 2782.9166831970215, + "epoch": 0.017142857142857144, + "grad_norm": 0.05843517929315567, + "kl": 2.6464462280273438e-05, + "lambda_div_used": 0.6, + "learning_rate": 3e-07, + "loss": 0.0283, + "reward": -0.02799556404352188, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.02799556404352188, + "reward_after_std": 0.54596390388906, + "reward_before_mean": 0.3959492538124323, + "reward_before_std": 0.5094255739822984, + "reward_change_max": 0.0006490200757980347, + "reward_change_mean": -0.42394478945061564, + "reward_change_min": -0.6969937682151794, + "reward_change_std": 0.284216845408082, + "reward_std": 0.5459639206528664, + "rewards/cosine_scaled_reward": 0.010474616661667824, + "rewards/format_reward": 0.3750000037252903, + "step": 15 + }, + { + "advantage_max": 0.548663005232811, + "advantage_mean": 2.9181441818515452e-08, + "advantage_min": -0.3422815389931202, + "advantage_std": 0.3159701582044363, + "completion_length": 3456.2708435058594, + "epoch": 0.018285714285714287, + "grad_norm": 0.0581691712141037, + "kl": 4.1425228118896484e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.2e-07, + "loss": 0.0218, + "reward": -0.6043956913053989, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.6043956913053989, + "reward_after_std": 0.31597016006708145, + "reward_before_mean": -0.4554057829082012, + "reward_before_std": 0.3371294569224119, + "reward_change_max": 0.002277083694934845, + "reward_change_mean": -0.1489899060688913, + "reward_change_min": -0.2999635115265846, + "reward_change_std": 0.1290613072924316, + "reward_std": 0.3159701693803072, + "rewards/cosine_scaled_reward": -0.2485362235456705, + "rewards/format_reward": 0.0416666679084301, + "step": 16 + }, + { + "advantage_max": 1.6136809401214123, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.9330508001148701, + "advantage_std": 0.9109026230871677, + "completion_length": 2188.1667098999023, + "epoch": 0.019428571428571427, + "grad_norm": 0.12020345032215118, + "kl": 3.744661808013916e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.4000000000000003e-07, + "loss": -0.0071, + "reward": 0.3284926589112729, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3284926589112729, + "reward_after_std": 0.9109026528894901, + "reward_before_mean": 0.8773855976760387, + "reward_before_std": 0.9138228315860033, + "reward_change_max": 0.0, + "reward_change_mean": -0.5488929115235806, + "reward_change_min": -0.981883842498064, + "reward_change_std": 0.39753549825400114, + "reward_std": 0.9109026566147804, + "rewards/cosine_scaled_reward": 0.10535944253206253, + "rewards/format_reward": 0.6666666716337204, + "step": 17 + }, + { + "advantage_max": 1.1283956617116928, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.6506418436765671, + "advantage_std": 0.6350066661834717, + "completion_length": 3028.291717529297, + "epoch": 0.02057142857142857, + "grad_norm": 0.11939737945795059, + "kl": 2.0876526832580566e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.6e-07, + "loss": 0.0498, + "reward": -0.245563886128366, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.245563886128366, + "reward_after_std": 0.6350066922605038, + "reward_before_mean": 0.040940748527646065, + "reward_before_std": 0.6616333723068237, + "reward_change_max": 0.00138884037733078, + "reward_change_mean": -0.2865046225488186, + "reward_change_min": -0.576061837375164, + "reward_change_std": 0.24392648972570896, + "reward_std": 0.6350067257881165, + "rewards/cosine_scaled_reward": -0.13577963784337044, + "rewards/format_reward": 0.31250000931322575, + "step": 18 + }, + { + "advantage_max": 1.8192770816385746, + "advantage_mean": -8.692343955729598e-09, + "advantage_min": -0.8841699659824371, + "advantage_std": 0.9976575020700693, + "completion_length": 2959.000030517578, + "epoch": 0.021714285714285714, + "grad_norm": 0.2074793130159378, + "kl": 2.925097942352295e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0619, + "reward": 0.20424228720366955, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20424228720366955, + "reward_after_std": 0.9976575020700693, + "reward_before_mean": 0.6615649559535086, + "reward_before_std": 1.0007536001503468, + "reward_change_max": 0.0005467459559440613, + "reward_change_mean": -0.45732265897095203, + "reward_change_min": -0.8358132503926754, + "reward_change_std": 0.358495632186532, + "reward_std": 0.9976575709879398, + "rewards/cosine_scaled_reward": 0.12244914239272475, + "rewards/format_reward": 0.4166666753590107, + "step": 19 + }, + { + "advantage_max": 1.7250901013612747, + "advantage_mean": -1.8005570590062803e-08, + "advantage_min": -0.7911568731069565, + "advantage_std": 0.9380798451602459, + "completion_length": 2311.916706085205, + "epoch": 0.022857142857142857, + "grad_norm": 0.13781826198101044, + "kl": 1.3463897630572319e-05, + "lambda_div_used": 0.6, + "learning_rate": 4e-07, + "loss": 0.055, + "reward": 0.33410761249251664, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33410761249251664, + "reward_after_std": 0.9380798451602459, + "reward_before_mean": 0.8728911895304918, + "reward_before_std": 0.8704607700929046, + "reward_change_max": 0.0, + "reward_change_mean": -0.538783598691225, + "reward_change_min": -0.9747267179191113, + "reward_change_std": 0.384306114166975, + "reward_std": 0.9380798749625683, + "rewards/cosine_scaled_reward": 0.08227891783462837, + "rewards/format_reward": 0.7083333488553762, + "step": 20 + }, + { + "advantage_max": 1.2891832739114761, + "advantage_mean": 1.8626453157644107e-09, + "advantage_min": -0.4525096267461777, + "advantage_std": 0.6711226403713226, + "completion_length": 2708.604202270508, + "epoch": 0.024, + "grad_norm": 0.08989676833152771, + "kl": 4.710257053375244e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0325, + "reward": -0.022732137236744165, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.022732137236744165, + "reward_after_std": 0.6711226366460323, + "reward_before_mean": 0.36769186705350876, + "reward_before_std": 0.5723638404160738, + "reward_change_max": 0.0, + "reward_change_mean": -0.3904240126721561, + "reward_change_min": -0.6175281815230846, + "reward_change_std": 0.2368222870863974, + "reward_std": 0.6711226478219032, + "rewards/cosine_scaled_reward": -0.03490406461060047, + "rewards/format_reward": 0.43750000558793545, + "step": 21 + }, + { + "advantage_max": 1.127735674381256, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.5973842702805996, + "advantage_std": 0.6144330948591232, + "completion_length": 1753.437515258789, + "epoch": 0.025142857142857144, + "grad_norm": 0.07954632490873337, + "kl": 1.9297003746032715e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0189, + "reward": 0.18515374744310975, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18515374744310975, + "reward_after_std": 0.6144330874085426, + "reward_before_mean": 0.707999374717474, + "reward_before_std": 0.525731585919857, + "reward_change_max": 0.0008831322193145752, + "reward_change_mean": -0.5228456184267998, + "reward_change_min": -0.8473505116999149, + "reward_change_std": 0.32211419753730297, + "reward_std": 0.6144330948591232, + "rewards/cosine_scaled_reward": -0.02100032288581133, + "rewards/format_reward": 0.7500000111758709, + "step": 22 + }, + { + "advantage_max": 1.9255974665284157, + "advantage_mean": 1.8005570145973593e-08, + "advantage_min": -0.7307319566607475, + "advantage_std": 0.9828944765031338, + "completion_length": 2248.062515258789, + "epoch": 0.026285714285714287, + "grad_norm": 0.14250506460666656, + "kl": 3.49581241607666e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.6e-07, + "loss": 0.0199, + "reward": 0.05501225683838129, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05501225683838129, + "reward_after_std": 0.9828944765031338, + "reward_before_mean": 0.4148938748985529, + "reward_before_std": 0.9027231149375439, + "reward_change_max": 0.002379797399044037, + "reward_change_mean": -0.359881600830704, + "reward_change_min": -0.6697551794350147, + "reward_change_std": 0.25272045843303204, + "reward_std": 0.9828945063054562, + "rewards/cosine_scaled_reward": -0.07380307232961059, + "rewards/format_reward": 0.5625000074505806, + "step": 23 + }, + { + "advantage_max": 1.5978022366762161, + "advantage_mean": -1.0554989549049765e-08, + "advantage_min": -0.925018347799778, + "advantage_std": 0.9181984178721905, + "completion_length": 2784.166732788086, + "epoch": 0.027428571428571427, + "grad_norm": 0.14002500474452972, + "kl": 1.7780810594558716e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.8e-07, + "loss": 0.0686, + "reward": 0.12083139270544052, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12083139270544052, + "reward_after_std": 0.9181984104216099, + "reward_before_mean": 0.5569423735141754, + "reward_before_std": 0.97861173376441, + "reward_change_max": 0.0005804151296615601, + "reward_change_mean": -0.4361109873279929, + "reward_change_min": -0.9359410665929317, + "reward_change_std": 0.37820393592119217, + "reward_std": 0.918198436498642, + "rewards/cosine_scaled_reward": 0.03888785373419523, + "rewards/format_reward": 0.4791666753590107, + "step": 24 + }, + { + "advantage_max": 1.394439235329628, + "advantage_mean": 4.346172199909404e-09, + "advantage_min": -0.758088156580925, + "advantage_std": 0.7881841994822025, + "completion_length": 2770.2083740234375, + "epoch": 0.02857142857142857, + "grad_norm": 0.13693547248840332, + "kl": 2.9962509870529175e-05, + "lambda_div_used": 0.6, + "learning_rate": 5e-07, + "loss": 0.0448, + "reward": -0.1243234477005899, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1243234477005899, + "reward_after_std": 0.7881842032074928, + "reward_before_mean": 0.19657514989376068, + "reward_before_std": 0.8378110863268375, + "reward_change_max": 0.0010121017694473267, + "reward_change_mean": -0.32089861761778593, + "reward_change_min": -0.7076228894293308, + "reward_change_std": 0.29621788300573826, + "reward_std": 0.7881842367351055, + "rewards/cosine_scaled_reward": -0.08921242598444223, + "rewards/format_reward": 0.3750000074505806, + "step": 25 + }, + { + "advantage_max": 1.2581812031567097, + "advantage_mean": 1.8626452158443385e-08, + "advantage_min": -0.6945818662643433, + "advantage_std": 0.6935589909553528, + "completion_length": 2960.2500762939453, + "epoch": 0.029714285714285714, + "grad_norm": 0.09827658534049988, + "kl": 3.141164779663086e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.2e-07, + "loss": 0.0366, + "reward": -0.0345622468739748, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0345622468739748, + "reward_after_std": 0.6935589741915464, + "reward_before_mean": 0.35348133370280266, + "reward_before_std": 0.6798408385366201, + "reward_change_max": 0.0019709691405296326, + "reward_change_mean": -0.3880435563623905, + "reward_change_min": -0.6920421347022057, + "reward_change_std": 0.28112479858100414, + "reward_std": 0.6935589928179979, + "rewards/cosine_scaled_reward": -0.05242600850760937, + "rewards/format_reward": 0.45833334140479565, + "step": 26 + }, + { + "advantage_max": 1.429284494370222, + "advantage_mean": -1.4901161526914564e-08, + "advantage_min": -0.7001686692237854, + "advantage_std": 0.803062416613102, + "completion_length": 3211.750030517578, + "epoch": 0.030857142857142857, + "grad_norm": 0.14858104288578033, + "kl": 2.596527338027954e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.4e-07, + "loss": 0.0311, + "reward": -0.09592162817716599, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09592162817716599, + "reward_after_std": 0.8030624315142632, + "reward_before_mean": 0.24063999578356743, + "reward_before_std": 0.8367564044892788, + "reward_change_max": 0.0014395639300346375, + "reward_change_mean": -0.3365616141818464, + "reward_change_min": -0.6913486272096634, + "reward_change_std": 0.30171436443924904, + "reward_std": 0.8030624389648438, + "rewards/cosine_scaled_reward": -0.08801335096359253, + "rewards/format_reward": 0.41666667349636555, + "step": 27 + }, + { + "advantage_max": 1.9752441011369228, + "advantage_mean": -1.490116136038111e-08, + "advantage_min": -0.8696260899305344, + "advantage_std": 1.0674383416771889, + "completion_length": 2980.5833740234375, + "epoch": 0.032, + "grad_norm": 0.1666395366191864, + "kl": 2.1807849407196045e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.6e-07, + "loss": 0.0266, + "reward": 0.13897611014544964, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13897611014544964, + "reward_after_std": 1.06743835657835, + "reward_before_mean": 0.544797046110034, + "reward_before_std": 1.0800247061997652, + "reward_change_max": 0.0, + "reward_change_mean": -0.4058209341019392, + "reward_change_min": -0.8129916898906231, + "reward_change_std": 0.3283620811998844, + "reward_std": 1.067438393831253, + "rewards/cosine_scaled_reward": 0.05364852462662384, + "rewards/format_reward": 0.4375000074505806, + "step": 28 + }, + { + "advantage_max": 1.1171382665634155, + "advantage_mean": 4.967054101356894e-09, + "advantage_min": -0.5580497160553932, + "advantage_std": 0.620572954416275, + "completion_length": 3343.291717529297, + "epoch": 0.03314285714285714, + "grad_norm": 0.11576099693775177, + "kl": 1.6135862097144127e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.8e-07, + "loss": 0.0468, + "reward": -0.3974270708858967, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3974270708858967, + "reward_after_std": 0.6205729730427265, + "reward_before_mean": -0.19575086934491992, + "reward_before_std": 0.6581322774291039, + "reward_change_max": 0.0009334981441497803, + "reward_change_mean": -0.20167620666325092, + "reward_change_min": -0.5106723494827747, + "reward_change_std": 0.20898526348173618, + "reward_std": 0.6205730102956295, + "rewards/cosine_scaled_reward": -0.18120876979082823, + "rewards/format_reward": 0.1666666679084301, + "step": 29 + }, + { + "advantage_max": 1.5233756005764008, + "advantage_mean": -1.9868215850316062e-08, + "advantage_min": -0.8757916316390038, + "advantage_std": 0.8625713251531124, + "completion_length": 3079.1459045410156, + "epoch": 0.03428571428571429, + "grad_norm": 0.14153487980365753, + "kl": 1.2964010238647461e-05, + "lambda_div_used": 0.6, + "learning_rate": 6e-07, + "loss": 0.0369, + "reward": 0.13468949683010578, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13468949683010578, + "reward_after_std": 0.8625713251531124, + "reward_before_mean": 0.5874797366559505, + "reward_before_std": 0.8919285908341408, + "reward_change_max": 0.00054217129945755, + "reward_change_mean": -0.4527902854606509, + "reward_change_min": -0.9095434695482254, + "reward_change_std": 0.36228089965879917, + "reward_std": 0.8625713437795639, + "rewards/cosine_scaled_reward": 0.0749898748472333, + "rewards/format_reward": 0.43750001676380634, + "step": 30 + }, + { + "advantage_max": 1.368198987096548, + "advantage_mean": -5.587935447692871e-09, + "advantage_min": -0.773968905210495, + "advantage_std": 0.7986135166138411, + "completion_length": 2872.312515258789, + "epoch": 0.03542857142857143, + "grad_norm": 0.0970737561583519, + "kl": 1.2390315532684326e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.2e-07, + "loss": -0.0173, + "reward": -0.04627075418829918, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.04627075418829918, + "reward_after_std": 0.7986135128885508, + "reward_before_mean": 0.32107847463339567, + "reward_before_std": 0.8681342117488384, + "reward_change_max": 0.0013233870267868042, + "reward_change_mean": -0.36734923627227545, + "reward_change_min": -0.8485586605966091, + "reward_change_std": 0.3394074449315667, + "reward_std": 0.7986135166138411, + "rewards/cosine_scaled_reward": -0.03737742733210325, + "rewards/format_reward": 0.3958333358168602, + "step": 31 + }, + { + "advantage_max": 1.283915750682354, + "advantage_mean": 2.1730860499946658e-08, + "advantage_min": -0.5879904553294182, + "advantage_std": 0.6885505132377148, + "completion_length": 2963.0208740234375, + "epoch": 0.036571428571428574, + "grad_norm": 0.11873957514762878, + "kl": 2.668425440788269e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.4e-07, + "loss": 0.0583, + "reward": 0.021974913775920868, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.021974913775920868, + "reward_after_std": 0.6885505355894566, + "reward_before_mean": 0.4379478101618588, + "reward_before_std": 0.6379991136491299, + "reward_change_max": 0.0, + "reward_change_mean": -0.4159728898666799, + "reward_change_min": -0.7113733068108559, + "reward_change_std": 0.2826508111320436, + "reward_std": 0.6885505504906178, + "rewards/cosine_scaled_reward": -0.020609423518180847, + "rewards/format_reward": 0.4791666753590107, + "step": 32 + }, + { + "advantage_max": 1.6484842039644718, + "advantage_mean": 4.967054045845742e-09, + "advantage_min": -0.7562173083424568, + "advantage_std": 0.9086056165397167, + "completion_length": 3314.9791870117188, + "epoch": 0.037714285714285714, + "grad_norm": 0.12317641079425812, + "kl": 3.3482909202575684e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.6e-07, + "loss": 0.0104, + "reward": -0.08508192864246666, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08508192864246666, + "reward_after_std": 0.9086056537926197, + "reward_before_mean": 0.23197847977280617, + "reward_before_std": 0.9516045339405537, + "reward_change_max": 0.0015941113233566284, + "reward_change_mean": -0.31706040538847446, + "reward_change_min": -0.7498720176517963, + "reward_change_std": 0.30978111177682877, + "reward_std": 0.9086056780070066, + "rewards/cosine_scaled_reward": -0.0506774433888495, + "rewards/format_reward": 0.3333333395421505, + "step": 33 + }, + { + "advantage_max": 1.4823268465697765, + "advantage_mean": -2.483526928553914e-08, + "advantage_min": -0.8470183648169041, + "advantage_std": 0.8238020017743111, + "completion_length": 2463.958351135254, + "epoch": 0.038857142857142854, + "grad_norm": 0.10921584069728851, + "kl": 2.8777867555618286e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0203, + "reward": 0.26816181279718876, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26816181279718876, + "reward_after_std": 0.8238019905984402, + "reward_before_mean": 0.7997418083250523, + "reward_before_std": 0.7961917296051979, + "reward_change_max": 1.0728836059570312e-05, + "reward_change_mean": -0.5315799824893475, + "reward_change_min": -0.9185572080314159, + "reward_change_std": 0.3667104300111532, + "reward_std": 0.8238020315766335, + "rewards/cosine_scaled_reward": 0.12903754762373865, + "rewards/format_reward": 0.5416666679084301, + "step": 34 + }, + { + "advantage_max": 1.7259946167469025, + "advantage_mean": 8.071462387349015e-09, + "advantage_min": -0.6502956449985504, + "advantage_std": 0.8964182175695896, + "completion_length": 3076.020854949951, + "epoch": 0.04, + "grad_norm": 0.1264086663722992, + "kl": 5.2697956562042236e-05, + "lambda_div_used": 0.6, + "learning_rate": 7e-07, + "loss": 0.0334, + "reward": -0.22558780387043953, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22558780387043953, + "reward_after_std": 0.8964182436466217, + "reward_before_mean": 0.004136897623538971, + "reward_before_std": 0.8808662742376328, + "reward_change_max": 0.0007063820958137512, + "reward_change_mean": -0.22972470242530107, + "reward_change_min": -0.5383105166256428, + "reward_change_std": 0.2124389884993434, + "reward_std": 0.8964182622730732, + "rewards/cosine_scaled_reward": -0.14376488840207458, + "rewards/format_reward": 0.2916666716337204, + "step": 35 + }, + { + "advantage_max": 0.8015561476349831, + "advantage_mean": 3.725290464995368e-09, + "advantage_min": -0.3946578651666641, + "advantage_std": 0.44071637094020844, + "completion_length": 3414.0208435058594, + "epoch": 0.04114285714285714, + "grad_norm": 0.07707090675830841, + "kl": 4.0883664041757584e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.2e-07, + "loss": 0.012, + "reward": -0.5596103649586439, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5596103649586439, + "reward_after_std": 0.44071637466549873, + "reward_before_mean": -0.41469016298651695, + "reward_before_std": 0.4619513005018234, + "reward_change_max": 0.0, + "reward_change_mean": -0.14492022106423974, + "reward_change_min": -0.3504052981734276, + "reward_change_std": 0.14387890603393316, + "reward_std": 0.44071637839078903, + "rewards/cosine_scaled_reward": -0.2802617456763983, + "rewards/format_reward": 0.14583333767950535, + "step": 36 + }, + { + "advantage_max": 0.782890573143959, + "advantage_mean": 3.476937748825293e-08, + "advantage_min": -0.3986804038286209, + "advantage_std": 0.42814368568360806, + "completion_length": 3353.9166870117188, + "epoch": 0.04228571428571429, + "grad_norm": 0.07412921637296677, + "kl": 2.1278858184814453e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.4e-07, + "loss": 0.0032, + "reward": -0.49189055524766445, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.49189055524766445, + "reward_after_std": 0.4281436949968338, + "reward_before_mean": -0.30478277802467346, + "reward_before_std": 0.42716532200574875, + "reward_change_max": 0.0016405805945396423, + "reward_change_mean": -0.18710775269755686, + "reward_change_min": -0.38063835352659225, + "reward_change_std": 0.14960275805788115, + "reward_std": 0.4281436949968338, + "rewards/cosine_scaled_reward": -0.25655805692076683, + "rewards/format_reward": 0.2083333358168602, + "step": 37 + }, + { + "advantage_max": 1.234726544469595, + "advantage_mean": 4.9670536017565325e-09, + "advantage_min": -0.5854838155210018, + "advantage_std": 0.6552534718066454, + "completion_length": 3217.9791870117188, + "epoch": 0.04342857142857143, + "grad_norm": 0.12655900418758392, + "kl": 4.662945866584778e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0154, + "reward": -0.19940327107906342, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19940327107906342, + "reward_after_std": 0.655253479257226, + "reward_before_mean": 0.09616225806530565, + "reward_before_std": 0.6347245592623949, + "reward_change_max": 0.0019334778189659119, + "reward_change_mean": -0.2955655427649617, + "reward_change_min": -0.5166320390999317, + "reward_change_std": 0.21207730285823345, + "reward_std": 0.6552534829825163, + "rewards/cosine_scaled_reward": -0.05608554696664214, + "rewards/format_reward": 0.20833333395421505, + "step": 38 + }, + { + "advantage_max": 0.8507803976535797, + "advantage_mean": 4.9670543234014986e-09, + "advantage_min": -0.5066513679921627, + "advantage_std": 0.49880874902009964, + "completion_length": 2870.875011444092, + "epoch": 0.044571428571428574, + "grad_norm": 0.06974208354949951, + "kl": 2.384372055530548e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.799999999999999e-07, + "loss": 0.03, + "reward": -0.16522181406617165, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.16522181406617165, + "reward_after_std": 0.49880874156951904, + "reward_before_mean": 0.19637863337993622, + "reward_before_std": 0.48516307689715177, + "reward_change_max": 0.00154896080493927, + "reward_change_mean": -0.36160044465214014, + "reward_change_min": -0.6046626195311546, + "reward_change_std": 0.2663704315200448, + "reward_std": 0.49880874156951904, + "rewards/cosine_scaled_reward": -0.12056069076061249, + "rewards/format_reward": 0.4375000074505806, + "step": 39 + }, + { + "advantage_max": 1.5168475210666656, + "advantage_mean": 1.862645371275562e-09, + "advantage_min": -0.5961493253707886, + "advantage_std": 0.8030089661478996, + "completion_length": 2627.8333587646484, + "epoch": 0.045714285714285714, + "grad_norm": 0.1441635638475418, + "kl": 0.0001430148258805275, + "lambda_div_used": 0.6, + "learning_rate": 8e-07, + "loss": 0.0472, + "reward": -0.02972988225519657, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.02972988225519657, + "reward_after_std": 0.8030089475214481, + "reward_before_mean": 0.3298732750117779, + "reward_before_std": 0.7454587928950787, + "reward_change_max": 0.0009643435478210449, + "reward_change_mean": -0.35960315354168415, + "reward_change_min": -0.7189357168972492, + "reward_change_std": 0.27363201417028904, + "reward_std": 0.8030089773237705, + "rewards/cosine_scaled_reward": -0.06423003599047661, + "rewards/format_reward": 0.4583333395421505, + "step": 40 + }, + { + "advantage_max": 1.5067855417728424, + "advantage_mean": 1.9247333948868572e-08, + "advantage_min": -0.6676128581166267, + "advantage_std": 0.813670065253973, + "completion_length": 3086.3959045410156, + "epoch": 0.046857142857142854, + "grad_norm": 0.1339729130268097, + "kl": 3.9830803871154785e-05, + "lambda_div_used": 0.6, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0698, + "reward": -0.2257232129049953, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2257232129049953, + "reward_after_std": 0.8136700578033924, + "reward_before_mean": 0.026448657736182213, + "reward_before_std": 0.8387161567807198, + "reward_change_max": 0.0006530433893203735, + "reward_change_mean": -0.2521718628704548, + "reward_change_min": -0.6511830650269985, + "reward_change_std": 0.24910242576152086, + "reward_std": 0.8136700727045536, + "rewards/cosine_scaled_reward": -0.15344234509393573, + "rewards/format_reward": 0.33333334140479565, + "step": 41 + }, + { + "advantage_max": 0.8451377004384995, + "advantage_mean": 1.6763806787167823e-08, + "advantage_min": -0.3755408897995949, + "advantage_std": 0.4480682760477066, + "completion_length": 2820.666717529297, + "epoch": 0.048, + "grad_norm": 0.061046577990055084, + "kl": 4.439055919647217e-05, + "lambda_div_used": 0.6, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0256, + "reward": -0.43968756031244993, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.43968756031244993, + "reward_after_std": 0.4480682760477066, + "reward_before_mean": -0.23473063483834267, + "reward_before_std": 0.41953370813280344, + "reward_change_max": 0.0012653842568397522, + "reward_change_mean": -0.204956928268075, + "reward_change_min": -0.38732779771089554, + "reward_change_std": 0.15360154025256634, + "reward_std": 0.4480682946741581, + "rewards/cosine_scaled_reward": -0.2736153192818165, + "rewards/format_reward": 0.31250000186264515, + "step": 42 + }, + { + "advantage_max": 1.198561392724514, + "advantage_mean": 1.3659398168108794e-08, + "advantage_min": -0.6220279037952423, + "advantage_std": 0.6821011230349541, + "completion_length": 3063.6875762939453, + "epoch": 0.04914285714285714, + "grad_norm": 0.12261078506708145, + "kl": 7.574260234832764e-05, + "lambda_div_used": 0.6, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0814, + "reward": -0.2531822435557842, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2531822435557842, + "reward_after_std": 0.6821011155843735, + "reward_before_mean": 0.01838504709303379, + "reward_before_std": 0.7341145165264606, + "reward_change_max": 0.001272149384021759, + "reward_change_mean": -0.27156728971749544, + "reward_change_min": -0.6277378126978874, + "reward_change_std": 0.2601107247173786, + "reward_std": 0.6821011193096638, + "rewards/cosine_scaled_reward": -0.1262241369113326, + "rewards/format_reward": 0.2708333358168602, + "step": 43 + }, + { + "advantage_max": 1.5276354625821114, + "advantage_mean": -8.07146305348283e-09, + "advantage_min": -0.7380058616399765, + "advantage_std": 0.8330572284758091, + "completion_length": 2725.2083740234375, + "epoch": 0.05028571428571429, + "grad_norm": 0.17938436567783356, + "kl": 0.00020732451230287552, + "lambda_div_used": 0.6, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0918, + "reward": -0.06681879423558712, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06681879423558712, + "reward_after_std": 0.8330572471022606, + "reward_before_mean": 0.27059781178832054, + "reward_before_std": 0.8473841995000839, + "reward_change_max": 0.003085687756538391, + "reward_change_mean": -0.33741660602390766, + "reward_change_min": -0.7176680006086826, + "reward_change_std": 0.28439064137637615, + "reward_std": 0.8330572918057442, + "rewards/cosine_scaled_reward": -0.06261776690371335, + "rewards/format_reward": 0.3958333395421505, + "step": 44 + }, + { + "advantage_max": 1.1212849467992783, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.4997914358973503, + "advantage_std": 0.6178267244249582, + "completion_length": 3435.9791870117188, + "epoch": 0.05142857142857143, + "grad_norm": 0.10308519750833511, + "kl": 8.343835361301899e-05, + "lambda_div_used": 0.6, + "learning_rate": 9e-07, + "loss": 0.0177, + "reward": -0.4351994302123785, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4351994302123785, + "reward_after_std": 0.6178267393261194, + "reward_before_mean": -0.25422532111406326, + "reward_before_std": 0.6577106714248657, + "reward_change_max": 0.0012275278568267822, + "reward_change_mean": -0.18097413005307317, + "reward_change_min": -0.5485908165574074, + "reward_change_std": 0.20650537125766277, + "reward_std": 0.6178267672657967, + "rewards/cosine_scaled_reward": -0.21044599390006624, + "rewards/format_reward": 0.1666666716337204, + "step": 45 + }, + { + "advantage_max": 0.9788857847452164, + "advantage_mean": 1.3659398612198004e-08, + "advantage_min": -0.3992095962166786, + "advantage_std": 0.5113865826278925, + "completion_length": 3239.3125, + "epoch": 0.052571428571428575, + "grad_norm": 0.08502691239118576, + "kl": 0.00022674724459648132, + "lambda_div_used": 0.6, + "learning_rate": 9.2e-07, + "loss": -0.0064, + "reward": -0.4699201360344887, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4699201360344887, + "reward_after_std": 0.5113865602761507, + "reward_before_mean": -0.29295278526842594, + "reward_before_std": 0.48858173191547394, + "reward_change_max": 0.0021085962653160095, + "reward_change_mean": -0.1769673554226756, + "reward_change_min": -0.3407178223133087, + "reward_change_std": 0.1363795893266797, + "reward_std": 0.511386577039957, + "rewards/cosine_scaled_reward": -0.22980972193181515, + "rewards/format_reward": 0.1666666679084301, + "step": 46 + }, + { + "advantage_max": 1.38753230124712, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.8971096090972424, + "advantage_std": 0.831636069342494, + "completion_length": 2720.6667251586914, + "epoch": 0.053714285714285714, + "grad_norm": 0.11728230863809586, + "kl": 0.00011475756764411926, + "lambda_div_used": 0.6, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0287, + "reward": 0.16340744495391846, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16340744495391846, + "reward_after_std": 0.831636056303978, + "reward_before_mean": 0.6462972527369857, + "reward_before_std": 0.9100265484303236, + "reward_change_max": 0.000314466655254364, + "reward_change_mean": -0.482889830134809, + "reward_change_min": -0.8927135579288006, + "reward_change_std": 0.4036479415372014, + "reward_std": 0.831636093556881, + "rewards/cosine_scaled_reward": 0.05231529846787453, + "rewards/format_reward": 0.5416666734963655, + "step": 47 + }, + { + "advantage_max": 1.1383882090449333, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.5719936788082123, + "advantage_std": 0.62079693749547, + "completion_length": 2835.3333740234375, + "epoch": 0.054857142857142854, + "grad_norm": 0.10334164649248123, + "kl": 0.0005565360188484192, + "lambda_div_used": 0.6, + "learning_rate": 9.6e-07, + "loss": 0.0129, + "reward": -0.1817401812877506, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1817401812877506, + "reward_after_std": 0.6207969449460506, + "reward_before_mean": 0.13473353162407875, + "reward_before_std": 0.6070270985364914, + "reward_change_max": 0.002810366451740265, + "reward_change_mean": -0.3164736973121762, + "reward_change_min": -0.6078220754861832, + "reward_change_std": 0.23711966536939144, + "reward_std": 0.6207969635725021, + "rewards/cosine_scaled_reward": -0.1305499104782939, + "rewards/format_reward": 0.39583333767950535, + "step": 48 + }, + { + "advantage_max": 1.3860639333724976, + "advantage_mean": 1.4280280069556284e-08, + "advantage_min": -0.6047784183174372, + "advantage_std": 0.7415264807641506, + "completion_length": 2556.1042098999023, + "epoch": 0.056, + "grad_norm": 0.10148799419403076, + "kl": 0.0002539954148232937, + "lambda_div_used": 0.6, + "learning_rate": 9.8e-07, + "loss": 0.058, + "reward": -0.21707582101225853, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21707582101225853, + "reward_after_std": 0.7415264658629894, + "reward_before_mean": 0.05638875346630812, + "reward_before_std": 0.7281204629689455, + "reward_change_max": 0.00105363130569458, + "reward_change_mean": -0.2734645586460829, + "reward_change_min": -0.574739396572113, + "reward_change_std": 0.23461730778217316, + "reward_std": 0.7415264882147312, + "rewards/cosine_scaled_reward": -0.18013896653428674, + "rewards/format_reward": 0.4166666716337204, + "step": 49 + }, + { + "advantage_max": 1.3879567012190819, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.6811681613326073, + "advantage_std": 0.7566171064972878, + "completion_length": 2899.0625228881836, + "epoch": 0.05714285714285714, + "grad_norm": 0.10399441421031952, + "kl": 0.0004333890974521637, + "lambda_div_used": 0.6, + "learning_rate": 1e-06, + "loss": 0.0508, + "reward": -0.027900653425604105, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.027900653425604105, + "reward_after_std": 0.756617084145546, + "reward_before_mean": 0.3487557955086231, + "reward_before_std": 0.7330795712769032, + "reward_change_max": 0.002227097749710083, + "reward_change_mean": -0.3766564065590501, + "reward_change_min": -0.6959068775177002, + "reward_change_std": 0.29322480224072933, + "reward_std": 0.756617110222578, + "rewards/cosine_scaled_reward": 0.007711221929639578, + "rewards/format_reward": 0.3333333395421505, + "step": 50 + }, + { + "advantage_max": 1.3180744349956512, + "advantage_mean": -2.1109978987077227e-08, + "advantage_min": -0.5348929353058338, + "advantage_std": 0.6934761293232441, + "completion_length": 2412.520866394043, + "epoch": 0.05828571428571429, + "grad_norm": 0.11199980974197388, + "kl": 0.001006007194519043, + "lambda_div_used": 0.6, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0196, + "reward": -0.08373632282018661, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08373632282018661, + "reward_after_std": 0.6934761442244053, + "reward_before_mean": 0.2698266333900392, + "reward_before_std": 0.6321978941559792, + "reward_change_max": 7.683038711547852e-05, + "reward_change_mean": -0.3535629725083709, + "reward_change_min": -0.6554724425077438, + "reward_change_std": 0.2407732205465436, + "reward_std": 0.6934761591255665, + "rewards/cosine_scaled_reward": -0.115086690755561, + "rewards/format_reward": 0.5000000037252903, + "step": 51 + }, + { + "advantage_max": 1.5817754976451397, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.6358248367905617, + "advantage_std": 0.8387469574809074, + "completion_length": 2993.6875534057617, + "epoch": 0.05942857142857143, + "grad_norm": 0.11231490969657898, + "kl": 0.001201428472995758, + "lambda_div_used": 0.6, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0178, + "reward": -0.18798981048166752, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18798981048166752, + "reward_after_std": 0.8387469500303268, + "reward_before_mean": 0.07809478137642145, + "reward_before_std": 0.8421785943210125, + "reward_change_max": 0.001999780535697937, + "reward_change_mean": -0.266084595117718, + "reward_change_min": -0.5899167135357857, + "reward_change_std": 0.22805576538667083, + "reward_std": 0.8387469574809074, + "rewards/cosine_scaled_reward": -0.11720261455047876, + "rewards/format_reward": 0.31250000558793545, + "step": 52 + }, + { + "advantage_max": 1.4530688673257828, + "advantage_mean": -4.967053657267684e-09, + "advantage_min": -0.6794195547699928, + "advantage_std": 0.7901476360857487, + "completion_length": 2776.8958587646484, + "epoch": 0.060571428571428575, + "grad_norm": 0.10073976218700409, + "kl": 0.0005531087517738342, + "lambda_div_used": 0.6, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0231, + "reward": 0.1613441277295351, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1613441277295351, + "reward_after_std": 0.7901476416736841, + "reward_before_mean": 0.6379873100668192, + "reward_before_std": 0.7355969715863466, + "reward_change_max": 0.0003918856382369995, + "reward_change_mean": -0.47664317348971963, + "reward_change_min": -0.8067650347948074, + "reward_change_std": 0.32972801849246025, + "reward_std": 0.7901476863771677, + "rewards/cosine_scaled_reward": 0.04816031642258167, + "rewards/format_reward": 0.5416666697710752, + "step": 53 + }, + { + "advantage_max": 1.3110667504370213, + "advantage_mean": -1.8005569923928988e-08, + "advantage_min": -0.7371912263333797, + "advantage_std": 0.7567489556968212, + "completion_length": 2940.541732788086, + "epoch": 0.061714285714285715, + "grad_norm": 0.13281145691871643, + "kl": 0.0002486109733581543, + "lambda_div_used": 0.6, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0728, + "reward": 0.1529210014268756, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1529210014268756, + "reward_after_std": 0.7567489566281438, + "reward_before_mean": 0.6375395655632019, + "reward_before_std": 0.7690273942425847, + "reward_change_max": 0.002032041549682617, + "reward_change_mean": -0.48461859254166484, + "reward_change_min": -0.8870695792138577, + "reward_change_std": 0.3667809630278498, + "reward_std": 0.7567489612847567, + "rewards/cosine_scaled_reward": 0.07918643951416016, + "rewards/format_reward": 0.4791666716337204, + "step": 54 + }, + { + "advantage_max": 1.3867616951465607, + "advantage_mean": 1.2417635808503746e-09, + "advantage_min": -0.6409769840538502, + "advantage_std": 0.7622390799224377, + "completion_length": 3007.229217529297, + "epoch": 0.06285714285714286, + "grad_norm": 0.11522038280963898, + "kl": 0.0010178424417972565, + "lambda_div_used": 0.6, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0388, + "reward": -0.011176850646734238, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.011176850646734238, + "reward_after_std": 0.7622390948235989, + "reward_before_mean": 0.37410153821110725, + "reward_before_std": 0.7458370700478554, + "reward_change_max": 0.0006311237812042236, + "reward_change_mean": -0.3852783814072609, + "reward_change_min": -0.6944508105516434, + "reward_change_std": 0.2901465371251106, + "reward_std": 0.7622391171753407, + "rewards/cosine_scaled_reward": -0.00044924020767211914, + "rewards/format_reward": 0.37500000558793545, + "step": 55 + }, + { + "advantage_max": 1.076439805328846, + "advantage_mean": 1.30385160446167e-08, + "advantage_min": -0.6210877783596516, + "advantage_std": 0.6181384474039078, + "completion_length": 3061.854217529297, + "epoch": 0.064, + "grad_norm": 0.10272736847400665, + "kl": 0.0003896951675415039, + "lambda_div_used": 0.6, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0233, + "reward": -0.15545228496193886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15545228496193886, + "reward_after_std": 0.618138462305069, + "reward_before_mean": 0.1853547152131796, + "reward_before_std": 0.6448741275817156, + "reward_change_max": 0.0, + "reward_change_mean": -0.3408069796860218, + "reward_change_min": -0.6736448742449284, + "reward_change_std": 0.27636164613068104, + "reward_std": 0.6181384846568108, + "rewards/cosine_scaled_reward": -0.09482264146208763, + "rewards/format_reward": 0.37500000558793545, + "step": 56 + }, + { + "advantage_max": 1.1007253751158714, + "advantage_mean": 1.9868215073159945e-08, + "advantage_min": -0.5071723088622093, + "advantage_std": 0.5871976688504219, + "completion_length": 3461.2083435058594, + "epoch": 0.06514285714285714, + "grad_norm": 0.08488260954618454, + "kl": 0.0002168715000152588, + "lambda_div_used": 0.6, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0112, + "reward": -0.39496668986976147, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.39496668986976147, + "reward_after_std": 0.5871976688504219, + "reward_before_mean": -0.18953094072639942, + "reward_before_std": 0.5871923677623272, + "reward_change_max": 0.00031384825706481934, + "reward_change_mean": -0.2054357398301363, + "reward_change_min": -0.42640750110149384, + "reward_change_std": 0.17269728146493435, + "reward_std": 0.5871977023780346, + "rewards/cosine_scaled_reward": -0.19893213408067822, + "rewards/format_reward": 0.20833334140479565, + "step": 57 + }, + { + "advantage_max": 1.5824204310774803, + "advantage_mean": -4.03573105489663e-09, + "advantage_min": -0.8778524771332741, + "advantage_std": 0.8872012719511986, + "completion_length": 2214.7917251586914, + "epoch": 0.06628571428571428, + "grad_norm": 0.13823264837265015, + "kl": 0.0023623108863830566, + "lambda_div_used": 0.6, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0361, + "reward": 0.3328779856674373, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3328779856674373, + "reward_after_std": 0.887201264500618, + "reward_before_mean": 0.8885044003836811, + "reward_before_std": 0.8764373175799847, + "reward_change_max": 9.519606828689575e-05, + "reward_change_mean": -0.5556264445185661, + "reward_change_min": -1.002270046621561, + "reward_change_std": 0.39555412344634533, + "reward_std": 0.8872012794017792, + "rewards/cosine_scaled_reward": 0.0692522106692195, + "rewards/format_reward": 0.7500000074505806, + "step": 58 + }, + { + "advantage_max": 1.0460245832800865, + "advantage_mean": -3.104408341503273e-09, + "advantage_min": -0.6583354324102402, + "advantage_std": 0.6132517829537392, + "completion_length": 2964.625030517578, + "epoch": 0.06742857142857143, + "grad_norm": 0.11187250167131424, + "kl": 0.0005678483285009861, + "lambda_div_used": 0.6, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0396, + "reward": -0.23224904853850603, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23224904853850603, + "reward_after_std": 0.613251805305481, + "reward_before_mean": 0.06915994361042976, + "reward_before_std": 0.6662236377596855, + "reward_change_max": 0.0012992247939109802, + "reward_change_mean": -0.30140898609533906, + "reward_change_min": -0.6153623089194298, + "reward_change_std": 0.2672270992770791, + "reward_std": 0.6132518127560616, + "rewards/cosine_scaled_reward": -0.13208669982850552, + "rewards/format_reward": 0.33333334140479565, + "step": 59 + }, + { + "advantage_max": 1.3265932314097881, + "advantage_mean": 1.8005570145973593e-08, + "advantage_min": -0.5170855298638344, + "advantage_std": 0.7021647803485394, + "completion_length": 3072.6041870117188, + "epoch": 0.06857142857142857, + "grad_norm": 0.10385601967573166, + "kl": 0.0005446523427963257, + "lambda_div_used": 0.6, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0085, + "reward": -0.2865824941545725, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2865824941545725, + "reward_after_std": 0.7021647542715073, + "reward_before_mean": -0.047466689720749855, + "reward_before_std": 0.6895656269043684, + "reward_change_max": 0.001022636890411377, + "reward_change_mean": -0.23911579558625817, + "reward_change_min": -0.5408961176872253, + "reward_change_std": 0.20852595707401633, + "reward_std": 0.7021647542715073, + "rewards/cosine_scaled_reward": -0.19040001556277275, + "rewards/format_reward": 0.3333333358168602, + "step": 60 + }, + { + "advantage_max": 1.3558772429823875, + "advantage_mean": 9.93410786964688e-09, + "advantage_min": -0.5892483256757259, + "advantage_std": 0.7217530123889446, + "completion_length": 3244.791748046875, + "epoch": 0.06971428571428571, + "grad_norm": 0.1278354525566101, + "kl": 0.0008572190999984741, + "lambda_div_used": 0.6, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0538, + "reward": -0.23935077455826104, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23935077455826104, + "reward_after_std": 0.7217530133202672, + "reward_before_mean": 0.02287537232041359, + "reward_before_std": 0.7123109959065914, + "reward_change_max": 0.0023524612188339233, + "reward_change_mean": -0.26222612289711833, + "reward_change_min": -0.5236610397696495, + "reward_change_std": 0.217988062184304, + "reward_std": 0.721753029152751, + "rewards/cosine_scaled_reward": -0.1760623399168253, + "rewards/format_reward": 0.3750000074505806, + "step": 61 + }, + { + "advantage_max": 1.7632023394107819, + "advantage_mean": -4.656613122877573e-09, + "advantage_min": -0.9446415901184082, + "advantage_std": 1.0154646262526512, + "completion_length": 2728.5625228881836, + "epoch": 0.07085714285714285, + "grad_norm": 0.12908360362052917, + "kl": 0.0017460063099861145, + "lambda_div_used": 0.6, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0681, + "reward": 0.21019299514591694, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21019299514591694, + "reward_after_std": 1.01546461135149, + "reward_before_mean": 0.6783958990126848, + "reward_before_std": 1.1028874181210995, + "reward_change_max": 0.0018347501754760742, + "reward_change_mean": -0.4682029206305742, + "reward_change_min": -0.968503512442112, + "reward_change_std": 0.4169737081974745, + "reward_std": 1.015464648604393, + "rewards/cosine_scaled_reward": 0.047531288117170334, + "rewards/format_reward": 0.5833333395421505, + "step": 62 + }, + { + "advantage_max": 1.3256918042898178, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.7518532276153564, + "advantage_std": 0.7779847048223019, + "completion_length": 2488.1458740234375, + "epoch": 0.072, + "grad_norm": 0.14880503714084625, + "kl": 0.002031862735748291, + "lambda_div_used": 0.6, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0716, + "reward": 0.16536018857732415, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16536018857732415, + "reward_after_std": 0.7779847234487534, + "reward_before_mean": 0.6565921977162361, + "reward_before_std": 0.7892885077744722, + "reward_change_max": 0.00045865029096603394, + "reward_change_mean": -0.49123201705515385, + "reward_change_min": -0.9217961169779301, + "reward_change_std": 0.3878398798406124, + "reward_std": 0.7779847532510757, + "rewards/cosine_scaled_reward": 0.005379423499107361, + "rewards/format_reward": 0.6458333358168602, + "step": 63 + }, + { + "advantage_max": 1.1469408124685287, + "advantage_mean": -6.208820124697922e-10, + "advantage_min": -0.7107248418033123, + "advantage_std": 0.6794019639492035, + "completion_length": 2977.8333740234375, + "epoch": 0.07314285714285715, + "grad_norm": 0.12988926470279694, + "kl": 0.0029876232147216797, + "lambda_div_used": 0.6, + "learning_rate": 9.97852329991824e-07, + "loss": 0.056, + "reward": 0.05250055715441704, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05250055715441704, + "reward_after_std": 0.6794019490480423, + "reward_before_mean": 0.49878887087106705, + "reward_before_std": 0.6982266828417778, + "reward_change_max": 0.0, + "reward_change_mean": -0.4462883062660694, + "reward_change_min": -0.7984127178788185, + "reward_change_std": 0.34245668537914753, + "reward_std": 0.6794019564986229, + "rewards/cosine_scaled_reward": 0.04106108099222183, + "rewards/format_reward": 0.41666667722165585, + "step": 64 + }, + { + "advantage_max": 1.1359502971172333, + "advantage_mean": -6.208819014474898e-10, + "advantage_min": -0.4511838797479868, + "advantage_std": 0.5894279684871435, + "completion_length": 2855.6458435058594, + "epoch": 0.07428571428571429, + "grad_norm": 0.09675120562314987, + "kl": 0.0013644695281982422, + "lambda_div_used": 0.6, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0027, + "reward": -0.13648290559649467, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13648290559649467, + "reward_after_std": 0.5894279796630144, + "reward_before_mean": 0.20960132515756413, + "reward_before_std": 0.5116656869649887, + "reward_change_max": 0.000279448926448822, + "reward_change_mean": -0.3460842249915004, + "reward_change_min": -0.5561416335403919, + "reward_change_std": 0.22418450471013784, + "reward_std": 0.5894279852509499, + "rewards/cosine_scaled_reward": -0.09311600960791111, + "rewards/format_reward": 0.39583333395421505, + "step": 65 + }, + { + "advantage_max": 1.126391690224409, + "advantage_mean": -4.3461716447978915e-09, + "advantage_min": -0.4565571919083595, + "advantage_std": 0.5905974991619587, + "completion_length": 2221.833351135254, + "epoch": 0.07542857142857143, + "grad_norm": 0.05661759525537491, + "kl": 0.001573324203491211, + "lambda_div_used": 0.6, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0186, + "reward": 0.15690111927688122, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15690111927688122, + "reward_after_std": 0.5905974879860878, + "reward_before_mean": 0.664361234754324, + "reward_before_std": 0.4507758244872093, + "reward_change_max": 0.0, + "reward_change_mean": -0.5074600903317332, + "reward_change_min": -0.7607763223350048, + "reward_change_std": 0.2888161540031433, + "reward_std": 0.5905974917113781, + "rewards/cosine_scaled_reward": 0.08218058943748474, + "rewards/format_reward": 0.5, + "step": 66 + }, + { + "advantage_max": 0.7976373434066772, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.3243073895573616, + "advantage_std": 0.41893285885453224, + "completion_length": 3426.0416870117188, + "epoch": 0.07657142857142857, + "grad_norm": 0.06953131407499313, + "kl": 0.0017848312854766846, + "lambda_div_used": 0.6, + "learning_rate": 9.968344786479415e-07, + "loss": -0.0187, + "reward": -0.590395949780941, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.590395949780941, + "reward_after_std": 0.41893285512924194, + "reward_before_mean": -0.460202120244503, + "reward_before_std": 0.40765188075602055, + "reward_change_max": 0.001080058515071869, + "reward_change_mean": -0.1301938333781436, + "reward_change_min": -0.26982005313038826, + "reward_change_std": 0.11151221627369523, + "reward_std": 0.41893286257982254, + "rewards/cosine_scaled_reward": -0.3134343959391117, + "rewards/format_reward": 0.1666666716337204, + "step": 67 + }, + { + "advantage_max": 1.418497547507286, + "advantage_mean": 0.0, + "advantage_min": -0.7557137459516525, + "advantage_std": 0.8144563175737858, + "completion_length": 2536.7709045410156, + "epoch": 0.07771428571428571, + "grad_norm": 0.16494296491146088, + "kl": 0.0022640228271484375, + "lambda_div_used": 0.6, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0918, + "reward": -0.015500393696129322, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.015500393696129322, + "reward_after_std": 0.8144563212990761, + "reward_before_mean": 0.36400349996984005, + "reward_before_std": 0.8677889443933964, + "reward_change_max": 0.0013982132077217102, + "reward_change_mean": -0.3795038778334856, + "reward_change_min": -0.7978788800537586, + "reward_change_std": 0.32792997919023037, + "reward_std": 0.8144563250243664, + "rewards/cosine_scaled_reward": -0.07841494982130826, + "rewards/format_reward": 0.5208333414047956, + "step": 68 + }, + { + "advantage_max": 0.9903690777719021, + "advantage_mean": 1.614292521878724e-08, + "advantage_min": -0.49220066517591476, + "advantage_std": 0.5437542498111725, + "completion_length": 2929.7500381469727, + "epoch": 0.07885714285714286, + "grad_norm": 0.09613347053527832, + "kl": 0.0040988922119140625, + "lambda_div_used": 0.6, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0129, + "reward": -0.3549748270306736, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3549748270306736, + "reward_after_std": 0.5437542349100113, + "reward_before_mean": -0.11749381478875875, + "reward_before_std": 0.5448034442961216, + "reward_change_max": 0.0013570860028266907, + "reward_change_mean": -0.23748101433739066, + "reward_change_min": -0.5052936151623726, + "reward_change_std": 0.19911040179431438, + "reward_std": 0.5437542647123337, + "rewards/cosine_scaled_reward": -0.23583024507388473, + "rewards/format_reward": 0.35416667349636555, + "step": 69 + }, + { + "advantage_max": 1.2011119946837425, + "advantage_mean": 9.313226134732844e-09, + "advantage_min": -0.43334708362817764, + "advantage_std": 0.616494245827198, + "completion_length": 3077.083335876465, + "epoch": 0.08, + "grad_norm": 0.08861793577671051, + "kl": 0.00223541259765625, + "lambda_div_used": 0.6, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0137, + "reward": -0.30773347429931164, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.30773347429931164, + "reward_after_std": 0.6164942122995853, + "reward_before_mean": -0.06555812992155552, + "reward_before_std": 0.5589950382709503, + "reward_change_max": 0.0006213560700416565, + "reward_change_mean": -0.24217534251511097, + "reward_change_min": -0.40470268577337265, + "reward_change_std": 0.15899777598679066, + "reward_std": 0.6164942272007465, + "rewards/cosine_scaled_reward": -0.18902906961739063, + "rewards/format_reward": 0.31250000186264515, + "step": 70 + }, + { + "advantage_max": 1.255461797118187, + "advantage_mean": 1.4901161637936866e-08, + "advantage_min": -0.53419079631567, + "advantage_std": 0.6641193814575672, + "completion_length": 2718.2292098999023, + "epoch": 0.08114285714285714, + "grad_norm": 0.12506967782974243, + "kl": 0.0030085816979408264, + "lambda_div_used": 0.6, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0502, + "reward": -0.00627492368221283, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.00627492368221283, + "reward_after_std": 0.6641193926334381, + "reward_before_mean": 0.39759753830730915, + "reward_before_std": 0.5728695411235094, + "reward_change_max": 0.0, + "reward_change_mean": -0.4038724033161998, + "reward_change_min": -0.6584755666553974, + "reward_change_std": 0.2662770180031657, + "reward_std": 0.6641194075345993, + "rewards/cosine_scaled_reward": -0.009534597164019942, + "rewards/format_reward": 0.4166666679084301, + "step": 71 + }, + { + "advantage_max": 1.1922738291323185, + "advantage_mean": 1.3659397890553038e-08, + "advantage_min": -0.5972421020269394, + "advantage_std": 0.6567165236920118, + "completion_length": 3158.3958740234375, + "epoch": 0.08228571428571428, + "grad_norm": 0.0950314849615097, + "kl": 0.006608843803405762, + "lambda_div_used": 0.6, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0336, + "reward": -0.3057239428162575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3057239428162575, + "reward_after_std": 0.6567165162414312, + "reward_before_mean": -0.06156452978029847, + "reward_before_std": 0.6831415146589279, + "reward_change_max": 0.004357524216175079, + "reward_change_mean": -0.2441594167612493, + "reward_change_min": -0.5717665106058121, + "reward_change_std": 0.23254990810528398, + "reward_std": 0.6567165348678827, + "rewards/cosine_scaled_reward": -0.15578227303922176, + "rewards/format_reward": 0.2500000074505806, + "step": 72 + }, + { + "advantage_max": 1.1263316199183464, + "advantage_mean": 1.9247333282734758e-08, + "advantage_min": -0.4545093812048435, + "advantage_std": 0.6076869647949934, + "completion_length": 3519.8541870117188, + "epoch": 0.08342857142857144, + "grad_norm": 0.09696004539728165, + "kl": 0.0008401870727539062, + "lambda_div_used": 0.6, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0412, + "reward": -0.480790832079947, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.480790832079947, + "reward_after_std": 0.6076869685202837, + "reward_before_mean": -0.3260495774447918, + "reward_before_std": 0.6348689384758472, + "reward_change_max": 0.0018310844898223877, + "reward_change_mean": -0.1547412471845746, + "reward_change_min": -0.42637697234749794, + "reward_change_std": 0.17630695179104805, + "reward_std": 0.6076869815587997, + "rewards/cosine_scaled_reward": -0.204691456630826, + "rewards/format_reward": 0.0833333358168602, + "step": 73 + }, + { + "advantage_max": 1.400854118168354, + "advantage_mean": 1.862645193639878e-08, + "advantage_min": -0.6330046206712723, + "advantage_std": 0.7717082761228085, + "completion_length": 3311.854217529297, + "epoch": 0.08457142857142858, + "grad_norm": 0.13873428106307983, + "kl": 0.002074897289276123, + "lambda_div_used": 0.6, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0536, + "reward": -0.07318782806396484, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07318782806396484, + "reward_after_std": 0.7717083059251308, + "reward_before_mean": 0.27929894998669624, + "reward_before_std": 0.7782693542540073, + "reward_change_max": 0.000939980149269104, + "reward_change_mean": -0.35248675756156445, + "reward_change_min": -0.6938531585037708, + "reward_change_std": 0.29394845431670547, + "reward_std": 0.7717083431780338, + "rewards/cosine_scaled_reward": -0.006183858960866928, + "rewards/format_reward": 0.29166666977107525, + "step": 74 + }, + { + "advantage_max": 1.1135927364230156, + "advantage_mean": -2.793967834868738e-09, + "advantage_min": -0.4344211630523205, + "advantage_std": 0.5783823095262051, + "completion_length": 3101.500045776367, + "epoch": 0.08571428571428572, + "grad_norm": 0.29595881700515747, + "kl": 0.01248013973236084, + "lambda_div_used": 0.6, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0289, + "reward": -0.1404849924147129, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1404849924147129, + "reward_after_std": 0.5783823020756245, + "reward_before_mean": 0.20374970324337482, + "reward_before_std": 0.4970117639750242, + "reward_change_max": 0.0015685856342315674, + "reward_change_mean": -0.3442347086966038, + "reward_change_min": -0.5670420341193676, + "reward_change_std": 0.21399930119514465, + "reward_std": 0.5783823281526566, + "rewards/cosine_scaled_reward": -0.08562515117228031, + "rewards/format_reward": 0.37500000558793545, + "step": 75 + }, + { + "advantage_max": 1.2363538295030594, + "advantage_mean": 2.048909714114089e-08, + "advantage_min": -0.5451108776032925, + "advantage_std": 0.6710482239723206, + "completion_length": 2952.979248046875, + "epoch": 0.08685714285714285, + "grad_norm": 0.09811766445636749, + "kl": 0.001201242208480835, + "lambda_div_used": 0.6, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0344, + "reward": -0.23255194473313168, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23255194473313168, + "reward_after_std": 0.6710482239723206, + "reward_before_mean": 0.046751671470701694, + "reward_before_std": 0.6695912722498178, + "reward_change_max": 0.00038267672061920166, + "reward_change_mean": -0.2793036075308919, + "reward_change_min": -0.600840475410223, + "reward_change_std": 0.23344214539974928, + "reward_std": 0.6710482500493526, + "rewards/cosine_scaled_reward": -0.19537417870014906, + "rewards/format_reward": 0.43750000558793545, + "step": 76 + }, + { + "advantage_max": 1.0073203220963478, + "advantage_mean": -3.1044082304809706e-09, + "advantage_min": -0.5661797672510147, + "advantage_std": 0.5656730942428112, + "completion_length": 3245.1666717529297, + "epoch": 0.088, + "grad_norm": 0.08500754833221436, + "kl": 0.0013560652732849121, + "lambda_div_used": 0.6, + "learning_rate": 9.9202926282791e-07, + "loss": -0.0149, + "reward": -0.2388194277882576, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2388194277882576, + "reward_after_std": 0.5656730942428112, + "reward_before_mean": 0.06565772742033005, + "reward_before_std": 0.5782887861132622, + "reward_change_max": 0.00035362690687179565, + "reward_change_mean": -0.30447718035429716, + "reward_change_min": -0.5347645282745361, + "reward_change_std": 0.22534830961376429, + "reward_std": 0.5656730979681015, + "rewards/cosine_scaled_reward": -0.11300447443500161, + "rewards/format_reward": 0.29166666977107525, + "step": 77 + }, + { + "advantage_max": 1.1894551888108253, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -0.7126917093992233, + "advantage_std": 0.6707145310938358, + "completion_length": 3217.8958740234375, + "epoch": 0.08914285714285715, + "grad_norm": 0.0936238095164299, + "kl": 0.0016173124313354492, + "lambda_div_used": 0.6, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0368, + "reward": -0.13307934533804655, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13307934533804655, + "reward_after_std": 0.6707145348191261, + "reward_before_mean": 0.20870637288317084, + "reward_before_std": 0.6934996079653502, + "reward_change_max": 3.166496753692627e-05, + "reward_change_mean": -0.34178573824465275, + "reward_change_min": -0.6631423011422157, + "reward_change_std": 0.2715051304548979, + "reward_std": 0.6707145571708679, + "rewards/cosine_scaled_reward": -0.051896817167289555, + "rewards/format_reward": 0.31250000931322575, + "step": 78 + }, + { + "advantage_max": 1.5434688106179237, + "advantage_mean": -3.1044088966147854e-09, + "advantage_min": -0.6260135676711798, + "advantage_std": 0.8081723116338253, + "completion_length": 2341.9167098999023, + "epoch": 0.09028571428571429, + "grad_norm": 0.14235326647758484, + "kl": 0.018215656280517578, + "lambda_div_used": 0.6, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0089, + "reward": 0.02448425441980362, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02448425441980362, + "reward_after_std": 0.808172345161438, + "reward_before_mean": 0.41002992913126945, + "reward_before_std": 0.7505143824964762, + "reward_change_max": 0.000287078320980072, + "reward_change_mean": -0.385545676574111, + "reward_change_min": -0.648455660790205, + "reward_change_std": 0.25417708698660135, + "reward_std": 0.8081723637878895, + "rewards/cosine_scaled_reward": -0.06581836566329002, + "rewards/format_reward": 0.5416666679084301, + "step": 79 + }, + { + "advantage_max": 1.3301952853798866, + "advantage_mean": -2.793967474046255e-09, + "advantage_min": -0.7484759241342545, + "advantage_std": 0.7691389471292496, + "completion_length": 3285.125, + "epoch": 0.09142857142857143, + "grad_norm": 0.11320678889751434, + "kl": 0.0028287172317504883, + "lambda_div_used": 0.6, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0183, + "reward": -0.1481735883280635, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1481735883280635, + "reward_after_std": 0.7691389322280884, + "reward_before_mean": 0.16942895017564297, + "reward_before_std": 0.8429461754858494, + "reward_change_max": 0.0006430298089981079, + "reward_change_mean": -0.317602532915771, + "reward_change_min": -0.7648013271391392, + "reward_change_std": 0.31389318499714136, + "reward_std": 0.7691389322280884, + "rewards/cosine_scaled_reward": -0.09236887097358704, + "rewards/format_reward": 0.35416667349636555, + "step": 80 + }, + { + "advantage_max": 1.0307377986609936, + "advantage_mean": 1.0554990104161277e-08, + "advantage_min": -0.5289515219628811, + "advantage_std": 0.568504374474287, + "completion_length": 3086.416679382324, + "epoch": 0.09257142857142857, + "grad_norm": 0.11514487117528915, + "kl": 0.0069026947021484375, + "lambda_div_used": 0.6, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0102, + "reward": -0.19615141674876213, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19615141674876213, + "reward_after_std": 0.568504374474287, + "reward_before_mean": 0.12522562127560377, + "reward_before_std": 0.5510979443788528, + "reward_change_max": 0.0006910786032676697, + "reward_change_mean": -0.32137702871114016, + "reward_change_min": -0.6005649529397488, + "reward_change_std": 0.23301844112575054, + "reward_std": 0.568504374474287, + "rewards/cosine_scaled_reward": -0.0936371935531497, + "rewards/format_reward": 0.31250000558793545, + "step": 81 + }, + { + "advantage_max": 1.2566718012094498, + "advantage_mean": 1.0554989271494009e-08, + "advantage_min": -0.5156496614217758, + "advantage_std": 0.6929546110332012, + "completion_length": 2985.7708435058594, + "epoch": 0.09371428571428571, + "grad_norm": 0.09941416233778, + "kl": 0.002816915512084961, + "lambda_div_used": 0.6, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0118, + "reward": -0.052963174879550934, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.052963174879550934, + "reward_after_std": 0.6929546073079109, + "reward_before_mean": 0.32549452036619186, + "reward_before_std": 0.671553336083889, + "reward_change_max": 0.0007572099566459656, + "reward_change_mean": -0.37845765706151724, + "reward_change_min": -0.8261243067681789, + "reward_change_std": 0.30112940445542336, + "reward_std": 0.692954633384943, + "rewards/cosine_scaled_reward": -0.024752754718065262, + "rewards/format_reward": 0.3750000037252903, + "step": 82 + }, + { + "advantage_max": 0.7631809785962105, + "advantage_mean": -4.967054156868045e-09, + "advantage_min": -0.43150240182876587, + "advantage_std": 0.4299459084868431, + "completion_length": 2912.8333435058594, + "epoch": 0.09485714285714286, + "grad_norm": 0.04903676360845566, + "kl": 0.002282381057739258, + "lambda_div_used": 0.6, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0038, + "reward": -0.3834444247186184, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.3834444247186184, + "reward_after_std": 0.4299459159374237, + "reward_before_mean": -0.1346133127808571, + "reward_before_std": 0.4388551339507103, + "reward_change_max": 0.0004472807049751282, + "reward_change_mean": -0.2488311375491321, + "reward_change_min": -0.45463940128684044, + "reward_change_std": 0.18461166438646615, + "reward_std": 0.4299459271132946, + "rewards/cosine_scaled_reward": -0.19230665266513824, + "rewards/format_reward": 0.25, + "step": 83 + }, + { + "advantage_max": 1.642131645232439, + "advantage_mean": -2.1109978320943412e-08, + "advantage_min": -0.870296873152256, + "advantage_std": 0.9153857529163361, + "completion_length": 3176.7291717529297, + "epoch": 0.096, + "grad_norm": 0.142425537109375, + "kl": 0.0012586116790771484, + "lambda_div_used": 0.6, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0224, + "reward": 0.1766932848840952, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1766932848840952, + "reward_after_std": 0.9153857212513685, + "reward_before_mean": 0.6397379375994205, + "reward_before_std": 0.9262058734893799, + "reward_change_max": 0.0005095526576042175, + "reward_change_mean": -0.463044673204422, + "reward_change_min": -0.879260279238224, + "reward_change_std": 0.361559247598052, + "reward_std": 0.9153857212513685, + "rewards/cosine_scaled_reward": 0.11153563484549522, + "rewards/format_reward": 0.41666667722165585, + "step": 84 + }, + { + "advantage_max": 2.0070881322026253, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.9783229231834412, + "advantage_std": 1.1386385187506676, + "completion_length": 3160.2500762939453, + "epoch": 0.09714285714285714, + "grad_norm": 0.1873876303434372, + "kl": 0.0019626617431640625, + "lambda_div_used": 0.6, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0419, + "reward": 0.12494198745116591, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12494198745116591, + "reward_after_std": 1.1386385038495064, + "reward_before_mean": 0.5180581012973562, + "reward_before_std": 1.2345999665558338, + "reward_change_max": 0.0020033270120620728, + "reward_change_mean": -0.39311612769961357, + "reward_change_min": -1.0354369431734085, + "reward_change_std": 0.4217766746878624, + "reward_std": 1.1386385075747967, + "rewards/cosine_scaled_reward": 0.009029048029333353, + "rewards/format_reward": 0.5000000074505806, + "step": 85 + }, + { + "advantage_max": 1.3539377562701702, + "advantage_mean": 5.587935669737476e-09, + "advantage_min": -0.703910693526268, + "advantage_std": 0.7683831937611103, + "completion_length": 3129.7083435058594, + "epoch": 0.09828571428571428, + "grad_norm": 0.12810629606246948, + "kl": 0.002894878387451172, + "lambda_div_used": 0.6, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0355, + "reward": -0.15509214252233505, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15509214252233505, + "reward_after_std": 0.7683831639587879, + "reward_before_mean": 0.1555484477430582, + "reward_before_std": 0.823885153979063, + "reward_change_max": 0.0, + "reward_change_mean": -0.310640599578619, + "reward_change_min": -0.7123654522001743, + "reward_change_std": 0.29474045149981976, + "reward_std": 0.7683831863105297, + "rewards/cosine_scaled_reward": -0.08889244613237679, + "rewards/format_reward": 0.3333333395421505, + "step": 86 + }, + { + "advantage_max": 1.1467025578022003, + "advantage_mean": 5.587935725248627e-09, + "advantage_min": -0.7277822308242321, + "advantage_std": 0.6753969453275204, + "completion_length": 2852.041717529297, + "epoch": 0.09942857142857142, + "grad_norm": 0.1169510930776596, + "kl": 0.0055196285247802734, + "lambda_div_used": 0.6, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0398, + "reward": -0.0842017037793994, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0842017037793994, + "reward_after_std": 0.6753969304263592, + "reward_before_mean": 0.2875655069947243, + "reward_before_std": 0.7284983061254025, + "reward_change_max": 0.000680871307849884, + "reward_change_mean": -0.3717672023922205, + "reward_change_min": -0.7417283318936825, + "reward_change_std": 0.3114555370993912, + "reward_std": 0.6753969639539719, + "rewards/cosine_scaled_reward": -0.08538391441106796, + "rewards/format_reward": 0.4583333469927311, + "step": 87 + }, + { + "advantage_max": 1.8067155107855797, + "advantage_mean": -1.2417635808503746e-09, + "advantage_min": -0.9640210494399071, + "advantage_std": 0.9902556464076042, + "completion_length": 2936.8750610351562, + "epoch": 0.10057142857142858, + "grad_norm": 0.2037220597267151, + "kl": 0.010509967803955078, + "lambda_div_used": 0.6, + "learning_rate": 9.8425742251254e-07, + "loss": 0.1151, + "reward": 0.12938768789172173, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12938768789172173, + "reward_after_std": 0.9902556501328945, + "reward_before_mean": 0.5470981888938695, + "reward_before_std": 1.0180102176964283, + "reward_change_max": 0.0006925985217094421, + "reward_change_mean": -0.417710492387414, + "reward_change_min": -0.8149654679000378, + "reward_change_std": 0.34481045603752136, + "reward_std": 0.9902556911110878, + "rewards/cosine_scaled_reward": 0.054799098521471024, + "rewards/format_reward": 0.4375000149011612, + "step": 88 + }, + { + "advantage_max": 1.1965279504656792, + "advantage_mean": 6.829699361610153e-09, + "advantage_min": -0.6220619156956673, + "advantage_std": 0.6542908251285553, + "completion_length": 3258.0416870117188, + "epoch": 0.10171428571428572, + "grad_norm": 0.12059728056192398, + "kl": 0.004296541213989258, + "lambda_div_used": 0.6, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0663, + "reward": -0.30891418643295765, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.30891418643295765, + "reward_after_std": 0.654290821403265, + "reward_before_mean": -0.06621248135343194, + "reward_before_std": 0.6760115548968315, + "reward_change_max": 0.0009444355964660645, + "reward_change_mean": -0.24270169623196125, + "reward_change_min": -0.48976127430796623, + "reward_change_std": 0.20589512679725885, + "reward_std": 0.6542908400297165, + "rewards/cosine_scaled_reward": -0.12685624696314335, + "rewards/format_reward": 0.1875000074505806, + "step": 89 + }, + { + "advantage_max": 1.4118667095899582, + "advantage_mean": 3.1044083970144243e-09, + "advantage_min": -0.5967099294066429, + "advantage_std": 0.7452210150659084, + "completion_length": 2658.6250610351562, + "epoch": 0.10285714285714286, + "grad_norm": 0.12070947885513306, + "kl": 0.009579658508300781, + "lambda_div_used": 0.6, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0573, + "reward": -0.15686163678765297, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15686163678765297, + "reward_after_std": 0.7452210113406181, + "reward_before_mean": 0.14455214142799377, + "reward_before_std": 0.7128091156482697, + "reward_change_max": 0.0, + "reward_change_mean": -0.3014137726277113, + "reward_change_min": -0.6083030439913273, + "reward_change_std": 0.23668606020510197, + "reward_std": 0.7452210150659084, + "rewards/cosine_scaled_reward": -0.15689060185104609, + "rewards/format_reward": 0.4583333469927311, + "step": 90 + }, + { + "advantage_max": 0.9449239484965801, + "advantage_mean": 6.208819014474898e-10, + "advantage_min": -0.6252702437341213, + "advantage_std": 0.5788875985890627, + "completion_length": 3106.6458587646484, + "epoch": 0.104, + "grad_norm": 0.10415603965520859, + "kl": 0.004826545715332031, + "lambda_div_used": 0.6, + "learning_rate": 9.816912885430258e-07, + "loss": 0.02, + "reward": -0.166163869202137, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.166163869202137, + "reward_after_std": 0.5788875948637724, + "reward_before_mean": 0.18202120624482632, + "reward_before_std": 0.6361364722251892, + "reward_change_max": 0.001609407365322113, + "reward_change_mean": -0.3481850866228342, + "reward_change_min": -0.6712036058306694, + "reward_change_std": 0.2946196533739567, + "reward_std": 0.5788875967264175, + "rewards/cosine_scaled_reward": -0.06523939780890942, + "rewards/format_reward": 0.31250000558793545, + "step": 91 + }, + { + "advantage_max": 1.4620023369789124, + "advantage_mean": 5.551115123125783e-16, + "advantage_min": -0.7397854886949062, + "advantage_std": 0.8217831328511238, + "completion_length": 2918.000015258789, + "epoch": 0.10514285714285715, + "grad_norm": 0.14798514544963837, + "kl": 0.011157512664794922, + "lambda_div_used": 0.6, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0368, + "reward": -0.06641942448914051, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06641942448914051, + "reward_after_std": 0.8217831626534462, + "reward_before_mean": 0.28103411523625255, + "reward_before_std": 0.8633438013494015, + "reward_change_max": 0.0023638010025024414, + "reward_change_mean": -0.3474535522982478, + "reward_change_min": -0.7106242850422859, + "reward_change_std": 0.2981195440515876, + "reward_std": 0.8217831663787365, + "rewards/cosine_scaled_reward": -0.06781627610325813, + "rewards/format_reward": 0.4166666753590107, + "step": 92 + }, + { + "advantage_max": 0.7241514772176743, + "advantage_mean": 2.110997915361068e-08, + "advantage_min": -0.3528713136911392, + "advantage_std": 0.4079975299537182, + "completion_length": 3430.187530517578, + "epoch": 0.10628571428571429, + "grad_norm": 0.08644148707389832, + "kl": 0.0048465728759765625, + "lambda_div_used": 0.6, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0291, + "reward": -0.5372234713286161, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5372234713286161, + "reward_after_std": 0.4079975299537182, + "reward_before_mean": -0.3706090720370412, + "reward_before_std": 0.4305304940789938, + "reward_change_max": 0.0016954094171524048, + "reward_change_mean": -0.16661439649760723, + "reward_change_min": -0.40895063802599907, + "reward_change_std": 0.15682219434529543, + "reward_std": 0.4079975336790085, + "rewards/cosine_scaled_reward": -0.23738786298781633, + "rewards/format_reward": 0.10416666977107525, + "step": 93 + }, + { + "advantage_max": 0.8936146646738052, + "advantage_mean": 1.80055704790405e-08, + "advantage_min": -0.3542574942111969, + "advantage_std": 0.47137969732284546, + "completion_length": 3167.958335876465, + "epoch": 0.10742857142857143, + "grad_norm": 0.07212722301483154, + "kl": 0.009177207946777344, + "lambda_div_used": 0.6, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0105, + "reward": -0.2902873866260052, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2902873866260052, + "reward_after_std": 0.47137970849871635, + "reward_before_mean": -0.003107912838459015, + "reward_before_std": 0.4016598341986537, + "reward_change_max": 0.0, + "reward_change_mean": -0.28717945888638496, + "reward_change_min": -0.49013088271021843, + "reward_change_std": 0.18906648084521294, + "reward_std": 0.47137971967458725, + "rewards/cosine_scaled_reward": -0.0953039638698101, + "rewards/format_reward": 0.18750000186264515, + "step": 94 + }, + { + "advantage_max": 1.1060735136270523, + "advantage_mean": 1.2417635031347629e-08, + "advantage_min": -0.5495634824037552, + "advantage_std": 0.6076685786247253, + "completion_length": 3469.125030517578, + "epoch": 0.10857142857142857, + "grad_norm": 0.09135206788778305, + "kl": 0.0022430419921875, + "lambda_div_used": 0.6, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0086, + "reward": -0.40138646960258484, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.40138646960258484, + "reward_after_std": 0.607668599113822, + "reward_before_mean": -0.20004860311746597, + "reward_before_std": 0.6348323151469231, + "reward_change_max": 0.0011594444513320923, + "reward_change_mean": -0.20133786648511887, + "reward_change_min": -0.4667600132524967, + "reward_change_std": 0.20126637630164623, + "reward_std": 0.6076686009764671, + "rewards/cosine_scaled_reward": -0.19377430342137814, + "rewards/format_reward": 0.1875000074505806, + "step": 95 + }, + { + "advantage_max": 1.2968962267041206, + "advantage_mean": 3.1044084525255755e-09, + "advantage_min": -0.5794485211372375, + "advantage_std": 0.7141258921474218, + "completion_length": 2919.187515258789, + "epoch": 0.10971428571428571, + "grad_norm": 0.13126961886882782, + "kl": 0.007322788238525391, + "lambda_div_used": 0.6, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0576, + "reward": -0.05010135751217604, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05010135751217604, + "reward_after_std": 0.7141258884221315, + "reward_before_mean": 0.32372894510626793, + "reward_before_std": 0.6927786152809858, + "reward_change_max": 0.0015667006373405457, + "reward_change_mean": -0.37383024860173464, + "reward_change_min": -0.7615102715790272, + "reward_change_std": 0.30468545015901327, + "reward_std": 0.714125907048583, + "rewards/cosine_scaled_reward": -0.015218888409435749, + "rewards/format_reward": 0.3541666679084301, + "step": 96 + }, + { + "advantage_max": 1.2440862655639648, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.6283608749508858, + "advantage_std": 0.6912339441478252, + "completion_length": 3318.3959045410156, + "epoch": 0.11085714285714286, + "grad_norm": 0.13034504652023315, + "kl": 0.003994464874267578, + "lambda_div_used": 0.6, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0228, + "reward": -0.2075312975794077, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2075312975794077, + "reward_after_std": 0.6912339385598898, + "reward_before_mean": 0.08721866272389889, + "reward_before_std": 0.7230119798332453, + "reward_change_max": 0.000818297266960144, + "reward_change_mean": -0.29474993934854865, + "reward_change_min": -0.6336596198379993, + "reward_change_std": 0.2503976479638368, + "reward_std": 0.691233953461051, + "rewards/cosine_scaled_reward": -0.09180734679102898, + "rewards/format_reward": 0.2708333432674408, + "step": 97 + }, + { + "advantage_max": 1.195045854896307, + "advantage_mean": 9.31322596819939e-09, + "advantage_min": -0.535729430615902, + "advantage_std": 0.6331109032034874, + "completion_length": 3111.5208740234375, + "epoch": 0.112, + "grad_norm": 0.1344451755285263, + "kl": 0.0039520263671875, + "lambda_div_used": 0.6, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0187, + "reward": -0.11401326581835747, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11401326581835747, + "reward_after_std": 0.6331108994781971, + "reward_before_mean": 0.23613072000443935, + "reward_before_std": 0.5695687290281057, + "reward_change_max": 0.0017473921179771423, + "reward_change_mean": -0.35014398489147425, + "reward_change_min": -0.6171726249158382, + "reward_change_std": 0.24364902451634407, + "reward_std": 0.6331109274178743, + "rewards/cosine_scaled_reward": -0.0694346446543932, + "rewards/format_reward": 0.3750000111758709, + "step": 98 + }, + { + "advantage_max": 1.1984662860631943, + "advantage_mean": -2.048909669705168e-08, + "advantage_min": -0.555856853723526, + "advantage_std": 0.6488207280635834, + "completion_length": 2897.708366394043, + "epoch": 0.11314285714285714, + "grad_norm": 0.11851833760738373, + "kl": 0.004616737365722656, + "lambda_div_used": 0.6, + "learning_rate": 9.739258537542835e-07, + "loss": 0.014, + "reward": -0.15122101828455925, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15122101828455925, + "reward_after_std": 0.6488207206130028, + "reward_before_mean": 0.17770473286509514, + "reward_before_std": 0.6168424002826214, + "reward_change_max": 0.0008178576827049255, + "reward_change_mean": -0.3289257613942027, + "reward_change_min": -0.6208729110658169, + "reward_change_std": 0.25092875584959984, + "reward_std": 0.6488207466900349, + "rewards/cosine_scaled_reward": -0.06739764660596848, + "rewards/format_reward": 0.31250000558793545, + "step": 99 + }, + { + "advantage_max": 1.8971523717045784, + "advantage_mean": 1.614292521878724e-08, + "advantage_min": -0.8356616273522377, + "advantage_std": 1.0355449803173542, + "completion_length": 2997.666717529297, + "epoch": 0.11428571428571428, + "grad_norm": 0.15601147711277008, + "kl": 0.0081787109375, + "lambda_div_used": 0.6, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0539, + "reward": 0.05242241080850363, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05242241080850363, + "reward_after_std": 1.035545002669096, + "reward_before_mean": 0.417850723490119, + "reward_before_std": 1.0788166895508766, + "reward_change_max": 0.0008053779602050781, + "reward_change_mean": -0.36542829871177673, + "reward_change_min": -0.8287406712770462, + "reward_change_std": 0.33487732522189617, + "reward_std": 1.0355450212955475, + "rewards/cosine_scaled_reward": 0.011008680099621415, + "rewards/format_reward": 0.3958333432674408, + "step": 100 + }, + { + "advantage_max": 0.9307895302772522, + "advantage_mean": 6.208816238917336e-10, + "advantage_min": -0.3656721208244562, + "advantage_std": 0.4907538667321205, + "completion_length": 2841.375015258789, + "epoch": 0.11542857142857142, + "grad_norm": 0.12105463445186615, + "kl": 0.004067897796630859, + "lambda_div_used": 0.6, + "learning_rate": 9.717768952713511e-07, + "loss": -0.012, + "reward": -0.17625251971185207, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17625251971185207, + "reward_after_std": 0.4907538667321205, + "reward_before_mean": 0.16911687143146992, + "reward_before_std": 0.4058742057532072, + "reward_change_max": 0.0002156868577003479, + "reward_change_mean": -0.3453693939372897, + "reward_change_min": -0.6092492565512657, + "reward_change_std": 0.21868636459112167, + "reward_std": 0.4907538704574108, + "rewards/cosine_scaled_reward": -0.11335823312401772, + "rewards/format_reward": 0.3958333395421505, + "step": 101 + }, + { + "advantage_max": 1.196364901959896, + "advantage_mean": 1.8626449826975033e-09, + "advantage_min": -0.5503832846879959, + "advantage_std": 0.6417879574000835, + "completion_length": 2956.0209045410156, + "epoch": 0.11657142857142858, + "grad_norm": 0.12102524191141129, + "kl": 0.0120391845703125, + "lambda_div_used": 0.6, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0616, + "reward": -0.026549585163593292, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.026549585163593292, + "reward_after_std": 0.6417879462242126, + "reward_before_mean": 0.3726413017138839, + "reward_before_std": 0.5742765087634325, + "reward_change_max": 0.0007911324501037598, + "reward_change_mean": -0.3991908668540418, + "reward_change_min": -0.7056144215166569, + "reward_change_std": 0.27402982767671347, + "reward_std": 0.6417879611253738, + "rewards/cosine_scaled_reward": -0.03242936171591282, + "rewards/format_reward": 0.43750000931322575, + "step": 102 + }, + { + "advantage_max": 1.367197409272194, + "advantage_mean": 2.483527050678447e-09, + "advantage_min": -0.724069282412529, + "advantage_std": 0.7786473240703344, + "completion_length": 2909.666717529297, + "epoch": 0.11771428571428572, + "grad_norm": 0.1296321302652359, + "kl": 0.010121822357177734, + "lambda_div_used": 0.6, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0546, + "reward": -0.09430141933262348, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09430141933262348, + "reward_after_std": 0.7786473240703344, + "reward_before_mean": 0.24636715522501618, + "reward_before_std": 0.8315681144595146, + "reward_change_max": 0.00037298351526260376, + "reward_change_mean": -0.34066856699064374, + "reward_change_min": -0.7915425300598145, + "reward_change_std": 0.30810489458963275, + "reward_std": 0.7786473408341408, + "rewards/cosine_scaled_reward": -0.10598308267071843, + "rewards/format_reward": 0.4583333469927311, + "step": 103 + }, + { + "advantage_max": 1.1343006566166878, + "advantage_mean": 1.4280279514444771e-08, + "advantage_min": -0.628552533686161, + "advantage_std": 0.6272084675729275, + "completion_length": 2978.1666870117188, + "epoch": 0.11885714285714286, + "grad_norm": 0.09671797603368759, + "kl": 0.005688667297363281, + "lambda_div_used": 0.6, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0212, + "reward": -0.17992812395095825, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17992812395095825, + "reward_after_std": 0.6272084899246693, + "reward_before_mean": 0.14061287976801395, + "reward_before_std": 0.6303292363882065, + "reward_change_max": 0.0005879104137420654, + "reward_change_mean": -0.3205409971997142, + "reward_change_min": -0.5868665352463722, + "reward_change_std": 0.24384531751275063, + "reward_std": 0.6272085160017014, + "rewards/cosine_scaled_reward": -0.1171935647726059, + "rewards/format_reward": 0.3750000111758709, + "step": 104 + }, + { + "advantage_max": 1.7707880921661854, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7792215496301651, + "advantage_std": 0.9740794487297535, + "completion_length": 3094.437545776367, + "epoch": 0.12, + "grad_norm": 0.14219453930854797, + "kl": 0.005229949951171875, + "lambda_div_used": 0.6, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0383, + "reward": -0.00444754958152771, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.00444754958152771, + "reward_after_std": 0.9740794561803341, + "reward_before_mean": 0.34283736534416676, + "reward_before_std": 1.0192876607179642, + "reward_change_max": 0.0017241165041923523, + "reward_change_mean": -0.3472848879173398, + "reward_change_min": -0.8471212461590767, + "reward_change_std": 0.3292017253115773, + "reward_std": 0.9740794897079468, + "rewards/cosine_scaled_reward": -0.0160813401453197, + "rewards/format_reward": 0.3750000074505806, + "step": 105 + }, + { + "advantage_max": 1.506701335310936, + "advantage_mean": -9.934107536579972e-09, + "advantage_min": -0.7705876715481281, + "advantage_std": 0.8312099389731884, + "completion_length": 2482.770927429199, + "epoch": 0.12114285714285715, + "grad_norm": 0.11971443891525269, + "kl": 0.0063114166259765625, + "lambda_div_used": 0.6, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0545, + "reward": 0.44777741096913815, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.44777741096913815, + "reward_after_std": 0.8312099389731884, + "reward_before_mean": 1.0764255952090025, + "reward_before_std": 0.735991109162569, + "reward_change_max": 0.0008112713694572449, + "reward_change_mean": -0.6286481656134129, + "reward_change_min": -1.0969063863158226, + "reward_change_std": 0.4263636786490679, + "reward_std": 0.8312099538743496, + "rewards/cosine_scaled_reward": 0.20487945154309273, + "rewards/format_reward": 0.666666679084301, + "step": 106 + }, + { + "advantage_max": 1.063334859907627, + "advantage_mean": 2.3593505593666464e-08, + "advantage_min": -0.5757645852863789, + "advantage_std": 0.6022264622151852, + "completion_length": 2857.020866394043, + "epoch": 0.12228571428571429, + "grad_norm": 0.10165040194988251, + "kl": 0.0062084197998046875, + "lambda_div_used": 0.6, + "learning_rate": 9.648384182148252e-07, + "loss": 0.052, + "reward": -0.16400642041116953, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.16400642041116953, + "reward_after_std": 0.6022264696657658, + "reward_before_mean": 0.17313838889822364, + "reward_before_std": 0.6170230749994516, + "reward_change_max": 0.00020002573728561401, + "reward_change_mean": -0.3371447781100869, + "reward_change_min": -0.6786707863211632, + "reward_change_std": 0.2606902029365301, + "reward_std": 0.6022264733910561, + "rewards/cosine_scaled_reward": -0.12176414579153061, + "rewards/format_reward": 0.41666667722165585, + "step": 107 + }, + { + "advantage_max": 1.5820023342967033, + "advantage_mean": 9.313225857177088e-09, + "advantage_min": -0.7522710785269737, + "advantage_std": 0.8811581470072269, + "completion_length": 2883.0626068115234, + "epoch": 0.12342857142857143, + "grad_norm": 1.1351174116134644, + "kl": 0.04792070388793945, + "lambda_div_used": 0.6, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0616, + "reward": -0.11980252992361784, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11980252992361784, + "reward_after_std": 0.8811581172049046, + "reward_before_mean": 0.18240957101806998, + "reward_before_std": 0.9268815889954567, + "reward_change_max": 0.002282187342643738, + "reward_change_mean": -0.30221210699528456, + "reward_change_min": -0.6947237513959408, + "reward_change_std": 0.30414726212620735, + "reward_std": 0.8811581209301949, + "rewards/cosine_scaled_reward": -0.11712856311351061, + "rewards/format_reward": 0.416666679084301, + "step": 108 + }, + { + "advantage_max": 0.8597342558205128, + "advantage_mean": 1.738468857759301e-08, + "advantage_min": -0.48682762682437897, + "advantage_std": 0.480662290006876, + "completion_length": 3087.5, + "epoch": 0.12457142857142857, + "grad_norm": 0.06071054935455322, + "kl": 0.004103660583496094, + "lambda_div_used": 0.6, + "learning_rate": 9.623632283030077e-07, + "loss": -0.0119, + "reward": -0.3114382822532207, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3114382822532207, + "reward_after_std": 0.4806622937321663, + "reward_before_mean": -0.03207859140820801, + "reward_before_std": 0.48070384934544563, + "reward_change_max": 0.0013087615370750427, + "reward_change_mean": -0.2793596927076578, + "reward_change_min": -0.5258324146270752, + "reward_change_std": 0.2068811203353107, + "reward_std": 0.4806623198091984, + "rewards/cosine_scaled_reward": -0.1722893062978983, + "rewards/format_reward": 0.3125, + "step": 109 + }, + { + "advantage_max": 1.2370356619358063, + "advantage_mean": -6.829699084054397e-09, + "advantage_min": -0.5914059355854988, + "advantage_std": 0.666951946914196, + "completion_length": 2959.312530517578, + "epoch": 0.12571428571428572, + "grad_norm": 0.10299086570739746, + "kl": 0.006114959716796875, + "lambda_div_used": 0.6, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0401, + "reward": -0.28863461688160896, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.28863461688160896, + "reward_after_std": 0.6669519459828734, + "reward_before_mean": -0.04152660258114338, + "reward_before_std": 0.6713255615904927, + "reward_change_max": 0.0018312260508537292, + "reward_change_mean": -0.2471080287359655, + "reward_change_min": -0.517824113368988, + "reward_change_std": 0.21439127274788916, + "reward_std": 0.6669519720599055, + "rewards/cosine_scaled_reward": -0.19784663617610931, + "rewards/format_reward": 0.35416667349636555, + "step": 110 + }, + { + "advantage_max": 1.3072906248271465, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.6446176692843437, + "advantage_std": 0.7294343803077936, + "completion_length": 3483.8958435058594, + "epoch": 0.12685714285714286, + "grad_norm": 0.13278532028198242, + "kl": 0.008441925048828125, + "lambda_div_used": 0.6, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0239, + "reward": -0.2520891670137644, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2520891670137644, + "reward_after_std": 0.7294343784451485, + "reward_before_mean": 0.010047844611108303, + "reward_before_std": 0.7737896014004946, + "reward_change_max": 0.0, + "reward_change_mean": -0.26213700883090496, + "reward_change_min": -0.641155794262886, + "reward_change_std": 0.25784933008253574, + "reward_std": 0.7294344156980515, + "rewards/cosine_scaled_reward": -0.0678927511908114, + "rewards/format_reward": 0.1458333358168602, + "step": 111 + }, + { + "advantage_max": 1.1253357827663422, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6119440719485283, + "advantage_std": 0.6345476061105728, + "completion_length": 3364.375030517578, + "epoch": 0.128, + "grad_norm": 0.10261553525924683, + "kl": 0.0039768218994140625, + "lambda_div_used": 0.6, + "learning_rate": 9.58499865339809e-07, + "loss": -0.0321, + "reward": -0.06111699063330889, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06111699063330889, + "reward_after_std": 0.6345475874841213, + "reward_before_mean": 0.32833828032016754, + "reward_before_std": 0.6291491501033306, + "reward_change_max": 0.0010995790362358093, + "reward_change_mean": -0.38945526140742004, + "reward_change_min": -0.7214230000972748, + "reward_change_std": 0.2952321572229266, + "reward_std": 0.6345476321876049, + "rewards/cosine_scaled_reward": 0.007919133640825748, + "rewards/format_reward": 0.3125, + "step": 112 + }, + { + "advantage_max": 1.6033024415373802, + "advantage_mean": -2.483527050678447e-09, + "advantage_min": -0.8581812679767609, + "advantage_std": 0.9165709689259529, + "completion_length": 2931.104202270508, + "epoch": 0.12914285714285714, + "grad_norm": 0.17789599299430847, + "kl": 0.009304046630859375, + "lambda_div_used": 0.6, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0576, + "reward": -0.006853158585727215, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.006853158585727215, + "reward_after_std": 0.9165709540247917, + "reward_before_mean": 0.35799872130155563, + "reward_before_std": 0.9939543418586254, + "reward_change_max": 0.0007301792502403259, + "reward_change_mean": -0.3648518770933151, + "reward_change_min": -0.9379028156399727, + "reward_change_std": 0.3610955514013767, + "reward_std": 0.9165709912776947, + "rewards/cosine_scaled_reward": -0.05016732541844249, + "rewards/format_reward": 0.4583333358168602, + "step": 113 + }, + { + "advantage_max": 1.2760983109474182, + "advantage_mean": -1.6653345369377348e-16, + "advantage_min": -0.6161659248173237, + "advantage_std": 0.6759795695543289, + "completion_length": 2618.208339691162, + "epoch": 0.13028571428571428, + "grad_norm": 0.09680048376321793, + "kl": 0.00629425048828125, + "lambda_div_used": 0.6, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0299, + "reward": -0.09806261584162712, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09806261584162712, + "reward_after_std": 0.6759795621037483, + "reward_before_mean": 0.2520258827134967, + "reward_before_std": 0.6346731521189213, + "reward_change_max": 6.187707185745239e-05, + "reward_change_mean": -0.35008850507438183, + "reward_change_min": -0.6094051860272884, + "reward_change_std": 0.23877457296475768, + "reward_std": 0.6759795732796192, + "rewards/cosine_scaled_reward": -0.165653734235093, + "rewards/format_reward": 0.5833333469927311, + "step": 114 + }, + { + "advantage_max": 1.533675353974104, + "advantage_mean": 6.208817404651512e-09, + "advantage_min": -0.6436694450676441, + "advantage_std": 0.799139654263854, + "completion_length": 2905.7500228881836, + "epoch": 0.13142857142857142, + "grad_norm": 0.107809878885746, + "kl": 0.005802154541015625, + "lambda_div_used": 0.6, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0209, + "reward": -0.18184374831616879, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18184374831616879, + "reward_after_std": 0.799139641225338, + "reward_before_mean": 0.0933333026478067, + "reward_before_std": 0.7632724195718765, + "reward_change_max": 0.00022216886281967163, + "reward_change_mean": -0.2751770419999957, + "reward_change_min": -0.5647606626152992, + "reward_change_std": 0.21543886233121157, + "reward_std": 0.7991396598517895, + "rewards/cosine_scaled_reward": -0.14083335734903812, + "rewards/format_reward": 0.3750000074505806, + "step": 115 + }, + { + "advantage_max": 1.0948162451386452, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -0.5002664364874363, + "advantage_std": 0.5961834639310837, + "completion_length": 3541.0416870117188, + "epoch": 0.13257142857142856, + "grad_norm": 0.12247907370328903, + "kl": 0.005596160888671875, + "lambda_div_used": 0.6, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0121, + "reward": -0.3858891613781452, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3858891613781452, + "reward_after_std": 0.596183467656374, + "reward_before_mean": -0.17424449883401394, + "reward_before_std": 0.6120402682572603, + "reward_change_max": 0.0007318109273910522, + "reward_change_mean": -0.21164466626942158, + "reward_change_min": -0.47822466120123863, + "reward_change_std": 0.19232912454754114, + "reward_std": 0.596183467656374, + "rewards/cosine_scaled_reward": -0.12878892198204994, + "rewards/format_reward": 0.0833333358168602, + "step": 116 + }, + { + "advantage_max": 0.9450958669185638, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.503460593521595, + "advantage_std": 0.5200935825705528, + "completion_length": 3124.5416870117188, + "epoch": 0.1337142857142857, + "grad_norm": 0.08877340704202652, + "kl": 0.0074920654296875, + "lambda_div_used": 0.6, + "learning_rate": 9.516636183034564e-07, + "loss": 0.023, + "reward": -0.38494894467294216, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.38494894467294216, + "reward_after_std": 0.5200935900211334, + "reward_before_mean": -0.15960646513849497, + "reward_before_std": 0.5255400091409683, + "reward_change_max": 0.002323649823665619, + "reward_change_mean": -0.22534247650764883, + "reward_change_min": -0.46065803617239, + "reward_change_std": 0.18960520531982183, + "reward_std": 0.5200935937464237, + "rewards/cosine_scaled_reward": -0.20480323676019907, + "rewards/format_reward": 0.25, + "step": 117 + }, + { + "advantage_max": 1.7024286799132824, + "advantage_mean": -1.5522042984272844e-08, + "advantage_min": -0.9134473949670792, + "advantage_std": 0.9439865052700043, + "completion_length": 3146.104248046875, + "epoch": 0.13485714285714287, + "grad_norm": 0.15422917902469635, + "kl": 0.0044574737548828125, + "lambda_div_used": 0.6, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0438, + "reward": 0.2010413184762001, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2010413184762001, + "reward_after_std": 0.9439864829182625, + "reward_before_mean": 0.6700890860520303, + "reward_before_std": 0.9486183300614357, + "reward_change_max": 0.0010381042957305908, + "reward_change_mean": -0.4690477307885885, + "reward_change_min": -0.9044231325387955, + "reward_change_std": 0.3665061164647341, + "reward_std": 0.9439865499734879, + "rewards/cosine_scaled_reward": 0.12671118369325995, + "rewards/format_reward": 0.4166666753590107, + "step": 118 + }, + { + "advantage_max": 1.1857410296797752, + "advantage_mean": 6.208816238917336e-10, + "advantage_min": -0.5179601944983006, + "advantage_std": 0.6348244119435549, + "completion_length": 2643.3125076293945, + "epoch": 0.136, + "grad_norm": 0.4639641046524048, + "kl": 0.12084007263183594, + "lambda_div_used": 0.6, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0218, + "reward": -0.005526546388864517, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.005526546388864517, + "reward_after_std": 0.6348244119435549, + "reward_before_mean": 0.4011272483621724, + "reward_before_std": 0.563131982460618, + "reward_change_max": 0.000466369092464447, + "reward_change_mean": -0.40665382659062743, + "reward_change_min": -0.7384549304842949, + "reward_change_std": 0.2714037476107478, + "reward_std": 0.6348244175314903, + "rewards/cosine_scaled_reward": -0.08068636374082416, + "rewards/format_reward": 0.562500013038516, + "step": 119 + }, + { + "advantage_max": 1.5117480978369713, + "advantage_mean": 1.1796752574788627e-08, + "advantage_min": -0.5524156130850315, + "advantage_std": 0.7942009922116995, + "completion_length": 2476.3541831970215, + "epoch": 0.13714285714285715, + "grad_norm": 0.13974298536777496, + "kl": 0.0076751708984375, + "lambda_div_used": 0.6, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0168, + "reward": 0.022801415994763374, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.022801415994763374, + "reward_after_std": 0.7942009922116995, + "reward_before_mean": 0.41292600706219673, + "reward_before_std": 0.7128852717578411, + "reward_change_max": 0.0, + "reward_change_mean": -0.3901246301829815, + "reward_change_min": -0.759309895336628, + "reward_change_std": 0.2864347733557224, + "reward_std": 0.7942009996622801, + "rewards/cosine_scaled_reward": -0.043536994606256485, + "rewards/format_reward": 0.5000000018626451, + "step": 120 + }, + { + "advantage_max": 1.3017460890114307, + "advantage_mean": -3.4148494976182775e-08, + "advantage_min": -0.7192301824688911, + "advantage_std": 0.7437176704406738, + "completion_length": 2102.958366394043, + "epoch": 0.1382857142857143, + "grad_norm": 0.16582335531711578, + "kl": 0.013330459594726562, + "lambda_div_used": 0.6, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0745, + "reward": 0.16585742961615324, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16585742961615324, + "reward_after_std": 0.7437176592648029, + "reward_before_mean": 0.6575608756393194, + "reward_before_std": 0.7263416051864624, + "reward_change_max": 0.0008957311511039734, + "reward_change_mean": -0.4917034823447466, + "reward_change_min": -0.8339627794921398, + "reward_change_std": 0.3542322674766183, + "reward_std": 0.7437177151441574, + "rewards/cosine_scaled_reward": -0.004552898928523064, + "rewards/format_reward": 0.6666666772216558, + "step": 121 + }, + { + "advantage_max": 1.445691742002964, + "advantage_mean": -1.3038516155639002e-08, + "advantage_min": -0.8504197970032692, + "advantage_std": 0.8637564219534397, + "completion_length": 2935.18758392334, + "epoch": 0.13942857142857143, + "grad_norm": 0.19797678291797638, + "kl": 0.0050258636474609375, + "lambda_div_used": 0.6, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0503, + "reward": 0.11646575294435024, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.11646575294435024, + "reward_after_std": 0.8637563772499561, + "reward_before_mean": 0.5674029514193535, + "reward_before_std": 0.9499269165098667, + "reward_change_max": 0.0017427802085876465, + "reward_change_mean": -0.450937207788229, + "reward_change_min": -0.9292780607938766, + "reward_change_std": 0.3989896886050701, + "reward_std": 0.8637564107775688, + "rewards/cosine_scaled_reward": 0.044118134304881096, + "rewards/format_reward": 0.47916667722165585, + "step": 122 + }, + { + "advantage_max": 1.3585578612983227, + "advantage_mean": 6.829699028543246e-09, + "advantage_min": -0.5975468009710312, + "advantage_std": 0.7217418989166617, + "completion_length": 3088.791702270508, + "epoch": 0.14057142857142857, + "grad_norm": 0.11229279637336731, + "kl": 0.0062198638916015625, + "lambda_div_used": 0.6, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0001, + "reward": -0.20415015192702413, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.20415015192702413, + "reward_after_std": 0.721741883084178, + "reward_before_mean": 0.07706481404602528, + "reward_before_std": 0.7037300141528249, + "reward_change_max": 0.0008754059672355652, + "reward_change_mean": -0.28121496737003326, + "reward_change_min": -0.5276207067072392, + "reward_change_std": 0.21633497811853886, + "reward_std": 0.7217418989166617, + "rewards/cosine_scaled_reward": -0.13855092599987984, + "rewards/format_reward": 0.3541666716337204, + "step": 123 + }, + { + "advantage_max": 1.5761494934558868, + "advantage_mean": -1.676380662063437e-08, + "advantage_min": -0.8289666064083576, + "advantage_std": 0.9093691222369671, + "completion_length": 2505.1458892822266, + "epoch": 0.1417142857142857, + "grad_norm": 0.1675301343202591, + "kl": 0.0081329345703125, + "lambda_div_used": 0.6, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0567, + "reward": 0.19109472876880318, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19109472876880318, + "reward_after_std": 0.9093690924346447, + "reward_before_mean": 0.6679605036042631, + "reward_before_std": 0.9592476300895214, + "reward_change_max": 0.00041870027780532837, + "reward_change_mean": -0.4768657460808754, + "reward_change_min": -0.989845547825098, + "reward_change_std": 0.40578335523605347, + "reward_std": 0.9093691147863865, + "rewards/cosine_scaled_reward": 0.042313557118177414, + "rewards/format_reward": 0.5833333395421505, + "step": 124 + }, + { + "advantage_max": 1.1822724342346191, + "advantage_mean": 1.8626452047421083e-08, + "advantage_min": -0.5536490008234978, + "advantage_std": 0.6343522779643536, + "completion_length": 2876.7291946411133, + "epoch": 0.14285714285714285, + "grad_norm": 0.07611420750617981, + "kl": 0.00714874267578125, + "lambda_div_used": 0.6, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0114, + "reward": -0.08220624923706055, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08220624923706055, + "reward_after_std": 0.6343522742390633, + "reward_before_mean": 0.2853474132716656, + "reward_before_std": 0.5844516009092331, + "reward_change_max": 0.0006675645709037781, + "reward_change_mean": -0.3675536550581455, + "reward_change_min": -0.6255028396844864, + "reward_change_std": 0.25118013471364975, + "reward_std": 0.634352296590805, + "rewards/cosine_scaled_reward": -0.0031596346525475383, + "rewards/format_reward": 0.2916666679084301, + "step": 125 + }, + { + "advantage_max": 1.1261632405221462, + "advantage_mean": 8.692344205529778e-09, + "advantage_min": -0.5621048547327518, + "advantage_std": 0.637621471658349, + "completion_length": 2894.500045776367, + "epoch": 0.144, + "grad_norm": 0.12789134681224823, + "kl": 0.00453948974609375, + "lambda_div_used": 0.6, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0183, + "reward": -0.2368287304416299, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2368287304416299, + "reward_after_std": 0.6376214865595102, + "reward_before_mean": 0.05406346544623375, + "reward_before_std": 0.6669083181768656, + "reward_change_max": 0.0019116774201393127, + "reward_change_mean": -0.2908922014757991, + "reward_change_min": -0.6447681747376919, + "reward_change_std": 0.2572120614349842, + "reward_std": 0.6376215238124132, + "rewards/cosine_scaled_reward": -0.191718271933496, + "rewards/format_reward": 0.4375000037252903, + "step": 126 + }, + { + "advantage_max": 0.8779657371342182, + "advantage_mean": 2.2662182880273107e-08, + "advantage_min": -0.4594753198325634, + "advantage_std": 0.4753657840192318, + "completion_length": 3165.000015258789, + "epoch": 0.14514285714285713, + "grad_norm": 0.06951512396335602, + "kl": 0.00635528564453125, + "lambda_div_used": 0.6, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0316, + "reward": -0.44154438376426697, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.44154438376426697, + "reward_after_std": 0.4753657877445221, + "reward_before_mean": -0.23894503759220243, + "reward_before_std": 0.4717731177806854, + "reward_change_max": 0.0, + "reward_change_mean": -0.2025993438437581, + "reward_change_min": -0.4216236099600792, + "reward_change_std": 0.16746219526976347, + "reward_std": 0.4753657951951027, + "rewards/cosine_scaled_reward": -0.24447251576930285, + "rewards/format_reward": 0.2500000074505806, + "step": 127 + }, + { + "advantage_max": 1.191404215991497, + "advantage_mean": -1.490116174895917e-08, + "advantage_min": -0.836443617939949, + "advantage_std": 0.7149866968393326, + "completion_length": 2884.8958587646484, + "epoch": 0.1462857142857143, + "grad_norm": 0.10610302537679672, + "kl": 0.006505012512207031, + "lambda_div_used": 0.6, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0369, + "reward": 0.10793188214302063, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10793188214302063, + "reward_after_std": 0.7149866968393326, + "reward_before_mean": 0.5834506526589394, + "reward_before_std": 0.7736126147210598, + "reward_change_max": 0.0012885704636573792, + "reward_change_mean": -0.4755187965929508, + "reward_change_min": -0.8745973594486713, + "reward_change_std": 0.3618140686303377, + "reward_std": 0.7149867117404938, + "rewards/cosine_scaled_reward": 0.09380867145955563, + "rewards/format_reward": 0.3958333432674408, + "step": 128 + }, + { + "advantage_max": 1.582452967762947, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.6322049722075462, + "advantage_std": 0.8493271358311176, + "completion_length": 3381.8541870117188, + "epoch": 0.14742857142857144, + "grad_norm": 0.1458180546760559, + "kl": 0.0063571929931640625, + "lambda_div_used": 0.6, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0135, + "reward": -0.2717202575877309, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2717202575877309, + "reward_after_std": 0.8493271190673113, + "reward_before_mean": -0.05207864195108414, + "reward_before_std": 0.8756519798189402, + "reward_change_max": 0.0013925209641456604, + "reward_change_mean": -0.2196416319347918, + "reward_change_min": -0.5734671168029308, + "reward_change_std": 0.22998599475249648, + "reward_std": 0.8493271507322788, + "rewards/cosine_scaled_reward": -0.15103933541104198, + "rewards/format_reward": 0.25000000558793545, + "step": 129 + }, + { + "advantage_max": 1.3907492086291313, + "advantage_mean": 6.8296989730320945e-09, + "advantage_min": -0.6126744374632835, + "advantage_std": 0.7462209239602089, + "completion_length": 3162.6250915527344, + "epoch": 0.14857142857142858, + "grad_norm": 0.13115747272968292, + "kl": 0.006969451904296875, + "lambda_div_used": 0.6, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0978, + "reward": -0.29135762667283416, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.29135762667283416, + "reward_after_std": 0.7462209612131119, + "reward_before_mean": -0.0620481688529253, + "reward_before_std": 0.7572699598968029, + "reward_change_max": 0.0009041875600814819, + "reward_change_mean": -0.2293094601482153, + "reward_change_min": -0.5258476063609123, + "reward_change_std": 0.20949362684041262, + "reward_std": 0.7462209723889828, + "rewards/cosine_scaled_reward": -0.15602409280836582, + "rewards/format_reward": 0.25000000558793545, + "step": 130 + }, + { + "advantage_max": 1.4393389448523521, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.8425508216023445, + "advantage_std": 0.821558766067028, + "completion_length": 2863.6250228881836, + "epoch": 0.14971428571428572, + "grad_norm": 0.11954847723245621, + "kl": 0.008882522583007812, + "lambda_div_used": 0.6, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0408, + "reward": 0.10050448589026928, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10050448589026928, + "reward_after_std": 0.8215587623417377, + "reward_before_mean": 0.5430004335939884, + "reward_before_std": 0.8587308675050735, + "reward_change_max": 0.0007584765553474426, + "reward_change_mean": -0.4424959532916546, + "reward_change_min": -0.8454087376594543, + "reward_change_std": 0.3576465295627713, + "reward_std": 0.8215587809681892, + "rewards/cosine_scaled_reward": 0.07358355727046728, + "rewards/format_reward": 0.3958333432674408, + "step": 131 + }, + { + "advantage_max": 1.5376378148794174, + "advantage_mean": 6.208821234920947e-10, + "advantage_min": -0.5504110679030418, + "advantage_std": 0.8050966262817383, + "completion_length": 2799.312530517578, + "epoch": 0.15085714285714286, + "grad_norm": 0.12419920414686203, + "kl": 0.0067138671875, + "lambda_div_used": 0.6, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0195, + "reward": -0.19808600842952728, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19808600842952728, + "reward_after_std": 0.8050966449081898, + "reward_before_mean": 0.06645407644100487, + "reward_before_std": 0.7814781591296196, + "reward_change_max": 0.0012718439102172852, + "reward_change_mean": -0.2645400739274919, + "reward_change_min": -0.5789077877998352, + "reward_change_std": 0.2121976970229298, + "reward_std": 0.8050966635346413, + "rewards/cosine_scaled_reward": -0.1334396367892623, + "rewards/format_reward": 0.33333334140479565, + "step": 132 + }, + { + "advantage_max": 1.221783734858036, + "advantage_mean": 1.924733383784627e-08, + "advantage_min": -0.5688916221261024, + "advantage_std": 0.6676540970802307, + "completion_length": 3213.8959045410156, + "epoch": 0.152, + "grad_norm": 0.14848573505878448, + "kl": 0.008680343627929688, + "lambda_div_used": 0.6, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0429, + "reward": -0.2992970459163189, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2992970459163189, + "reward_after_std": 0.6676540970802307, + "reward_before_mean": -0.05562702799215913, + "reward_before_std": 0.6827064771205187, + "reward_change_max": 0.0013222172856330872, + "reward_change_mean": -0.24367000348865986, + "reward_change_min": -0.5533005259931087, + "reward_change_std": 0.2156898146495223, + "reward_std": 0.6676541119813919, + "rewards/cosine_scaled_reward": -0.17364685283973813, + "rewards/format_reward": 0.2916666716337204, + "step": 133 + }, + { + "advantage_max": 1.0847848951816559, + "advantage_mean": 1.1796752852344383e-08, + "advantage_min": -0.6085241474211216, + "advantage_std": 0.6005263235419989, + "completion_length": 2500.0833740234375, + "epoch": 0.15314285714285714, + "grad_norm": 0.11016670614480972, + "kl": 0.00949859619140625, + "lambda_div_used": 0.6, + "learning_rate": 9.248145583195447e-07, + "loss": -0.0017, + "reward": 0.05842417012900114, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05842417012900114, + "reward_after_std": 0.6005263309925795, + "reward_before_mean": 0.5181667357683182, + "reward_before_std": 0.5441521927714348, + "reward_change_max": 0.0009170100092887878, + "reward_change_mean": -0.4597425349056721, + "reward_change_min": -0.7528548240661621, + "reward_change_std": 0.2978973565623164, + "reward_std": 0.6005263365805149, + "rewards/cosine_scaled_reward": -0.032583314925432205, + "rewards/format_reward": 0.5833333432674408, + "step": 134 + }, + { + "advantage_max": 1.4369381442666054, + "advantage_mean": -1.6142925440831846e-08, + "advantage_min": -0.8640343248844147, + "advantage_std": 0.8443533703684807, + "completion_length": 2017.4166946411133, + "epoch": 0.15428571428571428, + "grad_norm": 0.20822252333164215, + "kl": 0.007670402526855469, + "lambda_div_used": 0.6, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0579, + "reward": 0.5198374316096306, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5198374316096306, + "reward_after_std": 0.8443533480167389, + "reward_before_mean": 1.1967530995607376, + "reward_before_std": 0.8075528107583523, + "reward_change_max": 0.0, + "reward_change_mean": -0.6769156903028488, + "reward_change_min": -1.1189482659101486, + "reward_change_std": 0.4737038407474756, + "reward_std": 0.8443533927202225, + "rewards/cosine_scaled_reward": 0.2650432363152504, + "rewards/format_reward": 0.666666679084301, + "step": 135 + }, + { + "advantage_max": 1.6212140694260597, + "advantage_mean": -3.7252899653950067e-09, + "advantage_min": -0.9679715968668461, + "advantage_std": 0.9402196696028113, + "completion_length": 2955.5208740234375, + "epoch": 0.15542857142857142, + "grad_norm": 0.15110325813293457, + "kl": 0.009227752685546875, + "lambda_div_used": 0.6, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0592, + "reward": 0.1325805252417922, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1325805252417922, + "reward_after_std": 0.9402196817100048, + "reward_before_mean": 0.5687055559828877, + "reward_before_std": 1.0172571474686265, + "reward_change_max": 0.0006020441651344299, + "reward_change_mean": -0.4361250060610473, + "reward_change_min": -0.97487448528409, + "reward_change_std": 0.39161210390739143, + "reward_std": 0.940219696611166, + "rewards/cosine_scaled_reward": 0.0656027642544359, + "rewards/format_reward": 0.43750001676380634, + "step": 136 + }, + { + "advantage_max": 1.3895707875490189, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.5488235056400299, + "advantage_std": 0.7214406877756119, + "completion_length": 2908.4792098999023, + "epoch": 0.15657142857142858, + "grad_norm": 0.12555868923664093, + "kl": 0.007396697998046875, + "lambda_div_used": 0.6, + "learning_rate": 9.195171441101668e-07, + "loss": 0.041, + "reward": -0.27986384340329096, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.27986384340329096, + "reward_after_std": 0.7214406877756119, + "reward_before_mean": -0.0438942676410079, + "reward_before_std": 0.6932445093989372, + "reward_change_max": 0.00017392635345458984, + "reward_change_mean": -0.23596958350390196, + "reward_change_min": -0.4737580604851246, + "reward_change_std": 0.18140669167041779, + "reward_std": 0.7214406877756119, + "rewards/cosine_scaled_reward": -0.23028047289699316, + "rewards/format_reward": 0.4166666716337204, + "step": 137 + }, + { + "advantage_max": 1.1234722062945366, + "advantage_mean": -4.9670538238011375e-09, + "advantage_min": -0.522578340023756, + "advantage_std": 0.5962201803922653, + "completion_length": 2548.2709197998047, + "epoch": 0.15771428571428572, + "grad_norm": 0.07525104284286499, + "kl": 0.0075054168701171875, + "lambda_div_used": 0.6, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0393, + "reward": -0.03399332519620657, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03399332519620657, + "reward_after_std": 0.5962201841175556, + "reward_before_mean": 0.36882060393691063, + "reward_before_std": 0.5129054486751556, + "reward_change_max": 0.0, + "reward_change_mean": -0.402813920751214, + "reward_change_min": -0.652053989470005, + "reward_change_std": 0.2527422411367297, + "reward_std": 0.5962201952934265, + "rewards/cosine_scaled_reward": -0.10725637432187796, + "rewards/format_reward": 0.5833333432674408, + "step": 138 + }, + { + "advantage_max": 1.276498556137085, + "advantage_mean": 9.313225912688239e-09, + "advantage_min": -0.7927935421466827, + "advantage_std": 0.754135251045227, + "completion_length": 3237.8958740234375, + "epoch": 0.15885714285714286, + "grad_norm": 0.15498626232147217, + "kl": 0.0106353759765625, + "lambda_div_used": 0.6, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0157, + "reward": 0.017847408074885607, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.017847408074885607, + "reward_after_std": 0.7541352808475494, + "reward_before_mean": 0.4330185279250145, + "reward_before_std": 0.8115034103393555, + "reward_change_max": 0.00047279149293899536, + "reward_change_mean": -0.415171118453145, + "reward_change_min": -0.8247731365263462, + "reward_change_std": 0.3465948710218072, + "reward_std": 0.7541352920234203, + "rewards/cosine_scaled_reward": -0.002240734174847603, + "rewards/format_reward": 0.4375000149011612, + "step": 139 + }, + { + "advantage_max": 1.0571947619318962, + "advantage_mean": 1.800557003495129e-08, + "advantage_min": -0.43688103556632996, + "advantage_std": 0.5635790098458529, + "completion_length": 3024.5209045410156, + "epoch": 0.16, + "grad_norm": 0.12986275553703308, + "kl": 0.01467132568359375, + "lambda_div_used": 0.6, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0497, + "reward": -0.24928517825901508, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.24928517825901508, + "reward_after_std": 0.563579011708498, + "reward_before_mean": 0.0354112945497036, + "reward_before_std": 0.5041212420910597, + "reward_change_max": 0.0016125962138175964, + "reward_change_mean": -0.2846964537166059, + "reward_change_min": -0.482378251850605, + "reward_change_std": 0.19983111508190632, + "reward_std": 0.5635790284723043, + "rewards/cosine_scaled_reward": -0.12812769412994385, + "rewards/format_reward": 0.2916666753590107, + "step": 140 + }, + { + "advantage_max": 1.9671744778752327, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.89418925344944, + "advantage_std": 1.0909193567931652, + "completion_length": 2874.104232788086, + "epoch": 0.16114285714285714, + "grad_norm": 0.16195128858089447, + "kl": 0.014141082763671875, + "lambda_div_used": 0.6, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0402, + "reward": 0.2075315216789022, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2075315216789022, + "reward_after_std": 1.0909193120896816, + "reward_before_mean": 0.6503730427939445, + "reward_before_std": 1.1303999871015549, + "reward_change_max": 0.0007979348301887512, + "reward_change_mean": -0.4428415335714817, + "reward_change_min": -0.9751441776752472, + "reward_change_std": 0.3947232998907566, + "reward_std": 1.090919341892004, + "rewards/cosine_scaled_reward": 0.033519853837788105, + "rewards/format_reward": 0.5833333432674408, + "step": 141 + }, + { + "advantage_max": 1.5644628405570984, + "advantage_mean": -7.140139812733537e-09, + "advantage_min": -0.9668753743171692, + "advantage_std": 0.8950626216828823, + "completion_length": 2810.8334197998047, + "epoch": 0.16228571428571428, + "grad_norm": 0.1719711571931839, + "kl": 0.01021575927734375, + "lambda_div_used": 0.6, + "learning_rate": 9.103291169269299e-07, + "loss": 0.1232, + "reward": 0.15125958062708378, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.15125958062708378, + "reward_after_std": 0.8950626142323017, + "reward_before_mean": 0.6069081444293261, + "reward_before_std": 0.9450072459876537, + "reward_change_max": 0.0012427493929862976, + "reward_change_mean": -0.4556485563516617, + "reward_change_min": -0.8686264418065548, + "reward_change_std": 0.37062836065888405, + "reward_std": 0.8950626626610756, + "rewards/cosine_scaled_reward": 0.0013707317411899567, + "rewards/format_reward": 0.6041666846722364, + "step": 142 + }, + { + "advantage_max": 1.0218571051955223, + "advantage_mean": 2.483526928553914e-08, + "advantage_min": -0.4859629459679127, + "advantage_std": 0.5451667793095112, + "completion_length": 2720.604202270508, + "epoch": 0.16342857142857142, + "grad_norm": 0.19208702445030212, + "kl": 0.013195037841796875, + "lambda_div_used": 0.6, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0667, + "reward": -0.3548990674316883, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3548990674316883, + "reward_after_std": 0.5451667718589306, + "reward_before_mean": -0.12133161188103259, + "reward_before_std": 0.5252682417631149, + "reward_change_max": 0.0, + "reward_change_mean": -0.23356744460761547, + "reward_change_min": -0.471222460269928, + "reward_change_std": 0.18014120031148195, + "reward_std": 0.5451667830348015, + "rewards/cosine_scaled_reward": -0.2377491444349289, + "rewards/format_reward": 0.35416666977107525, + "step": 143 + }, + { + "advantage_max": 1.4134826138615608, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.6104103177785873, + "advantage_std": 0.7537709623575211, + "completion_length": 2884.5833740234375, + "epoch": 0.16457142857142856, + "grad_norm": 0.11878510564565659, + "kl": 0.0088043212890625, + "lambda_div_used": 0.6, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0401, + "reward": -0.055001866072416306, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.055001866072416306, + "reward_after_std": 0.7537709586322308, + "reward_before_mean": 0.3030646312981844, + "reward_before_std": 0.7174690030515194, + "reward_change_max": 0.0010922551155090332, + "reward_change_mean": -0.3580665346235037, + "reward_change_min": -0.6770821623504162, + "reward_change_std": 0.2684730626642704, + "reward_std": 0.7537709660828114, + "rewards/cosine_scaled_reward": -0.03596767038106918, + "rewards/format_reward": 0.37500000186264515, + "step": 144 + }, + { + "advantage_max": 1.1855008229613304, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.6026959903538227, + "advantage_std": 0.6411144360899925, + "completion_length": 2345.3125381469727, + "epoch": 0.1657142857142857, + "grad_norm": 0.11563977599143982, + "kl": 0.01104736328125, + "lambda_div_used": 0.6, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0254, + "reward": 0.12009605206549168, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12009605206549168, + "reward_after_std": 0.6411144360899925, + "reward_before_mean": 0.5996555439196527, + "reward_before_std": 0.5601776875555515, + "reward_change_max": 0.0, + "reward_change_mean": -0.47955949790775776, + "reward_change_min": -0.8012478165328503, + "reward_change_std": 0.30504490062594414, + "reward_std": 0.6411144435405731, + "rewards/cosine_scaled_reward": -0.0022555720061063766, + "rewards/format_reward": 0.6041666753590107, + "step": 145 + }, + { + "advantage_max": 1.2408719435334206, + "advantage_mean": 1.8936893220189432e-08, + "advantage_min": -0.7066081799566746, + "advantage_std": 0.7005535922944546, + "completion_length": 2367.187545776367, + "epoch": 0.16685714285714287, + "grad_norm": 0.19457805156707764, + "kl": 0.0077648162841796875, + "lambda_div_used": 0.6, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0806, + "reward": -0.08792629465460777, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08792629465460777, + "reward_after_std": 0.7005535922944546, + "reward_before_mean": 0.2676168754696846, + "reward_before_std": 0.7161291688680649, + "reward_change_max": 0.0023860037326812744, + "reward_change_mean": -0.35554315708577633, + "reward_change_min": -0.7299299165606499, + "reward_change_std": 0.2929388973861933, + "reward_std": 0.7005536258220673, + "rewards/cosine_scaled_reward": -0.15785823203623295, + "rewards/format_reward": 0.5833333414047956, + "step": 146 + }, + { + "advantage_max": 1.4107197225093842, + "advantage_mean": 6.208817571184966e-09, + "advantage_min": -0.657046489417553, + "advantage_std": 0.7767920307815075, + "completion_length": 2868.625015258789, + "epoch": 0.168, + "grad_norm": 0.14225320518016815, + "kl": 0.0139923095703125, + "lambda_div_used": 0.6, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0227, + "reward": -0.1404053345322609, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1404053345322609, + "reward_after_std": 0.7767920270562172, + "reward_before_mean": 0.1694311499595642, + "reward_before_std": 0.7943050377070904, + "reward_change_max": 0.00027482956647872925, + "reward_change_mean": -0.3098364733159542, + "reward_change_min": -0.7106362171471119, + "reward_change_std": 0.2728282855823636, + "reward_std": 0.7767920419573784, + "rewards/cosine_scaled_reward": -0.09236776456236839, + "rewards/format_reward": 0.3541666753590107, + "step": 147 + }, + { + "advantage_max": 1.4902683272957802, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.6846123561263084, + "advantage_std": 0.8118891194462776, + "completion_length": 2573.291732788086, + "epoch": 0.16914285714285715, + "grad_norm": 0.12192974984645844, + "kl": 0.012142181396484375, + "lambda_div_used": 0.6, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0225, + "reward": 0.1568167768418789, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1568167768418789, + "reward_after_std": 0.8118891008198261, + "reward_before_mean": 0.6254363059997559, + "reward_before_std": 0.7636519968509674, + "reward_change_max": 0.0007330328226089478, + "reward_change_mean": -0.4686195347458124, + "reward_change_min": -0.8460842408239841, + "reward_change_std": 0.32862727902829647, + "reward_std": 0.8118891268968582, + "rewards/cosine_scaled_reward": 0.021051467396318913, + "rewards/format_reward": 0.5833333414047956, + "step": 148 + }, + { + "advantage_max": 1.4633513800799847, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.6385081559419632, + "advantage_std": 0.7907122951000929, + "completion_length": 2929.1458740234375, + "epoch": 0.1702857142857143, + "grad_norm": 0.13244077563285828, + "kl": 0.01035308837890625, + "lambda_div_used": 0.6, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0284, + "reward": -0.1310775459278375, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1310775459278375, + "reward_after_std": 0.7907122801989317, + "reward_before_mean": 0.18053598888218403, + "reward_before_std": 0.7945210449397564, + "reward_change_max": 0.0018289387226104736, + "reward_change_mean": -0.31161350570619106, + "reward_change_min": -0.6495312862098217, + "reward_change_std": 0.26187097700312734, + "reward_std": 0.7907122857868671, + "rewards/cosine_scaled_reward": -0.12848201533779502, + "rewards/format_reward": 0.4375000074505806, + "step": 149 + }, + { + "advantage_max": 1.4026121124625206, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.9239209443330765, + "advantage_std": 0.8353138975799084, + "completion_length": 2977.812515258789, + "epoch": 0.17142857142857143, + "grad_norm": 0.17636874318122864, + "kl": 0.0157928466796875, + "lambda_div_used": 0.6, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0718, + "reward": 0.16062995791435242, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16062995791435242, + "reward_after_std": 0.8353139087557793, + "reward_before_mean": 0.6395070925354958, + "reward_before_std": 0.9086871258914471, + "reward_change_max": 0.0, + "reward_change_mean": -0.47887711599469185, + "reward_change_min": -0.9224202036857605, + "reward_change_std": 0.38951124995946884, + "reward_std": 0.8353139348328114, + "rewards/cosine_scaled_reward": 0.11142019368708134, + "rewards/format_reward": 0.4166666753590107, + "step": 150 + }, + { + "advantage_max": 1.6562565043568611, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.8369650840759277, + "advantage_std": 0.9075136668980122, + "completion_length": 2601.812545776367, + "epoch": 0.17257142857142857, + "grad_norm": 0.20402175188064575, + "kl": 0.01294708251953125, + "lambda_div_used": 0.6, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0533, + "reward": 0.19714290462434292, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19714290462434292, + "reward_after_std": 0.9075136668980122, + "reward_before_mean": 0.6664761131396517, + "reward_before_std": 0.8824147135019302, + "reward_change_max": 0.00037372857332229614, + "reward_change_mean": -0.4693332202732563, + "reward_change_min": -0.8574397899210453, + "reward_change_std": 0.35109512601047754, + "reward_std": 0.9075136780738831, + "rewards/cosine_scaled_reward": 0.05198805220425129, + "rewards/format_reward": 0.5625000149011612, + "step": 151 + }, + { + "advantage_max": 1.0910377725958824, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.5008981414139271, + "advantage_std": 0.5850908644497395, + "completion_length": 2868.7916984558105, + "epoch": 0.1737142857142857, + "grad_norm": 0.1328096240758896, + "kl": 0.016147613525390625, + "lambda_div_used": 0.6, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0549, + "reward": -0.28469391725957394, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.28469391725957394, + "reward_after_std": 0.5850908793509007, + "reward_before_mean": -0.020222843624651432, + "reward_before_std": 0.563703091815114, + "reward_change_max": 0.0006050914525985718, + "reward_change_mean": -0.2644710736349225, + "reward_change_min": -0.48153096437454224, + "reward_change_std": 0.19808495230972767, + "reward_std": 0.5850908905267715, + "rewards/cosine_scaled_reward": -0.16636142708011903, + "rewards/format_reward": 0.31250000186264515, + "step": 152 + }, + { + "advantage_max": 1.010260485112667, + "advantage_mean": 9.313225690643634e-09, + "advantage_min": -0.5362804085016251, + "advantage_std": 0.5542087778449059, + "completion_length": 2930.8750610351562, + "epoch": 0.17485714285714285, + "grad_norm": 0.10720986872911453, + "kl": 0.021121978759765625, + "lambda_div_used": 0.6, + "learning_rate": 8.88586709003076e-07, + "loss": 0.043, + "reward": -0.3041149973869324, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3041149973869324, + "reward_after_std": 0.5542087741196156, + "reward_before_mean": -0.0405734870582819, + "reward_before_std": 0.5564620681107044, + "reward_change_max": 0.001681201159954071, + "reward_change_mean": -0.2635415093973279, + "reward_change_min": -0.5015093982219696, + "reward_change_std": 0.20337554719299078, + "reward_std": 0.5542087815701962, + "rewards/cosine_scaled_reward": -0.18695341609418392, + "rewards/format_reward": 0.33333334140479565, + "step": 153 + }, + { + "advantage_max": 1.5810632705688477, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.8457796424627304, + "advantage_std": 0.9236548617482185, + "completion_length": 3452.8333740234375, + "epoch": 0.176, + "grad_norm": 0.1580066680908203, + "kl": 0.012538909912109375, + "lambda_div_used": 0.6, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0571, + "reward": -0.070727514103055, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.070727514103055, + "reward_after_std": 0.9236548617482185, + "reward_before_mean": 0.2621636036783457, + "reward_before_std": 1.036940049380064, + "reward_change_max": 0.001263946294784546, + "reward_change_mean": -0.3328911308199167, + "reward_change_min": -0.8280980698764324, + "reward_change_std": 0.36182656791061163, + "reward_std": 0.9236549139022827, + "rewards/cosine_scaled_reward": -0.004334868863224983, + "rewards/format_reward": 0.2708333395421505, + "step": 154 + }, + { + "advantage_max": 1.1857907101511955, + "advantage_mean": 8.071462720415923e-09, + "advantage_min": -0.5334465727210045, + "advantage_std": 0.6514654830098152, + "completion_length": 2667.937545776367, + "epoch": 0.17714285714285713, + "grad_norm": 0.10593610256910324, + "kl": 0.012603759765625, + "lambda_div_used": 0.6, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0209, + "reward": 0.15297628194093704, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15297628194093704, + "reward_after_std": 0.6514654830098152, + "reward_before_mean": 0.647703853668645, + "reward_before_std": 0.570495992898941, + "reward_change_max": 0.0, + "reward_change_mean": -0.4947275333106518, + "reward_change_min": -0.8316370770335197, + "reward_change_std": 0.32937124744057655, + "reward_std": 0.6514654979109764, + "rewards/cosine_scaled_reward": 0.07385189644992352, + "rewards/format_reward": 0.5000000018626451, + "step": 155 + }, + { + "advantage_max": 1.4448847994208336, + "advantage_mean": 9.934107980669182e-09, + "advantage_min": -0.723850317299366, + "advantage_std": 0.7949776016175747, + "completion_length": 3009.9583587646484, + "epoch": 0.1782857142857143, + "grad_norm": 0.12247823923826218, + "kl": 0.013317108154296875, + "lambda_div_used": 0.6, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0451, + "reward": -0.10736257396638393, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10736257396638393, + "reward_after_std": 0.7949775978922844, + "reward_before_mean": 0.2161726988852024, + "reward_before_std": 0.8142803534865379, + "reward_change_max": 0.001227453351020813, + "reward_change_mean": -0.3235352849587798, + "reward_change_min": -0.6421325244009495, + "reward_change_std": 0.2678525187075138, + "reward_std": 0.7949776314198971, + "rewards/cosine_scaled_reward": -0.05858031287789345, + "rewards/format_reward": 0.3333333469927311, + "step": 156 + }, + { + "advantage_max": 0.8361185044050217, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.5154761485755444, + "advantage_std": 0.48633952997624874, + "completion_length": 3123.854248046875, + "epoch": 0.17942857142857144, + "grad_norm": 0.09848199039697647, + "kl": 0.017620086669921875, + "lambda_div_used": 0.6, + "learning_rate": 8.801784390262943e-07, + "loss": 0.037, + "reward": -0.2843034298857674, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2843034298857674, + "reward_after_std": 0.48633952997624874, + "reward_before_mean": 0.007725675590336323, + "reward_before_std": 0.5010611899197102, + "reward_change_max": 0.0008352473378181458, + "reward_change_mean": -0.2920291116461158, + "reward_change_min": -0.5376311354339123, + "reward_change_std": 0.22594552859663963, + "reward_std": 0.48633954487740993, + "rewards/cosine_scaled_reward": -0.18363716267049313, + "rewards/format_reward": 0.37500000558793545, + "step": 157 + }, + { + "advantage_max": 1.601955957710743, + "advantage_mean": -1.3659398390153399e-08, + "advantage_min": -0.7452918365597725, + "advantage_std": 0.8648084662854671, + "completion_length": 3046.4583892822266, + "epoch": 0.18057142857142858, + "grad_norm": 0.18506969511508942, + "kl": 0.013607025146484375, + "lambda_div_used": 0.6, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0621, + "reward": 0.21542098745703697, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21542098745703697, + "reward_after_std": 0.8648084886372089, + "reward_before_mean": 0.7011261153966188, + "reward_before_std": 0.824807170778513, + "reward_change_max": 0.0014419779181480408, + "reward_change_mean": -0.4857051018625498, + "reward_change_min": -0.9043072685599327, + "reward_change_std": 0.3465930465608835, + "reward_std": 0.8648085445165634, + "rewards/cosine_scaled_reward": 0.11097969580441713, + "rewards/format_reward": 0.47916667722165585, + "step": 158 + }, + { + "advantage_max": 1.0933760181069374, + "advantage_mean": 2.359350542713301e-08, + "advantage_min": -0.519902590662241, + "advantage_std": 0.5967991352081299, + "completion_length": 3206.4166870117188, + "epoch": 0.18171428571428572, + "grad_norm": 0.09166071563959122, + "kl": 0.0195465087890625, + "lambda_div_used": 0.6, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0258, + "reward": -0.2743441015481949, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2743441015481949, + "reward_after_std": 0.5967991352081299, + "reward_before_mean": -0.003233685391023755, + "reward_before_std": 0.5897813513875008, + "reward_change_max": 0.0, + "reward_change_mean": -0.27111040614545345, + "reward_change_min": -0.5754702016711235, + "reward_change_std": 0.21552940551191568, + "reward_std": 0.5967991426587105, + "rewards/cosine_scaled_reward": -0.13703351188451052, + "rewards/format_reward": 0.2708333432674408, + "step": 159 + }, + { + "advantage_max": 1.1551932729780674, + "advantage_mean": 2.5456150964942026e-08, + "advantage_min": -0.7090373113751411, + "advantage_std": 0.635726273059845, + "completion_length": 2818.000015258789, + "epoch": 0.18285714285714286, + "grad_norm": 0.10902596265077591, + "kl": 0.021236419677734375, + "lambda_div_used": 0.6, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0041, + "reward": 0.001515369862318039, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.001515369862318039, + "reward_after_std": 0.6357262693345547, + "reward_before_mean": 0.4176196716725826, + "reward_before_std": 0.6051103845238686, + "reward_change_max": 0.00033104419708251953, + "reward_change_mean": -0.4161042859777808, + "reward_change_min": -0.6843418888747692, + "reward_change_std": 0.27768101543188095, + "reward_std": 0.635726273059845, + "rewards/cosine_scaled_reward": -0.009940192103385925, + "rewards/format_reward": 0.4375000074505806, + "step": 160 + }, + { + "advantage_max": 1.213698647916317, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.552368201315403, + "advantage_std": 0.6527365148067474, + "completion_length": 2969.9166870117188, + "epoch": 0.184, + "grad_norm": 0.17307765781879425, + "kl": 0.02019500732421875, + "lambda_div_used": 0.6, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0628, + "reward": -0.10974358767271042, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10974358767271042, + "reward_after_std": 0.652736522257328, + "reward_before_mean": 0.23834371659904718, + "reward_before_std": 0.6073960103094578, + "reward_change_max": 0.000466175377368927, + "reward_change_mean": -0.34808731684461236, + "reward_change_min": -0.6483714245259762, + "reward_change_std": 0.25496606389060616, + "reward_std": 0.6527365408837795, + "rewards/cosine_scaled_reward": -0.05791149102151394, + "rewards/format_reward": 0.3541666753590107, + "step": 161 + }, + { + "advantage_max": 1.5892940908670425, + "advantage_mean": 8.692344621863413e-09, + "advantage_min": -0.5379619300365448, + "advantage_std": 0.8178718462586403, + "completion_length": 3311.2083435058594, + "epoch": 0.18514285714285714, + "grad_norm": 0.16470083594322205, + "kl": 0.02435302734375, + "lambda_div_used": 0.6, + "learning_rate": 8.693068314414344e-07, + "loss": 0.012, + "reward": -0.2619167543016374, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2619167543016374, + "reward_after_std": 0.8178718611598015, + "reward_before_mean": -0.0391325494274497, + "reward_before_std": 0.788720153272152, + "reward_change_max": 0.0004187747836112976, + "reward_change_mean": -0.2227841846179217, + "reward_change_min": -0.5147033594548702, + "reward_change_std": 0.18670457205735147, + "reward_std": 0.8178718686103821, + "rewards/cosine_scaled_reward": -0.12373294867575169, + "rewards/format_reward": 0.2083333358168602, + "step": 162 + }, + { + "advantage_max": 1.5184885412454605, + "advantage_mean": -5.277494746769307e-09, + "advantage_min": -0.7798839248716831, + "advantage_std": 0.8345548771321774, + "completion_length": 2602.625045776367, + "epoch": 0.18628571428571428, + "grad_norm": 0.15418756008148193, + "kl": 0.018619537353515625, + "lambda_div_used": 0.6, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0306, + "reward": 0.09741232171654701, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09741232171654701, + "reward_after_std": 0.8345548585057259, + "reward_before_mean": 0.5306345459539443, + "reward_before_std": 0.8378158137202263, + "reward_change_max": 0.0007125288248062134, + "reward_change_mean": -0.43322221748530865, + "reward_change_min": -0.800242580473423, + "reward_change_std": 0.31650349078699946, + "reward_std": 0.8345548957586288, + "rewards/cosine_scaled_reward": -0.005516069009900093, + "rewards/format_reward": 0.5416666772216558, + "step": 163 + }, + { + "advantage_max": 1.150291495025158, + "advantage_mean": -5.587935503204022e-09, + "advantage_min": -0.7518020123243332, + "advantage_std": 0.6527398601174355, + "completion_length": 2719.7708892822266, + "epoch": 0.18742857142857142, + "grad_norm": 0.1282048225402832, + "kl": 0.020420074462890625, + "lambda_div_used": 0.6, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0287, + "reward": 0.05534203629940748, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05534203629940748, + "reward_after_std": 0.6527398601174355, + "reward_before_mean": 0.503211950417608, + "reward_before_std": 0.6376418210566044, + "reward_change_max": 0.0012490972876548767, + "reward_change_mean": -0.44786988385021687, + "reward_change_min": -0.7799138650298119, + "reward_change_std": 0.3177740080282092, + "reward_std": 0.6527398899197578, + "rewards/cosine_scaled_reward": 0.001605958677828312, + "rewards/format_reward": 0.5000000167638063, + "step": 164 + }, + { + "advantage_max": 1.3831028826534748, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.6921538636088371, + "advantage_std": 0.7546810247004032, + "completion_length": 3058.7291870117188, + "epoch": 0.18857142857142858, + "grad_norm": 0.15882042050361633, + "kl": 0.0272674560546875, + "lambda_div_used": 0.6, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0616, + "reward": -0.1579680386930704, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1579680386930704, + "reward_after_std": 0.754681009799242, + "reward_before_mean": 0.14193058013916016, + "reward_before_std": 0.762186162173748, + "reward_change_max": 0.0010264962911605835, + "reward_change_mean": -0.2998986216261983, + "reward_change_min": -0.621304091066122, + "reward_change_std": 0.25608566775918007, + "reward_std": 0.7546810209751129, + "rewards/cosine_scaled_reward": -0.10611804574728012, + "rewards/format_reward": 0.35416668094694614, + "step": 165 + }, + { + "advantage_max": 1.441053494811058, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.8313321396708488, + "advantage_std": 0.8061891719698906, + "completion_length": 3003.104217529297, + "epoch": 0.18971428571428572, + "grad_norm": 0.13318245112895966, + "kl": 0.01674652099609375, + "lambda_div_used": 0.6, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0304, + "reward": -0.012070178985595703, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.012070178985595703, + "reward_after_std": 0.8061891719698906, + "reward_before_mean": 0.3683839440345764, + "reward_before_std": 0.8240198995918036, + "reward_change_max": 0.002330496907234192, + "reward_change_mean": -0.38045413699001074, + "reward_change_min": -0.7002588920295238, + "reward_change_std": 0.3043457716703415, + "reward_std": 0.8061891756951809, + "rewards/cosine_scaled_reward": -0.03455802670214325, + "rewards/format_reward": 0.43750001303851604, + "step": 166 + }, + { + "advantage_max": 1.423269435763359, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.7583351470530033, + "advantage_std": 0.8086444661021233, + "completion_length": 2296.2709045410156, + "epoch": 0.19085714285714286, + "grad_norm": 0.1586252748966217, + "kl": 0.01633453369140625, + "lambda_div_used": 0.6, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0376, + "reward": 0.12070143315941095, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.12070143315941095, + "reward_after_std": 0.8086444847285748, + "reward_before_mean": 0.5716481516137719, + "reward_before_std": 0.8213043101131916, + "reward_change_max": 0.004665866494178772, + "reward_change_mean": -0.45094671472907066, + "reward_change_min": -0.8644659072160721, + "reward_change_std": 0.349481089040637, + "reward_std": 0.808644488453865, + "rewards/cosine_scaled_reward": -0.0995926121249795, + "rewards/format_reward": 0.7708333432674408, + "step": 167 + }, + { + "advantage_max": 1.5459800511598587, + "advantage_mean": -1.4901161193847656e-08, + "advantage_min": -0.823444951325655, + "advantage_std": 0.8288298845291138, + "completion_length": 3093.0834045410156, + "epoch": 0.192, + "grad_norm": 0.15506742894649506, + "kl": 0.0215911865234375, + "lambda_div_used": 0.6, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0362, + "reward": 0.10060895385686308, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10060895385686308, + "reward_after_std": 0.828829899430275, + "reward_before_mean": 0.5286179166287184, + "reward_before_std": 0.7910416908562183, + "reward_change_max": 0.0007976368069648743, + "reward_change_mean": -0.4280089922249317, + "reward_change_min": -0.7011307924985886, + "reward_change_std": 0.29120912682265043, + "reward_std": 0.8288299404084682, + "rewards/cosine_scaled_reward": 0.02472563646733761, + "rewards/format_reward": 0.47916668094694614, + "step": 168 + }, + { + "advantage_max": 1.312043160200119, + "advantage_mean": -3.2285850604107935e-08, + "advantage_min": -0.5426613911986351, + "advantage_std": 0.7060335651040077, + "completion_length": 2461.9792404174805, + "epoch": 0.19314285714285714, + "grad_norm": 0.12502697110176086, + "kl": 0.02597808837890625, + "lambda_div_used": 0.6, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0444, + "reward": 0.4141941964626312, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4141941964626312, + "reward_after_std": 0.7060335800051689, + "reward_before_mean": 1.0458204597234726, + "reward_before_std": 0.5466500017791986, + "reward_change_max": 0.0015628039836883545, + "reward_change_mean": -0.6316262576729059, + "reward_change_min": -1.0269163250923157, + "reward_change_std": 0.3907633125782013, + "reward_std": 0.7060335874557495, + "rewards/cosine_scaled_reward": 0.21041020873235539, + "rewards/format_reward": 0.6250000037252903, + "step": 169 + }, + { + "advantage_max": 1.2720028311014175, + "advantage_mean": 2.173086055545781e-08, + "advantage_min": -0.5567828081548214, + "advantage_std": 0.6729303523898125, + "completion_length": 2670.395866394043, + "epoch": 0.19428571428571428, + "grad_norm": 0.10678170621395111, + "kl": 0.018474578857421875, + "lambda_div_used": 0.6, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0553, + "reward": -0.009291424183174968, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.009291424183174968, + "reward_after_std": 0.6729303747415543, + "reward_before_mean": 0.3892105184495449, + "reward_before_std": 0.599676800891757, + "reward_change_max": 0.0010467469692230225, + "reward_change_mean": -0.3985019223764539, + "reward_change_min": -0.6365118287503719, + "reward_change_std": 0.2544218208640814, + "reward_std": 0.6729303784668446, + "rewards/cosine_scaled_reward": -0.013728078454732895, + "rewards/format_reward": 0.41666666977107525, + "step": 170 + }, + { + "advantage_max": 1.4900328442454338, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -0.8795114979147911, + "advantage_std": 0.8389566726982594, + "completion_length": 2690.375030517578, + "epoch": 0.19542857142857142, + "grad_norm": 0.16565613448619843, + "kl": 0.01981353759765625, + "lambda_div_used": 0.6, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0466, + "reward": 0.06884616613388062, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06884616613388062, + "reward_after_std": 0.8389566875994205, + "reward_before_mean": 0.48622291162610054, + "reward_before_std": 0.8658607117831707, + "reward_change_max": 0.00112181156873703, + "reward_change_mean": -0.4173767250031233, + "reward_change_min": -0.739656388759613, + "reward_change_std": 0.32057770155370235, + "reward_std": 0.8389566913247108, + "rewards/cosine_scaled_reward": 0.02436142461374402, + "rewards/format_reward": 0.43750001303851604, + "step": 171 + }, + { + "advantage_max": 1.3270131349563599, + "advantage_mean": 1.1796752796833232e-08, + "advantage_min": -0.7321847081184387, + "advantage_std": 0.7408189624547958, + "completion_length": 2971.708335876465, + "epoch": 0.19657142857142856, + "grad_norm": 0.15071731805801392, + "kl": 0.0283050537109375, + "lambda_div_used": 0.6, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0236, + "reward": -0.14135470986366272, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14135470986366272, + "reward_after_std": 0.7408189624547958, + "reward_before_mean": 0.1792165581136942, + "reward_before_std": 0.7741873655468225, + "reward_change_max": 0.001958996057510376, + "reward_change_mean": -0.32057126238942146, + "reward_change_min": -0.6587401293218136, + "reward_change_std": 0.26787589583545923, + "reward_std": 0.7408190034329891, + "rewards/cosine_scaled_reward": -0.06664173398166895, + "rewards/format_reward": 0.31250000558793545, + "step": 172 + }, + { + "advantage_max": 1.5405456945300102, + "advantage_mean": 6.208817682207268e-09, + "advantage_min": -0.763243917375803, + "advantage_std": 0.868730153888464, + "completion_length": 2413.8333854675293, + "epoch": 0.1977142857142857, + "grad_norm": 0.21439699828624725, + "kl": 0.024854660034179688, + "lambda_div_used": 0.6, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0856, + "reward": 0.009342290461063385, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.009342290461063385, + "reward_after_std": 0.8687301687896252, + "reward_before_mean": 0.38826479855924845, + "reward_before_std": 0.9071060288697481, + "reward_change_max": 0.000700242817401886, + "reward_change_mean": -0.37892252765595913, + "reward_change_min": -0.8705897815525532, + "reward_change_std": 0.34363609459251165, + "reward_std": 0.868730217218399, + "rewards/cosine_scaled_reward": -0.08711758349090815, + "rewards/format_reward": 0.5625000074505806, + "step": 173 + }, + { + "advantage_max": 1.3192416802048683, + "advantage_mean": -1.179675312990014e-08, + "advantage_min": -0.6599837280809879, + "advantage_std": 0.7320027984678745, + "completion_length": 2793.7083892822266, + "epoch": 0.19885714285714284, + "grad_norm": 0.1302064061164856, + "kl": 0.02942657470703125, + "lambda_div_used": 0.6, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0173, + "reward": -0.11204979429021478, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11204979429021478, + "reward_after_std": 0.7320028059184551, + "reward_before_mean": 0.22473372798413038, + "reward_before_std": 0.7440117113292217, + "reward_change_max": 0.0006036907434463501, + "reward_change_mean": -0.3367835180833936, + "reward_change_min": -0.6897301971912384, + "reward_change_std": 0.27322917617857456, + "reward_std": 0.7320028245449066, + "rewards/cosine_scaled_reward": -0.14804981462657452, + "rewards/format_reward": 0.5208333469927311, + "step": 174 + }, + { + "advantage_max": 1.2248894348740578, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.7270784452557564, + "advantage_std": 0.6971467100083828, + "completion_length": 2822.6458740234375, + "epoch": 0.2, + "grad_norm": 0.10310303419828415, + "kl": 0.0259552001953125, + "lambda_div_used": 0.6, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0073, + "reward": 0.09740146715193987, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09740146715193987, + "reward_after_std": 0.6971467137336731, + "reward_before_mean": 0.5614458620548248, + "reward_before_std": 0.6951047461479902, + "reward_change_max": 0.0006377026438713074, + "reward_change_mean": -0.4640443716198206, + "reward_change_min": -0.8764088936150074, + "reward_change_std": 0.335317213088274, + "reward_std": 0.6971467286348343, + "rewards/cosine_scaled_reward": 0.03072291426360607, + "rewards/format_reward": 0.5000000111758709, + "step": 175 + }, + { + "advantage_max": 1.6224799752235413, + "advantage_mean": 6.208817349140361e-10, + "advantage_min": -0.9095878675580025, + "advantage_std": 0.9159829206764698, + "completion_length": 2468.979263305664, + "epoch": 0.20114285714285715, + "grad_norm": 0.1898711621761322, + "kl": 0.0229339599609375, + "lambda_div_used": 0.6, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0686, + "reward": 0.14530806988477707, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14530806988477707, + "reward_after_std": 0.9159829206764698, + "reward_before_mean": 0.5826609404757619, + "reward_before_std": 0.9521896243095398, + "reward_change_max": 0.00018244236707687378, + "reward_change_mean": -0.4373528528958559, + "reward_change_min": -0.8462734147906303, + "reward_change_std": 0.35245630517601967, + "reward_std": 0.9159829281270504, + "rewards/cosine_scaled_reward": -0.03158620372414589, + "rewards/format_reward": 0.6458333432674408, + "step": 176 + }, + { + "advantage_max": 1.484945572912693, + "advantage_mean": 6.208817238118058e-09, + "advantage_min": -0.7248594909906387, + "advantage_std": 0.8175345957279205, + "completion_length": 3197.666778564453, + "epoch": 0.2022857142857143, + "grad_norm": 0.19520598649978638, + "kl": 0.031982421875, + "lambda_div_used": 0.6, + "learning_rate": 8.344131861991828e-07, + "loss": 0.035, + "reward": -0.06862121913582087, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06862121913582087, + "reward_after_std": 0.8175345845520496, + "reward_before_mean": 0.27500755805522203, + "reward_before_std": 0.8386104181408882, + "reward_change_max": 0.00022308528423309326, + "reward_change_mean": -0.34362876880913973, + "reward_change_min": -0.7244202829897404, + "reward_change_std": 0.28399574756622314, + "reward_std": 0.8175345994532108, + "rewards/cosine_scaled_reward": -0.10207957029342651, + "rewards/format_reward": 0.4791666753590107, + "step": 177 + }, + { + "advantage_max": 1.3649575412273407, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -0.6267384588718414, + "advantage_std": 0.7353874500840902, + "completion_length": 2747.250015258789, + "epoch": 0.20342857142857143, + "grad_norm": 0.14075183868408203, + "kl": 0.038970947265625, + "lambda_div_used": 0.6, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0136, + "reward": -0.08781477063894272, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08781477063894272, + "reward_after_std": 0.7353874556720257, + "reward_before_mean": 0.2575212549418211, + "reward_before_std": 0.7124766483902931, + "reward_change_max": 0.00017629563808441162, + "reward_change_mean": -0.34533602371811867, + "reward_change_min": -0.6885601654648781, + "reward_change_std": 0.26829767785966396, + "reward_std": 0.7353874780237675, + "rewards/cosine_scaled_reward": -0.10040604881942272, + "rewards/format_reward": 0.4583333358168602, + "step": 178 + }, + { + "advantage_max": 0.8987370580434799, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.35452789813280106, + "advantage_std": 0.471599405631423, + "completion_length": 3075.7916870117188, + "epoch": 0.20457142857142857, + "grad_norm": 0.11661714315414429, + "kl": 0.03214263916015625, + "lambda_div_used": 0.6, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0306, + "reward": -0.35421258118003607, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35421258118003607, + "reward_after_std": 0.471599405631423, + "reward_before_mean": -0.10331118968315423, + "reward_before_std": 0.42847144044935703, + "reward_change_max": 0.001559227705001831, + "reward_change_mean": -0.25090140476822853, + "reward_change_min": -0.44767534360289574, + "reward_change_std": 0.17294680699706078, + "reward_std": 0.4715994130820036, + "rewards/cosine_scaled_reward": -0.19748893287032843, + "rewards/format_reward": 0.2916666679084301, + "step": 179 + }, + { + "advantage_max": 1.2968221679329872, + "advantage_mean": -8.692344621863413e-09, + "advantage_min": -0.5777449980378151, + "advantage_std": 0.6874905861914158, + "completion_length": 2111.4166870117188, + "epoch": 0.2057142857142857, + "grad_norm": 0.12424161285161972, + "kl": 0.02796173095703125, + "lambda_div_used": 0.6, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0256, + "reward": 0.23284974694252014, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23284974694252014, + "reward_after_std": 0.6874905936419964, + "reward_before_mean": 0.7600893080234528, + "reward_before_std": 0.5672389660030603, + "reward_change_max": 0.0007742047309875488, + "reward_change_mean": -0.5272395350039005, + "reward_change_min": -0.8209475874900818, + "reward_change_std": 0.31187237333506346, + "reward_std": 0.6874906159937382, + "rewards/cosine_scaled_reward": 0.06754461862146854, + "rewards/format_reward": 0.625, + "step": 180 + }, + { + "advantage_max": 1.6038127765059471, + "advantage_mean": -1.3659398168108794e-08, + "advantage_min": -0.7976420260965824, + "advantage_std": 0.8899489752948284, + "completion_length": 3054.7500762939453, + "epoch": 0.20685714285714285, + "grad_norm": 0.19416145980358124, + "kl": 0.033782958984375, + "lambda_div_used": 0.6, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0392, + "reward": 0.1256917817518115, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1256917817518115, + "reward_after_std": 0.8899489343166351, + "reward_before_mean": 0.5621891398914158, + "reward_before_std": 0.9076622389256954, + "reward_change_max": 0.0, + "reward_change_mean": -0.4364973697811365, + "reward_change_min": -0.8991809338331223, + "reward_change_std": 0.34503397159278393, + "reward_std": 0.8899489492177963, + "rewards/cosine_scaled_reward": 0.05192789062857628, + "rewards/format_reward": 0.4583333432674408, + "step": 181 + }, + { + "advantage_max": 1.5165415294468403, + "advantage_mean": -1.862645593320167e-09, + "advantage_min": -0.7919049225747585, + "advantage_std": 0.8274138532578945, + "completion_length": 2300.3125228881836, + "epoch": 0.208, + "grad_norm": 0.13749708235263824, + "kl": 0.02083587646484375, + "lambda_div_used": 0.6, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0187, + "reward": 0.12074684211984277, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12074684211984277, + "reward_after_std": 0.8274138383567333, + "reward_before_mean": 0.5635696525569074, + "reward_before_std": 0.8129143454134464, + "reward_change_max": 0.0022213757038116455, + "reward_change_mean": -0.442822827026248, + "reward_change_min": -0.7646378092467785, + "reward_change_std": 0.31069572921842337, + "reward_std": 0.8274138383567333, + "rewards/cosine_scaled_reward": -0.009881848469376564, + "rewards/format_reward": 0.5833333414047956, + "step": 182 + }, + { + "advantage_max": 1.3605138957500458, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -0.7550958581268787, + "advantage_std": 0.7596499547362328, + "completion_length": 2450.2083892822266, + "epoch": 0.20914285714285713, + "grad_norm": 0.15661555528640747, + "kl": 0.04094696044921875, + "lambda_div_used": 0.6, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0469, + "reward": 0.10832425020635128, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10832425020635128, + "reward_after_std": 0.7596499472856522, + "reward_before_mean": 0.5617624840233475, + "reward_before_std": 0.7516155876219273, + "reward_change_max": 0.0, + "reward_change_mean": -0.4534382503479719, + "reward_change_min": -0.8723913095891476, + "reward_change_std": 0.33325557969510555, + "reward_std": 0.7596499547362328, + "rewards/cosine_scaled_reward": -0.021202084608376026, + "rewards/format_reward": 0.6041666679084301, + "step": 183 + }, + { + "advantage_max": 1.1743084266781807, + "advantage_mean": 6.519258299864106e-09, + "advantage_min": -0.45616957545280457, + "advantage_std": 0.6174315102398396, + "completion_length": 2967.000030517578, + "epoch": 0.2102857142857143, + "grad_norm": 0.089842788875103, + "kl": 0.0363616943359375, + "lambda_div_used": 0.6, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0062, + "reward": -0.3365051681175828, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3365051681175828, + "reward_after_std": 0.6174314990639687, + "reward_before_mean": -0.1119849169626832, + "reward_before_std": 0.5883787106722593, + "reward_change_max": 0.0007164105772972107, + "reward_change_mean": -0.22452027909457684, + "reward_change_min": -0.4067869149148464, + "reward_change_std": 0.16255799168720841, + "reward_std": 0.6174315363168716, + "rewards/cosine_scaled_reward": -0.22265912871807814, + "rewards/format_reward": 0.33333333395421505, + "step": 184 + }, + { + "advantage_max": 1.1450467370450497, + "advantage_mean": -1.3038516155639002e-08, + "advantage_min": -0.5757462717592716, + "advantage_std": 0.6306551937013865, + "completion_length": 2638.062530517578, + "epoch": 0.21142857142857144, + "grad_norm": 0.1604258418083191, + "kl": 0.03490447998046875, + "lambda_div_used": 0.6, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0312, + "reward": -0.12965551391243935, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12965551391243935, + "reward_after_std": 0.6306551937013865, + "reward_before_mean": 0.2147392723709345, + "reward_before_std": 0.6197530413046479, + "reward_change_max": 0.00013187527656555176, + "reward_change_mean": -0.34439481515437365, + "reward_change_min": -0.6087727546691895, + "reward_change_std": 0.24250369798392057, + "reward_std": 0.6306551974266768, + "rewards/cosine_scaled_reward": -0.11138035543262959, + "rewards/format_reward": 0.43750000558793545, + "step": 185 + }, + { + "advantage_max": 1.08181943744421, + "advantage_mean": 1.2417633032946185e-09, + "advantage_min": -0.5867179185152054, + "advantage_std": 0.6249041147530079, + "completion_length": 3160.625045776367, + "epoch": 0.21257142857142858, + "grad_norm": 0.15922769904136658, + "kl": 0.04046630859375, + "lambda_div_used": 0.6, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0482, + "reward": -0.06183060258626938, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06183060258626938, + "reward_after_std": 0.6249041259288788, + "reward_before_mean": 0.33093083277344704, + "reward_before_std": 0.6239690035581589, + "reward_change_max": 0.0009515807032585144, + "reward_change_mean": -0.3927614507265389, + "reward_change_min": -0.7076075598597527, + "reward_change_std": 0.3012672569602728, + "reward_std": 0.62490414083004, + "rewards/cosine_scaled_reward": 0.009215403348207474, + "rewards/format_reward": 0.31250000558793545, + "step": 186 + }, + { + "advantage_max": 1.072705127298832, + "advantage_mean": 3.7252901874396116e-09, + "advantage_min": -0.6445136219263077, + "advantage_std": 0.603652723133564, + "completion_length": 2336.2084045410156, + "epoch": 0.21371428571428572, + "grad_norm": 0.24794240295886993, + "kl": 0.03890228271484375, + "lambda_div_used": 0.6, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0822, + "reward": -0.029120448976755142, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.029120448976755142, + "reward_after_std": 0.6036527305841446, + "reward_before_mean": 0.37975111696869135, + "reward_before_std": 0.5896169766783714, + "reward_change_max": 0.0017495378851890564, + "reward_change_mean": -0.408871547318995, + "reward_change_min": -0.7276890836656094, + "reward_change_std": 0.28704762924462557, + "reward_std": 0.6036527529358864, + "rewards/cosine_scaled_reward": -0.12262445967644453, + "rewards/format_reward": 0.625000013038516, + "step": 187 + }, + { + "advantage_max": 1.2339450418949127, + "advantage_mean": 1.6142924996742636e-08, + "advantage_min": -0.5492425635457039, + "advantage_std": 0.6971351355314255, + "completion_length": 3499.0416870117188, + "epoch": 0.21485714285714286, + "grad_norm": 0.17811760306358337, + "kl": 0.046234130859375, + "lambda_div_used": 0.6, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0231, + "reward": -0.4334766957908869, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4334766957908869, + "reward_after_std": 0.6971351690590382, + "reward_before_mean": -0.264811088796705, + "reward_before_std": 0.7778226062655449, + "reward_change_max": 0.002253696322441101, + "reward_change_mean": -0.16866560466587543, + "reward_change_min": -0.6171305365860462, + "reward_change_std": 0.24329949263483286, + "reward_std": 0.697135217487812, + "rewards/cosine_scaled_reward": -0.20532220881432295, + "rewards/format_reward": 0.14583333767950535, + "step": 188 + }, + { + "advantage_max": 1.3194897770881653, + "advantage_mean": 1.490116141589226e-08, + "advantage_min": -0.5810928530991077, + "advantage_std": 0.6943271197378635, + "completion_length": 2505.7292251586914, + "epoch": 0.216, + "grad_norm": 0.1453278362751007, + "kl": 0.039276123046875, + "lambda_div_used": 0.6, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0234, + "reward": 0.0326367899106117, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0326367899106117, + "reward_after_std": 0.6943271197378635, + "reward_before_mean": 0.4496917873620987, + "reward_before_std": 0.6022991053760052, + "reward_change_max": 0.0003833100199699402, + "reward_change_mean": -0.41705494094640017, + "reward_change_min": -0.7159639894962311, + "reward_change_std": 0.2671029521152377, + "reward_std": 0.6943271309137344, + "rewards/cosine_scaled_reward": -0.06682079844176769, + "rewards/format_reward": 0.5833333395421505, + "step": 189 + }, + { + "advantage_max": 1.042035847902298, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.4305584356188774, + "advantage_std": 0.5585471391677856, + "completion_length": 2627.1458587646484, + "epoch": 0.21714285714285714, + "grad_norm": 0.14222775399684906, + "kl": 0.0413818359375, + "lambda_div_used": 0.6, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0329, + "reward": -0.0020353831350803375, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0020353831350803375, + "reward_after_std": 0.5585471503436565, + "reward_before_mean": 0.4232803890481591, + "reward_before_std": 0.45277058705687523, + "reward_change_max": 0.0002883821725845337, + "reward_change_mean": -0.4253157516941428, + "reward_change_min": -0.6929913498461246, + "reward_change_std": 0.27376699820160866, + "reward_std": 0.5585471540689468, + "rewards/cosine_scaled_reward": -0.04877648875117302, + "rewards/format_reward": 0.520833333954215, + "step": 190 + }, + { + "advantage_max": 1.2923248931765556, + "advantage_mean": 1.8005570368018198e-08, + "advantage_min": -0.6255582571029663, + "advantage_std": 0.7107645235955715, + "completion_length": 2654.1041717529297, + "epoch": 0.21828571428571428, + "grad_norm": 0.1821720153093338, + "kl": 0.0473175048828125, + "lambda_div_used": 0.6, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0433, + "reward": -0.10828890092670918, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10828890092670918, + "reward_after_std": 0.7107645533978939, + "reward_before_mean": 0.23151674802647904, + "reward_before_std": 0.7050754111260176, + "reward_change_max": 0.0015554353594779968, + "reward_change_mean": -0.3398056421428919, + "reward_change_min": -0.6694670245051384, + "reward_change_std": 0.2771302107721567, + "reward_std": 0.7107645682990551, + "rewards/cosine_scaled_reward": -0.07174161821603775, + "rewards/format_reward": 0.37500000931322575, + "step": 191 + }, + { + "advantage_max": 1.1991769075393677, + "advantage_mean": -2.2972624191819335e-08, + "advantage_min": -0.5654369220137596, + "advantage_std": 0.6564156897366047, + "completion_length": 3291.812530517578, + "epoch": 0.21942857142857142, + "grad_norm": 0.22897185385227203, + "kl": 0.046600341796875, + "lambda_div_used": 0.6, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0558, + "reward": -0.2808880601078272, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2808880601078272, + "reward_after_std": 0.6564157009124756, + "reward_before_mean": -0.024348472245037556, + "reward_before_std": 0.6607515625655651, + "reward_change_max": 0.000874444842338562, + "reward_change_mean": -0.25653960881754756, + "reward_change_min": -0.5323520973324776, + "reward_change_std": 0.2226111926138401, + "reward_std": 0.6564157158136368, + "rewards/cosine_scaled_reward": -0.1788409072905779, + "rewards/format_reward": 0.33333333767950535, + "step": 192 + }, + { + "advantage_max": 1.4964174404740334, + "advantage_mean": 4.346171478264438e-09, + "advantage_min": -0.7282641679048538, + "advantage_std": 0.8228538744151592, + "completion_length": 3145.250030517578, + "epoch": 0.22057142857142858, + "grad_norm": 0.2540678381919861, + "kl": 0.045440673828125, + "lambda_div_used": 0.6, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0548, + "reward": -0.13096075784415007, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13096075784415007, + "reward_after_std": 0.8228538781404495, + "reward_before_mean": 0.1752599613973871, + "reward_before_std": 0.848308339715004, + "reward_change_max": 0.0002414211630821228, + "reward_change_mean": -0.3062206953763962, + "reward_change_min": -0.6018543504178524, + "reward_change_std": 0.2552442867308855, + "reward_std": 0.8228538781404495, + "rewards/cosine_scaled_reward": -0.06862002797424793, + "rewards/format_reward": 0.3125000074505806, + "step": 193 + }, + { + "advantage_max": 1.5948434360325336, + "advantage_mean": -2.421438738409165e-08, + "advantage_min": -0.9445933252573013, + "advantage_std": 0.8900459110736847, + "completion_length": 2825.041732788086, + "epoch": 0.22171428571428572, + "grad_norm": 0.17521539330482483, + "kl": 0.04534912109375, + "lambda_div_used": 0.6, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0347, + "reward": 0.3043134193867445, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3043134193867445, + "reward_after_std": 0.8900458980351686, + "reward_before_mean": 0.8403315991163254, + "reward_before_std": 0.8644460886716843, + "reward_change_max": 0.0024929121136665344, + "reward_change_mean": -0.5360182207077742, + "reward_change_min": -0.9033717587590218, + "reward_change_std": 0.38277727644890547, + "reward_std": 0.890045927837491, + "rewards/cosine_scaled_reward": 0.17016580794006586, + "rewards/format_reward": 0.5000000149011612, + "step": 194 + }, + { + "advantage_max": 1.6198503710329533, + "advantage_mean": 1.3659398501175701e-08, + "advantage_min": -0.7073510959744453, + "advantage_std": 0.8945747967809439, + "completion_length": 3019.354232788086, + "epoch": 0.22285714285714286, + "grad_norm": 0.27445510029792786, + "kl": 0.04925537109375, + "lambda_div_used": 0.6, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0478, + "reward": 0.024609943851828575, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.024609943851828575, + "reward_after_std": 0.8945747967809439, + "reward_before_mean": 0.40517749823629856, + "reward_before_std": 0.9060044959187508, + "reward_change_max": 0.00021410733461380005, + "reward_change_mean": -0.38056755252182484, + "reward_change_min": -0.9209701716899872, + "reward_change_std": 0.3530062697827816, + "reward_std": 0.8945748265832663, + "rewards/cosine_scaled_reward": 0.015088742948137224, + "rewards/format_reward": 0.3750000074505806, + "step": 195 + }, + { + "advantage_max": 1.3014200776815414, + "advantage_mean": 2.23517424569053e-08, + "advantage_min": -0.6922896057367325, + "advantage_std": 0.7207038998603821, + "completion_length": 3295.3750610351562, + "epoch": 0.224, + "grad_norm": 0.1790717989206314, + "kl": 0.0604248046875, + "lambda_div_used": 0.6, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0255, + "reward": -0.20712384395301342, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.20712384395301342, + "reward_after_std": 0.720703911036253, + "reward_before_mean": 0.08003285154700279, + "reward_before_std": 0.755801547318697, + "reward_change_max": 0.0016879886388778687, + "reward_change_mean": -0.28715668758377433, + "reward_change_min": -0.610984530299902, + "reward_change_std": 0.25006009358912706, + "reward_std": 0.720703911036253, + "rewards/cosine_scaled_reward": -0.06415024306625128, + "rewards/format_reward": 0.20833334140479565, + "step": 196 + }, + { + "advantage_max": 1.5312520191073418, + "advantage_mean": 3.1044084525255755e-09, + "advantage_min": -1.0301610231399536, + "advantage_std": 0.8983987234532833, + "completion_length": 2006.6458740234375, + "epoch": 0.22514285714285714, + "grad_norm": 0.24713905155658722, + "kl": 0.0618438720703125, + "lambda_div_used": 0.6, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0451, + "reward": 0.3392133894376457, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3392133894376457, + "reward_after_std": 0.8983987085521221, + "reward_before_mean": 0.8995099812746048, + "reward_before_std": 0.9543585330247879, + "reward_change_max": 0.0011051148176193237, + "reward_change_mean": -0.5602965541183949, + "reward_change_min": -1.0811551474034786, + "reward_change_std": 0.4375645313411951, + "reward_std": 0.8983987122774124, + "rewards/cosine_scaled_reward": 0.1893383078277111, + "rewards/format_reward": 0.5208333432674408, + "step": 197 + }, + { + "advantage_max": 1.2494731955230236, + "advantage_mean": -6.2088175156738146e-09, + "advantage_min": -0.5514694266021252, + "advantage_std": 0.6800418961793184, + "completion_length": 2836.562530517578, + "epoch": 0.22628571428571428, + "grad_norm": 0.16534265875816345, + "kl": 0.0721435546875, + "lambda_div_used": 0.6, + "learning_rate": 7.804192891917571e-07, + "loss": 0.035, + "reward": -0.16476232931017876, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.16476232931017876, + "reward_after_std": 0.6800418831408024, + "reward_before_mean": 0.14605081919580698, + "reward_before_std": 0.6505880728363991, + "reward_change_max": 0.005474165081977844, + "reward_change_mean": -0.3108131578192115, + "reward_change_min": -0.6044091917574406, + "reward_change_std": 0.2463654656894505, + "reward_std": 0.6800418980419636, + "rewards/cosine_scaled_reward": -0.10405792016535997, + "rewards/format_reward": 0.35416666977107525, + "step": 198 + }, + { + "advantage_max": 1.151242271065712, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.5328905023634434, + "advantage_std": 0.6158375293016434, + "completion_length": 2393.7500610351562, + "epoch": 0.22742857142857142, + "grad_norm": 0.165126234292984, + "kl": 0.075531005859375, + "lambda_div_used": 0.6, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0263, + "reward": -0.34041555039584637, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.34041555039584637, + "reward_after_std": 0.615837536752224, + "reward_before_mean": -0.11647915467619896, + "reward_before_std": 0.6054133623838425, + "reward_change_max": 0.0021468177437782288, + "reward_change_mean": -0.22393639385700226, + "reward_change_min": -0.47274230420589447, + "reward_change_std": 0.1846913443878293, + "reward_std": 0.6158375553786755, + "rewards/cosine_scaled_reward": -0.17282291005176376, + "rewards/format_reward": 0.22916666977107525, + "step": 199 + }, + { + "advantage_max": 1.5527655258774757, + "advantage_mean": 6.208818459363386e-10, + "advantage_min": -0.8078677505254745, + "advantage_std": 0.8602624572813511, + "completion_length": 2471.104202270508, + "epoch": 0.22857142857142856, + "grad_norm": 0.17417526245117188, + "kl": 0.070556640625, + "lambda_div_used": 0.6, + "learning_rate": 7.75e-07, + "loss": 0.0125, + "reward": 0.07807190343737602, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07807190343737602, + "reward_after_std": 0.8602624572813511, + "reward_before_mean": 0.4883092427626252, + "reward_before_std": 0.8743391893804073, + "reward_change_max": 0.0013150349259376526, + "reward_change_mean": -0.4102373067289591, + "reward_change_min": -0.8249180130660534, + "reward_change_std": 0.3349352069199085, + "reward_std": 0.860262505710125, + "rewards/cosine_scaled_reward": 0.02540461253374815, + "rewards/format_reward": 0.4375000074505806, + "step": 200 + }, + { + "advantage_max": 1.4167628586292267, + "advantage_mean": -1.1102230246251565e-16, + "advantage_min": -0.7990463078022003, + "advantage_std": 0.7999892868101597, + "completion_length": 2076.2083587646484, + "epoch": 0.2297142857142857, + "grad_norm": 0.18805286288261414, + "kl": 0.068634033203125, + "lambda_div_used": 0.6, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0186, + "reward": 0.40698003210127354, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.40698003210127354, + "reward_after_std": 0.7999892979860306, + "reward_before_mean": 1.0221742703579366, + "reward_before_std": 0.7416293583810329, + "reward_change_max": 0.0007732957601547241, + "reward_change_mean": -0.615194228477776, + "reward_change_min": -0.9900593794882298, + "reward_change_std": 0.41214887611567974, + "reward_std": 0.7999893128871918, + "rewards/cosine_scaled_reward": 0.24025380797684193, + "rewards/format_reward": 0.5416666772216558, + "step": 201 + }, + { + "advantage_max": 1.3395941704511642, + "advantage_mean": -2.3593506148777976e-08, + "advantage_min": -0.6523739323019981, + "advantage_std": 0.718066219240427, + "completion_length": 2476.250015258789, + "epoch": 0.23085714285714284, + "grad_norm": 0.28353244066238403, + "kl": 0.095794677734375, + "lambda_div_used": 0.6, + "learning_rate": 7.695368466124296e-07, + "loss": -0.0281, + "reward": 0.2649739682674408, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2649739682674408, + "reward_after_std": 0.7180662304162979, + "reward_before_mean": 0.808698242995888, + "reward_before_std": 0.6214682869613171, + "reward_change_max": 0.00035375356674194336, + "reward_change_mean": -0.5437242835760117, + "reward_change_min": -0.8757462315261364, + "reward_change_std": 0.3358838642016053, + "reward_std": 0.7180662527680397, + "rewards/cosine_scaled_reward": 0.15434912405908108, + "rewards/format_reward": 0.5000000037252903, + "step": 202 + }, + { + "advantage_max": 1.223877239972353, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.620630256831646, + "advantage_std": 0.6635248996317387, + "completion_length": 2952.5833587646484, + "epoch": 0.232, + "grad_norm": 0.2589733600616455, + "kl": 0.09796142578125, + "lambda_div_used": 0.6, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0524, + "reward": -0.05526367016136646, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05526367016136646, + "reward_after_std": 0.6635248959064484, + "reward_before_mean": 0.3243457265198231, + "reward_before_std": 0.6201132908463478, + "reward_change_max": 0.0011578574776649475, + "reward_change_mean": -0.3796093687415123, + "reward_change_min": -0.6677472069859505, + "reward_change_std": 0.2712924610823393, + "reward_std": 0.6635249182581902, + "rewards/cosine_scaled_reward": 0.026756178587675095, + "rewards/format_reward": 0.2708333395421505, + "step": 203 + }, + { + "advantage_max": 1.1059998497366905, + "advantage_mean": 7.140140145800444e-09, + "advantage_min": -0.591538067907095, + "advantage_std": 0.6298722177743912, + "completion_length": 2211.312545776367, + "epoch": 0.23314285714285715, + "grad_norm": 0.191162571310997, + "kl": 0.09075927734375, + "lambda_div_used": 0.6, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0189, + "reward": 0.08096916507929564, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08096916507929564, + "reward_after_std": 0.6298722326755524, + "reward_before_mean": 0.5430826265364885, + "reward_before_std": 0.589900765568018, + "reward_change_max": 0.0, + "reward_change_mean": -0.46211343444883823, + "reward_change_min": -0.8006623312830925, + "reward_change_std": 0.31877398304641247, + "reward_std": 0.6298722624778748, + "rewards/cosine_scaled_reward": -0.0826253816485405, + "rewards/format_reward": 0.7083333395421505, + "step": 204 + }, + { + "advantage_max": 1.4411217793822289, + "advantage_mean": -1.6763806842678974e-08, + "advantage_min": -0.9071615114808083, + "advantage_std": 0.8484151288866997, + "completion_length": 2679.2708740234375, + "epoch": 0.2342857142857143, + "grad_norm": 0.2707881033420563, + "kl": 0.08770751953125, + "lambda_div_used": 0.6, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0232, + "reward": 0.11488889902830124, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11488889902830124, + "reward_after_std": 0.8484151251614094, + "reward_before_mean": 0.564969852566719, + "reward_before_std": 0.9211336225271225, + "reward_change_max": 0.0, + "reward_change_mean": -0.450080968439579, + "reward_change_min": -0.8733410649001598, + "reward_change_std": 0.380622087046504, + "reward_std": 0.8484151512384415, + "rewards/cosine_scaled_reward": 0.10540158860385418, + "rewards/format_reward": 0.354166679084301, + "step": 205 + }, + { + "advantage_max": 1.9362114071846008, + "advantage_mean": 6.208817904251873e-10, + "advantage_min": -0.6879184618592262, + "advantage_std": 1.0034943111240864, + "completion_length": 3153.7500228881836, + "epoch": 0.23542857142857143, + "grad_norm": 0.40357130765914917, + "kl": 0.1099853515625, + "lambda_div_used": 0.6, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0197, + "reward": -0.1742401469964534, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1742401469964534, + "reward_after_std": 1.0034943148493767, + "reward_before_mean": 0.05573885515332222, + "reward_before_std": 0.9908393323421478, + "reward_change_max": 0.00036820024251937866, + "reward_change_mean": -0.22997901123017073, + "reward_change_min": -0.5386604145169258, + "reward_change_std": 0.21234728395938873, + "reward_std": 1.0034943595528603, + "rewards/cosine_scaled_reward": -0.10754724405705929, + "rewards/format_reward": 0.2708333358168602, + "step": 206 + }, + { + "advantage_max": 1.4247949346899986, + "advantage_mean": -1.1102230246251565e-16, + "advantage_min": -0.6534974798560143, + "advantage_std": 0.7754468694329262, + "completion_length": 2983.0625610351562, + "epoch": 0.23657142857142857, + "grad_norm": 0.3701333701610565, + "kl": 0.12939453125, + "lambda_div_used": 0.6, + "learning_rate": 7.556940671764124e-07, + "loss": -0.0036, + "reward": -0.2597499608527869, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2597499608527869, + "reward_after_std": 0.775446891784668, + "reward_before_mean": -0.024273216724395752, + "reward_before_std": 0.8059664107859135, + "reward_change_max": 0.005939692258834839, + "reward_change_mean": -0.23547676112502813, + "reward_change_min": -0.5890285782516003, + "reward_change_std": 0.24587653204798698, + "reward_std": 0.7754469364881516, + "rewards/cosine_scaled_reward": -0.17880328325554729, + "rewards/format_reward": 0.33333333767950535, + "step": 207 + }, + { + "advantage_max": 1.4248832762241364, + "advantage_mean": 6.829699306099002e-09, + "advantage_min": -0.7578137814998627, + "advantage_std": 0.7782737948000431, + "completion_length": 2421.3125762939453, + "epoch": 0.2377142857142857, + "grad_norm": 0.2831918001174927, + "kl": 0.097442626953125, + "lambda_div_used": 0.6, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0003, + "reward": 0.18712701415643096, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18712701415643096, + "reward_after_std": 0.7782737948000431, + "reward_before_mean": 0.6776773352175951, + "reward_before_std": 0.7428862564265728, + "reward_change_max": 0.0, + "reward_change_mean": -0.49055031593889, + "reward_change_min": -0.8148558661341667, + "reward_change_std": 0.32612331211566925, + "reward_std": 0.778273805975914, + "rewards/cosine_scaled_reward": 0.057588656432926655, + "rewards/format_reward": 0.5625000055879354, + "step": 208 + }, + { + "advantage_max": 1.187922965735197, + "advantage_mean": 1.8626449271863521e-09, + "advantage_min": -0.47378795593976974, + "advantage_std": 0.631999060511589, + "completion_length": 2732.4375228881836, + "epoch": 0.23885714285714285, + "grad_norm": 0.2125997692346573, + "kl": 0.128662109375, + "lambda_div_used": 0.6, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0075, + "reward": 0.10858845058828592, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10858845058828592, + "reward_after_std": 0.6319990754127502, + "reward_before_mean": 0.5761509407311678, + "reward_before_std": 0.5213535856455564, + "reward_change_max": 0.002324998378753662, + "reward_change_mean": -0.467562448233366, + "reward_change_min": -0.7992461994290352, + "reward_change_std": 0.29136813152581453, + "reward_std": 0.6319991014897823, + "rewards/cosine_scaled_reward": 0.058908781968057156, + "rewards/format_reward": 0.4583333358168602, + "step": 209 + }, + { + "advantage_max": 1.3186798729002476, + "advantage_mean": -3.1044086745701804e-10, + "advantage_min": -0.5938226599246264, + "advantage_std": 0.7046267911791801, + "completion_length": 2671.250045776367, + "epoch": 0.24, + "grad_norm": 0.25899404287338257, + "kl": 0.10009765625, + "lambda_div_used": 0.6, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0317, + "reward": 0.018894458189606667, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.018894458189606667, + "reward_after_std": 0.7046268098056316, + "reward_before_mean": 0.4229349633678794, + "reward_before_std": 0.6473653120920062, + "reward_change_max": 0.0013660937547683716, + "reward_change_mean": -0.40404045954346657, + "reward_change_min": -0.6701996028423309, + "reward_change_std": 0.27499571815133095, + "reward_std": 0.7046268302947283, + "rewards/cosine_scaled_reward": 0.013550772797316313, + "rewards/format_reward": 0.39583333395421505, + "step": 210 + }, + { + "advantage_max": 1.372512400150299, + "advantage_mean": -2.110997859849917e-08, + "advantage_min": -0.6729023642838001, + "advantage_std": 0.7465317733585835, + "completion_length": 2482.0833740234375, + "epoch": 0.24114285714285713, + "grad_norm": 0.31670844554901123, + "kl": 0.12445068359375, + "lambda_div_used": 0.6, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0257, + "reward": -0.019909057766199112, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.019909057766199112, + "reward_after_std": 0.7465317510068417, + "reward_before_mean": 0.35514324717223644, + "reward_before_std": 0.7346069552004337, + "reward_change_max": 0.001021161675453186, + "reward_change_mean": -0.37505233788397163, + "reward_change_min": -0.6400614865124226, + "reward_change_std": 0.26454370305873454, + "reward_std": 0.7465317659080029, + "rewards/cosine_scaled_reward": -0.06201172433793545, + "rewards/format_reward": 0.479166679084301, + "step": 211 + }, + { + "advantage_max": 1.4231358543038368, + "advantage_mean": -8.071462720415923e-09, + "advantage_min": -0.8575638234615326, + "advantage_std": 0.8098030164837837, + "completion_length": 2606.6250762939453, + "epoch": 0.2422857142857143, + "grad_norm": 0.40781182050704956, + "kl": 0.128204345703125, + "lambda_div_used": 0.6, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0841, + "reward": 0.09332763217389584, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09332763217389584, + "reward_after_std": 0.8098030164837837, + "reward_before_mean": 0.5334172658622265, + "reward_before_std": 0.840537752956152, + "reward_change_max": 0.0007770806550979614, + "reward_change_mean": -0.44008962251245975, + "reward_change_min": -0.8539436981081963, + "reward_change_std": 0.3552567921578884, + "reward_std": 0.8098030164837837, + "rewards/cosine_scaled_reward": 0.016708621755242348, + "rewards/format_reward": 0.5000000074505806, + "step": 212 + }, + { + "advantage_max": 1.6412783414125443, + "advantage_mean": -1.1175871450497255e-08, + "advantage_min": -0.8117294907569885, + "advantage_std": 0.8886349983513355, + "completion_length": 2581.3541870117188, + "epoch": 0.24342857142857144, + "grad_norm": 0.286513090133667, + "kl": 0.1539306640625, + "lambda_div_used": 0.6, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0005, + "reward": 0.2363802082836628, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2363802082836628, + "reward_after_std": 0.8886349834501743, + "reward_before_mean": 0.724748283624649, + "reward_before_std": 0.8444754183292389, + "reward_change_max": 0.0027549341320991516, + "reward_change_mean": -0.48836808931082487, + "reward_change_min": -0.9352367371320724, + "reward_change_std": 0.35737268533557653, + "reward_std": 0.8886350020766258, + "rewards/cosine_scaled_reward": 0.07070746086537838, + "rewards/format_reward": 0.5833333395421505, + "step": 213 + }, + { + "advantage_max": 1.603540975600481, + "advantage_mean": 5.587935447692871e-09, + "advantage_min": -0.8505863398313522, + "advantage_std": 0.9302494525909424, + "completion_length": 2973.0625610351562, + "epoch": 0.24457142857142858, + "grad_norm": 0.411465585231781, + "kl": 0.144775390625, + "lambda_div_used": 0.6, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0687, + "reward": 0.009196583181619644, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.009196583181619644, + "reward_after_std": 0.9302494451403618, + "reward_before_mean": 0.37914169020950794, + "reward_before_std": 1.0218745321035385, + "reward_change_max": 6.617605686187744e-05, + "reward_change_mean": -0.36994508653879166, + "reward_change_min": -0.9427313208580017, + "reward_change_std": 0.3748389510437846, + "reward_std": 0.930249460041523, + "rewards/cosine_scaled_reward": -0.039595833979547024, + "rewards/format_reward": 0.4583333395421505, + "step": 214 + }, + { + "advantage_max": 0.8581725731492043, + "advantage_mean": 9.313226634333205e-09, + "advantage_min": -0.40347205474972725, + "advantage_std": 0.46334000304341316, + "completion_length": 2574.8541946411133, + "epoch": 0.24571428571428572, + "grad_norm": 0.13390909135341644, + "kl": 0.13763427734375, + "lambda_div_used": 0.6, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0165, + "reward": -0.2104439791291952, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2104439791291952, + "reward_after_std": 0.46334001794457436, + "reward_before_mean": 0.11830021068453789, + "reward_before_std": 0.4097316600382328, + "reward_change_max": 0.001212291419506073, + "reward_change_mean": -0.3287441972643137, + "reward_change_min": -0.5551458112895489, + "reward_change_std": 0.20913261640816927, + "reward_std": 0.46334002912044525, + "rewards/cosine_scaled_reward": -0.1804332360625267, + "rewards/format_reward": 0.47916668094694614, + "step": 215 + }, + { + "advantage_max": 1.5061615630984306, + "advantage_mean": -2.4835269951672956e-09, + "advantage_min": -0.8598283156752586, + "advantage_std": 0.8617348670959473, + "completion_length": 2333.916702270508, + "epoch": 0.24685714285714286, + "grad_norm": 0.6332868933677673, + "kl": 0.17523193359375, + "lambda_div_used": 0.6, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0876, + "reward": 0.04702269285917282, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04702269285917282, + "reward_after_std": 0.8617348782718182, + "reward_before_mean": 0.44580378383398056, + "reward_before_std": 0.9096739925444126, + "reward_change_max": 0.0006563737988471985, + "reward_change_mean": -0.39878107607364655, + "reward_change_min": -0.8786581009626389, + "reward_change_std": 0.35184421949088573, + "reward_std": 0.8617349080741405, + "rewards/cosine_scaled_reward": -0.07918145949952304, + "rewards/format_reward": 0.6041666772216558, + "step": 216 + }, + { + "advantage_max": 1.6048918068408966, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -0.7554399445652962, + "advantage_std": 0.891699094325304, + "completion_length": 2728.770851135254, + "epoch": 0.248, + "grad_norm": 0.47174057364463806, + "kl": 0.17547607421875, + "lambda_div_used": 0.6, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0602, + "reward": 0.09491589106619358, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09491589106619358, + "reward_after_std": 0.8916990831494331, + "reward_before_mean": 0.5138254193589091, + "reward_before_std": 0.9023692905902863, + "reward_change_max": 0.0, + "reward_change_mean": -0.418909530621022, + "reward_change_min": -0.8360408432781696, + "reward_change_std": 0.3293918455019593, + "reward_std": 0.8916990980505943, + "rewards/cosine_scaled_reward": 0.06941270176321268, + "rewards/format_reward": 0.3750000074505806, + "step": 217 + }, + { + "advantage_max": 1.4350545406341553, + "advantage_mean": 2.4835269007983385e-08, + "advantage_min": -0.6967899017035961, + "advantage_std": 0.7781109362840652, + "completion_length": 2717.8750762939453, + "epoch": 0.24914285714285714, + "grad_norm": 0.4780595600605011, + "kl": 0.17071533203125, + "lambda_div_used": 0.6, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0052, + "reward": -0.06596539542078972, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06596539542078972, + "reward_after_std": 0.7781109698116779, + "reward_before_mean": 0.27800588123500347, + "reward_before_std": 0.7713658884167671, + "reward_change_max": 0.0009939447045326233, + "reward_change_mean": -0.3439712468534708, + "reward_change_min": -0.6784108616411686, + "reward_change_std": 0.27120585925877094, + "reward_std": 0.7781109921634197, + "rewards/cosine_scaled_reward": -0.09016373474150896, + "rewards/format_reward": 0.45833334513008595, + "step": 218 + }, + { + "advantage_max": 1.515500433743, + "advantage_mean": -9.934107203513065e-09, + "advantage_min": -0.6553824432194233, + "advantage_std": 0.8291011936962605, + "completion_length": 2393.9583892822266, + "epoch": 0.2502857142857143, + "grad_norm": 0.35546576976776123, + "kl": 0.21331787109375, + "lambda_div_used": 0.6, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0379, + "reward": 0.06899490812793374, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06899490812793374, + "reward_after_std": 0.829101212322712, + "reward_before_mean": 0.479990987572819, + "reward_before_std": 0.8050318211317062, + "reward_change_max": 0.0008395984768867493, + "reward_change_mean": -0.4109960775822401, + "reward_change_min": -0.8359643630683422, + "reward_change_std": 0.32415905967354774, + "reward_std": 0.8291012309491634, + "rewards/cosine_scaled_reward": -0.01000452577136457, + "rewards/format_reward": 0.5000000111758709, + "step": 219 + }, + { + "advantage_max": 0.7503951676189899, + "advantage_mean": 1.2417634420724966e-08, + "advantage_min": -0.37694836407899857, + "advantage_std": 0.4157340805977583, + "completion_length": 2755.1250228881836, + "epoch": 0.25142857142857145, + "grad_norm": 0.26302239298820496, + "kl": 0.22412109375, + "lambda_div_used": 0.6, + "learning_rate": 7.185729670371604e-07, + "loss": 0.03, + "reward": -0.4798909847741015, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4798909847741015, + "reward_after_std": 0.41573409363627434, + "reward_before_mean": -0.28886884544044733, + "reward_before_std": 0.4156207535415888, + "reward_change_max": 0.0006025433540344238, + "reward_change_mean": -0.19102213997393847, + "reward_change_min": -0.3795706331729889, + "reward_change_std": 0.15820336434990168, + "reward_std": 0.41573410853743553, + "rewards/cosine_scaled_reward": -0.2798510938882828, + "rewards/format_reward": 0.27083333395421505, + "step": 220 + }, + { + "advantage_max": 1.3563694432377815, + "advantage_mean": -1.8626449826975033e-09, + "advantage_min": -0.6945677511394024, + "advantage_std": 0.7419977821409702, + "completion_length": 1854.3958587646484, + "epoch": 0.25257142857142856, + "grad_norm": 0.2989541292190552, + "kl": 0.149749755859375, + "lambda_div_used": 0.6, + "learning_rate": 7.156560487081051e-07, + "loss": -0.0129, + "reward": 0.10412659542635083, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10412659542635083, + "reward_after_std": 0.7419977821409702, + "reward_before_mean": 0.5544167058542371, + "reward_before_std": 0.696466002613306, + "reward_change_max": 0.00045955926179885864, + "reward_change_mean": -0.4502900801599026, + "reward_change_min": -0.8158979564905167, + "reward_change_std": 0.3151663765311241, + "reward_std": 0.7419977933168411, + "rewards/cosine_scaled_reward": 0.01679166965186596, + "rewards/format_reward": 0.520833345130086, + "step": 221 + }, + { + "advantage_max": 1.2615111097693443, + "advantage_mean": -3.7485734682984884e-08, + "advantage_min": -0.8112612888216972, + "advantage_std": 0.7181127965450287, + "completion_length": 2456.375015258789, + "epoch": 0.2537142857142857, + "grad_norm": 0.4287218153476715, + "kl": 0.2139892578125, + "lambda_div_used": 0.6, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0029, + "reward": 0.12237783218733966, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12237783218733966, + "reward_after_std": 0.7181127928197384, + "reward_before_mean": 0.5944197215139866, + "reward_before_std": 0.7205285355448723, + "reward_change_max": 0.00046860426664352417, + "reward_change_mean": -0.47204189747571945, + "reward_change_min": -0.8188545815646648, + "reward_change_std": 0.3446765150874853, + "reward_std": 0.7181127965450287, + "rewards/cosine_scaled_reward": 0.005543181672692299, + "rewards/format_reward": 0.5833333395421505, + "step": 222 + }, + { + "advantage_max": 1.2473274320363998, + "advantage_mean": 9.934108091691485e-09, + "advantage_min": -0.5829054936766624, + "advantage_std": 0.6753346417099237, + "completion_length": 2532.208381652832, + "epoch": 0.25485714285714284, + "grad_norm": 0.23588241636753082, + "kl": 0.20941162109375, + "lambda_div_used": 0.6, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0158, + "reward": -0.045638011768460274, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.045638011768460274, + "reward_after_std": 0.6753346417099237, + "reward_before_mean": 0.3341361992061138, + "reward_before_std": 0.6450802572071552, + "reward_change_max": 0.0019773244857788086, + "reward_change_mean": -0.3797741485759616, + "reward_change_min": -0.6561138592660427, + "reward_change_std": 0.25462134182453156, + "reward_std": 0.6753346435725689, + "rewards/cosine_scaled_reward": -0.08293193019926548, + "rewards/format_reward": 0.5000000093132257, + "step": 223 + }, + { + "advantage_max": 1.8461582660675049, + "advantage_mean": -1.2417632477834672e-09, + "advantage_min": -0.8299898952245712, + "advantage_std": 1.0023904666304588, + "completion_length": 2865.1250762939453, + "epoch": 0.256, + "grad_norm": 0.8151166439056396, + "kl": 0.2145233154296875, + "lambda_div_used": 0.6, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0747, + "reward": 0.04341099318116903, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.04341099318116903, + "reward_after_std": 1.002390455454588, + "reward_before_mean": 0.40595949813723564, + "reward_before_std": 1.0213181041181087, + "reward_change_max": 0.001161724328994751, + "reward_change_mean": -0.3625485133379698, + "reward_change_min": -0.7576224133372307, + "reward_change_std": 0.3075553746894002, + "reward_std": 1.0023904591798782, + "rewards/cosine_scaled_reward": 0.046729736030101776, + "rewards/format_reward": 0.31250000931322575, + "step": 224 + }, + { + "advantage_max": 1.160877875983715, + "advantage_mean": -1.241763414316921e-09, + "advantage_min": -0.5418887920677662, + "advantage_std": 0.6181628629565239, + "completion_length": 2925.6667098999023, + "epoch": 0.2571428571428571, + "grad_norm": 0.48488402366638184, + "kl": 0.2423095703125, + "lambda_div_used": 0.6, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0407, + "reward": -0.09848776552826166, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09848776552826166, + "reward_after_std": 0.6181628704071045, + "reward_before_mean": 0.2601957079023123, + "reward_before_std": 0.5574781373143196, + "reward_change_max": 0.0013304725289344788, + "reward_change_mean": -0.3586834678426385, + "reward_change_min": -0.6151060201227665, + "reward_change_std": 0.24173601809889078, + "reward_std": 0.6181629002094269, + "rewards/cosine_scaled_reward": -0.08865214767865837, + "rewards/format_reward": 0.43750000558793545, + "step": 225 + }, + { + "advantage_max": 1.6093599423766136, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -0.77116759121418, + "advantage_std": 0.8485618270933628, + "completion_length": 2593.7083740234375, + "epoch": 0.2582857142857143, + "grad_norm": 0.4360467195510864, + "kl": 0.2156982421875, + "lambda_div_used": 0.6, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0137, + "reward": 0.37242993898689747, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.37242993898689747, + "reward_after_std": 0.8485618270933628, + "reward_before_mean": 0.9451485552563099, + "reward_before_std": 0.7240049056708813, + "reward_change_max": 0.0004467219114303589, + "reward_change_mean": -0.5727185849100351, + "reward_change_min": -0.9093927554786205, + "reward_change_std": 0.35986475832760334, + "reward_std": 0.8485618606209755, + "rewards/cosine_scaled_reward": 0.20174093917012215, + "rewards/format_reward": 0.5416666697710752, + "step": 226 + }, + { + "advantage_max": 1.6921913027763367, + "advantage_mean": -2.2351741790771484e-08, + "advantage_min": -0.8459499292075634, + "advantage_std": 0.9304548464715481, + "completion_length": 2420.333381652832, + "epoch": 0.25942857142857145, + "grad_norm": 0.5740240812301636, + "kl": 0.211822509765625, + "lambda_div_used": 0.6, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0551, + "reward": 0.050073117949068546, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.050073117949068546, + "reward_after_std": 0.9304548241198063, + "reward_before_mean": 0.43043838534504175, + "reward_before_std": 0.9527036100625992, + "reward_change_max": 0.0013343244791030884, + "reward_change_mean": -0.38036529161036015, + "reward_change_min": -0.8623692579567432, + "reward_change_std": 0.32892103493213654, + "reward_std": 0.9304548390209675, + "rewards/cosine_scaled_reward": -0.04519747570157051, + "rewards/format_reward": 0.5208333469927311, + "step": 227 + }, + { + "advantage_max": 1.1782990470528603, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.599372997879982, + "advantage_std": 0.631151270121336, + "completion_length": 2588.812545776367, + "epoch": 0.26057142857142856, + "grad_norm": 0.1961567997932434, + "kl": 0.232666015625, + "lambda_div_used": 0.6, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0311, + "reward": 0.004836801439523697, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.004836801439523697, + "reward_after_std": 0.6311512663960457, + "reward_before_mean": 0.4207380283623934, + "reward_before_std": 0.5706529878079891, + "reward_change_max": 0.0011245310306549072, + "reward_change_mean": -0.415901237167418, + "reward_change_min": -0.6850415766239166, + "reward_change_std": 0.2639042199589312, + "reward_std": 0.6311512775719166, + "rewards/cosine_scaled_reward": 0.002035675570368767, + "rewards/format_reward": 0.41666667349636555, + "step": 228 + }, + { + "advantage_max": 1.0126456022262573, + "advantage_mean": -1.241763414316921e-09, + "advantage_min": -0.4646928757429123, + "advantage_std": 0.549139428883791, + "completion_length": 2947.729217529297, + "epoch": 0.26171428571428573, + "grad_norm": 0.32021525502204895, + "kl": 0.3187255859375, + "lambda_div_used": 0.6, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0374, + "reward": -0.21300336718559265, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21300336718559265, + "reward_after_std": 0.549139428883791, + "reward_before_mean": 0.09695982187986374, + "reward_before_std": 0.5107403621077538, + "reward_change_max": 0.002090074121952057, + "reward_change_mean": -0.3099631778895855, + "reward_change_min": -0.5851683095097542, + "reward_change_std": 0.2205530758947134, + "reward_std": 0.5491394437849522, + "rewards/cosine_scaled_reward": -0.055686766281723976, + "rewards/format_reward": 0.2083333358168602, + "step": 229 + }, + { + "advantage_max": 1.4498926997184753, + "advantage_mean": 1.2417634254191512e-08, + "advantage_min": -0.5673705451190472, + "advantage_std": 0.7717844881117344, + "completion_length": 3185.7708740234375, + "epoch": 0.26285714285714284, + "grad_norm": 0.4693439304828644, + "kl": 0.3175048828125, + "lambda_div_used": 0.6, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0575, + "reward": -0.2647488545626402, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2647488545626402, + "reward_after_std": 0.7717844769358635, + "reward_before_mean": -0.028830239549279213, + "reward_before_std": 0.7798666022717953, + "reward_change_max": 0.0021414458751678467, + "reward_change_mean": -0.2359186140820384, + "reward_change_min": -0.6099616996943951, + "reward_change_std": 0.22370487917214632, + "reward_std": 0.771784495562315, + "rewards/cosine_scaled_reward": -0.16024844953790307, + "rewards/format_reward": 0.29166667349636555, + "step": 230 + }, + { + "advantage_max": 1.1504193618893623, + "advantage_mean": 3.1044091186593903e-09, + "advantage_min": -0.4140487276017666, + "advantage_std": 0.5968481115996838, + "completion_length": 2979.041717529297, + "epoch": 0.264, + "grad_norm": 0.42601606249809265, + "kl": 0.343231201171875, + "lambda_div_used": 0.6, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0293, + "reward": -0.21346711833029985, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21346711833029985, + "reward_after_std": 0.5968481153249741, + "reward_before_mean": 0.08721326105296612, + "reward_before_std": 0.5268659126013517, + "reward_change_max": 0.0014610588550567627, + "reward_change_mean": -0.3006803933531046, + "reward_change_min": -0.5531118325889111, + "reward_change_std": 0.20393710862845182, + "reward_std": 0.5968481171876192, + "rewards/cosine_scaled_reward": -0.13347670319490135, + "rewards/format_reward": 0.35416666977107525, + "step": 231 + }, + { + "advantage_max": 1.2305606603622437, + "advantage_mean": 9.934108036180334e-09, + "advantage_min": -0.46694882586598396, + "advantage_std": 0.6468341052532196, + "completion_length": 3071.7708587646484, + "epoch": 0.2651428571428571, + "grad_norm": 0.272504061460495, + "kl": 0.3377685546875, + "lambda_div_used": 0.6, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0473, + "reward": -0.31791983102448285, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.31791983102448285, + "reward_after_std": 0.6468341238796711, + "reward_before_mean": -0.08504897356033325, + "reward_before_std": 0.6210133656859398, + "reward_change_max": 0.002808481454849243, + "reward_change_mean": -0.23287085350602865, + "reward_change_min": -0.5016565397381783, + "reward_change_std": 0.1863990006968379, + "reward_std": 0.6468341499567032, + "rewards/cosine_scaled_reward": -0.2300244935322553, + "rewards/format_reward": 0.3750000074505806, + "step": 232 + }, + { + "advantage_max": 1.2980494424700737, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.6965583972632885, + "advantage_std": 0.7367055304348469, + "completion_length": 2740.0209045410156, + "epoch": 0.2662857142857143, + "grad_norm": 0.794073760509491, + "kl": 0.32879638671875, + "lambda_div_used": 0.6, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0914, + "reward": -0.07508878409862518, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07508878409862518, + "reward_after_std": 0.7367055304348469, + "reward_before_mean": 0.28439231449738145, + "reward_before_std": 0.7708801850676537, + "reward_change_max": 0.0, + "reward_change_mean": -0.35948106786236167, + "reward_change_min": -0.7288849875330925, + "reward_change_std": 0.29315769439563155, + "reward_std": 0.7367055453360081, + "rewards/cosine_scaled_reward": -0.07655386440455914, + "rewards/format_reward": 0.43750000931322575, + "step": 233 + }, + { + "advantage_max": 1.110601656138897, + "advantage_mean": 1.7695129667094633e-08, + "advantage_min": -0.6541510969400406, + "advantage_std": 0.6425810307264328, + "completion_length": 2760.6459007263184, + "epoch": 0.2674285714285714, + "grad_norm": 0.41915634274482727, + "kl": 0.32171630859375, + "lambda_div_used": 0.6, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0248, + "reward": -0.13608171977102757, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.13608171977102757, + "reward_after_std": 0.6425810270011425, + "reward_before_mean": 0.2089826725423336, + "reward_before_std": 0.676423005759716, + "reward_change_max": 0.0001988634467124939, + "reward_change_mean": -0.34506439371034503, + "reward_change_min": -0.6399761959910393, + "reward_change_std": 0.2727101487107575, + "reward_std": 0.642581045627594, + "rewards/cosine_scaled_reward": -0.1038419995456934, + "rewards/format_reward": 0.41666666977107525, + "step": 234 + }, + { + "advantage_max": 1.3879477083683014, + "advantage_mean": -1.2417634809303024e-08, + "advantage_min": -0.702332578599453, + "advantage_std": 0.7649212591350079, + "completion_length": 2257.4791717529297, + "epoch": 0.26857142857142857, + "grad_norm": 0.5285465121269226, + "kl": 0.29833984375, + "lambda_div_used": 0.6, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0536, + "reward": 0.12406621873378754, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12406621873378754, + "reward_after_std": 0.7649212367832661, + "reward_before_mean": 0.5831809509545565, + "reward_before_std": 0.7322921007871628, + "reward_change_max": 0.0016368404030799866, + "reward_change_mean": -0.45911472756415606, + "reward_change_min": -0.8515269458293915, + "reward_change_std": 0.32818734738975763, + "reward_std": 0.764921247959137, + "rewards/cosine_scaled_reward": 0.02075712662190199, + "rewards/format_reward": 0.5416666716337204, + "step": 235 + }, + { + "advantage_max": 1.4723257198929787, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.6119029596447945, + "advantage_std": 0.7948046922683716, + "completion_length": 2917.666748046875, + "epoch": 0.26971428571428574, + "grad_norm": 0.39669105410575867, + "kl": 0.313720703125, + "lambda_div_used": 0.6, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0282, + "reward": -0.14454367384314537, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14454367384314537, + "reward_after_std": 0.7948046959936619, + "reward_before_mean": 0.15732058137655258, + "reward_before_std": 0.7951191272586584, + "reward_change_max": 0.0004043877124786377, + "reward_change_mean": -0.30186421098187566, + "reward_change_min": -0.7042200863361359, + "reward_change_std": 0.26512502413243055, + "reward_std": 0.7948047444224358, + "rewards/cosine_scaled_reward": -0.14008972607553005, + "rewards/format_reward": 0.4375000037252903, + "step": 236 + }, + { + "advantage_max": 1.273007720708847, + "advantage_mean": -1.9868215128671096e-08, + "advantage_min": -0.7582220807671547, + "advantage_std": 0.7382465023547411, + "completion_length": 2680.7083587646484, + "epoch": 0.27085714285714285, + "grad_norm": 0.6276447772979736, + "kl": 0.2998046875, + "lambda_div_used": 0.6, + "learning_rate": 6.679851303883891e-07, + "loss": 0.068, + "reward": 0.12813042849302292, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12813042849302292, + "reward_after_std": 0.7382464874535799, + "reward_before_mean": 0.5997425927780569, + "reward_before_std": 0.7308066599071026, + "reward_change_max": 0.0017982348799705505, + "reward_change_mean": -0.4716122280806303, + "reward_change_min": -0.8231159038841724, + "reward_change_std": 0.35379940923303366, + "reward_std": 0.7382465451955795, + "rewards/cosine_scaled_reward": 0.029037967324256897, + "rewards/format_reward": 0.5416666772216558, + "step": 237 + }, + { + "advantage_max": 1.7318901792168617, + "advantage_mean": -1.9247333560290514e-08, + "advantage_min": -0.9657696727663279, + "advantage_std": 0.9731025137007236, + "completion_length": 2463.729232788086, + "epoch": 0.272, + "grad_norm": 0.9471742510795593, + "kl": 0.32684326171875, + "lambda_div_used": 0.6, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0699, + "reward": 0.355283772572875, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.355283772572875, + "reward_after_std": 0.9731024987995625, + "reward_before_mean": 0.9031340063083917, + "reward_before_std": 0.9712144713848829, + "reward_change_max": 0.0004456937313079834, + "reward_change_mean": -0.5478502493351698, + "reward_change_min": -1.0639617405831814, + "reward_change_std": 0.41797661781311035, + "reward_std": 0.9731025658547878, + "rewards/cosine_scaled_reward": 0.08698366861790419, + "rewards/format_reward": 0.7291666846722364, + "step": 238 + }, + { + "advantage_max": 1.5018570870161057, + "advantage_mean": -2.0489097085629737e-08, + "advantage_min": -0.7982526607811451, + "advantage_std": 0.8127573877573013, + "completion_length": 2166.1666984558105, + "epoch": 0.27314285714285713, + "grad_norm": 0.36692744493484497, + "kl": 0.2796630859375, + "lambda_div_used": 0.6, + "learning_rate": 6.619104492241847e-07, + "loss": 0.03, + "reward": 0.2998898196965456, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2998898196965456, + "reward_after_std": 0.8127573877573013, + "reward_before_mean": 0.8436064720153809, + "reward_before_std": 0.738035224378109, + "reward_change_max": 0.0012630298733711243, + "reward_change_mean": -0.5437166802585125, + "reward_change_min": -0.9020101800560951, + "reward_change_std": 0.3636026941239834, + "reward_std": 0.8127574101090431, + "rewards/cosine_scaled_reward": 0.1613865476101637, + "rewards/format_reward": 0.5208333376795053, + "step": 239 + }, + { + "advantage_max": 0.97071772813797, + "advantage_mean": 1.7384688466570708e-08, + "advantage_min": -0.4119010157883167, + "advantage_std": 0.5121559863910079, + "completion_length": 3031.812515258789, + "epoch": 0.2742857142857143, + "grad_norm": 0.821390688419342, + "kl": 0.65576171875, + "lambda_div_used": 0.6, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0514, + "reward": -0.32652486581355333, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32652486581355333, + "reward_after_std": 0.5121559603139758, + "reward_before_mean": -0.07132751506287605, + "reward_before_std": 0.4690159521996975, + "reward_change_max": 0.009721644222736359, + "reward_change_mean": -0.25519734993577003, + "reward_change_min": -0.4614217281341553, + "reward_change_std": 0.1805514907464385, + "reward_std": 0.512155975215137, + "rewards/cosine_scaled_reward": -0.22316376119852066, + "rewards/format_reward": 0.3750000037252903, + "step": 240 + }, + { + "advantage_max": 1.0398504100739956, + "advantage_mean": 2.7939678126642775e-08, + "advantage_min": -0.5276342928409576, + "advantage_std": 0.5748845934867859, + "completion_length": 3315.875, + "epoch": 0.2754285714285714, + "grad_norm": 0.49699658155441284, + "kl": 0.58642578125, + "lambda_div_used": 0.6, + "learning_rate": 6.558139508961654e-07, + "loss": 0.062, + "reward": -0.4127648015273735, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4127648015273735, + "reward_after_std": 0.5748845934867859, + "reward_before_mean": -0.21113576227799058, + "reward_before_std": 0.6055608876049519, + "reward_change_max": 0.0006798282265663147, + "reward_change_mean": -0.20162902306765318, + "reward_change_min": -0.4717538245022297, + "reward_change_std": 0.19875234365463257, + "reward_std": 0.5748846232891083, + "rewards/cosine_scaled_reward": -0.22015121672302485, + "rewards/format_reward": 0.22916667349636555, + "step": 241 + }, + { + "advantage_max": 1.3193005844950676, + "advantage_mean": -9.934107481068821e-09, + "advantage_min": -0.48087476566433907, + "advantage_std": 0.671697337180376, + "completion_length": 2253.812545776367, + "epoch": 0.2765714285714286, + "grad_norm": 0.6630571484565735, + "kl": 0.50335693359375, + "lambda_div_used": 0.6, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0349, + "reward": 0.1387800257652998, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1387800257652998, + "reward_after_std": 0.6716973129659891, + "reward_before_mean": 0.6152276992797852, + "reward_before_std": 0.5239123981446028, + "reward_change_max": 0.0019446909427642822, + "reward_change_mean": -0.4764476642012596, + "reward_change_min": -0.7161272764205933, + "reward_change_std": 0.27618725039064884, + "reward_std": 0.6716973222792149, + "rewards/cosine_scaled_reward": -0.046552833169698715, + "rewards/format_reward": 0.7083333395421505, + "step": 242 + }, + { + "advantage_max": 1.5651024878025055, + "advantage_mean": -2.6697914767837005e-08, + "advantage_min": -0.8467311263084412, + "advantage_std": 0.8558754622936249, + "completion_length": 2888.9376068115234, + "epoch": 0.2777142857142857, + "grad_norm": 0.5963757038116455, + "kl": 0.505859375, + "lambda_div_used": 0.6, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0682, + "reward": 0.2457761913537979, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2457761913537979, + "reward_after_std": 0.8558754920959473, + "reward_before_mean": 0.7540382880251855, + "reward_before_std": 0.8090751152485609, + "reward_change_max": 0.0006944984197616577, + "reward_change_mean": -0.5082621518522501, + "reward_change_min": -0.9171033464372158, + "reward_change_std": 0.3666645511984825, + "reward_std": 0.8558755144476891, + "rewards/cosine_scaled_reward": 0.11660249065607786, + "rewards/format_reward": 0.520833345130086, + "step": 243 + }, + { + "advantage_max": 1.2792055383324623, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.4830026626586914, + "advantage_std": 0.663687277585268, + "completion_length": 2899.166717529297, + "epoch": 0.27885714285714286, + "grad_norm": 0.6811074614524841, + "kl": 0.5177001953125, + "lambda_div_used": 0.6, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0415, + "reward": 0.08414949290454388, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08414949290454388, + "reward_after_std": 0.663687277585268, + "reward_before_mean": 0.5329303331673145, + "reward_before_std": 0.5415982883423567, + "reward_change_max": 0.0, + "reward_change_mean": -0.4487808058038354, + "reward_change_min": -0.6938077807426453, + "reward_change_std": 0.2719251224771142, + "reward_std": 0.6636872962117195, + "rewards/cosine_scaled_reward": 0.08938181702978909, + "rewards/format_reward": 0.3541666679084301, + "step": 244 + }, + { + "advantage_max": 1.0939532667398453, + "advantage_mean": 5.551115123125783e-17, + "advantage_min": -0.47427595779299736, + "advantage_std": 0.6055382005870342, + "completion_length": 3353.0625610351562, + "epoch": 0.28, + "grad_norm": 0.5311272740364075, + "kl": 0.607177734375, + "lambda_div_used": 0.6, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0564, + "reward": -0.3409585952758789, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3409585952758789, + "reward_after_std": 0.605538222938776, + "reward_before_mean": -0.10389573313295841, + "reward_before_std": 0.6280820369720459, + "reward_change_max": 1.564621925354004e-07, + "reward_change_mean": -0.2370628654025495, + "reward_change_min": -0.6031081043183804, + "reward_change_std": 0.2207367429509759, + "reward_std": 0.6055382266640663, + "rewards/cosine_scaled_reward": -0.17694786563515663, + "rewards/format_reward": 0.25000000931322575, + "step": 245 + }, + { + "advantage_max": 1.2697142958641052, + "advantage_mean": -1.6763806509612067e-08, + "advantage_min": -0.5935459956526756, + "advantage_std": 0.6914196014404297, + "completion_length": 3002.0834045410156, + "epoch": 0.28114285714285714, + "grad_norm": 0.4564787745475769, + "kl": 0.54296875, + "lambda_div_used": 0.6, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0736, + "reward": 0.003791339695453644, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.003791339695453644, + "reward_after_std": 0.6914196237921715, + "reward_before_mean": 0.4101965553127229, + "reward_before_std": 0.6638990417122841, + "reward_change_max": 0.0008903145790100098, + "reward_change_mean": -0.4064051969908178, + "reward_change_min": -0.7757539190351963, + "reward_change_std": 0.2804739885032177, + "reward_std": 0.6914196386933327, + "rewards/cosine_scaled_reward": -0.03448507562279701, + "rewards/format_reward": 0.47916667349636555, + "step": 246 + }, + { + "advantage_max": 1.4313408359885216, + "advantage_mean": 9.002785295031401e-09, + "advantage_min": -0.6096308752894402, + "advantage_std": 0.7550412714481354, + "completion_length": 3272.1875610351562, + "epoch": 0.2822857142857143, + "grad_norm": 0.43419909477233887, + "kl": 0.541259765625, + "lambda_div_used": 0.6, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0428, + "reward": -0.21265746047720313, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.21265746047720313, + "reward_after_std": 0.7550412565469742, + "reward_before_mean": 0.052691347897052765, + "reward_before_std": 0.7353229857981205, + "reward_change_max": 0.001304030418395996, + "reward_change_mean": -0.26534880325198174, + "reward_change_min": -0.5918116085231304, + "reward_change_std": 0.22167464904487133, + "reward_std": 0.755041278898716, + "rewards/cosine_scaled_reward": -0.17157100839540362, + "rewards/format_reward": 0.3958333469927311, + "step": 247 + }, + { + "advantage_max": 1.2454994097352028, + "advantage_mean": -4.967053435223079e-09, + "advantage_min": -0.6300271227955818, + "advantage_std": 0.6934917159378529, + "completion_length": 2788.979217529297, + "epoch": 0.2834285714285714, + "grad_norm": 0.46329057216644287, + "kl": 0.4290771484375, + "lambda_div_used": 0.6, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0367, + "reward": 0.034394118934869766, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.034394118934869766, + "reward_after_std": 0.693491704761982, + "reward_before_mean": 0.46039974445011467, + "reward_before_std": 0.6687365509569645, + "reward_change_max": 0.0004130154848098755, + "reward_change_mean": -0.4260056307539344, + "reward_change_min": -0.7904873788356781, + "reward_change_std": 0.3156164027750492, + "reward_std": 0.6934917382895947, + "rewards/cosine_scaled_reward": 0.021866535767912865, + "rewards/format_reward": 0.4166666716337204, + "step": 248 + }, + { + "advantage_max": 1.774680495262146, + "advantage_mean": -2.359350537162186e-08, + "advantage_min": -0.7601935900747776, + "advantage_std": 0.9341864809393883, + "completion_length": 2269.8750381469727, + "epoch": 0.2845714285714286, + "grad_norm": 0.7354264855384827, + "kl": 0.310791015625, + "lambda_div_used": 0.6, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0632, + "reward": 0.21180204581469297, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21180204581469297, + "reward_after_std": 0.9341864800080657, + "reward_before_mean": 0.677543967962265, + "reward_before_std": 0.8651815243065357, + "reward_change_max": 0.0007881596684455872, + "reward_change_mean": -0.4657419379800558, + "reward_change_min": -0.765058133751154, + "reward_change_std": 0.31003841245546937, + "reward_std": 0.934186520986259, + "rewards/cosine_scaled_reward": 0.047105309553444386, + "rewards/format_reward": 0.5833333414047956, + "step": 249 + }, + { + "advantage_max": 1.3317992761731148, + "advantage_mean": 8.071462387349015e-09, + "advantage_min": -0.6089577861130238, + "advantage_std": 0.7215257622301579, + "completion_length": 2671.8959045410156, + "epoch": 0.2857142857142857, + "grad_norm": 0.6826204061508179, + "kl": 0.4267578125, + "lambda_div_used": 0.6, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0164, + "reward": -0.07462250138632953, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07462250138632953, + "reward_after_std": 0.7215257547795773, + "reward_before_mean": 0.27664079144597054, + "reward_before_std": 0.7012166492640972, + "reward_change_max": 0.0021498724818229675, + "reward_change_mean": -0.3512632828205824, + "reward_change_min": -0.7547120712697506, + "reward_change_std": 0.27203070372343063, + "reward_std": 0.7215257957577705, + "rewards/cosine_scaled_reward": -0.17417962139006704, + "rewards/format_reward": 0.6250000074505806, + "step": 250 + }, + { + "advantage_max": 1.5387679040431976, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.7006419822573662, + "advantage_std": 0.8206590004265308, + "completion_length": 2208.729202270508, + "epoch": 0.28685714285714287, + "grad_norm": 0.6821022033691406, + "kl": 0.33001708984375, + "lambda_div_used": 0.6, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0084, + "reward": 0.1401051990687847, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1401051990687847, + "reward_after_std": 0.8206590078771114, + "reward_before_mean": 0.5900087499758229, + "reward_before_std": 0.7465699054300785, + "reward_change_max": 0.0010773837566375732, + "reward_change_mean": -0.4499035747721791, + "reward_change_min": -0.8058441616594791, + "reward_change_std": 0.3140787845477462, + "reward_std": 0.820659015327692, + "rewards/cosine_scaled_reward": -0.027912288904190063, + "rewards/format_reward": 0.6458333395421505, + "step": 251 + }, + { + "advantage_max": 1.1559683978557587, + "advantage_mean": -1.2417644135176431e-09, + "advantage_min": -0.5106577202677727, + "advantage_std": 0.6102561987936497, + "completion_length": 3031.6041870117188, + "epoch": 0.288, + "grad_norm": 0.283564954996109, + "kl": 0.33856201171875, + "lambda_div_used": 0.6, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0426, + "reward": -0.11627742386190221, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11627742386190221, + "reward_after_std": 0.6102561987936497, + "reward_before_mean": 0.23478730767965317, + "reward_before_std": 0.5429081320762634, + "reward_change_max": 0.000623457133769989, + "reward_change_mean": -0.35106473602354527, + "reward_change_min": -0.6218222491443157, + "reward_change_std": 0.2379716858267784, + "reward_std": 0.6102562211453915, + "rewards/cosine_scaled_reward": -0.09093968477100134, + "rewards/format_reward": 0.41666667349636555, + "step": 252 + }, + { + "advantage_max": 1.329535834491253, + "advantage_mean": -1.6763806454100916e-08, + "advantage_min": -0.8195891305804253, + "advantage_std": 0.7661233134567738, + "completion_length": 2583.166732788086, + "epoch": 0.28914285714285715, + "grad_norm": 0.5934932231903076, + "kl": 0.27423095703125, + "lambda_div_used": 0.6, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0548, + "reward": 0.16420308127999306, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16420308127999306, + "reward_after_std": 0.7661233209073544, + "reward_before_mean": 0.652702329447493, + "reward_before_std": 0.7853073924779892, + "reward_change_max": 0.0004984140396118164, + "reward_change_mean": -0.4884992679581046, + "reward_change_min": -0.8899687603116035, + "reward_change_std": 0.35754732973873615, + "reward_std": 0.7661233320832253, + "rewards/cosine_scaled_reward": -0.017398834694176912, + "rewards/format_reward": 0.6875000111758709, + "step": 253 + }, + { + "advantage_max": 1.16770701110363, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.5569769516587257, + "advantage_std": 0.6323312036693096, + "completion_length": 3085.729217529297, + "epoch": 0.29028571428571426, + "grad_norm": 0.36022013425827026, + "kl": 0.3563232421875, + "lambda_div_used": 0.6, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0417, + "reward": -0.24699397385120392, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.24699397385120392, + "reward_after_std": 0.6323312278836966, + "reward_before_mean": 0.030835852958261967, + "reward_before_std": 0.621703639626503, + "reward_change_max": 0.003024943172931671, + "reward_change_mean": -0.2778298296034336, + "reward_change_min": -0.5625698752701283, + "reward_change_std": 0.2203113967552781, + "reward_std": 0.6323312316089869, + "rewards/cosine_scaled_reward": -0.16166540794074535, + "rewards/format_reward": 0.35416667349636555, + "step": 254 + }, + { + "advantage_max": 1.2296145930886269, + "advantage_mean": -6.208814573582799e-10, + "advantage_min": -0.6068962588906288, + "advantage_std": 0.6709359288215637, + "completion_length": 3051.7083892822266, + "epoch": 0.2914285714285714, + "grad_norm": 0.3220326006412506, + "kl": 0.35479736328125, + "lambda_div_used": 0.6, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0296, + "reward": -0.29262065328657627, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.29262065328657627, + "reward_after_std": 0.6709359437227249, + "reward_before_mean": -0.0455952831543982, + "reward_before_std": 0.6920541971921921, + "reward_change_max": 0.0018450617790222168, + "reward_change_mean": -0.24702537804841995, + "reward_change_min": -0.5441051162779331, + "reward_change_std": 0.21536783035844564, + "reward_std": 0.6709359511733055, + "rewards/cosine_scaled_reward": -0.16863098926842213, + "rewards/format_reward": 0.2916666716337204, + "step": 255 + }, + { + "advantage_max": 1.2679320387542248, + "advantage_mean": 1.7384688910659918e-08, + "advantage_min": -0.7408056110143661, + "advantage_std": 0.7108086459338665, + "completion_length": 3014.812530517578, + "epoch": 0.2925714285714286, + "grad_norm": 0.47123652696609497, + "kl": 0.3238525390625, + "lambda_div_used": 0.6, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0183, + "reward": -0.056694136932492256, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.056694136932492256, + "reward_after_std": 0.710808627307415, + "reward_before_mean": 0.31453731283545494, + "reward_before_std": 0.7238197699189186, + "reward_change_max": 0.0021554455161094666, + "reward_change_mean": -0.3712314344011247, + "reward_change_min": -0.6968014165759087, + "reward_change_std": 0.28223771415650845, + "reward_std": 0.7108086608350277, + "rewards/cosine_scaled_reward": -0.06148135010153055, + "rewards/format_reward": 0.43750002048909664, + "step": 256 + }, + { + "advantage_max": 1.7293520271778107, + "advantage_mean": -1.4280279847511679e-08, + "advantage_min": -0.8434292376041412, + "advantage_std": 0.9644281379878521, + "completion_length": 3112.6875610351562, + "epoch": 0.2937142857142857, + "grad_norm": 1.1420844793319702, + "kl": 0.31341552734375, + "lambda_div_used": 0.6, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0405, + "reward": 0.11980674788355827, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11980674788355827, + "reward_after_std": 0.9644281603395939, + "reward_before_mean": 0.541266412474215, + "reward_before_std": 0.9871671870350838, + "reward_change_max": 0.0004280358552932739, + "reward_change_mean": -0.4214596524834633, + "reward_change_min": -0.8742838054895401, + "reward_change_std": 0.36773448437452316, + "reward_std": 0.9644281938672066, + "rewards/cosine_scaled_reward": 0.041466531343758106, + "rewards/format_reward": 0.4583333469927311, + "step": 257 + }, + { + "advantage_max": 1.308477059006691, + "advantage_mean": -8.071462664904772e-09, + "advantage_min": -0.6776211000978947, + "advantage_std": 0.7201198823750019, + "completion_length": 3318.041717529297, + "epoch": 0.2948571428571429, + "grad_norm": 0.5514835715293884, + "kl": 0.3533935546875, + "lambda_div_used": 0.6, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0511, + "reward": -0.1533919759094715, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1533919759094715, + "reward_after_std": 0.7201199196279049, + "reward_before_mean": 0.16029761778190732, + "reward_before_std": 0.7385069392621517, + "reward_change_max": 0.0006575882434844971, + "reward_change_mean": -0.31368961185216904, + "reward_change_min": -0.6525617055594921, + "reward_change_std": 0.2573896599933505, + "reward_std": 0.7201199345290661, + "rewards/cosine_scaled_reward": -0.09693453786894679, + "rewards/format_reward": 0.354166679084301, + "step": 258 + }, + { + "advantage_max": 1.3717726543545723, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -0.6455200538039207, + "advantage_std": 0.7451611235737801, + "completion_length": 2742.0833892822266, + "epoch": 0.296, + "grad_norm": 0.6136261224746704, + "kl": 0.32659912109375, + "lambda_div_used": 0.6, + "learning_rate": 6.001610194928464e-07, + "loss": 0.049, + "reward": 0.19095646031200886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19095646031200886, + "reward_after_std": 0.7451611235737801, + "reward_before_mean": 0.6913169464096427, + "reward_before_std": 0.6776915155351162, + "reward_change_max": 0.0005735307931900024, + "reward_change_mean": -0.500360487960279, + "reward_change_min": -0.9127090983092785, + "reward_change_std": 0.34858130011707544, + "reward_std": 0.7451611422002316, + "rewards/cosine_scaled_reward": 0.06440844899043441, + "rewards/format_reward": 0.5625000074505806, + "step": 259 + }, + { + "advantage_max": 1.6590567827224731, + "advantage_mean": 7.45058070794613e-09, + "advantage_min": -0.7553113959729671, + "advantage_std": 0.9088802076876163, + "completion_length": 2397.645881652832, + "epoch": 0.29714285714285715, + "grad_norm": 0.7651255130767822, + "kl": 0.28924560546875, + "lambda_div_used": 0.6, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0515, + "reward": 0.3306399695575237, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3306399695575237, + "reward_after_std": 0.908880215138197, + "reward_before_mean": 0.8759937360882759, + "reward_before_std": 0.861339095979929, + "reward_change_max": 0.0006400644779205322, + "reward_change_mean": -0.545353771187365, + "reward_change_min": -1.0057422816753387, + "reward_change_std": 0.39418873470276594, + "reward_std": 0.9088802374899387, + "rewards/cosine_scaled_reward": 0.1567468661814928, + "rewards/format_reward": 0.5625000037252903, + "step": 260 + }, + { + "advantage_max": 1.0520955845713615, + "advantage_mean": 1.490116219304838e-08, + "advantage_min": -0.5371618010103703, + "advantage_std": 0.5718340016901493, + "completion_length": 3145.166717529297, + "epoch": 0.29828571428571427, + "grad_norm": 0.48807358741760254, + "kl": 0.3935546875, + "lambda_div_used": 0.6, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0241, + "reward": -0.15376926213502884, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15376926213502884, + "reward_after_std": 0.5718340091407299, + "reward_before_mean": 0.1892886906862259, + "reward_before_std": 0.5439359992742538, + "reward_change_max": 0.000623844563961029, + "reward_change_mean": -0.3430579248815775, + "reward_change_min": -0.5786691717803478, + "reward_change_std": 0.2355566336773336, + "reward_std": 0.5718340240418911, + "rewards/cosine_scaled_reward": -0.1032723356038332, + "rewards/format_reward": 0.3958333358168602, + "step": 261 + }, + { + "advantage_max": 1.3386315703392029, + "advantage_mean": 1.1796753240922442e-08, + "advantage_min": -0.6328083500266075, + "advantage_std": 0.7234783992171288, + "completion_length": 3038.437545776367, + "epoch": 0.29942857142857143, + "grad_norm": 0.9701263904571533, + "kl": 0.442626953125, + "lambda_div_used": 0.6, + "learning_rate": 5.907846610890011e-07, + "loss": 0.096, + "reward": -0.25120984390378, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.25120984390378, + "reward_after_std": 0.7234784215688705, + "reward_before_mean": 0.006802541669458151, + "reward_before_std": 0.7268307991325855, + "reward_change_max": 0.0003486126661300659, + "reward_change_mean": -0.2580123767256737, + "reward_change_min": -0.5682838223874569, + "reward_change_std": 0.23459640704095364, + "reward_std": 0.7234784476459026, + "rewards/cosine_scaled_reward": -0.2257654066197574, + "rewards/format_reward": 0.45833333767950535, + "step": 262 + }, + { + "advantage_max": 1.17459299787879, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.5453278385102749, + "advantage_std": 0.6335834860801697, + "completion_length": 2903.5625534057617, + "epoch": 0.30057142857142854, + "grad_norm": 0.4829193949699402, + "kl": 0.52587890625, + "lambda_div_used": 0.6, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0646, + "reward": -0.29326684278203174, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.29326684278203174, + "reward_after_std": 0.6335834860801697, + "reward_before_mean": -0.041429828852415085, + "reward_before_std": 0.6316493209451437, + "reward_change_max": 0.00026198476552963257, + "reward_change_mean": -0.2518370160833001, + "reward_change_min": -0.5603325888514519, + "reward_change_std": 0.21581022161990404, + "reward_std": 0.6335835084319115, + "rewards/cosine_scaled_reward": -0.18738158675841987, + "rewards/format_reward": 0.33333333767950535, + "step": 263 + }, + { + "advantage_max": 1.300993226468563, + "advantage_mean": 4.346172144398253e-09, + "advantage_min": -0.6545535698533058, + "advantage_std": 0.724092248827219, + "completion_length": 3088.1250610351562, + "epoch": 0.3017142857142857, + "grad_norm": 0.46063876152038574, + "kl": 0.520751953125, + "lambda_div_used": 0.6, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0537, + "reward": -0.05797311244532466, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05797311244532466, + "reward_after_std": 0.7240922451019287, + "reward_before_mean": 0.31046564131975174, + "reward_before_std": 0.7186985239386559, + "reward_change_max": 0.0, + "reward_change_mean": -0.3684387691318989, + "reward_change_min": -0.723139800131321, + "reward_change_std": 0.2936291787773371, + "reward_std": 0.724092248827219, + "rewards/cosine_scaled_reward": -0.13643384957686067, + "rewards/format_reward": 0.5833333469927311, + "step": 264 + }, + { + "advantage_max": 1.6109931096434593, + "advantage_mean": -1.552204331733975e-08, + "advantage_min": -0.7705529257655144, + "advantage_std": 0.86626897752285, + "completion_length": 2894.708450317383, + "epoch": 0.3028571428571429, + "grad_norm": 0.9522861838340759, + "kl": 0.5997314453125, + "lambda_div_used": 0.6, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0947, + "reward": 0.16067335568368435, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16067335568368435, + "reward_after_std": 0.86626897752285, + "reward_before_mean": 0.6178323356434703, + "reward_before_std": 0.8223806396126747, + "reward_change_max": 0.0, + "reward_change_mean": -0.4571589883416891, + "reward_change_min": -0.8531933054327965, + "reward_change_std": 0.3247545287013054, + "reward_std": 0.8662689924240112, + "rewards/cosine_scaled_reward": 0.027666167356073856, + "rewards/format_reward": 0.5625000186264515, + "step": 265 + }, + { + "advantage_max": 1.2452225089073181, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.6117013469338417, + "advantage_std": 0.6951426491141319, + "completion_length": 3169.5833892822266, + "epoch": 0.304, + "grad_norm": 1.1230192184448242, + "kl": 0.6510009765625, + "lambda_div_used": 0.6, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0369, + "reward": -0.2725262697786093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2725262697786093, + "reward_after_std": 0.6951426453888416, + "reward_before_mean": -0.015931952744722366, + "reward_before_std": 0.7379988580942154, + "reward_change_max": 0.0018127188086509705, + "reward_change_mean": -0.25659431144595146, + "reward_change_min": -0.6191375590860844, + "reward_change_std": 0.25174527056515217, + "reward_std": 0.6951426677405834, + "rewards/cosine_scaled_reward": -0.22671598196029663, + "rewards/format_reward": 0.43750000558793545, + "step": 266 + }, + { + "advantage_max": 0.9989327527582645, + "advantage_mean": 6.208818459363386e-10, + "advantage_min": -0.480878084897995, + "advantage_std": 0.5396589785814285, + "completion_length": 3455.2500610351562, + "epoch": 0.30514285714285716, + "grad_norm": 0.9501419067382812, + "kl": 0.77880859375, + "lambda_div_used": 0.6, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0558, + "reward": -0.4211684428155422, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4211684428155422, + "reward_after_std": 0.5396589823067188, + "reward_before_mean": -0.21912843873724341, + "reward_before_std": 0.5446715801954269, + "reward_change_max": 0.0012646690011024475, + "reward_change_mean": -0.2020400082692504, + "reward_change_min": -0.45151586830616, + "reward_change_std": 0.17916888557374477, + "reward_std": 0.5396590121090412, + "rewards/cosine_scaled_reward": -0.23456422612071037, + "rewards/format_reward": 0.2500000037252903, + "step": 267 + }, + { + "advantage_max": 1.4540027901530266, + "advantage_mean": 1.2728075704515618e-08, + "advantage_min": -0.6779943779110909, + "advantage_std": 0.7723269909620285, + "completion_length": 2636.6458740234375, + "epoch": 0.3062857142857143, + "grad_norm": 0.7164831757545471, + "kl": 0.58544921875, + "lambda_div_used": 0.6, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0633, + "reward": -0.14749359339475632, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14749359339475632, + "reward_after_std": 0.7723269797861576, + "reward_before_mean": 0.15347140841186047, + "reward_before_std": 0.7567279189825058, + "reward_change_max": 0.0001808255910873413, + "reward_change_mean": -0.30096501484513283, + "reward_change_min": -0.5972777009010315, + "reward_change_std": 0.23437961423769593, + "reward_std": 0.7723270021378994, + "rewards/cosine_scaled_reward": -0.16284763207659125, + "rewards/format_reward": 0.47916668467223644, + "step": 268 + }, + { + "advantage_max": 1.5683661699295044, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.6702335737645626, + "advantage_std": 0.8368349559605122, + "completion_length": 2962.2709197998047, + "epoch": 0.30742857142857144, + "grad_norm": 0.7401025295257568, + "kl": 0.635498046875, + "lambda_div_used": 0.6, + "learning_rate": 5.688440441781398e-07, + "loss": 0.079, + "reward": -0.021501684561371803, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.021501684561371803, + "reward_after_std": 0.8368349559605122, + "reward_before_mean": 0.3379681808874011, + "reward_before_std": 0.8070851732045412, + "reward_change_max": 0.0007351338863372803, + "reward_change_mean": -0.359469898045063, + "reward_change_min": -0.7419305182993412, + "reward_change_std": 0.2838026713579893, + "reward_std": 0.8368349634110928, + "rewards/cosine_scaled_reward": -0.1435159114189446, + "rewards/format_reward": 0.6250000074505806, + "step": 269 + }, + { + "advantage_max": 1.7115082815289497, + "advantage_mean": 2.483527050678447e-09, + "advantage_min": -0.7837255597114563, + "advantage_std": 0.9412933625280857, + "completion_length": 2904.6459045410156, + "epoch": 0.30857142857142855, + "grad_norm": 0.807169497013092, + "kl": 0.6158447265625, + "lambda_div_used": 0.6, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0492, + "reward": 0.009002413600683212, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.009002413600683212, + "reward_after_std": 0.9412933364510536, + "reward_before_mean": 0.3679821668192744, + "reward_before_std": 0.9656250663101673, + "reward_change_max": 0.0015122592449188232, + "reward_change_mean": -0.35897977463901043, + "reward_change_min": -0.8852769210934639, + "reward_change_std": 0.3282699631527066, + "reward_std": 0.9412933550775051, + "rewards/cosine_scaled_reward": -0.07642558356747031, + "rewards/format_reward": 0.5208333414047956, + "step": 270 + }, + { + "advantage_max": 1.5734351687133312, + "advantage_mean": -2.2972623636707823e-08, + "advantage_min": -0.7996720634400845, + "advantage_std": 0.8608956038951874, + "completion_length": 2784.729217529297, + "epoch": 0.3097142857142857, + "grad_norm": 1.0408498048782349, + "kl": 0.579833984375, + "lambda_div_used": 0.6, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0876, + "reward": 0.2030478809028864, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2030478809028864, + "reward_after_std": 0.8608955889940262, + "reward_before_mean": 0.6846325844526291, + "reward_before_std": 0.835495188832283, + "reward_change_max": 0.0, + "reward_change_mean": -0.48158473148941994, + "reward_change_min": -0.8558024764060974, + "reward_change_std": 0.33973479084670544, + "reward_std": 0.8608956038951874, + "rewards/cosine_scaled_reward": 0.019399608485400677, + "rewards/format_reward": 0.6458333432674408, + "step": 271 + }, + { + "advantage_max": 1.3714451864361763, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.7535846754908562, + "advantage_std": 0.7635341472923756, + "completion_length": 3308.250030517578, + "epoch": 0.31085714285714283, + "grad_norm": 0.6424413323402405, + "kl": 0.76318359375, + "lambda_div_used": 0.6, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0771, + "reward": -0.20335895102471113, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.20335895102471113, + "reward_after_std": 0.7635341547429562, + "reward_before_mean": 0.07921543845441192, + "reward_before_std": 0.8059300296008587, + "reward_change_max": 0.0013331100344657898, + "reward_change_mean": -0.28257441613823175, + "reward_change_min": -0.6223890446126461, + "reward_change_std": 0.2657231818884611, + "reward_std": 0.7635341696441174, + "rewards/cosine_scaled_reward": -0.14789227582514286, + "rewards/format_reward": 0.3750000111758709, + "step": 272 + }, + { + "advantage_max": 1.3589064106345177, + "advantage_mean": 4.346172699509765e-09, + "advantage_min": -0.7836330458521843, + "advantage_std": 0.7793318666517735, + "completion_length": 3080.5209045410156, + "epoch": 0.312, + "grad_norm": 0.7329338192939758, + "kl": 0.748291015625, + "lambda_div_used": 0.6, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0901, + "reward": -0.043059684336185455, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.043059684336185455, + "reward_after_std": 0.7793318778276443, + "reward_before_mean": 0.328045935370028, + "reward_before_std": 0.8345182836055756, + "reward_change_max": 0.0015919134020805359, + "reward_change_mean": -0.3711056038737297, + "reward_change_min": -0.7348584309220314, + "reward_change_std": 0.31267216615378857, + "reward_std": 0.7793319001793861, + "rewards/cosine_scaled_reward": -0.07556037977337837, + "rewards/format_reward": 0.47916667722165585, + "step": 273 + }, + { + "advantage_max": 1.5300347954034805, + "advantage_mean": -2.0489097418696645e-08, + "advantage_min": -0.8000783994793892, + "advantage_std": 0.8445150479674339, + "completion_length": 2011.8333587646484, + "epoch": 0.31314285714285717, + "grad_norm": 1.2785873413085938, + "kl": 0.658935546875, + "lambda_div_used": 0.6, + "learning_rate": 5.531415671340826e-07, + "loss": -0.0116, + "reward": 0.510652432218194, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.510652432218194, + "reward_after_std": 0.8445150516927242, + "reward_before_mean": 1.165109277702868, + "reward_before_std": 0.757109772413969, + "reward_change_max": 0.000974707305431366, + "reward_change_mean": -0.6544568724930286, + "reward_change_min": -1.0643592663109303, + "reward_change_std": 0.4129680562764406, + "reward_std": 0.8445150889456272, + "rewards/cosine_scaled_reward": 0.21797129698097706, + "rewards/format_reward": 0.7291666679084301, + "step": 274 + }, + { + "advantage_max": 1.4885611981153488, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -0.7650426514446735, + "advantage_std": 0.8269946090877056, + "completion_length": 2577.875030517578, + "epoch": 0.3142857142857143, + "grad_norm": 0.6695863604545593, + "kl": 0.6025390625, + "lambda_div_used": 0.6, + "learning_rate": 5.5e-07, + "loss": 0.0346, + "reward": 0.1038482214207761, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1038482214207761, + "reward_after_std": 0.8269945941865444, + "reward_before_mean": 0.5431328900158405, + "reward_before_std": 0.8103073462843895, + "reward_change_max": 0.0015599504113197327, + "reward_change_mean": -0.4392846738919616, + "reward_change_min": -0.8281485103070736, + "reward_change_std": 0.3419040869921446, + "reward_std": 0.8269945941865444, + "rewards/cosine_scaled_reward": -0.009683551266789436, + "rewards/format_reward": 0.5625000093132257, + "step": 275 + }, + { + "advantage_max": 1.7513000518083572, + "advantage_mean": -1.8626451658843024e-08, + "advantage_min": -0.7792819105088711, + "advantage_std": 0.9374289289116859, + "completion_length": 2389.416717529297, + "epoch": 0.31542857142857145, + "grad_norm": 0.5462023019790649, + "kl": 0.61199951171875, + "lambda_div_used": 0.6, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0499, + "reward": 0.23297729715704918, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23297729715704918, + "reward_after_std": 0.9374288991093636, + "reward_before_mean": 0.7080944953486323, + "reward_before_std": 0.8812166787683964, + "reward_change_max": 0.0, + "reward_change_mean": -0.4751171786338091, + "reward_change_min": -0.9459218531847, + "reward_change_std": 0.3399658240377903, + "reward_std": 0.9374289140105247, + "rewards/cosine_scaled_reward": -0.00011943886056542397, + "rewards/format_reward": 0.7083333488553762, + "step": 276 + }, + { + "advantage_max": 1.5042954310774803, + "advantage_mean": 1.1796753018877837e-08, + "advantage_min": -0.6724752858281136, + "advantage_std": 0.8150804676115513, + "completion_length": 2284.187568664551, + "epoch": 0.31657142857142856, + "grad_norm": 0.8358162045478821, + "kl": 0.46759033203125, + "lambda_div_used": 0.6, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0198, + "reward": 0.021968248765915632, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.021968248765915632, + "reward_after_std": 0.8150804601609707, + "reward_before_mean": 0.4101370070129633, + "reward_before_std": 0.7939338013529778, + "reward_change_max": 0.0005772784352302551, + "reward_change_mean": -0.3881687559187412, + "reward_change_min": -0.8049648813903332, + "reward_change_std": 0.29894091188907623, + "reward_std": 0.8150804750621319, + "rewards/cosine_scaled_reward": -0.11784817464649677, + "rewards/format_reward": 0.6458333376795053, + "step": 277 + }, + { + "advantage_max": 1.0878044962882996, + "advantage_mean": 3.570070017389071e-09, + "advantage_min": -0.5774106942117214, + "advantage_std": 0.5889769718050957, + "completion_length": 2305.0000915527344, + "epoch": 0.3177142857142857, + "grad_norm": 0.7688349485397339, + "kl": 0.60107421875, + "lambda_div_used": 0.6, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0413, + "reward": 0.218816798646003, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.218816798646003, + "reward_after_std": 0.5889769680798054, + "reward_before_mean": 0.7605785094201565, + "reward_before_std": 0.4702807143330574, + "reward_change_max": 0.0, + "reward_change_mean": -0.5417617131024599, + "reward_change_min": -0.8437193483114243, + "reward_change_std": 0.3238454647362232, + "reward_std": 0.588976975530386, + "rewards/cosine_scaled_reward": 0.015705913305282593, + "rewards/format_reward": 0.7291666734963655, + "step": 278 + }, + { + "advantage_max": 1.6663006246089935, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.7244021371006966, + "advantage_std": 0.8816425837576389, + "completion_length": 3248.0625915527344, + "epoch": 0.31885714285714284, + "grad_norm": 0.9396555423736572, + "kl": 0.61865234375, + "lambda_div_used": 0.6, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0357, + "reward": -0.010367986280471087, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.010367986280471087, + "reward_after_std": 0.8816425688564777, + "reward_before_mean": 0.3451116308569908, + "reward_before_std": 0.8489609006792307, + "reward_change_max": 0.0029679909348487854, + "reward_change_mean": -0.3554796166718006, + "reward_change_min": -0.7218036316335201, + "reward_change_std": 0.28672708943486214, + "reward_std": 0.8816425986588001, + "rewards/cosine_scaled_reward": -0.13994419388473034, + "rewards/format_reward": 0.6250000074505806, + "step": 279 + }, + { + "advantage_max": 1.741914540529251, + "advantage_mean": -6.208817571184966e-09, + "advantage_min": -0.8550667315721512, + "advantage_std": 0.9599592238664627, + "completion_length": 2562.9792442321777, + "epoch": 0.32, + "grad_norm": 0.6332762837409973, + "kl": 0.53643798828125, + "lambda_div_used": 0.6, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0627, + "reward": 0.3203533738851547, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3203533738851547, + "reward_after_std": 0.9599592164158821, + "reward_before_mean": 0.8485358407488093, + "reward_before_std": 0.9343424178659916, + "reward_change_max": 0.0003201141953468323, + "reward_change_mean": -0.5281824506819248, + "reward_change_min": -0.9736247472465038, + "reward_change_std": 0.38505643233656883, + "reward_std": 0.9599592462182045, + "rewards/cosine_scaled_reward": 0.10135123133659363, + "rewards/format_reward": 0.6458333414047956, + "step": 280 + }, + { + "advantage_max": 0.9071239531040192, + "advantage_mean": 1.1796752907855534e-08, + "advantage_min": -0.46878183260560036, + "advantage_std": 0.4929075203835964, + "completion_length": 3433.9791870117188, + "epoch": 0.3211428571428571, + "grad_norm": 0.635045051574707, + "kl": 0.5234375, + "lambda_div_used": 0.6, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0368, + "reward": -0.4045607140287757, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4045607140287757, + "reward_after_std": 0.492907527834177, + "reward_before_mean": -0.18301626667380333, + "reward_before_std": 0.491706857457757, + "reward_change_max": 0.0007172971963882446, + "reward_change_mean": -0.2215444464236498, + "reward_change_min": -0.4169187992811203, + "reward_change_std": 0.17256073001772165, + "reward_std": 0.4929075501859188, + "rewards/cosine_scaled_reward": -0.19567479752004147, + "rewards/format_reward": 0.20833334140479565, + "step": 281 + }, + { + "advantage_max": 1.310217224061489, + "advantage_mean": -2.4835267176115394e-09, + "advantage_min": -0.6598837561905384, + "advantage_std": 0.7153121419250965, + "completion_length": 2712.916732788086, + "epoch": 0.3222857142857143, + "grad_norm": 0.42539331316947937, + "kl": 0.4349365234375, + "lambda_div_used": 0.6, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0524, + "reward": 0.23194605857133865, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23194605857133865, + "reward_after_std": 0.7153121344745159, + "reward_before_mean": 0.7622983139008284, + "reward_before_std": 0.6276755481958389, + "reward_change_max": 0.0, + "reward_change_mean": -0.530352272093296, + "reward_change_min": -0.9107529036700726, + "reward_change_std": 0.3500578925013542, + "reward_std": 0.7153121344745159, + "rewards/cosine_scaled_reward": 0.026982491835951805, + "rewards/format_reward": 0.7083333488553762, + "step": 282 + }, + { + "advantage_max": 1.7362473383545876, + "advantage_mean": -2.4835267176115394e-09, + "advantage_min": -0.780534915626049, + "advantage_std": 0.9419512934982777, + "completion_length": 3040.5833740234375, + "epoch": 0.32342857142857145, + "grad_norm": 1.8998653888702393, + "kl": 0.46240234375, + "lambda_div_used": 0.6, + "learning_rate": 5.248803227530763e-07, + "loss": 0.1153, + "reward": -0.013534542173147202, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.013534542173147202, + "reward_after_std": 0.9419512934982777, + "reward_before_mean": 0.330111525952816, + "reward_before_std": 0.9573567919433117, + "reward_change_max": 0.0005117952823638916, + "reward_change_mean": -0.34364606719464064, + "reward_change_min": -0.7443192265927792, + "reward_change_std": 0.2993211802095175, + "reward_std": 0.9419513083994389, + "rewards/cosine_scaled_reward": -0.043277584947645664, + "rewards/format_reward": 0.4166666679084301, + "step": 283 + }, + { + "advantage_max": 1.6517476364970207, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.739282701164484, + "advantage_std": 0.8730613552033901, + "completion_length": 2619.3125915527344, + "epoch": 0.32457142857142857, + "grad_norm": 0.5986530184745789, + "kl": 0.50341796875, + "lambda_div_used": 0.6, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0233, + "reward": 0.06523705890867859, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06523705890867859, + "reward_after_std": 0.8730613254010677, + "reward_before_mean": 0.46089828852564096, + "reward_before_std": 0.8123955279588699, + "reward_change_max": 0.0, + "reward_change_mean": -0.39566121622920036, + "reward_change_min": -0.7429598942399025, + "reward_change_std": 0.28500969521701336, + "reward_std": 0.8730613365769386, + "rewards/cosine_scaled_reward": -0.11330086522502825, + "rewards/format_reward": 0.6875000111758709, + "step": 284 + }, + { + "advantage_max": 1.032178670167923, + "advantage_mean": 6.829699417121304e-09, + "advantage_min": -0.5508750528097153, + "advantage_std": 0.5619474165141582, + "completion_length": 2122.6458587646484, + "epoch": 0.32571428571428573, + "grad_norm": 0.3667513132095337, + "kl": 0.301513671875, + "lambda_div_used": 0.6, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0112, + "reward": -0.03984304657205939, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03984304657205939, + "reward_after_std": 0.5619474314153194, + "reward_before_mean": 0.36875259317457676, + "reward_before_std": 0.5008511012420058, + "reward_change_max": 0.0, + "reward_change_mean": -0.40859564114362, + "reward_change_min": -0.6677859574556351, + "reward_change_std": 0.2631244119256735, + "reward_std": 0.5619474649429321, + "rewards/cosine_scaled_reward": -0.16979038482531905, + "rewards/format_reward": 0.7083333414047956, + "step": 285 + }, + { + "advantage_max": 1.6204382330179214, + "advantage_mean": 1.552204503818544e-09, + "advantage_min": -0.7265680469572544, + "advantage_std": 0.8686433807015419, + "completion_length": 2619.8333587646484, + "epoch": 0.32685714285714285, + "grad_norm": 1.1987253427505493, + "kl": 0.50897216796875, + "lambda_div_used": 0.6, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0563, + "reward": -0.004736738046631217, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.004736738046631217, + "reward_after_std": 0.8686433918774128, + "reward_before_mean": 0.3524339944124222, + "reward_before_std": 0.8589610904455185, + "reward_change_max": 0.0024899691343307495, + "reward_change_mean": -0.3571706861257553, + "reward_change_min": -0.7682550400495529, + "reward_change_std": 0.29713574796915054, + "reward_std": 0.8686434105038643, + "rewards/cosine_scaled_reward": -0.08419968781527132, + "rewards/format_reward": 0.5208333414047956, + "step": 286 + }, + { + "advantage_max": 1.2582305893301964, + "advantage_mean": -5.277494496969126e-09, + "advantage_min": -0.5760709270834923, + "advantage_std": 0.6693591959774494, + "completion_length": 2054.291717529297, + "epoch": 0.328, + "grad_norm": 1.019945740699768, + "kl": 0.637664794921875, + "lambda_div_used": 0.6, + "learning_rate": 5.123449705004581e-07, + "loss": 0.004, + "reward": 0.24294462392572314, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24294462392572314, + "reward_after_std": 0.6693591997027397, + "reward_before_mean": 0.7795294325333089, + "reward_before_std": 0.5335474237799644, + "reward_change_max": 0.0, + "reward_change_mean": -0.536584809422493, + "reward_change_min": -0.8521362952888012, + "reward_change_std": 0.32288376055657864, + "reward_std": 0.6693592220544815, + "rewards/cosine_scaled_reward": 0.05643137916922569, + "rewards/format_reward": 0.6666666716337204, + "step": 287 + }, + { + "advantage_max": 1.6867050975561142, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.636960681527853, + "advantage_std": 0.8619322814047337, + "completion_length": 2832.062530517578, + "epoch": 0.3291428571428571, + "grad_norm": 0.4221585690975189, + "kl": 0.447998046875, + "lambda_div_used": 0.6, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0462, + "reward": 0.07961262296885252, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07961262296885252, + "reward_after_std": 0.8619322963058949, + "reward_before_mean": 0.4838328785263002, + "reward_before_std": 0.7499809451401234, + "reward_change_max": 0.0010158568620681763, + "reward_change_mean": -0.40422030352056026, + "reward_change_min": -0.6797929219901562, + "reward_change_std": 0.26082053780555725, + "reward_std": 0.8619323261082172, + "rewards/cosine_scaled_reward": -0.06016687932424247, + "rewards/format_reward": 0.6041666828095913, + "step": 288 + }, + { + "advantage_max": 1.3328250646591187, + "advantage_mean": 1.179675274132208e-08, + "advantage_min": -0.6627430729568005, + "advantage_std": 0.7312614023685455, + "completion_length": 2030.6042098999023, + "epoch": 0.3302857142857143, + "grad_norm": 1.2222105264663696, + "kl": 0.337738037109375, + "lambda_div_used": 0.6, + "learning_rate": 5.060876951083828e-07, + "loss": -0.0151, + "reward": 0.18689546827226877, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18689546827226877, + "reward_after_std": 0.7312614023685455, + "reward_before_mean": 0.6872250400483608, + "reward_before_std": 0.6786364018917084, + "reward_change_max": 0.0, + "reward_change_mean": -0.5003295410424471, + "reward_change_min": -0.8750745318830013, + "reward_change_std": 0.3380218371748924, + "reward_std": 0.7312614247202873, + "rewards/cosine_scaled_reward": 0.02069583162665367, + "rewards/format_reward": 0.6458333376795053, + "step": 289 + }, + { + "advantage_max": 1.6091022714972496, + "advantage_mean": 4.346171977864799e-09, + "advantage_min": -0.7964055761694908, + "advantage_std": 0.8811142966151237, + "completion_length": 2952.041778564453, + "epoch": 0.3314285714285714, + "grad_norm": 1.1612440347671509, + "kl": 0.669189453125, + "lambda_div_used": 0.6, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0857, + "reward": -0.076158725656569, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.076158725656569, + "reward_after_std": 0.8811142891645432, + "reward_before_mean": 0.24741485621780157, + "reward_before_std": 0.9051576480269432, + "reward_change_max": 0.0009580254554748535, + "reward_change_mean": -0.3235735837370157, + "reward_change_min": -0.766759853810072, + "reward_change_std": 0.29583541117608547, + "reward_std": 0.8811143264174461, + "rewards/cosine_scaled_reward": -0.16795924259349704, + "rewards/format_reward": 0.5833333507180214, + "step": 290 + }, + { + "advantage_max": 1.2139404453337193, + "advantage_mean": 1.241763414316921e-09, + "advantage_min": -0.5720047876238823, + "advantage_std": 0.6662472151219845, + "completion_length": 2837.1876220703125, + "epoch": 0.3325714285714286, + "grad_norm": 1.2880405187606812, + "kl": 0.488037109375, + "lambda_div_used": 0.6, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0033, + "reward": -0.05876442790031433, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05876442790031433, + "reward_after_std": 0.6662472151219845, + "reward_before_mean": 0.31654983200132847, + "reward_before_std": 0.6410909574478865, + "reward_change_max": 0.0004924982786178589, + "reward_change_mean": -0.3753142673522234, + "reward_change_min": -0.7709561586380005, + "reward_change_std": 0.28246007952839136, + "reward_std": 0.666247233748436, + "rewards/cosine_scaled_reward": -0.12297508306801319, + "rewards/format_reward": 0.5625000018626451, + "step": 291 + }, + { + "advantage_max": 1.1201823130249977, + "advantage_mean": 4.96705393482344e-09, + "advantage_min": -0.626116156578064, + "advantage_std": 0.6261259578168392, + "completion_length": 2984.6250610351562, + "epoch": 0.33371428571428574, + "grad_norm": 0.36917972564697266, + "kl": 0.447021484375, + "lambda_div_used": 0.6, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0341, + "reward": -0.17521192249841988, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17521192249841988, + "reward_after_std": 0.6261259503662586, + "reward_before_mean": 0.1484000850468874, + "reward_before_std": 0.6327078305184841, + "reward_change_max": 0.0006989315152168274, + "reward_change_mean": -0.3236120007932186, + "reward_change_min": -0.611223328858614, + "reward_change_std": 0.2541379611939192, + "reward_std": 0.6261259652674198, + "rewards/cosine_scaled_reward": -0.16538330353796482, + "rewards/format_reward": 0.479166679084301, + "step": 292 + }, + { + "advantage_max": 1.234248362481594, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.6834790408611298, + "advantage_std": 0.679344616830349, + "completion_length": 2224.791702270508, + "epoch": 0.33485714285714285, + "grad_norm": 0.8810564875602722, + "kl": 0.2921142578125, + "lambda_div_used": 0.6, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0434, + "reward": 0.047704005148261786, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.047704005148261786, + "reward_after_std": 0.6793446019291878, + "reward_before_mean": 0.4809760805219412, + "reward_before_std": 0.6479660160839558, + "reward_change_max": 0.0020105764269828796, + "reward_change_mean": -0.4332720432430506, + "reward_change_min": -0.7641118578612804, + "reward_change_std": 0.29899533465504646, + "reward_std": 0.6793446280062199, + "rewards/cosine_scaled_reward": -0.09284532070159912, + "rewards/format_reward": 0.6666666809469461, + "step": 293 + }, + { + "advantage_max": 1.6753001362085342, + "advantage_mean": 5.587935614226325e-09, + "advantage_min": -0.8242075219750404, + "advantage_std": 0.9228464774787426, + "completion_length": 2752.104232788086, + "epoch": 0.336, + "grad_norm": 0.8626604080200195, + "kl": 0.436859130859375, + "lambda_div_used": 0.6, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0558, + "reward": 0.042919889092445374, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.042919889092445374, + "reward_after_std": 0.9228464849293232, + "reward_before_mean": 0.419957107398659, + "reward_before_std": 0.9347262866795063, + "reward_change_max": 0.0004857778549194336, + "reward_change_mean": -0.3770372150465846, + "reward_change_min": -0.7841157354414463, + "reward_change_std": 0.3177993157878518, + "reward_std": 0.9228465035557747, + "rewards/cosine_scaled_reward": 0.012061880202963948, + "rewards/format_reward": 0.39583334140479565, + "step": 294 + }, + { + "advantage_max": 1.5172969661653042, + "advantage_mean": -2.9802324275074454e-08, + "advantage_min": -0.8133530095219612, + "advantage_std": 0.8478995785117149, + "completion_length": 2791.770896911621, + "epoch": 0.33714285714285713, + "grad_norm": 0.6044384837150574, + "kl": 0.42376708984375, + "lambda_div_used": 0.6, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0297, + "reward": 0.18310544453561306, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18310544453561306, + "reward_after_std": 0.8478995636105537, + "reward_before_mean": 0.6625577192753553, + "reward_before_std": 0.8396427370607853, + "reward_change_max": 0.0008948817849159241, + "reward_change_mean": -0.4794522263109684, + "reward_change_min": -0.8777386695146561, + "reward_change_std": 0.36423896066844463, + "reward_std": 0.8478995896875858, + "rewards/cosine_scaled_reward": 0.10211216658353806, + "rewards/format_reward": 0.4583333395421505, + "step": 295 + }, + { + "advantage_max": 1.1875118091702461, + "advantage_mean": 1.3969839091076963e-08, + "advantage_min": -0.5539632812142372, + "advantage_std": 0.6299429349601269, + "completion_length": 3087.104217529297, + "epoch": 0.3382857142857143, + "grad_norm": 0.39692825078964233, + "kl": 0.503662109375, + "lambda_div_used": 0.6, + "learning_rate": 4.842626371469149e-07, + "loss": 0.042, + "reward": -0.1963654124410823, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1963654124410823, + "reward_after_std": 0.6299429349601269, + "reward_before_mean": 0.10445545858237892, + "reward_before_std": 0.5950967706739902, + "reward_change_max": 0.00122012197971344, + "reward_change_mean": -0.3008208554238081, + "reward_change_min": -0.5454224087297916, + "reward_change_std": 0.2132130330428481, + "reward_std": 0.6299429535865784, + "rewards/cosine_scaled_reward": -0.2081889410619624, + "rewards/format_reward": 0.5208333469927311, + "step": 296 + }, + { + "advantage_max": 1.7799878790974617, + "advantage_mean": 1.6453366002977532e-08, + "advantage_min": -0.6720851957798004, + "advantage_std": 0.9102919586002827, + "completion_length": 3144.8125762939453, + "epoch": 0.3394285714285714, + "grad_norm": 1.7664011716842651, + "kl": 0.5186767578125, + "lambda_div_used": 0.6, + "learning_rate": 4.811563736721829e-07, + "loss": 0.1237, + "reward": -0.2473097420297563, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2473097420297563, + "reward_after_std": 0.9102919548749924, + "reward_before_mean": -0.04021038330392912, + "reward_before_std": 0.8842969015240669, + "reward_change_max": 0.0011814385652542114, + "reward_change_mean": -0.2070993361994624, + "reward_change_min": -0.46454621106386185, + "reward_change_std": 0.18554504588246346, + "reward_std": 0.910291999578476, + "rewards/cosine_scaled_reward": -0.17635520159092266, + "rewards/format_reward": 0.31250000558793545, + "step": 297 + }, + { + "advantage_max": 1.4756926000118256, + "advantage_mean": -1.117587078436344e-08, + "advantage_min": -0.6293516084551811, + "advantage_std": 0.7791335694491863, + "completion_length": 2378.3333740234375, + "epoch": 0.3405714285714286, + "grad_norm": 0.26499053835868835, + "kl": 0.27886962890625, + "lambda_div_used": 0.6, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0181, + "reward": 0.03418944403529167, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03418944403529167, + "reward_after_std": 0.7791335843503475, + "reward_before_mean": 0.4335096925497055, + "reward_before_std": 0.7074716128408909, + "reward_change_max": 0.0010121092200279236, + "reward_change_mean": -0.3993202708661556, + "reward_change_min": -0.686762023717165, + "reward_change_std": 0.27223058976233006, + "reward_std": 0.779133602976799, + "rewards/cosine_scaled_reward": -0.10616182815283537, + "rewards/format_reward": 0.6458333469927311, + "step": 298 + }, + { + "advantage_max": 1.579380787909031, + "advantage_mean": 2.048909764074125e-08, + "advantage_min": -0.9291949793696404, + "advantage_std": 0.9145001582801342, + "completion_length": 3036.6875610351562, + "epoch": 0.3417142857142857, + "grad_norm": 0.5541883707046509, + "kl": 0.4439697265625, + "lambda_div_used": 0.6, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0378, + "reward": 0.09226292464882135, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09226292464882135, + "reward_after_std": 0.914500180631876, + "reward_before_mean": 0.5100114718079567, + "reward_before_std": 0.9902982860803604, + "reward_change_max": 0.0006761401891708374, + "reward_change_mean": -0.41774850990623236, + "reward_change_min": -0.9342790246009827, + "reward_change_std": 0.379890457727015, + "reward_std": 0.9145002067089081, + "rewards/cosine_scaled_reward": -0.015827607363462448, + "rewards/format_reward": 0.5416666753590107, + "step": 299 + }, + { + "advantage_max": 1.6460872441530228, + "advantage_mean": -1.1796752630299778e-08, + "advantage_min": -0.8271171525120735, + "advantage_std": 0.8936281614005566, + "completion_length": 2863.0834350585938, + "epoch": 0.34285714285714286, + "grad_norm": 0.44533947110176086, + "kl": 0.3939208984375, + "lambda_div_used": 0.6, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.018, + "reward": 0.044442882761359215, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.044442882761359215, + "reward_after_std": 0.8936281725764275, + "reward_before_mean": 0.4296601233072579, + "reward_before_std": 0.8874883726239204, + "reward_change_max": 0.001688636839389801, + "reward_change_mean": -0.38521726056933403, + "reward_change_min": -0.7698900923132896, + "reward_change_std": 0.299583924934268, + "reward_std": 0.893628191202879, + "rewards/cosine_scaled_reward": -0.03516994509845972, + "rewards/format_reward": 0.5000000149011612, + "step": 300 + }, + { + "advantage_max": 1.1703692227602005, + "advantage_mean": 4.0357309161187516e-09, + "advantage_min": -0.5398488789796829, + "advantage_std": 0.6378449760377407, + "completion_length": 2709.562545776367, + "epoch": 0.344, + "grad_norm": 0.35299813747406006, + "kl": 0.46417236328125, + "lambda_div_used": 0.6, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0359, + "reward": -0.0903393222251907, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0903393222251907, + "reward_after_std": 0.637844979763031, + "reward_before_mean": 0.272334769833833, + "reward_before_std": 0.5997465178370476, + "reward_change_max": 9.758025407791138e-05, + "reward_change_mean": -0.36267405189573765, + "reward_change_min": -0.6873607896268368, + "reward_change_std": 0.2660413235425949, + "reward_std": 0.6378450095653534, + "rewards/cosine_scaled_reward": -0.13466597348451614, + "rewards/format_reward": 0.5416666753590107, + "step": 301 + }, + { + "advantage_max": 1.3209370002150536, + "advantage_mean": 2.9181441651982e-08, + "advantage_min": -0.7568718492984772, + "advantage_std": 0.7378169521689415, + "completion_length": 2285.541732788086, + "epoch": 0.34514285714285714, + "grad_norm": 0.6973668336868286, + "kl": 0.36492919921875, + "lambda_div_used": 0.6, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0088, + "reward": 0.05789472348988056, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05789472348988056, + "reward_after_std": 0.7378169372677803, + "reward_before_mean": 0.48412832617759705, + "reward_before_std": 0.7305796556174755, + "reward_change_max": 0.00021295994520187378, + "reward_change_mean": -0.4262335952371359, + "reward_change_min": -0.7714013680815697, + "reward_change_std": 0.3117580823600292, + "reward_std": 0.7378169670701027, + "rewards/cosine_scaled_reward": -0.028769173979526386, + "rewards/format_reward": 0.5416666809469461, + "step": 302 + }, + { + "advantage_max": 1.374896951019764, + "advantage_mean": 1.7384688355548406e-08, + "advantage_min": -0.7600474506616592, + "advantage_std": 0.7772323749959469, + "completion_length": 2603.68754196167, + "epoch": 0.3462857142857143, + "grad_norm": 1.0664433240890503, + "kl": 0.5365142822265625, + "lambda_div_used": 0.6, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0164, + "reward": -0.05705134989693761, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05705134989693761, + "reward_after_std": 0.7772323749959469, + "reward_before_mean": 0.3000062759965658, + "reward_before_std": 0.8123316094279289, + "reward_change_max": 0.0020028576254844666, + "reward_change_mean": -0.3570576384663582, + "reward_change_min": -0.7174497433006763, + "reward_change_std": 0.29492284916341305, + "reward_std": 0.7772323973476887, + "rewards/cosine_scaled_reward": -0.17291352711617947, + "rewards/format_reward": 0.6458333507180214, + "step": 303 + }, + { + "advantage_max": 1.230701245367527, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.6204689294099808, + "advantage_std": 0.6633154228329659, + "completion_length": 2810.8126068115234, + "epoch": 0.3474285714285714, + "grad_norm": 0.49234816431999207, + "kl": 0.545654296875, + "lambda_div_used": 0.6, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0385, + "reward": -0.07311620330438018, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07311620330438018, + "reward_after_std": 0.6633154153823853, + "reward_before_mean": 0.29546307516284287, + "reward_before_std": 0.6205927468836308, + "reward_change_max": 0.000993296504020691, + "reward_change_mean": -0.36857929173856974, + "reward_change_min": -0.6805753968656063, + "reward_change_std": 0.25794041994959116, + "reward_std": 0.6633154340088367, + "rewards/cosine_scaled_reward": -0.18560180440545082, + "rewards/format_reward": 0.6666666883975267, + "step": 304 + }, + { + "advantage_max": 1.3669767379760742, + "advantage_mean": 1.0554989493538613e-08, + "advantage_min": -0.6285790093243122, + "advantage_std": 0.743263740092516, + "completion_length": 2943.229202270508, + "epoch": 0.3485714285714286, + "grad_norm": 0.5039094090461731, + "kl": 0.510009765625, + "lambda_div_used": 0.6, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0536, + "reward": -0.028486928436905146, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.028486928436905146, + "reward_after_std": 0.7432637549936771, + "reward_before_mean": 0.34963250160217285, + "reward_before_std": 0.7160216048359871, + "reward_change_max": 0.0010071024298667908, + "reward_change_mean": -0.37811941001564264, + "reward_change_min": -0.6920573115348816, + "reward_change_std": 0.2673141350969672, + "reward_std": 0.7432637773454189, + "rewards/cosine_scaled_reward": -0.15851709432899952, + "rewards/format_reward": 0.666666679084301, + "step": 305 + }, + { + "advantage_max": 1.5899255201220512, + "advantage_mean": -2.825011838347713e-08, + "advantage_min": -0.7959227114915848, + "advantage_std": 0.8745484538376331, + "completion_length": 2690.1459197998047, + "epoch": 0.3497142857142857, + "grad_norm": 0.4012378752231598, + "kl": 0.42352294921875, + "lambda_div_used": 0.6, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0389, + "reward": 0.36468219189555384, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.36468219189555384, + "reward_after_std": 0.8745484314858913, + "reward_before_mean": 0.9314363989979029, + "reward_before_std": 0.8190644308924675, + "reward_change_max": 0.0, + "reward_change_mean": -0.566754225641489, + "reward_change_min": -0.9831234477460384, + "reward_change_std": 0.3884328678250313, + "reward_std": 0.8745484426617622, + "rewards/cosine_scaled_reward": 0.08030151948332787, + "rewards/format_reward": 0.7708333544433117, + "step": 306 + }, + { + "advantage_max": 1.2558432668447495, + "advantage_mean": 4.0357312769412346e-09, + "advantage_min": -0.7692880481481552, + "advantage_std": 0.7128739319741726, + "completion_length": 2790.7291870117188, + "epoch": 0.35085714285714287, + "grad_norm": 0.741637647151947, + "kl": 0.482177734375, + "lambda_div_used": 0.6, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0262, + "reward": 0.011465976946055889, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.011465976946055889, + "reward_after_std": 0.7128739319741726, + "reward_before_mean": 0.42434445582330227, + "reward_before_std": 0.719377551227808, + "reward_change_max": 0.0, + "reward_change_mean": -0.41287848353385925, + "reward_change_min": -0.7506432235240936, + "reward_change_std": 0.30813906714320183, + "reward_std": 0.7128739431500435, + "rewards/cosine_scaled_reward": -0.12116110511124134, + "rewards/format_reward": 0.6666666883975267, + "step": 307 + }, + { + "advantage_max": 1.6454266011714935, + "advantage_mean": -1.8626453157644107e-09, + "advantage_min": -0.7154504768550396, + "advantage_std": 0.8662220053374767, + "completion_length": 3161.291748046875, + "epoch": 0.352, + "grad_norm": 0.6599620580673218, + "kl": 0.543212890625, + "lambda_div_used": 0.6, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0301, + "reward": -0.03038888319861144, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.03038888319861144, + "reward_after_std": 0.8662219978868961, + "reward_before_mean": 0.3153821690939367, + "reward_before_std": 0.8270813934504986, + "reward_change_max": 0.0, + "reward_change_mean": -0.3457710575312376, + "reward_change_min": -0.6896585188806057, + "reward_change_std": 0.2616619346663356, + "reward_std": 0.866222009062767, + "rewards/cosine_scaled_reward": -0.12355892173945904, + "rewards/format_reward": 0.5625000093132257, + "step": 308 + }, + { + "advantage_max": 1.464337058365345, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.7494636550545692, + "advantage_std": 0.8026157282292843, + "completion_length": 3033.3333740234375, + "epoch": 0.35314285714285715, + "grad_norm": 1.1581108570098877, + "kl": 0.415283203125, + "lambda_div_used": 0.6, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0976, + "reward": -0.01489423681050539, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.01489423681050539, + "reward_after_std": 0.8026156984269619, + "reward_before_mean": 0.35966885928064585, + "reward_before_std": 0.8019279204308987, + "reward_change_max": 0.0005437731742858887, + "reward_change_mean": -0.3745631221681833, + "reward_change_min": -0.7600988857448101, + "reward_change_std": 0.302903912961483, + "reward_std": 0.8026157319545746, + "rewards/cosine_scaled_reward": -0.10141556803137064, + "rewards/format_reward": 0.562500013038516, + "step": 309 + }, + { + "advantage_max": 1.4331334754824638, + "advantage_mean": 8.69234451084111e-09, + "advantage_min": -0.6336538568139076, + "advantage_std": 0.7798205390572548, + "completion_length": 2275.1458892822266, + "epoch": 0.35428571428571426, + "grad_norm": 0.4870387017726898, + "kl": 0.38494873046875, + "lambda_div_used": 0.6, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0438, + "reward": 0.017508748918771744, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.017508748918771744, + "reward_after_std": 0.7798205390572548, + "reward_before_mean": 0.4050704315304756, + "reward_before_std": 0.749288871884346, + "reward_change_max": 0.0, + "reward_change_mean": -0.3875616807490587, + "reward_change_min": -0.7554353252053261, + "reward_change_std": 0.2825618553906679, + "reward_std": 0.7798205390572548, + "rewards/cosine_scaled_reward": -0.14121478982269764, + "rewards/format_reward": 0.6875000093132257, + "step": 310 + }, + { + "advantage_max": 1.3528824746608734, + "advantage_mean": -1.5522043039783995e-08, + "advantage_min": -0.7126908525824547, + "advantage_std": 0.7338298484683037, + "completion_length": 2572.583427429199, + "epoch": 0.3554285714285714, + "grad_norm": 0.3856034576892853, + "kl": 0.32830810546875, + "lambda_div_used": 0.6, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0441, + "reward": 0.23825683817267418, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23825683817267418, + "reward_after_std": 0.7338298559188843, + "reward_before_mean": 0.7636316558346152, + "reward_before_std": 0.652825366705656, + "reward_change_max": 0.000289328396320343, + "reward_change_mean": -0.5253748074173927, + "reward_change_min": -0.8759537264704704, + "reward_change_std": 0.33924413844943047, + "reward_std": 0.7338298633694649, + "rewards/cosine_scaled_reward": 0.058899134397506714, + "rewards/format_reward": 0.6458333488553762, + "step": 311 + }, + { + "advantage_max": 1.5806788802146912, + "advantage_mean": -3.414849614191695e-08, + "advantage_min": -0.7388174682855606, + "advantage_std": 0.843331977725029, + "completion_length": 2201.1041870117188, + "epoch": 0.3565714285714286, + "grad_norm": 0.45848318934440613, + "kl": 0.295013427734375, + "lambda_div_used": 0.6, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0287, + "reward": 0.41449636314064264, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41449636314064264, + "reward_after_std": 0.8433319795876741, + "reward_before_mean": 1.010321255773306, + "reward_before_std": 0.7445136643946171, + "reward_change_max": 0.0019322633743286133, + "reward_change_mean": -0.5958249177783728, + "reward_change_min": -1.0556094981729984, + "reward_change_std": 0.38764845905825496, + "reward_std": 0.8433320187032223, + "rewards/cosine_scaled_reward": 0.18224396905861795, + "rewards/format_reward": 0.6458333507180214, + "step": 312 + }, + { + "advantage_max": 1.4623200297355652, + "advantage_mean": 2.545615118698663e-08, + "advantage_min": -0.8194458559155464, + "advantage_std": 0.8134392872452736, + "completion_length": 2906.083396911621, + "epoch": 0.3577142857142857, + "grad_norm": 0.3625169098377228, + "kl": 0.4002685546875, + "lambda_div_used": 0.6, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0427, + "reward": 0.26757752522826195, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.26757752522826195, + "reward_after_std": 0.8134392537176609, + "reward_before_mean": 0.7992300987243652, + "reward_before_std": 0.78279247879982, + "reward_change_max": 0.0, + "reward_change_mean": -0.5316525120288134, + "reward_change_min": -0.9376847445964813, + "reward_change_std": 0.3747759759426117, + "reward_std": 0.8134392835199833, + "rewards/cosine_scaled_reward": 0.13919836655259132, + "rewards/format_reward": 0.5208333544433117, + "step": 313 + }, + { + "advantage_max": 1.3962865434587002, + "advantage_mean": -4.34617203337595e-09, + "advantage_min": -0.8272168599069118, + "advantage_std": 0.7858781218528748, + "completion_length": 2466.541732788086, + "epoch": 0.3588571428571429, + "grad_norm": 0.4204121530056, + "kl": 0.306884765625, + "lambda_div_used": 0.6, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0102, + "reward": 0.10956137627363205, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10956137627363205, + "reward_after_std": 0.785878136754036, + "reward_before_mean": 0.5606746338307858, + "reward_before_std": 0.7906352952122688, + "reward_change_max": 0.0007210671901702881, + "reward_change_mean": -0.4511132426559925, + "reward_change_min": -0.8396630473434925, + "reward_change_std": 0.3383765425533056, + "reward_std": 0.7858781442046165, + "rewards/cosine_scaled_reward": -0.011329350993037224, + "rewards/format_reward": 0.583333345130086, + "step": 314 + }, + { + "advantage_max": 1.4159748256206512, + "advantage_mean": 3.725290464995368e-09, + "advantage_min": -0.7663627155125141, + "advantage_std": 0.7842210382223129, + "completion_length": 2764.104202270508, + "epoch": 0.36, + "grad_norm": 0.7089937925338745, + "kl": 0.353851318359375, + "lambda_div_used": 0.6, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.035, + "reward": 0.24042465770617127, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24042465770617127, + "reward_after_std": 0.7842210195958614, + "reward_before_mean": 0.7616824749857187, + "reward_before_std": 0.7417969591915607, + "reward_change_max": 0.0007342100143432617, + "reward_change_mean": -0.5212578698992729, + "reward_change_min": -0.957288570702076, + "reward_change_std": 0.36633536219596863, + "reward_std": 0.784221027046442, + "rewards/cosine_scaled_reward": 0.08917460031807423, + "rewards/format_reward": 0.5833333469927311, + "step": 315 + }, + { + "advantage_max": 1.3287958353757858, + "advantage_mean": -3.104408563547878e-09, + "advantage_min": -0.6579811051487923, + "advantage_std": 0.731446735560894, + "completion_length": 3304.0834045410156, + "epoch": 0.36114285714285715, + "grad_norm": 0.44676336646080017, + "kl": 0.4146728515625, + "lambda_div_used": 0.6, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0356, + "reward": -0.12744826450943947, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.12744826450943947, + "reward_after_std": 0.731446735560894, + "reward_before_mean": 0.1999343242496252, + "reward_before_std": 0.7348340712487698, + "reward_change_max": 0.0004966631531715393, + "reward_change_mean": -0.3273825887590647, + "reward_change_min": -0.6639083586633205, + "reward_change_std": 0.2678557615727186, + "reward_std": 0.731446772813797, + "rewards/cosine_scaled_reward": -0.18128284346312284, + "rewards/format_reward": 0.5625000111758709, + "step": 316 + }, + { + "advantage_max": 1.4074784219264984, + "advantage_mean": -4.967054045845742e-09, + "advantage_min": -0.8237576186656952, + "advantage_std": 0.7849172949790955, + "completion_length": 2783.5209045410156, + "epoch": 0.36228571428571427, + "grad_norm": 0.7022814750671387, + "kl": 0.3697509765625, + "lambda_div_used": 0.6, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.044, + "reward": 0.09272369928658009, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09272369928658009, + "reward_after_std": 0.7849173173308372, + "reward_before_mean": 0.5296291098929942, + "reward_before_std": 0.7874506935477257, + "reward_change_max": 0.0004237145185470581, + "reward_change_mean": -0.43690540827810764, + "reward_change_min": -0.8061187639832497, + "reward_change_std": 0.33239541947841644, + "reward_std": 0.7849173545837402, + "rewards/cosine_scaled_reward": -0.03726877458393574, + "rewards/format_reward": 0.6041666809469461, + "step": 317 + }, + { + "advantage_max": 1.751317985355854, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.77198251709342, + "advantage_std": 0.938833799213171, + "completion_length": 2228.666763305664, + "epoch": 0.36342857142857143, + "grad_norm": 0.5400987863540649, + "kl": 0.30938720703125, + "lambda_div_used": 0.6, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0088, + "reward": 0.12596396543085575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12596396543085575, + "reward_after_std": 0.9388338066637516, + "reward_before_mean": 0.541341919451952, + "reward_before_std": 0.9054854735732079, + "reward_change_max": 0.0015599653124809265, + "reward_change_mean": -0.4153779400512576, + "reward_change_min": -0.8198592364788055, + "reward_change_std": 0.31469852663576603, + "reward_std": 0.9388338439166546, + "rewards/cosine_scaled_reward": -0.09391238272655755, + "rewards/format_reward": 0.7291666902601719, + "step": 318 + }, + { + "advantage_max": 1.2390740811824799, + "advantage_mean": -4.967053657267684e-09, + "advantage_min": -0.6836567893624306, + "advantage_std": 0.6966056674718857, + "completion_length": 3035.0625610351562, + "epoch": 0.36457142857142855, + "grad_norm": 0.745254635810852, + "kl": 0.441162109375, + "lambda_div_used": 0.6, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0169, + "reward": -0.17109492549207062, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.17109492549207062, + "reward_after_std": 0.6966056749224663, + "reward_before_mean": 0.1381418565288186, + "reward_before_std": 0.7229567356407642, + "reward_change_max": 0.0007069632411003113, + "reward_change_mean": -0.30923677794635296, + "reward_change_min": -0.6699307486414909, + "reward_change_std": 0.26837681233882904, + "reward_std": 0.6966056935489178, + "rewards/cosine_scaled_reward": -0.19134573824703693, + "rewards/format_reward": 0.5208333395421505, + "step": 319 + }, + { + "advantage_max": 1.3388882502913475, + "advantage_mean": 2.7939676405797087e-09, + "advantage_min": -0.6234576627612114, + "advantage_std": 0.7290964983403683, + "completion_length": 2174.937545776367, + "epoch": 0.3657142857142857, + "grad_norm": 0.5195040106773376, + "kl": 0.28411865234375, + "lambda_div_used": 0.6, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0134, + "reward": 0.19459236381953815, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19459236381953815, + "reward_after_std": 0.7290964983403683, + "reward_before_mean": 0.691473500803113, + "reward_before_std": 0.6570716165006161, + "reward_change_max": 0.0, + "reward_change_mean": -0.49688112549483776, + "reward_change_min": -0.8854967355728149, + "reward_change_std": 0.3344013523310423, + "reward_std": 0.7290965430438519, + "rewards/cosine_scaled_reward": -0.0605132644996047, + "rewards/format_reward": 0.8125000074505806, + "step": 320 + }, + { + "advantage_max": 1.7560719028115273, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.8476521112024784, + "advantage_std": 0.9492146447300911, + "completion_length": 2399.916732788086, + "epoch": 0.3668571428571429, + "grad_norm": 1.3485333919525146, + "kl": 0.360137939453125, + "lambda_div_used": 0.6, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0709, + "reward": 0.32834796188399196, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32834796188399196, + "reward_after_std": 0.9492146372795105, + "reward_before_mean": 0.8588424747213139, + "reward_before_std": 0.8999190554022789, + "reward_change_max": 2.0541250705718994e-05, + "reward_change_mean": -0.530494537204504, + "reward_change_min": -1.0074727945029736, + "reward_change_std": 0.3776666931807995, + "reward_std": 0.9492146819829941, + "rewards/cosine_scaled_reward": 0.05442123394459486, + "rewards/format_reward": 0.7500000074505806, + "step": 321 + }, + { + "advantage_max": 1.445715993642807, + "advantage_mean": 4.967053768289986e-09, + "advantage_min": -0.7693274356424809, + "advantage_std": 0.8008316233754158, + "completion_length": 2231.7084045410156, + "epoch": 0.368, + "grad_norm": 1.13021981716156, + "kl": 0.28204345703125, + "lambda_div_used": 0.6, + "learning_rate": 4.0498043714627006e-07, + "loss": -0.0215, + "reward": -0.010291448445059359, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.010291448445059359, + "reward_after_std": 0.8008316159248352, + "reward_before_mean": 0.3594322365242988, + "reward_before_std": 0.8020209185779095, + "reward_change_max": 0.0005329325795173645, + "reward_change_mean": -0.36972369998693466, + "reward_change_min": -0.701385248452425, + "reward_change_std": 0.29476769641041756, + "reward_std": 0.8008316159248352, + "rewards/cosine_scaled_reward": -0.1744505581445992, + "rewards/format_reward": 0.7083333469927311, + "step": 322 + }, + { + "advantage_max": 1.3354713916778564, + "advantage_mean": 1.3038516211150153e-08, + "advantage_min": -0.6649999637156725, + "advantage_std": 0.7182338535785675, + "completion_length": 2482.291732788086, + "epoch": 0.36914285714285716, + "grad_norm": 0.6033332347869873, + "kl": 0.267822265625, + "lambda_div_used": 0.6, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0009, + "reward": 0.02367839589715004, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02367839589715004, + "reward_after_std": 0.7182338386774063, + "reward_before_mean": 0.4334249533712864, + "reward_before_std": 0.6694931406527758, + "reward_change_max": 0.0004015713930130005, + "reward_change_mean": -0.40974652022123337, + "reward_change_min": -0.7208868563175201, + "reward_change_std": 0.27913833782076836, + "reward_std": 0.7182338498532772, + "rewards/cosine_scaled_reward": -0.1270375456660986, + "rewards/format_reward": 0.6875000186264515, + "step": 323 + }, + { + "advantage_max": 1.4470875635743141, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.744611281901598, + "advantage_std": 0.7921902909874916, + "completion_length": 2745.3750610351562, + "epoch": 0.3702857142857143, + "grad_norm": 0.6796770095825195, + "kl": 0.3516845703125, + "lambda_div_used": 0.6, + "learning_rate": 3.9904679361238526e-07, + "loss": -0.0007, + "reward": 0.018107005394995213, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.018107005394995213, + "reward_after_std": 0.7921902686357498, + "reward_before_mean": 0.4099455289542675, + "reward_before_std": 0.7908703275024891, + "reward_change_max": 0.0, + "reward_change_mean": -0.3918385021388531, + "reward_change_min": -0.8086014091968536, + "reward_change_std": 0.30911492742598057, + "reward_std": 0.7921903058886528, + "rewards/cosine_scaled_reward": -0.12836058484390378, + "rewards/format_reward": 0.6666666753590107, + "step": 324 + }, + { + "advantage_max": 1.2714052945375443, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -0.7065200954675674, + "advantage_std": 0.7394993230700493, + "completion_length": 2904.125045776367, + "epoch": 0.37142857142857144, + "grad_norm": 0.4810188412666321, + "kl": 0.40203857421875, + "lambda_div_used": 0.6, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0351, + "reward": -0.09598593506962061, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.09598593506962061, + "reward_after_std": 0.7394993156194687, + "reward_before_mean": 0.2538135554641485, + "reward_before_std": 0.7984256558120251, + "reward_change_max": 0.001150481402873993, + "reward_change_mean": -0.3497994728386402, + "reward_change_min": -0.7644472010433674, + "reward_change_std": 0.3115408755838871, + "reward_std": 0.7394993454217911, + "rewards/cosine_scaled_reward": -0.13350990042090416, + "rewards/format_reward": 0.5208333414047956, + "step": 325 + }, + { + "advantage_max": 1.1877973601222038, + "advantage_mean": -1.3659397890553038e-08, + "advantage_min": -0.6837801039218903, + "advantage_std": 0.6724862232804298, + "completion_length": 2495.1250534057617, + "epoch": 0.37257142857142855, + "grad_norm": 0.4340304732322693, + "kl": 0.27520751953125, + "lambda_div_used": 0.6, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0059, + "reward": 0.0016772449016571045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0016772449016571045, + "reward_after_std": 0.6724862270057201, + "reward_before_mean": 0.4161319313570857, + "reward_before_std": 0.6646304540336132, + "reward_change_max": 0.00013080984354019165, + "reward_change_mean": -0.414454716257751, + "reward_change_min": -0.7668664567172527, + "reward_change_std": 0.3069826615974307, + "reward_std": 0.6724862344563007, + "rewards/cosine_scaled_reward": -0.11485069431364536, + "rewards/format_reward": 0.6458333376795053, + "step": 326 + }, + { + "advantage_max": 1.7221521884202957, + "advantage_mean": -1.1796752574788627e-08, + "advantage_min": -0.9973884001374245, + "advantage_std": 0.97059765458107, + "completion_length": 2255.8959045410156, + "epoch": 0.3737142857142857, + "grad_norm": 0.4257287383079529, + "kl": 0.24005126953125, + "lambda_div_used": 0.6, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0209, + "reward": 0.33461217768490314, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33461217768490314, + "reward_after_std": 0.9705976694822311, + "reward_before_mean": 0.872645559720695, + "reward_before_std": 0.9799929857254028, + "reward_change_max": 0.001069873571395874, + "reward_change_mean": -0.5380333829671144, + "reward_change_min": -1.0413878448307514, + "reward_change_std": 0.41941458359360695, + "reward_std": 0.9705976694822311, + "rewards/cosine_scaled_reward": 0.11340610310435295, + "rewards/format_reward": 0.6458333507180214, + "step": 327 + }, + { + "advantage_max": 0.9665052369236946, + "advantage_mean": 1.8626452158443385e-08, + "advantage_min": -0.5485107228159904, + "advantage_std": 0.538653664290905, + "completion_length": 3075.6458892822266, + "epoch": 0.37485714285714283, + "grad_norm": 0.35124486684799194, + "kl": 0.323455810546875, + "lambda_div_used": 0.6, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0528, + "reward": -0.2801295481622219, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2801295481622219, + "reward_after_std": 0.5386536531150341, + "reward_before_mean": 0.0012779205571860075, + "reward_before_std": 0.5384062603116035, + "reward_change_max": 0.0, + "reward_change_mean": -0.2814074568450451, + "reward_change_min": -0.5551128089427948, + "reward_change_std": 0.21838002931326628, + "reward_std": 0.5386536568403244, + "rewards/cosine_scaled_reward": -0.20769438333809376, + "rewards/format_reward": 0.4166666753590107, + "step": 328 + }, + { + "advantage_max": 1.6541471779346466, + "advantage_mean": 8.692344233285354e-09, + "advantage_min": -0.9075762033462524, + "advantage_std": 0.9191523641347885, + "completion_length": 2187.604223251343, + "epoch": 0.376, + "grad_norm": 0.6736992001533508, + "kl": 0.28594970703125, + "lambda_div_used": 0.6, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0252, + "reward": 0.11798514844849706, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11798514844849706, + "reward_after_std": 0.9191523641347885, + "reward_before_mean": 0.5414314409717917, + "reward_before_std": 0.9413018524646759, + "reward_change_max": 0.00015210360288619995, + "reward_change_mean": -0.4234462957829237, + "reward_change_min": -0.8685862831771374, + "reward_change_std": 0.34094572998583317, + "reward_std": 0.9191523678600788, + "rewards/cosine_scaled_reward": -0.010534274391829967, + "rewards/format_reward": 0.5625000149011612, + "step": 329 + }, + { + "advantage_max": 1.466382622718811, + "advantage_mean": 3.414849431004896e-09, + "advantage_min": -0.6231764815747738, + "advantage_std": 0.7823390513658524, + "completion_length": 2061.145896911621, + "epoch": 0.37714285714285717, + "grad_norm": 0.473938912153244, + "kl": 0.232666015625, + "lambda_div_used": 0.6, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0219, + "reward": 0.0415657889097929, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0415657889097929, + "reward_after_std": 0.7823390439152718, + "reward_before_mean": 0.4359978437423706, + "reward_before_std": 0.7304593771696091, + "reward_change_max": 0.0, + "reward_change_mean": -0.3944320511072874, + "reward_change_min": -0.6981092765927315, + "reward_change_std": 0.26706850342452526, + "reward_std": 0.7823390439152718, + "rewards/cosine_scaled_reward": -0.11533442325890064, + "rewards/format_reward": 0.6666666828095913, + "step": 330 + }, + { + "advantage_max": 1.2125908732414246, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.4857662245631218, + "advantage_std": 0.627432931214571, + "completion_length": 2819.250045776367, + "epoch": 0.3782857142857143, + "grad_norm": 0.5773917436599731, + "kl": 0.3829345703125, + "lambda_div_used": 0.6, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0198, + "reward": -0.22609689529053867, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22609689529053867, + "reward_after_std": 0.6274329125881195, + "reward_before_mean": 0.05956357158720493, + "reward_before_std": 0.5638223402202129, + "reward_change_max": 0.0006118118762969971, + "reward_change_mean": -0.28566044569015503, + "reward_change_min": -0.5349070765078068, + "reward_change_std": 0.19656004663556814, + "reward_std": 0.6274329200387001, + "rewards/cosine_scaled_reward": -0.17855155700817704, + "rewards/format_reward": 0.4166666753590107, + "step": 331 + }, + { + "advantage_max": 1.0075550377368927, + "advantage_mean": -6.364037768991082e-09, + "advantage_min": -0.47515266202390194, + "advantage_std": 0.537880215793848, + "completion_length": 2479.5209045410156, + "epoch": 0.37942857142857145, + "grad_norm": 0.8273375034332275, + "kl": 0.26776123046875, + "lambda_div_used": 0.6, + "learning_rate": 3.7561798609655373e-07, + "loss": -0.0073, + "reward": 0.01811327727045864, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.01811327727045864, + "reward_after_std": 0.5378802232444286, + "reward_before_mean": 0.46176561154425144, + "reward_before_std": 0.43859507143497467, + "reward_change_max": 0.0, + "reward_change_mean": -0.44365235418081284, + "reward_change_min": -0.6914606131613255, + "reward_change_std": 0.2646033428609371, + "reward_std": 0.5378802306950092, + "rewards/cosine_scaled_reward": -0.0712005328387022, + "rewards/format_reward": 0.6041666753590107, + "step": 332 + }, + { + "advantage_max": 1.6488222405314445, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.7030692547559738, + "advantage_std": 0.8828940503299236, + "completion_length": 2545.729248046875, + "epoch": 0.38057142857142856, + "grad_norm": 0.7539603114128113, + "kl": 0.21612548828125, + "lambda_div_used": 0.6, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0547, + "reward": 0.24899009801447392, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24899009801447392, + "reward_after_std": 0.882894080132246, + "reward_before_mean": 0.7488440172746778, + "reward_before_std": 0.8035490922629833, + "reward_change_max": 0.0008507147431373596, + "reward_change_mean": -0.4998539462685585, + "reward_change_min": -0.8932062350213528, + "reward_change_std": 0.33497907407581806, + "reward_std": 0.882894080132246, + "rewards/cosine_scaled_reward": -0.03182799264322966, + "rewards/format_reward": 0.8125000186264515, + "step": 333 + }, + { + "advantage_max": 1.4733685553073883, + "advantage_mean": 4.346172144398253e-09, + "advantage_min": -0.7610689476132393, + "advantage_std": 0.8075447678565979, + "completion_length": 3106.979248046875, + "epoch": 0.38171428571428573, + "grad_norm": 0.8128080368041992, + "kl": 0.3507080078125, + "lambda_div_used": 0.6, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0485, + "reward": -0.10258024837821722, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10258024837821722, + "reward_after_std": 0.8075447678565979, + "reward_before_mean": 0.2223309800028801, + "reward_before_std": 0.824804563075304, + "reward_change_max": 0.002238534390926361, + "reward_change_mean": -0.32491121254861355, + "reward_change_min": -0.6764849834144115, + "reward_change_std": 0.27632055804133415, + "reward_std": 0.8075447827577591, + "rewards/cosine_scaled_reward": -0.1700845193117857, + "rewards/format_reward": 0.5625000167638063, + "step": 334 + }, + { + "advantage_max": 1.2056212276220322, + "advantage_mean": -7.450580596923828e-09, + "advantage_min": -0.6192683838307858, + "advantage_std": 0.6638744994997978, + "completion_length": 2477.958396911621, + "epoch": 0.38285714285714284, + "grad_norm": 0.9470255970954895, + "kl": 0.269683837890625, + "lambda_div_used": 0.6, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0232, + "reward": -0.05806213865707832, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05806213865707832, + "reward_after_std": 0.6638744957745075, + "reward_before_mean": 0.3192488541826606, + "reward_before_std": 0.6370225492864847, + "reward_change_max": 0.0, + "reward_change_mean": -0.37731100153177977, + "reward_change_min": -0.67901411652565, + "reward_change_std": 0.27379969879984856, + "reward_std": 0.6638745106756687, + "rewards/cosine_scaled_reward": -0.11120891571044922, + "rewards/format_reward": 0.5416666809469461, + "step": 335 + }, + { + "advantage_max": 1.6421631425619125, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.955784484744072, + "advantage_std": 0.937946081161499, + "completion_length": 2682.0834197998047, + "epoch": 0.384, + "grad_norm": 0.47615712881088257, + "kl": 0.2991943359375, + "lambda_div_used": 0.6, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0559, + "reward": 0.08940086141228676, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08940086141228676, + "reward_after_std": 0.9379460960626602, + "reward_before_mean": 0.4974015187472105, + "reward_before_std": 1.001080434769392, + "reward_change_max": 0.003382869064807892, + "reward_change_mean": -0.40800066851079464, + "reward_change_min": -0.8810965530574322, + "reward_change_std": 0.37354159355163574, + "reward_std": 0.9379461221396923, + "rewards/cosine_scaled_reward": 0.009117423556745052, + "rewards/format_reward": 0.4791666828095913, + "step": 336 + }, + { + "advantage_max": 1.554929867386818, + "advantage_mean": -8.071462553882469e-09, + "advantage_min": -0.6676862463355064, + "advantage_std": 0.8406063243746758, + "completion_length": 2635.687545776367, + "epoch": 0.3851428571428571, + "grad_norm": 0.611068844795227, + "kl": 0.26171875, + "lambda_div_used": 0.6, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0136, + "reward": 0.01666065352037549, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.01666065352037549, + "reward_after_std": 0.8406063467264175, + "reward_before_mean": 0.39856263156980276, + "reward_before_std": 0.8238513432443142, + "reward_change_max": 0.0, + "reward_change_mean": -0.3819019701331854, + "reward_change_min": -0.8491205647587776, + "reward_change_std": 0.3106928654015064, + "reward_std": 0.8406063690781593, + "rewards/cosine_scaled_reward": -0.13405203144066036, + "rewards/format_reward": 0.6666666828095913, + "step": 337 + }, + { + "advantage_max": 1.361045453697443, + "advantage_mean": -1.303851654421706e-08, + "advantage_min": -0.605663824826479, + "advantage_std": 0.7161907237023115, + "completion_length": 2193.229232788086, + "epoch": 0.3862857142857143, + "grad_norm": 0.2578127682209015, + "kl": 0.21331787109375, + "lambda_div_used": 0.6, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0229, + "reward": 0.17649948690086603, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17649948690086603, + "reward_after_std": 0.7161907143890858, + "reward_before_mean": 0.6687635667622089, + "reward_before_std": 0.6105111464858055, + "reward_change_max": 0.0, + "reward_change_mean": -0.4922640845179558, + "reward_change_min": -0.8056236505508423, + "reward_change_std": 0.29702320881187916, + "reward_std": 0.716190755367279, + "rewards/cosine_scaled_reward": -0.03020156484853942, + "rewards/format_reward": 0.7291666809469461, + "step": 338 + }, + { + "advantage_max": 1.159138262271881, + "advantage_mean": -1.8626452602532595e-09, + "advantage_min": -0.6259487606585026, + "advantage_std": 0.6420266218483448, + "completion_length": 2907.5833587646484, + "epoch": 0.38742857142857146, + "grad_norm": 0.7145529985427856, + "kl": 0.2969970703125, + "lambda_div_used": 0.6, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0416, + "reward": -0.13409867510199547, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13409867510199547, + "reward_after_std": 0.6420266218483448, + "reward_before_mean": 0.20468079950660467, + "reward_before_std": 0.6328683495521545, + "reward_change_max": 0.0004741773009300232, + "reward_change_mean": -0.33877946995198727, + "reward_change_min": -0.6202888488769531, + "reward_change_std": 0.24783700983971357, + "reward_std": 0.6420266255736351, + "rewards/cosine_scaled_reward": -0.14765961794182658, + "rewards/format_reward": 0.5000000074505806, + "step": 339 + }, + { + "advantage_max": 1.1203817278146744, + "advantage_mean": 9.313226079221693e-09, + "advantage_min": -0.5899899490177631, + "advantage_std": 0.6205705478787422, + "completion_length": 2648.625045776367, + "epoch": 0.38857142857142857, + "grad_norm": 0.7546307444572449, + "kl": 0.317138671875, + "lambda_div_used": 0.6, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0546, + "reward": -0.01088642206741497, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.01088642206741497, + "reward_after_std": 0.6205705553293228, + "reward_before_mean": 0.39934197315596975, + "reward_before_std": 0.5859780795872211, + "reward_change_max": 0.0002032071352005005, + "reward_change_mean": -0.41022837720811367, + "reward_change_min": -0.718485102057457, + "reward_change_std": 0.27802254259586334, + "reward_std": 0.6205705776810646, + "rewards/cosine_scaled_reward": -0.13366236118599772, + "rewards/format_reward": 0.6666666809469461, + "step": 340 + }, + { + "advantage_max": 1.4242380373179913, + "advantage_mean": -9.31322596819939e-09, + "advantage_min": -0.628170371055603, + "advantage_std": 0.7521231174468994, + "completion_length": 2428.0625610351562, + "epoch": 0.38971428571428574, + "grad_norm": 0.1985091269016266, + "kl": 0.1912994384765625, + "lambda_div_used": 0.6, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0032, + "reward": 0.45150601863861084, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.45150601863861084, + "reward_after_std": 0.7521231323480606, + "reward_before_mean": 1.0890588415786624, + "reward_before_std": 0.6112305838614702, + "reward_change_max": 0.00016424059867858887, + "reward_change_mean": -0.637552828527987, + "reward_change_min": -0.9662577249109745, + "reward_change_std": 0.3721097456291318, + "reward_std": 0.7521231602877378, + "rewards/cosine_scaled_reward": 0.22161275381222367, + "rewards/format_reward": 0.6458333469927311, + "step": 341 + }, + { + "advantage_max": 1.6415075659751892, + "advantage_mean": -1.3038516322172455e-08, + "advantage_min": -0.7063800022006035, + "advantage_std": 0.8682013414800167, + "completion_length": 2603.62508392334, + "epoch": 0.39085714285714285, + "grad_norm": 0.45659804344177246, + "kl": 0.3743896484375, + "lambda_div_used": 0.6, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0122, + "reward": -0.05222347751259804, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05222347751259804, + "reward_after_std": 0.8682013303041458, + "reward_before_mean": 0.27251999638974667, + "reward_before_std": 0.8448606841266155, + "reward_change_max": 0.0, + "reward_change_mean": -0.32474347949028015, + "reward_change_min": -0.7152538634836674, + "reward_change_std": 0.2609505634754896, + "reward_std": 0.8682013601064682, + "rewards/cosine_scaled_reward": -0.12415667390450835, + "rewards/format_reward": 0.5208333469927311, + "step": 342 + }, + { + "advantage_max": 1.53754311054945, + "advantage_mean": -3.414849530924968e-08, + "advantage_min": -0.8002948388457298, + "advantage_std": 0.8562918156385422, + "completion_length": 2895.041748046875, + "epoch": 0.392, + "grad_norm": 0.7427569627761841, + "kl": 0.2850341796875, + "lambda_div_used": 0.6, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0269, + "reward": 0.15211634151637554, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15211634151637554, + "reward_after_std": 0.8562918230891228, + "reward_before_mean": 0.6098415553569794, + "reward_before_std": 0.8478217422962189, + "reward_change_max": 0.00046613067388534546, + "reward_change_mean": -0.4577252510935068, + "reward_change_min": -0.887396827340126, + "reward_change_std": 0.35419388487935066, + "reward_std": 0.8562918454408646, + "rewards/cosine_scaled_reward": 0.04450410744175315, + "rewards/format_reward": 0.5208333432674408, + "step": 343 + }, + { + "advantage_max": 1.3572538942098618, + "advantage_mean": -2.4214387106535895e-08, + "advantage_min": -0.7771032117307186, + "advantage_std": 0.7490360550582409, + "completion_length": 2392.708396911621, + "epoch": 0.3931428571428571, + "grad_norm": 0.37972065806388855, + "kl": 0.2348785400390625, + "lambda_div_used": 0.6, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0198, + "reward": 0.4535197149962187, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4535197149962187, + "reward_after_std": 0.7490360513329506, + "reward_before_mean": 1.1015524696558714, + "reward_before_std": 0.6594033464789391, + "reward_change_max": 0.0, + "reward_change_mean": -0.6480327611789107, + "reward_change_min": -1.0308993272483349, + "reward_change_std": 0.3941861046478152, + "reward_std": 0.7490360662341118, + "rewards/cosine_scaled_reward": 0.16535954643040895, + "rewards/format_reward": 0.7708333432674408, + "step": 344 + }, + { + "advantage_max": 1.4794992804527283, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.8345628753304482, + "advantage_std": 0.8316364139318466, + "completion_length": 2695.2917098999023, + "epoch": 0.3942857142857143, + "grad_norm": 0.9218659996986389, + "kl": 0.342010498046875, + "lambda_div_used": 0.6, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0479, + "reward": 0.1142323762178421, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1142323762178421, + "reward_after_std": 0.8316364139318466, + "reward_before_mean": 0.5572534778038971, + "reward_before_std": 0.8419076986610889, + "reward_change_max": 0.0008572712540626526, + "reward_change_mean": -0.44302110746502876, + "reward_change_min": -0.8450823724269867, + "reward_change_std": 0.3534926138818264, + "reward_std": 0.8316364288330078, + "rewards/cosine_scaled_reward": -0.03387327026575804, + "rewards/format_reward": 0.6250000167638063, + "step": 345 + }, + { + "advantage_max": 1.4657285958528519, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7588490657508373, + "advantage_std": 0.8019114583730698, + "completion_length": 2404.7709197998047, + "epoch": 0.3954285714285714, + "grad_norm": 0.458408921957016, + "kl": 0.28741455078125, + "lambda_div_used": 0.6, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0563, + "reward": 0.17914676276268438, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17914676276268438, + "reward_after_std": 0.8019114658236504, + "reward_before_mean": 0.6614518268033862, + "reward_before_std": 0.7597181908786297, + "reward_change_max": 0.0015663355588912964, + "reward_change_mean": -0.48230502754449844, + "reward_change_min": -0.9021615609526634, + "reward_change_std": 0.3469356056302786, + "reward_std": 0.801911499351263, + "rewards/cosine_scaled_reward": -0.013024099171161652, + "rewards/format_reward": 0.687500013038516, + "step": 346 + }, + { + "advantage_max": 1.2888052985072136, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.6205499656498432, + "advantage_std": 0.6916971392929554, + "completion_length": 2850.1459045410156, + "epoch": 0.3965714285714286, + "grad_norm": 1.236312747001648, + "kl": 0.4263916015625, + "lambda_div_used": 0.6, + "learning_rate": 3.3321084665422803e-07, + "loss": -0.0104, + "reward": -0.07637113053351641, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07637113053351641, + "reward_after_std": 0.6916971504688263, + "reward_before_mean": 0.28298189770430326, + "reward_before_std": 0.6537257945165038, + "reward_change_max": 0.0009397566318511963, + "reward_change_mean": -0.3593530207872391, + "reward_change_min": -0.6594051010906696, + "reward_change_std": 0.268814392387867, + "reward_std": 0.6916971765458584, + "rewards/cosine_scaled_reward": -0.20225906372070312, + "rewards/format_reward": 0.6875000055879354, + "step": 347 + }, + { + "advantage_max": 1.5206566154956818, + "advantage_mean": -4.03573130469681e-09, + "advantage_min": -0.7239486165344715, + "advantage_std": 0.8120072856545448, + "completion_length": 2620.1459045410156, + "epoch": 0.3977142857142857, + "grad_norm": 0.8215629458427429, + "kl": 0.445953369140625, + "lambda_div_used": 0.6, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0162, + "reward": 0.03434617887251079, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03434617887251079, + "reward_after_std": 0.8120072670280933, + "reward_before_mean": 0.42738607805222273, + "reward_before_std": 0.770159151405096, + "reward_change_max": 0.0014997944235801697, + "reward_change_mean": -0.39303990080952644, + "reward_change_min": -0.7212648764252663, + "reward_change_std": 0.2756068855524063, + "reward_std": 0.8120072782039642, + "rewards/cosine_scaled_reward": -0.11964030953822657, + "rewards/format_reward": 0.6666666846722364, + "step": 348 + }, + { + "advantage_max": 1.52374729514122, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.7489364519715309, + "advantage_std": 0.8232544809579849, + "completion_length": 2690.5834350585938, + "epoch": 0.39885714285714285, + "grad_norm": 0.9394125938415527, + "kl": 0.389373779296875, + "lambda_div_used": 0.6, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0465, + "reward": 0.016556567046791315, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.016556567046791315, + "reward_after_std": 0.8232545182108879, + "reward_before_mean": 0.39954477082937956, + "reward_before_std": 0.8015697486698627, + "reward_change_max": 0.00015076994895935059, + "reward_change_mean": -0.3829882238060236, + "reward_change_min": -0.7486894652247429, + "reward_change_std": 0.28054554015398026, + "reward_std": 0.8232545331120491, + "rewards/cosine_scaled_reward": -0.06064428063109517, + "rewards/format_reward": 0.5208333432674408, + "step": 349 + }, + { + "advantage_max": 1.5367366746068, + "advantage_mean": -9.313225746154785e-09, + "advantage_min": -0.6593474373221397, + "advantage_std": 0.809534341096878, + "completion_length": 2321.9375610351562, + "epoch": 0.4, + "grad_norm": 1.0577102899551392, + "kl": 0.451416015625, + "lambda_div_used": 0.6, + "learning_rate": 3.250000000000001e-07, + "loss": -0.0017, + "reward": 0.031073355115950108, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.031073355115950108, + "reward_after_std": 0.8095343261957169, + "reward_before_mean": 0.4226370635442436, + "reward_before_std": 0.7477500736713409, + "reward_change_max": 0.0005329474806785583, + "reward_change_mean": -0.3915637172758579, + "reward_change_min": -0.7794366888701916, + "reward_change_std": 0.2754232231527567, + "reward_std": 0.8095343485474586, + "rewards/cosine_scaled_reward": -0.09076481917873025, + "rewards/format_reward": 0.6041666828095913, + "step": 350 + }, + { + "advantage_max": 1.559445295482874, + "advantage_mean": -1.2417632477834672e-09, + "advantage_min": -0.9915315806865692, + "advantage_std": 0.9171740040183067, + "completion_length": 2582.5208740234375, + "epoch": 0.40114285714285713, + "grad_norm": 0.5777731537818909, + "kl": 0.343505859375, + "lambda_div_used": 0.6, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0083, + "reward": 0.33935420773923397, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33935420773923397, + "reward_after_std": 0.9171739853918552, + "reward_before_mean": 0.8997167088091373, + "reward_before_std": 0.9683718234300613, + "reward_change_max": 0.0, + "reward_change_mean": -0.5603625010699034, + "reward_change_min": -1.0499442629516125, + "reward_change_std": 0.4368437882512808, + "reward_std": 0.9171739853918552, + "rewards/cosine_scaled_reward": 0.07485833764076233, + "rewards/format_reward": 0.7500000149011612, + "step": 351 + }, + { + "advantage_max": 1.3428475260734558, + "advantage_mean": -6.8296991950766994e-09, + "advantage_min": -0.5568768233060837, + "advantage_std": 0.7152614146471024, + "completion_length": 2627.5833587646484, + "epoch": 0.4022857142857143, + "grad_norm": 0.8404443860054016, + "kl": 0.407501220703125, + "lambda_div_used": 0.6, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0119, + "reward": 0.12012724267970043, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12012724267970043, + "reward_after_std": 0.7152614146471024, + "reward_before_mean": 0.5803326356690377, + "reward_before_std": 0.6293035577982664, + "reward_change_max": 0.0056512728333473206, + "reward_change_mean": -0.46020539198070765, + "reward_change_min": -0.7574254907667637, + "reward_change_std": 0.3005659803748131, + "reward_std": 0.7152614295482635, + "rewards/cosine_scaled_reward": -0.043167030438780785, + "rewards/format_reward": 0.6666666734963655, + "step": 352 + }, + { + "advantage_max": 1.4187793508172035, + "advantage_mean": -1.8626451825376478e-08, + "advantage_min": -0.6966684348881245, + "advantage_std": 0.7699579037725925, + "completion_length": 2302.0000762939453, + "epoch": 0.4034285714285714, + "grad_norm": 0.48950308561325073, + "kl": 0.3104248046875, + "lambda_div_used": 0.6, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0198, + "reward": 0.4192593709449284, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4192593709449284, + "reward_after_std": 0.7699578814208508, + "reward_before_mean": 1.0370823116973042, + "reward_before_std": 0.6642060168087482, + "reward_change_max": 0.0, + "reward_change_mean": -0.6178229376673698, + "reward_change_min": -0.9664259105920792, + "reward_change_std": 0.3706759735941887, + "reward_std": 0.7699579186737537, + "rewards/cosine_scaled_reward": 0.08104114048182964, + "rewards/format_reward": 0.8750000111758709, + "step": 353 + }, + { + "advantage_max": 1.3239182755351067, + "advantage_mean": -2.1109979320144134e-08, + "advantage_min": -0.7877817675471306, + "advantage_std": 0.7418571058660746, + "completion_length": 2000.5417442321777, + "epoch": 0.4045714285714286, + "grad_norm": 0.5796849727630615, + "kl": 0.25811767578125, + "lambda_div_used": 0.6, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0468, + "reward": 0.25255117658525705, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.25255117658525705, + "reward_after_std": 0.7418571207672358, + "reward_before_mean": 0.7909721238538623, + "reward_before_std": 0.7007397972047329, + "reward_change_max": 0.0011800751090049744, + "reward_change_mean": -0.5384209705516696, + "reward_change_min": -0.8849985972046852, + "reward_change_std": 0.35534715466201305, + "reward_std": 0.7418571468442678, + "rewards/cosine_scaled_reward": 0.07256940566003323, + "rewards/format_reward": 0.6458333414047956, + "step": 354 + }, + { + "advantage_max": 1.7609666138887405, + "advantage_mean": -1.8626451825376478e-08, + "advantage_min": -0.7916901037096977, + "advantage_std": 0.9337159469723701, + "completion_length": 2235.041702270508, + "epoch": 0.4057142857142857, + "grad_norm": 0.5317572951316833, + "kl": 0.322509765625, + "lambda_div_used": 0.6, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0388, + "reward": 0.3990977890789509, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3990977890789509, + "reward_after_std": 0.9337159618735313, + "reward_before_mean": 0.9705567192286253, + "reward_before_std": 0.8250516392290592, + "reward_change_max": 0.0004812106490135193, + "reward_change_mean": -0.5714588910341263, + "reward_change_min": -0.9926439747214317, + "reward_change_std": 0.3731604963541031, + "reward_std": 0.9337159991264343, + "rewards/cosine_scaled_reward": 0.131111660040915, + "rewards/format_reward": 0.7083333395421505, + "step": 355 + }, + { + "advantage_max": 1.5015154480934143, + "advantage_mean": -1.4280279736489376e-08, + "advantage_min": -0.8326525948941708, + "advantage_std": 0.8183581568300724, + "completion_length": 2302.104232788086, + "epoch": 0.40685714285714286, + "grad_norm": 0.4310028553009033, + "kl": 0.28448486328125, + "lambda_div_used": 0.6, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0207, + "reward": 0.2149380873888731, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2149380873888731, + "reward_after_std": 0.8183581568300724, + "reward_before_mean": 0.7093568500131369, + "reward_before_std": 0.7762251533567905, + "reward_change_max": 0.0, + "reward_change_mean": -0.4944187719374895, + "reward_change_min": -0.8521025627851486, + "reward_change_std": 0.33673784136772156, + "reward_std": 0.8183581717312336, + "rewards/cosine_scaled_reward": 0.0630117617547512, + "rewards/format_reward": 0.5833333432674408, + "step": 356 + }, + { + "advantage_max": 1.3827911913394928, + "advantage_mean": -1.024454859832602e-08, + "advantage_min": -0.8060562089085579, + "advantage_std": 0.7842890731990337, + "completion_length": 2924.6458740234375, + "epoch": 0.408, + "grad_norm": 0.5307123064994812, + "kl": 0.4029541015625, + "lambda_div_used": 0.6, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0524, + "reward": 0.13803019496845081, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13803019496845081, + "reward_after_std": 0.7842890806496143, + "reward_before_mean": 0.603233439847827, + "reward_before_std": 0.7885280232876539, + "reward_change_max": 0.0005675703287124634, + "reward_change_mean": -0.46520326659083366, + "reward_change_min": -0.8700950965285301, + "reward_change_std": 0.34981742314994335, + "reward_std": 0.7842891179025173, + "rewards/cosine_scaled_reward": -0.0942166093736887, + "rewards/format_reward": 0.7916666939854622, + "step": 357 + }, + { + "advantage_max": 1.597570613026619, + "advantage_mean": -1.6142924885720333e-08, + "advantage_min": -0.905064333230257, + "advantage_std": 0.8988454565405846, + "completion_length": 2193.166732788086, + "epoch": 0.40914285714285714, + "grad_norm": 0.45412543416023254, + "kl": 0.317138671875, + "lambda_div_used": 0.6, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0298, + "reward": 0.4682135407347232, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4682135407347232, + "reward_after_std": 0.8988454788923264, + "reward_before_mean": 1.089598136022687, + "reward_before_std": 0.8698871731758118, + "reward_change_max": 0.0, + "reward_change_mean": -0.6213845498859882, + "reward_change_min": -1.0914196744561195, + "reward_change_std": 0.4229634404182434, + "reward_std": 0.8988455012440681, + "rewards/cosine_scaled_reward": 0.1697990447282791, + "rewards/format_reward": 0.7500000149011612, + "step": 358 + }, + { + "advantage_max": 1.3419801220297813, + "advantage_mean": 6.208817460162663e-09, + "advantage_min": -0.618110153824091, + "advantage_std": 0.7201356999576092, + "completion_length": 2469.8333740234375, + "epoch": 0.4102857142857143, + "grad_norm": 1.6776347160339355, + "kl": 0.29156494140625, + "lambda_div_used": 0.6, + "learning_rate": 3.0097380284049523e-07, + "loss": -0.0181, + "reward": 0.19314294261857867, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19314294261857867, + "reward_after_std": 0.7201356925070286, + "reward_before_mean": 0.6933410288766026, + "reward_before_std": 0.6368299573659897, + "reward_change_max": 0.0, + "reward_change_mean": -0.5001981183886528, + "reward_change_min": -0.8563997894525528, + "reward_change_std": 0.3123783506453037, + "reward_std": 0.7201357260346413, + "rewards/cosine_scaled_reward": -0.05957947578281164, + "rewards/format_reward": 0.8125000111758709, + "step": 359 + }, + { + "advantage_max": 1.674271047115326, + "advantage_mean": -1.8626452213954536e-08, + "advantage_min": -0.9208538271486759, + "advantage_std": 0.9478093385696411, + "completion_length": 2530.729248046875, + "epoch": 0.4114285714285714, + "grad_norm": 1.0951354503631592, + "kl": 0.30206298828125, + "lambda_div_used": 0.6, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0726, + "reward": 0.30874237247917335, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30874237247917335, + "reward_after_std": 0.9478093311190605, + "reward_before_mean": 0.8401943445205688, + "reward_before_std": 0.9553318619728088, + "reward_change_max": 0.00043720006942749023, + "reward_change_mean": -0.5314519703388214, + "reward_change_min": -1.0268866904079914, + "reward_change_std": 0.41884367167949677, + "reward_std": 0.9478093758225441, + "rewards/cosine_scaled_reward": 0.024263825733214617, + "rewards/format_reward": 0.7916666939854622, + "step": 360 + }, + { + "advantage_max": 1.4925316274166107, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.7755002863705158, + "advantage_std": 0.7997158244252205, + "completion_length": 2971.166717529297, + "epoch": 0.4125714285714286, + "grad_norm": 0.4740391969680786, + "kl": 0.4354248046875, + "lambda_div_used": 0.6, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0421, + "reward": 0.13191415555775166, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13191415555775166, + "reward_after_std": 0.7997158244252205, + "reward_before_mean": 0.5852155089378357, + "reward_before_std": 0.7439655251801014, + "reward_change_max": 0.0, + "reward_change_mean": -0.4533013366162777, + "reward_change_min": -0.7497556209564209, + "reward_change_std": 0.3015373144298792, + "reward_std": 0.7997158318758011, + "rewards/cosine_scaled_reward": -0.08239225693978369, + "rewards/format_reward": 0.7500000223517418, + "step": 361 + }, + { + "advantage_max": 1.263748835772276, + "advantage_mean": -1.2728076231871555e-08, + "advantage_min": -0.7216246016323566, + "advantage_std": 0.717674445360899, + "completion_length": 1809.4792175292969, + "epoch": 0.4137142857142857, + "grad_norm": 0.5852110385894775, + "kl": 0.2550048828125, + "lambda_div_used": 0.6, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0364, + "reward": 0.09919328487012535, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09919328487012535, + "reward_after_std": 0.7176744528114796, + "reward_before_mean": 0.5574979344382882, + "reward_before_std": 0.7097251052036881, + "reward_change_max": 0.0014338567852973938, + "reward_change_mean": -0.45830463618040085, + "reward_change_min": -0.8685710802674294, + "reward_change_std": 0.3367288615554571, + "reward_std": 0.7176744528114796, + "rewards/cosine_scaled_reward": -0.0858343681320548, + "rewards/format_reward": 0.7291666828095913, + "step": 362 + }, + { + "advantage_max": 1.4477052614092827, + "advantage_mean": -8.692344122263052e-09, + "advantage_min": -0.6502137240022421, + "advantage_std": 0.7600477300584316, + "completion_length": 1877.1042098999023, + "epoch": 0.41485714285714287, + "grad_norm": 0.4388819932937622, + "kl": 0.316253662109375, + "lambda_div_used": 0.6, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0497, + "reward": 0.12381302984431386, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12381302984431386, + "reward_after_std": 0.7600477263331413, + "reward_before_mean": 0.5726947877556086, + "reward_before_std": 0.6706202179193497, + "reward_change_max": 0.0, + "reward_change_mean": -0.44888175278902054, + "reward_change_min": -0.7377006858587265, + "reward_change_std": 0.2865642663091421, + "reward_std": 0.7600477486848831, + "rewards/cosine_scaled_reward": -0.06781928800046444, + "rewards/format_reward": 0.7083333469927311, + "step": 363 + }, + { + "advantage_max": 1.0907182395458221, + "advantage_mean": 1.8626449826975033e-09, + "advantage_min": -0.6054031513631344, + "advantage_std": 0.6060111746191978, + "completion_length": 2772.291748046875, + "epoch": 0.416, + "grad_norm": 0.3423631191253662, + "kl": 0.384765625, + "lambda_div_used": 0.6, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0308, + "reward": -0.1971070682629943, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1971070682629943, + "reward_after_std": 0.6060111969709396, + "reward_before_mean": 0.11542971897870302, + "reward_before_std": 0.6073710769414902, + "reward_change_max": 0.000298522412776947, + "reward_change_mean": -0.31253678910434246, + "reward_change_min": -0.6107521578669548, + "reward_change_std": 0.24526064470410347, + "reward_std": 0.606011226773262, + "rewards/cosine_scaled_reward": -0.2860351409763098, + "rewards/format_reward": 0.6875000149011612, + "step": 364 + }, + { + "advantage_max": 1.2950073555111885, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.7644446343183517, + "advantage_std": 0.7370525486767292, + "completion_length": 2624.729248046875, + "epoch": 0.41714285714285715, + "grad_norm": 0.7466763854026794, + "kl": 0.3004150390625, + "lambda_div_used": 0.6, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0028, + "reward": 0.08483308926224709, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08483308926224709, + "reward_after_std": 0.7370525635778904, + "reward_before_mean": 0.531798891723156, + "reward_before_std": 0.7544467896223068, + "reward_change_max": 0.0004674270749092102, + "reward_change_mean": -0.4469658061861992, + "reward_change_min": -0.8575878739356995, + "reward_change_std": 0.33127186819911003, + "reward_std": 0.7370525784790516, + "rewards/cosine_scaled_reward": -0.004933889955282211, + "rewards/format_reward": 0.5416666679084301, + "step": 365 + }, + { + "advantage_max": 1.6756793335080147, + "advantage_mean": -2.6697913713125132e-08, + "advantage_min": -0.8520163223147392, + "advantage_std": 0.9116014242172241, + "completion_length": 2026.9375457763672, + "epoch": 0.41828571428571426, + "grad_norm": 0.7191529870033264, + "kl": 0.226348876953125, + "lambda_div_used": 0.6, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0346, + "reward": 0.32207756396383047, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32207756396383047, + "reward_after_std": 0.9116014316678047, + "reward_before_mean": 0.8569716177880764, + "reward_before_std": 0.8602619133889675, + "reward_change_max": 0.005871579051017761, + "reward_change_mean": -0.5348940622061491, + "reward_change_min": -0.9633550941944122, + "reward_change_std": 0.385333601385355, + "reward_std": 0.9116014465689659, + "rewards/cosine_scaled_reward": 0.08473577909171581, + "rewards/format_reward": 0.6875000223517418, + "step": 366 + }, + { + "advantage_max": 1.5158573985099792, + "advantage_mean": 1.676380712023473e-08, + "advantage_min": -0.8118532225489616, + "advantage_std": 0.8657009229063988, + "completion_length": 2827.0208740234375, + "epoch": 0.41942857142857143, + "grad_norm": 0.9370729327201843, + "kl": 0.374908447265625, + "lambda_div_used": 0.6, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0387, + "reward": 0.0068329013884067535, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0068329013884067535, + "reward_after_std": 0.8657009117305279, + "reward_before_mean": 0.3866802779957652, + "reward_before_std": 0.9256978183984756, + "reward_change_max": 0.0018185079097747803, + "reward_change_mean": -0.3798473794013262, + "reward_change_min": -0.850589819252491, + "reward_change_std": 0.34286472108215094, + "reward_std": 0.86570093780756, + "rewards/cosine_scaled_reward": -0.0774932000786066, + "rewards/format_reward": 0.5416666828095913, + "step": 367 + }, + { + "advantage_max": 1.2594080492854118, + "advantage_mean": -5.277494885547185e-09, + "advantage_min": -0.5854111053049564, + "advantage_std": 0.6686476469039917, + "completion_length": 3070.0834045410156, + "epoch": 0.4205714285714286, + "grad_norm": 0.603573739528656, + "kl": 0.31689453125, + "lambda_div_used": 0.6, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.02, + "reward": 0.11028159782290459, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11028159782290459, + "reward_after_std": 0.6686476245522499, + "reward_before_mean": 0.5779907759279013, + "reward_before_std": 0.5762103162705898, + "reward_change_max": 0.0005208030343055725, + "reward_change_mean": -0.46770920045673847, + "reward_change_min": -0.7650781571865082, + "reward_change_std": 0.2938290312886238, + "reward_std": 0.668647650629282, + "rewards/cosine_scaled_reward": -0.054754629731178284, + "rewards/format_reward": 0.687500013038516, + "step": 368 + }, + { + "advantage_max": 1.7339719384908676, + "advantage_mean": -6.829698862009792e-09, + "advantage_min": -0.8681919798254967, + "advantage_std": 0.9657414592802525, + "completion_length": 2561.6459350585938, + "epoch": 0.4217142857142857, + "grad_norm": 0.8986064791679382, + "kl": 0.28192138671875, + "lambda_div_used": 0.6, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0236, + "reward": 0.07137996330857277, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07137996330857277, + "reward_after_std": 0.9657414592802525, + "reward_before_mean": 0.4605141952633858, + "reward_before_std": 1.0080587305128574, + "reward_change_max": 0.0023592039942741394, + "reward_change_mean": -0.38913422264158726, + "reward_change_min": -0.8434956669807434, + "reward_change_std": 0.34523880016058683, + "reward_std": 0.9657415077090263, + "rewards/cosine_scaled_reward": -0.07182625401765108, + "rewards/format_reward": 0.6041666772216558, + "step": 369 + }, + { + "advantage_max": 1.1579053699970245, + "advantage_mean": 2.5456151797609294e-08, + "advantage_min": -0.5593762882053852, + "advantage_std": 0.6268951632082462, + "completion_length": 2476.4375762939453, + "epoch": 0.4228571428571429, + "grad_norm": 0.6041207909584045, + "kl": 0.2985076904296875, + "lambda_div_used": 0.6, + "learning_rate": 2.729523361034538e-07, + "loss": -0.0035, + "reward": 0.02673279179725796, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02673279179725796, + "reward_after_std": 0.6268951632082462, + "reward_before_mean": 0.45535836229100823, + "reward_before_std": 0.5718512460589409, + "reward_change_max": 0.00044342875480651855, + "reward_change_mean": -0.428625563159585, + "reward_change_min": -0.7268031612038612, + "reward_change_std": 0.28326542116701603, + "reward_std": 0.6268951743841171, + "rewards/cosine_scaled_reward": -0.08482082560658455, + "rewards/format_reward": 0.6250000186264515, + "step": 370 + }, + { + "advantage_max": 1.182312160730362, + "advantage_mean": -7.450580929990736e-09, + "advantage_min": -0.6014973521232605, + "advantage_std": 0.6524987481534481, + "completion_length": 1996.7083435058594, + "epoch": 0.424, + "grad_norm": 0.3128429651260376, + "kl": 0.255828857421875, + "lambda_div_used": 0.6, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0346, + "reward": 0.329143688082695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.329143688082695, + "reward_after_std": 0.6524987518787384, + "reward_before_mean": 0.9214862994849682, + "reward_before_std": 0.5499526411294937, + "reward_change_max": 0.00031290203332901, + "reward_change_mean": -0.5923426318913698, + "reward_change_min": -0.95097865909338, + "reward_change_std": 0.3701365366578102, + "reward_std": 0.6524987779557705, + "rewards/cosine_scaled_reward": 0.0857431460171938, + "rewards/format_reward": 0.7500000055879354, + "step": 371 + }, + { + "advantage_max": 1.4317540675401688, + "advantage_mean": -1.800557042352935e-08, + "advantage_min": -0.6863468810915947, + "advantage_std": 0.752282090485096, + "completion_length": 2944.2500915527344, + "epoch": 0.42514285714285716, + "grad_norm": 0.6662515997886658, + "kl": 0.368804931640625, + "lambda_div_used": 0.6, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0396, + "reward": 0.14754285477101803, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14754285477101803, + "reward_after_std": 0.7522820681333542, + "reward_before_mean": 0.6144732628017664, + "reward_before_std": 0.6542432829737663, + "reward_change_max": 0.00022286921739578247, + "reward_change_mean": -0.46693046763539314, + "reward_change_min": -0.7703455798327923, + "reward_change_std": 0.3007415384054184, + "reward_std": 0.7522820681333542, + "rewards/cosine_scaled_reward": 0.00515330582857132, + "rewards/format_reward": 0.6041666716337204, + "step": 372 + }, + { + "advantage_max": 1.57389435172081, + "advantage_mean": -2.297262396977473e-08, + "advantage_min": -0.8639702498912811, + "advantage_std": 0.8862636424601078, + "completion_length": 1759.6667175292969, + "epoch": 0.42628571428571427, + "grad_norm": 0.27647003531455994, + "kl": 0.2136993408203125, + "lambda_div_used": 0.6, + "learning_rate": 2.655868138008171e-07, + "loss": -0.0049, + "reward": 0.1719374004751444, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1719374004751444, + "reward_after_std": 0.886263657361269, + "reward_before_mean": 0.6331824623048306, + "reward_before_std": 0.9114639163017273, + "reward_change_max": 0.00457899272441864, + "reward_change_mean": -0.4612450823187828, + "reward_change_min": -0.9425050392746925, + "reward_change_std": 0.37030158564448357, + "reward_std": 0.8862636797130108, + "rewards/cosine_scaled_reward": -0.03757543582469225, + "rewards/format_reward": 0.7083333469927311, + "step": 373 + }, + { + "advantage_max": 1.6222765147686005, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.8567904755473137, + "advantage_std": 0.8852887079119682, + "completion_length": 2289.354217529297, + "epoch": 0.42742857142857144, + "grad_norm": 0.6635468602180481, + "kl": 0.295684814453125, + "lambda_div_used": 0.6, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0296, + "reward": 0.23773611336946487, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23773611336946487, + "reward_after_std": 0.8852887004613876, + "reward_before_mean": 0.7330911438912153, + "reward_before_std": 0.8372438997030258, + "reward_change_max": 0.0004977062344551086, + "reward_change_mean": -0.4953550100326538, + "reward_change_min": -0.85561952739954, + "reward_change_std": 0.35369635559618473, + "reward_std": 0.8852887377142906, + "rewards/cosine_scaled_reward": 0.054045562632381916, + "rewards/format_reward": 0.6250000186264515, + "step": 374 + }, + { + "advantage_max": 1.8102723434567451, + "advantage_mean": -6.208817460162663e-09, + "advantage_min": -0.8086415976285934, + "advantage_std": 0.9527671858668327, + "completion_length": 2324.6458740234375, + "epoch": 0.42857142857142855, + "grad_norm": 0.6844770312309265, + "kl": 0.289886474609375, + "lambda_div_used": 0.6, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0403, + "reward": 0.1343357115983963, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1343357115983963, + "reward_after_std": 0.9527672007679939, + "reward_before_mean": 0.5494648832827806, + "reward_before_std": 0.8957410082221031, + "reward_change_max": 0.0022966861724853516, + "reward_change_mean": -0.4151291511952877, + "reward_change_min": -0.7877466715872288, + "reward_change_std": 0.3019845802336931, + "reward_std": 0.9527672305703163, + "rewards/cosine_scaled_reward": -0.08985091745853424, + "rewards/format_reward": 0.7291666902601719, + "step": 375 + }, + { + "advantage_max": 1.3477432578802109, + "advantage_mean": 6.208817682207268e-09, + "advantage_min": -0.7531176581978798, + "advantage_std": 0.7548925988376141, + "completion_length": 2447.2917251586914, + "epoch": 0.4297142857142857, + "grad_norm": 1.0445984601974487, + "kl": 0.40826416015625, + "lambda_div_used": 0.6, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0045, + "reward": 0.044009771198034286, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.044009771198034286, + "reward_after_std": 0.7548925764858723, + "reward_before_mean": 0.46251408383250237, + "reward_before_std": 0.7559454832226038, + "reward_change_max": 0.0011388212442398071, + "reward_change_mean": -0.41850431449711323, + "reward_change_min": -0.7947998829185963, + "reward_change_std": 0.3245093934237957, + "reward_std": 0.7548926025629044, + "rewards/cosine_scaled_reward": -0.12290963158011436, + "rewards/format_reward": 0.7083333507180214, + "step": 376 + }, + { + "advantage_max": 1.4688562378287315, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.6980493329465389, + "advantage_std": 0.7906424179673195, + "completion_length": 2726.3541870117188, + "epoch": 0.4308571428571429, + "grad_norm": 0.40274783968925476, + "kl": 0.328857421875, + "lambda_div_used": 0.6, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0206, + "reward": 0.2742973640561104, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2742973640561104, + "reward_after_std": 0.7906424179673195, + "reward_before_mean": 0.8079462740570307, + "reward_before_std": 0.6935162991285324, + "reward_change_max": 0.0008223950862884521, + "reward_change_mean": -0.5336488857865334, + "reward_change_min": -0.8744311928749084, + "reward_change_std": 0.3369473982602358, + "reward_std": 0.7906424328684807, + "rewards/cosine_scaled_reward": 0.03938978351652622, + "rewards/format_reward": 0.729166679084301, + "step": 377 + }, + { + "advantage_max": 1.7968226224184036, + "advantage_mean": -7.450580929990736e-09, + "advantage_min": -0.9664948359131813, + "advantage_std": 0.9928526803851128, + "completion_length": 2514.2709159851074, + "epoch": 0.432, + "grad_norm": 0.9523245692253113, + "kl": 0.306884765625, + "lambda_div_used": 0.6, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0033, + "reward": 0.536355035379529, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.536355035379529, + "reward_after_std": 0.9928526468575001, + "reward_before_mean": 1.1798031572252512, + "reward_before_std": 0.9377436637878418, + "reward_change_max": 0.0010698065161705017, + "reward_change_mean": -0.6434481181204319, + "reward_change_min": -1.0964898467063904, + "reward_change_std": 0.43880243599414825, + "reward_std": 0.9928526617586613, + "rewards/cosine_scaled_reward": 0.17323489300906658, + "rewards/format_reward": 0.8333333432674408, + "step": 378 + }, + { + "advantage_max": 1.465286523103714, + "advantage_mean": -2.2895014217816367e-09, + "advantage_min": -0.7986491471529007, + "advantage_std": 0.8143725395202637, + "completion_length": 2638.291702270508, + "epoch": 0.43314285714285716, + "grad_norm": 0.5379257202148438, + "kl": 0.4554443359375, + "lambda_div_used": 0.6, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0501, + "reward": 0.009639232186600566, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.009639232186600566, + "reward_after_std": 0.8143725544214249, + "reward_before_mean": 0.39682869613170624, + "reward_before_std": 0.832601822912693, + "reward_change_max": 0.00031301379203796387, + "reward_change_mean": -0.3871894534677267, + "reward_change_min": -0.7461157105863094, + "reward_change_std": 0.301790377125144, + "reward_std": 0.8143725767731667, + "rewards/cosine_scaled_reward": -0.13491899985820055, + "rewards/format_reward": 0.6666666939854622, + "step": 379 + }, + { + "advantage_max": 1.2629759535193443, + "advantage_mean": -1.6763806953701277e-08, + "advantage_min": -0.8979735262691975, + "advantage_std": 0.7433908097445965, + "completion_length": 2067.666732788086, + "epoch": 0.4342857142857143, + "grad_norm": 0.7482815980911255, + "kl": 0.200897216796875, + "lambda_div_used": 0.6, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0249, + "reward": 0.12406534794718027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12406534794718027, + "reward_after_std": 0.7433908171951771, + "reward_before_mean": 0.5958141903392971, + "reward_before_std": 0.7890961095690727, + "reward_change_max": 0.0007266402244567871, + "reward_change_mean": -0.47174884378910065, + "reward_change_min": -0.871945433318615, + "reward_change_std": 0.3627962898463011, + "reward_std": 0.7433908544480801, + "rewards/cosine_scaled_reward": -0.03542623296380043, + "rewards/format_reward": 0.666666679084301, + "step": 380 + }, + { + "advantage_max": 1.2968988865613937, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.703218549489975, + "advantage_std": 0.7332097329199314, + "completion_length": 2710.437530517578, + "epoch": 0.43542857142857144, + "grad_norm": 0.4402064085006714, + "kl": 0.405792236328125, + "lambda_div_used": 0.6, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0172, + "reward": 0.010914841666817665, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.010914841666817665, + "reward_after_std": 0.7332097329199314, + "reward_before_mean": 0.41524326987564564, + "reward_before_std": 0.7420315518975258, + "reward_change_max": 0.0018931254744529724, + "reward_change_mean": -0.4043284226208925, + "reward_change_min": -0.7467291578650475, + "reward_change_std": 0.3047961834818125, + "reward_std": 0.733209740370512, + "rewards/cosine_scaled_reward": -0.13612838461995125, + "rewards/format_reward": 0.6875000111758709, + "step": 381 + }, + { + "advantage_max": 1.5032060518860817, + "advantage_mean": 6.829699084054397e-09, + "advantage_min": -0.7504288945347071, + "advantage_std": 0.7993675842881203, + "completion_length": 2214.041748046875, + "epoch": 0.43657142857142855, + "grad_norm": 0.4265294671058655, + "kl": 0.3445281982421875, + "lambda_div_used": 0.6, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0432, + "reward": 0.11006945720873773, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11006945720873773, + "reward_after_std": 0.7993675693869591, + "reward_before_mean": 0.5445982171222568, + "reward_before_std": 0.7350498847663403, + "reward_change_max": 0.0, + "reward_change_mean": -0.43452877178788185, + "reward_change_min": -0.7362462729215622, + "reward_change_std": 0.29714934155344963, + "reward_std": 0.7993675693869591, + "rewards/cosine_scaled_reward": -0.11311756214126945, + "rewards/format_reward": 0.7708333507180214, + "step": 382 + }, + { + "advantage_max": 1.6201047748327255, + "advantage_mean": -2.607703353252333e-08, + "advantage_min": -0.8351623527705669, + "advantage_std": 0.8896178603172302, + "completion_length": 3022.916748046875, + "epoch": 0.4377142857142857, + "grad_norm": 0.9334142208099365, + "kl": 0.3941650390625, + "lambda_div_used": 0.6, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0189, + "reward": 0.4147815710166469, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4147815710166469, + "reward_after_std": 0.8896179012954235, + "reward_before_mean": 1.0097543355077505, + "reward_before_std": 0.8233935404568911, + "reward_change_max": 0.0, + "reward_change_mean": -0.5949728116393089, + "reward_change_min": -1.0136651918292046, + "reward_change_std": 0.3960152920335531, + "reward_std": 0.8896179012954235, + "rewards/cosine_scaled_reward": 0.12987715937197208, + "rewards/format_reward": 0.7500000111758709, + "step": 383 + }, + { + "advantage_max": 1.6311913207173347, + "advantage_mean": -4.5324366204635425e-08, + "advantage_min": -0.8875380381941795, + "advantage_std": 0.9250488579273224, + "completion_length": 1581.3125305175781, + "epoch": 0.43885714285714283, + "grad_norm": 0.9882168173789978, + "kl": 0.163604736328125, + "lambda_div_used": 0.6, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0593, + "reward": 0.5434218298178166, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5434218298178166, + "reward_after_std": 0.9250488504767418, + "reward_before_mean": 1.2076098858378828, + "reward_before_std": 0.8887167200446129, + "reward_change_max": 0.0017512813210487366, + "reward_change_mean": -0.664188090711832, + "reward_change_min": -1.1719777584075928, + "reward_change_std": 0.461810689419508, + "reward_std": 0.9250488728284836, + "rewards/cosine_scaled_reward": 0.26005493476986885, + "rewards/format_reward": 0.6875000074505806, + "step": 384 + }, + { + "advantage_max": 1.601989060640335, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.7441845238208771, + "advantage_std": 0.8422457277774811, + "completion_length": 2135.791732788086, + "epoch": 0.44, + "grad_norm": 0.6853197813034058, + "kl": 0.221923828125, + "lambda_div_used": 0.6, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0038, + "reward": 0.20848367968574166, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20848367968574166, + "reward_after_std": 0.8422457128763199, + "reward_before_mean": 0.6883424371480942, + "reward_before_std": 0.7463456504046917, + "reward_change_max": 0.0, + "reward_change_mean": -0.47985872626304626, + "reward_change_min": -0.8243284970521927, + "reward_change_std": 0.3069882392883301, + "reward_std": 0.8422457501292229, + "rewards/cosine_scaled_reward": -0.030828803312033415, + "rewards/format_reward": 0.7500000260770321, + "step": 385 + }, + { + "advantage_max": 1.5159094631671906, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.7980938293039799, + "advantage_std": 0.8149101585149765, + "completion_length": 2838.8750915527344, + "epoch": 0.44114285714285717, + "grad_norm": 0.46451351046562195, + "kl": 0.37481689453125, + "lambda_div_used": 0.6, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0264, + "reward": 0.2753634084947407, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2753634084947407, + "reward_after_std": 0.8149101585149765, + "reward_before_mean": 0.8048880062997341, + "reward_before_std": 0.7382436729967594, + "reward_change_max": 0.0003626197576522827, + "reward_change_mean": -0.5295246057212353, + "reward_change_min": -0.8955358266830444, + "reward_change_std": 0.346976475790143, + "reward_std": 0.8149101696908474, + "rewards/cosine_scaled_reward": 0.058694007340818644, + "rewards/format_reward": 0.6875000111758709, + "step": 386 + }, + { + "advantage_max": 1.5711029320955276, + "advantage_mean": 1.8626450937198058e-09, + "advantage_min": -0.7883075922727585, + "advantage_std": 0.8619452305138111, + "completion_length": 2691.604232788086, + "epoch": 0.4422857142857143, + "grad_norm": 0.39026057720184326, + "kl": 0.354156494140625, + "lambda_div_used": 0.6, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0308, + "reward": 0.08357733162119985, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08357733162119985, + "reward_after_std": 0.8619452491402626, + "reward_before_mean": 0.4990644520148635, + "reward_before_std": 0.8501224853098392, + "reward_change_max": 0.0007921233773231506, + "reward_change_mean": -0.41548711247742176, + "reward_change_min": -0.7823773622512817, + "reward_change_std": 0.32049538008868694, + "reward_std": 0.8619452938437462, + "rewards/cosine_scaled_reward": -0.08380109909921885, + "rewards/format_reward": 0.6666666734963655, + "step": 387 + }, + { + "advantage_max": 1.3556857779622078, + "advantage_mean": 2.4835265510780857e-09, + "advantage_min": -0.755245964974165, + "advantage_std": 0.7445164695382118, + "completion_length": 2470.8334045410156, + "epoch": 0.44342857142857145, + "grad_norm": 0.5328013896942139, + "kl": 0.38946533203125, + "lambda_div_used": 0.6, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0397, + "reward": 0.27331763273105025, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.27331763273105025, + "reward_after_std": 0.7445164769887924, + "reward_before_mean": 0.8157808966934681, + "reward_before_std": 0.6723228208720684, + "reward_change_max": 0.0001545920968055725, + "reward_change_mean": -0.5424632374197245, + "reward_change_min": -0.9075334519147873, + "reward_change_std": 0.3518282901495695, + "reward_std": 0.7445164918899536, + "rewards/cosine_scaled_reward": 0.064140435308218, + "rewards/format_reward": 0.6875000242143869, + "step": 388 + }, + { + "advantage_max": 1.287089891731739, + "advantage_mean": -6.208817571184966e-09, + "advantage_min": -0.6707641631364822, + "advantage_std": 0.7045476697385311, + "completion_length": 2096.25008392334, + "epoch": 0.44457142857142856, + "grad_norm": 0.724722146987915, + "kl": 0.24822998046875, + "lambda_div_used": 0.6, + "learning_rate": 2.2848729416523859e-07, + "loss": -0.0112, + "reward": 0.03399020340293646, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03399020340293646, + "reward_after_std": 0.7045476622879505, + "reward_before_mean": 0.45298708602786064, + "reward_before_std": 0.6788679845631123, + "reward_change_max": 0.0008410587906837463, + "reward_change_mean": -0.41899688611738384, + "reward_change_min": -0.7899326011538506, + "reward_change_std": 0.2929133272264153, + "reward_std": 0.7045476697385311, + "rewards/cosine_scaled_reward": -0.10683980397880077, + "rewards/format_reward": 0.6666666772216558, + "step": 389 + }, + { + "advantage_max": 1.5262744650244713, + "advantage_mean": -1.1796752852344383e-08, + "advantage_min": -0.7143342308700085, + "advantage_std": 0.8563202805817127, + "completion_length": 2600.5625762939453, + "epoch": 0.44571428571428573, + "grad_norm": 1.5622819662094116, + "kl": 0.375518798828125, + "lambda_div_used": 0.6, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.035, + "reward": 0.043615717673674226, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.043615717673674226, + "reward_after_std": 0.8563202731311321, + "reward_before_mean": 0.44365750439465046, + "reward_before_std": 0.8856248073279858, + "reward_change_max": 0.003797389566898346, + "reward_change_mean": -0.40004180651158094, + "reward_change_min": -0.9396551251411438, + "reward_change_std": 0.34944793209433556, + "reward_std": 0.8563202954828739, + "rewards/cosine_scaled_reward": -0.10108792106620967, + "rewards/format_reward": 0.6458333432674408, + "step": 390 + }, + { + "advantage_max": 1.7013674080371857, + "advantage_mean": -9.313225857177088e-09, + "advantage_min": -0.8481043614447117, + "advantage_std": 0.9216614998877048, + "completion_length": 2286.1042098999023, + "epoch": 0.44685714285714284, + "grad_norm": 0.7252182960510254, + "kl": 0.368560791015625, + "lambda_div_used": 0.6, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0725, + "reward": 0.17385222483426332, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17385222483426332, + "reward_after_std": 0.9216614998877048, + "reward_before_mean": 0.6222281260415912, + "reward_before_std": 0.8909232281148434, + "reward_change_max": 0.0006584227085113525, + "reward_change_mean": -0.4483759216964245, + "reward_change_min": -0.8788629658520222, + "reward_change_std": 0.3533425759524107, + "reward_std": 0.9216615185141563, + "rewards/cosine_scaled_reward": 0.05069740046747029, + "rewards/format_reward": 0.5208333376795053, + "step": 391 + }, + { + "advantage_max": 1.3481401428580284, + "advantage_mean": -9.934108202713787e-09, + "advantage_min": -0.8792018666863441, + "advantage_std": 0.7797165811061859, + "completion_length": 2121.3542098999023, + "epoch": 0.448, + "grad_norm": 0.6497393250465393, + "kl": 0.25140380859375, + "lambda_div_used": 0.6, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0394, + "reward": 0.280883414670825, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.280883414670825, + "reward_after_std": 0.7797165736556053, + "reward_before_mean": 0.8301408728584647, + "reward_before_std": 0.7800439111888409, + "reward_change_max": 0.000479675829410553, + "reward_change_mean": -0.549257442355156, + "reward_change_min": -0.9315827190876007, + "reward_change_std": 0.3869467042386532, + "reward_std": 0.7797165811061859, + "rewards/cosine_scaled_reward": 0.01923706941306591, + "rewards/format_reward": 0.7916666828095913, + "step": 392 + }, + { + "advantage_max": 1.978425569832325, + "advantage_mean": -2.2972624191819335e-08, + "advantage_min": -0.8907108381390572, + "advantage_std": 1.0441827401518822, + "completion_length": 2310.4583892822266, + "epoch": 0.4491428571428571, + "grad_norm": 0.5620689392089844, + "kl": 0.23760986328125, + "lambda_div_used": 0.6, + "learning_rate": 2.1982156097370557e-07, + "loss": -0.007, + "reward": 0.18085258081555367, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18085258081555367, + "reward_after_std": 1.0441827550530434, + "reward_before_mean": 0.6036436296999454, + "reward_before_std": 0.999367343261838, + "reward_change_max": 0.0009649470448493958, + "reward_change_mean": -0.42279105074703693, + "reward_change_min": -0.8772672526538372, + "reward_change_std": 0.33015530183911324, + "reward_std": 1.0441827774047852, + "rewards/cosine_scaled_reward": -0.03151152500322496, + "rewards/format_reward": 0.6666666846722364, + "step": 393 + }, + { + "advantage_max": 1.2289385050535202, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.5500497072935104, + "advantage_std": 0.640130028128624, + "completion_length": 2670.7292404174805, + "epoch": 0.4502857142857143, + "grad_norm": 0.37953662872314453, + "kl": 0.3760986328125, + "lambda_div_used": 0.6, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0425, + "reward": -0.11808802396990359, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11808802396990359, + "reward_after_std": 0.6401300318539143, + "reward_before_mean": 0.21933782380074263, + "reward_before_std": 0.5647947750985622, + "reward_change_max": 2.1852552890777588e-05, + "reward_change_mean": -0.3374258540570736, + "reward_change_min": -0.5754900723695755, + "reward_change_std": 0.21995082311332226, + "reward_std": 0.6401300616562366, + "rewards/cosine_scaled_reward": -0.24449776113033295, + "rewards/format_reward": 0.7083333432674408, + "step": 394 + }, + { + "advantage_max": 1.7114560678601265, + "advantage_mean": -2.8560559695023358e-08, + "advantage_min": -0.7531477734446526, + "advantage_std": 0.915810015052557, + "completion_length": 2337.000030517578, + "epoch": 0.4514285714285714, + "grad_norm": 0.5603395700454712, + "kl": 0.28656005859375, + "lambda_div_used": 0.6, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0182, + "reward": 0.30787351354956627, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.30787351354956627, + "reward_after_std": 0.9158100299537182, + "reward_before_mean": 0.824693463742733, + "reward_before_std": 0.8460011817514896, + "reward_change_max": 0.0010833367705345154, + "reward_change_mean": -0.5168199352920055, + "reward_change_min": -0.9332334361970425, + "reward_change_std": 0.3591248635202646, + "reward_std": 0.9158100374042988, + "rewards/cosine_scaled_reward": 0.05818005627952516, + "rewards/format_reward": 0.708333345130086, + "step": 395 + }, + { + "advantage_max": 1.3889750689268112, + "advantage_mean": -1.8626448716752009e-09, + "advantage_min": -0.7343614771962166, + "advantage_std": 0.7592885904014111, + "completion_length": 2536.3750762939453, + "epoch": 0.45257142857142857, + "grad_norm": 0.4565255641937256, + "kl": 0.31463623046875, + "lambda_div_used": 0.6, + "learning_rate": 2.134908592756607e-07, + "loss": 0.0303, + "reward": 0.32924389315303415, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32924389315303415, + "reward_after_std": 0.7592885978519917, + "reward_before_mean": 0.9013736853376031, + "reward_before_std": 0.6819272413849831, + "reward_change_max": 0.0, + "reward_change_mean": -0.5721297487616539, + "reward_change_min": -0.9597470313310623, + "reward_change_std": 0.3658205308020115, + "reward_std": 0.7592885978519917, + "rewards/cosine_scaled_reward": 0.06527014682069421, + "rewards/format_reward": 0.7708333507180214, + "step": 396 + }, + { + "advantage_max": 1.3574538677930832, + "advantage_mean": -1.5522042984272844e-08, + "advantage_min": -0.5243251714855433, + "advantage_std": 0.7060611471533775, + "completion_length": 2016.8542175292969, + "epoch": 0.45371428571428574, + "grad_norm": 0.8537499904632568, + "kl": 0.1929931640625, + "lambda_div_used": 0.6, + "learning_rate": 2.1141329099692406e-07, + "loss": -0.034, + "reward": 0.11655983421951532, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11655983421951532, + "reward_after_std": 0.7060611508786678, + "reward_before_mean": 0.5711096227169037, + "reward_before_std": 0.579546969383955, + "reward_change_max": 0.001223057508468628, + "reward_change_mean": -0.45454983226954937, + "reward_change_min": -0.7644696235656738, + "reward_change_std": 0.28570349514484406, + "reward_std": 0.7060611769556999, + "rewards/cosine_scaled_reward": -0.037361856549978256, + "rewards/format_reward": 0.6458333432674408, + "step": 397 + }, + { + "advantage_max": 1.2000513300299644, + "advantage_mean": -9.93410742555767e-09, + "advantage_min": -0.8511512242257595, + "advantage_std": 0.6951940208673477, + "completion_length": 2343.541748046875, + "epoch": 0.45485714285714285, + "grad_norm": 0.4527902603149414, + "kl": 0.2850341796875, + "lambda_div_used": 0.6, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0159, + "reward": 0.004995799623429775, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.004995799623429775, + "reward_after_std": 0.6951940171420574, + "reward_before_mean": 0.41582280572038144, + "reward_before_std": 0.7327194809913635, + "reward_change_max": 0.002899445593357086, + "reward_change_mean": -0.41082701925188303, + "reward_change_min": -0.7185042686760426, + "reward_change_std": 0.3099668100476265, + "reward_std": 0.6951940283179283, + "rewards/cosine_scaled_reward": -0.11500527150928974, + "rewards/format_reward": 0.6458333544433117, + "step": 398 + }, + { + "advantage_max": 1.6650248989462852, + "advantage_mean": -3.0112764060064023e-08, + "advantage_min": -0.7992283068597317, + "advantage_std": 0.9040088169276714, + "completion_length": 2293.0209045410156, + "epoch": 0.456, + "grad_norm": 0.3662654459476471, + "kl": 0.20367431640625, + "lambda_div_used": 0.6, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0122, + "reward": 0.42200227081775665, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.42200227081775665, + "reward_after_std": 0.904008824378252, + "reward_before_mean": 1.0163545496761799, + "reward_before_std": 0.8316931277513504, + "reward_change_max": 0.0, + "reward_change_mean": -0.5943523086607456, + "reward_change_min": -1.0510099232196808, + "reward_change_std": 0.3899985756725073, + "reward_std": 0.9040088318288326, + "rewards/cosine_scaled_reward": 0.07067725621163845, + "rewards/format_reward": 0.8750000149011612, + "step": 399 + }, + { + "advantage_max": 1.7473459392786026, + "advantage_mean": -1.5522043039783995e-08, + "advantage_min": -0.8987687975168228, + "advantage_std": 0.9387243762612343, + "completion_length": 1392.0416946411133, + "epoch": 0.45714285714285713, + "grad_norm": 0.2878487706184387, + "kl": 0.1201019287109375, + "lambda_div_used": 0.6, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.0384, + "reward": 0.5482377011794597, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5482377011794597, + "reward_after_std": 0.9387243762612343, + "reward_before_mean": 1.1946721822023392, + "reward_before_std": 0.8241510428488255, + "reward_change_max": 0.000455513596534729, + "reward_change_mean": -0.6464345380663872, + "reward_change_min": -1.0573916137218475, + "reward_change_std": 0.41280957497656345, + "reward_std": 0.9387244209647179, + "rewards/cosine_scaled_reward": 0.2223361013457179, + "rewards/format_reward": 0.7500000204890966, + "step": 400 + }, + { + "advantage_max": 1.1178666576743126, + "advantage_mean": -1.676380706472358e-08, + "advantage_min": -0.4279431030154228, + "advantage_std": 0.5744078829884529, + "completion_length": 2834.541778564453, + "epoch": 0.4582857142857143, + "grad_norm": 0.5082582235336304, + "kl": 0.5018310546875, + "lambda_div_used": 0.6, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0078, + "reward": 0.255753539968282, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.255753539968282, + "reward_after_std": 0.5744078755378723, + "reward_before_mean": 0.814882205799222, + "reward_before_std": 0.3851259686052799, + "reward_change_max": 0.0, + "reward_change_mean": -0.5591286532580853, + "reward_change_min": -0.8304774910211563, + "reward_change_std": 0.30313936062157154, + "reward_std": 0.5744078904390335, + "rewards/cosine_scaled_reward": 0.03244107961654663, + "rewards/format_reward": 0.7500000018626451, + "step": 401 + }, + { + "advantage_max": 1.2266887575387955, + "advantage_mean": 1.3038516710750514e-08, + "advantage_min": -0.5537307001650333, + "advantage_std": 0.6451385356485844, + "completion_length": 2054.104248046875, + "epoch": 0.4594285714285714, + "grad_norm": 1.4331732988357544, + "kl": 2.166656494140625, + "lambda_div_used": 0.6, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0531, + "reward": 0.015745140612125397, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.015745140612125397, + "reward_after_std": 0.645138543099165, + "reward_before_mean": 0.4195268111070618, + "reward_before_std": 0.547524556517601, + "reward_change_max": 0.0012333318591117859, + "reward_change_mean": -0.40378167293965816, + "reward_change_min": -0.6718766801059246, + "reward_change_std": 0.261042807251215, + "reward_std": 0.6451385729014874, + "rewards/cosine_scaled_reward": -0.12356993323192, + "rewards/format_reward": 0.6666666772216558, + "step": 402 + }, + { + "advantage_max": 1.441964067518711, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.8136776760220528, + "advantage_std": 0.816083088517189, + "completion_length": 2189.562568664551, + "epoch": 0.4605714285714286, + "grad_norm": 1.0570858716964722, + "kl": 0.165130615234375, + "lambda_div_used": 0.6, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0366, + "reward": 0.23330267828714568, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23330267828714568, + "reward_after_std": 0.8160830736160278, + "reward_before_mean": 0.7426377143710852, + "reward_before_std": 0.8013063948601484, + "reward_change_max": 0.0, + "reward_change_mean": -0.5093350410461426, + "reward_change_min": -0.9062354825437069, + "reward_change_std": 0.3685335274785757, + "reward_std": 0.8160831034183502, + "rewards/cosine_scaled_reward": -0.03493115585297346, + "rewards/format_reward": 0.8125000111758709, + "step": 403 + }, + { + "advantage_max": 1.1642425507307053, + "advantage_mean": -1.2417634254191512e-08, + "advantage_min": -0.6793310269713402, + "advantage_std": 0.6530807539820671, + "completion_length": 2348.687530517578, + "epoch": 0.4617142857142857, + "grad_norm": 0.3486461639404297, + "kl": 0.2562255859375, + "lambda_div_used": 0.6, + "learning_rate": 1.9733794420337213e-07, + "loss": -0.005, + "reward": 0.22327465657144785, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22327465657144785, + "reward_after_std": 0.6530807465314865, + "reward_before_mean": 0.7535381922498345, + "reward_before_std": 0.6034115515649319, + "reward_change_max": 0.0, + "reward_change_mean": -0.530263539403677, + "reward_change_min": -0.8887680508196354, + "reward_change_std": 0.3427054435014725, + "reward_std": 0.6530807688832283, + "rewards/cosine_scaled_reward": 0.0017690882086753845, + "rewards/format_reward": 0.7500000186264515, + "step": 404 + }, + { + "advantage_max": 1.8272784128785133, + "advantage_mean": -7.45058115203534e-09, + "advantage_min": -0.9271564707159996, + "advantage_std": 1.0041028670966625, + "completion_length": 2259.083381652832, + "epoch": 0.46285714285714286, + "grad_norm": 0.5416133403778076, + "kl": 0.2439727783203125, + "lambda_div_used": 0.6, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0235, + "reward": 0.5748835622798651, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5748835622798651, + "reward_after_std": 1.0041028745472431, + "reward_before_mean": 1.2313530333340168, + "reward_before_std": 0.9382626451551914, + "reward_change_max": 0.0, + "reward_change_mean": -0.656469464302063, + "reward_change_min": -1.1503315716981888, + "reward_change_std": 0.444169782102108, + "reward_std": 1.0041029155254364, + "rewards/cosine_scaled_reward": 0.17817650269716978, + "rewards/format_reward": 0.8750000223517418, + "step": 405 + }, + { + "advantage_max": 1.7873649969696999, + "advantage_mean": -2.42143873285805e-08, + "advantage_min": -0.8977884463965893, + "advantage_std": 0.9721400141716003, + "completion_length": 2179.4376068115234, + "epoch": 0.464, + "grad_norm": 1.5086603164672852, + "kl": 0.2163543701171875, + "lambda_div_used": 0.6, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0538, + "reward": 0.5057008937001228, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5057008937001228, + "reward_after_std": 0.9721400439739227, + "reward_before_mean": 1.1307021956890821, + "reward_before_std": 0.8954973742365837, + "reward_change_max": 0.0, + "reward_change_mean": -0.6250013373792171, + "reward_change_min": -1.0632744282484055, + "reward_change_std": 0.4240496251732111, + "reward_std": 0.9721400514245033, + "rewards/cosine_scaled_reward": 0.13826777413487434, + "rewards/format_reward": 0.8541666865348816, + "step": 406 + }, + { + "advantage_max": 1.568674311041832, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -0.8621452823281288, + "advantage_std": 0.8588430657982826, + "completion_length": 2311.687568664551, + "epoch": 0.46514285714285714, + "grad_norm": 0.7728492617607117, + "kl": 0.22466278076171875, + "lambda_div_used": 0.6, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0356, + "reward": 0.48120962642133236, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.48120962642133236, + "reward_after_std": 0.8588430657982826, + "reward_before_mean": 1.1148183699697256, + "reward_before_std": 0.7778550088405609, + "reward_change_max": 0.0, + "reward_change_mean": -0.6336087584495544, + "reward_change_min": -1.072540633380413, + "reward_change_std": 0.4125585276633501, + "reward_std": 0.858843095600605, + "rewards/cosine_scaled_reward": 0.1511591738089919, + "rewards/format_reward": 0.8125000186264515, + "step": 407 + }, + { + "advantage_max": 1.6463065594434738, + "advantage_mean": -4.84287747681833e-08, + "advantage_min": -0.7306011319160461, + "advantage_std": 0.8598592802882195, + "completion_length": 2620.1250610351562, + "epoch": 0.4662857142857143, + "grad_norm": 0.7048142552375793, + "kl": 0.27455902099609375, + "lambda_div_used": 0.6, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0408, + "reward": 0.36233993619680405, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36233993619680405, + "reward_after_std": 0.8598593175411224, + "reward_before_mean": 0.9261019452242181, + "reward_before_std": 0.7249259613454342, + "reward_change_max": 0.0001357346773147583, + "reward_change_mean": -0.5637620557099581, + "reward_change_min": -0.8630594946444035, + "reward_change_std": 0.34440525993704796, + "reward_std": 0.8598593324422836, + "rewards/cosine_scaled_reward": 0.1297176331281662, + "rewards/format_reward": 0.6666666772216558, + "step": 408 + }, + { + "advantage_max": 1.3177231326699257, + "advantage_mean": -3.166496848061584e-08, + "advantage_min": -0.5865827538073063, + "advantage_std": 0.7076354771852493, + "completion_length": 3044.291717529297, + "epoch": 0.4674285714285714, + "grad_norm": 0.6633644104003906, + "kl": 0.371734619140625, + "lambda_div_used": 0.6, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.035, + "reward": 0.18076253915205598, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18076253915205598, + "reward_after_std": 0.707635473459959, + "reward_before_mean": 0.6803056970238686, + "reward_before_std": 0.6231886614114046, + "reward_change_max": 0.0009818002581596375, + "reward_change_mean": -0.499543197453022, + "reward_change_min": -0.8707900457084179, + "reward_change_std": 0.32813665829598904, + "reward_std": 0.7076354995369911, + "rewards/cosine_scaled_reward": -0.045263820327818394, + "rewards/format_reward": 0.7708333432674408, + "step": 409 + }, + { + "advantage_max": 1.5697736218571663, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -0.8108692672103643, + "advantage_std": 0.8806155137717724, + "completion_length": 2453.520851135254, + "epoch": 0.4685714285714286, + "grad_norm": 0.7028084993362427, + "kl": 0.3839569091796875, + "lambda_div_used": 0.6, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0156, + "reward": 0.1786555778235197, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1786555778235197, + "reward_after_std": 0.8806155323982239, + "reward_before_mean": 0.6418251923751086, + "reward_before_std": 0.8865823717787862, + "reward_change_max": 0.001151353120803833, + "reward_change_mean": -0.4631696194410324, + "reward_change_min": -0.9242124333977699, + "reward_change_std": 0.37072835117578506, + "reward_std": 0.8806155361235142, + "rewards/cosine_scaled_reward": -0.03325407952070236, + "rewards/format_reward": 0.7083333469927311, + "step": 410 + }, + { + "advantage_max": 1.6483391597867012, + "advantage_mean": -1.2417631367611648e-09, + "advantage_min": -0.9043002128601074, + "advantage_std": 0.9175063073635101, + "completion_length": 3027.6043090820312, + "epoch": 0.4697142857142857, + "grad_norm": 1.045861840248108, + "kl": 0.3138427734375, + "lambda_div_used": 0.6, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0091, + "reward": 0.3661972675472498, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3661972675472498, + "reward_after_std": 0.9175063371658325, + "reward_before_mean": 0.9325102139264345, + "reward_before_std": 0.8878533262759447, + "reward_change_max": 0.0, + "reward_change_mean": -0.5663129389286041, + "reward_change_min": -0.9974900856614113, + "reward_change_std": 0.4016368221491575, + "reward_std": 0.9175063446164131, + "rewards/cosine_scaled_reward": 0.049588436260819435, + "rewards/format_reward": 0.8333333432674408, + "step": 411 + }, + { + "advantage_max": 1.6660237908363342, + "advantage_mean": -8.692344122263052e-09, + "advantage_min": -0.8078876323997974, + "advantage_std": 0.8997826427221298, + "completion_length": 2970.166748046875, + "epoch": 0.47085714285714286, + "grad_norm": 1.1745421886444092, + "kl": 0.33782958984375, + "lambda_div_used": 0.6, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0598, + "reward": 0.41099782660603523, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41099782660603523, + "reward_after_std": 0.8997826538980007, + "reward_before_mean": 0.9954136228188872, + "reward_before_std": 0.8094833288341761, + "reward_change_max": 0.0, + "reward_change_mean": -0.5844158306717873, + "reward_change_min": -0.9936563968658447, + "reward_change_std": 0.3828421086072922, + "reward_std": 0.8997826650738716, + "rewards/cosine_scaled_reward": 0.10187347792088985, + "rewards/format_reward": 0.7916666828095913, + "step": 412 + }, + { + "advantage_max": 1.453200839459896, + "advantage_mean": -3.228585088166369e-08, + "advantage_min": -0.7565282955765724, + "advantage_std": 0.7962896563112736, + "completion_length": 2726.9375610351562, + "epoch": 0.472, + "grad_norm": 0.5472252368927002, + "kl": 0.32379150390625, + "lambda_div_used": 0.6, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0299, + "reward": 0.3759459834545851, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3759459834545851, + "reward_after_std": 0.7962896451354027, + "reward_before_mean": 0.9632791336625814, + "reward_before_std": 0.7153794188052416, + "reward_change_max": 0.000562228262424469, + "reward_change_mean": -0.5873331986367702, + "reward_change_min": -0.9731578454375267, + "reward_change_std": 0.38425926864147186, + "reward_std": 0.7962896823883057, + "rewards/cosine_scaled_reward": 0.04413955472409725, + "rewards/format_reward": 0.8750000149011612, + "step": 413 + }, + { + "advantage_max": 1.6291797831654549, + "advantage_mean": 4.346172255420555e-09, + "advantage_min": -0.7556163519620895, + "advantage_std": 0.8886192888021469, + "completion_length": 3423.291748046875, + "epoch": 0.47314285714285714, + "grad_norm": 1.426332712173462, + "kl": 0.516357421875, + "lambda_div_used": 0.6, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0101, + "reward": 0.11338039068505168, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11338039068505168, + "reward_after_std": 0.8886192999780178, + "reward_before_mean": 0.540803493000567, + "reward_before_std": 0.8775491826236248, + "reward_change_max": 0.0, + "reward_change_mean": -0.42742311395704746, + "reward_change_min": -0.8786544650793076, + "reward_change_std": 0.33593827672302723, + "reward_std": 0.8886192999780178, + "rewards/cosine_scaled_reward": -0.0525149138411507, + "rewards/format_reward": 0.6458333376795053, + "step": 414 + }, + { + "advantage_max": 1.3992087170481682, + "advantage_mean": -1.4280280180578586e-08, + "advantage_min": -0.8285434059798717, + "advantage_std": 0.794212706387043, + "completion_length": 3180.5625610351562, + "epoch": 0.4742857142857143, + "grad_norm": 0.5881298184394836, + "kl": 0.4810791015625, + "lambda_div_used": 0.6, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0677, + "reward": 0.20738966763019562, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.20738966763019562, + "reward_after_std": 0.7942127101123333, + "reward_before_mean": 0.7095146963838488, + "reward_before_std": 0.7941083908081055, + "reward_change_max": 0.0004946514964103699, + "reward_change_mean": -0.5021250303834677, + "reward_change_min": -0.9012851640582085, + "reward_change_std": 0.3629884757101536, + "reward_std": 0.7942127585411072, + "rewards/cosine_scaled_reward": 0.011007343418896198, + "rewards/format_reward": 0.6875000037252903, + "step": 415 + }, + { + "advantage_max": 1.5969299972057343, + "advantage_mean": -4.346172310931706e-09, + "advantage_min": -0.7314600218087435, + "advantage_std": 0.8682622611522675, + "completion_length": 2351.0417098999023, + "epoch": 0.4754285714285714, + "grad_norm": 0.619637668132782, + "kl": 0.21923828125, + "lambda_div_used": 0.6, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.017, + "reward": 0.44899107329547405, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.44899107329547405, + "reward_after_std": 0.8682622611522675, + "reward_before_mean": 1.062957238405943, + "reward_before_std": 0.7723662834614515, + "reward_change_max": 0.0, + "reward_change_mean": -0.6139661446213722, + "reward_change_min": -1.0317911952733994, + "reward_change_std": 0.40039036609232426, + "reward_std": 0.8682622760534286, + "rewards/cosine_scaled_reward": 0.0627285810187459, + "rewards/format_reward": 0.9375000074505806, + "step": 416 + }, + { + "advantage_max": 1.364932507276535, + "advantage_mean": 0.0, + "advantage_min": -0.7825092300772667, + "advantage_std": 0.7713565900921822, + "completion_length": 3244.5625610351562, + "epoch": 0.4765714285714286, + "grad_norm": 0.9164711833000183, + "kl": 0.445068359375, + "lambda_div_used": 0.6, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0304, + "reward": 0.14966288317373255, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.14966288317373255, + "reward_after_std": 0.7713565900921822, + "reward_before_mean": 0.6202793065458536, + "reward_before_std": 0.7590491138398647, + "reward_change_max": 0.0, + "reward_change_mean": -0.4706163890659809, + "reward_change_min": -0.8597500771284103, + "reward_change_std": 0.3365605156868696, + "reward_std": 0.771356612443924, + "rewards/cosine_scaled_reward": -0.0752770397812128, + "rewards/format_reward": 0.770833358168602, + "step": 417 + }, + { + "advantage_max": 1.712221696972847, + "advantage_mean": -2.2972624136308184e-08, + "advantage_min": -0.925393857061863, + "advantage_std": 0.9599899351596832, + "completion_length": 2048.1875381469727, + "epoch": 0.4777142857142857, + "grad_norm": 0.6422997117042542, + "kl": 0.4207611083984375, + "lambda_div_used": 0.6, + "learning_rate": 1.7174502842694212e-07, + "loss": -0.0047, + "reward": 0.5533166616223752, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5533166616223752, + "reward_after_std": 0.9599899277091026, + "reward_before_mean": 1.2044505663216114, + "reward_before_std": 0.9257478043437004, + "reward_change_max": 0.0003671795129776001, + "reward_change_mean": -0.6511338837444782, + "reward_change_min": -1.1758594512939453, + "reward_change_std": 0.4581292551010847, + "reward_std": 0.9599899724125862, + "rewards/cosine_scaled_reward": 0.2168085891753435, + "rewards/format_reward": 0.7708333469927311, + "step": 418 + }, + { + "advantage_max": 1.504380777478218, + "advantage_mean": -2.4214387106535895e-08, + "advantage_min": -0.9585339762270451, + "advantage_std": 0.8656284362077713, + "completion_length": 2767.1250610351562, + "epoch": 0.47885714285714287, + "grad_norm": 1.0067801475524902, + "kl": 0.3099365234375, + "lambda_div_used": 0.6, + "learning_rate": 1.7005243352409333e-07, + "loss": -0.0183, + "reward": 0.4650968345813453, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4650968345813453, + "reward_after_std": 0.8656284473836422, + "reward_before_mean": 1.0998233817517757, + "reward_before_std": 0.8545772768557072, + "reward_change_max": 0.0, + "reward_change_mean": -0.6347265485674143, + "reward_change_min": -1.069912202656269, + "reward_change_std": 0.4303837288171053, + "reward_std": 0.8656284883618355, + "rewards/cosine_scaled_reward": 0.17491167411208153, + "rewards/format_reward": 0.7500000223517418, + "step": 419 + }, + { + "advantage_max": 1.138153724372387, + "advantage_mean": -9.313225912688239e-09, + "advantage_min": -0.8338864184916019, + "advantage_std": 0.6697599738836288, + "completion_length": 2140.500057220459, + "epoch": 0.48, + "grad_norm": 0.5568791031837463, + "kl": 0.23101806640625, + "lambda_div_used": 0.6, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0396, + "reward": 0.27711703907698393, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.27711703907698393, + "reward_after_std": 0.6697599701583385, + "reward_before_mean": 0.8464478626847267, + "reward_before_std": 0.659028060734272, + "reward_change_max": 0.0013652518391609192, + "reward_change_mean": -0.5693307928740978, + "reward_change_min": -0.9026694595813751, + "reward_change_std": 0.3788332063704729, + "reward_std": 0.6697599962353706, + "rewards/cosine_scaled_reward": -0.0246927491389215, + "rewards/format_reward": 0.8958333432674408, + "step": 420 + }, + { + "advantage_max": 1.6502467170357704, + "advantage_mean": -6.829699028543246e-09, + "advantage_min": -0.7769649140536785, + "advantage_std": 0.8680305257439613, + "completion_length": 2986.8333740234375, + "epoch": 0.48114285714285715, + "grad_norm": 0.6128113269805908, + "kl": 0.376708984375, + "lambda_div_used": 0.6, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.047, + "reward": 0.0061189401894807816, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0061189401894807816, + "reward_after_std": 0.8680305257439613, + "reward_before_mean": 0.36435882560908794, + "reward_before_std": 0.8137674778699875, + "reward_change_max": 0.0, + "reward_change_mean": -0.3582398798316717, + "reward_change_min": -0.6710369400680065, + "reward_change_std": 0.2599798422306776, + "reward_std": 0.8680305629968643, + "rewards/cosine_scaled_reward": -0.16157058905810118, + "rewards/format_reward": 0.6875000242143869, + "step": 421 + }, + { + "advantage_max": 1.508325882256031, + "advantage_mean": -2.1109978765032622e-08, + "advantage_min": -0.6916386522352695, + "advantage_std": 0.812825795263052, + "completion_length": 2900.9584045410156, + "epoch": 0.48228571428571426, + "grad_norm": 0.6628983616828918, + "kl": 0.346527099609375, + "lambda_div_used": 0.6, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0175, + "reward": 0.3042160285403952, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3042160285403952, + "reward_after_std": 0.812825795263052, + "reward_before_mean": 0.8499894179403782, + "reward_before_std": 0.7234699986875057, + "reward_change_max": 0.0008436664938926697, + "reward_change_mean": -0.5457733869552612, + "reward_change_min": -0.9079204723238945, + "reward_change_std": 0.3513330090790987, + "reward_std": 0.8128258399665356, + "rewards/cosine_scaled_reward": 0.00832803500816226, + "rewards/format_reward": 0.8333333469927311, + "step": 422 + }, + { + "advantage_max": 1.5517828837037086, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.7369258608669043, + "advantage_std": 0.8391496613621712, + "completion_length": 2629.270896911621, + "epoch": 0.48342857142857143, + "grad_norm": 0.5332776308059692, + "kl": 0.31683349609375, + "lambda_div_used": 0.6, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0051, + "reward": 0.09200821781996638, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09200821781996638, + "reward_after_std": 0.8391496725380421, + "reward_before_mean": 0.5115727577358484, + "reward_before_std": 0.8018813747912645, + "reward_change_max": 0.00034625083208084106, + "reward_change_mean": -0.4195645246654749, + "reward_change_min": -0.7326377555727959, + "reward_change_std": 0.2982933344319463, + "reward_std": 0.839149683713913, + "rewards/cosine_scaled_reward": -0.10879695974290371, + "rewards/format_reward": 0.7291666846722364, + "step": 423 + }, + { + "advantage_max": 1.4316673576831818, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.7059814631938934, + "advantage_std": 0.7919131703674793, + "completion_length": 3144.0208892822266, + "epoch": 0.4845714285714286, + "grad_norm": 1.1393442153930664, + "kl": 0.509765625, + "lambda_div_used": 0.6, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0139, + "reward": -0.05595473758876324, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05595473758876324, + "reward_after_std": 0.7919131703674793, + "reward_before_mean": 0.29102808563038707, + "reward_before_std": 0.7984279748052359, + "reward_change_max": 0.0006482526659965515, + "reward_change_mean": -0.3469828423112631, + "reward_change_min": -0.7688832432031631, + "reward_change_std": 0.29653407260775566, + "reward_std": 0.791913203895092, + "rewards/cosine_scaled_reward": -0.1461526220664382, + "rewards/format_reward": 0.583333345130086, + "step": 424 + }, + { + "advantage_max": 2.001866862177849, + "advantage_mean": -2.9181442262604662e-08, + "advantage_min": -0.9877432808279991, + "advantage_std": 1.09125317633152, + "completion_length": 2045.8542175292969, + "epoch": 0.4857142857142857, + "grad_norm": 1.4421943426132202, + "kl": 0.1781005859375, + "lambda_div_used": 0.6, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.034, + "reward": 0.9677936118096113, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.9677936118096113, + "reward_after_std": 1.0912531986832619, + "reward_before_mean": 1.8249291330575943, + "reward_before_std": 0.9513313472270966, + "reward_change_max": 2.5540590286254883e-05, + "reward_change_mean": -0.8571355231106281, + "reward_change_min": -1.4232430160045624, + "reward_change_std": 0.5513026043772697, + "reward_std": 1.091253213584423, + "rewards/cosine_scaled_reward": 0.4749645469710231, + "rewards/format_reward": 0.8750000074505806, + "step": 425 + }, + { + "advantage_max": 1.7256113290786743, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.8169574663043022, + "advantage_std": 0.9189626127481461, + "completion_length": 2349.7084045410156, + "epoch": 0.4868571428571429, + "grad_norm": 0.7599732875823975, + "kl": 0.3668975830078125, + "lambda_div_used": 0.6, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0176, + "reward": 0.4623431172221899, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4623431172221899, + "reward_after_std": 0.9189625754952431, + "reward_before_mean": 1.061521541327238, + "reward_before_std": 0.8109705522656441, + "reward_change_max": 0.0, + "reward_change_mean": -0.5991784110665321, + "reward_change_min": -1.0059629045426846, + "reward_change_std": 0.38436930999159813, + "reward_std": 0.9189625829458237, + "rewards/cosine_scaled_reward": 0.1140940950717777, + "rewards/format_reward": 0.8333333507180214, + "step": 426 + }, + { + "advantage_max": 1.1098849326372147, + "advantage_mean": 5.2774946912581555e-09, + "advantage_min": -0.4467169027775526, + "advantage_std": 0.5858809929341078, + "completion_length": 2994.5209045410156, + "epoch": 0.488, + "grad_norm": 0.7700356841087341, + "kl": 0.361083984375, + "lambda_div_used": 0.6, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0261, + "reward": 0.219692911952734, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.219692911952734, + "reward_after_std": 0.5858809780329466, + "reward_before_mean": 0.7604799484834075, + "reward_before_std": 0.42057891469448805, + "reward_change_max": 0.0, + "reward_change_mean": -0.5407870132476091, + "reward_change_min": -0.8110893554985523, + "reward_change_std": 0.3113178089261055, + "reward_std": 0.5858809947967529, + "rewards/cosine_scaled_reward": -0.015593377873301506, + "rewards/format_reward": 0.7916666846722364, + "step": 427 + }, + { + "advantage_max": 1.5108700022101402, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.7426536791026592, + "advantage_std": 0.8018022254109383, + "completion_length": 2830.979217529297, + "epoch": 0.48914285714285716, + "grad_norm": 0.6052858829498291, + "kl": 0.4901123046875, + "lambda_div_used": 0.6, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0524, + "reward": 0.004844090901315212, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.004844090901315212, + "reward_after_std": 0.80180224776268, + "reward_before_mean": 0.37800159538164735, + "reward_before_std": 0.7561859395354986, + "reward_change_max": 0.0, + "reward_change_mean": -0.37315749377012253, + "reward_change_min": -0.7145323418080807, + "reward_change_std": 0.2716970220208168, + "reward_std": 0.8018022775650024, + "rewards/cosine_scaled_reward": -0.17558254208415747, + "rewards/format_reward": 0.729166692122817, + "step": 428 + }, + { + "advantage_max": 1.5547840893268585, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.8601616211235523, + "advantage_std": 0.8643078021705151, + "completion_length": 2310.000072479248, + "epoch": 0.49028571428571427, + "grad_norm": 0.81473708152771, + "kl": 0.19464111328125, + "lambda_div_used": 0.6, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0051, + "reward": 0.29873619112186134, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.29873619112186134, + "reward_after_std": 0.8643078021705151, + "reward_before_mean": 0.8367813774384558, + "reward_before_std": 0.8450817838311195, + "reward_change_max": 0.00034783780574798584, + "reward_change_mean": -0.5380451511591673, + "reward_change_min": -1.0198836103081703, + "reward_change_std": 0.3816765770316124, + "reward_std": 0.8643078207969666, + "rewards/cosine_scaled_reward": 0.0017240047454833984, + "rewards/format_reward": 0.8333333469927311, + "step": 429 + }, + { + "advantage_max": 1.532344713807106, + "advantage_mean": -1.8626450937198058e-09, + "advantage_min": -0.5167469158768654, + "advantage_std": 0.7778219282627106, + "completion_length": 2256.250045776367, + "epoch": 0.49142857142857144, + "grad_norm": 0.7217494249343872, + "kl": 0.24468994140625, + "lambda_div_used": 0.6, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0228, + "reward": 0.2548238287563436, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2548238287563436, + "reward_after_std": 0.7778219506144524, + "reward_before_mean": 0.769319811835885, + "reward_before_std": 0.6003479808568954, + "reward_change_max": 0.0004530102014541626, + "reward_change_mean": -0.5144959762692451, + "reward_change_min": -0.7555083595216274, + "reward_change_std": 0.28606732469052076, + "reward_std": 0.7778219655156136, + "rewards/cosine_scaled_reward": 0.040909904055297375, + "rewards/format_reward": 0.6875000055879354, + "step": 430 + }, + { + "advantage_max": 1.2318995967507362, + "advantage_mean": -1.024454859832602e-08, + "advantage_min": -0.6241239868104458, + "advantage_std": 0.6798490583896637, + "completion_length": 2692.979263305664, + "epoch": 0.49257142857142855, + "grad_norm": 0.409150630235672, + "kl": 0.30722808837890625, + "lambda_div_used": 0.6, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.032, + "reward": 0.08035399056097958, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08035399056097958, + "reward_after_std": 0.6798490472137928, + "reward_before_mean": 0.5355471428483725, + "reward_before_std": 0.6428811736404896, + "reward_change_max": 0.0008766204118728638, + "reward_change_mean": -0.4551931694149971, + "reward_change_min": -0.8111777156591415, + "reward_change_std": 0.31029749289155006, + "reward_std": 0.679849062114954, + "rewards/cosine_scaled_reward": -0.18014310486614704, + "rewards/format_reward": 0.8958333432674408, + "step": 431 + }, + { + "advantage_max": 1.5041551887989044, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.660958144813776, + "advantage_std": 0.7954925112426281, + "completion_length": 3086.354248046875, + "epoch": 0.4937142857142857, + "grad_norm": 0.9090562462806702, + "kl": 0.4284820556640625, + "lambda_div_used": 0.6, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0191, + "reward": 0.15131173096597195, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15131173096597195, + "reward_after_std": 0.7954924888908863, + "reward_before_mean": 0.6126590967178345, + "reward_before_std": 0.7049643620848656, + "reward_change_max": 1.9669532775878906e-06, + "reward_change_mean": -0.4613474104553461, + "reward_change_min": -0.7781161703169346, + "reward_change_std": 0.2984175104647875, + "reward_std": 0.7954925149679184, + "rewards/cosine_scaled_reward": -0.037420436972752213, + "rewards/format_reward": 0.6875000111758709, + "step": 432 + }, + { + "advantage_max": 1.669169820845127, + "advantage_mean": -6.208817127095756e-09, + "advantage_min": -0.726369071751833, + "advantage_std": 0.875070009380579, + "completion_length": 2916.4792251586914, + "epoch": 0.4948571428571429, + "grad_norm": 0.7531116008758545, + "kl": 0.50775146484375, + "lambda_div_used": 0.6, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0138, + "reward": 0.38012822410382796, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38012822410382796, + "reward_after_std": 0.8750699795782566, + "reward_before_mean": 0.9496738724410534, + "reward_before_std": 0.7544910423457623, + "reward_change_max": 0.0009396150708198547, + "reward_change_mean": -0.5695456936955452, + "reward_change_min": -0.8935679048299789, + "reward_change_std": 0.34632328897714615, + "reward_std": 0.8750700131058693, + "rewards/cosine_scaled_reward": 0.06858693342655897, + "rewards/format_reward": 0.8125000149011612, + "step": 433 + }, + { + "advantage_max": 1.0035743713378906, + "advantage_mean": 5.58793539218172e-09, + "advantage_min": -0.407320324331522, + "advantage_std": 0.5333701260387897, + "completion_length": 3139.812545776367, + "epoch": 0.496, + "grad_norm": 2.604379653930664, + "kl": 0.579833984375, + "lambda_div_used": 0.6, + "learning_rate": 1.469297078922642e-07, + "loss": -0.0146, + "reward": -0.09332448849454522, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09332448849454522, + "reward_after_std": 0.53337012976408, + "reward_before_mean": 0.28518972732126713, + "reward_before_std": 0.46411832980811596, + "reward_change_max": 0.0, + "reward_change_mean": -0.3785142097622156, + "reward_change_min": -0.6618387140333652, + "reward_change_std": 0.23844915814697742, + "reward_std": 0.5333701446652412, + "rewards/cosine_scaled_reward": -0.23240514658391476, + "rewards/format_reward": 0.750000013038516, + "step": 434 + }, + { + "advantage_max": 1.3506320640444756, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7499644756317139, + "advantage_std": 0.735443152487278, + "completion_length": 2099.187545776367, + "epoch": 0.49714285714285716, + "grad_norm": 0.4415164291858673, + "kl": 0.31992340087890625, + "lambda_div_used": 0.6, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0248, + "reward": 0.22127506462857127, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22127506462857127, + "reward_after_std": 0.7354431487619877, + "reward_before_mean": 0.7249867599457502, + "reward_before_std": 0.6692739687860012, + "reward_change_max": 0.00029733777046203613, + "reward_change_mean": -0.5037116818130016, + "reward_change_min": -0.8023575022816658, + "reward_change_std": 0.3164084330201149, + "reward_std": 0.735443152487278, + "rewards/cosine_scaled_reward": -0.05417329433839768, + "rewards/format_reward": 0.8333333469927311, + "step": 435 + }, + { + "advantage_max": 1.5958310216665268, + "advantage_mean": 7.771561172376096e-16, + "advantage_min": -0.7120724134147167, + "advantage_std": 0.8530858978629112, + "completion_length": 2177.645866394043, + "epoch": 0.4982857142857143, + "grad_norm": 0.43583664298057556, + "kl": 0.21563720703125, + "lambda_div_used": 0.6, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.005, + "reward": 0.5215210448950529, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5215210448950529, + "reward_after_std": 0.85308588296175, + "reward_before_mean": 1.1749990084208548, + "reward_before_std": 0.7143463343381882, + "reward_change_max": 0.0, + "reward_change_mean": -0.6534778997302055, + "reward_change_min": -1.098868913948536, + "reward_change_std": 0.4049825519323349, + "reward_std": 0.8530858978629112, + "rewards/cosine_scaled_reward": 0.149999488145113, + "rewards/format_reward": 0.875, + "step": 436 + }, + { + "advantage_max": 1.501756675541401, + "advantage_mean": -9.93410742555767e-09, + "advantage_min": -0.7757223770022392, + "advantage_std": 0.8331665247678757, + "completion_length": 3171.3959045410156, + "epoch": 0.49942857142857144, + "grad_norm": 0.7524950504302979, + "kl": 0.406005859375, + "lambda_div_used": 0.6, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0419, + "reward": 0.14843263989314437, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14843263989314437, + "reward_after_std": 0.8331665396690369, + "reward_before_mean": 0.60943434946239, + "reward_before_std": 0.8200727887451649, + "reward_change_max": 5.303323268890381e-05, + "reward_change_mean": -0.46100171096622944, + "reward_change_min": -0.8599130436778069, + "reward_change_std": 0.33630202896893024, + "reward_std": 0.8331665582954884, + "rewards/cosine_scaled_reward": -0.09111617412418127, + "rewards/format_reward": 0.7916666865348816, + "step": 437 + }, + { + "advantage_max": 1.5247759148478508, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.624311737716198, + "advantage_std": 0.7884609997272491, + "completion_length": 2956.6250762939453, + "epoch": 0.5005714285714286, + "grad_norm": 0.3807191550731659, + "kl": 0.3555755615234375, + "lambda_div_used": 0.6, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0229, + "reward": 0.012609760742634535, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.012609760742634535, + "reward_after_std": 0.7884609997272491, + "reward_before_mean": 0.3945481926202774, + "reward_before_std": 0.6936755180358887, + "reward_change_max": 0.0006226003170013428, + "reward_change_mean": -0.3819384379312396, + "reward_change_min": -0.6387332938611507, + "reward_change_std": 0.25689230114221573, + "reward_std": 0.7884610146284103, + "rewards/cosine_scaled_reward": -0.12564256973564625, + "rewards/format_reward": 0.6458333525806665, + "step": 438 + }, + { + "advantage_max": 1.304104894399643, + "advantage_mean": 7.450580985501887e-09, + "advantage_min": -0.747392974793911, + "advantage_std": 0.7265794835984707, + "completion_length": 2621.979202270508, + "epoch": 0.5017142857142857, + "grad_norm": 0.5890460014343262, + "kl": 0.321441650390625, + "lambda_div_used": 0.6, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0219, + "reward": 0.08033835608512163, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08033835608512163, + "reward_after_std": 0.7265794575214386, + "reward_before_mean": 0.5202324008569121, + "reward_before_std": 0.7163271754980087, + "reward_change_max": 0.0, + "reward_change_mean": -0.43989402055740356, + "reward_change_min": -0.7937644794583321, + "reward_change_std": 0.3140565250068903, + "reward_std": 0.7265795208513737, + "rewards/cosine_scaled_reward": -0.10446714423596859, + "rewards/format_reward": 0.7291666772216558, + "step": 439 + }, + { + "advantage_max": 1.4373757094144821, + "advantage_mean": -1.7384688633104162e-08, + "advantage_min": -0.7765031270682812, + "advantage_std": 0.7870672270655632, + "completion_length": 3122.6458587646484, + "epoch": 0.5028571428571429, + "grad_norm": 0.5948376059532166, + "kl": 0.41351318359375, + "lambda_div_used": 0.6, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0331, + "reward": -0.030132897198200226, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.030132897198200226, + "reward_after_std": 0.7870672382414341, + "reward_before_mean": 0.3378842771053314, + "reward_before_std": 0.7818763107061386, + "reward_change_max": 0.0, + "reward_change_mean": -0.36801718175411224, + "reward_change_min": -0.6863149777054787, + "reward_change_std": 0.29010715894401073, + "reward_std": 0.7870672680437565, + "rewards/cosine_scaled_reward": -0.1227245363406837, + "rewards/format_reward": 0.5833333525806665, + "step": 440 + }, + { + "advantage_max": 1.5642970129847527, + "advantage_mean": -6.053596901534064e-09, + "advantage_min": -0.9813426993787289, + "advantage_std": 0.8995516113936901, + "completion_length": 2796.166748046875, + "epoch": 0.504, + "grad_norm": 0.7474466562271118, + "kl": 0.31884765625, + "lambda_div_used": 0.6, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0371, + "reward": 0.38214205752592534, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38214205752592534, + "reward_after_std": 0.8995516039431095, + "reward_before_mean": 0.9629887044429779, + "reward_before_std": 0.913324523717165, + "reward_change_max": 0.0, + "reward_change_mean": -0.5808466412127018, + "reward_change_min": -1.0513642355799675, + "reward_change_std": 0.4244292415678501, + "reward_std": 0.8995516337454319, + "rewards/cosine_scaled_reward": 0.08566100802272558, + "rewards/format_reward": 0.791666679084301, + "step": 441 + }, + { + "advantage_max": 1.5939322486519814, + "advantage_mean": -2.7318796558262193e-08, + "advantage_min": -0.7752449028193951, + "advantage_std": 0.8887365721166134, + "completion_length": 2893.375045776367, + "epoch": 0.5051428571428571, + "grad_norm": 0.7434902787208557, + "kl": 0.27886962890625, + "lambda_div_used": 0.6, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0234, + "reward": 0.3462903336621821, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3462903336621821, + "reward_after_std": 0.8887365981936455, + "reward_before_mean": 0.8997693937271833, + "reward_before_std": 0.8497613854706287, + "reward_change_max": 0.0006150901317596436, + "reward_change_mean": -0.5534790195524693, + "reward_change_min": -1.053729109466076, + "reward_change_std": 0.3987939488142729, + "reward_std": 0.8887366205453873, + "rewards/cosine_scaled_reward": 0.022801332794188056, + "rewards/format_reward": 0.8541666716337204, + "step": 442 + }, + { + "advantage_max": 1.4239144548773766, + "advantage_mean": 7.450580596923828e-09, + "advantage_min": -0.7238317169249058, + "advantage_std": 0.7982291206717491, + "completion_length": 3030.979217529297, + "epoch": 0.5062857142857143, + "grad_norm": 0.6095349788665771, + "kl": 0.32720947265625, + "lambda_div_used": 0.6, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0193, + "reward": 0.0006824731826782227, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0006824731826782227, + "reward_after_std": 0.7982290983200073, + "reward_before_mean": 0.38761329650878906, + "reward_before_std": 0.8135102391242981, + "reward_change_max": 0.0008568018674850464, + "reward_change_mean": -0.38693082239478827, + "reward_change_min": -0.7811053469777107, + "reward_change_std": 0.3120748773217201, + "reward_std": 0.7982291206717491, + "rewards/cosine_scaled_reward": -0.10827669268473983, + "rewards/format_reward": 0.604166679084301, + "step": 443 + }, + { + "advantage_max": 1.024867869913578, + "advantage_mean": -4.346171755820194e-09, + "advantage_min": -0.53191352263093, + "advantage_std": 0.5605506096035242, + "completion_length": 2855.562545776367, + "epoch": 0.5074285714285715, + "grad_norm": 0.5939316749572754, + "kl": 0.3170013427734375, + "lambda_div_used": 0.6, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0085, + "reward": 0.0003267568536102772, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0003267568536102772, + "reward_after_std": 0.5605506096035242, + "reward_before_mean": 0.4287848509848118, + "reward_before_std": 0.5058968178927898, + "reward_change_max": 0.00040687620639801025, + "reward_change_mean": -0.42845806665718555, + "reward_change_min": -0.7199537493288517, + "reward_change_std": 0.26975464215502143, + "reward_std": 0.5605506226420403, + "rewards/cosine_scaled_reward": -0.09810760384425521, + "rewards/format_reward": 0.6250000037252903, + "step": 444 + }, + { + "advantage_max": 1.4808774963021278, + "advantage_mean": -3.1044082304809706e-09, + "advantage_min": -0.7681502997875214, + "advantage_std": 0.8168426752090454, + "completion_length": 2854.666748046875, + "epoch": 0.5085714285714286, + "grad_norm": 0.6682929396629333, + "kl": 0.23166656494140625, + "lambda_div_used": 0.6, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0271, + "reward": 0.3290069226641208, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3290069226641208, + "reward_after_std": 0.8168426714837551, + "reward_before_mean": 0.8914229711517692, + "reward_before_std": 0.7797074243426323, + "reward_change_max": 0.0006837695837020874, + "reward_change_mean": -0.5624160580337048, + "reward_change_min": -0.9781622253358364, + "reward_change_std": 0.37248685862869024, + "reward_std": 0.8168426789343357, + "rewards/cosine_scaled_reward": 0.04987816256470978, + "rewards/format_reward": 0.7916666697710752, + "step": 445 + }, + { + "advantage_max": 1.4862676709890366, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.939953550696373, + "advantage_std": 0.8497539050877094, + "completion_length": 2677.9792709350586, + "epoch": 0.5097142857142857, + "grad_norm": 1.6928560733795166, + "kl": 0.21446990966796875, + "lambda_div_used": 0.6, + "learning_rate": 1.316005813502869e-07, + "loss": 0.069, + "reward": 0.334447986446321, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.334447986446321, + "reward_after_std": 0.8497539088129997, + "reward_before_mean": 0.8993029966950417, + "reward_before_std": 0.8538679517805576, + "reward_change_max": 0.0010942071676254272, + "reward_change_mean": -0.564854983240366, + "reward_change_min": -0.9794363453984261, + "reward_change_std": 0.40965361148118973, + "reward_std": 0.8497539162635803, + "rewards/cosine_scaled_reward": 0.07465148530900478, + "rewards/format_reward": 0.7500000223517418, + "step": 446 + }, + { + "advantage_max": 1.5106425508856773, + "advantage_mean": -7.761021714181027e-09, + "advantage_min": -0.7425118647515774, + "advantage_std": 0.8140601627528667, + "completion_length": 2409.500030517578, + "epoch": 0.5108571428571429, + "grad_norm": 0.5263113379478455, + "kl": 0.16021728515625, + "lambda_div_used": 0.6, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0239, + "reward": 0.3240194395184517, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3240194395184517, + "reward_after_std": 0.8140601515769958, + "reward_before_mean": 0.8813347518444061, + "reward_before_std": 0.7404376417398453, + "reward_change_max": 0.00041799992322921753, + "reward_change_mean": -0.5573153309524059, + "reward_change_min": -0.9619341045618057, + "reward_change_std": 0.3613274283707142, + "reward_std": 0.8140601553022861, + "rewards/cosine_scaled_reward": -0.007249297806993127, + "rewards/format_reward": 0.8958333395421505, + "step": 447 + }, + { + "advantage_max": 1.4709734171628952, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.6773672588169575, + "advantage_std": 0.7775681540369987, + "completion_length": 2605.7708587646484, + "epoch": 0.512, + "grad_norm": 1.9863547086715698, + "kl": 0.26190185546875, + "lambda_div_used": 0.6, + "learning_rate": 1.2932844562179352e-07, + "loss": -0.0291, + "reward": 0.2725476995110512, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2725476995110512, + "reward_after_std": 0.7775681465864182, + "reward_before_mean": 0.8013585917651653, + "reward_before_std": 0.6695942915976048, + "reward_change_max": 0.0006482377648353577, + "reward_change_mean": -0.528810883872211, + "reward_change_min": -0.8442369252443314, + "reward_change_std": 0.3180638235062361, + "reward_std": 0.7775681652128696, + "rewards/cosine_scaled_reward": 0.004845963791012764, + "rewards/format_reward": 0.7916666716337204, + "step": 448 + }, + { + "advantage_max": 1.454092152416706, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.9398189447820187, + "advantage_std": 0.833489615470171, + "completion_length": 2516.812515258789, + "epoch": 0.5131428571428571, + "grad_norm": 0.5418146848678589, + "kl": 0.190399169921875, + "lambda_div_used": 0.6, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0099, + "reward": 0.20460259914398193, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20460259914398193, + "reward_after_std": 0.833489615470171, + "reward_before_mean": 0.7024632133543491, + "reward_before_std": 0.8663576692342758, + "reward_change_max": 0.00025576353073120117, + "reward_change_mean": -0.49786060582846403, + "reward_change_min": -0.8856289610266685, + "reward_change_std": 0.3694376861676574, + "reward_std": 0.8334896489977837, + "rewards/cosine_scaled_reward": 0.01789826713502407, + "rewards/format_reward": 0.6666666828095913, + "step": 449 + }, + { + "advantage_max": 1.2588047087192535, + "advantage_mean": 2.1730859889323995e-09, + "advantage_min": -0.662886805832386, + "advantage_std": 0.6904132477939129, + "completion_length": 2910.479202270508, + "epoch": 0.5142857142857142, + "grad_norm": 0.738971471786499, + "kl": 0.3026580810546875, + "lambda_div_used": 0.6, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0117, + "reward": 0.1044841951224953, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1044841951224953, + "reward_after_std": 0.6904132552444935, + "reward_before_mean": 0.5679622534662485, + "reward_before_std": 0.641492698341608, + "reward_change_max": 0.0002985745668411255, + "reward_change_mean": -0.4634780492633581, + "reward_change_min": -0.8120031580328941, + "reward_change_std": 0.31523124873638153, + "reward_std": 0.6904132626950741, + "rewards/cosine_scaled_reward": -0.04935221001505852, + "rewards/format_reward": 0.6666666734963655, + "step": 450 + }, + { + "advantage_max": 1.7307595536112785, + "advantage_mean": -3.290673183942161e-08, + "advantage_min": -0.9142901822924614, + "advantage_std": 0.9461825527250767, + "completion_length": 2641.375015258789, + "epoch": 0.5154285714285715, + "grad_norm": 1.169973611831665, + "kl": 0.228729248046875, + "lambda_div_used": 0.6, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0487, + "reward": 0.4206886999309063, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4206886999309063, + "reward_after_std": 0.9461825452744961, + "reward_before_mean": 1.009432639926672, + "reward_before_std": 0.8840302973985672, + "reward_change_max": 0.0003242567181587219, + "reward_change_mean": -0.5887439749203622, + "reward_change_min": -0.9866149947047234, + "reward_change_std": 0.3979658451862633, + "reward_std": 0.946182556450367, + "rewards/cosine_scaled_reward": 0.16096631158143282, + "rewards/format_reward": 0.687500013038516, + "step": 451 + }, + { + "advantage_max": 1.4747585132718086, + "advantage_mean": 3.725290076417309e-09, + "advantage_min": -0.5936764031648636, + "advantage_std": 0.7980528734624386, + "completion_length": 3164.729248046875, + "epoch": 0.5165714285714286, + "grad_norm": 1.0744494199752808, + "kl": 0.3144378662109375, + "lambda_div_used": 0.6, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0404, + "reward": 0.020421676337718964, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.020421676337718964, + "reward_after_std": 0.798052866011858, + "reward_before_mean": 0.414817335549742, + "reward_before_std": 0.7678319737315178, + "reward_change_max": 0.0007721483707427979, + "reward_change_mean": -0.3943956736475229, + "reward_change_min": -0.7912264950573444, + "reward_change_std": 0.29552899673581123, + "reward_std": 0.7980528771877289, + "rewards/cosine_scaled_reward": -0.011341335251927376, + "rewards/format_reward": 0.4375000037252903, + "step": 452 + }, + { + "advantage_max": 1.5293823331594467, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -0.914829894900322, + "advantage_std": 0.8372916430234909, + "completion_length": 2751.4792098999023, + "epoch": 0.5177142857142857, + "grad_norm": 0.5472725033760071, + "kl": 0.28240966796875, + "lambda_div_used": 0.6, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0214, + "reward": 0.35070702666416764, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35070702666416764, + "reward_after_std": 0.8372916504740715, + "reward_before_mean": 0.9138406561687589, + "reward_before_std": 0.7884054854512215, + "reward_change_max": 0.0006369054317474365, + "reward_change_mean": -0.5631336495280266, + "reward_change_min": -0.9314819648861885, + "reward_change_std": 0.3668592553585768, + "reward_std": 0.8372916728258133, + "rewards/cosine_scaled_reward": 0.08192032761871815, + "rewards/format_reward": 0.7500000149011612, + "step": 453 + }, + { + "advantage_max": 1.4231568723917007, + "advantage_mean": -1.9247334115402026e-08, + "advantage_min": -0.832726463675499, + "advantage_std": 0.793323565274477, + "completion_length": 2578.4375610351562, + "epoch": 0.5188571428571429, + "grad_norm": 1.011570692062378, + "kl": 0.17710113525390625, + "lambda_div_used": 0.6, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0356, + "reward": 0.23797382973134518, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23797382973134518, + "reward_after_std": 0.7933235578238964, + "reward_before_mean": 0.7599361706525087, + "reward_before_std": 0.7715496458113194, + "reward_change_max": 0.0, + "reward_change_mean": -0.5219623260200024, + "reward_change_min": -0.96772700548172, + "reward_change_std": 0.3597046695649624, + "reward_std": 0.7933235876262188, + "rewards/cosine_scaled_reward": -0.01586526818573475, + "rewards/format_reward": 0.7916666828095913, + "step": 454 + }, + { + "advantage_max": 1.1031616851687431, + "advantage_mean": 1.4590720687213121e-08, + "advantage_min": -0.5992041826248169, + "advantage_std": 0.6113407611846924, + "completion_length": 3231.7709350585938, + "epoch": 0.52, + "grad_norm": 1.4600268602371216, + "kl": 0.35595703125, + "lambda_div_used": 0.6, + "learning_rate": 1.220245676671809e-07, + "loss": -0.0044, + "reward": 0.026436822256073356, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.026436822256073356, + "reward_after_std": 0.6113407723605633, + "reward_before_mean": 0.46537392027676105, + "reward_before_std": 0.5697146020829678, + "reward_change_max": 0.0, + "reward_change_mean": -0.43893709033727646, + "reward_change_min": -0.7486898303031921, + "reward_change_std": 0.28901602514088154, + "reward_std": 0.6113407835364342, + "rewards/cosine_scaled_reward": -0.12147971335798502, + "rewards/format_reward": 0.7083333395421505, + "step": 455 + }, + { + "advantage_max": 1.0907672867178917, + "advantage_mean": -1.1796753296433593e-08, + "advantage_min": -0.6506773978471756, + "advantage_std": 0.6137639144435525, + "completion_length": 2944.854217529297, + "epoch": 0.5211428571428571, + "grad_norm": 0.48930859565734863, + "kl": 0.28680419921875, + "lambda_div_used": 0.6, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.012, + "reward": 0.017696384878945537, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.017696384878945537, + "reward_after_std": 0.6137639237567782, + "reward_before_mean": 0.44923659786581993, + "reward_before_std": 0.5944715887308121, + "reward_change_max": 0.000657103955745697, + "reward_change_mean": -0.4315402414649725, + "reward_change_min": -0.7416375353932381, + "reward_change_std": 0.29969449946656823, + "reward_std": 0.6137639349326491, + "rewards/cosine_scaled_reward": -0.09829837083816528, + "rewards/format_reward": 0.6458333488553762, + "step": 456 + }, + { + "advantage_max": 1.2821914702653885, + "advantage_mean": 5.5879355587151736e-09, + "advantage_min": -0.6453388333320618, + "advantage_std": 0.7012712173163891, + "completion_length": 2911.1875762939453, + "epoch": 0.5222857142857142, + "grad_norm": 0.5257925391197205, + "kl": 0.2884521484375, + "lambda_div_used": 0.6, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0062, + "reward": 0.1664595603942871, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1664595603942871, + "reward_after_std": 0.7012712173163891, + "reward_before_mean": 0.6591437275055796, + "reward_before_std": 0.6436357796192169, + "reward_change_max": 0.0, + "reward_change_mean": -0.49268414825201035, + "reward_change_min": -0.8301557153463364, + "reward_change_std": 0.3239094000309706, + "reward_std": 0.70127122849226, + "rewards/cosine_scaled_reward": -0.003761494532227516, + "rewards/format_reward": 0.6666666828095913, + "step": 457 + }, + { + "advantage_max": 1.6441849321126938, + "advantage_mean": 9.313226079221693e-09, + "advantage_min": -0.7769365385174751, + "advantage_std": 0.8987438268959522, + "completion_length": 2578.0208740234375, + "epoch": 0.5234285714285715, + "grad_norm": 1.0878689289093018, + "kl": 0.279937744140625, + "lambda_div_used": 0.6, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0544, + "reward": 0.027010804347810335, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.027010804347810335, + "reward_after_std": 0.8987438231706619, + "reward_before_mean": 0.40003128722310066, + "reward_before_std": 0.9050403535366058, + "reward_change_max": 0.0013420060276985168, + "reward_change_mean": -0.37302049063146114, + "reward_change_min": -0.7793563231825829, + "reward_change_std": 0.30434579588472843, + "reward_std": 0.8987438529729843, + "rewards/cosine_scaled_reward": -0.14373437548056245, + "rewards/format_reward": 0.6875000186264515, + "step": 458 + }, + { + "advantage_max": 1.384127452969551, + "advantage_mean": -2.4835269396561444e-08, + "advantage_min": -0.697060227394104, + "advantage_std": 0.7580831982195377, + "completion_length": 2400.2708892822266, + "epoch": 0.5245714285714286, + "grad_norm": 0.5284119844436646, + "kl": 0.16412353515625, + "lambda_div_used": 0.6, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.013, + "reward": 0.5363429971039295, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5363429971039295, + "reward_after_std": 0.7580831833183765, + "reward_before_mean": 1.2248572246171534, + "reward_before_std": 0.6392851062119007, + "reward_change_max": 0.0005814731121063232, + "reward_change_mean": -0.688514206558466, + "reward_change_min": -1.1085792183876038, + "reward_change_std": 0.4158451687544584, + "reward_std": 0.7580831982195377, + "rewards/cosine_scaled_reward": 0.20617857947945595, + "rewards/format_reward": 0.8125000055879354, + "step": 459 + }, + { + "advantage_max": 1.4096355810761452, + "advantage_mean": 1.552204503818544e-09, + "advantage_min": -0.7541556470096111, + "advantage_std": 0.7679723687469959, + "completion_length": 3067.708450317383, + "epoch": 0.5257142857142857, + "grad_norm": 0.5955216288566589, + "kl": 0.3953094482421875, + "lambda_div_used": 0.6, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0453, + "reward": 0.04936068008100847, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04936068008100847, + "reward_after_std": 0.7679723650217056, + "reward_before_mean": 0.4637445118278265, + "reward_before_std": 0.7469018138945103, + "reward_change_max": 0.0, + "reward_change_mean": -0.41438381001353264, + "reward_change_min": -0.7415559887886047, + "reward_change_std": 0.3010368328541517, + "reward_std": 0.7679724022746086, + "rewards/cosine_scaled_reward": -0.12229442107491195, + "rewards/format_reward": 0.7083333395421505, + "step": 460 + }, + { + "advantage_max": 1.6185405403375626, + "advantage_mean": -1.3659398279131096e-08, + "advantage_min": -0.8021063134074211, + "advantage_std": 0.8745506145060062, + "completion_length": 3010.8750915527344, + "epoch": 0.5268571428571428, + "grad_norm": 0.9823641777038574, + "kl": 0.326171875, + "lambda_div_used": 0.6, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0702, + "reward": 0.17612256668508053, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17612256668508053, + "reward_after_std": 0.8745506182312965, + "reward_before_mean": 0.6361170820891857, + "reward_before_std": 0.8338725045323372, + "reward_change_max": 0.0004089474678039551, + "reward_change_mean": -0.4599945154041052, + "reward_change_min": -0.8391226977109909, + "reward_change_std": 0.3295242711901665, + "reward_std": 0.8745506331324577, + "rewards/cosine_scaled_reward": -0.02569146826863289, + "rewards/format_reward": 0.6875000111758709, + "step": 461 + }, + { + "advantage_max": 1.3718049079179764, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.573211383074522, + "advantage_std": 0.7311573587357998, + "completion_length": 2979.958396911621, + "epoch": 0.528, + "grad_norm": 0.5489564538002014, + "kl": 0.288818359375, + "lambda_div_used": 0.6, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0419, + "reward": -0.05020551884081215, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05020551884081215, + "reward_after_std": 0.7311573512852192, + "reward_before_mean": 0.31298802606761456, + "reward_before_std": 0.6865883357822895, + "reward_change_max": 0.00015259534120559692, + "reward_change_mean": -0.36319353617727757, + "reward_change_min": -0.7135965973138809, + "reward_change_std": 0.2655666396021843, + "reward_std": 0.7311573661863804, + "rewards/cosine_scaled_reward": -0.19767267780844122, + "rewards/format_reward": 0.708333345130086, + "step": 462 + }, + { + "advantage_max": 1.2812325581908226, + "advantage_mean": 3.1044081749698194e-09, + "advantage_min": -0.7424813508987427, + "advantage_std": 0.7148092705756426, + "completion_length": 3094.0208740234375, + "epoch": 0.5291428571428571, + "grad_norm": 0.6410408020019531, + "kl": 0.373321533203125, + "lambda_div_used": 0.6, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0467, + "reward": 0.0862201638519764, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0862201638519764, + "reward_after_std": 0.7148092705756426, + "reward_before_mean": 0.5360564319416881, + "reward_before_std": 0.6975500099360943, + "reward_change_max": 0.0016349032521247864, + "reward_change_mean": -0.44983627228066325, + "reward_change_min": -0.7784820459783077, + "reward_change_std": 0.31272281985729933, + "reward_std": 0.7148093041032553, + "rewards/cosine_scaled_reward": -0.04447178915143013, + "rewards/format_reward": 0.6250000093132257, + "step": 463 + }, + { + "advantage_max": 1.2580845430493355, + "advantage_mean": -9.313227411489322e-10, + "advantage_min": -0.46518684923648834, + "advantage_std": 0.6540076658129692, + "completion_length": 2200.0000228881836, + "epoch": 0.5302857142857142, + "grad_norm": 0.8608336448669434, + "kl": 0.327178955078125, + "lambda_div_used": 0.6, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0051, + "reward": 0.27756172651425004, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.27756172651425004, + "reward_after_std": 0.654007650911808, + "reward_before_mean": 0.832300890237093, + "reward_before_std": 0.48632822558283806, + "reward_change_max": 0.0, + "reward_change_mean": -0.5547391846776009, + "reward_change_min": -0.855882078409195, + "reward_change_std": 0.31370352767407894, + "reward_std": 0.6540076583623886, + "rewards/cosine_scaled_reward": -0.0005162106826901436, + "rewards/format_reward": 0.8333333507180214, + "step": 464 + }, + { + "advantage_max": 1.569391205906868, + "advantage_mean": -1.2107193803068128e-08, + "advantage_min": -0.8988099247217178, + "advantage_std": 0.881404098123312, + "completion_length": 3097.979248046875, + "epoch": 0.5314285714285715, + "grad_norm": 0.6101652383804321, + "kl": 0.368743896484375, + "lambda_div_used": 0.6, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.023, + "reward": 0.03458056226372719, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03458056226372719, + "reward_after_std": 0.8814041279256344, + "reward_before_mean": 0.42546552093699574, + "reward_before_std": 0.9255600795149803, + "reward_change_max": 0.0013052746653556824, + "reward_change_mean": -0.39088496938347816, + "reward_change_min": -0.7886900715529919, + "reward_change_std": 0.3330534026026726, + "reward_std": 0.8814041391015053, + "rewards/cosine_scaled_reward": -0.08935058303177357, + "rewards/format_reward": 0.604166679084301, + "step": 465 + }, + { + "advantage_max": 1.9037173837423325, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.8234491348266602, + "advantage_std": 1.006118580698967, + "completion_length": 3100.5833892822266, + "epoch": 0.5325714285714286, + "grad_norm": 1.2542622089385986, + "kl": 0.33632659912109375, + "lambda_div_used": 0.6, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0639, + "reward": 0.28048246819525957, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.28048246819525957, + "reward_after_std": 1.006118580698967, + "reward_before_mean": 0.7699800729751587, + "reward_before_std": 0.9303984567523003, + "reward_change_max": 0.0, + "reward_change_mean": -0.4894975833594799, + "reward_change_min": -0.9491491317749023, + "reward_change_std": 0.35173042491078377, + "reward_std": 1.006118580698967, + "rewards/cosine_scaled_reward": 0.10374002461321652, + "rewards/format_reward": 0.562500013038516, + "step": 466 + }, + { + "advantage_max": 1.1005533039569855, + "advantage_mean": 1.0865429111994729e-09, + "advantage_min": -0.6562899611890316, + "advantage_std": 0.6144971549510956, + "completion_length": 3035.5833892822266, + "epoch": 0.5337142857142857, + "grad_norm": 0.6637769341468811, + "kl": 0.325714111328125, + "lambda_div_used": 0.6, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0167, + "reward": 0.05374788446351886, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05374788446351886, + "reward_after_std": 0.6144971549510956, + "reward_before_mean": 0.5047153122723103, + "reward_before_std": 0.572929710149765, + "reward_change_max": 0.0007006600499153137, + "reward_change_mean": -0.4509674310684204, + "reward_change_min": -0.7618739381432533, + "reward_change_std": 0.29695762135088444, + "reward_std": 0.6144971884787083, + "rewards/cosine_scaled_reward": -0.14347568154335022, + "rewards/format_reward": 0.7916666828095913, + "step": 467 + }, + { + "advantage_max": 1.8903379365801811, + "advantage_mean": -2.0178656356950597e-08, + "advantage_min": -0.9778851941227913, + "advantage_std": 1.0337693467736244, + "completion_length": 3057.291717529297, + "epoch": 0.5348571428571428, + "grad_norm": 1.200481653213501, + "kl": 0.3438720703125, + "lambda_div_used": 0.6, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0748, + "reward": 0.42284363880753517, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.42284363880753517, + "reward_after_std": 1.0337693840265274, + "reward_before_mean": 0.9913366120308638, + "reward_before_std": 1.0011842101812363, + "reward_change_max": 0.0007850900292396545, + "reward_change_mean": -0.5684929899871349, + "reward_change_min": -1.0620117411017418, + "reward_change_std": 0.40233112312853336, + "reward_std": 1.033769391477108, + "rewards/cosine_scaled_reward": 0.08941829390823841, + "rewards/format_reward": 0.8125000186264515, + "step": 468 + }, + { + "advantage_max": 1.535146877169609, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.8611436411738396, + "advantage_std": 0.8365602642297745, + "completion_length": 2808.9583892822266, + "epoch": 0.536, + "grad_norm": 0.8671948313713074, + "kl": 0.34912109375, + "lambda_div_used": 0.6, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0171, + "reward": 0.35874230810441077, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35874230810441077, + "reward_after_std": 0.8365602716803551, + "reward_before_mean": 0.9347008776385337, + "reward_before_std": 0.7686922624707222, + "reward_change_max": 0.0, + "reward_change_mean": -0.5759586170315742, + "reward_change_min": -0.9069937616586685, + "reward_change_std": 0.3752599321305752, + "reward_std": 0.8365602791309357, + "rewards/cosine_scaled_reward": 0.05068379477597773, + "rewards/format_reward": 0.8333333507180214, + "step": 469 + }, + { + "advantage_max": 1.4765567556023598, + "advantage_mean": -1.4280279347911318e-08, + "advantage_min": -0.8885560110211372, + "advantage_std": 0.8526263982057571, + "completion_length": 3234.416748046875, + "epoch": 0.5371428571428571, + "grad_norm": 0.4323049783706665, + "kl": 0.38311767578125, + "lambda_div_used": 0.6, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0266, + "reward": 0.08719077915884554, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08719077915884554, + "reward_after_std": 0.8526264131069183, + "reward_before_mean": 0.5135673470795155, + "reward_before_std": 0.9073382653295994, + "reward_change_max": 0.0005387812852859497, + "reward_change_mean": -0.42637658305466175, + "reward_change_min": -0.8534897528588772, + "reward_change_std": 0.3533334955573082, + "reward_std": 0.8526264280080795, + "rewards/cosine_scaled_reward": -0.05571634043008089, + "rewards/format_reward": 0.6250000204890966, + "step": 470 + }, + { + "advantage_max": 1.6796835511922836, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.8010094054043293, + "advantage_std": 0.893382478505373, + "completion_length": 3111.104248046875, + "epoch": 0.5382857142857143, + "grad_norm": 0.856365978717804, + "kl": 0.4088134765625, + "lambda_div_used": 0.6, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0227, + "reward": 0.2544027629774064, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2544027629774064, + "reward_after_std": 0.8933824747800827, + "reward_before_mean": 0.750761479139328, + "reward_before_std": 0.8166644982993603, + "reward_change_max": 0.00011660903692245483, + "reward_change_mean": -0.4963587149977684, + "reward_change_min": -0.8781631253659725, + "reward_change_std": 0.3329938519746065, + "reward_std": 0.8933825083076954, + "rewards/cosine_scaled_reward": -0.020452602300792933, + "rewards/format_reward": 0.7916666753590107, + "step": 471 + }, + { + "advantage_max": 1.6807861477136612, + "advantage_mean": -4.346172144398253e-09, + "advantage_min": -0.7541506327688694, + "advantage_std": 0.896699994802475, + "completion_length": 2950.666748046875, + "epoch": 0.5394285714285715, + "grad_norm": 0.8270246386528015, + "kl": 0.33209228515625, + "lambda_div_used": 0.6, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.037, + "reward": 0.09066922077909112, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09066922077909112, + "reward_after_std": 0.896699994802475, + "reward_before_mean": 0.4971227226778865, + "reward_before_std": 0.8639278411865234, + "reward_change_max": 0.00027048587799072266, + "reward_change_mean": -0.40645348466932774, + "reward_change_min": -0.7924029342830181, + "reward_change_std": 0.298714161850512, + "reward_std": 0.8967000283300877, + "rewards/cosine_scaled_reward": -0.11602198891341686, + "rewards/format_reward": 0.7291666772216558, + "step": 472 + }, + { + "advantage_max": 1.2666622251272202, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.7336488291621208, + "advantage_std": 0.7046432234346867, + "completion_length": 2937.6458892822266, + "epoch": 0.5405714285714286, + "grad_norm": 0.6345215439796448, + "kl": 0.26678466796875, + "lambda_div_used": 0.6, + "learning_rate": 1.0797073717209013e-07, + "loss": -0.0034, + "reward": 0.24014693347271532, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24014693347271532, + "reward_after_std": 0.7046432085335255, + "reward_before_mean": 0.7769346954301, + "reward_before_std": 0.6478455290198326, + "reward_change_max": 0.00018671154975891113, + "reward_change_mean": -0.5367877427488565, + "reward_change_min": -0.9317773431539536, + "reward_change_std": 0.3534272387623787, + "reward_std": 0.7046432197093964, + "rewards/cosine_scaled_reward": 0.013467340730130672, + "rewards/format_reward": 0.7500000167638063, + "step": 473 + }, + { + "advantage_max": 1.8945399820804596, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.7374499104917049, + "advantage_std": 0.9751530736684799, + "completion_length": 2611.479217529297, + "epoch": 0.5417142857142857, + "grad_norm": 0.5259870886802673, + "kl": 0.29508209228515625, + "lambda_div_used": 0.6, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0486, + "reward": 0.3512391453841701, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3512391453841701, + "reward_after_std": 0.9751530811190605, + "reward_before_mean": 0.8826748724095523, + "reward_before_std": 0.8347831852734089, + "reward_change_max": 0.0, + "reward_change_mean": -0.5314356926828623, + "reward_change_min": -0.868693646043539, + "reward_change_std": 0.3366575762629509, + "reward_std": 0.9751530885696411, + "rewards/cosine_scaled_reward": 0.07675410318188369, + "rewards/format_reward": 0.7291666772216558, + "step": 474 + }, + { + "advantage_max": 1.640965461730957, + "advantage_mean": 1.986821529520455e-08, + "advantage_min": -0.6341977827250957, + "advantage_std": 0.8619020059704781, + "completion_length": 2496.4167098999023, + "epoch": 0.5428571428571428, + "grad_norm": 0.5010621547698975, + "kl": 0.47027587890625, + "lambda_div_used": 0.6, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0417, + "reward": 0.2569463880499825, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2569463880499825, + "reward_after_std": 0.8619019910693169, + "reward_before_mean": 0.7555128782987595, + "reward_before_std": 0.7467895969748497, + "reward_change_max": 0.0006568357348442078, + "reward_change_mean": -0.49856647476553917, + "reward_change_min": -0.8757809400558472, + "reward_change_std": 0.3343859352171421, + "reward_std": 0.8619020283222198, + "rewards/cosine_scaled_reward": 0.0965064475312829, + "rewards/format_reward": 0.5625000018626451, + "step": 475 + }, + { + "advantage_max": 1.956729143857956, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.9143726825714111, + "advantage_std": 1.0552341118454933, + "completion_length": 3005.5625915527344, + "epoch": 0.544, + "grad_norm": 2.4432625770568848, + "kl": 0.372802734375, + "lambda_div_used": 0.6, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0981, + "reward": 0.24209812004119158, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24209812004119158, + "reward_after_std": 1.0552341118454933, + "reward_before_mean": 0.7057913790922612, + "reward_before_std": 1.0380639657378197, + "reward_change_max": 0.0, + "reward_change_mean": -0.46369326300919056, + "reward_change_min": -0.9631114266812801, + "reward_change_std": 0.36324442923069, + "reward_std": 1.055234156548977, + "rewards/cosine_scaled_reward": -0.032520990062039346, + "rewards/format_reward": 0.7708333507180214, + "step": 476 + }, + { + "advantage_max": 1.6627508848905563, + "advantage_mean": 5.587935947293232e-09, + "advantage_min": -0.7512878328561783, + "advantage_std": 0.8839316442608833, + "completion_length": 2252.5000915527344, + "epoch": 0.5451428571428572, + "grad_norm": 1.535606861114502, + "kl": 0.16825103759765625, + "lambda_div_used": 0.6, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0511, + "reward": 0.6627893559634686, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6627893559634686, + "reward_after_std": 0.8839316368103027, + "reward_before_mean": 1.3912963923066854, + "reward_before_std": 0.7169366031885147, + "reward_change_max": 0.0, + "reward_change_mean": -0.7285070493817329, + "reward_change_min": -1.05146986246109, + "reward_change_std": 0.4186026845127344, + "reward_std": 0.8839316628873348, + "rewards/cosine_scaled_reward": 0.22689818311482668, + "rewards/format_reward": 0.9375000074505806, + "step": 477 + }, + { + "advantage_max": 1.5479968041181564, + "advantage_mean": -9.934107314535368e-09, + "advantage_min": -0.6975952759385109, + "advantage_std": 0.8103221245110035, + "completion_length": 2727.8125762939453, + "epoch": 0.5462857142857143, + "grad_norm": 0.9718415141105652, + "kl": 0.2834930419921875, + "lambda_div_used": 0.6, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0349, + "reward": 0.01596967503428459, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.01596967503428459, + "reward_after_std": 0.8103221245110035, + "reward_before_mean": 0.39516052044928074, + "reward_before_std": 0.7432158552110195, + "reward_change_max": 0.0012557506561279297, + "reward_change_mean": -0.37919086311012506, + "reward_change_min": -0.6728931628167629, + "reward_change_std": 0.2619459554553032, + "reward_std": 0.8103221319615841, + "rewards/cosine_scaled_reward": -0.0940864197909832, + "rewards/format_reward": 0.5833333525806665, + "step": 478 + }, + { + "advantage_max": 1.5282156020402908, + "advantage_mean": -7.45058065243498e-09, + "advantage_min": -0.6081922184675932, + "advantage_std": 0.7922434285283089, + "completion_length": 2916.1250610351562, + "epoch": 0.5474285714285714, + "grad_norm": 0.396575003862381, + "kl": 0.3342437744140625, + "lambda_div_used": 0.6, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0481, + "reward": 0.1486353098298423, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1486353098298423, + "reward_after_std": 0.7922434583306313, + "reward_before_mean": 0.6048104707151651, + "reward_before_std": 0.6755081471055746, + "reward_change_max": 0.0, + "reward_change_mean": -0.45617516711354256, + "reward_change_min": -0.7893848493695259, + "reward_change_std": 0.2866050563752651, + "reward_std": 0.7922434769570827, + "rewards/cosine_scaled_reward": -0.1350947736063972, + "rewards/format_reward": 0.8750000149011612, + "step": 479 + }, + { + "advantage_max": 1.4838635623455048, + "advantage_mean": -1.0554989604560916e-08, + "advantage_min": -0.6728080175817013, + "advantage_std": 0.8044894188642502, + "completion_length": 2724.4792404174805, + "epoch": 0.5485714285714286, + "grad_norm": 0.9163870215415955, + "kl": 0.307281494140625, + "lambda_div_used": 0.6, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0065, + "reward": 0.14659792685415596, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14659792685415596, + "reward_after_std": 0.8044894188642502, + "reward_before_mean": 0.6072377488017082, + "reward_before_std": 0.7544484827667475, + "reward_change_max": 0.0, + "reward_change_mean": -0.46063981391489506, + "reward_change_min": -0.8319177739322186, + "reward_change_std": 0.3214505556970835, + "reward_std": 0.8044894523918629, + "rewards/cosine_scaled_reward": -0.04013113956898451, + "rewards/format_reward": 0.6875000055879354, + "step": 480 + }, + { + "advantage_max": 1.359544724225998, + "advantage_mean": 1.2728075482471013e-08, + "advantage_min": -0.5560879930853844, + "advantage_std": 0.7110361345112324, + "completion_length": 3155.666778564453, + "epoch": 0.5497142857142857, + "grad_norm": 0.54989093542099, + "kl": 0.33892822265625, + "lambda_div_used": 0.6, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0236, + "reward": 0.14918394200503826, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14918394200503826, + "reward_after_std": 0.7110361270606518, + "reward_before_mean": 0.6284264158457518, + "reward_before_std": 0.6068124901503325, + "reward_change_max": 0.0010639280080795288, + "reward_change_mean": -0.4792424812912941, + "reward_change_min": -0.8376931995153427, + "reward_change_std": 0.2971576862037182, + "reward_std": 0.7110361494123936, + "rewards/cosine_scaled_reward": -0.11287012998946011, + "rewards/format_reward": 0.8541666828095913, + "step": 481 + }, + { + "advantage_max": 1.7242258936166763, + "advantage_mean": -2.6697914601303552e-08, + "advantage_min": -0.9280278235673904, + "advantage_std": 0.9408847987651825, + "completion_length": 2702.333351135254, + "epoch": 0.5508571428571428, + "grad_norm": 0.807201623916626, + "kl": 0.326751708984375, + "lambda_div_used": 0.6, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0343, + "reward": 0.3970525776967406, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3970525776967406, + "reward_after_std": 0.9408847987651825, + "reward_before_mean": 0.9709220081567764, + "reward_before_std": 0.8880007974803448, + "reward_change_max": 0.0015290752053260803, + "reward_change_mean": -0.5738694509491324, + "reward_change_min": -0.9650955460965633, + "reward_change_std": 0.3934989022091031, + "reward_std": 0.9408848136663437, + "rewards/cosine_scaled_reward": 0.13129432266578078, + "rewards/format_reward": 0.7083333376795053, + "step": 482 + }, + { + "advantage_max": 1.142264649271965, + "advantage_mean": -1.0554989049449404e-08, + "advantage_min": -0.6997229307889938, + "advantage_std": 0.639618307352066, + "completion_length": 3028.104232788086, + "epoch": 0.552, + "grad_norm": 0.48145022988319397, + "kl": 0.388519287109375, + "lambda_div_used": 0.6, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0323, + "reward": 0.07725562807172537, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07725562807172537, + "reward_after_std": 0.6396183036267757, + "reward_before_mean": 0.53841517935507, + "reward_before_std": 0.6105475910007954, + "reward_change_max": 0.0, + "reward_change_mean": -0.4611595496535301, + "reward_change_min": -0.7524649910628796, + "reward_change_std": 0.3050990607589483, + "reward_std": 0.639618307352066, + "rewards/cosine_scaled_reward": -0.10579242091625929, + "rewards/format_reward": 0.7500000111758709, + "step": 483 + }, + { + "advantage_max": 1.4299253299832344, + "advantage_mean": -3.725290520506519e-09, + "advantage_min": -0.8638685494661331, + "advantage_std": 0.8112721741199493, + "completion_length": 2579.7708892822266, + "epoch": 0.5531428571428572, + "grad_norm": 1.489706039428711, + "kl": 0.2846221923828125, + "lambda_div_used": 0.6, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0702, + "reward": 0.3688160046003759, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3688160046003759, + "reward_after_std": 0.8112721666693687, + "reward_before_mean": 0.9602692160988227, + "reward_before_std": 0.7896840088069439, + "reward_change_max": 0.0, + "reward_change_mean": -0.5914532169699669, + "reward_change_min": -1.0330743491649628, + "reward_change_std": 0.4153981562703848, + "reward_std": 0.8112721815705299, + "rewards/cosine_scaled_reward": 0.09471793798729777, + "rewards/format_reward": 0.7708333469927311, + "step": 484 + }, + { + "advantage_max": 1.6329149454832077, + "advantage_mean": -1.8005569923928988e-08, + "advantage_min": -0.8053278736770153, + "advantage_std": 0.8814475685358047, + "completion_length": 2728.9375610351562, + "epoch": 0.5542857142857143, + "grad_norm": 0.6401396989822388, + "kl": 0.4595947265625, + "lambda_div_used": 0.6, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0341, + "reward": 0.18104042625054717, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18104042625054717, + "reward_after_std": 0.8814475610852242, + "reward_before_mean": 0.6409261487424374, + "reward_before_std": 0.8435741513967514, + "reward_change_max": 0.0005346983671188354, + "reward_change_mean": -0.45988572016358376, + "reward_change_min": -0.8481529578566551, + "reward_change_std": 0.3186565674841404, + "reward_std": 0.8814475685358047, + "rewards/cosine_scaled_reward": -0.07537028007209301, + "rewards/format_reward": 0.7916666939854622, + "step": 485 + }, + { + "advantage_max": 1.2459972277283669, + "advantage_mean": -1.4280279680978225e-08, + "advantage_min": -0.5129361264407635, + "advantage_std": 0.6458823718130589, + "completion_length": 2616.5833740234375, + "epoch": 0.5554285714285714, + "grad_norm": 0.7087405323982239, + "kl": 0.3177337646484375, + "lambda_div_used": 0.6, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0197, + "reward": 0.12188614474143833, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12188614474143833, + "reward_after_std": 0.64588238671422, + "reward_before_mean": 0.5966749314684421, + "reward_before_std": 0.502772644162178, + "reward_change_max": 0.0, + "reward_change_mean": -0.47478877753019333, + "reward_change_min": -0.7171571180224419, + "reward_change_std": 0.27346576750278473, + "reward_std": 0.6458823904395103, + "rewards/cosine_scaled_reward": -0.1287458804436028, + "rewards/format_reward": 0.8541666865348816, + "step": 486 + }, + { + "advantage_max": 1.5544669702649117, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.5799307525157928, + "advantage_std": 0.7964937537908554, + "completion_length": 2185.62504196167, + "epoch": 0.5565714285714286, + "grad_norm": 0.7838549613952637, + "kl": 0.2025604248046875, + "lambda_div_used": 0.6, + "learning_rate": 1.0185202062281336e-07, + "loss": -0.007, + "reward": 0.4715829244814813, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4715829244814813, + "reward_after_std": 0.7964937686920166, + "reward_before_mean": 1.106121625751257, + "reward_before_std": 0.5940053351223469, + "reward_change_max": 0.0, + "reward_change_mean": -0.6345386989414692, + "reward_change_min": -0.960016280412674, + "reward_change_std": 0.3513956777751446, + "reward_std": 0.7964937686920166, + "rewards/cosine_scaled_reward": 0.07389411079930142, + "rewards/format_reward": 0.9583333432674408, + "step": 487 + }, + { + "advantage_max": 1.1480357646942139, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.5942578949034214, + "advantage_std": 0.636834591627121, + "completion_length": 2252.604179382324, + "epoch": 0.5577142857142857, + "grad_norm": 0.34447166323661804, + "kl": 0.24651336669921875, + "lambda_div_used": 0.6, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0124, + "reward": 0.09369122982025146, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09369122982025146, + "reward_after_std": 0.6368345841765404, + "reward_before_mean": 0.5591697334311903, + "reward_before_std": 0.591731121763587, + "reward_change_max": 0.0017108246684074402, + "reward_change_mean": -0.46547851897776127, + "reward_change_min": -0.8035316653549671, + "reward_change_std": 0.3129301369190216, + "reward_std": 0.636834591627121, + "rewards/cosine_scaled_reward": -0.0745818018913269, + "rewards/format_reward": 0.7083333469927311, + "step": 488 + }, + { + "advantage_max": 1.1497306674718857, + "advantage_mean": 3.4148495420271985e-09, + "advantage_min": -0.536426167935133, + "advantage_std": 0.6116931214928627, + "completion_length": 3402.8541870117188, + "epoch": 0.5588571428571428, + "grad_norm": 1.1072264909744263, + "kl": 0.534423828125, + "lambda_div_used": 0.6, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0316, + "reward": -0.22755743563175201, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22755743563175201, + "reward_after_std": 0.6116931177675724, + "reward_before_mean": 0.061076946556568146, + "reward_before_std": 0.5797216258943081, + "reward_change_max": 0.00025550276041030884, + "reward_change_mean": -0.2886343817226589, + "reward_change_min": -0.5480538904666901, + "reward_change_std": 0.20898331236094236, + "reward_std": 0.6116931326687336, + "rewards/cosine_scaled_reward": -0.24029485508799553, + "rewards/format_reward": 0.5416666828095913, + "step": 489 + }, + { + "advantage_max": 1.19098761677742, + "advantage_mean": -9.3132264122886e-09, + "advantage_min": -0.5116630420088768, + "advantage_std": 0.6245383583009243, + "completion_length": 2225.3334045410156, + "epoch": 0.56, + "grad_norm": 0.4310756027698517, + "kl": 0.24271392822265625, + "lambda_div_used": 0.6, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0228, + "reward": 0.11366786062717438, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11366786062717438, + "reward_after_std": 0.624538354575634, + "reward_before_mean": 0.5907516656443477, + "reward_before_std": 0.4991147108376026, + "reward_change_max": 0.0008979067206382751, + "reward_change_mean": -0.47708383947610855, + "reward_change_min": -0.7438569702208042, + "reward_change_std": 0.27903275191783905, + "reward_std": 0.6245383620262146, + "rewards/cosine_scaled_reward": -0.1421241695061326, + "rewards/format_reward": 0.8750000111758709, + "step": 490 + }, + { + "advantage_max": 1.4718172997236252, + "advantage_mean": -1.1796753074388988e-08, + "advantage_min": -0.6949670314788818, + "advantage_std": 0.7889797687530518, + "completion_length": 2863.437545776367, + "epoch": 0.5611428571428572, + "grad_norm": 0.5122021436691284, + "kl": 0.44110107421875, + "lambda_div_used": 0.6, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0421, + "reward": 0.45797410421073437, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.45797410421073437, + "reward_after_std": 0.7889797762036324, + "reward_before_mean": 1.0883095804601908, + "reward_before_std": 0.6660438068211079, + "reward_change_max": 0.0016724318265914917, + "reward_change_mean": -0.6303354352712631, + "reward_change_min": -1.0100024230778217, + "reward_change_std": 0.382383581250906, + "reward_std": 0.7889798246324062, + "rewards/cosine_scaled_reward": 0.1899880999699235, + "rewards/format_reward": 0.7083333395421505, + "step": 491 + }, + { + "advantage_max": 1.319904811680317, + "advantage_mean": -1.80055704790405e-08, + "advantage_min": -0.8290270902216434, + "advantage_std": 0.7442688755691051, + "completion_length": 2714.7917404174805, + "epoch": 0.5622857142857143, + "grad_norm": 0.6903409361839294, + "kl": 0.3553466796875, + "lambda_div_used": 0.6, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0379, + "reward": 0.3523098239675164, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3523098239675164, + "reward_after_std": 0.7442688718438148, + "reward_before_mean": 0.9426992423832417, + "reward_before_std": 0.6989971324801445, + "reward_change_max": 0.0, + "reward_change_mean": -0.5903894305229187, + "reward_change_min": -0.9812662154436111, + "reward_change_std": 0.38451446406543255, + "reward_std": 0.7442688792943954, + "rewards/cosine_scaled_reward": 0.04426628723740578, + "rewards/format_reward": 0.854166679084301, + "step": 492 + }, + { + "advantage_max": 1.611944667994976, + "advantage_mean": -4.221995775210985e-08, + "advantage_min": -0.8087064102292061, + "advantage_std": 0.8931445516645908, + "completion_length": 2369.81258392334, + "epoch": 0.5634285714285714, + "grad_norm": 0.427365779876709, + "kl": 0.24420928955078125, + "lambda_div_used": 0.6, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0232, + "reward": 0.5566324144601822, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5566324144601822, + "reward_after_std": 0.8931445516645908, + "reward_before_mean": 1.229935996234417, + "reward_before_std": 0.8107591010630131, + "reward_change_max": 0.0006113573908805847, + "reward_change_mean": -0.6733036190271378, + "reward_change_min": -1.150217853486538, + "reward_change_std": 0.44052213057875633, + "reward_std": 0.8931446000933647, + "rewards/cosine_scaled_reward": 0.14621799066662788, + "rewards/format_reward": 0.9375000074505806, + "step": 493 + }, + { + "advantage_max": 2.0662256479263306, + "advantage_mean": -1.940255400789681e-08, + "advantage_min": -1.025919608771801, + "advantage_std": 1.1447409093379974, + "completion_length": 2289.3125610351562, + "epoch": 0.5645714285714286, + "grad_norm": 1.5831674337387085, + "kl": 0.2780609130859375, + "lambda_div_used": 0.6, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0502, + "reward": 0.47046585264615715, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.47046585264615715, + "reward_after_std": 1.1447409093379974, + "reward_before_mean": 1.0404850542545319, + "reward_before_std": 1.1501704677939415, + "reward_change_max": 0.000872880220413208, + "reward_change_mean": -0.5700191967189312, + "reward_change_min": -1.2041919827461243, + "reward_change_std": 0.44489174522459507, + "reward_std": 1.1447409093379974, + "rewards/cosine_scaled_reward": 0.08274251967668533, + "rewards/format_reward": 0.8750000223517418, + "step": 494 + }, + { + "advantage_max": 1.207513116300106, + "advantage_mean": -2.017865596837254e-08, + "advantage_min": -0.6101922206580639, + "advantage_std": 0.6588433645665646, + "completion_length": 2922.6458740234375, + "epoch": 0.5657142857142857, + "grad_norm": 0.5391934514045715, + "kl": 0.37164306640625, + "lambda_div_used": 0.6, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0271, + "reward": 0.2596977346111089, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2596977346111089, + "reward_after_std": 0.6588433682918549, + "reward_before_mean": 0.8155031725764275, + "reward_before_std": 0.55685312487185, + "reward_change_max": 0.0, + "reward_change_mean": -0.5558054614812136, + "reward_change_min": -0.8851585574448109, + "reward_change_std": 0.3384298738092184, + "reward_std": 0.6588433757424355, + "rewards/cosine_scaled_reward": 0.0015015807002782822, + "rewards/format_reward": 0.8125000111758709, + "step": 495 + }, + { + "advantage_max": 1.452590487897396, + "advantage_mean": -4.8428774435116395e-08, + "advantage_min": -0.6632073484361172, + "advantage_std": 0.7584183663129807, + "completion_length": 2523.5833587646484, + "epoch": 0.5668571428571428, + "grad_norm": 0.47810283303260803, + "kl": 0.3327484130859375, + "lambda_div_used": 0.6, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0208, + "reward": 0.4266379442997277, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4266379442997277, + "reward_after_std": 0.7584183737635612, + "reward_before_mean": 1.0474421493709087, + "reward_before_std": 0.599539153277874, + "reward_change_max": 3.6485493183135986e-05, + "reward_change_mean": -0.6208042334765196, + "reward_change_min": -0.9354967325925827, + "reward_change_std": 0.35683672688901424, + "reward_std": 0.7584183923900127, + "rewards/cosine_scaled_reward": 0.13830439560115337, + "rewards/format_reward": 0.7708333414047956, + "step": 496 + }, + { + "advantage_max": 1.416138507425785, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -0.8415560573339462, + "advantage_std": 0.7852711267769337, + "completion_length": 2443.604232788086, + "epoch": 0.568, + "grad_norm": 0.6396067142486572, + "kl": 0.198028564453125, + "lambda_div_used": 0.6, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0316, + "reward": 0.5844808593392372, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5844808593392372, + "reward_after_std": 0.7852711230516434, + "reward_before_mean": 1.298376940190792, + "reward_before_std": 0.6900318209081888, + "reward_change_max": 0.0, + "reward_change_mean": -0.7138960808515549, + "reward_change_min": -1.110473420470953, + "reward_change_std": 0.4316666442900896, + "reward_std": 0.7852711528539658, + "rewards/cosine_scaled_reward": 0.19085513520985842, + "rewards/format_reward": 0.9166666716337204, + "step": 497 + }, + { + "advantage_max": 1.8146474957466125, + "advantage_mean": -2.1109979486677588e-08, + "advantage_min": -0.8280821889638901, + "advantage_std": 0.962591864168644, + "completion_length": 2807.1459350585938, + "epoch": 0.5691428571428572, + "grad_norm": 0.9842413663864136, + "kl": 0.35626220703125, + "lambda_div_used": 0.6, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0414, + "reward": 0.35881377733312547, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35881377733312547, + "reward_after_std": 0.9625918716192245, + "reward_before_mean": 0.8991924710571766, + "reward_before_std": 0.8837512731552124, + "reward_change_max": 0.0004179328680038452, + "reward_change_mean": -0.5403786823153496, + "reward_change_min": -0.9675283432006836, + "reward_change_std": 0.35297749750316143, + "reward_std": 0.9625919163227081, + "rewards/cosine_scaled_reward": 0.012096216436475515, + "rewards/format_reward": 0.8750000223517418, + "step": 498 + }, + { + "advantage_max": 1.6346061080694199, + "advantage_mean": -1.6142925329809543e-08, + "advantage_min": -0.9418043605983257, + "advantage_std": 0.9355646707117558, + "completion_length": 2936.437545776367, + "epoch": 0.5702857142857143, + "grad_norm": 0.9127795100212097, + "kl": 0.388641357421875, + "lambda_div_used": 0.6, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0482, + "reward": 0.32707899808883667, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.32707899808883667, + "reward_after_std": 0.935564685612917, + "reward_before_mean": 0.8720442028716207, + "reward_before_std": 0.963412918150425, + "reward_change_max": 0.0, + "reward_change_mean": -0.5449651964008808, + "reward_change_min": -1.0156399756669998, + "reward_change_std": 0.4170589130371809, + "reward_std": 0.9355646967887878, + "rewards/cosine_scaled_reward": 0.05060542202409124, + "rewards/format_reward": 0.7708333469927311, + "step": 499 + }, + { + "advantage_max": 1.487747348845005, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.7583122663199902, + "advantage_std": 0.8185562938451767, + "completion_length": 3081.4375610351562, + "epoch": 0.5714285714285714, + "grad_norm": 0.8269075155258179, + "kl": 0.37713623046875, + "lambda_div_used": 0.6, + "learning_rate": 1e-07, + "loss": 0.0488, + "reward": 0.1662901910021901, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1662901910021901, + "reward_after_std": 0.8185563161969185, + "reward_before_mean": 0.635827727150172, + "reward_before_std": 0.7835097871720791, + "reward_change_max": 0.0, + "reward_change_mean": -0.46953752264380455, + "reward_change_min": -0.888320729136467, + "reward_change_std": 0.33614025823771954, + "reward_std": 0.81855633482337, + "rewards/cosine_scaled_reward": -0.025836152024567127, + "rewards/format_reward": 0.6875000204890966, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0057369744749739765, + "train_runtime": 18708.5051, + "train_samples_per_second": 1.283, + "train_steps_per_second": 0.027 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}