diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.5351008400321007, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.7533128149807453, + "advantage_std": 0.8219119422137737, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.11749051511287689, + "kl": 0.0, + "lambda_div_used": 0.5, + "learning_rate": 2e-08, + "loss": 0.0601, + "reward": -0.03908593417145312, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03908593417145312, + "reward_after_std": 0.8219119422137737, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.0007017925381660461, + "reward_change_mean": -0.5288506411015987, + "reward_change_min": -1.0365500748157501, + "reward_change_std": 0.4204680975526571, + "reward_std": 0.8219119869172573, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 0.9172319918870926, + "advantage_mean": 1.8626452047421083e-08, + "advantage_min": -0.43226177990436554, + "advantage_std": 0.4922399129718542, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.06315363943576813, + "kl": 0.0, + "lambda_div_used": 0.5, + "learning_rate": 4e-08, + "loss": 0.0237, + "reward": -0.21404163353145123, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21404163353145123, + "reward_after_std": 0.4922399129718542, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.001632794737815857, + "reward_change_mean": -0.48943919129669666, + "reward_change_min": -0.7970554456114769, + "reward_change_std": 0.3251637788489461, + "reward_std": 0.4922399166971445, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 0.9185556918382645, + "advantage_mean": 1.8626452102932234e-08, + "advantage_min": -0.4186497926712036, + "advantage_std": 0.49275562539696693, + "completion_length": 3346.6458740234375, + "epoch": 0.0034285714285714284, + "grad_norm": 0.08308498561382294, + "kl": 4.464387893676758e-05, + "lambda_div_used": 0.5, + "learning_rate": 6e-08, + "loss": 0.0288, + "reward": -0.4993674159049988, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4993674159049988, + "reward_after_std": 0.49275562167167664, + "reward_before_mean": -0.2508781077340245, + "reward_before_std": 0.5146373584866524, + "reward_change_max": 0.0006987899541854858, + "reward_change_mean": -0.24848931096494198, + "reward_change_min": -0.5916367769241333, + "reward_change_std": 0.23015559278428555, + "reward_std": 0.49275562912225723, + "rewards/cosine_scaled_reward": -0.18793905060738325, + "rewards/format_reward": 0.1250000037252903, + "step": 3 + }, + { + "advantage_max": 1.6362370401620865, + "advantage_mean": 2.4835269951672956e-09, + "advantage_min": -0.692343682050705, + "advantage_std": 0.8636160232126713, + "completion_length": 2088.5208587646484, + "epoch": 0.004571428571428572, + "grad_norm": 0.14335113763809204, + "kl": 4.871189594268799e-05, + "lambda_div_used": 0.5, + "learning_rate": 8e-08, + "loss": 0.05, + "reward": 0.0002058250829577446, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0002058250829577446, + "reward_after_std": 0.8636160232126713, + "reward_before_mean": 0.5427898876369, + "reward_before_std": 0.8250438794493675, + "reward_change_max": 0.0, + "reward_change_mean": -0.5425840523093939, + "reward_change_min": -1.0156159922480583, + "reward_change_std": 0.4061375632882118, + "reward_std": 0.8636160306632519, + "rewards/cosine_scaled_reward": -0.061938409227877855, + "rewards/format_reward": 0.6666666753590107, + "step": 4 + }, + { + "advantage_max": 1.400401022285223, + "advantage_mean": -1.4280279680978225e-08, + "advantage_min": -0.5432489290833473, + "advantage_std": 0.726759284734726, + "completion_length": 3397.4583435058594, + "epoch": 0.005714285714285714, + "grad_norm": 0.1702452301979065, + "kl": 4.617869853973389e-05, + "lambda_div_used": 0.5, + "learning_rate": 1e-07, + "loss": 0.0404, + "reward": -0.4024368515238166, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4024368515238166, + "reward_after_std": 0.7267592810094357, + "reward_before_mean": -0.15027659479528666, + "reward_before_std": 0.735253743827343, + "reward_change_max": 0.0012489557266235352, + "reward_change_mean": -0.25216028839349747, + "reward_change_min": -0.6197184585034847, + "reward_change_std": 0.25399384275078773, + "reward_std": 0.7267593070864677, + "rewards/cosine_scaled_reward": -0.179304969497025, + "rewards/format_reward": 0.2083333358168602, + "step": 5 + }, + { + "advantage_max": 1.512929029762745, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.4930929020047188, + "advantage_std": 0.7648793570697308, + "completion_length": 3090.3333587646484, + "epoch": 0.006857142857142857, + "grad_norm": 0.14592202007770538, + "kl": 4.194676876068115e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.2e-07, + "loss": 0.0437, + "reward": -0.35319698275998235, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35319698275998235, + "reward_after_std": 0.7648793533444405, + "reward_before_mean": -0.08074576873332262, + "reward_before_std": 0.7240379452705383, + "reward_change_max": 0.0034214183688163757, + "reward_change_mean": -0.27245120890438557, + "reward_change_min": -0.5420073866844177, + "reward_change_std": 0.21257336740382016, + "reward_std": 0.7648793831467628, + "rewards/cosine_scaled_reward": -0.17578955832868814, + "rewards/format_reward": 0.2708333358168602, + "step": 6 + }, + { + "advantage_max": 1.2516018003225327, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.6208689622581005, + "advantage_std": 0.6690351134166121, + "completion_length": 3066.7708587646484, + "epoch": 0.008, + "grad_norm": 0.0987529531121254, + "kl": 2.1673738956451416e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.4e-07, + "loss": 0.0107, + "reward": -0.22521874122321606, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22521874122321606, + "reward_after_std": 0.6690351143479347, + "reward_before_mean": 0.20237011834979057, + "reward_before_std": 0.6675894744694233, + "reward_change_max": 0.00040956586599349976, + "reward_change_mean": -0.42758889915421605, + "reward_change_min": -0.8027475290000439, + "reward_change_std": 0.3406977616250515, + "reward_std": 0.6690351590514183, + "rewards/cosine_scaled_reward": -0.18006493523716927, + "rewards/format_reward": 0.5625000055879354, + "step": 7 + }, + { + "advantage_max": 1.648353137075901, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.6211307123303413, + "advantage_std": 0.8522039614617825, + "completion_length": 2743.3333892822266, + "epoch": 0.009142857142857144, + "grad_norm": 0.13271355628967285, + "kl": 3.2164156436920166e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.6e-07, + "loss": 0.0114, + "reward": 0.09091248735785484, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09091248735785484, + "reward_after_std": 0.8522039651870728, + "reward_before_mean": 0.7095784271750745, + "reward_before_std": 0.7119280910119414, + "reward_change_max": 0.0019819363951683044, + "reward_change_mean": -0.6186658814549446, + "reward_change_min": -1.01032629981637, + "reward_change_std": 0.40532769449055195, + "reward_std": 0.8522040024399757, + "rewards/cosine_scaled_reward": 0.11520583834499121, + "rewards/format_reward": 0.47916666977107525, + "step": 8 + }, + { + "advantage_max": 1.3571566194295883, + "advantage_mean": -7.450580541412677e-09, + "advantage_min": -0.6235329136252403, + "advantage_std": 0.7175218462944031, + "completion_length": 3166.604217529297, + "epoch": 0.010285714285714285, + "grad_norm": 0.1199144497513771, + "kl": 4.5552849769592285e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.8e-07, + "loss": 0.0728, + "reward": -0.2478786800056696, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2478786800056696, + "reward_after_std": 0.7175218313932419, + "reward_before_mean": 0.13642565836198628, + "reward_before_std": 0.7206410467624664, + "reward_change_max": 0.0011469870805740356, + "reward_change_mean": -0.3843043278902769, + "reward_change_min": -0.7558578066527843, + "reward_change_std": 0.31058289762586355, + "reward_std": 0.7175218351185322, + "rewards/cosine_scaled_reward": -0.07762051094323397, + "rewards/format_reward": 0.2916666753590107, + "step": 9 + }, + { + "advantage_max": 0.8546838238835335, + "advantage_mean": 5.587935669737476e-09, + "advantage_min": -0.44723255559802055, + "advantage_std": 0.4613385181874037, + "completion_length": 2646.687515258789, + "epoch": 0.011428571428571429, + "grad_norm": 0.05081493407487869, + "kl": 2.671964466571808e-05, + "lambda_div_used": 0.5, + "learning_rate": 2e-07, + "loss": 0.0109, + "reward": -0.3753599179908633, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3753599179908633, + "reward_after_std": 0.46133850887417793, + "reward_before_mean": -0.012189794331789017, + "reward_before_std": 0.4536908529698849, + "reward_change_max": 0.0008832141757011414, + "reward_change_mean": -0.36317012226209044, + "reward_change_min": -0.6532888412475586, + "reward_change_std": 0.2652512276545167, + "reward_std": 0.46133851259946823, + "rewards/cosine_scaled_reward": -0.1935949008911848, + "rewards/format_reward": 0.37500000558793545, + "step": 10 + }, + { + "advantage_max": 1.4129405990242958, + "advantage_mean": 1.3659398279131096e-08, + "advantage_min": -0.5573535785079002, + "advantage_std": 0.7481464147567749, + "completion_length": 3410.3541870117188, + "epoch": 0.012571428571428572, + "grad_norm": 0.14011387526988983, + "kl": 3.7103891372680664e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0477, + "reward": -0.3787131551653147, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3787131551653147, + "reward_after_std": 0.7481464073061943, + "reward_before_mean": -0.11150018568150699, + "reward_before_std": 0.7933910526335239, + "reward_change_max": 0.0027255788445472717, + "reward_change_mean": -0.26721295714378357, + "reward_change_min": -0.7168385572731495, + "reward_change_std": 0.2900985237210989, + "reward_std": 0.7481464371085167, + "rewards/cosine_scaled_reward": -0.17033343017101288, + "rewards/format_reward": 0.2291666716337204, + "step": 11 + }, + { + "advantage_max": 1.2863394618034363, + "advantage_mean": -1.2417633032946185e-09, + "advantage_min": -0.4983537904918194, + "advantage_std": 0.6565282978117466, + "completion_length": 2601.0834045410156, + "epoch": 0.013714285714285714, + "grad_norm": 0.08084140717983246, + "kl": 3.930681850761175e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.4e-07, + "loss": 0.014, + "reward": -0.05203055217862129, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05203055217862129, + "reward_after_std": 0.6565283127129078, + "reward_before_mean": 0.5094301174394786, + "reward_before_std": 0.5098936408758163, + "reward_change_max": 0.0, + "reward_change_mean": -0.561460705474019, + "reward_change_min": -0.8758684396743774, + "reward_change_std": 0.3374221920967102, + "reward_std": 0.656528327614069, + "rewards/cosine_scaled_reward": -0.07861827686429024, + "rewards/format_reward": 0.6666666716337204, + "step": 12 + }, + { + "advantage_max": 1.375102460384369, + "advantage_mean": 1.5522043039783995e-08, + "advantage_min": -0.6102774068713188, + "advantage_std": 0.7467358633875847, + "completion_length": 3022.5833587646484, + "epoch": 0.014857142857142857, + "grad_norm": 0.14866332709789276, + "kl": 3.300607204437256e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.6e-07, + "loss": 0.0596, + "reward": -0.22573862690478563, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22573862690478563, + "reward_after_std": 0.7467358633875847, + "reward_before_mean": 0.17446773871779442, + "reward_before_std": 0.7973835095763206, + "reward_change_max": 0.0012500211596488953, + "reward_change_mean": -0.40020634327083826, + "reward_change_min": -0.9575489908456802, + "reward_change_std": 0.3800930418074131, + "reward_std": 0.746735867112875, + "rewards/cosine_scaled_reward": -0.06901614367961884, + "rewards/format_reward": 0.3125000037252903, + "step": 13 + }, + { + "advantage_max": 1.6066002435982227, + "advantage_mean": 6.829698695476338e-09, + "advantage_min": -0.5994165241718292, + "advantage_std": 0.8288106508553028, + "completion_length": 2782.5833892822266, + "epoch": 0.016, + "grad_norm": 0.17648428678512573, + "kl": 3.495439887046814e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.8e-07, + "loss": 0.0814, + "reward": -0.2345301266759634, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2345301266759634, + "reward_after_std": 0.8288106359541416, + "reward_before_mean": 0.12168530002236366, + "reward_before_std": 0.7971222475171089, + "reward_change_max": 0.0022324323654174805, + "reward_change_mean": -0.35621544159948826, + "reward_change_min": -0.7587636262178421, + "reward_change_std": 0.3039133660495281, + "reward_std": 0.828810652717948, + "rewards/cosine_scaled_reward": -0.1266573565080762, + "rewards/format_reward": 0.37500000558793545, + "step": 14 + }, + { + "advantage_max": 1.2302554100751877, + "advantage_mean": 1.6763807453301638e-08, + "advantage_min": -0.4796036444604397, + "advantage_std": 0.6268669404089451, + "completion_length": 2698.1041984558105, + "epoch": 0.017142857142857144, + "grad_norm": 0.08298429101705551, + "kl": 2.790987491607666e-05, + "lambda_div_used": 0.5, + "learning_rate": 3e-07, + "loss": 0.0127, + "reward": -0.13335783407092094, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13335783407092094, + "reward_after_std": 0.6268669404089451, + "reward_before_mean": 0.37199926376342773, + "reward_before_std": 0.4883376806974411, + "reward_change_max": 0.0012028142809867859, + "reward_change_mean": -0.5053570671007037, + "reward_change_min": -0.7833587303757668, + "reward_change_std": 0.3134935852140188, + "reward_std": 0.6268669553101063, + "rewards/cosine_scaled_reward": -0.022333701490424573, + "rewards/format_reward": 0.4166666716337204, + "step": 15 + }, + { + "advantage_max": 0.58426259085536, + "advantage_mean": 2.297262391426358e-08, + "advantage_min": -0.2842987850308418, + "advantage_std": 0.31660328805446625, + "completion_length": 3563.5625, + "epoch": 0.018285714285714287, + "grad_norm": 0.0470166839659214, + "kl": 3.844499588012695e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.2e-07, + "loss": 0.0043, + "reward": -0.6709575429558754, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.6709575429558754, + "reward_after_std": 0.3166032861918211, + "reward_before_mean": -0.5079018995165825, + "reward_before_std": 0.33846721425652504, + "reward_change_max": 0.0015065670013427734, + "reward_change_mean": -0.1630556397140026, + "reward_change_min": -0.36466383934020996, + "reward_change_std": 0.15320970956236124, + "reward_std": 0.31660328805446625, + "rewards/cosine_scaled_reward": -0.26436761766672134, + "rewards/format_reward": 0.02083333395421505, + "step": 16 + }, + { + "advantage_max": 1.545443370938301, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.5599127858877182, + "advantage_std": 0.8097105287015438, + "completion_length": 2066.9375534057617, + "epoch": 0.019428571428571427, + "grad_norm": 0.1167609766125679, + "kl": 2.705492079257965e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0423, + "reward": -0.019218791276216507, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.019218791276216507, + "reward_after_std": 0.8097105361521244, + "reward_before_mean": 0.5264171995222569, + "reward_before_std": 0.7449488136917353, + "reward_change_max": 0.0, + "reward_change_mean": -0.5456360150128603, + "reward_change_min": -1.1235605031251907, + "reward_change_std": 0.39495558850467205, + "reward_std": 0.8097105547785759, + "rewards/cosine_scaled_reward": -0.03887474234215915, + "rewards/format_reward": 0.6041666697710752, + "step": 17 + }, + { + "advantage_max": 1.6779102236032486, + "advantage_mean": 9.93410786964688e-09, + "advantage_min": -0.6428487151861191, + "advantage_std": 0.862780749797821, + "completion_length": 3074.3958892822266, + "epoch": 0.02057142857142857, + "grad_norm": 0.17488229274749756, + "kl": 2.652546390891075e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.6e-07, + "loss": 0.0487, + "reward": -0.19576671486720443, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19576671486720443, + "reward_after_std": 0.8627807684242725, + "reward_before_mean": 0.18335069436579943, + "reward_before_std": 0.8314383886754513, + "reward_change_max": 0.0014890357851982117, + "reward_change_mean": -0.3791173882782459, + "reward_change_min": -0.7256179824471474, + "reward_change_std": 0.28984588757157326, + "reward_std": 0.8627808056771755, + "rewards/cosine_scaled_reward": -0.08540799282491207, + "rewards/format_reward": 0.35416667349636555, + "step": 18 + }, + { + "advantage_max": 2.1795709654688835, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.8414489030838013, + "advantage_std": 1.1247562803328037, + "completion_length": 3021.4583892822266, + "epoch": 0.021714285714285714, + "grad_norm": 0.20591142773628235, + "kl": 2.354755997657776e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0303, + "reward": 0.12146518751978874, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12146518751978874, + "reward_after_std": 1.124756295233965, + "reward_before_mean": 0.6802486801752821, + "reward_before_std": 1.0683305263519287, + "reward_change_max": 0.0007704496383666992, + "reward_change_mean": -0.5587835060432553, + "reward_change_min": -1.0479483902454376, + "reward_change_std": 0.41728440672159195, + "reward_std": 1.1247563175857067, + "rewards/cosine_scaled_reward": 0.13179101014975458, + "rewards/format_reward": 0.41666667722165585, + "step": 19 + }, + { + "advantage_max": 1.4769879020750523, + "advantage_mean": -1.8626453157644107e-09, + "advantage_min": -0.6367698088288307, + "advantage_std": 0.7875895667821169, + "completion_length": 2265.1666946411133, + "epoch": 0.022857142857142857, + "grad_norm": 0.12089281529188156, + "kl": 1.477077603340149e-05, + "lambda_div_used": 0.5, + "learning_rate": 4e-07, + "loss": 0.0535, + "reward": 0.04904268682003021, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04904268682003021, + "reward_after_std": 0.787589592859149, + "reward_before_mean": 0.658054169267416, + "reward_before_std": 0.7119925869628787, + "reward_change_max": 0.0, + "reward_change_mean": -0.6090115122497082, + "reward_change_min": -1.0665437504649162, + "reward_change_std": 0.43563584610819817, + "reward_std": 0.7875896263867617, + "rewards/cosine_scaled_reward": -0.025139580480754375, + "rewards/format_reward": 0.7083333432674408, + "step": 20 + }, + { + "advantage_max": 1.2946437485516071, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.6569844149053097, + "advantage_std": 0.689958855509758, + "completion_length": 2700.2916870117188, + "epoch": 0.024, + "grad_norm": 0.0916251540184021, + "kl": 3.0294060707092285e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0295, + "reward": -0.12843544664792717, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12843544664792717, + "reward_after_std": 0.6899588741362095, + "reward_before_mean": 0.3676719907671213, + "reward_before_std": 0.673046289011836, + "reward_change_max": 0.0006017163395881653, + "reward_change_mean": -0.49610744789242744, + "reward_change_min": -0.898295234888792, + "reward_change_std": 0.35666080191731453, + "reward_std": 0.6899588778614998, + "rewards/cosine_scaled_reward": -0.024497329257428646, + "rewards/format_reward": 0.4166666716337204, + "step": 21 + }, + { + "advantage_max": 1.4664242267608643, + "advantage_mean": -2.6077032533322608e-08, + "advantage_min": -0.6705172508955002, + "advantage_std": 0.7772407494485378, + "completion_length": 1715.1666870117188, + "epoch": 0.025142857142857144, + "grad_norm": 0.09739455580711365, + "kl": 1.611793413758278e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.3999999999999997e-07, + "loss": -0.0015, + "reward": 0.16622705105692148, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16622705105692148, + "reward_after_std": 0.7772407718002796, + "reward_before_mean": 0.8788988022133708, + "reward_before_std": 0.6709907222539186, + "reward_change_max": 0.0, + "reward_change_mean": -0.7126717567443848, + "reward_change_min": -1.252291426062584, + "reward_change_std": 0.47464586794376373, + "reward_std": 0.7772407941520214, + "rewards/cosine_scaled_reward": 0.054032716900110245, + "rewards/format_reward": 0.7708333395421505, + "step": 22 + }, + { + "advantage_max": 1.510243035852909, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.6073393523693085, + "advantage_std": 0.7824924364686012, + "completion_length": 2540.9166870117188, + "epoch": 0.026285714285714287, + "grad_norm": 0.09775416553020477, + "kl": 2.3233238607645035e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.6e-07, + "loss": 0.0514, + "reward": -0.19087476283311844, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19087476283311844, + "reward_after_std": 0.7824924141168594, + "reward_before_mean": 0.21587533503770828, + "reward_before_std": 0.7406318411231041, + "reward_change_max": 0.0006575360894203186, + "reward_change_mean": -0.4067500773817301, + "reward_change_min": -0.8022553063929081, + "reward_change_std": 0.3064673077315092, + "reward_std": 0.782492458820343, + "rewards/cosine_scaled_reward": -0.12122901016846299, + "rewards/format_reward": 0.4583333395421505, + "step": 23 + }, + { + "advantage_max": 1.8014312759041786, + "advantage_mean": 4.967053990334591e-09, + "advantage_min": -0.9172608628869057, + "advantage_std": 0.9784793332219124, + "completion_length": 2844.6459045410156, + "epoch": 0.027428571428571427, + "grad_norm": 0.15517903864383698, + "kl": 2.2795749828219414e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.8e-07, + "loss": 0.0567, + "reward": 0.015449069440364838, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.015449069440364838, + "reward_after_std": 0.9784793667495251, + "reward_before_mean": 0.5478918794542551, + "reward_before_std": 1.0509838238358498, + "reward_change_max": 0.0003214925527572632, + "reward_change_mean": -0.5324427876621485, + "reward_change_min": -1.174141988158226, + "reward_change_std": 0.4900853671133518, + "reward_std": 0.9784793853759766, + "rewards/cosine_scaled_reward": 0.03436260763555765, + "rewards/format_reward": 0.4791666828095913, + "step": 24 + }, + { + "advantage_max": 1.2840170040726662, + "advantage_mean": 5.587935614226325e-09, + "advantage_min": -0.6295462027192116, + "advantage_std": 0.680108830332756, + "completion_length": 2679.562530517578, + "epoch": 0.02857142857142857, + "grad_norm": 0.09606768935918808, + "kl": 2.7861446142196655e-05, + "lambda_div_used": 0.5, + "learning_rate": 5e-07, + "loss": -0.0088, + "reward": -0.2081345096230507, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2081345096230507, + "reward_after_std": 0.6801088340580463, + "reward_before_mean": 0.21977244596928358, + "reward_before_std": 0.6630591489374638, + "reward_change_max": 0.000768907368183136, + "reward_change_mean": -0.42790696490556, + "reward_change_min": -0.7866359949111938, + "reward_change_std": 0.3270989526063204, + "reward_std": 0.6801088377833366, + "rewards/cosine_scaled_reward": -0.10886377561837435, + "rewards/format_reward": 0.43750001303851604, + "step": 25 + }, + { + "advantage_max": 1.1658898368477821, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -0.5475185066461563, + "advantage_std": 0.6162115931510925, + "completion_length": 2960.6458740234375, + "epoch": 0.029714285714285714, + "grad_norm": 0.07693036645650864, + "kl": 2.6431865990161896e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.2e-07, + "loss": 0.0079, + "reward": -0.09086161851882935, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09086161851882935, + "reward_after_std": 0.6162116080522537, + "reward_before_mean": 0.4600065350532532, + "reward_before_std": 0.5263635087758303, + "reward_change_max": 0.0, + "reward_change_mean": -0.5508681740611792, + "reward_change_min": -0.8693654052913189, + "reward_change_std": 0.3519774377346039, + "reward_std": 0.6162116266787052, + "rewards/cosine_scaled_reward": 0.011253247037529945, + "rewards/format_reward": 0.4375000074505806, + "step": 26 + }, + { + "advantage_max": 1.5078487060964108, + "advantage_mean": 1.3814618671226242e-08, + "advantage_min": -0.6412480399012566, + "advantage_std": 0.8038202319294214, + "completion_length": 2972.000030517578, + "epoch": 0.030857142857142857, + "grad_norm": 0.13779900968074799, + "kl": 1.5037134289741516e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.4e-07, + "loss": 0.0427, + "reward": -0.1060659121721983, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1060659121721983, + "reward_after_std": 0.8038202319294214, + "reward_before_mean": 0.37545553129166365, + "reward_before_std": 0.8012076523154974, + "reward_change_max": 0.0, + "reward_change_mean": -0.4815214276313782, + "reward_change_min": -0.9643566869199276, + "reward_change_std": 0.3837998528033495, + "reward_std": 0.803820263594389, + "rewards/cosine_scaled_reward": -0.04143891017884016, + "rewards/format_reward": 0.45833334140479565, + "step": 27 + }, + { + "advantage_max": 1.7663781940937042, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.7497513294219971, + "advantage_std": 0.9309034496545792, + "completion_length": 2767.375030517578, + "epoch": 0.032, + "grad_norm": 0.11623068898916245, + "kl": 2.4037901312112808e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.6e-07, + "loss": 0.0502, + "reward": -0.036286541260778904, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.036286541260778904, + "reward_after_std": 0.930903472006321, + "reward_before_mean": 0.46089379489421844, + "reward_before_std": 0.9374263137578964, + "reward_change_max": 0.0007840320467948914, + "reward_change_mean": -0.4971803342923522, + "reward_change_min": -1.002158623188734, + "reward_change_std": 0.40878670290112495, + "reward_std": 0.9309034869074821, + "rewards/cosine_scaled_reward": -0.009136438369750977, + "rewards/format_reward": 0.47916667349636555, + "step": 28 + }, + { + "advantage_max": 1.0969897732138634, + "advantage_mean": 1.6142925329809543e-08, + "advantage_min": -0.45380792766809464, + "advantage_std": 0.5798213481903076, + "completion_length": 3435.354217529297, + "epoch": 0.03314285714285714, + "grad_norm": 0.11971724778413773, + "kl": 1.8674880266189575e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.8e-07, + "loss": 0.0462, + "reward": -0.4915079523343593, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4915079523343593, + "reward_after_std": 0.5798213705420494, + "reward_before_mean": -0.26571009401232004, + "reward_before_std": 0.6099032871425152, + "reward_change_max": 0.0003291517496109009, + "reward_change_mean": -0.2257978618144989, + "reward_change_min": -0.5952579416334629, + "reward_change_std": 0.234037593472749, + "reward_std": 0.57982137799263, + "rewards/cosine_scaled_reward": -0.2161883795633912, + "rewards/format_reward": 0.1666666716337204, + "step": 29 + }, + { + "advantage_max": 2.382244497537613, + "advantage_mean": -6.208815683805824e-10, + "advantage_min": -0.8910982012748718, + "advantage_std": 1.2278216630220413, + "completion_length": 3001.479232788086, + "epoch": 0.03428571428571429, + "grad_norm": 0.21962173283100128, + "kl": 1.8961261957883835e-05, + "lambda_div_used": 0.5, + "learning_rate": 6e-07, + "loss": 0.0829, + "reward": -0.011109492421383038, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.011109492421383038, + "reward_after_std": 1.2278216481208801, + "reward_before_mean": 0.4033150505274534, + "reward_before_std": 1.2304754592478275, + "reward_change_max": 0.00011484324932098389, + "reward_change_mean": -0.4144245618954301, + "reward_change_min": -0.9832869358360767, + "reward_change_std": 0.39574938639998436, + "reward_std": 1.2278216779232025, + "rewards/cosine_scaled_reward": -0.017092485912144184, + "rewards/format_reward": 0.4375000111758709, + "step": 30 + }, + { + "advantage_max": 1.7745858430862427, + "advantage_mean": 1.8005570368018198e-08, + "advantage_min": -0.7045384347438812, + "advantage_std": 0.9255342036485672, + "completion_length": 2991.625045776367, + "epoch": 0.03542857142857143, + "grad_norm": 0.1190023124217987, + "kl": 1.7091631889343262e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.2e-07, + "loss": 0.0341, + "reward": -0.016999364597722888, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.016999364597722888, + "reward_after_std": 0.925534226000309, + "reward_before_mean": 0.4908587606623769, + "reward_before_std": 0.8883664254099131, + "reward_change_max": 0.00013752281665802002, + "reward_change_mean": -0.5078581348061562, + "reward_change_min": -1.0069617629051208, + "reward_change_std": 0.3849845137447119, + "reward_std": 0.9255342334508896, + "rewards/cosine_scaled_reward": 0.037096042186021805, + "rewards/format_reward": 0.41666666977107525, + "step": 31 + }, + { + "advantage_max": 1.1209936514496803, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.5845097005367279, + "advantage_std": 0.6096477992832661, + "completion_length": 3136.5833435058594, + "epoch": 0.036571428571428574, + "grad_norm": 0.10291552543640137, + "kl": 3.565289080142975e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.4e-07, + "loss": 0.0208, + "reward": -0.18031363934278488, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18031363934278488, + "reward_after_std": 0.6096478067338467, + "reward_before_mean": 0.30608561262488365, + "reward_before_std": 0.6181067805737257, + "reward_change_max": 0.0007964596152305603, + "reward_change_mean": -0.48639929108321667, + "reward_change_min": -0.8966321311891079, + "reward_change_std": 0.355242476798594, + "reward_std": 0.609647810459137, + "rewards/cosine_scaled_reward": -0.024040542542934418, + "rewards/format_reward": 0.3541666716337204, + "step": 32 + }, + { + "advantage_max": 1.741414237767458, + "advantage_mean": 4.967053657267684e-09, + "advantage_min": -0.6895862258970737, + "advantage_std": 0.9135888814926147, + "completion_length": 3273.8750610351562, + "epoch": 0.037714285714285714, + "grad_norm": 0.1362524926662445, + "kl": 2.4221837520599365e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.6e-07, + "loss": 0.0346, + "reward": -0.21659856289625168, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21659856289625168, + "reward_after_std": 0.913588859140873, + "reward_before_mean": 0.13308776635676622, + "reward_before_std": 0.9547240622341633, + "reward_change_max": 0.0021802037954330444, + "reward_change_mean": -0.34968633856624365, + "reward_change_min": -0.6969276033341885, + "reward_change_std": 0.3053355095908046, + "reward_std": 0.9135888814926147, + "rewards/cosine_scaled_reward": -0.0897061238065362, + "rewards/format_reward": 0.31250000931322575, + "step": 33 + }, + { + "advantage_max": 1.7108439281582832, + "advantage_mean": 3.725290353973065e-09, + "advantage_min": -0.7600312046706676, + "advantage_std": 0.8962564840912819, + "completion_length": 2601.0208892822266, + "epoch": 0.038857142857142854, + "grad_norm": 0.1575985848903656, + "kl": 9.97595489025116e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0179, + "reward": 0.06414370238780975, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06414370238780975, + "reward_after_std": 0.8962565250694752, + "reward_before_mean": 0.649968970566988, + "reward_before_std": 0.8475272879004478, + "reward_change_max": 0.0, + "reward_change_mean": -0.5858252439647913, + "reward_change_min": -1.081942304968834, + "reward_change_std": 0.4181617796421051, + "reward_std": 0.8962565287947655, + "rewards/cosine_scaled_reward": 0.07498448248952627, + "rewards/format_reward": 0.5000000111758709, + "step": 34 + }, + { + "advantage_max": 1.5702840462327003, + "advantage_mean": 2.545615118698663e-08, + "advantage_min": -0.6366428211331367, + "advantage_std": 0.8293478488922119, + "completion_length": 3007.437530517578, + "epoch": 0.04, + "grad_norm": 0.12913337349891663, + "kl": 4.92781400680542e-05, + "lambda_div_used": 0.5, + "learning_rate": 7e-07, + "loss": 0.0438, + "reward": -0.24781744490610436, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.24781744490610436, + "reward_after_std": 0.8293478675186634, + "reward_before_mean": 0.10291525349020958, + "reward_before_std": 0.8493822924792767, + "reward_change_max": 0.0025837719440460205, + "reward_change_mean": -0.350732677616179, + "reward_change_min": -0.8603302016854286, + "reward_change_std": 0.34615642856806517, + "reward_std": 0.8293478898704052, + "rewards/cosine_scaled_reward": -0.10479237232357264, + "rewards/format_reward": 0.3125000074505806, + "step": 35 + }, + { + "advantage_max": 0.7410223707556725, + "advantage_mean": 1.8626452158443385e-08, + "advantage_min": -0.31461289897561073, + "advantage_std": 0.39646704867482185, + "completion_length": 3507.7708435058594, + "epoch": 0.04114285714285714, + "grad_norm": 0.07298173755407333, + "kl": 6.047636270523071e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.2e-07, + "loss": 0.0098, + "reward": -0.6106727570295334, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.6106727570295334, + "reward_after_std": 0.3964670542627573, + "reward_before_mean": -0.42390722688287497, + "reward_before_std": 0.4142027962952852, + "reward_change_max": 0.0024718865752220154, + "reward_change_mean": -0.18676553736440837, + "reward_change_min": -0.42391372472047806, + "reward_change_std": 0.17731763934716582, + "reward_std": 0.39646706730127335, + "rewards/cosine_scaled_reward": -0.26403695228509605, + "rewards/format_reward": 0.1041666679084301, + "step": 36 + }, + { + "advantage_max": 0.8747889474034309, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.40808849036693573, + "advantage_std": 0.4734865203499794, + "completion_length": 3295.0833435058594, + "epoch": 0.04228571428571429, + "grad_norm": 0.06901846081018448, + "kl": 3.345310688018799e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.4e-07, + "loss": 0.0204, + "reward": -0.4728499799966812, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4728499799966812, + "reward_after_std": 0.4734865240752697, + "reward_before_mean": -0.1951841153204441, + "reward_before_std": 0.501446358859539, + "reward_change_max": 0.0024179965257644653, + "reward_change_mean": -0.2776658684015274, + "reward_change_min": -0.5783885680139065, + "reward_change_std": 0.24316345155239105, + "reward_std": 0.4734865352511406, + "rewards/cosine_scaled_reward": -0.2017587386071682, + "rewards/format_reward": 0.2083333358168602, + "step": 37 + }, + { + "advantage_max": 0.7613074332475662, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.42001959681510925, + "advantage_std": 0.4193967394530773, + "completion_length": 3289.9375, + "epoch": 0.04342857142857143, + "grad_norm": 0.05465118587017059, + "kl": 6.181374192237854e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0025, + "reward": -0.48476463556289673, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.48476463556289673, + "reward_after_std": 0.419396735727787, + "reward_before_mean": -0.19598917290568352, + "reward_before_std": 0.4489164873957634, + "reward_change_max": 0.0023155659437179565, + "reward_change_mean": -0.28877546079456806, + "reward_change_min": -0.5613481365144253, + "reward_change_std": 0.23745440039783716, + "reward_std": 0.4193967394530773, + "rewards/cosine_scaled_reward": -0.17091125436127186, + "rewards/format_reward": 0.14583333395421505, + "step": 38 + }, + { + "advantage_max": 1.2476415075361729, + "advantage_mean": 1.303851654421706e-08, + "advantage_min": -0.5395600944757462, + "advantage_std": 0.6443872451782227, + "completion_length": 2855.854217529297, + "epoch": 0.044571428571428574, + "grad_norm": 0.09057861566543579, + "kl": 0.00010902388021349907, + "lambda_div_used": 0.5, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0132, + "reward": -0.06797056319192052, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06797056319192052, + "reward_after_std": 0.6443872451782227, + "reward_before_mean": 0.4895414039492607, + "reward_before_std": 0.5221749180927873, + "reward_change_max": 0.0, + "reward_change_mean": -0.5575119545683265, + "reward_change_min": -0.824012566357851, + "reward_change_std": 0.33742869179695845, + "reward_std": 0.6443872600793839, + "rewards/cosine_scaled_reward": -0.00522929901489988, + "rewards/format_reward": 0.5000000111758709, + "step": 39 + }, + { + "advantage_max": 1.4988975450396538, + "advantage_mean": -4.967053435223079e-09, + "advantage_min": -0.6433758027851582, + "advantage_std": 0.7838446795940399, + "completion_length": 2304.4167098999023, + "epoch": 0.045714285714285714, + "grad_norm": 0.10686811059713364, + "kl": 0.0002674385905265808, + "lambda_div_used": 0.5, + "learning_rate": 8e-07, + "loss": 0.0178, + "reward": -0.020974524319171906, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.020974524319171906, + "reward_after_std": 0.7838446721434593, + "reward_before_mean": 0.5286793448030949, + "reward_before_std": 0.7248776257038116, + "reward_change_max": 0.0, + "reward_change_mean": -0.5496538653969765, + "reward_change_min": -0.9812478795647621, + "reward_change_std": 0.3794911988079548, + "reward_std": 0.7838447019457817, + "rewards/cosine_scaled_reward": -0.058577004820108414, + "rewards/format_reward": 0.645833345130086, + "step": 40 + }, + { + "advantage_max": 1.211952231824398, + "advantage_mean": 9.934107980669182e-09, + "advantage_min": -0.6004071980714798, + "advantage_std": 0.6527913548052311, + "completion_length": 2990.8958740234375, + "epoch": 0.046857142857142854, + "grad_norm": 0.11691441386938095, + "kl": 8.746236562728882e-05, + "lambda_div_used": 0.5, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0022, + "reward": -0.2903154147788882, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2903154147788882, + "reward_after_std": 0.6527913585305214, + "reward_before_mean": 0.0847369134426117, + "reward_before_std": 0.6745379194617271, + "reward_change_max": 0.0022248774766921997, + "reward_change_mean": -0.3750523333437741, + "reward_change_min": -0.7458742596209049, + "reward_change_std": 0.31590295769274235, + "reward_std": 0.6527913697063923, + "rewards/cosine_scaled_reward": -0.16596487676724792, + "rewards/format_reward": 0.4166666753590107, + "step": 41 + }, + { + "advantage_max": 0.7290397174656391, + "advantage_mean": 1.6142925440831846e-08, + "advantage_min": -0.30367110669612885, + "advantage_std": 0.38293253630399704, + "completion_length": 2846.270839691162, + "epoch": 0.048, + "grad_norm": 0.0523265041410923, + "kl": 6.805360317230225e-05, + "lambda_div_used": 0.5, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0103, + "reward": -0.5473397932946682, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5473397932946682, + "reward_after_std": 0.38293252885341644, + "reward_before_mean": -0.30693099461495876, + "reward_before_std": 0.3625225257128477, + "reward_change_max": 0.002422921359539032, + "reward_change_mean": -0.24040879029780626, + "reward_change_min": -0.4838433638215065, + "reward_change_std": 0.1895816596224904, + "reward_std": 0.38293253630399704, + "rewards/cosine_scaled_reward": -0.29929884150624275, + "rewards/format_reward": 0.2916666679084301, + "step": 42 + }, + { + "advantage_max": 1.2590354792773724, + "advantage_mean": 6.829699361610153e-09, + "advantage_min": -0.5609464049339294, + "advantage_std": 0.653152123093605, + "completion_length": 3096.708366394043, + "epoch": 0.04914285714285714, + "grad_norm": 0.11091286689043045, + "kl": 6.282329559326172e-05, + "lambda_div_used": 0.5, + "learning_rate": 8.599999999999999e-07, + "loss": -0.0118, + "reward": -0.2702128039672971, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2702128039672971, + "reward_after_std": 0.653152123093605, + "reward_before_mean": 0.1145242340862751, + "reward_before_std": 0.6108989454805851, + "reward_change_max": 0.0005005002021789551, + "reward_change_mean": -0.3847370594739914, + "reward_change_min": -0.6752089485526085, + "reward_change_std": 0.2768531898036599, + "reward_std": 0.6531521566212177, + "rewards/cosine_scaled_reward": -0.06773788295686245, + "rewards/format_reward": 0.25, + "step": 43 + }, + { + "advantage_max": 1.2386278919875622, + "advantage_mean": -7.450580763457282e-09, + "advantage_min": -0.5172981098294258, + "advantage_std": 0.6598590333014727, + "completion_length": 2765.3542098999023, + "epoch": 0.05028571428571429, + "grad_norm": 0.11159113794565201, + "kl": 0.00022670626640319824, + "lambda_div_used": 0.5, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0116, + "reward": -0.055530715733766556, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.055530715733766556, + "reward_after_std": 0.6598590407520533, + "reward_before_mean": 0.5103710060939193, + "reward_before_std": 0.5851284889504313, + "reward_change_max": 0.0, + "reward_change_mean": -0.5659017502330244, + "reward_change_min": -0.9489397816359997, + "reward_change_std": 0.38827237067744136, + "reward_std": 0.6598590649664402, + "rewards/cosine_scaled_reward": 0.0364355007186532, + "rewards/format_reward": 0.43750000558793545, + "step": 44 + }, + { + "advantage_max": 1.0946906879544258, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.48534848541021347, + "advantage_std": 0.5852056853473186, + "completion_length": 3439.000030517578, + "epoch": 0.05142857142857143, + "grad_norm": 0.0995851531624794, + "kl": 0.00010732375085353851, + "lambda_div_used": 0.5, + "learning_rate": 9e-07, + "loss": 0.017, + "reward": -0.437751529738307, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.437751529738307, + "reward_after_std": 0.5852056816220284, + "reward_before_mean": -0.16415877640247345, + "reward_before_std": 0.6127915233373642, + "reward_change_max": 0.001485571265220642, + "reward_change_mean": -0.27359276497736573, + "reward_change_min": -0.6492436826229095, + "reward_change_std": 0.2626556186005473, + "reward_std": 0.5852056965231895, + "rewards/cosine_scaled_reward": -0.17582939192652702, + "rewards/format_reward": 0.1875000037252903, + "step": 45 + }, + { + "advantage_max": 0.6511989608407021, + "advantage_mean": 2.23517424569053e-08, + "advantage_min": -0.3454531729221344, + "advantage_std": 0.3550384156405926, + "completion_length": 3257.6041717529297, + "epoch": 0.052571428571428575, + "grad_norm": 0.052597712725400925, + "kl": 0.0002651810646057129, + "lambda_div_used": 0.5, + "learning_rate": 9.2e-07, + "loss": 0.0014, + "reward": -0.5496700319345109, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5496700319345109, + "reward_after_std": 0.3550384044647217, + "reward_before_mean": -0.2950664572417736, + "reward_before_std": 0.3601315375417471, + "reward_change_max": 0.0012530013918876648, + "reward_change_mean": -0.25460357405245304, + "reward_change_min": -0.49163252115249634, + "reward_change_std": 0.20036437083035707, + "reward_std": 0.35503840632736683, + "rewards/cosine_scaled_reward": -0.2204498965293169, + "rewards/format_reward": 0.14583333395421505, + "step": 46 + }, + { + "advantage_max": 1.4671212919056416, + "advantage_mean": -8.071462664904772e-09, + "advantage_min": -0.8597921542823315, + "advantage_std": 0.8293537385761738, + "completion_length": 2995.979217529297, + "epoch": 0.053714285714285714, + "grad_norm": 0.14203596115112305, + "kl": 9.060092270374298e-05, + "lambda_div_used": 0.5, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0581, + "reward": 0.15175165981054306, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15175165981054306, + "reward_after_std": 0.8293537385761738, + "reward_before_mean": 0.8535946235060692, + "reward_before_std": 0.8959902841597795, + "reward_change_max": 0.0, + "reward_change_mean": -0.7018429655581713, + "reward_change_min": -1.2576069086790085, + "reward_change_std": 0.5387048032134771, + "reward_std": 0.8293537646532059, + "rewards/cosine_scaled_reward": 0.16638064198195934, + "rewards/format_reward": 0.5208333507180214, + "step": 47 + }, + { + "advantage_max": 1.550131119787693, + "advantage_mean": 1.7384689132704523e-08, + "advantage_min": -0.7010627537965775, + "advantage_std": 0.8304248489439487, + "completion_length": 2845.750030517578, + "epoch": 0.054857142857142854, + "grad_norm": 0.11722289025783539, + "kl": 0.0005335649475455284, + "lambda_div_used": 0.5, + "learning_rate": 9.6e-07, + "loss": 0.0032, + "reward": -0.13437421433627605, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13437421433627605, + "reward_after_std": 0.8304248340427876, + "reward_before_mean": 0.3132213608769234, + "reward_before_std": 0.860728744417429, + "reward_change_max": 0.0034469887614250183, + "reward_change_mean": -0.4475955702364445, + "reward_change_min": -0.9456360414624214, + "reward_change_std": 0.3944186642765999, + "reward_std": 0.8304248489439487, + "rewards/cosine_scaled_reward": -0.07255600206553936, + "rewards/format_reward": 0.45833333395421505, + "step": 48 + }, + { + "advantage_max": 1.5654122084379196, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.716593436896801, + "advantage_std": 0.8443465456366539, + "completion_length": 2267.2292137145996, + "epoch": 0.056, + "grad_norm": 0.11271828413009644, + "kl": 0.0002801865339279175, + "lambda_div_used": 0.5, + "learning_rate": 9.8e-07, + "loss": 0.0784, + "reward": -0.05247187614440918, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05247187614440918, + "reward_after_std": 0.8443465456366539, + "reward_before_mean": 0.464739790186286, + "reward_before_std": 0.8581503331661224, + "reward_change_max": 0.0022785961627960205, + "reward_change_mean": -0.5172116560861468, + "reward_change_min": -1.1284822821617126, + "reward_change_std": 0.44013802148401737, + "reward_std": 0.8443465568125248, + "rewards/cosine_scaled_reward": -0.059296777937561274, + "rewards/format_reward": 0.5833333376795053, + "step": 49 + }, + { + "advantage_max": 1.1785499043762684, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.5394556485116482, + "advantage_std": 0.6275908350944519, + "completion_length": 3042.1041984558105, + "epoch": 0.05714285714285714, + "grad_norm": 0.12018999457359314, + "kl": 0.00027485471218824387, + "lambda_div_used": 0.5, + "learning_rate": 1e-06, + "loss": 0.0332, + "reward": -0.20314022013917565, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.20314022013917565, + "reward_after_std": 0.6275908537209034, + "reward_before_mean": 0.25351108238101006, + "reward_before_std": 0.5844974275678396, + "reward_change_max": 0.0004190877079963684, + "reward_change_mean": -0.45665127877146006, + "reward_change_min": -0.8546877354383469, + "reward_change_std": 0.3474463615566492, + "reward_std": 0.6275908723473549, + "rewards/cosine_scaled_reward": -0.019077795557677746, + "rewards/format_reward": 0.2916666716337204, + "step": 50 + }, + { + "advantage_max": 1.1401083320379257, + "advantage_mean": 8.69234451084111e-09, + "advantage_min": -0.6032663956284523, + "advantage_std": 0.6303308606147766, + "completion_length": 2341.1250534057617, + "epoch": 0.05828571428571429, + "grad_norm": 0.10779228806495667, + "kl": 0.000632166862487793, + "lambda_div_used": 0.5, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0377, + "reward": -0.12168696755543351, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12168696755543351, + "reward_after_std": 0.6303308419883251, + "reward_before_mean": 0.4071807861328125, + "reward_before_std": 0.6401997022330761, + "reward_change_max": 0.0, + "reward_change_mean": -0.5288677718490362, + "reward_change_min": -0.9466118700802326, + "reward_change_std": 0.3940194044262171, + "reward_std": 0.6303308643400669, + "rewards/cosine_scaled_reward": -0.07765959948301315, + "rewards/format_reward": 0.5625000074505806, + "step": 51 + }, + { + "advantage_max": 1.657617561519146, + "advantage_mean": -6.208817349140361e-10, + "advantage_min": -0.8769890516996384, + "advantage_std": 0.8941908292472363, + "completion_length": 2828.6042251586914, + "epoch": 0.05942857142857143, + "grad_norm": 0.10411342233419418, + "kl": 0.0005970411002635956, + "lambda_div_used": 0.5, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0157, + "reward": 0.11933974362909794, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11933974362909794, + "reward_after_std": 0.8941908441483974, + "reward_before_mean": 0.7608568891882896, + "reward_before_std": 0.8951385281980038, + "reward_change_max": 0.000269867479801178, + "reward_change_mean": -0.6415171585977077, + "reward_change_min": -1.175170797854662, + "reward_change_std": 0.47852752916514874, + "reward_std": 0.8941908627748489, + "rewards/cosine_scaled_reward": 0.13042843155562878, + "rewards/format_reward": 0.5000000149011612, + "step": 52 + }, + { + "advantage_max": 1.5903307870030403, + "advantage_mean": -8.071462054282108e-09, + "advantage_min": -0.5871336311101913, + "advantage_std": 0.817248024046421, + "completion_length": 2820.187530517578, + "epoch": 0.060571428571428575, + "grad_norm": 0.11234336346387863, + "kl": 0.00043966108933091164, + "lambda_div_used": 0.5, + "learning_rate": 9.999013075636804e-07, + "loss": -0.0115, + "reward": 0.0432483796030283, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0432483796030283, + "reward_after_std": 0.8172480165958405, + "reward_before_mean": 0.6357301553944126, + "reward_before_std": 0.6847620755434036, + "reward_change_max": 0.0, + "reward_change_mean": -0.5924818031489849, + "reward_change_min": -0.9367171190679073, + "reward_change_std": 0.36971727199852467, + "reward_std": 0.8172480203211308, + "rewards/cosine_scaled_reward": 0.036615074845030904, + "rewards/format_reward": 0.5625000055879354, + "step": 53 + }, + { + "advantage_max": 2.0655234158039093, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.9336753264069557, + "advantage_std": 1.1151320487260818, + "completion_length": 2985.6250610351562, + "epoch": 0.061714285714285715, + "grad_norm": 0.16910512745380402, + "kl": 0.00020236149430274963, + "lambda_div_used": 0.5, + "learning_rate": 9.998245517681593e-07, + "loss": 0.044, + "reward": 0.15036086877807975, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.15036086877807975, + "reward_after_std": 1.1151320710778236, + "reward_before_mean": 0.7494669770821929, + "reward_before_std": 1.1877865009009838, + "reward_change_max": 0.0017044693231582642, + "reward_change_mean": -0.5991061069071293, + "reward_change_min": -1.2863488122820854, + "reward_change_std": 0.5509431846439838, + "reward_std": 1.115132100880146, + "rewards/cosine_scaled_reward": 0.10390015179291368, + "rewards/format_reward": 0.541666679084301, + "step": 54 + }, + { + "advantage_max": 1.346567988395691, + "advantage_mean": 5.5879357807597785e-09, + "advantage_min": -0.5341115295886993, + "advantage_std": 0.6975418590009212, + "completion_length": 3084.5834045410156, + "epoch": 0.06285714285714286, + "grad_norm": 0.10880491882562637, + "kl": 0.0010903775691986084, + "lambda_div_used": 0.5, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0464, + "reward": -0.2912944480776787, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2912944480776787, + "reward_after_std": 0.697541855275631, + "reward_before_mean": 0.06275918334722519, + "reward_before_std": 0.6657360903918743, + "reward_change_max": 0.0007643029093742371, + "reward_change_mean": -0.35405362490564585, + "reward_change_min": -0.6719070784747601, + "reward_change_std": 0.2700015977025032, + "reward_std": 0.697541881352663, + "rewards/cosine_scaled_reward": -0.10403707949444652, + "rewards/format_reward": 0.2708333358168602, + "step": 55 + }, + { + "advantage_max": 1.1837703846395016, + "advantage_mean": 1.0554989493538613e-08, + "advantage_min": -0.6579623222351074, + "advantage_std": 0.6466393321752548, + "completion_length": 2916.8333892822266, + "epoch": 0.064, + "grad_norm": 0.0897069051861763, + "kl": 0.0005567669868469238, + "lambda_div_used": 0.5, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0412, + "reward": -0.1802707426249981, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1802707426249981, + "reward_after_std": 0.646639347076416, + "reward_before_mean": 0.29126008972525597, + "reward_before_std": 0.6771029680967331, + "reward_change_max": 0.0011851340532302856, + "reward_change_mean": -0.47153082955628633, + "reward_change_min": -0.8577333986759186, + "reward_change_std": 0.365943206474185, + "reward_std": 0.6466393582522869, + "rewards/cosine_scaled_reward": -0.07311996631324291, + "rewards/format_reward": 0.4375000111758709, + "step": 56 + }, + { + "advantage_max": 1.5917630940675735, + "advantage_mean": 1.6763806898190126e-08, + "advantage_min": -0.5033136904239655, + "advantage_std": 0.8053608201444149, + "completion_length": 3434.687530517578, + "epoch": 0.06514285714285714, + "grad_norm": 0.11615301668643951, + "kl": 0.000165596604347229, + "lambda_div_used": 0.5, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0071, + "reward": -0.34377744421362877, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.34377744421362877, + "reward_after_std": 0.805360808968544, + "reward_before_mean": -0.07328794337809086, + "reward_before_std": 0.7592337466776371, + "reward_change_max": 0.00013331323862075806, + "reward_change_mean": -0.2704894933849573, + "reward_change_min": -0.5694965869188309, + "reward_change_std": 0.2236005710437894, + "reward_std": 0.8053608499467373, + "rewards/cosine_scaled_reward": -0.16164396703243256, + "rewards/format_reward": 0.2500000037252903, + "step": 57 + }, + { + "advantage_max": 1.8052620589733124, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7408896759152412, + "advantage_std": 0.9437650218605995, + "completion_length": 2223.4167251586914, + "epoch": 0.06628571428571428, + "grad_norm": 0.17151595652103424, + "kl": 0.004196107387542725, + "lambda_div_used": 0.5, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0393, + "reward": 0.08948480966500938, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08948480966500938, + "reward_after_std": 0.9437649697065353, + "reward_before_mean": 0.6831958554685116, + "reward_before_std": 0.8700358737260103, + "reward_change_max": 0.0, + "reward_change_mean": -0.5937110781669617, + "reward_change_min": -1.0060609802603722, + "reward_change_std": 0.41046774573624134, + "reward_std": 0.9437649697065353, + "rewards/cosine_scaled_reward": -0.02298540365882218, + "rewards/format_reward": 0.7291666753590107, + "step": 58 + }, + { + "advantage_max": 1.175541877746582, + "advantage_mean": -1.6763807009212428e-08, + "advantage_min": -0.44569920375943184, + "advantage_std": 0.6143423058092594, + "completion_length": 2941.7083587646484, + "epoch": 0.06742857142857143, + "grad_norm": 0.08178117871284485, + "kl": 0.000935891643166542, + "lambda_div_used": 0.5, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0342, + "reward": -0.2478409237228334, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2478409237228334, + "reward_after_std": 0.6143423058092594, + "reward_before_mean": 0.16964629292488098, + "reward_before_std": 0.5480784140527248, + "reward_change_max": 0.000612989068031311, + "reward_change_mean": -0.4174872115254402, + "reward_change_min": -0.8169004134833813, + "reward_change_std": 0.31252420227974653, + "reward_std": 0.6143423169851303, + "rewards/cosine_scaled_reward": -0.06101020169444382, + "rewards/format_reward": 0.2916666679084301, + "step": 59 + }, + { + "advantage_max": 1.0423083528876305, + "advantage_mean": 1.4901161526914564e-08, + "advantage_min": -0.4296622648835182, + "advantage_std": 0.5458662435412407, + "completion_length": 2984.916679382324, + "epoch": 0.06857142857142857, + "grad_norm": 0.07768469303846359, + "kl": 0.0006950497627258301, + "lambda_div_used": 0.5, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0063, + "reward": -0.40611574915237725, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.40611574915237725, + "reward_after_std": 0.5458662435412407, + "reward_before_mean": -0.0968768410384655, + "reward_before_std": 0.5283457562327385, + "reward_change_max": 0.002580612897872925, + "reward_change_mean": -0.30923892557621, + "reward_change_min": -0.6607633791863918, + "reward_change_std": 0.24943595007061958, + "reward_std": 0.5458662621676922, + "rewards/cosine_scaled_reward": -0.2046884261071682, + "rewards/format_reward": 0.31250000186264515, + "step": 60 + }, + { + "advantage_max": 1.457154531031847, + "advantage_mean": 9.313225801665936e-09, + "advantage_min": -0.7158756963908672, + "advantage_std": 0.7909952085465193, + "completion_length": 3087.0208740234375, + "epoch": 0.06971428571428571, + "grad_norm": 0.16649970412254333, + "kl": 0.0007555186748504639, + "lambda_div_used": 0.5, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0935, + "reward": -0.13823260087519884, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13823260087519884, + "reward_after_std": 0.790995180606842, + "reward_before_mean": 0.3234656620770693, + "reward_before_std": 0.8312350269407034, + "reward_change_max": 0.0023474469780921936, + "reward_change_mean": -0.46169828064739704, + "reward_change_min": -0.9537935554981232, + "reward_change_std": 0.4016226176172495, + "reward_std": 0.790995180606842, + "rewards/cosine_scaled_reward": -0.0674338429234922, + "rewards/format_reward": 0.45833334513008595, + "step": 61 + }, + { + "advantage_max": 2.0991614311933517, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.7645231634378433, + "advantage_std": 1.0837591513991356, + "completion_length": 2622.6458740234375, + "epoch": 0.07085714285714285, + "grad_norm": 0.16811510920524597, + "kl": 0.015173658728599548, + "lambda_div_used": 0.5, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0233, + "reward": 0.20936205855105072, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20936205855105072, + "reward_after_std": 1.0837591513991356, + "reward_before_mean": 0.8541646478697658, + "reward_before_std": 0.9757980816066265, + "reward_change_max": 0.0, + "reward_change_mean": -0.6448025852441788, + "reward_change_min": -1.1601449958980083, + "reward_change_std": 0.456626171246171, + "reward_std": 1.0837591886520386, + "rewards/cosine_scaled_reward": 0.0937489839270711, + "rewards/format_reward": 0.6666666809469461, + "step": 62 + }, + { + "advantage_max": 1.5456999205052853, + "advantage_mean": -6.20881729362921e-09, + "advantage_min": -0.6523657143115997, + "advantage_std": 0.7971435803920031, + "completion_length": 2413.3958740234375, + "epoch": 0.072, + "grad_norm": 0.12558011710643768, + "kl": 0.0017941594123840332, + "lambda_div_used": 0.5, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0628, + "reward": 0.025665222201496363, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.025665222201496363, + "reward_after_std": 0.7971435710787773, + "reward_before_mean": 0.6093635726720095, + "reward_before_std": 0.6957045514136553, + "reward_change_max": 0.0003859475255012512, + "reward_change_mean": -0.5836983378976583, + "reward_change_min": -0.9405001066625118, + "reward_change_std": 0.3708849251270294, + "reward_std": 0.7971436083316803, + "rewards/cosine_scaled_reward": 0.013015098869800568, + "rewards/format_reward": 0.5833333469927311, + "step": 63 + }, + { + "advantage_max": 1.5050344467163086, + "advantage_mean": 1.8005570256995895e-08, + "advantage_min": -0.6849210783839226, + "advantage_std": 0.8289222978055477, + "completion_length": 3141.7500610351562, + "epoch": 0.07314285714285715, + "grad_norm": 0.16174478828907013, + "kl": 0.0019003748893737793, + "lambda_div_used": 0.5, + "learning_rate": 9.97852329991824e-07, + "loss": 0.1196, + "reward": -0.21013362589292228, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.21013362589292228, + "reward_after_std": 0.8289222978055477, + "reward_before_mean": 0.1825589847867377, + "reward_before_std": 0.923391830176115, + "reward_change_max": 0.0005897730588912964, + "reward_change_mean": -0.39269259478896856, + "reward_change_min": -0.9992674365639687, + "reward_change_std": 0.4153337189927697, + "reward_std": 0.8289223089814186, + "rewards/cosine_scaled_reward": -0.05455384007655084, + "rewards/format_reward": 0.29166667722165585, + "step": 64 + }, + { + "advantage_max": 1.3541830480098724, + "advantage_mean": -1.2417631367611648e-09, + "advantage_min": -0.5969244241714478, + "advantage_std": 0.7039464600384235, + "completion_length": 2772.8333587646484, + "epoch": 0.07428571428571429, + "grad_norm": 0.09448474645614624, + "kl": 0.003227710723876953, + "lambda_div_used": 0.5, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0235, + "reward": -0.26149132908903994, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.26149132908903994, + "reward_after_std": 0.7039464488625526, + "reward_before_mean": 0.11393103189766407, + "reward_before_std": 0.6785321645438671, + "reward_change_max": 0.002308964729309082, + "reward_change_mean": -0.3754223734140396, + "reward_change_min": -0.7083380743861198, + "reward_change_std": 0.293591745197773, + "reward_std": 0.7039464600384235, + "rewards/cosine_scaled_reward": -0.15136783104389906, + "rewards/format_reward": 0.41666666977107525, + "step": 65 + }, + { + "advantage_max": 1.297847893089056, + "advantage_mean": 5.5879357807597785e-09, + "advantage_min": -0.4594452455639839, + "advantage_std": 0.6709181014448404, + "completion_length": 2317.3125038146973, + "epoch": 0.07542857142857143, + "grad_norm": 0.09020698070526123, + "kl": 0.002762317657470703, + "lambda_div_used": 0.5, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0007, + "reward": -0.0659542977809906, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0659542977809906, + "reward_after_std": 0.6709181014448404, + "reward_before_mean": 0.478666216135025, + "reward_before_std": 0.5349580403417349, + "reward_change_max": 0.0030061379075050354, + "reward_change_mean": -0.5446205204352736, + "reward_change_min": -0.9633432440459728, + "reward_change_std": 0.36170417233370245, + "reward_std": 0.6709181163460016, + "rewards/cosine_scaled_reward": 0.010166438878513873, + "rewards/format_reward": 0.4583333358168602, + "step": 66 + }, + { + "advantage_max": 1.126870758831501, + "advantage_mean": 1.8626452047421083e-08, + "advantage_min": -0.4127872511744499, + "advantage_std": 0.5886235721409321, + "completion_length": 3524.2916870117188, + "epoch": 0.07657142857142857, + "grad_norm": 0.10760419815778732, + "kl": 0.0019077062606811523, + "lambda_div_used": 0.5, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0454, + "reward": -0.5646808911114931, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5646808911114931, + "reward_after_std": 0.5886235684156418, + "reward_before_mean": -0.4027576297521591, + "reward_before_std": 0.6220518052577972, + "reward_change_max": 0.0007558688521385193, + "reward_change_mean": -0.16192326415330172, + "reward_change_min": -0.5088565908372402, + "reward_change_std": 0.20086207846179605, + "reward_std": 0.5886235684156418, + "rewards/cosine_scaled_reward": -0.24304548278450966, + "rewards/format_reward": 0.0833333358168602, + "step": 67 + }, + { + "advantage_max": 1.5252962484955788, + "advantage_mean": 2.4835271617007493e-09, + "advantage_min": -0.7342267334461212, + "advantage_std": 0.835030809044838, + "completion_length": 2333.3541946411133, + "epoch": 0.07771428571428571, + "grad_norm": 0.15104389190673828, + "kl": 0.004139900207519531, + "lambda_div_used": 0.5, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0705, + "reward": -0.011033048038370907, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.011033048038370907, + "reward_after_std": 0.8350308053195477, + "reward_before_mean": 0.5420394577085972, + "reward_before_std": 0.8811622634530067, + "reward_change_max": 0.0, + "reward_change_mean": -0.5530724860727787, + "reward_change_min": -1.1742777340114117, + "reward_change_std": 0.4708987697958946, + "reward_std": 0.8350308425724506, + "rewards/cosine_scaled_reward": -0.010230285115540028, + "rewards/format_reward": 0.5625000074505806, + "step": 68 + }, + { + "advantage_max": 1.0369067564606667, + "advantage_mean": 2.4214387495113954e-08, + "advantage_min": -0.382175724953413, + "advantage_std": 0.5327570177614689, + "completion_length": 2721.0833740234375, + "epoch": 0.07885714285714286, + "grad_norm": 0.07868275046348572, + "kl": 0.003982067108154297, + "lambda_div_used": 0.5, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0352, + "reward": -0.43468883633613586, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.43468883633613586, + "reward_after_std": 0.5327570252120495, + "reward_before_mean": -0.1535684810951352, + "reward_before_std": 0.4849686697125435, + "reward_change_max": 0.0007972046732902527, + "reward_change_mean": -0.2811203598976135, + "reward_change_min": -0.5471938513219357, + "reward_change_std": 0.20546625927090645, + "reward_std": 0.5327570587396622, + "rewards/cosine_scaled_reward": -0.24345091171562672, + "rewards/format_reward": 0.3333333358168602, + "step": 69 + }, + { + "advantage_max": 1.3918376974761486, + "advantage_mean": -6.208819014474898e-10, + "advantage_min": -0.49868446588516235, + "advantage_std": 0.7050085607916117, + "completion_length": 3057.2291870117188, + "epoch": 0.08, + "grad_norm": 0.1143859475851059, + "kl": 0.0013420581817626953, + "lambda_div_used": 0.5, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0225, + "reward": -0.2947548748925328, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2947548748925328, + "reward_after_std": 0.7050085607916117, + "reward_before_mean": 0.04767380794510245, + "reward_before_std": 0.6246827412396669, + "reward_change_max": 0.0009760409593582153, + "reward_change_mean": -0.34242871031165123, + "reward_change_min": -0.5683609507977962, + "reward_change_std": 0.24109898321330547, + "reward_std": 0.7050085626542568, + "rewards/cosine_scaled_reward": -0.1532464288175106, + "rewards/format_reward": 0.3541666679084301, + "step": 70 + }, + { + "advantage_max": 1.0536888763308525, + "advantage_mean": 1.862645426786713e-09, + "advantage_min": -0.5757294371724129, + "advantage_std": 0.5784401223063469, + "completion_length": 2708.5625228881836, + "epoch": 0.08114285714285714, + "grad_norm": 0.10196925699710846, + "kl": 0.005338191986083984, + "lambda_div_used": 0.5, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0312, + "reward": -0.1822344735264778, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1822344735264778, + "reward_after_std": 0.5784401334822178, + "reward_before_mean": 0.3108774647116661, + "reward_before_std": 0.5767860356718302, + "reward_change_max": 0.003112994134426117, + "reward_change_mean": -0.4931119680404663, + "reward_change_min": -0.8853396885097027, + "reward_change_std": 0.36358125135302544, + "reward_std": 0.5784401521086693, + "rewards/cosine_scaled_reward": -0.042477929033339024, + "rewards/format_reward": 0.3958333395421505, + "step": 71 + }, + { + "advantage_max": 1.6563706696033478, + "advantage_mean": 1.924733389335742e-08, + "advantage_min": -0.6117210127413273, + "advantage_std": 0.8583654910326004, + "completion_length": 3154.312530517578, + "epoch": 0.08228571428571428, + "grad_norm": 0.145771786570549, + "kl": 0.004016876220703125, + "lambda_div_used": 0.5, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0721, + "reward": -0.3233127495041117, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3233127495041117, + "reward_after_std": 0.8583654500544071, + "reward_before_mean": -0.04943877179175615, + "reward_before_std": 0.8772860243916512, + "reward_change_max": 0.005304671823978424, + "reward_change_mean": -0.27387397922575474, + "reward_change_min": -0.6673308126628399, + "reward_change_std": 0.284614821895957, + "reward_std": 0.8583654649555683, + "rewards/cosine_scaled_reward": -0.18096939055249095, + "rewards/format_reward": 0.3125000111758709, + "step": 72 + }, + { + "advantage_max": 1.26315114274621, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -0.4227764904499054, + "advantage_std": 0.639428773894906, + "completion_length": 3549.5208740234375, + "epoch": 0.08342857142857144, + "grad_norm": 0.09522831439971924, + "kl": 0.0009833574295043945, + "lambda_div_used": 0.5, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0254, + "reward": -0.5094889532774687, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5094889532774687, + "reward_after_std": 0.6394287925213575, + "reward_before_mean": -0.3250623978674412, + "reward_before_std": 0.6202701851725578, + "reward_change_max": 0.0, + "reward_change_mean": -0.18442655354738235, + "reward_change_min": -0.386491771787405, + "reward_change_std": 0.1530038034543395, + "reward_std": 0.6394288036972284, + "rewards/cosine_scaled_reward": -0.2041978659108281, + "rewards/format_reward": 0.0833333358168602, + "step": 73 + }, + { + "advantage_max": 1.29529245570302, + "advantage_mean": 2.048909719665204e-08, + "advantage_min": -0.43963947519659996, + "advantage_std": 0.6664884742349386, + "completion_length": 3175.500030517578, + "epoch": 0.08457142857142858, + "grad_norm": 0.12290780991315842, + "kl": 0.003965795040130615, + "lambda_div_used": 0.5, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0581, + "reward": -0.2856890633702278, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2856890633702278, + "reward_after_std": 0.6664884742349386, + "reward_before_mean": 0.08298108261078596, + "reward_before_std": 0.6028281692415476, + "reward_change_max": 0.0005967244505882263, + "reward_change_mean": -0.3686701231636107, + "reward_change_min": -0.7452529221773148, + "reward_change_std": 0.27662853663787246, + "reward_std": 0.6664884965866804, + "rewards/cosine_scaled_reward": -0.08350946195423603, + "rewards/format_reward": 0.2500000037252903, + "step": 74 + }, + { + "advantage_max": 1.0755420252680779, + "advantage_mean": 0.0, + "advantage_min": -0.41619032248854637, + "advantage_std": 0.5525711067020893, + "completion_length": 3297.1666870117188, + "epoch": 0.08571428571428572, + "grad_norm": 0.09391221404075623, + "kl": 0.0034029483795166016, + "lambda_div_used": 0.5, + "learning_rate": 9.931634888554935e-07, + "loss": 0.023, + "reward": -0.20981414895504713, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.20981414895504713, + "reward_after_std": 0.5525711178779602, + "reward_before_mean": 0.2553990473970771, + "reward_before_std": 0.43019257858395576, + "reward_change_max": 0.0018067359924316406, + "reward_change_mean": -0.46521320194005966, + "reward_change_min": -0.7154612056910992, + "reward_change_std": 0.28662352077662945, + "reward_std": 0.5525711290538311, + "rewards/cosine_scaled_reward": -0.04938381724059582, + "rewards/format_reward": 0.35416666977107525, + "step": 75 + }, + { + "advantage_max": 1.1918527409434319, + "advantage_mean": 1.691902684619606e-08, + "advantage_min": -0.5500158071517944, + "advantage_std": 0.6363635919988155, + "completion_length": 3000.937530517578, + "epoch": 0.08685714285714285, + "grad_norm": 0.10816562920808792, + "kl": 0.0012085437774658203, + "lambda_div_used": 0.5, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0431, + "reward": -0.2598666944541037, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2598666944541037, + "reward_after_std": 0.6363636236637831, + "reward_before_mean": 0.14593160874210298, + "reward_before_std": 0.6384630594402552, + "reward_change_max": 0.0007710233330726624, + "reward_change_mean": -0.4057982945814729, + "reward_change_min": -0.7503902576863766, + "reward_change_std": 0.3121058507822454, + "reward_std": 0.6363636441528797, + "rewards/cosine_scaled_reward": -0.1457842094823718, + "rewards/format_reward": 0.43750001676380634, + "step": 76 + }, + { + "advantage_max": 0.9090688228607178, + "advantage_mean": -3.725290242950763e-09, + "advantage_min": -0.5261832289397717, + "advantage_std": 0.5043393410742283, + "completion_length": 3174.2083740234375, + "epoch": 0.088, + "grad_norm": 0.11574254930019379, + "kl": 0.0021175146102905273, + "lambda_div_used": 0.5, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0272, + "reward": -0.34673725441098213, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.34673725441098213, + "reward_after_std": 0.5043393447995186, + "reward_before_mean": 0.03295615315437317, + "reward_before_std": 0.5356946103274822, + "reward_change_max": 0.0012821629643440247, + "reward_change_mean": -0.37969343923032284, + "reward_change_min": -0.6977962926030159, + "reward_change_std": 0.30058483220636845, + "reward_std": 0.5043393522500992, + "rewards/cosine_scaled_reward": -0.12935525551438332, + "rewards/format_reward": 0.29166667722165585, + "step": 77 + }, + { + "advantage_max": 1.8381619974970818, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7710757628083229, + "advantage_std": 0.9732430391013622, + "completion_length": 3186.3958587646484, + "epoch": 0.08914285714285715, + "grad_norm": 0.15152576565742493, + "kl": 0.0022534728050231934, + "lambda_div_used": 0.5, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0564, + "reward": -0.15935248951427639, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15935248951427639, + "reward_after_std": 0.9732430167496204, + "reward_before_mean": 0.22409775853157043, + "reward_before_std": 1.021384745836258, + "reward_change_max": 0.003303632140159607, + "reward_change_mean": -0.38345025200396776, + "reward_change_min": -0.810688778758049, + "reward_change_std": 0.3610359411686659, + "reward_std": 0.9732430540025234, + "rewards/cosine_scaled_reward": -0.03378446213901043, + "rewards/format_reward": 0.29166667349636555, + "step": 78 + }, + { + "advantage_max": 1.4568165950477123, + "advantage_mean": -4.346172088887101e-09, + "advantage_min": -0.5802691504359245, + "advantage_std": 0.7698210962116718, + "completion_length": 2612.4583740234375, + "epoch": 0.09028571428571429, + "grad_norm": 0.10133597254753113, + "kl": 0.004894614219665527, + "lambda_div_used": 0.5, + "learning_rate": 9.908088623197048e-07, + "loss": 0.019, + "reward": -0.19068989902734756, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19068989902734756, + "reward_after_std": 0.7698210999369621, + "reward_before_mean": 0.2276688851416111, + "reward_before_std": 0.7653483748435974, + "reward_change_max": 0.0016639381647109985, + "reward_change_mean": -0.41835877299308777, + "reward_change_min": -0.8724069856107235, + "reward_change_std": 0.3391466625034809, + "reward_std": 0.7698211185634136, + "rewards/cosine_scaled_reward": -0.12574889697134495, + "rewards/format_reward": 0.4791666679084301, + "step": 79 + }, + { + "advantage_max": 1.4858717247843742, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.59293033182621, + "advantage_std": 0.8019508235156536, + "completion_length": 3314.4791870117188, + "epoch": 0.09142857142857143, + "grad_norm": 0.1212073266506195, + "kl": 0.003013134002685547, + "lambda_div_used": 0.5, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0403, + "reward": -0.3154444256797433, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3154444256797433, + "reward_after_std": 0.8019508309662342, + "reward_before_mean": -0.0058077238500118256, + "reward_before_std": 0.8834160603582859, + "reward_change_max": 0.004029706120491028, + "reward_change_mean": -0.30963670555502176, + "reward_change_min": -0.9396452568471432, + "reward_change_std": 0.37205731589347124, + "reward_std": 0.8019508682191372, + "rewards/cosine_scaled_reward": -0.13832053111400455, + "rewards/format_reward": 0.2708333358168602, + "step": 80 + }, + { + "advantage_max": 1.2512575164437294, + "advantage_mean": -1.862645149230957e-09, + "advantage_min": -0.5687988735735416, + "advantage_std": 0.6636236608028412, + "completion_length": 3201.5833740234375, + "epoch": 0.09257142857142857, + "grad_norm": 0.1467030793428421, + "kl": 0.009012222290039062, + "lambda_div_used": 0.5, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0302, + "reward": -0.29607150983065367, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.29607150983065367, + "reward_after_std": 0.6636236682534218, + "reward_before_mean": 0.06822922918945551, + "reward_before_std": 0.6745604537427425, + "reward_change_max": 0.0014805421233177185, + "reward_change_mean": -0.36430073343217373, + "reward_change_min": -0.7246239744126797, + "reward_change_std": 0.29929828830063343, + "reward_std": 0.6636237092316151, + "rewards/cosine_scaled_reward": -0.1325520584359765, + "rewards/format_reward": 0.3333333395421505, + "step": 81 + }, + { + "advantage_max": 1.4132002517580986, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.6526295244693756, + "advantage_std": 0.7535315006971359, + "completion_length": 2793.104202270508, + "epoch": 0.09371428571428571, + "grad_norm": 0.10430161654949188, + "kl": 0.006566286087036133, + "lambda_div_used": 0.5, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0376, + "reward": -0.020408831536769867, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.020408831536769867, + "reward_after_std": 0.7535315155982971, + "reward_before_mean": 0.5454386062920094, + "reward_before_std": 0.7008488159626722, + "reward_change_max": 0.0014094933867454529, + "reward_change_mean": -0.565847460180521, + "reward_change_min": -0.9649605080485344, + "reward_change_std": 0.40595896914601326, + "reward_std": 0.7535315193235874, + "rewards/cosine_scaled_reward": 0.05396931990981102, + "rewards/format_reward": 0.4375000074505806, + "step": 82 + }, + { + "advantage_max": 1.1509655825793743, + "advantage_mean": 1.4280280014045132e-08, + "advantage_min": -0.4061584994196892, + "advantage_std": 0.5890420638024807, + "completion_length": 2931.104179382324, + "epoch": 0.09485714285714286, + "grad_norm": 0.08529522269964218, + "kl": 0.003251791000366211, + "lambda_div_used": 0.5, + "learning_rate": 9.881105062929221e-07, + "loss": -0.0002, + "reward": -0.3930655550211668, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3930655550211668, + "reward_after_std": 0.5890420712530613, + "reward_before_mean": -0.09176901169121265, + "reward_before_std": 0.5422711819410324, + "reward_change_max": 0.0007033348083496094, + "reward_change_mean": -0.3012965607922524, + "reward_change_min": -0.5879498608410358, + "reward_change_std": 0.22128173056989908, + "reward_std": 0.5890420861542225, + "rewards/cosine_scaled_reward": -0.181301174685359, + "rewards/format_reward": 0.27083333395421505, + "step": 83 + }, + { + "advantage_max": 1.3313812613487244, + "advantage_mean": 1.3659397946064189e-08, + "advantage_min": -0.5420710518956184, + "advantage_std": 0.708248311188072, + "completion_length": 3138.3958587646484, + "epoch": 0.096, + "grad_norm": 0.10056845843791962, + "kl": 0.0016238689422607422, + "lambda_div_used": 0.5, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0243, + "reward": -0.1541997790336609, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1541997790336609, + "reward_after_std": 0.7082483097910881, + "reward_before_mean": 0.31638404354453087, + "reward_before_std": 0.6747408444061875, + "reward_change_max": 0.0010876953601837158, + "reward_change_mean": -0.4705838493537158, + "reward_change_min": -0.8658628650009632, + "reward_change_std": 0.35376761644147336, + "reward_std": 0.7082483172416687, + "rewards/cosine_scaled_reward": -0.008474626112729311, + "rewards/format_reward": 0.33333334140479565, + "step": 84 + }, + { + "advantage_max": 1.905438233166933, + "advantage_mean": 9.313225857177088e-09, + "advantage_min": -0.6710572019219398, + "advantage_std": 0.9832526743412018, + "completion_length": 3167.2500610351562, + "epoch": 0.09714285714285714, + "grad_norm": 0.16388022899627686, + "kl": 0.0030014514923095703, + "lambda_div_used": 0.5, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0552, + "reward": -0.16078246012330055, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.16078246012330055, + "reward_after_std": 0.9832526911050081, + "reward_before_mean": 0.20949235220905393, + "reward_before_std": 0.968382814899087, + "reward_change_max": 0.0020551979541778564, + "reward_change_mean": -0.3702748082578182, + "reward_change_min": -0.7871052846312523, + "reward_change_std": 0.33008442260324955, + "reward_std": 0.9832527544349432, + "rewards/cosine_scaled_reward": -0.08275382965803146, + "rewards/format_reward": 0.37500000558793545, + "step": 85 + }, + { + "advantage_max": 1.339600756764412, + "advantage_mean": 4.967053768289986e-09, + "advantage_min": -0.5849123671650887, + "advantage_std": 0.6978941671550274, + "completion_length": 3120.5, + "epoch": 0.09828571428571428, + "grad_norm": 0.14190861582756042, + "kl": 0.003871917724609375, + "lambda_div_used": 0.5, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0345, + "reward": -0.2158273388631642, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2158273388631642, + "reward_after_std": 0.6978941857814789, + "reward_before_mean": 0.2014560904353857, + "reward_before_std": 0.6507901139557362, + "reward_change_max": 0.0007161274552345276, + "reward_change_mean": -0.41728341206908226, + "reward_change_min": -0.740577656775713, + "reward_change_std": 0.3097661882638931, + "reward_std": 0.697894211858511, + "rewards/cosine_scaled_reward": -0.08677197038196027, + "rewards/format_reward": 0.3750000074505806, + "step": 86 + }, + { + "advantage_max": 1.4886211231350899, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.7102038748562336, + "advantage_std": 0.7915925346314907, + "completion_length": 2863.166717529297, + "epoch": 0.09942857142857142, + "grad_norm": 0.13210627436637878, + "kl": 0.007967233657836914, + "lambda_div_used": 0.5, + "learning_rate": 9.850705248720068e-07, + "loss": -0.0141, + "reward": -0.13769594300538301, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13769594300538301, + "reward_after_std": 0.7915925309062004, + "reward_before_mean": 0.3190428577363491, + "reward_before_std": 0.7937264069914818, + "reward_change_max": 0.0012602433562278748, + "reward_change_mean": -0.4567387877032161, + "reward_change_min": -0.8872962296009064, + "reward_change_std": 0.36563306488096714, + "reward_std": 0.7915925495326519, + "rewards/cosine_scaled_reward": -0.10089525021612644, + "rewards/format_reward": 0.5208333507180214, + "step": 87 + }, + { + "advantage_max": 2.120447114109993, + "advantage_mean": -5.587935447692871e-09, + "advantage_min": -0.865127693861723, + "advantage_std": 1.1003176234662533, + "completion_length": 3047.0000610351562, + "epoch": 0.10057142857142858, + "grad_norm": 0.22280767560005188, + "kl": 0.009730815887451172, + "lambda_div_used": 0.5, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0637, + "reward": -0.025142807513475418, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.025142807513475418, + "reward_after_std": 1.100317608565092, + "reward_before_mean": 0.4224190632812679, + "reward_before_std": 1.0945978332310915, + "reward_change_max": 0.0009044483304023743, + "reward_change_mean": -0.447561863809824, + "reward_change_min": -0.8933478519320488, + "reward_change_std": 0.3938889466226101, + "reward_std": 1.1003176383674145, + "rewards/cosine_scaled_reward": -0.01795714534819126, + "rewards/format_reward": 0.4583333469927311, + "step": 88 + }, + { + "advantage_max": 1.064413994550705, + "advantage_mean": -8.692344843908018e-09, + "advantage_min": -0.6095254570245743, + "advantage_std": 0.5904275216162205, + "completion_length": 3318.4583435058594, + "epoch": 0.10171428571428572, + "grad_norm": 0.0975460335612297, + "kl": 0.004989147186279297, + "lambda_div_used": 0.5, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0134, + "reward": -0.29414668679237366, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.29414668679237366, + "reward_after_std": 0.5904275141656399, + "reward_before_mean": 0.10153248719871044, + "reward_before_std": 0.6402346752583981, + "reward_change_max": 0.001469351351261139, + "reward_change_mean": -0.39567921683192253, + "reward_change_min": -0.7922942489385605, + "reward_change_std": 0.335257139056921, + "reward_std": 0.590427540242672, + "rewards/cosine_scaled_reward": -0.06381708104163408, + "rewards/format_reward": 0.22916666977107525, + "step": 89 + }, + { + "advantage_max": 1.1351367458701134, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.5059534460306168, + "advantage_std": 0.5984435714781284, + "completion_length": 2699.354202270508, + "epoch": 0.10285714285714286, + "grad_norm": 0.10836907476186752, + "kl": 0.016954421997070312, + "lambda_div_used": 0.5, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0272, + "reward": -0.37371181324124336, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.37371181324124336, + "reward_after_std": 0.5984435267746449, + "reward_before_mean": -0.06075846217572689, + "reward_before_std": 0.6041407622396946, + "reward_change_max": 0.0012149438261985779, + "reward_change_mean": -0.31295335572212934, + "reward_change_min": -0.637012179940939, + "reward_change_std": 0.257972976192832, + "reward_std": 0.5984435491263866, + "rewards/cosine_scaled_reward": -0.2282959033473162, + "rewards/format_reward": 0.39583334513008595, + "step": 90 + }, + { + "advantage_max": 1.5629900321364403, + "advantage_mean": -8.071462387349015e-09, + "advantage_min": -0.6664563044905663, + "advantage_std": 0.8228764943778515, + "completion_length": 3135.5834045410156, + "epoch": 0.104, + "grad_norm": 0.15579403936862946, + "kl": 0.004920244216918945, + "lambda_div_used": 0.5, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0272, + "reward": -0.1463136593811214, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1463136593811214, + "reward_after_std": 0.8228764794766903, + "reward_before_mean": 0.29082170128822327, + "reward_before_std": 0.8140651769936085, + "reward_change_max": 0.00046353042125701904, + "reward_change_mean": -0.4371353592723608, + "reward_change_min": -0.8551024608314037, + "reward_change_std": 0.35122772585600615, + "reward_std": 0.8228764906525612, + "rewards/cosine_scaled_reward": -0.07333916798233986, + "rewards/format_reward": 0.43750000558793545, + "step": 91 + }, + { + "advantage_max": 1.1421417072415352, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.6064575128257275, + "advantage_std": 0.6331437919288874, + "completion_length": 3024.729202270508, + "epoch": 0.10514285714285715, + "grad_norm": 0.19823330640792847, + "kl": 0.09358072280883789, + "lambda_div_used": 0.5, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0235, + "reward": -0.25559084489941597, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.25559084489941597, + "reward_after_std": 0.633143799379468, + "reward_before_mean": 0.1625400371849537, + "reward_before_std": 0.6757265739142895, + "reward_change_max": 0.0014690980315208435, + "reward_change_mean": -0.418130905367434, + "reward_change_min": -0.8377382159233093, + "reward_change_std": 0.363980152644217, + "reward_std": 0.6331438161432743, + "rewards/cosine_scaled_reward": -0.1270633153617382, + "rewards/format_reward": 0.4166666753590107, + "step": 92 + }, + { + "advantage_max": 0.7856199890375137, + "advantage_mean": 2.297262391426358e-08, + "advantage_min": -0.33980754390358925, + "advantage_std": 0.41527827456593513, + "completion_length": 3444.3958740234375, + "epoch": 0.10628571428571429, + "grad_norm": 0.07124119251966476, + "kl": 0.005066871643066406, + "lambda_div_used": 0.5, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0207, + "reward": -0.6138800643384457, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.6138800643384457, + "reward_after_std": 0.41527827456593513, + "reward_before_mean": -0.436697356402874, + "reward_before_std": 0.4258286878466606, + "reward_change_max": 0.002537757158279419, + "reward_change_mean": -0.17718270793557167, + "reward_change_min": -0.4357273578643799, + "reward_change_std": 0.1735476478934288, + "reward_std": 0.41527828946709633, + "rewards/cosine_scaled_reward": -0.27043201215565205, + "rewards/format_reward": 0.1041666679084301, + "step": 93 + }, + { + "advantage_max": 1.3759141564369202, + "advantage_mean": -6.208819014474898e-10, + "advantage_min": -0.5324600636959076, + "advantage_std": 0.7108637019991875, + "completion_length": 3188.0416870117188, + "epoch": 0.10742857142857143, + "grad_norm": 0.10782060027122498, + "kl": 0.00946807861328125, + "lambda_div_used": 0.5, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0183, + "reward": -0.3538913428783417, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3538913428783417, + "reward_after_std": 0.7108637019991875, + "reward_before_mean": -0.05700744315981865, + "reward_before_std": 0.7008998841047287, + "reward_change_max": 0.00010148435831069946, + "reward_change_mean": -0.2968839295208454, + "reward_change_min": -0.6375745087862015, + "reward_change_std": 0.2521350774914026, + "reward_std": 0.7108637318015099, + "rewards/cosine_scaled_reward": -0.16392040066421032, + "rewards/format_reward": 0.27083333767950535, + "step": 94 + }, + { + "advantage_max": 1.2005148455500603, + "advantage_mean": 2.7318796669284495e-08, + "advantage_min": -0.5142731741070747, + "advantage_std": 0.6438871584832668, + "completion_length": 3527.0625, + "epoch": 0.10857142857142857, + "grad_norm": 0.10404420644044876, + "kl": 0.002773284912109375, + "lambda_div_used": 0.5, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0085, + "reward": -0.41304099559783936, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.41304099559783936, + "reward_after_std": 0.6438871622085571, + "reward_before_mean": -0.13589461334049702, + "reward_before_std": 0.6911821234971285, + "reward_change_max": 0.0043373703956604, + "reward_change_mean": -0.277146372012794, + "reward_change_min": -0.6644494272768497, + "reward_change_std": 0.27538349106907845, + "reward_std": 0.6438871771097183, + "rewards/cosine_scaled_reward": -0.16169731132686138, + "rewards/format_reward": 0.1875000037252903, + "step": 95 + }, + { + "advantage_max": 1.451909989118576, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.6129937767982483, + "advantage_std": 0.7681609587743878, + "completion_length": 3130.5208740234375, + "epoch": 0.10971428571428571, + "grad_norm": 0.13211211562156677, + "kl": 0.007869243621826172, + "lambda_div_used": 0.5, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0646, + "reward": -0.2191711962223053, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2191711962223053, + "reward_after_std": 0.7681609885767102, + "reward_before_mean": 0.17698761075735092, + "reward_before_std": 0.7817029012367129, + "reward_change_max": 0.0002824366092681885, + "reward_change_mean": -0.39615882001817226, + "reward_change_min": -0.8239215202629566, + "reward_change_std": 0.3346262890845537, + "reward_std": 0.7681610230356455, + "rewards/cosine_scaled_reward": -0.06775618996471167, + "rewards/format_reward": 0.3125000037252903, + "step": 96 + }, + { + "advantage_max": 1.2568126060068607, + "advantage_mean": 6.829699139565548e-09, + "advantage_min": -0.5489920303225517, + "advantage_std": 0.6930233538150787, + "completion_length": 3283.2083435058594, + "epoch": 0.11085714285714286, + "grad_norm": 0.1216772198677063, + "kl": 0.00519561767578125, + "lambda_div_used": 0.5, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0276, + "reward": -0.2372879907488823, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2372879907488823, + "reward_after_std": 0.6930233649909496, + "reward_before_mean": 0.17726046219468117, + "reward_before_std": 0.727190125733614, + "reward_change_max": 0.0016672611236572266, + "reward_change_mean": -0.41454842686653137, + "reward_change_min": -0.9322738572955132, + "reward_change_std": 0.39632497169077396, + "reward_std": 0.693023394793272, + "rewards/cosine_scaled_reward": -0.057203130796551704, + "rewards/format_reward": 0.2916666679084301, + "step": 97 + }, + { + "advantage_max": 1.1389856860041618, + "advantage_mean": 1.0554989604560916e-08, + "advantage_min": -0.5510528981685638, + "advantage_std": 0.6010327264666557, + "completion_length": 3024.937530517578, + "epoch": 0.112, + "grad_norm": 0.10726859420537949, + "kl": 0.004727840423583984, + "lambda_div_used": 0.5, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0654, + "reward": -0.31532357865944505, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.31532357865944505, + "reward_after_std": 0.6010327264666557, + "reward_before_mean": 0.05080095771700144, + "reward_before_std": 0.5885812118649483, + "reward_change_max": 0.0008204132318496704, + "reward_change_mean": -0.3661245256662369, + "reward_change_min": -0.6775789931416512, + "reward_change_std": 0.2766888150945306, + "reward_std": 0.6010327376425266, + "rewards/cosine_scaled_reward": -0.15168285369873047, + "rewards/format_reward": 0.35416668094694614, + "step": 98 + }, + { + "advantage_max": 1.061452217400074, + "advantage_mean": 1.3659397946064189e-08, + "advantage_min": -0.5463209822773933, + "advantage_std": 0.5901384837925434, + "completion_length": 2968.5416870117188, + "epoch": 0.11314285714285714, + "grad_norm": 0.12127479910850525, + "kl": 0.008264541625976562, + "lambda_div_used": 0.5, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0124, + "reward": -0.3719342704862356, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3719342704862356, + "reward_after_std": 0.5901384837925434, + "reward_before_mean": -0.04033519048243761, + "reward_before_std": 0.6591959446668625, + "reward_change_max": 0.0008135139942169189, + "reward_change_mean": -0.3315990660339594, + "reward_change_min": -0.7541460432112217, + "reward_change_std": 0.3205462880432606, + "reward_std": 0.5901384949684143, + "rewards/cosine_scaled_reward": -0.15558426547795534, + "rewards/format_reward": 0.2708333358168602, + "step": 99 + }, + { + "advantage_max": 1.375246461480856, + "advantage_mean": 1.3659398168108794e-08, + "advantage_min": -0.6167638674378395, + "advantage_std": 0.7283363081514835, + "completion_length": 2946.0208740234375, + "epoch": 0.11428571428571428, + "grad_norm": 0.08893745392560959, + "kl": 0.009603500366210938, + "lambda_div_used": 0.5, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0463, + "reward": -0.23348002135753632, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.23348002135753632, + "reward_after_std": 0.7283363118767738, + "reward_before_mean": 0.16290868259966373, + "reward_before_std": 0.7378918267786503, + "reward_change_max": 0.0015308186411857605, + "reward_change_mean": -0.39638871792703867, + "reward_change_min": -0.7486597262322903, + "reward_change_std": 0.31783370301127434, + "reward_std": 0.728336326777935, + "rewards/cosine_scaled_reward": -0.08521231822669506, + "rewards/format_reward": 0.33333333395421505, + "step": 100 + }, + { + "advantage_max": 1.0726460739970207, + "advantage_mean": 2.8560559472978753e-08, + "advantage_min": -0.40579839795827866, + "advantage_std": 0.5493858084082603, + "completion_length": 2935.854232788086, + "epoch": 0.11542857142857142, + "grad_norm": 0.0939834713935852, + "kl": 0.006221771240234375, + "lambda_div_used": 0.5, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0479, + "reward": -0.2715232199989259, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2715232199989259, + "reward_after_std": 0.5493858009576797, + "reward_before_mean": 0.14305181056261063, + "reward_before_std": 0.4525942765176296, + "reward_change_max": 0.0, + "reward_change_mean": -0.4145750030875206, + "reward_change_min": -0.69618084654212, + "reward_change_std": 0.25990021973848343, + "reward_std": 0.5493858084082603, + "rewards/cosine_scaled_reward": -0.09514076914638281, + "rewards/format_reward": 0.3333333395421505, + "step": 101 + }, + { + "advantage_max": 1.60531947016716, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7529873549938202, + "advantage_std": 0.8508248142898083, + "completion_length": 2944.1459045410156, + "epoch": 0.11657142857142858, + "grad_norm": 0.15680734813213348, + "kl": 0.01442718505859375, + "lambda_div_used": 0.5, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0135, + "reward": -0.0667097344994545, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0667097344994545, + "reward_after_std": 0.8508248310536146, + "reward_before_mean": 0.4317157156765461, + "reward_before_std": 0.8481930792331696, + "reward_change_max": 0.0004781857132911682, + "reward_change_mean": -0.4984254566952586, + "reward_change_min": -0.9218480922281742, + "reward_change_std": 0.3881477224640548, + "reward_std": 0.8508248627185822, + "rewards/cosine_scaled_reward": -0.04455881821922958, + "rewards/format_reward": 0.5208333414047956, + "step": 102 + }, + { + "advantage_max": 1.4402986355125904, + "advantage_mean": -9.313226190243995e-09, + "advantage_min": -0.591527882963419, + "advantage_std": 0.7503146361559629, + "completion_length": 3138.3958892822266, + "epoch": 0.11771428571428572, + "grad_norm": 0.14295694231987, + "kl": 0.010210037231445312, + "lambda_div_used": 0.5, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0599, + "reward": -0.2733981416095048, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2733981416095048, + "reward_after_std": 0.7503146324306726, + "reward_before_mean": 0.07834052480757236, + "reward_before_std": 0.7342335116118193, + "reward_change_max": 0.003089629113674164, + "reward_change_mean": -0.35173869086429477, + "reward_change_min": -0.7178602293133736, + "reward_change_std": 0.29554732143878937, + "reward_std": 0.7503146436065435, + "rewards/cosine_scaled_reward": -0.17957975156605244, + "rewards/format_reward": 0.4375000111758709, + "step": 103 + }, + { + "advantage_max": 0.9793983772397041, + "advantage_mean": 4.967053435223079e-09, + "advantage_min": -0.4478438273072243, + "advantage_std": 0.5177636370062828, + "completion_length": 2624.9166717529297, + "epoch": 0.11885714285714286, + "grad_norm": 0.11209587007761002, + "kl": 0.0073070526123046875, + "lambda_div_used": 0.5, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0123, + "reward": -0.28253607312217355, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.28253607312217355, + "reward_after_std": 0.5177636258304119, + "reward_before_mean": 0.1391938179731369, + "reward_before_std": 0.47612931579351425, + "reward_change_max": 6.999820470809937e-05, + "reward_change_mean": -0.42172990553081036, + "reward_change_min": -0.7518970184028149, + "reward_change_std": 0.2917201966047287, + "reward_std": 0.517763651907444, + "rewards/cosine_scaled_reward": -0.13873642776161432, + "rewards/format_reward": 0.4166666679084301, + "step": 104 + }, + { + "advantage_max": 1.5320292636752129, + "advantage_mean": 2.1109978987077227e-08, + "advantage_min": -0.7886748686432838, + "advantage_std": 0.8270585872232914, + "completion_length": 2966.0833892822266, + "epoch": 0.12, + "grad_norm": 0.1404057890176773, + "kl": 0.009779930114746094, + "lambda_div_used": 0.5, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0745, + "reward": -0.13192696264013648, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13192696264013648, + "reward_after_std": 0.8270585723221302, + "reward_before_mean": 0.3213406689465046, + "reward_before_std": 0.8794911131262779, + "reward_change_max": 0.0026951581239700317, + "reward_change_mean": -0.45326761342585087, + "reward_change_min": -0.9718286655843258, + "reward_change_std": 0.41331026889383793, + "reward_std": 0.8270585872232914, + "rewards/cosine_scaled_reward": -0.016412993194535375, + "rewards/format_reward": 0.35416668094694614, + "step": 105 + }, + { + "advantage_max": 1.729747325181961, + "advantage_mean": -4.967053546245381e-09, + "advantage_min": -0.7270858883857727, + "advantage_std": 0.9044813252985477, + "completion_length": 2396.541717529297, + "epoch": 0.12114285714285715, + "grad_norm": 0.09820910543203354, + "kl": 0.007421970367431641, + "lambda_div_used": 0.5, + "learning_rate": 9.66045715125541e-07, + "loss": -0.0122, + "reward": 0.2950130708049983, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2950130708049983, + "reward_after_std": 0.9044813551008701, + "reward_before_mean": 1.0742302685976028, + "reward_before_std": 0.777211144566536, + "reward_change_max": 0.0, + "reward_change_mean": -0.7792172282934189, + "reward_change_min": -1.333798922598362, + "reward_change_std": 0.5166942048817873, + "reward_std": 0.9044813700020313, + "rewards/cosine_scaled_reward": 0.17253179172985256, + "rewards/format_reward": 0.7291666679084301, + "step": 106 + }, + { + "advantage_max": 1.1745030768215656, + "advantage_mean": 4.3461723664428575e-09, + "advantage_min": -0.48186010867357254, + "advantage_std": 0.610499557107687, + "completion_length": 2884.916702270508, + "epoch": 0.12228571428571429, + "grad_norm": 0.09980223327875137, + "kl": 0.007454872131347656, + "lambda_div_used": 0.5, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0344, + "reward": -0.09255528822541237, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09255528822541237, + "reward_after_std": 0.6104995384812355, + "reward_before_mean": 0.4557968080043793, + "reward_before_std": 0.4952854886651039, + "reward_change_max": 0.0, + "reward_change_mean": -0.5483520794659853, + "reward_change_min": -0.876308511942625, + "reward_change_std": 0.3524962291121483, + "reward_std": 0.610499557107687, + "rewards/cosine_scaled_reward": -0.02210160903632641, + "rewards/format_reward": 0.5000000037252903, + "step": 107 + }, + { + "advantage_max": 1.6280250921845436, + "advantage_mean": -2.4835271617007493e-09, + "advantage_min": -0.7997777834534645, + "advantage_std": 0.8667001165449619, + "completion_length": 2976.479217529297, + "epoch": 0.12342857142857143, + "grad_norm": 6.980273723602295, + "kl": 0.41866397857666016, + "lambda_div_used": 0.5, + "learning_rate": 9.636109026648554e-07, + "loss": 0.072, + "reward": -0.14665891602635384, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14665891602635384, + "reward_after_std": 0.8667001202702522, + "reward_before_mean": 0.27829501312226057, + "reward_before_std": 0.9044629707932472, + "reward_change_max": 0.001452334225177765, + "reward_change_mean": -0.4249539338052273, + "reward_change_min": -0.9039922542870045, + "reward_change_std": 0.38425787910819054, + "reward_std": 0.866700142621994, + "rewards/cosine_scaled_reward": -0.0587691655382514, + "rewards/format_reward": 0.3958333469927311, + "step": 108 + }, + { + "advantage_max": 1.293064869940281, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.5089701935648918, + "advantage_std": 0.6711876504123211, + "completion_length": 3053.7708740234375, + "epoch": 0.12457142857142857, + "grad_norm": 0.09334102272987366, + "kl": 0.005675315856933594, + "lambda_div_used": 0.5, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0178, + "reward": -0.24590441305190325, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.24590441305190325, + "reward_after_std": 0.6711876504123211, + "reward_before_mean": 0.15391119476407766, + "reward_before_std": 0.628238283097744, + "reward_change_max": 0.0, + "reward_change_mean": -0.3998156199231744, + "reward_change_min": -0.7111305296421051, + "reward_change_std": 0.27578442357480526, + "reward_std": 0.6711876578629017, + "rewards/cosine_scaled_reward": -0.08971107471734285, + "rewards/format_reward": 0.3333333395421505, + "step": 109 + }, + { + "advantage_max": 1.5743919759988785, + "advantage_mean": 1.3659398279131096e-08, + "advantage_min": -0.6370358616113663, + "advantage_std": 0.8361404724419117, + "completion_length": 3043.0208740234375, + "epoch": 0.12571428571428572, + "grad_norm": 0.12074672430753708, + "kl": 0.009097099304199219, + "lambda_div_used": 0.5, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0472, + "reward": -0.17771138809621334, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.17771138809621334, + "reward_after_std": 0.8361404724419117, + "reward_before_mean": 0.23106246162205935, + "reward_before_std": 0.8586977384984493, + "reward_change_max": 0.0, + "reward_change_mean": -0.4087738338857889, + "reward_change_min": -0.9307141229510307, + "reward_change_std": 0.3709069453179836, + "reward_std": 0.8361404910683632, + "rewards/cosine_scaled_reward": -0.08238544082269073, + "rewards/format_reward": 0.39583333767950535, + "step": 110 + }, + { + "advantage_max": 1.231549710035324, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -0.690878614783287, + "advantage_std": 0.6834078840911388, + "completion_length": 3437.875030517578, + "epoch": 0.12685714285714286, + "grad_norm": 0.12677572667598724, + "kl": 0.0095977783203125, + "lambda_div_used": 0.5, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0257, + "reward": -0.21460825204849243, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.21460825204849243, + "reward_after_std": 0.6834078989923, + "reward_before_mean": 0.22259395569562912, + "reward_before_std": 0.7492858357727528, + "reward_change_max": 0.0012400075793266296, + "reward_change_mean": -0.4372022282332182, + "reward_change_min": -0.8609942458570004, + "reward_change_std": 0.37953019607812166, + "reward_std": 0.6834079250693321, + "rewards/cosine_scaled_reward": -0.013703018426895142, + "rewards/format_reward": 0.2500000074505806, + "step": 111 + }, + { + "advantage_max": 1.390410177409649, + "advantage_mean": 2.6697914878859308e-08, + "advantage_min": -0.5769059509038925, + "advantage_std": 0.7328597828745842, + "completion_length": 3354.541717529297, + "epoch": 0.128, + "grad_norm": 0.13290849328041077, + "kl": 0.005608558654785156, + "lambda_div_used": 0.5, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0329, + "reward": 0.019102992489933968, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.019102992489933968, + "reward_after_std": 0.7328597791492939, + "reward_before_mean": 0.6243925159797072, + "reward_before_std": 0.6275855414569378, + "reward_change_max": 0.0005017220973968506, + "reward_change_mean": -0.6052894797176123, + "reward_change_min": -0.9622363597154617, + "reward_change_std": 0.40619419515132904, + "reward_std": 0.7328597903251648, + "rewards/cosine_scaled_reward": 0.12469624355435371, + "rewards/format_reward": 0.3750000037252903, + "step": 112 + }, + { + "advantage_max": 1.474353551864624, + "advantage_mean": 9.313226190243995e-09, + "advantage_min": -0.5791169889271259, + "advantage_std": 0.7638243734836578, + "completion_length": 2959.8750610351562, + "epoch": 0.12914285714285714, + "grad_norm": 0.2007800042629242, + "kl": 0.011220932006835938, + "lambda_div_used": 0.5, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0969, + "reward": -0.2861519819125533, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2861519819125533, + "reward_after_std": 0.7638243734836578, + "reward_before_mean": 0.049024675972759724, + "reward_before_std": 0.7449628822505474, + "reward_change_max": 0.00041237473487854004, + "reward_change_mean": -0.33517665788531303, + "reward_change_min": -0.7155702412128448, + "reward_change_std": 0.2762441807426512, + "reward_std": 0.7638244070112705, + "rewards/cosine_scaled_reward": -0.17340433155186474, + "rewards/format_reward": 0.3958333432674408, + "step": 113 + }, + { + "advantage_max": 1.3445461466908455, + "advantage_mean": 3.725290742551124e-09, + "advantage_min": -0.4992591068148613, + "advantage_std": 0.699359804391861, + "completion_length": 2696.500068664551, + "epoch": 0.13028571428571428, + "grad_norm": 0.10839706659317017, + "kl": 0.006946563720703125, + "lambda_div_used": 0.5, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0255, + "reward": -0.26514838729053736, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.26514838729053736, + "reward_after_std": 0.6993598081171513, + "reward_before_mean": 0.11108050495386124, + "reward_before_std": 0.6611098386347294, + "reward_change_max": 0.001189887523651123, + "reward_change_mean": -0.3762288950383663, + "reward_change_min": -0.7401161417365074, + "reward_change_std": 0.29690456204116344, + "reward_std": 0.6993598081171513, + "rewards/cosine_scaled_reward": -0.21529308333992958, + "rewards/format_reward": 0.5416666772216558, + "step": 114 + }, + { + "advantage_max": 1.2387553304433823, + "advantage_mean": -3.3306690738754696e-16, + "advantage_min": -0.4966389983892441, + "advantage_std": 0.6462426483631134, + "completion_length": 2998.0208740234375, + "epoch": 0.13142857142857142, + "grad_norm": 0.08023947477340698, + "kl": 0.007321357727050781, + "lambda_div_used": 0.5, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0372, + "reward": -0.3627615012228489, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3627615012228489, + "reward_after_std": 0.6462426148355007, + "reward_before_mean": -0.05127234756946564, + "reward_before_std": 0.6328899078071117, + "reward_change_max": 0.001132287085056305, + "reward_change_mean": -0.311489156447351, + "reward_change_min": -0.6855304539203644, + "reward_change_std": 0.26617162115871906, + "reward_std": 0.6462426371872425, + "rewards/cosine_scaled_reward": -0.17146951146423817, + "rewards/format_reward": 0.2916666716337204, + "step": 115 + }, + { + "advantage_max": 1.5859102010726929, + "advantage_mean": 2.220446049250313e-16, + "advantage_min": -0.5750259384512901, + "advantage_std": 0.8160628713667393, + "completion_length": 3465.000030517578, + "epoch": 0.13257142857142856, + "grad_norm": 0.1530701220035553, + "kl": 0.007579803466796875, + "lambda_div_used": 0.5, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0211, + "reward": -0.32605776842683554, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32605776842683554, + "reward_after_std": 0.8160628564655781, + "reward_before_mean": -0.04084068536758423, + "reward_before_std": 0.8062328286468983, + "reward_change_max": 0.0008537173271179199, + "reward_change_mean": -0.28521708957850933, + "reward_change_min": -0.5912484526634216, + "reward_change_std": 0.2459753742441535, + "reward_std": 0.8160628788173199, + "rewards/cosine_scaled_reward": -0.1037536843214184, + "rewards/format_reward": 0.1666666716337204, + "step": 116 + }, + { + "advantage_max": 1.0990208461880684, + "advantage_mean": 2.359350581571107e-08, + "advantage_min": -0.3993668518960476, + "advantage_std": 0.5640300773084164, + "completion_length": 3263.916717529297, + "epoch": 0.1337142857142857, + "grad_norm": 0.08252304047346115, + "kl": 0.010234832763671875, + "lambda_div_used": 0.5, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0192, + "reward": -0.4662362337112427, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4662362337112427, + "reward_after_std": 0.5640300661325455, + "reward_before_mean": -0.22322246711701155, + "reward_before_std": 0.5282933078706264, + "reward_change_max": 0.0006141439080238342, + "reward_change_mean": -0.24301375821232796, + "reward_change_min": -0.5001362860202789, + "reward_change_std": 0.19844415225088596, + "reward_std": 0.5640300773084164, + "rewards/cosine_scaled_reward": -0.22619456890970469, + "rewards/format_reward": 0.22916666977107525, + "step": 117 + }, + { + "advantage_max": 1.5211386159062386, + "advantage_mean": -2.483527050678447e-09, + "advantage_min": -0.755425862967968, + "advantage_std": 0.8458346724510193, + "completion_length": 3119.2083435058594, + "epoch": 0.13485714285714287, + "grad_norm": 0.15521512925624847, + "kl": 0.00604248046875, + "lambda_div_used": 0.5, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0558, + "reward": -0.12143947370350361, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12143947370350361, + "reward_after_std": 0.8458346650004387, + "reward_before_mean": 0.3443289026618004, + "reward_before_std": 0.9570517912507057, + "reward_change_max": 0.001078963279724121, + "reward_change_mean": -0.4657684173434973, + "reward_change_min": -1.147033091634512, + "reward_change_std": 0.46905333921313286, + "reward_std": 0.8458347134292126, + "rewards/cosine_scaled_reward": 0.01591446064412594, + "rewards/format_reward": 0.3125000037252903, + "step": 118 + }, + { + "advantage_max": 1.3600810691714287, + "advantage_mean": 1.2417638028949796e-09, + "advantage_min": -0.7468793988227844, + "advantage_std": 0.7505389116704464, + "completion_length": 2491.5833740234375, + "epoch": 0.136, + "grad_norm": 0.26733919978141785, + "kl": 0.108642578125, + "lambda_div_used": 0.5, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0267, + "reward": -0.01912129484117031, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.01912129484117031, + "reward_after_std": 0.750538919121027, + "reward_before_mean": 0.5529840067028999, + "reward_before_std": 0.7970850095152855, + "reward_change_max": 0.006842672824859619, + "reward_change_mean": -0.5721052885055542, + "reward_change_min": -1.084109291434288, + "reward_change_std": 0.4520927872508764, + "reward_std": 0.7505389600992203, + "rewards/cosine_scaled_reward": 0.016075339168310165, + "rewards/format_reward": 0.5208333432674408, + "step": 119 + }, + { + "advantage_max": 1.5714844167232513, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.6521871276199818, + "advantage_std": 0.8299883455038071, + "completion_length": 2594.375045776367, + "epoch": 0.13714285714285715, + "grad_norm": 0.1550913006067276, + "kl": 0.01110076904296875, + "lambda_div_used": 0.5, + "learning_rate": 9.473264167865171e-07, + "loss": -0.0004, + "reward": 0.07176335965050384, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07176335965050384, + "reward_after_std": 0.8299883455038071, + "reward_before_mean": 0.6873215809464455, + "reward_before_std": 0.7673061080276966, + "reward_change_max": 0.0006802454590797424, + "reward_change_mean": -0.6155582182109356, + "reward_change_min": -1.1995624154806137, + "reward_change_std": 0.4468677435070276, + "reward_std": 0.8299883641302586, + "rewards/cosine_scaled_reward": 0.062410795129835606, + "rewards/format_reward": 0.562500013038516, + "step": 120 + }, + { + "advantage_max": 1.3429175913333893, + "advantage_mean": 6.208816794028849e-10, + "advantage_min": -0.591683205217123, + "advantage_std": 0.698270071297884, + "completion_length": 2149.8958740234375, + "epoch": 0.1382857142857143, + "grad_norm": 0.2023954689502716, + "kl": 0.013248443603515625, + "lambda_div_used": 0.5, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0709, + "reward": -0.12103883270174265, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.12103883270174265, + "reward_after_std": 0.698270071297884, + "reward_before_mean": 0.3712943270802498, + "reward_before_std": 0.6273719593882561, + "reward_change_max": 0.0, + "reward_change_mean": -0.49233318492770195, + "reward_change_min": -0.8092544637620449, + "reward_change_std": 0.31992023810744286, + "reward_std": 0.698270071297884, + "rewards/cosine_scaled_reward": -0.1268528364598751, + "rewards/format_reward": 0.6250000055879354, + "step": 121 + }, + { + "advantage_max": 1.5647741705179214, + "advantage_mean": -6.208815128694312e-10, + "advantage_min": -0.7578405737876892, + "advantage_std": 0.8595303483307362, + "completion_length": 2891.0416717529297, + "epoch": 0.13942857142857143, + "grad_norm": 0.15973925590515137, + "kl": 0.008546829223632812, + "lambda_div_used": 0.5, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0531, + "reward": -0.08176559396088123, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08176559396088123, + "reward_after_std": 0.8595303483307362, + "reward_before_mean": 0.4093674011528492, + "reward_before_std": 0.9295827485620975, + "reward_change_max": 0.001126319169998169, + "reward_change_mean": -0.4911329858005047, + "reward_change_min": -1.1455222107470036, + "reward_change_std": 0.4624634627252817, + "reward_std": 0.8595303595066071, + "rewards/cosine_scaled_reward": -0.014066309202462435, + "rewards/format_reward": 0.43750000931322575, + "step": 122 + }, + { + "advantage_max": 1.4153951033949852, + "advantage_mean": 6.829699139565548e-09, + "advantage_min": -0.5769023820757866, + "advantage_std": 0.7529639042913914, + "completion_length": 3128.0625610351562, + "epoch": 0.14057142857142857, + "grad_norm": 0.133604496717453, + "kl": 0.007900238037109375, + "lambda_div_used": 0.5, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0562, + "reward": -0.24905523657798767, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.24905523657798767, + "reward_after_std": 0.7529638931155205, + "reward_before_mean": 0.1262472216039896, + "reward_before_std": 0.7704966329038143, + "reward_change_max": 0.001382298767566681, + "reward_change_mean": -0.37530247680842876, + "reward_change_min": -0.8108473680913448, + "reward_change_std": 0.34216398000717163, + "reward_std": 0.7529639154672623, + "rewards/cosine_scaled_reward": -0.11395972780883312, + "rewards/format_reward": 0.35416666977107525, + "step": 123 + }, + { + "advantage_max": 1.3590355888009071, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.5283945128321648, + "advantage_std": 0.703603483736515, + "completion_length": 2643.3125534057617, + "epoch": 0.1417142857142857, + "grad_norm": 0.10262785851955414, + "kl": 0.010004043579101562, + "lambda_div_used": 0.5, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0245, + "reward": -0.10296567948535085, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10296567948535085, + "reward_after_std": 0.7036034874618053, + "reward_before_mean": 0.4043073896318674, + "reward_before_std": 0.5893766656517982, + "reward_change_max": 0.0, + "reward_change_mean": -0.5072730649262667, + "reward_change_min": -0.900951974093914, + "reward_change_std": 0.35020011104643345, + "reward_std": 0.7036035098135471, + "rewards/cosine_scaled_reward": -0.027012981940060854, + "rewards/format_reward": 0.45833334140479565, + "step": 124 + }, + { + "advantage_max": 0.9600905813276768, + "advantage_mean": 1.8005570145973593e-08, + "advantage_min": -0.5070587620139122, + "advantage_std": 0.5214144960045815, + "completion_length": 2769.6666870117188, + "epoch": 0.14285714285714285, + "grad_norm": 0.05761028453707695, + "kl": 0.00707244873046875, + "lambda_div_used": 0.5, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0083, + "reward": -0.2994791641831398, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2994791641831398, + "reward_after_std": 0.5214144885540009, + "reward_before_mean": 0.10765105485916138, + "reward_before_std": 0.5211964379996061, + "reward_change_max": 0.001614697277545929, + "reward_change_mean": -0.40713020414114, + "reward_change_min": -0.7412909679114819, + "reward_change_std": 0.3003601636737585, + "reward_std": 0.5214144997298717, + "rewards/cosine_scaled_reward": -0.09200781211256981, + "rewards/format_reward": 0.2916666679084301, + "step": 125 + }, + { + "advantage_max": 1.369416281580925, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -0.5342150218784809, + "advantage_std": 0.718287467956543, + "completion_length": 2917.666732788086, + "epoch": 0.144, + "grad_norm": 0.11026092618703842, + "kl": 0.0053558349609375, + "lambda_div_used": 0.5, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0133, + "reward": -0.14198202081024647, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14198202081024647, + "reward_after_std": 0.7182874493300915, + "reward_before_mean": 0.3325531389564276, + "reward_before_std": 0.6610416546463966, + "reward_change_max": 0.00305301696062088, + "reward_change_mean": -0.4745351132005453, + "reward_change_min": -0.8417270965874195, + "reward_change_std": 0.34801490139216185, + "reward_std": 0.7182874642312527, + "rewards/cosine_scaled_reward": -0.07330678962171078, + "rewards/format_reward": 0.47916667349636555, + "step": 126 + }, + { + "advantage_max": 1.3165396451950073, + "advantage_mean": -9.93410786964688e-09, + "advantage_min": -0.5570442825555801, + "advantage_std": 0.7028694860637188, + "completion_length": 3262.8541870117188, + "epoch": 0.14514285714285713, + "grad_norm": 0.11621111631393433, + "kl": 0.00928497314453125, + "lambda_div_used": 0.5, + "learning_rate": 9.36531953618799e-07, + "loss": 0.041, + "reward": -0.4189454587176442, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4189454587176442, + "reward_after_std": 0.7028694786131382, + "reward_before_mean": -0.16826728964224458, + "reward_before_std": 0.7615924999117851, + "reward_change_max": 0.0017796605825424194, + "reward_change_mean": -0.2506781928241253, + "reward_change_min": -0.7534624300897121, + "reward_change_std": 0.301827160641551, + "reward_std": 0.7028694786131382, + "rewards/cosine_scaled_reward": -0.22996697621420026, + "rewards/format_reward": 0.2916666716337204, + "step": 127 + }, + { + "advantage_max": 1.6697375662624836, + "advantage_mean": 2.173086055545781e-08, + "advantage_min": -0.7176847979426384, + "advantage_std": 0.9009743053466082, + "completion_length": 2810.4583740234375, + "epoch": 0.1462857142857143, + "grad_norm": 0.11985532194375992, + "kl": 0.007503509521484375, + "lambda_div_used": 0.5, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0328, + "reward": -0.04913006443530321, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.04913006443530321, + "reward_after_std": 0.9009743090718985, + "reward_before_mean": 0.452212794451043, + "reward_before_std": 0.9385515972971916, + "reward_change_max": 0.0008388608694076538, + "reward_change_mean": -0.5013428647071123, + "reward_change_min": -0.9899672120809555, + "reward_change_std": 0.42710711527615786, + "reward_std": 0.9009743295609951, + "rewards/cosine_scaled_reward": 0.01777306676376611, + "rewards/format_reward": 0.4166666716337204, + "step": 128 + }, + { + "advantage_max": 1.5713545978069305, + "advantage_mean": 1.117587078436344e-08, + "advantage_min": -0.5929481983184814, + "advantage_std": 0.8208220899105072, + "completion_length": 3373.1458435058594, + "epoch": 0.14742857142857144, + "grad_norm": 0.21055659651756287, + "kl": 0.01023101806640625, + "lambda_div_used": 0.5, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0635, + "reward": -0.34678191784769297, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.34678191784769297, + "reward_after_std": 0.8208221010863781, + "reward_before_mean": -0.07890627905726433, + "reward_before_std": 0.8488923981785774, + "reward_change_max": 0.0015719234943389893, + "reward_change_mean": -0.26787563413381577, + "reward_change_min": -0.6541193760931492, + "reward_change_std": 0.2729021832346916, + "reward_std": 0.8208221010863781, + "rewards/cosine_scaled_reward": -0.13320314860902727, + "rewards/format_reward": 0.1875000037252903, + "step": 129 + }, + { + "advantage_max": 1.3760394155979156, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.5131339877843857, + "advantage_std": 0.7071768157184124, + "completion_length": 3168.5833740234375, + "epoch": 0.14857142857142858, + "grad_norm": 0.10870277136564255, + "kl": 0.009489059448242188, + "lambda_div_used": 0.5, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0555, + "reward": -0.3918801546096802, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3918801546096802, + "reward_after_std": 0.70717678591609, + "reward_before_mean": -0.1307674515992403, + "reward_before_std": 0.6902662832289934, + "reward_change_max": 0.0022120848298072815, + "reward_change_mean": -0.26111270394176245, + "reward_change_min": -0.5894866921007633, + "reward_change_std": 0.2326252982020378, + "reward_std": 0.7071768119931221, + "rewards/cosine_scaled_reward": -0.16955040022730827, + "rewards/format_reward": 0.2083333358168602, + "step": 130 + }, + { + "advantage_max": 1.5219105705618858, + "advantage_mean": 2.0489097529718947e-08, + "advantage_min": -0.7303340062499046, + "advantage_std": 0.8300552815198898, + "completion_length": 2928.6875762939453, + "epoch": 0.14971428571428572, + "grad_norm": 0.1419273018836975, + "kl": 0.01134490966796875, + "lambda_div_used": 0.5, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0631, + "reward": -0.060884641483426094, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.060884641483426094, + "reward_after_std": 0.8300552740693092, + "reward_before_mean": 0.4547289116308093, + "reward_before_std": 0.8692549020051956, + "reward_change_max": 0.0035802796483039856, + "reward_change_mean": -0.5156135559082031, + "reward_change_min": -1.0946178510785103, + "reward_change_std": 0.44368345849215984, + "reward_std": 0.8300552740693092, + "rewards/cosine_scaled_reward": 0.029447784181684256, + "rewards/format_reward": 0.39583334140479565, + "step": 131 + }, + { + "advantage_max": 2.0168206430971622, + "advantage_mean": 3.104408841103634e-09, + "advantage_min": -0.8502785339951515, + "advantage_std": 1.0592867005616426, + "completion_length": 2758.062515258789, + "epoch": 0.15085714285714286, + "grad_norm": 0.17530718445777893, + "kl": 0.00824737548828125, + "lambda_div_used": 0.5, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0628, + "reward": 0.035275431582704186, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.035275431582704186, + "reward_after_std": 1.0592866968363523, + "reward_before_mean": 0.543465293943882, + "reward_before_std": 1.0686840042471886, + "reward_change_max": 0.0006739124655723572, + "reward_change_mean": -0.5081898486241698, + "reward_change_min": -1.1129607036709785, + "reward_change_std": 0.44032883644104004, + "reward_std": 1.0592867471277714, + "rewards/cosine_scaled_reward": 0.052982633154897485, + "rewards/format_reward": 0.43750000558793545, + "step": 132 + }, + { + "advantage_max": 0.970925759524107, + "advantage_mean": 3.2285850104507574e-08, + "advantage_min": -0.3871290944516659, + "advantage_std": 0.5082768350839615, + "completion_length": 3336.2291870117188, + "epoch": 0.152, + "grad_norm": 0.08025325834751129, + "kl": 0.012659072875976562, + "lambda_div_used": 0.5, + "learning_rate": 9.265439410565328e-07, + "loss": 0.013, + "reward": -0.49410490319132805, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.49410490319132805, + "reward_after_std": 0.5082768425345421, + "reward_before_mean": -0.248309426009655, + "reward_before_std": 0.5010492280125618, + "reward_change_max": 0.0004126057028770447, + "reward_change_mean": -0.24579549115151167, + "reward_change_min": -0.5601531192660332, + "reward_change_std": 0.21191422455012798, + "reward_std": 0.508276853710413, + "rewards/cosine_scaled_reward": -0.2179047055542469, + "rewards/format_reward": 0.18750000186264515, + "step": 133 + }, + { + "advantage_max": 1.5898833870887756, + "advantage_mean": 4.035731165918932e-09, + "advantage_min": -0.6322812959551811, + "advantage_std": 0.8210309743881226, + "completion_length": 2692.0416870117188, + "epoch": 0.15314285714285714, + "grad_norm": 0.16357369720935822, + "kl": 0.0129547119140625, + "lambda_div_used": 0.5, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0494, + "reward": -0.11071780603379011, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11071780603379011, + "reward_after_std": 0.8210309743881226, + "reward_before_mean": 0.354950032196939, + "reward_before_std": 0.7637733723968267, + "reward_change_max": 4.401057958602905e-05, + "reward_change_mean": -0.4656678568571806, + "reward_change_min": -0.8694765791296959, + "reward_change_std": 0.3447660394012928, + "reward_std": 0.8210310265421867, + "rewards/cosine_scaled_reward": -0.06210831506177783, + "rewards/format_reward": 0.4791666716337204, + "step": 134 + }, + { + "advantage_max": 1.6711772456765175, + "advantage_mean": 1.862645193639878e-08, + "advantage_min": -0.806564062833786, + "advantage_std": 0.8973346874117851, + "completion_length": 2051.729217529297, + "epoch": 0.15428571428571428, + "grad_norm": 0.16736926138401031, + "kl": 0.010089874267578125, + "lambda_div_used": 0.5, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0708, + "reward": 0.21743118949234486, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21743118949234486, + "reward_after_std": 0.89733468554914, + "reward_before_mean": 0.9405843168497086, + "reward_before_std": 0.8591349758207798, + "reward_change_max": 0.0, + "reward_change_mean": -0.7231530882418156, + "reward_change_min": -1.226291723549366, + "reward_change_std": 0.5035424418747425, + "reward_std": 0.8973347377032042, + "rewards/cosine_scaled_reward": 0.12654214911162853, + "rewards/format_reward": 0.687500013038516, + "step": 135 + }, + { + "advantage_max": 1.704109400510788, + "advantage_mean": -1.6763806842678974e-08, + "advantage_min": -0.7793970480561256, + "advantage_std": 0.9227009601891041, + "completion_length": 2986.1458892822266, + "epoch": 0.15542857142857142, + "grad_norm": 0.1529574692249298, + "kl": 0.0151824951171875, + "lambda_div_used": 0.5, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0333, + "reward": 0.06355301290750504, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06355301290750504, + "reward_after_std": 0.9227009415626526, + "reward_before_mean": 0.6477234736084938, + "reward_before_std": 0.9388607256114483, + "reward_change_max": 0.006334125995635986, + "reward_change_mean": -0.5841704942286015, + "reward_change_min": -1.2264491878449917, + "reward_change_std": 0.4949139221571386, + "reward_std": 0.9227009601891041, + "rewards/cosine_scaled_reward": 0.0842784009873867, + "rewards/format_reward": 0.4791666753590107, + "step": 136 + }, + { + "advantage_max": 1.199564404785633, + "advantage_mean": 1.738468857759301e-08, + "advantage_min": -0.5005557537078857, + "advantage_std": 0.6346891317516565, + "completion_length": 3282.6458740234375, + "epoch": 0.15657142857142858, + "grad_norm": 0.11640360951423645, + "kl": 0.0130615234375, + "lambda_div_used": 0.5, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0435, + "reward": -0.4002666026353836, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4002666026353836, + "reward_after_std": 0.6346891317516565, + "reward_before_mean": -0.1202960298396647, + "reward_before_std": 0.6554977763444185, + "reward_change_max": 0.001410163938999176, + "reward_change_mean": -0.2799705620855093, + "reward_change_min": -0.6201891824603081, + "reward_change_std": 0.2587748169898987, + "reward_std": 0.634689137339592, + "rewards/cosine_scaled_reward": -0.18514801934361458, + "rewards/format_reward": 0.25000000558793545, + "step": 137 + }, + { + "advantage_max": 1.3507948219776154, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.49056804925203323, + "advantage_std": 0.6830014102160931, + "completion_length": 2713.395896911621, + "epoch": 0.15771428571428572, + "grad_norm": 0.1140730232000351, + "kl": 0.0122528076171875, + "lambda_div_used": 0.5, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0291, + "reward": -0.12246760074049234, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12246760074049234, + "reward_after_std": 0.6830014064908028, + "reward_before_mean": 0.3673200160264969, + "reward_before_std": 0.5412693619728088, + "reward_change_max": 0.00021785497665405273, + "reward_change_mean": -0.48978761956095695, + "reward_change_min": -0.7547291815280914, + "reward_change_std": 0.2954326942563057, + "reward_std": 0.6830014288425446, + "rewards/cosine_scaled_reward": -0.05592333839740604, + "rewards/format_reward": 0.47916668094694614, + "step": 138 + }, + { + "advantage_max": 1.6214874014258385, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.5952094718813896, + "advantage_std": 0.8399164713919163, + "completion_length": 3305.8333740234375, + "epoch": 0.15885714285714286, + "grad_norm": 0.1574862152338028, + "kl": 0.01418304443359375, + "lambda_div_used": 0.5, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0403, + "reward": -0.3565631117671728, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3565631117671728, + "reward_after_std": 0.8399164713919163, + "reward_before_mean": -0.1034796079620719, + "reward_before_std": 0.8613853231072426, + "reward_change_max": 0.0007909610867500305, + "reward_change_mean": -0.2530834935605526, + "reward_change_min": -0.65073337033391, + "reward_change_std": 0.26929632388055325, + "reward_std": 0.8399164862930775, + "rewards/cosine_scaled_reward": -0.166323134675622, + "rewards/format_reward": 0.22916666977107525, + "step": 139 + }, + { + "advantage_max": 1.4743750020861626, + "advantage_mean": 3.725290298461914e-09, + "advantage_min": -0.5049934312701225, + "advantage_std": 0.7563006207346916, + "completion_length": 3242.5833740234375, + "epoch": 0.16, + "grad_norm": 0.3012092411518097, + "kl": 0.01766204833984375, + "lambda_div_used": 0.5, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0329, + "reward": -0.19406265020370483, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19406265020370483, + "reward_after_std": 0.7563006244599819, + "reward_before_mean": 0.20776143670082092, + "reward_before_std": 0.6639534346759319, + "reward_change_max": 0.0021727383136749268, + "reward_change_mean": -0.40182408690452576, + "reward_change_min": -0.7207726016640663, + "reward_change_std": 0.28624863363802433, + "reward_std": 0.7563006617128849, + "rewards/cosine_scaled_reward": -0.03153595281764865, + "rewards/format_reward": 0.27083333767950535, + "step": 140 + }, + { + "advantage_max": 1.5581751950085163, + "advantage_mean": 1.4280279625467074e-08, + "advantage_min": -0.7472601998597383, + "advantage_std": 0.842759732156992, + "completion_length": 2783.8126068115234, + "epoch": 0.16114285714285714, + "grad_norm": 0.14586445689201355, + "kl": 0.01715850830078125, + "lambda_div_used": 0.5, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0588, + "reward": -0.013555251061916351, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.013555251061916351, + "reward_after_std": 0.842759732156992, + "reward_before_mean": 0.5331528140231967, + "reward_before_std": 0.8482963526621461, + "reward_change_max": 0.0022571608424186707, + "reward_change_mean": -0.5467080399394035, + "reward_change_min": -1.014662615954876, + "reward_change_std": 0.4375216653570533, + "reward_std": 0.8427597507834435, + "rewards/cosine_scaled_reward": -0.014673600438982248, + "rewards/format_reward": 0.5625000093132257, + "step": 141 + }, + { + "advantage_max": 1.253233052790165, + "advantage_mean": 1.8626450382086546e-09, + "advantage_min": -0.611926406621933, + "advantage_std": 0.6612989082932472, + "completion_length": 2920.5625610351562, + "epoch": 0.16228571428571428, + "grad_norm": 0.11870459467172623, + "kl": 0.014678955078125, + "lambda_div_used": 0.5, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0349, + "reward": -0.07781748473644257, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07781748473644257, + "reward_after_std": 0.6612989269196987, + "reward_before_mean": 0.468132134526968, + "reward_before_std": 0.6124387122690678, + "reward_change_max": 0.0016920417547225952, + "reward_change_mean": -0.5459496257826686, + "reward_change_min": -0.9166505560278893, + "reward_change_std": 0.36288318363949656, + "reward_std": 0.6612989380955696, + "rewards/cosine_scaled_reward": -0.0576006043702364, + "rewards/format_reward": 0.583333345130086, + "step": 142 + }, + { + "advantage_max": 1.2389843165874481, + "advantage_mean": 1.6142925496342997e-08, + "advantage_min": -0.6687506064772606, + "advantage_std": 0.6784803830087185, + "completion_length": 2588.562545776367, + "epoch": 0.16342857142857142, + "grad_norm": 0.21533679962158203, + "kl": 0.01389312744140625, + "lambda_div_used": 0.5, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0792, + "reward": -0.2079525962471962, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2079525962471962, + "reward_after_std": 0.6784803830087185, + "reward_before_mean": 0.21878846874460578, + "reward_before_std": 0.7084575332701206, + "reward_change_max": 0.0006280243396759033, + "reward_change_mean": -0.42674104776233435, + "reward_change_min": -0.8186662420630455, + "reward_change_std": 0.34944797586649656, + "reward_std": 0.6784803867340088, + "rewards/cosine_scaled_reward": -0.1406057756394148, + "rewards/format_reward": 0.5000000149011612, + "step": 143 + }, + { + "advantage_max": 1.3775924891233444, + "advantage_mean": 1.4901161637936866e-08, + "advantage_min": -0.5184234380722046, + "advantage_std": 0.7250764183700085, + "completion_length": 3109.2292098999023, + "epoch": 0.16457142857142856, + "grad_norm": 0.12948386371135712, + "kl": 0.01511383056640625, + "lambda_div_used": 0.5, + "learning_rate": 9.065303395098358e-07, + "loss": -0.0036, + "reward": -0.367202827706933, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.367202827706933, + "reward_after_std": 0.7250763960182667, + "reward_before_mean": -0.08332528173923492, + "reward_before_std": 0.7478654459118843, + "reward_change_max": 0.0013224631547927856, + "reward_change_mean": -0.2838775431737304, + "reward_change_min": -0.7693227715790272, + "reward_change_std": 0.29384620860219, + "reward_std": 0.7250764183700085, + "rewards/cosine_scaled_reward": -0.20832931413315237, + "rewards/format_reward": 0.33333334140479565, + "step": 144 + }, + { + "advantage_max": 1.552978865802288, + "advantage_mean": 4.656612678788363e-09, + "advantage_min": -0.5855300799012184, + "advantage_std": 0.8020382151007652, + "completion_length": 2332.750045776367, + "epoch": 0.1657142857142857, + "grad_norm": 0.10903175920248032, + "kl": 0.014812469482421875, + "lambda_div_used": 0.5, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0034, + "reward": 0.07218926632776856, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07218926632776856, + "reward_after_std": 0.8020382151007652, + "reward_before_mean": 0.6919207079336047, + "reward_before_std": 0.6887411586940289, + "reward_change_max": 0.001665949821472168, + "reward_change_mean": -0.619731426006183, + "reward_change_min": -0.9985717423260212, + "reward_change_std": 0.3817316296044737, + "reward_std": 0.8020382300019264, + "rewards/cosine_scaled_reward": 0.033460333943367004, + "rewards/format_reward": 0.6250000037252903, + "step": 145 + }, + { + "advantage_max": 1.0583246350288391, + "advantage_mean": 2.173086055545781e-08, + "advantage_min": -0.4859006591141224, + "advantage_std": 0.5605090521275997, + "completion_length": 2781.2500762939453, + "epoch": 0.16685714285714287, + "grad_norm": 0.12265187501907349, + "kl": 0.01261138916015625, + "lambda_div_used": 0.5, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0693, + "reward": -0.36917710676789284, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.36917710676789284, + "reward_after_std": 0.5605090446770191, + "reward_before_mean": -0.03538452973589301, + "reward_before_std": 0.5493589676916599, + "reward_change_max": 0.0036334246397018433, + "reward_change_mean": -0.3337925784289837, + "reward_change_min": -0.6612095944583416, + "reward_change_std": 0.26808320358395576, + "reward_std": 0.56050905585289, + "rewards/cosine_scaled_reward": -0.2676922781392932, + "rewards/format_reward": 0.5000000093132257, + "step": 146 + }, + { + "advantage_max": 1.379384882748127, + "advantage_mean": 2.110997909809953e-08, + "advantage_min": -0.6559456661343575, + "advantage_std": 0.7488440778106451, + "completion_length": 3024.6458892822266, + "epoch": 0.168, + "grad_norm": 0.2351156324148178, + "kl": 0.0204620361328125, + "lambda_div_used": 0.5, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0832, + "reward": -0.27363124303519726, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.27363124303519726, + "reward_after_std": 0.7488440666347742, + "reward_before_mean": 0.08099634572863579, + "reward_before_std": 0.813685305416584, + "reward_change_max": 0.0024464577436447144, + "reward_change_mean": -0.35462760739028454, + "reward_change_min": -0.9023499675095081, + "reward_change_std": 0.3578293425962329, + "reward_std": 0.7488440815359354, + "rewards/cosine_scaled_reward": -0.08450182341039181, + "rewards/format_reward": 0.2500000037252903, + "step": 147 + }, + { + "advantage_max": 1.430250957608223, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -0.6970910802483559, + "advantage_std": 0.7667691125534475, + "completion_length": 2556.3958587646484, + "epoch": 0.16914285714285715, + "grad_norm": 0.11008276790380478, + "kl": 0.017719268798828125, + "lambda_div_used": 0.5, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0022, + "reward": -0.10380987264215946, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10380987264215946, + "reward_after_std": 0.7667691130191088, + "reward_before_mean": 0.3883102908730507, + "reward_before_std": 0.7692638714797795, + "reward_change_max": 0.0, + "reward_change_mean": -0.4921201700344682, + "reward_change_min": -0.9062949493527412, + "reward_change_std": 0.37739391159266233, + "reward_std": 0.7667691316455603, + "rewards/cosine_scaled_reward": -0.0870948564261198, + "rewards/format_reward": 0.562500013038516, + "step": 148 + }, + { + "advantage_max": 1.5377417542040348, + "advantage_mean": 1.6763807175745882e-08, + "advantage_min": -0.7391314059495926, + "advantage_std": 0.8271317277103662, + "completion_length": 2969.500045776367, + "epoch": 0.1702857142857143, + "grad_norm": 0.13372498750686646, + "kl": 0.01287078857421875, + "lambda_div_used": 0.5, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0276, + "reward": -0.15246623707935214, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15246623707935214, + "reward_after_std": 0.8271317090839148, + "reward_before_mean": 0.2821824178099632, + "reward_before_std": 0.8760084733366966, + "reward_change_max": 0.0019507110118865967, + "reward_change_mean": -0.43464863393455744, + "reward_change_min": -1.0092763751745224, + "reward_change_std": 0.3935320507735014, + "reward_std": 0.8271317090839148, + "rewards/cosine_scaled_reward": -0.09849213063716888, + "rewards/format_reward": 0.47916668094694614, + "step": 149 + }, + { + "advantage_max": 1.601058579981327, + "advantage_mean": 8.69234451084111e-09, + "advantage_min": -0.7131126075983047, + "advantage_std": 0.8711162880063057, + "completion_length": 3100.9792404174805, + "epoch": 0.17142857142857143, + "grad_norm": 0.1580471247434616, + "kl": 0.0231170654296875, + "lambda_div_used": 0.5, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0167, + "reward": -0.19248445704579353, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19248445704579353, + "reward_after_std": 0.8711162842810154, + "reward_before_mean": 0.19730104506015778, + "reward_before_std": 0.9518158473074436, + "reward_change_max": 0.0012674406170845032, + "reward_change_mean": -0.38978548534214497, + "reward_change_min": -0.9178396500647068, + "reward_change_std": 0.39312170818448067, + "reward_std": 0.8711163178086281, + "rewards/cosine_scaled_reward": -0.08884949050843716, + "rewards/format_reward": 0.3750000037252903, + "step": 150 + }, + { + "advantage_max": 1.4950109869241714, + "advantage_mean": 2.220446049250313e-16, + "advantage_min": -0.6828820556402206, + "advantage_std": 0.7891075238585472, + "completion_length": 2729.9583435058594, + "epoch": 0.17257142857142857, + "grad_norm": 0.19314298033714294, + "kl": 0.0204315185546875, + "lambda_div_used": 0.5, + "learning_rate": 8.926922383915315e-07, + "loss": 0.036, + "reward": 0.06169239804148674, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06169239804148674, + "reward_after_std": 0.7891075126826763, + "reward_before_mean": 0.6801268644630909, + "reward_before_std": 0.7161053195595741, + "reward_change_max": 0.0, + "reward_change_mean": -0.618434488773346, + "reward_change_min": -1.1258440501987934, + "reward_change_std": 0.4249352663755417, + "reward_std": 0.7891075238585472, + "rewards/cosine_scaled_reward": 0.05881343060173094, + "rewards/format_reward": 0.5625000074505806, + "step": 151 + }, + { + "advantage_max": 0.912200003862381, + "advantage_mean": 1.4280280014045132e-08, + "advantage_min": -0.4642832688987255, + "advantage_std": 0.49871931597590446, + "completion_length": 2945.000011444092, + "epoch": 0.1737142857142857, + "grad_norm": 0.09074344485998154, + "kl": 0.0227203369140625, + "lambda_div_used": 0.5, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0049, + "reward": -0.3652635831385851, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3652635831385851, + "reward_after_std": 0.49871933087706566, + "reward_before_mean": -0.0032742172479629517, + "reward_before_std": 0.5042455215007067, + "reward_change_max": 0.001100011169910431, + "reward_change_mean": -0.36198936961591244, + "reward_change_min": -0.7148860283195972, + "reward_change_std": 0.2905337093397975, + "reward_std": 0.49871933087706566, + "rewards/cosine_scaled_reward": -0.15788711048662663, + "rewards/format_reward": 0.3125, + "step": 152 + }, + { + "advantage_max": 1.5115925967693329, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.5612894706428051, + "advantage_std": 0.770535409450531, + "completion_length": 2853.979217529297, + "epoch": 0.17485714285714285, + "grad_norm": 0.1524999886751175, + "kl": 0.0258026123046875, + "lambda_div_used": 0.5, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0004, + "reward": -0.2571018021553755, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2571018021553755, + "reward_after_std": 0.770535409450531, + "reward_before_mean": 0.09108775481581688, + "reward_before_std": 0.7091003078967333, + "reward_change_max": 3.434717655181885e-05, + "reward_change_mean": -0.34818957280367613, + "reward_change_min": -0.7208801098167896, + "reward_change_std": 0.2727233390323818, + "reward_std": 0.7705354280769825, + "rewards/cosine_scaled_reward": -0.14195612538605928, + "rewards/format_reward": 0.37500000931322575, + "step": 153 + }, + { + "advantage_max": 1.5345972888171673, + "advantage_mean": -6.829699139565548e-09, + "advantage_min": -0.6860805973410606, + "advantage_std": 0.8205587547272444, + "completion_length": 3347.916717529297, + "epoch": 0.176, + "grad_norm": 0.15438421070575714, + "kl": 0.014179229736328125, + "lambda_div_used": 0.5, + "learning_rate": 8.865091407243394e-07, + "loss": 0.035, + "reward": 0.020980832166969776, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.020980832166969776, + "reward_after_std": 0.8205587510019541, + "reward_before_mean": 0.6036945916712284, + "reward_before_std": 0.787444993853569, + "reward_change_max": 0.00190676748752594, + "reward_change_mean": -0.5827137199230492, + "reward_change_min": -1.0991328060626984, + "reward_change_std": 0.45627478789538145, + "reward_std": 0.8205587565898895, + "rewards/cosine_scaled_reward": 0.07268060557544231, + "rewards/format_reward": 0.4583333358168602, + "step": 154 + }, + { + "advantage_max": 1.808014616370201, + "advantage_mean": 1.7384688910659918e-08, + "advantage_min": -0.6731105744838715, + "advantage_std": 0.9314933232963085, + "completion_length": 2730.687515258789, + "epoch": 0.17714285714285713, + "grad_norm": 0.1630096137523651, + "kl": 0.02112579345703125, + "lambda_div_used": 0.5, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0479, + "reward": -0.0624679122120142, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0624679122120142, + "reward_after_std": 0.931493304669857, + "reward_before_mean": 0.40027226135134697, + "reward_before_std": 0.8790432922542095, + "reward_change_max": 0.0011110901832580566, + "reward_change_mean": -0.4627401642501354, + "reward_change_min": -0.9882379546761513, + "reward_change_std": 0.3674464877694845, + "reward_std": 0.9314933083951473, + "rewards/cosine_scaled_reward": -0.029030536767095327, + "rewards/format_reward": 0.4583333395421505, + "step": 155 + }, + { + "advantage_max": 1.6551181003451347, + "advantage_mean": 1.0554989271494009e-08, + "advantage_min": -0.7165040969848633, + "advantage_std": 0.8703439943492413, + "completion_length": 3049.2291870117188, + "epoch": 0.1782857142857143, + "grad_norm": 0.25175604224205017, + "kl": 0.017902374267578125, + "lambda_div_used": 0.5, + "learning_rate": 8.823049032816478e-07, + "loss": 0.1118, + "reward": -0.2424111724831164, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2424111724831164, + "reward_after_std": 0.8703440092504025, + "reward_before_mean": 0.09557440504431725, + "reward_before_std": 0.9049641638994217, + "reward_change_max": 0.0012922286987304688, + "reward_change_mean": -0.33798559941351414, + "reward_change_min": -0.8197051659226418, + "reward_change_std": 0.332876767963171, + "reward_std": 0.8703440129756927, + "rewards/cosine_scaled_reward": -0.06679612956941128, + "rewards/format_reward": 0.22916667349636555, + "step": 156 + }, + { + "advantage_max": 1.05646251142025, + "advantage_mean": 9.31322552411018e-09, + "advantage_min": -0.4026567302644253, + "advantage_std": 0.5398902297019958, + "completion_length": 3296.625030517578, + "epoch": 0.17942857142857144, + "grad_norm": 0.11682058125734329, + "kl": 0.0221710205078125, + "lambda_div_used": 0.5, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0258, + "reward": -0.2805541264824569, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2805541264824569, + "reward_after_std": 0.539890231564641, + "reward_before_mean": 0.1279155914671719, + "reward_before_std": 0.4479395393282175, + "reward_change_max": 0.0005399435758590698, + "reward_change_mean": -0.4084697123616934, + "reward_change_min": -0.6387323662638664, + "reward_change_std": 0.24938340950757265, + "reward_std": 0.5398902371525764, + "rewards/cosine_scaled_reward": -0.11312553659081459, + "rewards/format_reward": 0.3541666679084301, + "step": 157 + }, + { + "advantage_max": 1.5963420271873474, + "advantage_mean": 6.208817349140361e-10, + "advantage_min": -0.7781121879816055, + "advantage_std": 0.8665140904486179, + "completion_length": 3194.4375915527344, + "epoch": 0.18057142857142858, + "grad_norm": 0.2124539315700531, + "kl": 0.0201416015625, + "lambda_div_used": 0.5, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0665, + "reward": 0.07793341856449842, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07793341856449842, + "reward_after_std": 0.8665141090750694, + "reward_before_mean": 0.6934919357299805, + "reward_before_std": 0.8816309906542301, + "reward_change_max": 0.004750244319438934, + "reward_change_mean": -0.6155585404485464, + "reward_change_min": -1.2228962555527687, + "reward_change_std": 0.49037730880081654, + "reward_std": 0.8665141351521015, + "rewards/cosine_scaled_reward": 0.14882931299507618, + "rewards/format_reward": 0.3958333432674408, + "step": 158 + }, + { + "advantage_max": 1.2864177525043488, + "advantage_mean": 2.0489097030118586e-08, + "advantage_min": -0.5405474305152893, + "advantage_std": 0.6673975624144077, + "completion_length": 3283.8541870117188, + "epoch": 0.18171428571428572, + "grad_norm": 0.15530923008918762, + "kl": 0.0285491943359375, + "lambda_div_used": 0.5, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0649, + "reward": -0.4083964992314577, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4083964992314577, + "reward_after_std": 0.6673975735902786, + "reward_before_mean": -0.14629693608731031, + "reward_before_std": 0.6638858169317245, + "reward_change_max": 0.0001917034387588501, + "reward_change_mean": -0.2620995473116636, + "reward_change_min": -0.5487857535481453, + "reward_change_std": 0.23622982390224934, + "reward_std": 0.6673975922167301, + "rewards/cosine_scaled_reward": -0.18773180805146694, + "rewards/format_reward": 0.2291666753590107, + "step": 159 + }, + { + "advantage_max": 1.461052566766739, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.6089429929852486, + "advantage_std": 0.7506822571158409, + "completion_length": 2841.7708740234375, + "epoch": 0.18285714285714286, + "grad_norm": 0.2132018804550171, + "kl": 0.02667999267578125, + "lambda_div_used": 0.5, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0835, + "reward": -0.21343960147351027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.21343960147351027, + "reward_after_std": 0.7506822533905506, + "reward_before_mean": 0.18066900130361319, + "reward_before_std": 0.7000150382518768, + "reward_change_max": 0.0030357539653778076, + "reward_change_mean": -0.39410861022770405, + "reward_change_min": -0.6988341324031353, + "reward_change_std": 0.28755941800773144, + "reward_std": 0.7506822720170021, + "rewards/cosine_scaled_reward": -0.09716550912708044, + "rewards/format_reward": 0.37500000186264515, + "step": 160 + }, + { + "advantage_max": 1.3866098821163177, + "advantage_mean": 6.208816238917336e-10, + "advantage_min": -0.647071972489357, + "advantage_std": 0.7460334822535515, + "completion_length": 2920.750015258789, + "epoch": 0.184, + "grad_norm": 0.16971638798713684, + "kl": 0.028717041015625, + "lambda_div_used": 0.5, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0448, + "reward": -0.035258321557193995, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.035258321557193995, + "reward_after_std": 0.7460334822535515, + "reward_before_mean": 0.519082885235548, + "reward_before_std": 0.7315765358507633, + "reward_change_max": 0.004067353904247284, + "reward_change_mean": -0.5543412175029516, + "reward_change_min": -1.0808284804224968, + "reward_change_std": 0.428444167599082, + "reward_std": 0.7460335120558739, + "rewards/cosine_scaled_reward": 0.009541435167193413, + "rewards/format_reward": 0.5000000149011612, + "step": 161 + }, + { + "advantage_max": 1.3734740167856216, + "advantage_mean": 1.6142925329809543e-08, + "advantage_min": -0.5869306847453117, + "advantage_std": 0.7363872975111008, + "completion_length": 3208.0208740234375, + "epoch": 0.18514285714285714, + "grad_norm": 0.2380620688199997, + "kl": 0.0320892333984375, + "lambda_div_used": 0.5, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0282, + "reward": -0.2969482094049454, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2969482094049454, + "reward_after_std": 0.7363873012363911, + "reward_before_mean": 0.043372806161642075, + "reward_before_std": 0.776290912181139, + "reward_change_max": 0.0018128976225852966, + "reward_change_mean": -0.3403210146352649, + "reward_change_min": -0.8492111563682556, + "reward_change_std": 0.3306410340592265, + "reward_std": 0.7363873347640038, + "rewards/cosine_scaled_reward": -0.13456360204145312, + "rewards/format_reward": 0.3125000074505806, + "step": 162 + }, + { + "advantage_max": 1.3436215445399284, + "advantage_mean": -9.934108091691485e-09, + "advantage_min": -0.7455865144729614, + "advantage_std": 0.7360293306410313, + "completion_length": 2572.437545776367, + "epoch": 0.18628571428571428, + "grad_norm": 0.13132235407829285, + "kl": 0.0261383056640625, + "lambda_div_used": 0.5, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0077, + "reward": 0.09643177315592766, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09643177315592766, + "reward_after_std": 0.7360293306410313, + "reward_before_mean": 0.7683794125914574, + "reward_before_std": 0.7229588590562344, + "reward_change_max": 0.0005083903670310974, + "reward_change_mean": -0.6719476291909814, + "reward_change_min": -1.1431674733757973, + "reward_change_std": 0.46183205861598253, + "reward_std": 0.7360293418169022, + "rewards/cosine_scaled_reward": 0.1029396834783256, + "rewards/format_reward": 0.5625000074505806, + "step": 163 + }, + { + "advantage_max": 1.4671280607581139, + "advantage_mean": 2.5300930905913788e-08, + "advantage_min": -0.6642183251678944, + "advantage_std": 0.7612626627087593, + "completion_length": 2702.4583587646484, + "epoch": 0.18742857142857142, + "grad_norm": 0.140818789601326, + "kl": 0.02923583984375, + "lambda_div_used": 0.5, + "learning_rate": 8.648485032310144e-07, + "loss": -0.0184, + "reward": 0.013984514982439578, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.013984514982439578, + "reward_after_std": 0.7612626627087593, + "reward_before_mean": 0.5976662468165159, + "reward_before_std": 0.6583688072860241, + "reward_change_max": 0.0011242106556892395, + "reward_change_mean": -0.5836816895753145, + "reward_change_min": -0.9547205977141857, + "reward_change_std": 0.38507608138024807, + "reward_std": 0.7612626999616623, + "rewards/cosine_scaled_reward": 0.04883309965953231, + "rewards/format_reward": 0.5000000055879354, + "step": 164 + }, + { + "advantage_max": 1.6381709426641464, + "advantage_mean": 2.2351742401394148e-08, + "advantage_min": -0.6009313315153122, + "advantage_std": 0.850938007235527, + "completion_length": 3325.479248046875, + "epoch": 0.18857142857142858, + "grad_norm": 0.2902795076370239, + "kl": 0.0403594970703125, + "lambda_div_used": 0.5, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0991, + "reward": -0.3041955577209592, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3041955577209592, + "reward_after_std": 0.8509380016475916, + "reward_before_mean": -0.013059889897704124, + "reward_before_std": 0.8602779507637024, + "reward_change_max": 0.0005616173148155212, + "reward_change_mean": -0.29113566502928734, + "reward_change_min": -0.7681114263832569, + "reward_change_std": 0.2988483002409339, + "reward_std": 0.8509380277246237, + "rewards/cosine_scaled_reward": -0.12111327843740582, + "rewards/format_reward": 0.22916666977107525, + "step": 165 + }, + { + "advantage_max": 1.4696976244449615, + "advantage_mean": -3.104408785592483e-09, + "advantage_min": -0.6678441911935806, + "advantage_std": 0.7880504056811333, + "completion_length": 3159.5625610351562, + "epoch": 0.18971428571428572, + "grad_norm": 0.1653311848640442, + "kl": 0.027587890625, + "lambda_div_used": 0.5, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0269, + "reward": -0.23447778564877808, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23447778564877808, + "reward_after_std": 0.7880504205822945, + "reward_before_mean": 0.14275594055652618, + "reward_before_std": 0.8306904956698418, + "reward_change_max": 0.0057152509689331055, + "reward_change_mean": -0.3772337343543768, + "reward_change_min": -0.8500324971973896, + "reward_change_std": 0.36009896732866764, + "reward_std": 0.7880504615604877, + "rewards/cosine_scaled_reward": -0.08487202413380146, + "rewards/format_reward": 0.3125000111758709, + "step": 166 + }, + { + "advantage_max": 1.4749844521284103, + "advantage_mean": 1.1102230246251565e-16, + "advantage_min": -0.6022154316306114, + "advantage_std": 0.7785226926207542, + "completion_length": 2614.416717529297, + "epoch": 0.19085714285714286, + "grad_norm": 0.19377173483371735, + "kl": 0.025634765625, + "lambda_div_used": 0.5, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0438, + "reward": -0.12003645673394203, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12003645673394203, + "reward_after_std": 0.7785226963460445, + "reward_before_mean": 0.3457608614116907, + "reward_before_std": 0.7535338178277016, + "reward_change_max": 0.0003101229667663574, + "reward_change_mean": -0.4657973274588585, + "reward_change_min": -0.9503576084971428, + "reward_change_std": 0.36564162001013756, + "reward_std": 0.7785227224230766, + "rewards/cosine_scaled_reward": -0.16045291302725673, + "rewards/format_reward": 0.6666666734963655, + "step": 167 + }, + { + "advantage_max": 1.6648427098989487, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -0.8055135011672974, + "advantage_std": 0.8785872720181942, + "completion_length": 2906.2708740234375, + "epoch": 0.192, + "grad_norm": 0.16217641532421112, + "kl": 0.0312652587890625, + "lambda_div_used": 0.5, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0122, + "reward": 0.11544627044349909, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11544627044349909, + "reward_after_std": 0.8785872720181942, + "reward_before_mean": 0.7536546923220158, + "reward_before_std": 0.829089242964983, + "reward_change_max": 0.000681951642036438, + "reward_change_mean": -0.6382084004580975, + "reward_change_min": -1.1904875002801418, + "reward_change_std": 0.4658457115292549, + "reward_std": 0.8785872757434845, + "rewards/cosine_scaled_reward": 0.09557732753455639, + "rewards/format_reward": 0.5625000111758709, + "step": 168 + }, + { + "advantage_max": 1.8562040403485298, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.6270218789577484, + "advantage_std": 0.9351372793316841, + "completion_length": 2535.291732788086, + "epoch": 0.19314285714285714, + "grad_norm": 0.16913841664791107, + "kl": 0.04036712646484375, + "lambda_div_used": 0.5, + "learning_rate": 8.534360744126753e-07, + "loss": 0.035, + "reward": 0.2956652529537678, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2956652529537678, + "reward_after_std": 0.9351372867822647, + "reward_before_mean": 1.0529553515370935, + "reward_before_std": 0.701921995729208, + "reward_change_max": 0.0003681108355522156, + "reward_change_mean": -0.7572900727391243, + "reward_change_min": -1.1275853663682938, + "reward_change_std": 0.4388972017914057, + "reward_std": 0.9351373203098774, + "rewards/cosine_scaled_reward": 0.24522765818983316, + "rewards/format_reward": 0.5625000018626451, + "step": 169 + }, + { + "advantage_max": 1.4476363211870193, + "advantage_mean": 4.346172199909404e-09, + "advantage_min": -0.675914853811264, + "advantage_std": 0.7612299062311649, + "completion_length": 2383.458396911621, + "epoch": 0.19428571428571428, + "grad_norm": 0.14560545980930328, + "kl": 0.02826690673828125, + "lambda_div_used": 0.5, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0487, + "reward": -0.05682095978409052, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05682095978409052, + "reward_after_std": 0.7612299248576164, + "reward_before_mean": 0.47058921796269715, + "reward_before_std": 0.7103168535977602, + "reward_change_max": 0.0016023367643356323, + "reward_change_mean": -0.5274101868271828, + "reward_change_min": -1.002557884901762, + "reward_change_std": 0.38785367645323277, + "reward_std": 0.761229932308197, + "rewards/cosine_scaled_reward": -0.06678873766213655, + "rewards/format_reward": 0.6041666846722364, + "step": 170 + }, + { + "advantage_max": 1.6320666521787643, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6522354558110237, + "advantage_std": 0.8446485474705696, + "completion_length": 2998.3333587646484, + "epoch": 0.19542857142857142, + "grad_norm": 0.1538252979516983, + "kl": 0.0330810546875, + "lambda_div_used": 0.5, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0287, + "reward": -0.11476925574243069, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11476925574243069, + "reward_after_std": 0.8446485660970211, + "reward_before_mean": 0.33872489258646965, + "reward_before_std": 0.7975562885403633, + "reward_change_max": 0.0007885992527008057, + "reward_change_mean": -0.4534941161982715, + "reward_change_min": -0.8434424623847008, + "reward_change_std": 0.3448071158491075, + "reward_std": 0.8446485921740532, + "rewards/cosine_scaled_reward": -0.028554232325404882, + "rewards/format_reward": 0.39583334140479565, + "step": 171 + }, + { + "advantage_max": 1.4919284507632256, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.6620048135519028, + "advantage_std": 0.7910696156322956, + "completion_length": 2916.354217529297, + "epoch": 0.19657142857142856, + "grad_norm": 0.13567769527435303, + "kl": 0.0409088134765625, + "lambda_div_used": 0.5, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0138, + "reward": -0.02154737338423729, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.02154737338423729, + "reward_after_std": 0.7910696379840374, + "reward_before_mean": 0.5326739009469748, + "reward_before_std": 0.7421072386205196, + "reward_change_max": 0.0, + "reward_change_mean": -0.5542212873697281, + "reward_change_min": -0.9738561362028122, + "reward_change_std": 0.4017046205699444, + "reward_std": 0.7910696603357792, + "rewards/cosine_scaled_reward": 0.04758694767951965, + "rewards/format_reward": 0.43750000558793545, + "step": 172 + }, + { + "advantage_max": 1.25401646271348, + "advantage_mean": -6.208816794028849e-10, + "advantage_min": -0.4338163882493973, + "advantage_std": 0.6396212875843048, + "completion_length": 2764.0416831970215, + "epoch": 0.1977142857142857, + "grad_norm": 0.10223139822483063, + "kl": 0.049713134765625, + "lambda_div_used": 0.5, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0166, + "reward": -0.36996013298630714, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.36996013298630714, + "reward_after_std": 0.6396212838590145, + "reward_before_mean": -0.07614278933033347, + "reward_before_std": 0.5837610512971878, + "reward_change_max": 0.0012845918536186218, + "reward_change_mean": -0.29381735995411873, + "reward_change_min": -0.5957952998578548, + "reward_change_std": 0.2213090155273676, + "reward_std": 0.6396212950348854, + "rewards/cosine_scaled_reward": -0.24640473164618015, + "rewards/format_reward": 0.4166666716337204, + "step": 173 + }, + { + "advantage_max": 1.5859555639326572, + "advantage_mean": 1.676380712023473e-08, + "advantage_min": -0.6190120317041874, + "advantage_std": 0.8216863200068474, + "completion_length": 2392.7291870117188, + "epoch": 0.19885714285714284, + "grad_norm": 0.14171089231967926, + "kl": 0.0450439453125, + "lambda_div_used": 0.5, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0041, + "reward": -0.0845435168594122, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0845435168594122, + "reward_after_std": 0.8216863293200731, + "reward_before_mean": 0.400400809943676, + "reward_before_std": 0.7570674475282431, + "reward_change_max": 0.0009220615029335022, + "reward_change_mean": -0.484944318421185, + "reward_change_min": -0.9629802815616131, + "reward_change_std": 0.35314718913286924, + "reward_std": 0.8216863460838795, + "rewards/cosine_scaled_reward": -0.1018829345703125, + "rewards/format_reward": 0.6041666772216558, + "step": 174 + }, + { + "advantage_max": 1.261248379945755, + "advantage_mean": 9.934108091691485e-09, + "advantage_min": -0.6661129705607891, + "advantage_std": 0.683073777705431, + "completion_length": 2795.50004196167, + "epoch": 0.2, + "grad_norm": 0.13047651946544647, + "kl": 0.0414886474609375, + "lambda_div_used": 0.5, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0349, + "reward": -0.044038325548172, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.044038325548172, + "reward_after_std": 0.6830737814307213, + "reward_before_mean": 0.5270445011556149, + "reward_before_std": 0.6684056259691715, + "reward_change_max": 0.0006987974047660828, + "reward_change_mean": -0.5710828024893999, + "reward_change_min": -0.9975682608783245, + "reward_change_std": 0.4042843095958233, + "reward_std": 0.6830737888813019, + "rewards/cosine_scaled_reward": 0.013522235676646233, + "rewards/format_reward": 0.5000000111758709, + "step": 175 + }, + { + "advantage_max": 1.607403114438057, + "advantage_mean": 1.3038516322172455e-08, + "advantage_min": -0.6778121329843998, + "advantage_std": 0.8579734489321709, + "completion_length": 2719.270896911621, + "epoch": 0.20114285714285715, + "grad_norm": 0.2701166570186615, + "kl": 0.0418701171875, + "lambda_div_used": 0.5, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0373, + "reward": -0.06047849915921688, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06047849915921688, + "reward_after_std": 0.8579734601080418, + "reward_before_mean": 0.4298029188066721, + "reward_before_std": 0.8629395943135023, + "reward_change_max": 0.0006796494126319885, + "reward_change_mean": -0.4902814142405987, + "reward_change_min": -1.097361333668232, + "reward_change_std": 0.427899737842381, + "reward_std": 0.8579734787344933, + "rewards/cosine_scaled_reward": -0.0455152140930295, + "rewards/format_reward": 0.520833333954215, + "step": 176 + }, + { + "advantage_max": 1.4356028512120247, + "advantage_mean": 1.8626452047421083e-09, + "advantage_min": -0.5858288630843163, + "advantage_std": 0.7616856321692467, + "completion_length": 3033.062545776367, + "epoch": 0.2022857142857143, + "grad_norm": 0.22180777788162231, + "kl": 0.05340576171875, + "lambda_div_used": 0.5, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0459, + "reward": -0.2571666557341814, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2571666557341814, + "reward_after_std": 0.7616856321692467, + "reward_before_mean": 0.10084620304405689, + "reward_before_std": 0.7724887356162071, + "reward_change_max": 0.0013925060629844666, + "reward_change_mean": -0.3580128587782383, + "reward_change_min": -0.75266994535923, + "reward_change_std": 0.3098360765725374, + "reward_std": 0.7616856396198273, + "rewards/cosine_scaled_reward": -0.19957689847797155, + "rewards/format_reward": 0.5000000093132257, + "step": 177 + }, + { + "advantage_max": 1.3237878009676933, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.5824862495064735, + "advantage_std": 0.706440394744277, + "completion_length": 2700.687530517578, + "epoch": 0.20342857142857143, + "grad_norm": 0.21339182555675507, + "kl": 0.058441162109375, + "lambda_div_used": 0.5, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0295, + "reward": -0.28272207267582417, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.28272207267582417, + "reward_after_std": 0.7064403779804707, + "reward_before_mean": 0.06567367631942034, + "reward_before_std": 0.7269596587866545, + "reward_change_max": 0.003658100962638855, + "reward_change_mean": -0.34839577320963144, + "reward_change_min": -0.721770029515028, + "reward_change_std": 0.3038365198299289, + "reward_std": 0.7064404059201479, + "rewards/cosine_scaled_reward": -0.14424649812281132, + "rewards/format_reward": 0.35416667349636555, + "step": 178 + }, + { + "advantage_max": 1.487779911607504, + "advantage_mean": 1.3659398279131096e-08, + "advantage_min": -0.4699557423591614, + "advantage_std": 0.7494055908173323, + "completion_length": 2953.3333740234375, + "epoch": 0.20457142857142857, + "grad_norm": 0.37815913558006287, + "kl": 0.0502777099609375, + "lambda_div_used": 0.5, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0936, + "reward": -0.32715226151049137, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32715226151049137, + "reward_after_std": 0.7494055908173323, + "reward_before_mean": -0.028430516831576824, + "reward_before_std": 0.675062196329236, + "reward_change_max": 0.0010721608996391296, + "reward_change_mean": -0.2987217428162694, + "reward_change_min": -0.5488623008131981, + "reward_change_std": 0.22199713252484798, + "reward_std": 0.7494056057184935, + "rewards/cosine_scaled_reward": -0.17046526400372386, + "rewards/format_reward": 0.31250000186264515, + "step": 179 + }, + { + "advantage_max": 1.5573591962456703, + "advantage_mean": 1.8626452602532595e-09, + "advantage_min": -0.6234206072986126, + "advantage_std": 0.8115277662873268, + "completion_length": 2144.208366394043, + "epoch": 0.2057142857142857, + "grad_norm": 0.1629016101360321, + "kl": 0.0549774169921875, + "lambda_div_used": 0.5, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0502, + "reward": 0.01475525926798582, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.01475525926798582, + "reward_after_std": 0.8115277662873268, + "reward_before_mean": 0.5810065679252148, + "reward_before_std": 0.7341162748634815, + "reward_change_max": 0.0006464123725891113, + "reward_change_mean": -0.5662512928247452, + "reward_change_min": -1.0189911015331745, + "reward_change_std": 0.38147035613656044, + "reward_std": 0.8115277960896492, + "rewards/cosine_scaled_reward": -0.04283006116747856, + "rewards/format_reward": 0.6666666679084301, + "step": 180 + }, + { + "advantage_max": 1.4340339675545692, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.578693151473999, + "advantage_std": 0.7451658025383949, + "completion_length": 2921.541732788086, + "epoch": 0.20685714285714285, + "grad_norm": 0.20287711918354034, + "kl": 0.05859375, + "lambda_div_used": 0.5, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0512, + "reward": -0.1003317330032587, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1003317330032587, + "reward_after_std": 0.7451657950878143, + "reward_before_mean": 0.39000407606363297, + "reward_before_std": 0.6769847068935633, + "reward_change_max": 0.0010607540607452393, + "reward_change_mean": -0.49033585004508495, + "reward_change_min": -0.9029561765491962, + "reward_change_std": 0.3469520937651396, + "reward_std": 0.7451658211648464, + "rewards/cosine_scaled_reward": -0.002914630458690226, + "rewards/format_reward": 0.3958333395421505, + "step": 181 + }, + { + "advantage_max": 1.394393615424633, + "advantage_mean": 1.8626452047421083e-09, + "advantage_min": -0.7053156830370426, + "advantage_std": 0.7596918232738972, + "completion_length": 2541.437545776367, + "epoch": 0.208, + "grad_norm": 0.16592997312545776, + "kl": 0.0494232177734375, + "lambda_div_used": 0.5, + "learning_rate": 8.220696016880687e-07, + "loss": -0.0005, + "reward": -0.022533608600497246, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.022533608600497246, + "reward_after_std": 0.759691808372736, + "reward_before_mean": 0.5406998414546251, + "reward_before_std": 0.769719572737813, + "reward_change_max": 0.003669723868370056, + "reward_change_mean": -0.5632334500551224, + "reward_change_min": -1.0908655300736427, + "reward_change_std": 0.4336672220379114, + "reward_std": 0.7596918120980263, + "rewards/cosine_scaled_reward": -0.021316751837730408, + "rewards/format_reward": 0.5833333469927311, + "step": 182 + }, + { + "advantage_max": 1.5372853577136993, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6862039528787136, + "advantage_std": 0.8002040609717369, + "completion_length": 2360.8958892822266, + "epoch": 0.20914285714285713, + "grad_norm": 0.2165931612253189, + "kl": 0.0673828125, + "lambda_div_used": 0.5, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0391, + "reward": 0.06708026025444269, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06708026025444269, + "reward_after_std": 0.8002040907740593, + "reward_before_mean": 0.6775889825075865, + "reward_before_std": 0.7066179402172565, + "reward_change_max": 0.0, + "reward_change_mean": -0.6105087604373693, + "reward_change_min": -0.9791091829538345, + "reward_change_std": 0.3944264929741621, + "reward_std": 0.8002041131258011, + "rewards/cosine_scaled_reward": 0.005461166147142649, + "rewards/format_reward": 0.666666679084301, + "step": 183 + }, + { + "advantage_max": 1.4124026373028755, + "advantage_mean": -1.2417630257388623e-09, + "advantage_min": -0.5000558458268642, + "advantage_std": 0.729760505259037, + "completion_length": 2818.354202270508, + "epoch": 0.2102857142857143, + "grad_norm": 0.25655215978622437, + "kl": 0.072998046875, + "lambda_div_used": 0.5, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0362, + "reward": -0.35524504724889994, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35524504724889994, + "reward_after_std": 0.7297605089843273, + "reward_before_mean": -0.08101257588714361, + "reward_before_std": 0.7060116622596979, + "reward_change_max": 0.0005974471569061279, + "reward_change_mean": -0.2742324732244015, + "reward_change_min": -0.57624626532197, + "reward_change_std": 0.22511245217174292, + "reward_std": 0.7297605387866497, + "rewards/cosine_scaled_reward": -0.21758961910381913, + "rewards/format_reward": 0.3541666716337204, + "step": 184 + }, + { + "advantage_max": 1.020853940397501, + "advantage_mean": 2.2351742234860694e-08, + "advantage_min": -0.42880232259631157, + "advantage_std": 0.54007213935256, + "completion_length": 2751.0833740234375, + "epoch": 0.21142857142857144, + "grad_norm": 0.15834684669971466, + "kl": 0.0701141357421875, + "lambda_div_used": 0.5, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0207, + "reward": -0.35438499972224236, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35438499972224236, + "reward_after_std": 0.5400721412152052, + "reward_before_mean": -0.006038234569132328, + "reward_before_std": 0.505929496139288, + "reward_change_max": 0.0008821934461593628, + "reward_change_mean": -0.3483467437326908, + "reward_change_min": -0.6647264584898949, + "reward_change_std": 0.2687018224969506, + "reward_std": 0.540072163566947, + "rewards/cosine_scaled_reward": -0.23218578845262527, + "rewards/format_reward": 0.4583333432674408, + "step": 185 + }, + { + "advantage_max": 1.0780138969421387, + "advantage_mean": 1.6142925773898753e-08, + "advantage_min": -0.6679522022604942, + "advantage_std": 0.5969401746988297, + "completion_length": 3050.812545776367, + "epoch": 0.21257142857142858, + "grad_norm": 0.16625259816646576, + "kl": 0.090362548828125, + "lambda_div_used": 0.5, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0018, + "reward": -0.0695158913731575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0695158913731575, + "reward_after_std": 0.5969401746988297, + "reward_before_mean": 0.5104820095002651, + "reward_before_std": 0.6082560420036316, + "reward_change_max": 0.0033091604709625244, + "reward_change_mean": -0.5799979045987129, + "reward_change_min": -0.9459269754588604, + "reward_change_std": 0.4035164900124073, + "reward_std": 0.5969401746988297, + "rewards/cosine_scaled_reward": 0.06774100661277771, + "rewards/format_reward": 0.375, + "step": 186 + }, + { + "advantage_max": 1.2345889136195183, + "advantage_mean": 6.208817904251873e-10, + "advantage_min": -0.5896378178149462, + "advantage_std": 0.6629064604640007, + "completion_length": 2472.312545776367, + "epoch": 0.21371428571428572, + "grad_norm": 0.2719042897224426, + "kl": 0.0833282470703125, + "lambda_div_used": 0.5, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0412, + "reward": -0.1858983300626278, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1858983300626278, + "reward_after_std": 0.6629064828157425, + "reward_before_mean": 0.2663856241852045, + "reward_before_std": 0.6563301542773843, + "reward_change_max": 0.004294879734516144, + "reward_change_mean": -0.4522839467972517, + "reward_change_min": -0.8460966721177101, + "reward_change_std": 0.35423574782907963, + "reward_std": 0.6629064977169037, + "rewards/cosine_scaled_reward": -0.12722386233508587, + "rewards/format_reward": 0.5208333488553762, + "step": 187 + }, + { + "advantage_max": 0.6440011262893677, + "advantage_mean": 3.973643181165443e-08, + "advantage_min": -0.3368586078286171, + "advantage_std": 0.3568432927131653, + "completion_length": 3203.229217529297, + "epoch": 0.21485714285714286, + "grad_norm": 0.16227319836616516, + "kl": 0.101318359375, + "lambda_div_used": 0.5, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0267, + "reward": -0.6190324202179909, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.6190324202179909, + "reward_after_std": 0.3568432927131653, + "reward_before_mean": -0.4256970062851906, + "reward_before_std": 0.3935478888452053, + "reward_change_max": 0.0018765032291412354, + "reward_change_mean": -0.1933353953063488, + "reward_change_min": -0.425104808062315, + "reward_change_std": 0.18632372096180916, + "reward_std": 0.3568432964384556, + "rewards/cosine_scaled_reward": -0.24409850127995014, + "rewards/format_reward": 0.06250000186264515, + "step": 188 + }, + { + "advantage_max": 1.5661092102527618, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6449663117527962, + "advantage_std": 0.8198821656405926, + "completion_length": 2691.125045776367, + "epoch": 0.216, + "grad_norm": 0.3333768844604492, + "kl": 0.105682373046875, + "lambda_div_used": 0.5, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0368, + "reward": -0.056773873046040535, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.056773873046040535, + "reward_after_std": 0.8198821656405926, + "reward_before_mean": 0.45091634057462215, + "reward_before_std": 0.7725614793598652, + "reward_change_max": 0.0009219944477081299, + "reward_change_mean": -0.507690217345953, + "reward_change_min": -1.0040309727191925, + "reward_change_std": 0.3715070113539696, + "reward_std": 0.8198821842670441, + "rewards/cosine_scaled_reward": -0.07662516506388783, + "rewards/format_reward": 0.6041666734963655, + "step": 189 + }, + { + "advantage_max": 1.2479316182434559, + "advantage_mean": -1.4280279958533981e-08, + "advantage_min": -0.5517519414424896, + "advantage_std": 0.6586221437901258, + "completion_length": 2229.1458740234375, + "epoch": 0.21714285714285714, + "grad_norm": 0.15027928352355957, + "kl": 0.092498779296875, + "lambda_div_used": 0.5, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0206, + "reward": -0.07967419736087322, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07967419736087322, + "reward_after_std": 0.6586221437901258, + "reward_before_mean": 0.46196483448147774, + "reward_before_std": 0.5788971818983555, + "reward_change_max": 0.00156412273645401, + "reward_change_mean": -0.5416390751488507, + "reward_change_min": -0.9397790506482124, + "reward_change_std": 0.37341161631047726, + "reward_std": 0.6586221754550934, + "rewards/cosine_scaled_reward": -0.05026757996529341, + "rewards/format_reward": 0.5625000111758709, + "step": 190 + }, + { + "advantage_max": 1.4184372648596764, + "advantage_mean": 1.3659398057086491e-08, + "advantage_min": -0.6462213322520256, + "advantage_std": 0.7566835880279541, + "completion_length": 2572.3333740234375, + "epoch": 0.21828571428571428, + "grad_norm": 0.2690849304199219, + "kl": 0.1207275390625, + "lambda_div_used": 0.5, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0306, + "reward": -0.1155730914324522, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1155730914324522, + "reward_after_std": 0.7566835880279541, + "reward_before_mean": 0.362278588116169, + "reward_before_std": 0.7694622874259949, + "reward_change_max": 0.002707548439502716, + "reward_change_mean": -0.4778516888618469, + "reward_change_min": -0.9092966057360172, + "reward_change_std": 0.36542451940476894, + "reward_std": 0.7566836401820183, + "rewards/cosine_scaled_reward": -0.037610700353980064, + "rewards/format_reward": 0.43750000558793545, + "step": 191 + }, + { + "advantage_max": 1.475701242685318, + "advantage_mean": 2.2972624080797033e-08, + "advantage_min": -0.7762894034385681, + "advantage_std": 0.8053352851420641, + "completion_length": 3140.791748046875, + "epoch": 0.21942857142857142, + "grad_norm": 0.29571542143821716, + "kl": 0.12591552734375, + "lambda_div_used": 0.5, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0412, + "reward": -0.18194560799747705, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18194560799747705, + "reward_after_std": 0.8053352888673544, + "reward_before_mean": 0.23417986929416656, + "reward_before_std": 0.8801908064633608, + "reward_change_max": 0.004965953528881073, + "reward_change_mean": -0.416125463321805, + "reward_change_min": -0.905586700886488, + "reward_change_std": 0.39055878994986415, + "reward_std": 0.8053353205323219, + "rewards/cosine_scaled_reward": -0.08082673698663712, + "rewards/format_reward": 0.39583334513008595, + "step": 192 + }, + { + "advantage_max": 1.4207666739821434, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.5451913326978683, + "advantage_std": 0.7405758053064346, + "completion_length": 3119.3333892822266, + "epoch": 0.22057142857142858, + "grad_norm": 0.21056059002876282, + "kl": 0.11077880859375, + "lambda_div_used": 0.5, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0192, + "reward": -0.32524373196065426, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32524373196065426, + "reward_after_std": 0.7405758053064346, + "reward_before_mean": -0.019589267438277602, + "reward_before_std": 0.7454305961728096, + "reward_change_max": 0.0016882121562957764, + "reward_change_mean": -0.30565447825938463, + "reward_change_min": -0.7668191641569138, + "reward_change_std": 0.2845957148820162, + "reward_std": 0.7405758276581764, + "rewards/cosine_scaled_reward": -0.17646130733191967, + "rewards/format_reward": 0.33333334140479565, + "step": 193 + }, + { + "advantage_max": 1.6386329382658005, + "advantage_mean": -1.6653345369377348e-16, + "advantage_min": -0.9344875812530518, + "advantage_std": 0.9019115082919598, + "completion_length": 2661.25008392334, + "epoch": 0.22171428571428572, + "grad_norm": 0.630655825138092, + "kl": 0.100555419921875, + "lambda_div_used": 0.5, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0725, + "reward": 0.040607784409075975, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.040607784409075975, + "reward_after_std": 0.901911523193121, + "reward_before_mean": 0.6173225231468678, + "reward_before_std": 0.9771861061453819, + "reward_change_max": 0.0, + "reward_change_mean": -0.5767147559672594, + "reward_change_min": -1.134931169450283, + "reward_change_std": 0.48344396241009235, + "reward_std": 0.9019115567207336, + "rewards/cosine_scaled_reward": 0.11074459226801991, + "rewards/format_reward": 0.39583334885537624, + "step": 194 + }, + { + "advantage_max": 1.5321245640516281, + "advantage_mean": 1.1796752963366686e-08, + "advantage_min": -0.6784209460020065, + "advantage_std": 0.8166707828640938, + "completion_length": 2987.3333740234375, + "epoch": 0.22285714285714286, + "grad_norm": 0.33836349844932556, + "kl": 0.1348876953125, + "lambda_div_used": 0.5, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0389, + "reward": -0.22936188150197268, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22936188150197268, + "reward_after_std": 0.8166707642376423, + "reward_before_mean": 0.14030634611845016, + "reward_before_std": 0.8599527664482594, + "reward_change_max": 0.0, + "reward_change_mean": -0.36966823507100344, + "reward_change_min": -0.8392911180853844, + "reward_change_std": 0.3513228427618742, + "reward_std": 0.8166707865893841, + "rewards/cosine_scaled_reward": -0.06526349484920502, + "rewards/format_reward": 0.2708333432674408, + "step": 195 + }, + { + "advantage_max": 1.4621029123663902, + "advantage_mean": 3.104408619059029e-09, + "advantage_min": -0.6472999192774296, + "advantage_std": 0.7688349261879921, + "completion_length": 3229.4583740234375, + "epoch": 0.224, + "grad_norm": 0.25638365745544434, + "kl": 0.16192626953125, + "lambda_div_used": 0.5, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0169, + "reward": -0.1755674695596099, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1755674695596099, + "reward_after_std": 0.768834937363863, + "reward_before_mean": 0.25635097920894623, + "reward_before_std": 0.7566685378551483, + "reward_change_max": 0.0, + "reward_change_mean": -0.4319184827618301, + "reward_change_min": -0.8243024758994579, + "reward_change_std": 0.3426487520337105, + "reward_std": 0.768834937363863, + "rewards/cosine_scaled_reward": -0.02807450108230114, + "rewards/format_reward": 0.3125000037252903, + "step": 196 + }, + { + "advantage_max": 1.9627781957387924, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.8632603287696838, + "advantage_std": 1.062688797712326, + "completion_length": 2410.500057220459, + "epoch": 0.22514285714285714, + "grad_norm": 0.4595416188240051, + "kl": 0.1593017578125, + "lambda_div_used": 0.5, + "learning_rate": 7.831121542179086e-07, + "loss": 0.026, + "reward": 0.029495095717720687, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.029495095717720687, + "reward_after_std": 1.062688760459423, + "reward_before_mean": 0.5363987050950527, + "reward_before_std": 1.1507313326001167, + "reward_change_max": 0.009369902312755585, + "reward_change_mean": -0.5069035869091749, + "reward_change_min": -1.2301556020975113, + "reward_change_std": 0.5150115732103586, + "reward_std": 1.0626887753605843, + "rewards/cosine_scaled_reward": 0.07028267765417695, + "rewards/format_reward": 0.3958333358168602, + "step": 197 + }, + { + "advantage_max": 1.6594363003969193, + "advantage_mean": -1.1486311984887365e-08, + "advantage_min": -0.7000625804066658, + "advantage_std": 0.8589765839278698, + "completion_length": 2688.1250610351562, + "epoch": 0.22628571428571428, + "grad_norm": 0.4216662645339966, + "kl": 0.1824951171875, + "lambda_div_used": 0.5, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0484, + "reward": -0.014411035925149918, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.014411035925149918, + "reward_after_std": 0.8589765951037407, + "reward_before_mean": 0.5093722362071276, + "reward_before_std": 0.8033815808594227, + "reward_change_max": 0.0009694769978523254, + "reward_change_mean": -0.5237833168357611, + "reward_change_min": -0.876764677464962, + "reward_change_std": 0.3557278939988464, + "reward_std": 0.8589766137301922, + "rewards/cosine_scaled_reward": 0.004686110652983189, + "rewards/format_reward": 0.500000013038516, + "step": 198 + }, + { + "advantage_max": 1.248784601688385, + "advantage_mean": -1.8626449826975033e-09, + "advantage_min": -0.5464016161859035, + "advantage_std": 0.6605355255305767, + "completion_length": 2801.3333740234375, + "epoch": 0.22742857142857142, + "grad_norm": 0.2286648154258728, + "kl": 0.186309814453125, + "lambda_div_used": 0.5, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0171, + "reward": -0.35405838675796986, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.35405838675796986, + "reward_after_std": 0.6605355255305767, + "reward_before_mean": -0.04154867585748434, + "reward_before_std": 0.6669703125953674, + "reward_change_max": 0.0020766928791999817, + "reward_change_mean": -0.31250972487032413, + "reward_change_min": -0.7029464244842529, + "reward_change_std": 0.28745912201702595, + "reward_std": 0.6605355553328991, + "rewards/cosine_scaled_reward": -0.1770243365317583, + "rewards/format_reward": 0.31250001303851604, + "step": 199 + }, + { + "advantage_max": 1.615936852991581, + "advantage_mean": -1.2417634809303024e-08, + "advantage_min": -0.5708847790956497, + "advantage_std": 0.8185114078223705, + "completion_length": 2348.6042251586914, + "epoch": 0.22857142857142856, + "grad_norm": 0.318811297416687, + "kl": 0.1622314453125, + "lambda_div_used": 0.5, + "learning_rate": 7.75e-07, + "loss": 0.029, + "reward": 0.001345137134194374, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.001345137134194374, + "reward_after_std": 0.8185114376246929, + "reward_before_mean": 0.5466999299824238, + "reward_before_std": 0.6768320240080357, + "reward_change_max": 0.006076157093048096, + "reward_change_mean": -0.5453548207879066, + "reward_change_min": -0.9213692545890808, + "reward_change_std": 0.35560460947453976, + "reward_std": 0.8185114562511444, + "rewards/cosine_scaled_reward": 0.012933290250657592, + "rewards/format_reward": 0.5208333358168602, + "step": 200 + }, + { + "advantage_max": 1.8653700202703476, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -0.987945843487978, + "advantage_std": 1.0161167159676552, + "completion_length": 2228.375068664551, + "epoch": 0.2297142857142857, + "grad_norm": 0.2828601896762848, + "kl": 0.148040771484375, + "lambda_div_used": 0.5, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0173, + "reward": 0.3448567260056734, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3448567260056734, + "reward_after_std": 1.0161167457699776, + "reward_before_mean": 1.136616634670645, + "reward_before_std": 1.0092586781829596, + "reward_change_max": 0.002497762441635132, + "reward_change_mean": -0.7917599156498909, + "reward_change_min": -1.446341522037983, + "reward_change_std": 0.5879297144711018, + "reward_std": 1.0161167681217194, + "rewards/cosine_scaled_reward": 0.25580831430852413, + "rewards/format_reward": 0.6250000093132257, + "step": 201 + }, + { + "advantage_max": 1.617574080824852, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.6772880628705025, + "advantage_std": 0.8404804095625877, + "completion_length": 2325.7083587646484, + "epoch": 0.23085714285714284, + "grad_norm": 0.2522527277469635, + "kl": 0.1925048828125, + "lambda_div_used": 0.5, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0241, + "reward": 0.24290845077484846, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24290845077484846, + "reward_after_std": 0.840480413287878, + "reward_before_mean": 0.9944449234753847, + "reward_before_std": 0.6817901022732258, + "reward_change_max": 0.0, + "reward_change_mean": -0.7515364792197943, + "reward_change_min": -1.2624378241598606, + "reward_change_std": 0.4821481630206108, + "reward_std": 0.8404804356396198, + "rewards/cosine_scaled_reward": 0.21597244683653116, + "rewards/format_reward": 0.5625000018626451, + "step": 202 + }, + { + "advantage_max": 1.8188269883394241, + "advantage_mean": 1.1175870895385742e-08, + "advantage_min": -0.6880287379026413, + "advantage_std": 0.9322909750044346, + "completion_length": 2903.1041870117188, + "epoch": 0.232, + "grad_norm": 0.4888186752796173, + "kl": 0.216949462890625, + "lambda_div_used": 0.5, + "learning_rate": 7.667891533457718e-07, + "loss": 0.054, + "reward": -0.16240698844194412, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.16240698844194412, + "reward_after_std": 0.9322909824550152, + "reward_before_mean": 0.21877515967935324, + "reward_before_std": 0.9038714617490768, + "reward_change_max": 0.0020834803581237793, + "reward_change_mean": -0.3811821388080716, + "reward_change_min": -0.7407228797674179, + "reward_change_std": 0.312047659419477, + "reward_std": 0.9322910197079182, + "rewards/cosine_scaled_reward": -0.05727909505367279, + "rewards/format_reward": 0.33333334140479565, + "step": 203 + }, + { + "advantage_max": 1.2213104590773582, + "advantage_mean": -4.346171922353648e-09, + "advantage_min": -0.5388765670359135, + "advantage_std": 0.6483286060392857, + "completion_length": 2404.8959197998047, + "epoch": 0.23314285714285715, + "grad_norm": 0.3916052579879761, + "kl": 0.20050048828125, + "lambda_div_used": 0.5, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0321, + "reward": -0.05752340517938137, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05752340517938137, + "reward_after_std": 0.6483286060392857, + "reward_before_mean": 0.49461894296109676, + "reward_before_std": 0.5494524594396353, + "reward_change_max": 0.0004562288522720337, + "reward_change_mean": -0.552142359316349, + "reward_change_min": -0.9264970943331718, + "reward_change_std": 0.3694954924285412, + "reward_std": 0.6483286134898663, + "rewards/cosine_scaled_reward": -0.09644054435193539, + "rewards/format_reward": 0.6875000074505806, + "step": 204 + }, + { + "advantage_max": 1.9968280270695686, + "advantage_mean": -1.5522043428362053e-08, + "advantage_min": -0.82924984395504, + "advantage_std": 1.0424656011164188, + "completion_length": 2681.1875915527344, + "epoch": 0.2342857142857143, + "grad_norm": 0.5587193369865417, + "kl": 0.202392578125, + "lambda_div_used": 0.5, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0405, + "reward": 0.03933172253891826, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03933172253891826, + "reward_after_std": 1.0424656197428703, + "reward_before_mean": 0.5593788512051105, + "reward_before_std": 1.0272934138774872, + "reward_change_max": 0.0, + "reward_change_mean": -0.5200471375137568, + "reward_change_min": -1.0572655908763409, + "reward_change_std": 0.4100263640284538, + "reward_std": 1.042465664446354, + "rewards/cosine_scaled_reward": 0.04010608239332214, + "rewards/format_reward": 0.479166679084301, + "step": 205 + }, + { + "advantage_max": 1.7176525816321373, + "advantage_mean": 4.967053990334591e-09, + "advantage_min": -0.6258624605834484, + "advantage_std": 0.8893463686108589, + "completion_length": 3179.979217529297, + "epoch": 0.23542857142857143, + "grad_norm": 0.5262120962142944, + "kl": 0.217041015625, + "lambda_div_used": 0.5, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0088, + "reward": -0.18126259616110474, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18126259616110474, + "reward_after_std": 0.8893463760614395, + "reward_before_mean": 0.1952662207186222, + "reward_before_std": 0.8674014620482922, + "reward_change_max": 0.002380073070526123, + "reward_change_mean": -0.3765288144350052, + "reward_change_min": -0.8658742196857929, + "reward_change_std": 0.3221265822649002, + "reward_std": 0.889346394687891, + "rewards/cosine_scaled_reward": -0.10028356406837702, + "rewards/format_reward": 0.3958333395421505, + "step": 206 + }, + { + "advantage_max": 1.5335121899843216, + "advantage_mean": 2.359350592673337e-08, + "advantage_min": -0.5418790131807327, + "advantage_std": 0.7710942663252354, + "completion_length": 2872.229263305664, + "epoch": 0.23657142857142857, + "grad_norm": 0.4609461724758148, + "kl": 0.242431640625, + "lambda_div_used": 0.5, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0595, + "reward": -0.2480186834000051, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2480186834000051, + "reward_after_std": 0.7710942775011063, + "reward_before_mean": 0.09909067209810019, + "reward_before_std": 0.6730157136917114, + "reward_change_max": 0.0010094791650772095, + "reward_change_mean": -0.34710934944450855, + "reward_change_min": -0.6180264130234718, + "reward_change_std": 0.24715251475572586, + "reward_std": 0.7710942812263966, + "rewards/cosine_scaled_reward": -0.2212880039587617, + "rewards/format_reward": 0.5416666753590107, + "step": 207 + }, + { + "advantage_max": 1.6003206372261047, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.631665613502264, + "advantage_std": 0.8309444338083267, + "completion_length": 2605.604202270508, + "epoch": 0.2377142857142857, + "grad_norm": 0.4859123229980469, + "kl": 0.222259521484375, + "lambda_div_used": 0.5, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0133, + "reward": -0.15951420087367296, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15951420087367296, + "reward_after_std": 0.8309444487094879, + "reward_before_mean": 0.25432649441063404, + "reward_before_std": 0.7960464023053646, + "reward_change_max": 0.0005858913064002991, + "reward_change_mean": -0.41384069016203284, + "reward_change_min": -0.7964263036847115, + "reward_change_std": 0.3161437623202801, + "reward_std": 0.8309444785118103, + "rewards/cosine_scaled_reward": -0.1124201025813818, + "rewards/format_reward": 0.4791666753590107, + "step": 208 + }, + { + "advantage_max": 1.5546401739120483, + "advantage_mean": -5.587935947293232e-09, + "advantage_min": -0.7405244670808315, + "advantage_std": 0.8403762653470039, + "completion_length": 2657.208396911621, + "epoch": 0.23885714285714285, + "grad_norm": 0.29709237813949585, + "kl": 0.2550048828125, + "lambda_div_used": 0.5, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0441, + "reward": 0.06591091491281986, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06591091491281986, + "reward_after_std": 0.8403762653470039, + "reward_before_mean": 0.6802605744451284, + "reward_before_std": 0.8452886939048767, + "reward_change_max": 0.0011561810970306396, + "reward_change_mean": -0.614349715411663, + "reward_change_min": -1.1940862014889717, + "reward_change_std": 0.46805957332253456, + "reward_std": 0.8403762951493263, + "rewards/cosine_scaled_reward": -0.0036197155714035034, + "rewards/format_reward": 0.6875000037252903, + "step": 209 + }, + { + "advantage_max": 1.196301095187664, + "advantage_mean": -1.2417632477834672e-09, + "advantage_min": -0.5151055417954922, + "advantage_std": 0.6200075708329678, + "completion_length": 2680.166717529297, + "epoch": 0.24, + "grad_norm": 0.3059697151184082, + "kl": 0.2135009765625, + "lambda_div_used": 0.5, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0306, + "reward": -0.061095981509424746, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.061095981509424746, + "reward_after_std": 0.6200075745582581, + "reward_before_mean": 0.5075158753897995, + "reward_before_std": 0.50667629763484, + "reward_change_max": 0.00011307001113891602, + "reward_change_mean": -0.5686118816956878, + "reward_change_min": -0.9131542295217514, + "reward_change_std": 0.34807233698666096, + "reward_std": 0.620007585734129, + "rewards/cosine_scaled_reward": 0.014174612239003181, + "rewards/format_reward": 0.47916667349636555, + "step": 210 + }, + { + "advantage_max": 1.583703152835369, + "advantage_mean": 1.4901161637936866e-08, + "advantage_min": -0.7270540446043015, + "advantage_std": 0.837357334792614, + "completion_length": 2362.187530517578, + "epoch": 0.24114285714285713, + "grad_norm": 0.5771478414535522, + "kl": 0.2373046875, + "lambda_div_used": 0.5, + "learning_rate": 7.444385869608921e-07, + "loss": -0.0055, + "reward": -0.05148214101791382, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05148214101791382, + "reward_after_std": 0.8373573496937752, + "reward_before_mean": 0.45062290877103806, + "reward_before_std": 0.8202326558530331, + "reward_change_max": 0.0023512914776802063, + "reward_change_mean": -0.502105032093823, + "reward_change_min": -0.9305754974484444, + "reward_change_std": 0.37797669507563114, + "reward_std": 0.8373573534190655, + "rewards/cosine_scaled_reward": -0.014271877706050873, + "rewards/format_reward": 0.47916668094694614, + "step": 211 + }, + { + "advantage_max": 1.3947479017078876, + "advantage_mean": 1.490116185998147e-08, + "advantage_min": -0.6110854707658291, + "advantage_std": 0.7231886181980371, + "completion_length": 2476.9375610351562, + "epoch": 0.2422857142857143, + "grad_norm": 0.6602047681808472, + "kl": 0.28125, + "lambda_div_used": 0.5, + "learning_rate": 7.416006812042827e-07, + "loss": -0.0038, + "reward": -0.12894845008850098, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12894845008850098, + "reward_after_std": 0.7231886237859726, + "reward_before_mean": 0.3463945370167494, + "reward_before_std": 0.65158936008811, + "reward_change_max": 0.000197581946849823, + "reward_change_mean": -0.4753429926931858, + "reward_change_min": -0.8183762915432453, + "reward_change_std": 0.320674704387784, + "reward_std": 0.7231886312365532, + "rewards/cosine_scaled_reward": -0.08721940265968442, + "rewards/format_reward": 0.520833345130086, + "step": 212 + }, + { + "advantage_max": 1.5491738989949226, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.6663748882710934, + "advantage_std": 0.8115720618516207, + "completion_length": 2468.4792251586914, + "epoch": 0.24342857142857144, + "grad_norm": 0.38668930530548096, + "kl": 0.28509521484375, + "lambda_div_used": 0.5, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0502, + "reward": -0.054109593853354454, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.054109593853354454, + "reward_after_std": 0.8115720748901367, + "reward_before_mean": 0.45440296456217766, + "reward_before_std": 0.7600329406559467, + "reward_change_max": 0.0024536699056625366, + "reward_change_mean": -0.5085125498007983, + "reward_change_min": -0.983875211328268, + "reward_change_std": 0.37908973544836044, + "reward_std": 0.8115721084177494, + "rewards/cosine_scaled_reward": -0.07488186378031969, + "rewards/format_reward": 0.6041666753590107, + "step": 213 + }, + { + "advantage_max": 1.6328135281801224, + "advantage_mean": -4.346171922353648e-09, + "advantage_min": -0.7386883497238159, + "advantage_std": 0.8717365637421608, + "completion_length": 2732.2500610351562, + "epoch": 0.24457142857142858, + "grad_norm": 0.7322311997413635, + "kl": 0.23431396484375, + "lambda_div_used": 0.5, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0818, + "reward": -0.05645147990435362, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05645147990435362, + "reward_after_std": 0.8717365935444832, + "reward_before_mean": 0.442460672929883, + "reward_before_std": 0.8829019628465176, + "reward_change_max": 0.0, + "reward_change_mean": -0.49891215190291405, + "reward_change_min": -1.0390591099858284, + "reward_change_std": 0.4195642340928316, + "reward_std": 0.8717366270720959, + "rewards/cosine_scaled_reward": -0.049603000516071916, + "rewards/format_reward": 0.5416666734963655, + "step": 214 + }, + { + "advantage_max": 1.150866337120533, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.5373395159840584, + "advantage_std": 0.612909123301506, + "completion_length": 2315.812515258789, + "epoch": 0.24571428571428572, + "grad_norm": 0.26189979910850525, + "kl": 0.209259033203125, + "lambda_div_used": 0.5, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0336, + "reward": -0.1552600208669901, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1552600208669901, + "reward_after_std": 0.6129091084003448, + "reward_before_mean": 0.33281449880450964, + "reward_before_std": 0.574024710804224, + "reward_change_max": 0.00011177361011505127, + "reward_change_mean": -0.48807452619075775, + "reward_change_min": -0.8746481277048588, + "reward_change_std": 0.33652111142873764, + "reward_std": 0.6129091084003448, + "rewards/cosine_scaled_reward": -0.16692609898746014, + "rewards/format_reward": 0.6666666772216558, + "step": 215 + }, + { + "advantage_max": 1.7821224480867386, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.7689832858741283, + "advantage_std": 0.9388571158051491, + "completion_length": 2343.37508392334, + "epoch": 0.24685714285714286, + "grad_norm": 0.5186208486557007, + "kl": 0.25201416015625, + "lambda_div_used": 0.5, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0608, + "reward": 0.16585935093462467, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16585935093462467, + "reward_after_std": 0.9388571009039879, + "reward_before_mean": 0.8218697644770145, + "reward_before_std": 0.8792456723749638, + "reward_change_max": 0.0, + "reward_change_mean": -0.6560104191303253, + "reward_change_min": -1.2858059257268906, + "reward_change_std": 0.4733603745698929, + "reward_std": 0.9388571158051491, + "rewards/cosine_scaled_reward": 0.004684882238507271, + "rewards/format_reward": 0.8125000223517418, + "step": 216 + }, + { + "advantage_max": 1.7403410822153091, + "advantage_mean": -5.5879355587151736e-09, + "advantage_min": -0.6725828759372234, + "advantage_std": 0.9263965599238873, + "completion_length": 2844.8750762939453, + "epoch": 0.248, + "grad_norm": 0.47382375597953796, + "kl": 0.3302001953125, + "lambda_div_used": 0.5, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0358, + "reward": -0.06096999440342188, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06096999440342188, + "reward_after_std": 0.9263965785503387, + "reward_before_mean": 0.4143219366669655, + "reward_before_std": 0.9496859908103943, + "reward_change_max": 0.001133456826210022, + "reward_change_mean": -0.47529190964996815, + "reward_change_min": -1.12648393958807, + "reward_change_std": 0.4290233626961708, + "reward_std": 0.9263966307044029, + "rewards/cosine_scaled_reward": -0.03242238308303058, + "rewards/format_reward": 0.47916667349636555, + "step": 217 + }, + { + "advantage_max": 1.569477766752243, + "advantage_mean": -2.220446049250313e-16, + "advantage_min": -0.5702780596911907, + "advantage_std": 0.8123864158987999, + "completion_length": 2503.3333587646484, + "epoch": 0.24914285714285714, + "grad_norm": 0.25727400183677673, + "kl": 0.233642578125, + "lambda_div_used": 0.5, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0315, + "reward": -0.2071865415200591, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2071865415200591, + "reward_after_std": 0.8123864233493805, + "reward_before_mean": 0.1725800707936287, + "reward_before_std": 0.785517480224371, + "reward_change_max": 0.002965576946735382, + "reward_change_mean": -0.3797666160389781, + "reward_change_min": -0.8266499191522598, + "reward_change_std": 0.30547447595745325, + "reward_std": 0.8123864307999611, + "rewards/cosine_scaled_reward": -0.11162663483992219, + "rewards/format_reward": 0.3958333358168602, + "step": 218 + }, + { + "advantage_max": 1.4346892908215523, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.6690775826573372, + "advantage_std": 0.7566139325499535, + "completion_length": 2354.4792404174805, + "epoch": 0.2502857142857143, + "grad_norm": 0.38650956749916077, + "kl": 0.282867431640625, + "lambda_div_used": 0.5, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0173, + "reward": 0.017466269433498383, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.017466269433498383, + "reward_after_std": 0.7566139176487923, + "reward_before_mean": 0.6082219113595784, + "reward_before_std": 0.6809872947633266, + "reward_change_max": 0.00339333713054657, + "reward_change_mean": -0.5907555720768869, + "reward_change_min": -1.0609471648931503, + "reward_change_std": 0.4097218685783446, + "reward_std": 0.7566139437258244, + "rewards/cosine_scaled_reward": -0.029222410172224045, + "rewards/format_reward": 0.6666666753590107, + "step": 219 + }, + { + "advantage_max": 0.955601155757904, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.40321509912610054, + "advantage_std": 0.50310243293643, + "completion_length": 2792.3333740234375, + "epoch": 0.25142857142857145, + "grad_norm": 0.41683459281921387, + "kl": 0.364990234375, + "lambda_div_used": 0.5, + "learning_rate": 7.185729670371604e-07, + "loss": 0.031, + "reward": -0.43255934678018093, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.43255934678018093, + "reward_after_std": 0.5031024254858494, + "reward_before_mean": -0.139401210937649, + "reward_before_std": 0.47662991285324097, + "reward_change_max": 0.0028152763843536377, + "reward_change_mean": -0.2931581400334835, + "reward_change_min": -0.542736854404211, + "reward_change_std": 0.22856017015874386, + "reward_std": 0.5031024366617203, + "rewards/cosine_scaled_reward": -0.26761728897690773, + "rewards/format_reward": 0.39583333767950535, + "step": 220 + }, + { + "advantage_max": 1.7276756018400192, + "advantage_mean": -4.967053768289986e-09, + "advantage_min": -0.8304614424705505, + "advantage_std": 0.9121339544653893, + "completion_length": 2324.187545776367, + "epoch": 0.25257142857142856, + "grad_norm": 0.6268060207366943, + "kl": 0.3243408203125, + "lambda_div_used": 0.5, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0542, + "reward": 0.11553898081183434, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.11553898081183434, + "reward_after_std": 0.9121339917182922, + "reward_before_mean": 0.7398845301941037, + "reward_before_std": 0.8802719376981258, + "reward_change_max": 0.0006899982690811157, + "reward_change_mean": -0.62434555683285, + "reward_change_min": -1.1505605317652225, + "reward_change_std": 0.4582773372530937, + "reward_std": 0.9121340066194534, + "rewards/cosine_scaled_reward": 0.04702560231089592, + "rewards/format_reward": 0.6458333469927311, + "step": 221 + }, + { + "advantage_max": 1.26497058942914, + "advantage_mean": -1.1796752907855534e-08, + "advantage_min": -0.5657484494149685, + "advantage_std": 0.6593565940856934, + "completion_length": 2305.333366394043, + "epoch": 0.2537142857142857, + "grad_norm": 0.3454340398311615, + "kl": 0.3505859375, + "lambda_div_used": 0.5, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0358, + "reward": 0.044890944845974445, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.044890944845974445, + "reward_after_std": 0.6593566089868546, + "reward_before_mean": 0.6905238628387451, + "reward_before_std": 0.5432622786611319, + "reward_change_max": 0.00029180198907852173, + "reward_change_mean": -0.645632941275835, + "reward_change_min": -1.049218151718378, + "reward_change_std": 0.39333911798894405, + "reward_std": 0.6593566127121449, + "rewards/cosine_scaled_reward": 0.022345258854329586, + "rewards/format_reward": 0.645833333954215, + "step": 222 + }, + { + "advantage_max": 1.798129253089428, + "advantage_mean": 4.967053768289986e-09, + "advantage_min": -0.8689031004905701, + "advantage_std": 0.950947355479002, + "completion_length": 2518.166732788086, + "epoch": 0.25485714285714284, + "grad_norm": 0.621035635471344, + "kl": 0.40380859375, + "lambda_div_used": 0.5, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0301, + "reward": 0.011060059070587158, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.011060059070587158, + "reward_after_std": 0.9509473778307438, + "reward_before_mean": 0.5413750989828259, + "reward_before_std": 0.9515966884791851, + "reward_change_max": 0.0010984092950820923, + "reward_change_mean": -0.5303150387480855, + "reward_change_min": -0.9885171167552471, + "reward_change_std": 0.4150323858484626, + "reward_std": 0.9509474150836468, + "rewards/cosine_scaled_reward": -0.02097912272438407, + "rewards/format_reward": 0.5833333488553762, + "step": 223 + }, + { + "advantage_max": 1.782409906387329, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.6734658181667328, + "advantage_std": 0.9314799420535564, + "completion_length": 2908.7084045410156, + "epoch": 0.256, + "grad_norm": 0.5879162549972534, + "kl": 0.42041015625, + "lambda_div_used": 0.5, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0696, + "reward": -0.10128612630069256, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10128612630069256, + "reward_after_std": 0.9314799383282661, + "reward_before_mean": 0.3314627211075276, + "reward_before_std": 0.9180979616940022, + "reward_change_max": 0.0, + "reward_change_mean": -0.4327488373965025, + "reward_change_min": -0.9870041385293007, + "reward_change_std": 0.37257966212928295, + "reward_std": 0.9314799644052982, + "rewards/cosine_scaled_reward": -0.09468531236052513, + "rewards/format_reward": 0.520833345130086, + "step": 224 + }, + { + "advantage_max": 0.9428818374872208, + "advantage_mean": 1.2417638028949796e-09, + "advantage_min": -0.5349617823958397, + "advantage_std": 0.5179965812712908, + "completion_length": 3046.437545776367, + "epoch": 0.2571428571428571, + "grad_norm": 0.8561449646949768, + "kl": 0.4515380859375, + "lambda_div_used": 0.5, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0212, + "reward": -0.3602433856576681, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3602433856576681, + "reward_after_std": 0.5179965812712908, + "reward_before_mean": -0.0025209784507751465, + "reward_before_std": 0.5427715508267283, + "reward_change_max": 0.0024244189262390137, + "reward_change_mean": -0.35772242583334446, + "reward_change_min": -0.6620992906391621, + "reward_change_std": 0.28696852759458125, + "reward_std": 0.5179965924471617, + "rewards/cosine_scaled_reward": -0.18876048736274242, + "rewards/format_reward": 0.37500000931322575, + "step": 225 + }, + { + "advantage_max": 1.444564439356327, + "advantage_mean": -4.967053546245381e-09, + "advantage_min": -0.5781576111912727, + "advantage_std": 0.7491228319704533, + "completion_length": 2677.2083740234375, + "epoch": 0.2582857142857143, + "grad_norm": 0.5025551915168762, + "kl": 0.4019775390625, + "lambda_div_used": 0.5, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0154, + "reward": 0.07985361525788903, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07985361525788903, + "reward_after_std": 0.7491228580474854, + "reward_before_mean": 0.725711066275835, + "reward_before_std": 0.6195245683193207, + "reward_change_max": 8.346140384674072e-05, + "reward_change_mean": -0.6458574496209621, + "reward_change_min": -1.0521031320095062, + "reward_change_std": 0.40322335436940193, + "reward_std": 0.7491228729486465, + "rewards/cosine_scaled_reward": 0.07118885964155197, + "rewards/format_reward": 0.583333333954215, + "step": 226 + }, + { + "advantage_max": 1.6421142667531967, + "advantage_mean": -4.346171977864799e-09, + "advantage_min": -0.7257067114114761, + "advantage_std": 0.858584251254797, + "completion_length": 2422.666702270508, + "epoch": 0.25942857142857145, + "grad_norm": 0.6199707388877869, + "kl": 0.35296630859375, + "lambda_div_used": 0.5, + "learning_rate": 6.979899910323624e-07, + "loss": 0.008, + "reward": -0.09044338576495647, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09044338576495647, + "reward_after_std": 0.858584251254797, + "reward_before_mean": 0.3791947956196964, + "reward_before_std": 0.8184922076761723, + "reward_change_max": 0.0029807984828948975, + "reward_change_mean": -0.4696381874382496, + "reward_change_min": -0.9254156686365604, + "reward_change_std": 0.3720104694366455, + "reward_std": 0.8585842587053776, + "rewards/cosine_scaled_reward": -0.13331927731633186, + "rewards/format_reward": 0.6458333507180214, + "step": 227 + }, + { + "advantage_max": 1.3848458677530289, + "advantage_mean": -1.862645149230957e-09, + "advantage_min": -0.7087922766804695, + "advantage_std": 0.7344786264002323, + "completion_length": 2385.395866394043, + "epoch": 0.26057142857142856, + "grad_norm": 0.30610209703445435, + "kl": 0.3045654296875, + "lambda_div_used": 0.5, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0351, + "reward": -0.0021043140441179276, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0021043140441179276, + "reward_after_std": 0.7344786264002323, + "reward_before_mean": 0.5830968823283911, + "reward_before_std": 0.6933562718331814, + "reward_change_max": 0.0014819428324699402, + "reward_change_mean": -0.585201189853251, + "reward_change_min": -1.0121977366507053, + "reward_change_std": 0.4014543369412422, + "reward_std": 0.7344786338508129, + "rewards/cosine_scaled_reward": 0.02071509137749672, + "rewards/format_reward": 0.5416666734963655, + "step": 228 + }, + { + "advantage_max": 1.376700833439827, + "advantage_mean": 1.3038516877283968e-08, + "advantage_min": -0.6517984047532082, + "advantage_std": 0.731338307261467, + "completion_length": 2883.5209350585938, + "epoch": 0.26171428571428573, + "grad_norm": 0.6094762682914734, + "kl": 0.408447265625, + "lambda_div_used": 0.5, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0149, + "reward": -0.10475949943065643, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10475949943065643, + "reward_after_std": 0.7313383035361767, + "reward_before_mean": 0.39478362910449505, + "reward_before_std": 0.7087099254131317, + "reward_change_max": 3.308802843093872e-05, + "reward_change_mean": -0.49954311922192574, + "reward_change_min": -0.9209417179226875, + "reward_change_std": 0.38150350376963615, + "reward_std": 0.7313383109867573, + "rewards/cosine_scaled_reward": -0.031774863600730896, + "rewards/format_reward": 0.4583333469927311, + "step": 229 + }, + { + "advantage_max": 1.37582515925169, + "advantage_mean": -3.725290520506519e-09, + "advantage_min": -0.5348953269422054, + "advantage_std": 0.7181179635226727, + "completion_length": 3124.541717529297, + "epoch": 0.26285714285714284, + "grad_norm": 0.5313109755516052, + "kl": 0.381103515625, + "lambda_div_used": 0.5, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0481, + "reward": -0.29289132729172707, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.29289132729172707, + "reward_after_std": 0.7181179486215115, + "reward_before_mean": 0.049733877182006836, + "reward_before_std": 0.7073988057672977, + "reward_change_max": 0.0, + "reward_change_mean": -0.342625193297863, + "reward_change_min": -0.7525230571627617, + "reward_change_std": 0.287032725289464, + "reward_std": 0.718117967247963, + "rewards/cosine_scaled_reward": -0.15221640653908253, + "rewards/format_reward": 0.3541666679084301, + "step": 230 + }, + { + "advantage_max": 1.051845483481884, + "advantage_mean": 2.3593505704688766e-08, + "advantage_min": -0.5571013279259205, + "advantage_std": 0.5643906779587269, + "completion_length": 2762.500045776367, + "epoch": 0.264, + "grad_norm": 0.33200061321258545, + "kl": 0.290496826171875, + "lambda_div_used": 0.5, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0513, + "reward": -0.24098680447787046, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.24098680447787046, + "reward_after_std": 0.5643906742334366, + "reward_before_mean": 0.20136728300713003, + "reward_before_std": 0.5432005859911442, + "reward_change_max": 0.0009815171360969543, + "reward_change_mean": -0.4423540476709604, + "reward_change_min": -0.7840287238359451, + "reward_change_std": 0.31915647722780704, + "reward_std": 0.5643906779587269, + "rewards/cosine_scaled_reward": -0.11806636117398739, + "rewards/format_reward": 0.43750000558793545, + "step": 231 + }, + { + "advantage_max": 1.15317015722394, + "advantage_mean": 1.3038516488705909e-08, + "advantage_min": -0.444794662296772, + "advantage_std": 0.598000954836607, + "completion_length": 2860.708450317383, + "epoch": 0.2651428571428571, + "grad_norm": 0.2935395836830139, + "kl": 0.30963134765625, + "lambda_div_used": 0.5, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0267, + "reward": -0.4222787544131279, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4222787544131279, + "reward_after_std": 0.5980009585618973, + "reward_before_mean": -0.1471138414926827, + "reward_before_std": 0.5847893413156271, + "reward_change_max": 0.0004475414752960205, + "reward_change_mean": -0.27516489988192916, + "reward_change_min": -0.580917950719595, + "reward_change_std": 0.23027911875396967, + "reward_std": 0.5980009716004133, + "rewards/cosine_scaled_reward": -0.2402235958725214, + "rewards/format_reward": 0.3333333395421505, + "step": 232 + }, + { + "advantage_max": 1.4307596236467361, + "advantage_mean": 7.45058065243498e-09, + "advantage_min": -0.5155288130044937, + "advantage_std": 0.7287798225879669, + "completion_length": 2711.333427429199, + "epoch": 0.2662857142857143, + "grad_norm": 0.37518948316574097, + "kl": 0.260467529296875, + "lambda_div_used": 0.5, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0252, + "reward": -0.16911163227632642, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.16911163227632642, + "reward_after_std": 0.7287798337638378, + "reward_before_mean": 0.271101389080286, + "reward_before_std": 0.6243734955787659, + "reward_change_max": 0.0, + "reward_change_mean": -0.4402130162343383, + "reward_change_min": -0.7976448684930801, + "reward_change_std": 0.3009780514985323, + "reward_std": 0.7287798523902893, + "rewards/cosine_scaled_reward": -0.17694931849837303, + "rewards/format_reward": 0.6250000149011612, + "step": 233 + }, + { + "advantage_max": 1.38912433385849, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.5794681049883366, + "advantage_std": 0.7250793315470219, + "completion_length": 2647.0208854675293, + "epoch": 0.2674285714285714, + "grad_norm": 0.3574603796005249, + "kl": 0.2249755859375, + "lambda_div_used": 0.5, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0235, + "reward": -0.2084459774196148, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2084459774196148, + "reward_after_std": 0.7250793501734734, + "reward_before_mean": 0.20548728946596384, + "reward_before_std": 0.6992978528141975, + "reward_change_max": 0.0009614154696464539, + "reward_change_mean": -0.4139332454651594, + "reward_change_min": -0.8338779956102371, + "reward_change_std": 0.32731484808027744, + "reward_std": 0.7250793538987637, + "rewards/cosine_scaled_reward": -0.09517304040491581, + "rewards/format_reward": 0.39583333767950535, + "step": 234 + }, + { + "advantage_max": 1.5219962149858475, + "advantage_mean": -1.117587211663107e-08, + "advantage_min": -0.5729375965893269, + "advantage_std": 0.7900757826864719, + "completion_length": 2313.7083892822266, + "epoch": 0.26857142857142857, + "grad_norm": 0.30398738384246826, + "kl": 0.222442626953125, + "lambda_div_used": 0.5, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0332, + "reward": -0.08736011805012822, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08736011805012822, + "reward_after_std": 0.7900757752358913, + "reward_before_mean": 0.403134074062109, + "reward_before_std": 0.7198442071676254, + "reward_change_max": 0.0, + "reward_change_mean": -0.4904941339045763, + "reward_change_min": -0.928714144974947, + "reward_change_std": 0.34090583585202694, + "reward_std": 0.7900757789611816, + "rewards/cosine_scaled_reward": -0.07968299090862274, + "rewards/format_reward": 0.5625000074505806, + "step": 235 + }, + { + "advantage_max": 2.1206346452236176, + "advantage_mean": -1.428027990302283e-08, + "advantage_min": -0.9444840997457504, + "advantage_std": 1.131901353597641, + "completion_length": 2942.8959045410156, + "epoch": 0.26971428571428574, + "grad_norm": 1.6650360822677612, + "kl": 0.2109375, + "lambda_div_used": 0.5, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0798, + "reward": 0.06566341919824481, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06566341919824481, + "reward_after_std": 1.1319013722240925, + "reward_before_mean": 0.585906186606735, + "reward_before_std": 1.19473173096776, + "reward_change_max": 0.00039448589086532593, + "reward_change_mean": -0.520242765545845, + "reward_change_min": -1.2649845704436302, + "reward_change_std": 0.5010114163160324, + "reward_std": 1.1319014057517052, + "rewards/cosine_scaled_reward": 0.022119746543467045, + "rewards/format_reward": 0.541666679084301, + "step": 236 + }, + { + "advantage_max": 1.702420450747013, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.7322128117084503, + "advantage_std": 0.8832337111234665, + "completion_length": 2699.979217529297, + "epoch": 0.27085714285714285, + "grad_norm": 0.5813829302787781, + "kl": 0.23004150390625, + "lambda_div_used": 0.5, + "learning_rate": 6.679851303883891e-07, + "loss": -0.0006, + "reward": 0.06363435182720423, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06363435182720423, + "reward_after_std": 0.8832337185740471, + "reward_before_mean": 0.651249460875988, + "reward_before_std": 0.80680762976408, + "reward_change_max": 0.0011894777417182922, + "reward_change_mean": -0.5876150969415903, + "reward_change_min": -1.0081750489771366, + "reward_change_std": 0.4123306255787611, + "reward_std": 0.8832337334752083, + "rewards/cosine_scaled_reward": 0.06520804762840271, + "rewards/format_reward": 0.5208333507180214, + "step": 237 + }, + { + "advantage_max": 1.7350734397768974, + "advantage_mean": -1.614292466367573e-08, + "advantage_min": -0.8302115723490715, + "advantage_std": 0.9283706545829773, + "completion_length": 2385.9166870117188, + "epoch": 0.272, + "grad_norm": 0.5285000205039978, + "kl": 0.22991943359375, + "lambda_div_used": 0.5, + "learning_rate": 6.649505910711058e-07, + "loss": -0.0073, + "reward": 0.16460101958364248, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16460101958364248, + "reward_after_std": 0.9283706694841385, + "reward_before_mean": 0.8287704335525632, + "reward_before_std": 0.8974343203008175, + "reward_change_max": 0.003822311758995056, + "reward_change_mean": -0.6641694214195013, + "reward_change_min": -1.260919313877821, + "reward_change_std": 0.5075275972485542, + "reward_std": 0.9283707290887833, + "rewards/cosine_scaled_reward": 0.13313521444797516, + "rewards/format_reward": 0.5625000149011612, + "step": 238 + }, + { + "advantage_max": 1.426647413522005, + "advantage_mean": 3.7252898543727042e-09, + "advantage_min": -0.5672443434596062, + "advantage_std": 0.7399463746696711, + "completion_length": 2061.166706085205, + "epoch": 0.27314285714285713, + "grad_norm": 0.21509422361850739, + "kl": 0.182098388671875, + "lambda_div_used": 0.5, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0163, + "reward": 0.21323653869330883, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21323653869330883, + "reward_after_std": 0.7399463709443808, + "reward_before_mean": 0.9677451644092798, + "reward_before_std": 0.576780516654253, + "reward_change_max": 0.0005485713481903076, + "reward_change_mean": -0.7545085661113262, + "reward_change_min": -1.1895010620355606, + "reward_change_std": 0.44879256654530764, + "reward_std": 0.7399463932961226, + "rewards/cosine_scaled_reward": 0.16095588821917772, + "rewards/format_reward": 0.6458333414047956, + "step": 239 + }, + { + "advantage_max": 1.2942400351166725, + "advantage_mean": 3.104408563547878e-09, + "advantage_min": -0.5551019683480263, + "advantage_std": 0.6765760257840157, + "completion_length": 2909.104232788086, + "epoch": 0.2742857142857143, + "grad_norm": 0.4627504348754883, + "kl": 0.38671875, + "lambda_div_used": 0.5, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0268, + "reward": -0.2009472165373154, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2009472165373154, + "reward_after_std": 0.6765760257840157, + "reward_before_mean": 0.23628009483218193, + "reward_before_std": 0.6305925287306309, + "reward_change_max": 0.0005473867058753967, + "reward_change_mean": -0.437227301299572, + "reward_change_min": -0.8425621017813683, + "reward_change_std": 0.3167913742363453, + "reward_std": 0.676576055586338, + "rewards/cosine_scaled_reward": -0.18394330446608365, + "rewards/format_reward": 0.604166679084301, + "step": 240 + }, + { + "advantage_max": 0.9308176077902317, + "advantage_mean": 1.8626452213954536e-08, + "advantage_min": -0.3792998418211937, + "advantage_std": 0.4861900471150875, + "completion_length": 2913.6875, + "epoch": 0.2754285714285714, + "grad_norm": 0.4878361225128174, + "kl": 0.33270263671875, + "lambda_div_used": 0.5, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0172, + "reward": -0.40733741596341133, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.40733741596341133, + "reward_after_std": 0.4861900359392166, + "reward_before_mean": -0.08301575854420662, + "reward_before_std": 0.4412674307823181, + "reward_change_max": 0.0004919767379760742, + "reward_change_mean": -0.32432164903730154, + "reward_change_min": -0.6432442888617516, + "reward_change_std": 0.23452804517000914, + "reward_std": 0.4861900471150875, + "rewards/cosine_scaled_reward": -0.2602578904479742, + "rewards/format_reward": 0.4375000037252903, + "step": 241 + }, + { + "advantage_max": 1.4320058524608612, + "advantage_mean": -2.4835267731226907e-09, + "advantage_min": -0.6539147347211838, + "advantage_std": 0.7636258415877819, + "completion_length": 2382.5417404174805, + "epoch": 0.2765714285714286, + "grad_norm": 0.5571370124816895, + "kl": 0.2674560546875, + "lambda_div_used": 0.5, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0559, + "reward": 0.047114765271544456, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.047114765271544456, + "reward_after_std": 0.7636258564889431, + "reward_before_mean": 0.6656512469053268, + "reward_before_std": 0.7144070193171501, + "reward_change_max": 0.0007030218839645386, + "reward_change_mean": -0.6185364685952663, + "reward_change_min": -1.1091222614049911, + "reward_change_std": 0.42813634127378464, + "reward_std": 0.7636258639395237, + "rewards/cosine_scaled_reward": -0.0630077242385596, + "rewards/format_reward": 0.7916666753590107, + "step": 242 + }, + { + "advantage_max": 1.5952362790703773, + "advantage_mean": 1.7384688466570708e-08, + "advantage_min": -0.58317955955863, + "advantage_std": 0.8279558680951595, + "completion_length": 2815.541702270508, + "epoch": 0.2777142857142857, + "grad_norm": 0.612875759601593, + "kl": 0.308380126953125, + "lambda_div_used": 0.5, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0432, + "reward": -0.06789615005254745, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06789615005254745, + "reward_after_std": 0.827955886721611, + "reward_before_mean": 0.42719752667471766, + "reward_before_std": 0.7438078261911869, + "reward_change_max": 0.0016501322388648987, + "reward_change_mean": -0.49509366787970066, + "reward_change_min": -0.9931257180869579, + "reward_change_std": 0.38069539703428745, + "reward_std": 0.8279559127986431, + "rewards/cosine_scaled_reward": -0.025984576670452952, + "rewards/format_reward": 0.47916666977107525, + "step": 243 + }, + { + "advantage_max": 1.8111069053411484, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.6616870537400246, + "advantage_std": 0.9255795367062092, + "completion_length": 2855.2083892822266, + "epoch": 0.27885714285714286, + "grad_norm": 0.47464779019355774, + "kl": 0.323638916015625, + "lambda_div_used": 0.5, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0105, + "reward": 0.07657577097415924, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07657577097415924, + "reward_after_std": 0.9255795031785965, + "reward_before_mean": 0.6568863705615513, + "reward_before_std": 0.802655566483736, + "reward_change_max": 0.00221802294254303, + "reward_change_mean": -0.5803106501698494, + "reward_change_min": -1.0821171216666698, + "reward_change_std": 0.39801392890512943, + "reward_std": 0.9255795180797577, + "rewards/cosine_scaled_reward": 0.09927653288468719, + "rewards/format_reward": 0.45833334140479565, + "step": 244 + }, + { + "advantage_max": 1.9345757067203522, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.8029041439294815, + "advantage_std": 1.0185250714421272, + "completion_length": 2893.0209350585938, + "epoch": 0.28, + "grad_norm": 1.1499474048614502, + "kl": 0.329345703125, + "lambda_div_used": 0.5, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0769, + "reward": -0.11789725301787257, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11789725301787257, + "reward_after_std": 1.0185250639915466, + "reward_before_mean": 0.27830402879044414, + "reward_before_std": 1.0572170242667198, + "reward_change_max": 0.00021164864301681519, + "reward_change_mean": -0.3962012715637684, + "reward_change_min": -1.0164557546377182, + "reward_change_std": 0.3989331964403391, + "reward_std": 1.0185250863432884, + "rewards/cosine_scaled_reward": -0.03793133102590218, + "rewards/format_reward": 0.35416667722165585, + "step": 245 + }, + { + "advantage_max": 1.797962486743927, + "advantage_mean": -1.6763806787167823e-08, + "advantage_min": -0.8499506339430809, + "advantage_std": 0.9726065471768379, + "completion_length": 2728.291763305664, + "epoch": 0.28114285714285714, + "grad_norm": 0.7932919263839722, + "kl": 0.32568359375, + "lambda_div_used": 0.5, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0096, + "reward": 0.08054419793188572, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08054419793188572, + "reward_after_std": 0.9726065471768379, + "reward_before_mean": 0.6667746864259243, + "reward_before_std": 1.0122821666300297, + "reward_change_max": 0.0, + "reward_change_mean": -0.5862304829061031, + "reward_change_min": -1.2132366746664047, + "reward_change_std": 0.4926592092961073, + "reward_std": 0.9726065844297409, + "rewards/cosine_scaled_reward": 0.041720665991306305, + "rewards/format_reward": 0.5833333525806665, + "step": 246 + }, + { + "advantage_max": 1.1503918841481209, + "advantage_mean": 9.313226134732844e-09, + "advantage_min": -0.5107267498970032, + "advantage_std": 0.624145220965147, + "completion_length": 2959.5625610351562, + "epoch": 0.2822857142857143, + "grad_norm": 0.2704883813858032, + "kl": 0.33203125, + "lambda_div_used": 0.5, + "learning_rate": 6.374054580489873e-07, + "loss": 0.027, + "reward": -0.28593634255230427, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.28593634255230427, + "reward_after_std": 0.6241452284157276, + "reward_before_mean": 0.09957591397687793, + "reward_before_std": 0.6440289784222841, + "reward_change_max": 0.0004436373710632324, + "reward_change_mean": -0.3855122644454241, + "reward_change_min": -0.8384397625923157, + "reward_change_std": 0.3347723223268986, + "reward_std": 0.6241452470421791, + "rewards/cosine_scaled_reward": -0.22104538418352604, + "rewards/format_reward": 0.5416666772216558, + "step": 247 + }, + { + "advantage_max": 1.7827886566519737, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.7948522940278053, + "advantage_std": 0.9226235747337341, + "completion_length": 2637.9375762939453, + "epoch": 0.2834285714285714, + "grad_norm": 0.6064723134040833, + "kl": 0.323699951171875, + "lambda_div_used": 0.5, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0602, + "reward": 0.15817245468497276, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15817245468497276, + "reward_after_std": 0.9226235896348953, + "reward_before_mean": 0.8102723509073257, + "reward_before_std": 0.8119069188833237, + "reward_change_max": 0.0, + "reward_change_mean": -0.652099872007966, + "reward_change_min": -1.090793576091528, + "reward_change_std": 0.43665625154972076, + "reward_std": 0.9226236119866371, + "rewards/cosine_scaled_reward": 0.08221947122365236, + "rewards/format_reward": 0.6458333432674408, + "step": 248 + }, + { + "advantage_max": 1.5615429654717445, + "advantage_mean": -9.934107314535368e-09, + "advantage_min": -0.7385703772306442, + "advantage_std": 0.8267139345407486, + "completion_length": 2314.9584045410156, + "epoch": 0.2845714285714286, + "grad_norm": 0.5855349898338318, + "kl": 0.285888671875, + "lambda_div_used": 0.5, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0494, + "reward": 0.21203571744263172, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21203571744263172, + "reward_after_std": 0.8267139345407486, + "reward_before_mean": 0.9464347688481212, + "reward_before_std": 0.7446025982499123, + "reward_change_max": 0.0, + "reward_change_mean": -0.7343990430235863, + "reward_change_min": -1.168241087347269, + "reward_change_std": 0.47521305456757545, + "reward_std": 0.8267139419913292, + "rewards/cosine_scaled_reward": 0.15030069323256612, + "rewards/format_reward": 0.6458333432674408, + "step": 249 + }, + { + "advantage_max": 1.6400514990091324, + "advantage_mean": 6.208815128694312e-10, + "advantage_min": -0.6048454195261002, + "advantage_std": 0.8398489728569984, + "completion_length": 2437.979248046875, + "epoch": 0.2857142857142857, + "grad_norm": 0.5643945336341858, + "kl": 0.34326171875, + "lambda_div_used": 0.5, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0226, + "reward": -0.026893689762800932, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.026893689762800932, + "reward_after_std": 0.8398489952087402, + "reward_before_mean": 0.49545107781887054, + "reward_before_std": 0.7367004603147507, + "reward_change_max": 0.0, + "reward_change_mean": -0.5223447624593973, + "reward_change_min": -0.9968926385045052, + "reward_change_std": 0.3599364850670099, + "reward_std": 0.8398490101099014, + "rewards/cosine_scaled_reward": -0.13769113458693027, + "rewards/format_reward": 0.770833345130086, + "step": 250 + }, + { + "advantage_max": 1.7156466022133827, + "advantage_mean": -8.692344455329959e-09, + "advantage_min": -0.6233803145587444, + "advantage_std": 0.8830557502806187, + "completion_length": 2032.9167175292969, + "epoch": 0.28685714285714287, + "grad_norm": 0.5100935101509094, + "kl": 0.2845458984375, + "lambda_div_used": 0.5, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0108, + "reward": 0.15881825191900134, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15881825191900134, + "reward_after_std": 0.8830557428300381, + "reward_before_mean": 0.8191157560795546, + "reward_before_std": 0.7504131086170673, + "reward_change_max": 0.0, + "reward_change_mean": -0.6602975130081177, + "reward_change_min": -1.1482919603586197, + "reward_change_std": 0.4236573148518801, + "reward_std": 0.8830557651817799, + "rewards/cosine_scaled_reward": 0.013724527321755886, + "rewards/format_reward": 0.7916666772216558, + "step": 251 + }, + { + "advantage_max": 1.3824920132756233, + "advantage_mean": -7.450580263856921e-09, + "advantage_min": -0.5751038901507854, + "advantage_std": 0.7163071185350418, + "completion_length": 2569.6042098999023, + "epoch": 0.288, + "grad_norm": 0.44910934567451477, + "kl": 0.32745361328125, + "lambda_div_used": 0.5, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0476, + "reward": -0.045434391126036644, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.045434391126036644, + "reward_after_std": 0.7163071371614933, + "reward_before_mean": 0.5019231028854847, + "reward_before_std": 0.6015528850257397, + "reward_change_max": 0.0, + "reward_change_mean": -0.5473575107753277, + "reward_change_min": -0.9850481189787388, + "reward_change_std": 0.3646436370909214, + "reward_std": 0.7163071446120739, + "rewards/cosine_scaled_reward": -0.0719551183283329, + "rewards/format_reward": 0.6458333544433117, + "step": 252 + }, + { + "advantage_max": 1.5739115923643112, + "advantage_mean": 1.7074247515846963e-08, + "advantage_min": -0.7448620498180389, + "advantage_std": 0.8350856080651283, + "completion_length": 2185.479217529297, + "epoch": 0.28914285714285715, + "grad_norm": 0.6235557794570923, + "kl": 0.2657470703125, + "lambda_div_used": 0.5, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0477, + "reward": 0.13030250370502472, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13030250370502472, + "reward_after_std": 0.8350856006145477, + "reward_before_mean": 0.7909778542816639, + "reward_before_std": 0.7733752019703388, + "reward_change_max": 0.0005941390991210938, + "reward_change_mean": -0.660675348713994, + "reward_change_min": -1.1640889719128609, + "reward_change_std": 0.4511568062007427, + "reward_std": 0.8350856229662895, + "rewards/cosine_scaled_reward": 0.0100722536444664, + "rewards/format_reward": 0.770833345130086, + "step": 253 + }, + { + "advantage_max": 1.4640011079609394, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.575132817029953, + "advantage_std": 0.7637783885002136, + "completion_length": 2904.3125610351562, + "epoch": 0.29028571428571426, + "grad_norm": 0.6790186166763306, + "kl": 0.49212646484375, + "lambda_div_used": 0.5, + "learning_rate": 6.157373628530852e-07, + "loss": 0.044, + "reward": -0.17827866226434708, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17827866226434708, + "reward_after_std": 0.7637783773243427, + "reward_before_mean": 0.24730434804223478, + "reward_before_std": 0.7225446216762066, + "reward_change_max": 0.0017682760953903198, + "reward_change_mean": -0.42558303009718657, + "reward_change_min": -0.908573143184185, + "reward_change_std": 0.33383440785109997, + "reward_std": 0.763778381049633, + "rewards/cosine_scaled_reward": -0.1367644937708974, + "rewards/format_reward": 0.5208333395421505, + "step": 254 + }, + { + "advantage_max": 1.595671109855175, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.6261331886053085, + "advantage_std": 0.820063129067421, + "completion_length": 2718.9375610351562, + "epoch": 0.2914285714285714, + "grad_norm": 0.6525692939758301, + "kl": 0.441162109375, + "lambda_div_used": 0.5, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0181, + "reward": -0.21421317756175995, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21421317756175995, + "reward_after_std": 0.8200631737709045, + "reward_before_mean": 0.16194157907739282, + "reward_before_std": 0.7840095274150372, + "reward_change_max": 0.001371636986732483, + "reward_change_mean": -0.3761547487229109, + "reward_change_min": -0.768225908279419, + "reward_change_std": 0.30263841338455677, + "reward_std": 0.8200631812214851, + "rewards/cosine_scaled_reward": -0.16902921721339226, + "rewards/format_reward": 0.5000000093132257, + "step": 255 + }, + { + "advantage_max": 1.0734341964125633, + "advantage_mean": -4.346171700309043e-09, + "advantage_min": -0.56340616568923, + "advantage_std": 0.5797542482614517, + "completion_length": 2705.5625762939453, + "epoch": 0.2925714285714286, + "grad_norm": 0.5698035359382629, + "kl": 0.4195556640625, + "lambda_div_used": 0.5, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0522, + "reward": -0.21560884034261107, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21560884034261107, + "reward_after_std": 0.5797542706131935, + "reward_before_mean": 0.24173666816204786, + "reward_before_std": 0.5742171257734299, + "reward_change_max": 4.9874186515808105e-05, + "reward_change_mean": -0.457345524802804, + "reward_change_min": -0.8107205480337143, + "reward_change_std": 0.33175988495349884, + "reward_std": 0.579754289239645, + "rewards/cosine_scaled_reward": -0.12913167104125023, + "rewards/format_reward": 0.5000000167638063, + "step": 256 + }, + { + "advantage_max": 1.8975205719470978, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.8254300951957703, + "advantage_std": 1.0045362412929535, + "completion_length": 2797.666748046875, + "epoch": 0.2937142857142857, + "grad_norm": 1.2031116485595703, + "kl": 0.36602783203125, + "lambda_div_used": 0.5, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0641, + "reward": 0.21093771699815989, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21093771699815989, + "reward_after_std": 1.004536233842373, + "reward_before_mean": 0.8886510282754898, + "reward_before_std": 0.9414378330111504, + "reward_change_max": 0.0012116432189941406, + "reward_change_mean": -0.6777133084833622, + "reward_change_min": -1.307423584163189, + "reward_change_std": 0.5130571741610765, + "reward_std": 1.0045362561941147, + "rewards/cosine_scaled_reward": 0.07974217180162668, + "rewards/format_reward": 0.729166679084301, + "step": 257 + }, + { + "advantage_max": 1.6636093333363533, + "advantage_mean": 4.967053435223079e-09, + "advantage_min": -0.8262095041573048, + "advantage_std": 0.8977803438901901, + "completion_length": 2957.854248046875, + "epoch": 0.2948571428571429, + "grad_norm": 0.6931731700897217, + "kl": 0.4793701171875, + "lambda_div_used": 0.5, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0403, + "reward": -0.015886036679148674, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.015886036679148674, + "reward_after_std": 0.8977803215384483, + "reward_before_mean": 0.5135537143796682, + "reward_before_std": 0.9260082244873047, + "reward_change_max": 0.0007776841521263123, + "reward_change_mean": -0.5294397762045264, + "reward_change_min": -1.0711536519229412, + "reward_change_std": 0.4524089526385069, + "reward_std": 0.8977803774178028, + "rewards/cosine_scaled_reward": -0.014056478627026081, + "rewards/format_reward": 0.5416666846722364, + "step": 258 + }, + { + "advantage_max": 1.886551707983017, + "advantage_mean": -2.2351742234860694e-08, + "advantage_min": -0.8040063604712486, + "advantage_std": 0.991726316511631, + "completion_length": 2334.0625610351562, + "epoch": 0.296, + "grad_norm": 0.7005351185798645, + "kl": 0.38140869140625, + "lambda_div_used": 0.5, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0496, + "reward": 0.23102650791406631, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23102650791406631, + "reward_after_std": 0.9917263463139534, + "reward_before_mean": 0.9282313352450728, + "reward_before_std": 0.9095796346664429, + "reward_change_max": 0.002127528190612793, + "reward_change_mean": -0.6972048059105873, + "reward_change_min": -1.2518923357129097, + "reward_change_std": 0.4892759174108505, + "reward_std": 0.9917263612151146, + "rewards/cosine_scaled_reward": 0.07869898644275963, + "rewards/format_reward": 0.7708333432674408, + "step": 259 + }, + { + "advantage_max": 1.7901442348957062, + "advantage_mean": -2.6077033421501028e-08, + "advantage_min": -0.8769202791154385, + "advantage_std": 0.9542439803481102, + "completion_length": 1963.6667022705078, + "epoch": 0.29714285714285715, + "grad_norm": 0.6481761932373047, + "kl": 0.27838134765625, + "lambda_div_used": 0.5, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0426, + "reward": 0.2789585944265127, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2789585944265127, + "reward_after_std": 0.9542439877986908, + "reward_before_mean": 1.0259063299745321, + "reward_before_std": 0.9065537489950657, + "reward_change_max": 0.0016413480043411255, + "reward_change_mean": -0.7469477131962776, + "reward_change_min": -1.3164174929261208, + "reward_change_std": 0.5313061438500881, + "reward_std": 0.9542440101504326, + "rewards/cosine_scaled_reward": 0.11711981240659952, + "rewards/format_reward": 0.7916666716337204, + "step": 260 + }, + { + "advantage_max": 1.4177104532718658, + "advantage_mean": -1.1102230246251565e-16, + "advantage_min": -0.6767967715859413, + "advantage_std": 0.7486027590930462, + "completion_length": 2660.0834197998047, + "epoch": 0.29828571428571427, + "grad_norm": 0.4558544158935547, + "kl": 0.396484375, + "lambda_div_used": 0.5, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0347, + "reward": -0.09252565540373325, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09252565540373325, + "reward_after_std": 0.7486027553677559, + "reward_before_mean": 0.41056731529533863, + "reward_before_std": 0.7132025845348835, + "reward_change_max": 0.0010763555765151978, + "reward_change_mean": -0.5030929781496525, + "reward_change_min": -0.8874324820935726, + "reward_change_std": 0.3634376022964716, + "reward_std": 0.7486027553677559, + "rewards/cosine_scaled_reward": -0.09679968375712633, + "rewards/format_reward": 0.6041666809469461, + "step": 261 + }, + { + "advantage_max": 1.3604392558336258, + "advantage_mean": 7.1401400625337175e-09, + "advantage_min": -0.610820833593607, + "advantage_std": 0.7167367935180664, + "completion_length": 2684.2709350585938, + "epoch": 0.29942857142857143, + "grad_norm": 0.5589156150817871, + "kl": 0.40673828125, + "lambda_div_used": 0.5, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0415, + "reward": -0.1675438095408026, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1675438095408026, + "reward_after_std": 0.716736800968647, + "reward_before_mean": 0.28486710973083973, + "reward_before_std": 0.6902591176331043, + "reward_change_max": 0.0013748407363891602, + "reward_change_mean": -0.45241091772913933, + "reward_change_min": -0.8601580671966076, + "reward_change_std": 0.3375726994127035, + "reward_std": 0.7167368233203888, + "rewards/cosine_scaled_reward": -0.2117331251502037, + "rewards/format_reward": 0.7083333469927311, + "step": 262 + }, + { + "advantage_max": 1.264581359922886, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.5539588704705238, + "advantage_std": 0.6596848592162132, + "completion_length": 2776.000030517578, + "epoch": 0.30057142857142854, + "grad_norm": 0.8647629618644714, + "kl": 0.4727783203125, + "lambda_div_used": 0.5, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0256, + "reward": -0.13164973491802812, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13164973491802812, + "reward_after_std": 0.6596848666667938, + "reward_before_mean": 0.3626173473894596, + "reward_before_std": 0.580600518733263, + "reward_change_max": 0.0, + "reward_change_mean": -0.4942670837044716, + "reward_change_min": -0.8870192393660545, + "reward_change_std": 0.32861434295773506, + "reward_std": 0.6596848852932453, + "rewards/cosine_scaled_reward": -0.1936913337558508, + "rewards/format_reward": 0.750000013038516, + "step": 263 + }, + { + "advantage_max": 1.5834196358919144, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.6787533052265644, + "advantage_std": 0.8481276035308838, + "completion_length": 2627.8959350585938, + "epoch": 0.3017142857142857, + "grad_norm": 0.8860915303230286, + "kl": 0.45947265625, + "lambda_div_used": 0.5, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0302, + "reward": -0.08708051778376102, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08708051778376102, + "reward_after_std": 0.8481276035308838, + "reward_before_mean": 0.3910123445093632, + "reward_before_std": 0.864212442189455, + "reward_change_max": 0.0015831664204597473, + "reward_change_mean": -0.47809283807873726, + "reward_change_min": -1.0792249664664268, + "reward_change_std": 0.40922513976693153, + "reward_std": 0.8481276258826256, + "rewards/cosine_scaled_reward": -0.10657717660069466, + "rewards/format_reward": 0.6041666753590107, + "step": 264 + }, + { + "advantage_max": 1.3798726946115494, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.7368720173835754, + "advantage_std": 0.7367407977581024, + "completion_length": 2043.7292022705078, + "epoch": 0.3028571428571429, + "grad_norm": 0.7481672763824463, + "kl": 0.21441650390625, + "lambda_div_used": 0.5, + "learning_rate": 5.813904131848564e-07, + "loss": -0.0053, + "reward": 0.1203412376344204, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1203412376344204, + "reward_after_std": 0.7367407903075218, + "reward_before_mean": 0.8063299190253019, + "reward_before_std": 0.6718975119292736, + "reward_change_max": 0.0, + "reward_change_mean": -0.6859886385500431, + "reward_change_min": -1.1170197911560535, + "reward_change_std": 0.44301604852080345, + "reward_std": 0.7367408089339733, + "rewards/cosine_scaled_reward": -0.003085056319832802, + "rewards/format_reward": 0.812500013038516, + "step": 265 + }, + { + "advantage_max": 1.5312513262033463, + "advantage_mean": 6.208816794028849e-10, + "advantage_min": -0.7652738317847252, + "advantage_std": 0.807264044880867, + "completion_length": 2435.9375610351562, + "epoch": 0.304, + "grad_norm": 0.956333339214325, + "kl": 0.31561279296875, + "lambda_div_used": 0.5, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0178, + "reward": 0.039600093849003315, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.039600093849003315, + "reward_after_std": 0.807264044880867, + "reward_before_mean": 0.6330936271697283, + "reward_before_std": 0.7511084713041782, + "reward_change_max": 0.0, + "reward_change_mean": -0.5934935286641121, + "reward_change_min": -1.0697472617030144, + "reward_change_std": 0.4119662679731846, + "reward_std": 0.8072640635073185, + "rewards/cosine_scaled_reward": -0.04803652781993151, + "rewards/format_reward": 0.7291666865348816, + "step": 266 + }, + { + "advantage_max": 1.6614954099059105, + "advantage_mean": -4.967053768289986e-09, + "advantage_min": -0.6934118308126926, + "advantage_std": 0.8682034313678741, + "completion_length": 2963.6458892822266, + "epoch": 0.30514285714285716, + "grad_norm": 1.267866611480713, + "kl": 0.4324951171875, + "lambda_div_used": 0.5, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0592, + "reward": -0.19175553228706121, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19175553228706121, + "reward_after_std": 0.868203416466713, + "reward_before_mean": 0.19106070883572102, + "reward_before_std": 0.8646641299128532, + "reward_change_max": 0.0, + "reward_change_mean": -0.3828162420541048, + "reward_change_min": -0.8564209714531898, + "reward_change_std": 0.33900113217532635, + "reward_std": 0.8682034537196159, + "rewards/cosine_scaled_reward": -0.1753029921092093, + "rewards/format_reward": 0.5416666734963655, + "step": 267 + }, + { + "advantage_max": 1.3772685006260872, + "advantage_mean": 1.0554989382516311e-08, + "advantage_min": -0.6027562022209167, + "advantage_std": 0.7217709645628929, + "completion_length": 2267.0208740234375, + "epoch": 0.3062857142857143, + "grad_norm": 0.48167508840560913, + "kl": 0.257080078125, + "lambda_div_used": 0.5, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0281, + "reward": -0.1674497053027153, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1674497053027153, + "reward_after_std": 0.7217709608376026, + "reward_before_mean": 0.2781257377937436, + "reward_before_std": 0.6824806816875935, + "reward_change_max": 7.049739360809326e-05, + "reward_change_mean": -0.44557544589042664, + "reward_change_min": -0.9256918206810951, + "reward_change_std": 0.33537369780242443, + "reward_std": 0.7217709757387638, + "rewards/cosine_scaled_reward": -0.13177046133205295, + "rewards/format_reward": 0.5416666734963655, + "step": 268 + }, + { + "advantage_max": 1.4905111193656921, + "advantage_mean": 5.587935503204022e-09, + "advantage_min": -0.5878796353936195, + "advantage_std": 0.7800829708576202, + "completion_length": 2680.1876068115234, + "epoch": 0.30742857142857144, + "grad_norm": 0.42294082045555115, + "kl": 0.29730224609375, + "lambda_div_used": 0.5, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0065, + "reward": -0.014324287883937359, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.014324287883937359, + "reward_after_std": 0.7800829447805882, + "reward_before_mean": 0.5407853480428457, + "reward_before_std": 0.7013232558965683, + "reward_change_max": 0.0007768869400024414, + "reward_change_mean": -0.5551096498966217, + "reward_change_min": -1.0575359836220741, + "reward_change_std": 0.38940995931625366, + "reward_std": 0.780082993209362, + "rewards/cosine_scaled_reward": -0.12544066738337278, + "rewards/format_reward": 0.7916666753590107, + "step": 269 + }, + { + "advantage_max": 1.7074644267559052, + "advantage_mean": -7.45058115203534e-09, + "advantage_min": -0.7014975696802139, + "advantage_std": 0.886136669665575, + "completion_length": 2692.854217529297, + "epoch": 0.30857142857142855, + "grad_norm": 0.725864589214325, + "kl": 0.239715576171875, + "lambda_div_used": 0.5, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0403, + "reward": 0.07836535479873419, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07836535479873419, + "reward_after_std": 0.8861366622149944, + "reward_before_mean": 0.6759741138666868, + "reward_before_std": 0.7980044074356556, + "reward_change_max": 0.0, + "reward_change_mean": -0.5976087264716625, + "reward_change_min": -1.1347657963633537, + "reward_change_std": 0.40823423117399216, + "reward_std": 0.8861366920173168, + "rewards/cosine_scaled_reward": -0.037012950982898474, + "rewards/format_reward": 0.7500000260770321, + "step": 270 + }, + { + "advantage_max": 1.5989461839199066, + "advantage_mean": -9.31322552411018e-09, + "advantage_min": -0.8416948765516281, + "advantage_std": 0.858425922691822, + "completion_length": 2536.104232788086, + "epoch": 0.3097142857142857, + "grad_norm": 0.32519644498825073, + "kl": 0.22412109375, + "lambda_div_used": 0.5, + "learning_rate": 5.625647374256061e-07, + "loss": 0.008, + "reward": 0.2996886605396867, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2996886605396867, + "reward_after_std": 0.8584259375929832, + "reward_before_mean": 1.0987447872757912, + "reward_before_std": 0.8029434680938721, + "reward_change_max": 0.0, + "reward_change_mean": -0.7990561313927174, + "reward_change_min": -1.3279625624418259, + "reward_change_std": 0.5225053429603577, + "reward_std": 0.8584259562194347, + "rewards/cosine_scaled_reward": 0.13270571175962687, + "rewards/format_reward": 0.8333333414047956, + "step": 271 + }, + { + "advantage_max": 1.3123992159962654, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.5966584831476212, + "advantage_std": 0.6955144740641117, + "completion_length": 2838.416717529297, + "epoch": 0.31085714285714283, + "grad_norm": 0.5349856615066528, + "kl": 0.3720703125, + "lambda_div_used": 0.5, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0444, + "reward": -0.0673510073684156, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0673510073684156, + "reward_after_std": 0.6955144926905632, + "reward_before_mean": 0.47645336762070656, + "reward_before_std": 0.6381653603166342, + "reward_change_max": 6.621330976486206e-05, + "reward_change_mean": -0.5438043996691704, + "reward_change_min": -1.0006888955831528, + "reward_change_std": 0.38272993825376034, + "reward_std": 0.6955145299434662, + "rewards/cosine_scaled_reward": -0.03260663757100701, + "rewards/format_reward": 0.5416666772216558, + "step": 272 + }, + { + "advantage_max": 1.5722772255539894, + "advantage_mean": 1.1796752574788627e-08, + "advantage_min": -0.6985844299197197, + "advantage_std": 0.8248922377824783, + "completion_length": 2693.041717529297, + "epoch": 0.312, + "grad_norm": 0.406802773475647, + "kl": 0.27484130859375, + "lambda_div_used": 0.5, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0212, + "reward": 0.11140474304556847, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.11140474304556847, + "reward_after_std": 0.8248922675848007, + "reward_before_mean": 0.7608576994389296, + "reward_before_std": 0.7347896918654442, + "reward_change_max": 0.0008430108428001404, + "reward_change_mean": -0.6494529545307159, + "reward_change_min": -1.1342052109539509, + "reward_change_std": 0.4459607619792223, + "reward_std": 0.8248922750353813, + "rewards/cosine_scaled_reward": 0.04709549807012081, + "rewards/format_reward": 0.6666666734963655, + "step": 273 + }, + { + "advantage_max": 1.8088025748729706, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -0.7626455649733543, + "advantage_std": 0.942629911005497, + "completion_length": 1852.8542098999023, + "epoch": 0.31314285714285717, + "grad_norm": 0.9049416780471802, + "kl": 0.31414794921875, + "lambda_div_used": 0.5, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0055, + "reward": 0.33919756673276424, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33919756673276424, + "reward_after_std": 0.9426299259066582, + "reward_before_mean": 1.1372214332222939, + "reward_before_std": 0.8105506710708141, + "reward_change_max": 0.0016675367951393127, + "reward_change_mean": -0.7980238739401102, + "reward_change_min": -1.3720234483480453, + "reward_change_std": 0.5126860048621893, + "reward_std": 0.9426299408078194, + "rewards/cosine_scaled_reward": 0.15194405056536198, + "rewards/format_reward": 0.833333333954215, + "step": 274 + }, + { + "advantage_max": 1.7259643226861954, + "advantage_mean": -2.4835267176115394e-09, + "advantage_min": -0.7082581743597984, + "advantage_std": 0.8969070762395859, + "completion_length": 2284.500068664551, + "epoch": 0.3142857142857143, + "grad_norm": 0.42548516392707825, + "kl": 0.208038330078125, + "lambda_div_used": 0.5, + "learning_rate": 5.5e-07, + "loss": 0.0016, + "reward": 0.1408673170953989, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1408673170953989, + "reward_after_std": 0.8969070613384247, + "reward_before_mean": 0.7898597102612257, + "reward_before_std": 0.8003634139895439, + "reward_change_max": 0.0011668428778648376, + "reward_change_mean": -0.6489923857152462, + "reward_change_min": -1.132670484483242, + "reward_change_std": 0.4332315605133772, + "reward_std": 0.8969071060419083, + "rewards/cosine_scaled_reward": 0.040763177908957005, + "rewards/format_reward": 0.7083333395421505, + "step": 275 + }, + { + "advantage_max": 1.8853215798735619, + "advantage_mean": -1.241763691872677e-09, + "advantage_min": -0.7883929088711739, + "advantage_std": 0.9830573312938213, + "completion_length": 2414.3750534057617, + "epoch": 0.31542857142857145, + "grad_norm": 0.8729665279388428, + "kl": 0.23760986328125, + "lambda_div_used": 0.5, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0557, + "reward": 0.24620786309242249, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24620786309242249, + "reward_after_std": 0.9830573238432407, + "reward_before_mean": 0.9521079548285343, + "reward_before_std": 0.8866949342191219, + "reward_change_max": 0.0, + "reward_change_mean": -0.7059001140296459, + "reward_change_min": -1.2638509795069695, + "reward_change_std": 0.4728654455393553, + "reward_std": 0.9830573461949825, + "rewards/cosine_scaled_reward": 0.05938731785863638, + "rewards/format_reward": 0.833333358168602, + "step": 276 + }, + { + "advantage_max": 1.5835720784962177, + "advantage_mean": -8.071462442860167e-09, + "advantage_min": -0.6679697595536709, + "advantage_std": 0.8301307894289494, + "completion_length": 2175.2500762939453, + "epoch": 0.31657142857142856, + "grad_norm": 0.24413251876831055, + "kl": 0.170562744140625, + "lambda_div_used": 0.5, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0264, + "reward": -0.008413793984800577, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.008413793984800577, + "reward_after_std": 0.8301308266818523, + "reward_before_mean": 0.5342761669307947, + "reward_before_std": 0.77891606092453, + "reward_change_max": 0.0025727152824401855, + "reward_change_mean": -0.5426899380981922, + "reward_change_min": -0.9942139759659767, + "reward_change_std": 0.38135170191526413, + "reward_std": 0.8301308341324329, + "rewards/cosine_scaled_reward": -0.09744528587907553, + "rewards/format_reward": 0.7291666753590107, + "step": 277 + }, + { + "advantage_max": 1.3841341733932495, + "advantage_mean": -1.3038516211150153e-08, + "advantage_min": -0.4801356568932533, + "advantage_std": 0.7003735899925232, + "completion_length": 2088.479202270508, + "epoch": 0.3177142857142857, + "grad_norm": 0.29018351435661316, + "kl": 0.23699951171875, + "lambda_div_used": 0.5, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0285, + "reward": 0.23467270750552416, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23467270750552416, + "reward_after_std": 0.7003735899925232, + "reward_before_mean": 1.0165191926062107, + "reward_before_std": 0.43884219601750374, + "reward_change_max": 0.0, + "reward_change_mean": -0.7818465009331703, + "reward_change_min": -1.1315721720457077, + "reward_change_std": 0.42579494789242744, + "reward_std": 0.7003736048936844, + "rewards/cosine_scaled_reward": 0.09159291861578822, + "rewards/format_reward": 0.8333333414047956, + "step": 278 + }, + { + "advantage_max": 1.7110272645950317, + "advantage_mean": -1.2417631367611648e-09, + "advantage_min": -0.801856491714716, + "advantage_std": 0.9113363847136497, + "completion_length": 2521.791732788086, + "epoch": 0.31885714285714284, + "grad_norm": 0.9733167886734009, + "kl": 0.2313232421875, + "lambda_div_used": 0.5, + "learning_rate": 5.37435262574394e-07, + "loss": -0.027, + "reward": -0.05211097002029419, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05211097002029419, + "reward_after_std": 0.9113363474607468, + "reward_before_mean": 0.43769562989473343, + "reward_before_std": 0.9335248246788979, + "reward_change_max": 0.004060961306095123, + "reward_change_mean": -0.48980659805238247, + "reward_change_min": -1.0869178883731365, + "reward_change_std": 0.42547522112727165, + "reward_std": 0.9113363847136497, + "rewards/cosine_scaled_reward": -0.0728188632056117, + "rewards/format_reward": 0.5833333469927311, + "step": 279 + }, + { + "advantage_max": 1.8734301775693893, + "advantage_mean": -1.490116141589226e-08, + "advantage_min": -0.8812666833400726, + "advantage_std": 0.9982732012867928, + "completion_length": 2256.062602996826, + "epoch": 0.32, + "grad_norm": 0.7435464262962341, + "kl": 0.198822021484375, + "lambda_div_used": 0.5, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0071, + "reward": 0.510563270188868, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.510563270188868, + "reward_after_std": 0.998273216187954, + "reward_before_mean": 1.439866580069065, + "reward_before_std": 0.889154952019453, + "reward_change_max": 0.0004360675811767578, + "reward_change_mean": -0.9293033089488745, + "reward_change_min": -1.5980991758406162, + "reward_change_std": 0.6186284609138966, + "reward_std": 0.9982732459902763, + "rewards/cosine_scaled_reward": 0.32409994560293853, + "rewards/format_reward": 0.7916666753590107, + "step": 280 + }, + { + "advantage_max": 1.3946446254849434, + "advantage_mean": 2.1730860666480112e-08, + "advantage_min": -0.6471364013850689, + "advantage_std": 0.7381687723100185, + "completion_length": 3138.1875915527344, + "epoch": 0.3211428571428571, + "grad_norm": 0.4560682773590088, + "kl": 0.2747802734375, + "lambda_div_used": 0.5, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0362, + "reward": -0.31014756578952074, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.31014756578952074, + "reward_after_std": 0.7381687834858894, + "reward_before_mean": 0.01806825865060091, + "reward_before_std": 0.7624465189874172, + "reward_change_max": 0.0003070831298828125, + "reward_change_mean": -0.3282158114016056, + "reward_change_min": -0.7260516695678234, + "reward_change_std": 0.30591568164527416, + "reward_std": 0.7381688207387924, + "rewards/cosine_scaled_reward": -0.20971587905660272, + "rewards/format_reward": 0.43750001303851604, + "step": 281 + }, + { + "advantage_max": 1.5682580173015594, + "advantage_mean": -9.313225690643634e-09, + "advantage_min": -0.8357209786772728, + "advantage_std": 0.83661337941885, + "completion_length": 2523.7500610351562, + "epoch": 0.3222857142857143, + "grad_norm": 0.4174196720123291, + "kl": 0.204254150390625, + "lambda_div_used": 0.5, + "learning_rate": 5.28017603591974e-07, + "loss": 0.028, + "reward": 0.19610136304982007, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19610136304982007, + "reward_after_std": 0.8366133980453014, + "reward_before_mean": 0.918760965578258, + "reward_before_std": 0.7789534255862236, + "reward_change_max": 0.0, + "reward_change_mean": -0.7226595841348171, + "reward_change_min": -1.191369317471981, + "reward_change_std": 0.48115156777203083, + "reward_std": 0.8366134092211723, + "rewards/cosine_scaled_reward": 0.05313047394156456, + "rewards/format_reward": 0.8125000074505806, + "step": 282 + }, + { + "advantage_max": 1.661266028881073, + "advantage_mean": -7.450580485901526e-09, + "advantage_min": -0.7974419593811035, + "advantage_std": 0.876350536942482, + "completion_length": 2636.0833740234375, + "epoch": 0.32342857142857145, + "grad_norm": 1.408146858215332, + "kl": 0.2386474609375, + "lambda_div_used": 0.5, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0824, + "reward": 0.21048655919730663, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21048655919730663, + "reward_after_std": 0.8763505443930626, + "reward_before_mean": 0.9238283336162567, + "reward_before_std": 0.7904699593782425, + "reward_change_max": 0.00020716339349746704, + "reward_change_mean": -0.7133418209850788, + "reward_change_min": -1.2066475562751293, + "reward_change_std": 0.48531679809093475, + "reward_std": 0.8763505443930626, + "rewards/cosine_scaled_reward": 0.1494141835719347, + "rewards/format_reward": 0.6250000093132257, + "step": 283 + }, + { + "advantage_max": 1.7019705697894096, + "advantage_mean": -9.313227133933566e-10, + "advantage_min": -0.6638279147446156, + "advantage_std": 0.8716025203466415, + "completion_length": 2428.5625762939453, + "epoch": 0.32457142857142857, + "grad_norm": 0.5000388622283936, + "kl": 0.23974609375, + "lambda_div_used": 0.5, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0384, + "reward": 0.09703357797116041, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09703357797116041, + "reward_after_std": 0.8716025464236736, + "reward_before_mean": 0.7079252786934376, + "reward_before_std": 0.7274694666266441, + "reward_change_max": 0.0, + "reward_change_mean": -0.6108916997909546, + "reward_change_min": -1.0118683576583862, + "reward_change_std": 0.37852455861866474, + "reward_std": 0.8716025911271572, + "rewards/cosine_scaled_reward": -0.05228736763820052, + "rewards/format_reward": 0.8125000111758709, + "step": 284 + }, + { + "advantage_max": 1.5750038623809814, + "advantage_mean": -9.934107703113426e-09, + "advantage_min": -0.7462300956249237, + "advantage_std": 0.8351321816444397, + "completion_length": 2142.250015258789, + "epoch": 0.32571428571428573, + "grad_norm": 0.9030725359916687, + "kl": 0.224151611328125, + "lambda_div_used": 0.5, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0476, + "reward": 0.09167552087455988, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09167552087455988, + "reward_after_std": 0.8351321667432785, + "reward_before_mean": 0.7201039753854275, + "reward_before_std": 0.7814724817872047, + "reward_change_max": 0.001250341534614563, + "reward_change_mean": -0.6284284666180611, + "reward_change_min": -1.201073795557022, + "reward_change_std": 0.4616069979965687, + "reward_std": 0.8351322039961815, + "rewards/cosine_scaled_reward": -0.02536469604820013, + "rewards/format_reward": 0.7708333544433117, + "step": 285 + }, + { + "advantage_max": 1.4558663815259933, + "advantage_mean": 1.0554989632316492e-08, + "advantage_min": -0.6718570664525032, + "advantage_std": 0.7702252045273781, + "completion_length": 2493.041732788086, + "epoch": 0.32685714285714285, + "grad_norm": 0.7895695567131042, + "kl": 0.38580322265625, + "lambda_div_used": 0.5, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0287, + "reward": 0.006931816227734089, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.006931816227734089, + "reward_after_std": 0.7702252045273781, + "reward_before_mean": 0.5846692929044366, + "reward_before_std": 0.7240047045052052, + "reward_change_max": 5.987286567687988e-05, + "reward_change_mean": -0.5777374971657991, + "reward_change_min": -1.0274383313953876, + "reward_change_std": 0.4023998789489269, + "reward_std": 0.7702252194285393, + "rewards/cosine_scaled_reward": -0.07224868983030319, + "rewards/format_reward": 0.7291666846722364, + "step": 286 + }, + { + "advantage_max": 1.680905520915985, + "advantage_mean": -4.346172144398253e-09, + "advantage_min": -0.6960576623678207, + "advantage_std": 0.8782125003635883, + "completion_length": 1915.2291870117188, + "epoch": 0.328, + "grad_norm": 0.5730679035186768, + "kl": 0.2940673828125, + "lambda_div_used": 0.5, + "learning_rate": 5.123449705004581e-07, + "loss": 0.023, + "reward": 0.05262707732617855, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05262707732617855, + "reward_after_std": 0.8782124929130077, + "reward_before_mean": 0.6271408479660749, + "reward_before_std": 0.7979765832424164, + "reward_change_max": 0.0, + "reward_change_mean": -0.5745137967169285, + "reward_change_min": -1.0377584993839264, + "reward_change_std": 0.40095450915396214, + "reward_std": 0.8782125115394592, + "rewards/cosine_scaled_reward": -0.09267958626151085, + "rewards/format_reward": 0.8125000204890966, + "step": 287 + }, + { + "advantage_max": 1.4794694632291794, + "advantage_mean": -3.725290742551124e-09, + "advantage_min": -0.5603340975940228, + "advantage_std": 0.754909448325634, + "completion_length": 2463.041748046875, + "epoch": 0.3291428571428571, + "grad_norm": 1.0325500965118408, + "kl": 0.26513671875, + "lambda_div_used": 0.5, + "learning_rate": 5.09215338910999e-07, + "loss": -0.0066, + "reward": 0.044862196780741215, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.044862196780741215, + "reward_after_std": 0.7549094632267952, + "reward_before_mean": 0.6544077503494918, + "reward_before_std": 0.5984827503561974, + "reward_change_max": 0.0005664229393005371, + "reward_change_mean": -0.6095455586910248, + "reward_change_min": -0.9598565027117729, + "reward_change_std": 0.36767209880053997, + "reward_std": 0.7549095004796982, + "rewards/cosine_scaled_reward": -0.037379464134573936, + "rewards/format_reward": 0.7291666846722364, + "step": 288 + }, + { + "advantage_max": 1.2859587520360947, + "advantage_mean": -1.1175871061919196e-08, + "advantage_min": -0.48796913772821426, + "advantage_std": 0.6554525531828403, + "completion_length": 1999.9792137145996, + "epoch": 0.3302857142857143, + "grad_norm": 0.9590998291969299, + "kl": 0.24828338623046875, + "lambda_div_used": 0.5, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0059, + "reward": 0.0213075689971447, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0213075689971447, + "reward_after_std": 0.6554525531828403, + "reward_before_mean": 0.6397494054399431, + "reward_before_std": 0.4752440471202135, + "reward_change_max": 0.0, + "reward_change_mean": -0.6184418424963951, + "reward_change_min": -0.952071838080883, + "reward_change_std": 0.35505982115864754, + "reward_std": 0.6554525941610336, + "rewards/cosine_scaled_reward": -0.09679198311641812, + "rewards/format_reward": 0.8333333395421505, + "step": 289 + }, + { + "advantage_max": 1.5296735242009163, + "advantage_mean": 0.0, + "advantage_min": -0.6690849587321281, + "advantage_std": 0.7993335947394371, + "completion_length": 2463.6459045410156, + "epoch": 0.3314285714285714, + "grad_norm": 0.539526641368866, + "kl": 0.4915771484375, + "lambda_div_used": 0.5, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0596, + "reward": -0.0028451760299503803, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0028451760299503803, + "reward_after_std": 0.7993335947394371, + "reward_before_mean": 0.5566588691435754, + "reward_before_std": 0.7177799604833126, + "reward_change_max": 0.0003264695405960083, + "reward_change_mean": -0.5595040284097195, + "reward_change_min": -0.9658835083246231, + "reward_change_std": 0.3854859843850136, + "reward_std": 0.7993336021900177, + "rewards/cosine_scaled_reward": -0.12792058615013957, + "rewards/format_reward": 0.8125000149011612, + "step": 290 + }, + { + "advantage_max": 1.5921382904052734, + "advantage_mean": -4.346172144398253e-09, + "advantage_min": -0.7556582726538181, + "advantage_std": 0.8591411933302879, + "completion_length": 2490.354232788086, + "epoch": 0.3325714285714286, + "grad_norm": 0.6908680200576782, + "kl": 0.317626953125, + "lambda_div_used": 0.5, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0146, + "reward": 0.07940021844115108, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07940021844115108, + "reward_after_std": 0.8591412007808685, + "reward_before_mean": 0.6981133483350277, + "reward_before_std": 0.8550258204340935, + "reward_change_max": 0.0007056146860122681, + "reward_change_mean": -0.6187131479382515, + "reward_change_min": -1.133220985531807, + "reward_change_std": 0.4543332364410162, + "reward_std": 0.8591412231326103, + "rewards/cosine_scaled_reward": -0.046776650473475456, + "rewards/format_reward": 0.7916666828095913, + "step": 291 + }, + { + "advantage_max": 1.5652444809675217, + "advantage_mean": 4.346172199909404e-09, + "advantage_min": -0.5440915711224079, + "advantage_std": 0.7995546236634254, + "completion_length": 2738.916732788086, + "epoch": 0.33371428571428574, + "grad_norm": 0.6306151151657104, + "kl": 0.31866455078125, + "lambda_div_used": 0.5, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0131, + "reward": -0.05529059190303087, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05529059190303087, + "reward_after_std": 0.799554631114006, + "reward_before_mean": 0.45519457198679447, + "reward_before_std": 0.6706855967640877, + "reward_change_max": 0.0017582103610038757, + "reward_change_mean": -0.5104851890355349, + "reward_change_min": -0.9267382770776749, + "reward_change_std": 0.3522724714130163, + "reward_std": 0.799554668366909, + "rewards/cosine_scaled_reward": -0.10573606146499515, + "rewards/format_reward": 0.6666666772216558, + "step": 292 + }, + { + "advantage_max": 1.276260830461979, + "advantage_mean": 6.829698917520943e-09, + "advantage_min": -0.570405226200819, + "advantage_std": 0.6685235388576984, + "completion_length": 2273.875030517578, + "epoch": 0.33485714285714285, + "grad_norm": 0.9914068579673767, + "kl": 0.29083251953125, + "lambda_div_used": 0.5, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0091, + "reward": 0.055066212080419064, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.055066212080419064, + "reward_after_std": 0.6685235537588596, + "reward_before_mean": 0.7027740270714276, + "reward_before_std": 0.5311368498951197, + "reward_change_max": 0.001424834132194519, + "reward_change_mean": -0.6477077975869179, + "reward_change_min": -1.0224635303020477, + "reward_change_std": 0.40102437511086464, + "reward_std": 0.6685235574841499, + "rewards/cosine_scaled_reward": -0.08611301146447659, + "rewards/format_reward": 0.8750000074505806, + "step": 293 + }, + { + "advantage_max": 1.177469477057457, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.5789675116539001, + "advantage_std": 0.6299779340624809, + "completion_length": 3101.604278564453, + "epoch": 0.336, + "grad_norm": 1.2353851795196533, + "kl": 0.46026611328125, + "lambda_div_used": 0.5, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0339, + "reward": -0.12046112306416035, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12046112306416035, + "reward_after_std": 0.6299779377877712, + "reward_before_mean": 0.40089414454996586, + "reward_before_std": 0.588392173871398, + "reward_change_max": 0.000742591917514801, + "reward_change_mean": -0.5213552713394165, + "reward_change_min": -0.9144763983786106, + "reward_change_std": 0.35968529619276524, + "reward_std": 0.6299779377877712, + "rewards/cosine_scaled_reward": -0.08080292865633965, + "rewards/format_reward": 0.5625000111758709, + "step": 294 + }, + { + "advantage_max": 1.599842369556427, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.678341705352068, + "advantage_std": 0.8257317095994949, + "completion_length": 2566.4583587646484, + "epoch": 0.33714285714285713, + "grad_norm": 1.0961662530899048, + "kl": 0.3182373046875, + "lambda_div_used": 0.5, + "learning_rate": 4.873721045679706e-07, + "loss": -0.0227, + "reward": 0.08157170051708817, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08157170051708817, + "reward_after_std": 0.8257317095994949, + "reward_before_mean": 0.7002139764372259, + "reward_before_std": 0.697816614061594, + "reward_change_max": 0.000387534499168396, + "reward_change_mean": -0.6186422556638718, + "reward_change_min": -1.0723202005028725, + "reward_change_std": 0.3978155329823494, + "reward_std": 0.8257317095994949, + "rewards/cosine_scaled_reward": 0.05844030901789665, + "rewards/format_reward": 0.5833333395421505, + "step": 295 + }, + { + "advantage_max": 1.5283400043845177, + "advantage_mean": -2.1730860721991263e-09, + "advantage_min": -0.7080317139625549, + "advantage_std": 0.8136551566421986, + "completion_length": 3081.4166870117188, + "epoch": 0.3382857142857143, + "grad_norm": 0.6585099697113037, + "kl": 0.266845703125, + "lambda_div_used": 0.5, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0465, + "reward": -0.008985697524622083, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.008985697524622083, + "reward_after_std": 0.81365517526865, + "reward_before_mean": 0.5441303681582212, + "reward_before_std": 0.784828394651413, + "reward_change_max": 0.000543445348739624, + "reward_change_mean": -0.5531160607933998, + "reward_change_min": -1.0541326105594635, + "reward_change_std": 0.4212801605463028, + "reward_std": 0.8136552199721336, + "rewards/cosine_scaled_reward": -0.07168482430279255, + "rewards/format_reward": 0.6875000111758709, + "step": 296 + }, + { + "advantage_max": 1.088131882250309, + "advantage_mean": 1.4901161637936866e-08, + "advantage_min": -0.4907407984137535, + "advantage_std": 0.5779940336942673, + "completion_length": 3145.0625610351562, + "epoch": 0.3394285714285714, + "grad_norm": 0.2617074251174927, + "kl": 0.2529296875, + "lambda_div_used": 0.5, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0091, + "reward": -0.3666674308478832, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3666674308478832, + "reward_after_std": 0.5779940262436867, + "reward_before_mean": -0.03471547598019242, + "reward_before_std": 0.5734328739345074, + "reward_change_max": 0.0015069320797920227, + "reward_change_mean": -0.33195194229483604, + "reward_change_min": -0.6547215916216373, + "reward_change_std": 0.2700279410928488, + "reward_std": 0.5779940336942673, + "rewards/cosine_scaled_reward": -0.1527744084596634, + "rewards/format_reward": 0.27083333767950535, + "step": 297 + }, + { + "advantage_max": 1.5531343445181847, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -0.6577443964779377, + "advantage_std": 0.8128904439508915, + "completion_length": 2198.916748046875, + "epoch": 0.3405714285714286, + "grad_norm": 0.2890186011791229, + "kl": 0.135406494140625, + "lambda_div_used": 0.5, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0164, + "reward": 0.01897238101810217, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.01897238101810217, + "reward_after_std": 0.8128904215991497, + "reward_before_mean": 0.5919715128839016, + "reward_before_std": 0.7551921270787716, + "reward_change_max": 0.005190655589103699, + "reward_change_mean": -0.5729991532862186, + "reward_change_min": -1.0509915724396706, + "reward_change_std": 0.4005443025380373, + "reward_std": 0.8128904551267624, + "rewards/cosine_scaled_reward": -0.05818091053515673, + "rewards/format_reward": 0.7083333525806665, + "step": 298 + }, + { + "advantage_max": 1.967075452208519, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.8121235743165016, + "advantage_std": 1.0208548679947853, + "completion_length": 2885.604217529297, + "epoch": 0.3417142857142857, + "grad_norm": 0.44406768679618835, + "kl": 0.1641845703125, + "lambda_div_used": 0.5, + "learning_rate": 4.749540639777539e-07, + "loss": -0.0042, + "reward": 0.03758762776851654, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03758762776851654, + "reward_after_std": 1.0208548456430435, + "reward_before_mean": 0.5578274028375745, + "reward_before_std": 0.9705353602766991, + "reward_change_max": 0.0003610551357269287, + "reward_change_mean": -0.5202397517859936, + "reward_change_min": -1.0511676035821438, + "reward_change_std": 0.4258074313402176, + "reward_std": 1.0208548977971077, + "rewards/cosine_scaled_reward": 0.02891370188444853, + "rewards/format_reward": 0.5000000167638063, + "step": 299 + }, + { + "advantage_max": 1.2897720709443092, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.5727378912270069, + "advantage_std": 0.678062092512846, + "completion_length": 2925.1875610351562, + "epoch": 0.34285714285714286, + "grad_norm": 0.2789536714553833, + "kl": 0.170654296875, + "lambda_div_used": 0.5, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0069, + "reward": -0.25556246004998684, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.25556246004998684, + "reward_after_std": 0.6780621074140072, + "reward_before_mean": 0.1323627531528473, + "reward_before_std": 0.6543583087623119, + "reward_change_max": 0.0016498491168022156, + "reward_change_mean": -0.3879252327606082, + "reward_change_min": -0.8045506216585636, + "reward_change_std": 0.31338599789887667, + "reward_std": 0.6780621185898781, + "rewards/cosine_scaled_reward": -0.18381863087415695, + "rewards/format_reward": 0.5000000074505806, + "step": 300 + }, + { + "advantage_max": 1.3899911418557167, + "advantage_mean": 2.483526884144993e-09, + "advantage_min": -0.5897494703531265, + "advantage_std": 0.742620075121522, + "completion_length": 2620.125030517578, + "epoch": 0.344, + "grad_norm": 0.49501293897628784, + "kl": 0.1964111328125, + "lambda_div_used": 0.5, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0368, + "reward": -0.05963777285069227, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05963777285069227, + "reward_after_std": 0.7426200993359089, + "reward_before_mean": 0.4776543521657004, + "reward_before_std": 0.7033826969563961, + "reward_change_max": 0.0021210387349128723, + "reward_change_mean": -0.5372921079397202, + "reward_change_min": -1.0695495195686817, + "reward_change_std": 0.414053525775671, + "reward_std": 0.7426201142370701, + "rewards/cosine_scaled_reward": -0.04242282547056675, + "rewards/format_reward": 0.5625000037252903, + "step": 301 + }, + { + "advantage_max": 1.5067075043916702, + "advantage_mean": 1.1175871339474952e-08, + "advantage_min": -0.6437755972146988, + "advantage_std": 0.7938609048724174, + "completion_length": 2604.479248046875, + "epoch": 0.34514285714285714, + "grad_norm": 0.6377979516983032, + "kl": 0.15997314453125, + "lambda_div_used": 0.5, + "learning_rate": 4.656784084364238e-07, + "loss": -0.0173, + "reward": -0.1051476038992405, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1051476038992405, + "reward_after_std": 0.7938608899712563, + "reward_before_mean": 0.3738048989325762, + "reward_before_std": 0.768853772431612, + "reward_change_max": 0.002476900815963745, + "reward_change_mean": -0.4789524972438812, + "reward_change_min": -0.9946075230836868, + "reward_change_std": 0.3719355911016464, + "reward_std": 0.7938609048724174, + "rewards/cosine_scaled_reward": -0.08393089659512043, + "rewards/format_reward": 0.5416666753590107, + "step": 302 + }, + { + "advantage_max": 1.6140480786561966, + "advantage_mean": -1.924733383784627e-08, + "advantage_min": -0.6462118700146675, + "advantage_std": 0.8338347300887108, + "completion_length": 2591.8542098999023, + "epoch": 0.3462857142857143, + "grad_norm": 0.371255487203598, + "kl": 0.1746826171875, + "lambda_div_used": 0.5, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0434, + "reward": -0.06317769235465676, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06317769235465676, + "reward_after_std": 0.8338347561657429, + "reward_before_mean": 0.42486920207738876, + "reward_before_std": 0.7611415684223175, + "reward_change_max": 0.0010538250207901, + "reward_change_mean": -0.4880469013005495, + "reward_change_min": -0.8825994059443474, + "reward_change_std": 0.3486558496952057, + "reward_std": 0.8338347896933556, + "rewards/cosine_scaled_reward": -0.12089874129742384, + "rewards/format_reward": 0.6666666734963655, + "step": 303 + }, + { + "advantage_max": 1.1657679006457329, + "advantage_mean": 5.587935225648266e-09, + "advantage_min": -0.6645041145384312, + "advantage_std": 0.6425657123327255, + "completion_length": 2899.854232788086, + "epoch": 0.3474285714285714, + "grad_norm": 0.6334242820739746, + "kl": 0.21673583984375, + "lambda_div_used": 0.5, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0462, + "reward": -0.1580784060060978, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1580784060060978, + "reward_after_std": 0.6425657048821449, + "reward_before_mean": 0.3347341902554035, + "reward_before_std": 0.6689680777490139, + "reward_change_max": 0.0011926591396331787, + "reward_change_mean": -0.4928125822916627, + "reward_change_min": -0.8947502039372921, + "reward_change_std": 0.3764376938343048, + "reward_std": 0.6425657123327255, + "rewards/cosine_scaled_reward": -0.12429956905543804, + "rewards/format_reward": 0.5833333525806665, + "step": 304 + }, + { + "advantage_max": 1.5461205169558525, + "advantage_mean": 1.4280279847511679e-08, + "advantage_min": -0.6990028731524944, + "advantage_std": 0.8203661367297173, + "completion_length": 3038.5625762939453, + "epoch": 0.3485714285714286, + "grad_norm": 0.5466167330741882, + "kl": 0.18798828125, + "lambda_div_used": 0.5, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0264, + "reward": -0.13418531976640224, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13418531976640224, + "reward_after_std": 0.820366133004427, + "reward_before_mean": 0.31502748280763626, + "reward_before_std": 0.8251273334026337, + "reward_change_max": 0.0004555061459541321, + "reward_change_mean": -0.44921278581023216, + "reward_change_min": -0.9437504261732101, + "reward_change_std": 0.3788600452244282, + "reward_std": 0.8203661553561687, + "rewards/cosine_scaled_reward": -0.10290294280275702, + "rewards/format_reward": 0.5208333469927311, + "step": 305 + }, + { + "advantage_max": 1.7352852076292038, + "advantage_mean": -1.8626450382086546e-09, + "advantage_min": -0.6884033642709255, + "advantage_std": 0.8893226571381092, + "completion_length": 2894.916717529297, + "epoch": 0.3497142857142857, + "grad_norm": 0.5606780052185059, + "kl": 0.16204833984375, + "lambda_div_used": 0.5, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0152, + "reward": 0.17971285339444876, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17971285339444876, + "reward_after_std": 0.8893226645886898, + "reward_before_mean": 0.8592891084699659, + "reward_before_std": 0.7582590952515602, + "reward_change_max": 0.0, + "reward_change_mean": -0.6795762628316879, + "reward_change_min": -1.081557810306549, + "reward_change_std": 0.4216056726872921, + "reward_std": 0.8893226906657219, + "rewards/cosine_scaled_reward": 0.08589455112814903, + "rewards/format_reward": 0.6875000074505806, + "step": 306 + }, + { + "advantage_max": 1.7301098704338074, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.7147096432745457, + "advantage_std": 0.8947756588459015, + "completion_length": 2657.187530517578, + "epoch": 0.35085714285714287, + "grad_norm": 0.34765297174453735, + "kl": 0.2095947265625, + "lambda_div_used": 0.5, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0495, + "reward": 0.019154822453856468, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.019154822453856468, + "reward_after_std": 0.8947756588459015, + "reward_before_mean": 0.5600891445064917, + "reward_before_std": 0.8151229023933411, + "reward_change_max": 0.0, + "reward_change_mean": -0.5409342758357525, + "reward_change_min": -1.0163558684289455, + "reward_change_std": 0.38923518545925617, + "reward_std": 0.8947756960988045, + "rewards/cosine_scaled_reward": -0.022038788767531514, + "rewards/format_reward": 0.6041666679084301, + "step": 307 + }, + { + "advantage_max": 1.384334035217762, + "advantage_mean": 4.9670538238011375e-09, + "advantage_min": -0.6601945385336876, + "advantage_std": 0.7446734458208084, + "completion_length": 3214.0834045410156, + "epoch": 0.352, + "grad_norm": 0.5936756134033203, + "kl": 0.29541015625, + "lambda_div_used": 0.5, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0293, + "reward": -0.14771428517997265, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14771428517997265, + "reward_after_std": 0.7446734383702278, + "reward_before_mean": 0.31862270552664995, + "reward_before_std": 0.7598531804978848, + "reward_change_max": 0.0, + "reward_change_mean": -0.46633700653910637, + "reward_change_min": -0.9204640761017799, + "reward_change_std": 0.36674703285098076, + "reward_std": 0.744673453271389, + "rewards/cosine_scaled_reward": -0.14277198538184166, + "rewards/format_reward": 0.6041666809469461, + "step": 308 + }, + { + "advantage_max": 1.5429324805736542, + "advantage_mean": -2.483527050678447e-09, + "advantage_min": -0.635457769036293, + "advantage_std": 0.808469258248806, + "completion_length": 2883.6875610351562, + "epoch": 0.35314285714285715, + "grad_norm": 0.34315183758735657, + "kl": 0.21173095703125, + "lambda_div_used": 0.5, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0044, + "reward": 0.039851417765021324, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.039851417765021324, + "reward_after_std": 0.8084692545235157, + "reward_before_mean": 0.6314571984112263, + "reward_before_std": 0.7366010136902332, + "reward_change_max": 0.0005460754036903381, + "reward_change_mean": -0.5916057825088501, + "reward_change_min": -1.1304726675152779, + "reward_change_std": 0.40859665535390377, + "reward_std": 0.8084692656993866, + "rewards/cosine_scaled_reward": -0.05927141313441098, + "rewards/format_reward": 0.7500000111758709, + "step": 309 + }, + { + "advantage_max": 1.740215465426445, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.6650184690952301, + "advantage_std": 0.8944349363446236, + "completion_length": 2405.479217529297, + "epoch": 0.35428571428571426, + "grad_norm": 0.8915708661079407, + "kl": 0.242462158203125, + "lambda_div_used": 0.5, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0609, + "reward": -0.012727348133921623, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.012727348133921623, + "reward_after_std": 0.8944349437952042, + "reward_before_mean": 0.4980372078716755, + "reward_before_std": 0.8056224025785923, + "reward_change_max": 0.0, + "reward_change_mean": -0.5107645392417908, + "reward_change_min": -0.9406066909432411, + "reward_change_std": 0.35088925808668137, + "reward_std": 0.894434966146946, + "rewards/cosine_scaled_reward": -0.11556475143879652, + "rewards/format_reward": 0.7291666734963655, + "step": 310 + }, + { + "advantage_max": 1.7507240772247314, + "advantage_mean": -1.8626452047421083e-08, + "advantage_min": -0.8782271668314934, + "advantage_std": 0.9409815222024918, + "completion_length": 2372.916717529297, + "epoch": 0.3554285714285714, + "grad_norm": 0.933577835559845, + "kl": 0.16912841796875, + "lambda_div_used": 0.5, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.017, + "reward": 0.17776218801736832, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17776218801736832, + "reward_after_std": 0.9409815445542336, + "reward_before_mean": 0.8487966532702558, + "reward_before_std": 0.9439231902360916, + "reward_change_max": 0.0004132986068725586, + "reward_change_mean": -0.6710344441235065, + "reward_change_min": -1.344845950603485, + "reward_change_std": 0.5198050625622272, + "reward_std": 0.9409816116094589, + "rewards/cosine_scaled_reward": 0.08064829930663109, + "rewards/format_reward": 0.6875000186264515, + "step": 311 + }, + { + "advantage_max": 1.6873877942562103, + "advantage_mean": -3.725290076417309e-09, + "advantage_min": -0.6372330226004124, + "advantage_std": 0.8659485727548599, + "completion_length": 2212.4792098999023, + "epoch": 0.3565714285714286, + "grad_norm": 0.9566351771354675, + "kl": 0.2457275390625, + "lambda_div_used": 0.5, + "learning_rate": 4.350494089288943e-07, + "loss": -0.0267, + "reward": 0.2527286000549793, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2527286000549793, + "reward_after_std": 0.8659485355019569, + "reward_before_mean": 0.9952520411461592, + "reward_before_std": 0.6744127124547958, + "reward_change_max": 0.0006193891167640686, + "reward_change_mean": -0.7425234168767929, + "reward_change_min": -1.1924034729599953, + "reward_change_std": 0.4576085638254881, + "reward_std": 0.8659485578536987, + "rewards/cosine_scaled_reward": 0.18512600846588612, + "rewards/format_reward": 0.6250000018626451, + "step": 312 + }, + { + "advantage_max": 1.463294543325901, + "advantage_mean": -1.4901161526914564e-08, + "advantage_min": -0.5900123342871666, + "advantage_std": 0.7641986832022667, + "completion_length": 2953.7083587646484, + "epoch": 0.3577142857142857, + "grad_norm": 89.38374328613281, + "kl": 2.8055419921875, + "lambda_div_used": 0.5, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0373, + "reward": 0.10185758583247662, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10185758583247662, + "reward_after_std": 0.7641987055540085, + "reward_before_mean": 0.7619487410411239, + "reward_before_std": 0.6133890030905604, + "reward_change_max": 0.0006252899765968323, + "reward_change_mean": -0.6600911617279053, + "reward_change_min": -1.149497613310814, + "reward_change_std": 0.4455935023725033, + "reward_std": 0.7641987279057503, + "rewards/cosine_scaled_reward": 0.05805770156439394, + "rewards/format_reward": 0.645833345130086, + "step": 313 + }, + { + "advantage_max": 1.4923174902796745, + "advantage_mean": -4.34617203337595e-09, + "advantage_min": -0.5729423686861992, + "advantage_std": 0.7636104431003332, + "completion_length": 2375.666717529297, + "epoch": 0.3588571428571429, + "grad_norm": 0.924056351184845, + "kl": 0.2772216796875, + "lambda_div_used": 0.5, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0017, + "reward": 0.1338311405852437, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1338311405852437, + "reward_after_std": 0.7636104375123978, + "reward_before_mean": 0.8161500915884972, + "reward_before_std": 0.5966301336884499, + "reward_change_max": 0.0, + "reward_change_mean": -0.682318925857544, + "reward_change_min": -1.1118975020945072, + "reward_change_std": 0.397566681727767, + "reward_std": 0.7636104635894299, + "rewards/cosine_scaled_reward": 0.022658362751826644, + "rewards/format_reward": 0.7708333358168602, + "step": 314 + }, + { + "advantage_max": 1.401157207787037, + "advantage_mean": 1.3659398390153399e-08, + "advantage_min": -0.5785900503396988, + "advantage_std": 0.7340905368328094, + "completion_length": 2862.1666870117188, + "epoch": 0.36, + "grad_norm": 0.5284225940704346, + "kl": 0.3608856201171875, + "lambda_div_used": 0.5, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0091, + "reward": -0.15376823209226131, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15376823209226131, + "reward_after_std": 0.7340905666351318, + "reward_before_mean": 0.2963081202469766, + "reward_before_std": 0.6793590821325779, + "reward_change_max": 0.001107342541217804, + "reward_change_mean": -0.4500763714313507, + "reward_change_min": -0.8281183242797852, + "reward_change_std": 0.3382910368964076, + "reward_std": 0.7340905852615833, + "rewards/cosine_scaled_reward": -0.09142925776541233, + "rewards/format_reward": 0.47916667722165585, + "step": 315 + }, + { + "advantage_max": 1.1306168586015701, + "advantage_mean": 1.1796752796833232e-08, + "advantage_min": -0.48234958946704865, + "advantage_std": 0.5896905846893787, + "completion_length": 3272.604217529297, + "epoch": 0.36114285714285715, + "grad_norm": 0.6392146348953247, + "kl": 0.358642578125, + "lambda_div_used": 0.5, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0484, + "reward": -0.3114443914964795, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3114443914964795, + "reward_after_std": 0.589690588414669, + "reward_before_mean": 0.05901812016963959, + "reward_before_std": 0.5436771884560585, + "reward_change_max": 0.0008131638169288635, + "reward_change_mean": -0.37046249210834503, + "reward_change_min": -0.6907815337181091, + "reward_change_std": 0.2691768379881978, + "reward_std": 0.5896905958652496, + "rewards/cosine_scaled_reward": -0.23090762086212635, + "rewards/format_reward": 0.5208333414047956, + "step": 316 + }, + { + "advantage_max": 1.5040301159024239, + "advantage_mean": -1.241763458725842e-08, + "advantage_min": -0.5454112328588963, + "advantage_std": 0.7728907950222492, + "completion_length": 2862.3959197998047, + "epoch": 0.36228571428571427, + "grad_norm": 0.7288563251495361, + "kl": 0.33740234375, + "lambda_div_used": 0.5, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0474, + "reward": 0.04084273800253868, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04084273800253868, + "reward_after_std": 0.7728907912969589, + "reward_before_mean": 0.6436011524274363, + "reward_before_std": 0.6424034424126148, + "reward_change_max": 0.0, + "reward_change_mean": -0.6027584280818701, + "reward_change_min": -1.0649547278881073, + "reward_change_std": 0.381022609770298, + "reward_std": 0.7728908061981201, + "rewards/cosine_scaled_reward": -0.03236610069870949, + "rewards/format_reward": 0.7083333488553762, + "step": 317 + }, + { + "advantage_max": 1.589605301618576, + "advantage_mean": 1.2728075593493315e-08, + "advantage_min": -0.6930773742496967, + "advantage_std": 0.8379422500729561, + "completion_length": 2228.4375610351562, + "epoch": 0.36342857142857143, + "grad_norm": 0.26594236493110657, + "kl": 0.27484130859375, + "lambda_div_used": 0.5, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0243, + "reward": 0.030790013261139393, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.030790013261139393, + "reward_after_std": 0.8379422500729561, + "reward_before_mean": 0.6122659891843796, + "reward_before_std": 0.7788247428834438, + "reward_change_max": 0.0004334002733230591, + "reward_change_mean": -0.5814759768545628, + "reward_change_min": -1.061459630727768, + "reward_change_std": 0.4175482243299484, + "reward_std": 0.8379422500729561, + "rewards/cosine_scaled_reward": -0.13136701984331012, + "rewards/format_reward": 0.8750000149011612, + "step": 318 + }, + { + "advantage_max": 1.1976237669587135, + "advantage_mean": 6.674478525425798e-09, + "advantage_min": -0.5388987213373184, + "advantage_std": 0.6269382648169994, + "completion_length": 3055.416702270508, + "epoch": 0.36457142857142855, + "grad_norm": 0.41507601737976074, + "kl": 0.32904052734375, + "lambda_div_used": 0.5, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0276, + "reward": -0.17632382595911622, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.17632382595911622, + "reward_after_std": 0.6269382573664188, + "reward_before_mean": 0.29730080626904964, + "reward_before_std": 0.5578728318214417, + "reward_change_max": 0.0006338581442832947, + "reward_change_mean": -0.4736246280372143, + "reward_change_min": -0.7681119628250599, + "reward_change_std": 0.3120098374783993, + "reward_std": 0.6269382685422897, + "rewards/cosine_scaled_reward": -0.1638496033847332, + "rewards/format_reward": 0.6250000167638063, + "step": 319 + }, + { + "advantage_max": 1.8235188126564026, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.7356794327497482, + "advantage_std": 0.9437028914690018, + "completion_length": 2396.0626220703125, + "epoch": 0.3657142857142857, + "grad_norm": 0.5142975449562073, + "kl": 0.2520751953125, + "lambda_div_used": 0.5, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0213, + "reward": 0.07883075065910816, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07883075065910816, + "reward_after_std": 0.9437028840184212, + "reward_before_mean": 0.6556731648743153, + "reward_before_std": 0.8697749823331833, + "reward_change_max": 0.0, + "reward_change_mean": -0.5768423937261105, + "reward_change_min": -1.047103874385357, + "reward_change_std": 0.40687838755548, + "reward_std": 0.9437028951942921, + "rewards/cosine_scaled_reward": -0.08883010782301426, + "rewards/format_reward": 0.8333333395421505, + "step": 320 + }, + { + "advantage_max": 1.9276015385985374, + "advantage_mean": -1.3038516599728212e-08, + "advantage_min": -0.7980079278349876, + "advantage_std": 0.99143286049366, + "completion_length": 2184.812545776367, + "epoch": 0.3668571428571429, + "grad_norm": 0.793869137763977, + "kl": 0.189727783203125, + "lambda_div_used": 0.5, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0287, + "reward": 0.42022357787936926, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.42022357787936926, + "reward_after_std": 0.9914328753948212, + "reward_before_mean": 1.2628098390996456, + "reward_before_std": 0.8046529665589333, + "reward_change_max": 0.001957610249519348, + "reward_change_mean": -0.8425862416625023, + "reward_change_min": -1.369489625096321, + "reward_change_std": 0.5107485167682171, + "reward_std": 0.9914328753948212, + "rewards/cosine_scaled_reward": 0.18348823045380414, + "rewards/format_reward": 0.8958333507180214, + "step": 321 + }, + { + "advantage_max": 1.6895935460925102, + "advantage_mean": -5.587935503204022e-09, + "advantage_min": -0.6205887608230114, + "advantage_std": 0.8657153844833374, + "completion_length": 2706.1250915527344, + "epoch": 0.368, + "grad_norm": 0.3828519582748413, + "kl": 0.29534912109375, + "lambda_div_used": 0.5, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0165, + "reward": -0.05292603746056557, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05292603746056557, + "reward_after_std": 0.8657153844833374, + "reward_before_mean": 0.4366830997169018, + "reward_before_std": 0.773483332246542, + "reward_change_max": 0.0, + "reward_change_mean": -0.48960915207862854, + "reward_change_min": -0.926708921790123, + "reward_change_std": 0.3389029707759619, + "reward_std": 0.8657154068350792, + "rewards/cosine_scaled_reward": -0.11499179247766733, + "rewards/format_reward": 0.6666666902601719, + "step": 322 + }, + { + "advantage_max": 1.400377780199051, + "advantage_mean": -2.4835271617007493e-09, + "advantage_min": -0.6002854071557522, + "advantage_std": 0.7208580374717712, + "completion_length": 2751.1458892822266, + "epoch": 0.36914285714285716, + "grad_norm": 0.3763584792613983, + "kl": 0.2479248046875, + "lambda_div_used": 0.5, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0295, + "reward": -0.013417241163551807, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.013417241163551807, + "reward_after_std": 0.7208580300211906, + "reward_before_mean": 0.5566643313504755, + "reward_before_std": 0.5935582704842091, + "reward_change_max": 0.0009166598320007324, + "reward_change_mean": -0.5700815692543983, + "reward_change_min": -0.9349125400185585, + "reward_change_std": 0.3692823648452759, + "reward_std": 0.7208580449223518, + "rewards/cosine_scaled_reward": -0.09666784037835896, + "rewards/format_reward": 0.7500000204890966, + "step": 323 + }, + { + "advantage_max": 1.7073202952742577, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.6523040719330311, + "advantage_std": 0.8818818256258965, + "completion_length": 3288.0000915527344, + "epoch": 0.3702857142857143, + "grad_norm": 0.9621612429618835, + "kl": 0.3841552734375, + "lambda_div_used": 0.5, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.07, + "reward": -0.23006662633270025, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23006662633270025, + "reward_after_std": 0.881881844252348, + "reward_before_mean": 0.11532109789550304, + "reward_before_std": 0.8703635148704052, + "reward_change_max": 0.0007134005427360535, + "reward_change_mean": -0.3453877214342356, + "reward_change_min": -0.7478644847869873, + "reward_change_std": 0.2995779123157263, + "reward_std": 0.881881844252348, + "rewards/cosine_scaled_reward": -0.20275612798286602, + "rewards/format_reward": 0.5208333488553762, + "step": 324 + }, + { + "advantage_max": 1.7375487461686134, + "advantage_mean": 1.1796752963366686e-08, + "advantage_min": -0.6599301993846893, + "advantage_std": 0.9019149504601955, + "completion_length": 2948.1250915527344, + "epoch": 0.37142857142857144, + "grad_norm": 0.44344088435173035, + "kl": 0.3438720703125, + "lambda_div_used": 0.5, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0385, + "reward": -0.017678143922239542, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.017678143922239542, + "reward_after_std": 0.9019149504601955, + "reward_before_mean": 0.4983112784102559, + "reward_before_std": 0.8421217501163483, + "reward_change_max": 0.0005606338381767273, + "reward_change_mean": -0.5159894041717052, + "reward_change_min": -1.0590049587190151, + "reward_change_std": 0.38723311573266983, + "reward_std": 0.9019149765372276, + "rewards/cosine_scaled_reward": -0.0945943733677268, + "rewards/format_reward": 0.6875000074505806, + "step": 325 + }, + { + "advantage_max": 1.5786890238523483, + "advantage_mean": 2.2972624136308184e-08, + "advantage_min": -0.6869267821311951, + "advantage_std": 0.8400480523705482, + "completion_length": 2644.041748046875, + "epoch": 0.37257142857142855, + "grad_norm": 0.5172160863876343, + "kl": 0.26190185546875, + "lambda_div_used": 0.5, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0359, + "reward": 0.14027714263647795, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14027714263647795, + "reward_after_std": 0.8400480523705482, + "reward_before_mean": 0.813402040861547, + "reward_before_std": 0.748632000759244, + "reward_change_max": 0.0004016384482383728, + "reward_change_mean": -0.6731249168515205, + "reward_change_min": -1.1637303456664085, + "reward_change_std": 0.4690163619816303, + "reward_std": 0.8400481045246124, + "rewards/cosine_scaled_reward": -0.009965650620870292, + "rewards/format_reward": 0.8333333432674408, + "step": 326 + }, + { + "advantage_max": 2.1002472937107086, + "advantage_mean": -1.8626451825376478e-08, + "advantage_min": -0.9241488240659237, + "advantage_std": 1.1009439006447792, + "completion_length": 2588.14591217041, + "epoch": 0.3737142857142857, + "grad_norm": 0.4935661256313324, + "kl": 0.310272216796875, + "lambda_div_used": 0.5, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0476, + "reward": 0.3117635138332844, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3117635138332844, + "reward_after_std": 1.100943885743618, + "reward_before_mean": 1.0368181890808046, + "reward_before_std": 1.0324792116880417, + "reward_change_max": 0.0, + "reward_change_mean": -0.7250546813011169, + "reward_change_min": -1.3601718544960022, + "reward_change_std": 0.5242071263492107, + "reward_std": 1.1009439453482628, + "rewards/cosine_scaled_reward": 0.09132576221600175, + "rewards/format_reward": 0.8541666939854622, + "step": 327 + }, + { + "advantage_max": 1.6814225018024445, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -0.6006612703204155, + "advantage_std": 0.8554463051259518, + "completion_length": 3272.916748046875, + "epoch": 0.37485714285714283, + "grad_norm": 0.5845516920089722, + "kl": 0.413330078125, + "lambda_div_used": 0.5, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0414, + "reward": -0.14344895351678133, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14344895351678133, + "reward_after_std": 0.8554462976753712, + "reward_before_mean": 0.27615911699831486, + "reward_before_std": 0.7600311264395714, + "reward_change_max": 0.0007801279425621033, + "reward_change_mean": -0.41960807144641876, + "reward_change_min": -0.7604356594383717, + "reward_change_std": 0.3008067738264799, + "reward_std": 0.8554463237524033, + "rewards/cosine_scaled_reward": -0.12233711747103371, + "rewards/format_reward": 0.5208333488553762, + "step": 328 + }, + { + "advantage_max": 1.6130209863185883, + "advantage_mean": -6.829699028543246e-09, + "advantage_min": -0.7729388028383255, + "advantage_std": 0.8566270098090172, + "completion_length": 2137.8750762939453, + "epoch": 0.376, + "grad_norm": 1.0926064252853394, + "kl": 0.299530029296875, + "lambda_div_used": 0.5, + "learning_rate": 3.843439512918949e-07, + "loss": -0.002, + "reward": 0.22257465310394764, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22257465310394764, + "reward_after_std": 0.8566270098090172, + "reward_before_mean": 0.9557426117826253, + "reward_before_std": 0.7563247159123421, + "reward_change_max": 0.0, + "reward_change_mean": -0.7331679500639439, + "reward_change_min": -1.2459847666323185, + "reward_change_std": 0.49254793860018253, + "reward_std": 0.8566270247101784, + "rewards/cosine_scaled_reward": 0.10287128575146198, + "rewards/format_reward": 0.7500000186264515, + "step": 329 + }, + { + "advantage_max": 1.5837075859308243, + "advantage_mean": 1.4901161526914564e-08, + "advantage_min": -0.5107849761843681, + "advantage_std": 0.7936838679015636, + "completion_length": 2223.2083740234375, + "epoch": 0.37714285714285717, + "grad_norm": 0.39597129821777344, + "kl": 0.3514556884765625, + "lambda_div_used": 0.5, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0206, + "reward": -0.12900587869808078, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12900587869808078, + "reward_after_std": 0.7936838828027248, + "reward_before_mean": 0.3146856687963009, + "reward_before_std": 0.6586159784346819, + "reward_change_max": 0.0, + "reward_change_mean": -0.44369154796004295, + "reward_change_min": -0.7083565294742584, + "reward_change_std": 0.2738402709364891, + "reward_std": 0.7936839014291763, + "rewards/cosine_scaled_reward": -0.24890717677772045, + "rewards/format_reward": 0.8125000074505806, + "step": 330 + }, + { + "advantage_max": 1.1777547150850296, + "advantage_mean": 1.179675318541129e-08, + "advantage_min": -0.5273271761834621, + "advantage_std": 0.6165788248181343, + "completion_length": 2718.604217529297, + "epoch": 0.3782857142857143, + "grad_norm": 0.4538213908672333, + "kl": 0.3648681640625, + "lambda_div_used": 0.5, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0276, + "reward": -0.22839653585106134, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22839653585106134, + "reward_after_std": 0.6165788136422634, + "reward_before_mean": 0.20412603858858347, + "reward_before_std": 0.5550496280193329, + "reward_change_max": 0.0007744207978248596, + "reward_change_mean": -0.43252256885170937, + "reward_change_min": -0.7282052636146545, + "reward_change_std": 0.3019270282238722, + "reward_std": 0.6165788173675537, + "rewards/cosine_scaled_reward": -0.1896036472171545, + "rewards/format_reward": 0.5833333432674408, + "step": 331 + }, + { + "advantage_max": 1.5473309606313705, + "advantage_mean": 2.4835262735223296e-09, + "advantage_min": -0.61005724593997, + "advantage_std": 0.7968885004520416, + "completion_length": 2720.729232788086, + "epoch": 0.37942857142857145, + "grad_norm": 0.47542330622673035, + "kl": 0.347320556640625, + "lambda_div_used": 0.5, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0237, + "reward": 0.0141659677028656, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0141659677028656, + "reward_after_std": 0.796888493001461, + "reward_before_mean": 0.5891684554517269, + "reward_before_std": 0.6871049627661705, + "reward_change_max": 0.003196209669113159, + "reward_change_mean": -0.5750024765729904, + "reward_change_min": -1.0247270502150059, + "reward_change_std": 0.3719689790159464, + "reward_std": 0.7968885004520416, + "rewards/cosine_scaled_reward": -0.10124912392348051, + "rewards/format_reward": 0.7916666772216558, + "step": 332 + }, + { + "advantage_max": 1.4445695504546165, + "advantage_mean": -1.528921261817473e-08, + "advantage_min": -0.6396168023347855, + "advantage_std": 0.7692903093993664, + "completion_length": 2287.0833892822266, + "epoch": 0.38057142857142856, + "grad_norm": 0.7400115728378296, + "kl": 0.20025634765625, + "lambda_div_used": 0.5, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0036, + "reward": 0.1459917591419071, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1459917591419071, + "reward_after_std": 0.7692903243005276, + "reward_before_mean": 0.8459194973111153, + "reward_before_std": 0.6958840638399124, + "reward_change_max": 0.0, + "reward_change_mean": -0.6999277397990227, + "reward_change_min": -1.225555181503296, + "reward_change_std": 0.4582208953797817, + "reward_std": 0.7692903392016888, + "rewards/cosine_scaled_reward": -0.06662360485643148, + "rewards/format_reward": 0.9791666716337204, + "step": 333 + }, + { + "advantage_max": 1.1501679047942162, + "advantage_mean": 2.483526884144993e-09, + "advantage_min": -0.5715098641812801, + "advantage_std": 0.6181285083293915, + "completion_length": 3047.8334045410156, + "epoch": 0.38171428571428573, + "grad_norm": 0.9689967036247253, + "kl": 0.4241943359375, + "lambda_div_used": 0.5, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0217, + "reward": -0.1921203788369894, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1921203788369894, + "reward_after_std": 0.6181284599006176, + "reward_before_mean": 0.27629107190296054, + "reward_before_std": 0.5983157828450203, + "reward_change_max": 0.0005147382616996765, + "reward_change_mean": -0.4684114558622241, + "reward_change_min": -0.8724917247891426, + "reward_change_std": 0.3435290567576885, + "reward_std": 0.6181284710764885, + "rewards/cosine_scaled_reward": -0.20560447499155998, + "rewards/format_reward": 0.6875000223517418, + "step": 334 + }, + { + "advantage_max": 1.7195413634181023, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.7100030183792114, + "advantage_std": 0.9040074683725834, + "completion_length": 2427.020866394043, + "epoch": 0.38285714285714284, + "grad_norm": 2.7406511306762695, + "kl": 0.7989501953125, + "lambda_div_used": 0.5, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0033, + "reward": 0.16293947119265795, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16293947119265795, + "reward_after_std": 0.9040074795484543, + "reward_before_mean": 0.8333401568233967, + "reward_before_std": 0.8052664678543806, + "reward_change_max": 7.398426532745361e-05, + "reward_change_mean": -0.6704006977379322, + "reward_change_min": -1.190424356609583, + "reward_change_std": 0.46814507246017456, + "reward_std": 0.9040075056254864, + "rewards/cosine_scaled_reward": 0.031253403052687645, + "rewards/format_reward": 0.7708333432674408, + "step": 335 + }, + { + "advantage_max": 1.7609595283865929, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.7816978171467781, + "advantage_std": 0.9407963380217552, + "completion_length": 2901.854202270508, + "epoch": 0.384, + "grad_norm": 0.7952771186828613, + "kl": 0.3408203125, + "lambda_div_used": 0.5, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0545, + "reward": 0.08905918773962185, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08905918773962185, + "reward_after_std": 0.9407963454723358, + "reward_before_mean": 0.6842526560649276, + "reward_before_std": 0.9304242916405201, + "reward_change_max": 0.0, + "reward_change_mean": -0.5951934605836868, + "reward_change_min": -1.1929472386837006, + "reward_change_std": 0.4584904685616493, + "reward_std": 0.9407963864505291, + "rewards/cosine_scaled_reward": -0.0016236957162618637, + "rewards/format_reward": 0.6875000298023224, + "step": 336 + }, + { + "advantage_max": 1.602539524435997, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.6270337104797363, + "advantage_std": 0.8279041424393654, + "completion_length": 2545.7084045410156, + "epoch": 0.3851428571428571, + "grad_norm": 1.1055015325546265, + "kl": 0.306396484375, + "lambda_div_used": 0.5, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0811, + "reward": -0.06436791177839041, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06436791177839041, + "reward_after_std": 0.8279041349887848, + "reward_before_mean": 0.43131764233112335, + "reward_before_std": 0.756643932312727, + "reward_change_max": 0.0, + "reward_change_mean": -0.4956855494529009, + "reward_change_min": -0.8966120667755604, + "reward_change_std": 0.3453631680458784, + "reward_std": 0.8279041722416878, + "rewards/cosine_scaled_reward": -0.16975786164402962, + "rewards/format_reward": 0.7708333488553762, + "step": 337 + }, + { + "advantage_max": 1.4630458503961563, + "advantage_mean": 6.829699084054397e-09, + "advantage_min": -0.7494680806994438, + "advantage_std": 0.7806403860449791, + "completion_length": 2262.729232788086, + "epoch": 0.3862857142857143, + "grad_norm": 0.7859777808189392, + "kl": 0.29241943359375, + "lambda_div_used": 0.5, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0578, + "reward": 0.06579644477460533, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06579644477460533, + "reward_after_std": 0.7806403860449791, + "reward_before_mean": 0.6918848976492882, + "reward_before_std": 0.73706915974617, + "reward_change_max": 0.0, + "reward_change_mean": -0.6260884515941143, + "reward_change_min": -1.0630837492644787, + "reward_change_std": 0.42817062325775623, + "reward_std": 0.7806403934955597, + "rewards/cosine_scaled_reward": -0.049890896305441856, + "rewards/format_reward": 0.7916666772216558, + "step": 338 + }, + { + "advantage_max": 1.36690903455019, + "advantage_mean": 6.829698806498641e-09, + "advantage_min": -0.5567984506487846, + "advantage_std": 0.7089659981429577, + "completion_length": 3122.5000915527344, + "epoch": 0.38742857142857146, + "grad_norm": 0.6798500418663025, + "kl": 0.3626708984375, + "lambda_div_used": 0.5, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0192, + "reward": -0.16638006269931793, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.16638006269931793, + "reward_after_std": 0.7089660130441189, + "reward_before_mean": 0.2861324343830347, + "reward_before_std": 0.6491104103624821, + "reward_change_max": 0.00011374801397323608, + "reward_change_mean": -0.45251248590648174, + "reward_change_min": -0.8402752205729485, + "reward_change_std": 0.31217301823198795, + "reward_std": 0.7089660204946995, + "rewards/cosine_scaled_reward": -0.1277671225834638, + "rewards/format_reward": 0.5416666772216558, + "step": 339 + }, + { + "advantage_max": 1.7433002442121506, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -0.7869556918740273, + "advantage_std": 0.9177567921578884, + "completion_length": 2737.6458892822266, + "epoch": 0.38857142857142857, + "grad_norm": 0.716688871383667, + "kl": 0.2950439453125, + "lambda_div_used": 0.5, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0255, + "reward": 0.09202966094017029, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09202966094017029, + "reward_after_std": 0.9177567958831787, + "reward_before_mean": 0.6951882378198206, + "reward_before_std": 0.8705759271979332, + "reward_change_max": 0.0, + "reward_change_mean": -0.6031586118042469, + "reward_change_min": -1.0886082351207733, + "reward_change_std": 0.4399815835058689, + "reward_std": 0.917756836861372, + "rewards/cosine_scaled_reward": -0.048239219933748245, + "rewards/format_reward": 0.7916666865348816, + "step": 340 + }, + { + "advantage_max": 1.7033798545598984, + "advantage_mean": -2.5456151409031236e-08, + "advantage_min": -0.7195212692022324, + "advantage_std": 0.8980820775032043, + "completion_length": 2603.354217529297, + "epoch": 0.38971428571428574, + "grad_norm": 0.9158607721328735, + "kl": 0.24871826171875, + "lambda_div_used": 0.5, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0565, + "reward": 0.345290195196867, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.345290195196867, + "reward_after_std": 0.8980820700526237, + "reward_before_mean": 1.1627929043024778, + "reward_before_std": 0.7433993555605412, + "reward_change_max": 0.0, + "reward_change_mean": -0.8175027351826429, + "reward_change_min": -1.348432257771492, + "reward_change_std": 0.5476555228233337, + "reward_std": 0.8980820924043655, + "rewards/cosine_scaled_reward": 0.23764644749462605, + "rewards/format_reward": 0.6875000055879354, + "step": 341 + }, + { + "advantage_max": 1.687784269452095, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.7531973719596863, + "advantage_std": 0.8736945390701294, + "completion_length": 2862.9375915527344, + "epoch": 0.39085714285714285, + "grad_norm": 0.9657043218612671, + "kl": 0.439453125, + "lambda_div_used": 0.5, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0755, + "reward": 0.04618118858593334, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04618118858593334, + "reward_after_std": 0.8736945353448391, + "reward_before_mean": 0.6218621619045734, + "reward_before_std": 0.7942062020301819, + "reward_change_max": 0.0, + "reward_change_mean": -0.5756809897720814, + "reward_change_min": -1.013890691101551, + "reward_change_std": 0.397940494120121, + "reward_std": 0.8736945502460003, + "rewards/cosine_scaled_reward": -0.07448559207841754, + "rewards/format_reward": 0.770833358168602, + "step": 342 + }, + { + "advantage_max": 1.596488393843174, + "advantage_mean": 1.1796752963366686e-08, + "advantage_min": -0.7924398183822632, + "advantage_std": 0.8561305105686188, + "completion_length": 2894.2083740234375, + "epoch": 0.392, + "grad_norm": 0.913608968257904, + "kl": 0.294189453125, + "lambda_div_used": 0.5, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0409, + "reward": 0.14228842593729496, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14228842593729496, + "reward_after_std": 0.8561305180191994, + "reward_before_mean": 0.8087989874184132, + "reward_before_std": 0.8190572299063206, + "reward_change_max": 0.0003446340560913086, + "reward_change_mean": -0.6665105260908604, + "reward_change_min": -1.2105020619928837, + "reward_change_std": 0.48955480568110943, + "reward_std": 0.8561305701732635, + "rewards/cosine_scaled_reward": 0.07106614392250776, + "rewards/format_reward": 0.666666679084301, + "step": 343 + }, + { + "advantage_max": 1.4609468877315521, + "advantage_mean": -2.297262396977473e-08, + "advantage_min": -0.6554704532027245, + "advantage_std": 0.7672437131404877, + "completion_length": 2265.104232788086, + "epoch": 0.3931428571428571, + "grad_norm": 0.38663792610168457, + "kl": 0.217529296875, + "lambda_div_used": 0.5, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0095, + "reward": 0.32306694984436035, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32306694984436035, + "reward_after_std": 0.7672437205910683, + "reward_before_mean": 1.1657235862221569, + "reward_before_std": 0.5834860354661942, + "reward_change_max": 0.0020352303981781006, + "reward_change_mean": -0.8426566086709499, + "reward_change_min": -1.2558316215872765, + "reward_change_std": 0.5019741114228964, + "reward_std": 0.7672437354922295, + "rewards/cosine_scaled_reward": 0.1661950871348381, + "rewards/format_reward": 0.8333333395421505, + "step": 344 + }, + { + "advantage_max": 1.7725291848182678, + "advantage_mean": -1.4280279792000528e-08, + "advantage_min": -0.8514701277017593, + "advantage_std": 0.9321935474872589, + "completion_length": 2641.916778564453, + "epoch": 0.3942857142857143, + "grad_norm": 0.6002000570297241, + "kl": 0.28643798828125, + "lambda_div_used": 0.5, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0045, + "reward": 0.20617245603352785, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20617245603352785, + "reward_after_std": 0.9321935623884201, + "reward_before_mean": 0.9006627351045609, + "reward_before_std": 0.8598495684564114, + "reward_change_max": 0.0, + "reward_change_mean": -0.6944902688264847, + "reward_change_min": -1.2238090112805367, + "reward_change_std": 0.4703991822898388, + "reward_std": 0.9321935623884201, + "rewards/cosine_scaled_reward": 0.07533134613186121, + "rewards/format_reward": 0.7500000223517418, + "step": 345 + }, + { + "advantage_max": 1.2814199030399323, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.6667215526103973, + "advantage_std": 0.6843202896416187, + "completion_length": 2701.104232788086, + "epoch": 0.3954285714285714, + "grad_norm": 0.2635735273361206, + "kl": 0.3524169921875, + "lambda_div_used": 0.5, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0427, + "reward": -0.021729059517383575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.021729059517383575, + "reward_after_std": 0.6843202896416187, + "reward_before_mean": 0.5648492462933064, + "reward_before_std": 0.6315851099789143, + "reward_change_max": 0.0006144046783447266, + "reward_change_mean": -0.5865782834589481, + "reward_change_min": -1.0161477029323578, + "reward_change_std": 0.39074820280075073, + "reward_std": 0.6843203119933605, + "rewards/cosine_scaled_reward": -0.10299206525087357, + "rewards/format_reward": 0.7708333507180214, + "step": 346 + }, + { + "advantage_max": 1.352771744132042, + "advantage_mean": -3.104408563547878e-09, + "advantage_min": -0.5322981774806976, + "advantage_std": 0.6943789683282375, + "completion_length": 2622.1875610351562, + "epoch": 0.3965714285714286, + "grad_norm": 1.1750766038894653, + "kl": 0.292724609375, + "lambda_div_used": 0.5, + "learning_rate": 3.3321084665422803e-07, + "loss": -0.0206, + "reward": 0.014769105706363916, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.014769105706363916, + "reward_after_std": 0.6943789832293987, + "reward_before_mean": 0.6197219397872686, + "reward_before_std": 0.5595865100622177, + "reward_change_max": 0.0, + "reward_change_mean": -0.6049528494477272, + "reward_change_min": -1.0069943517446518, + "reward_change_std": 0.3644682914018631, + "reward_std": 0.6943789906799793, + "rewards/cosine_scaled_reward": -0.12763904221355915, + "rewards/format_reward": 0.8750000149011612, + "step": 347 + }, + { + "advantage_max": 1.5380387529730797, + "advantage_mean": -1.490116224855953e-08, + "advantage_min": -0.6307278983294964, + "advantage_std": 0.7947516813874245, + "completion_length": 2490.7500610351562, + "epoch": 0.3977142857142857, + "grad_norm": 0.3507033586502075, + "kl": 0.315765380859375, + "lambda_div_used": 0.5, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0396, + "reward": 0.14277693210169673, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14277693210169673, + "reward_after_std": 0.7947516664862633, + "reward_before_mean": 0.8233021963387728, + "reward_before_std": 0.6358652543276548, + "reward_change_max": 0.0, + "reward_change_mean": -0.6805252507328987, + "reward_change_min": -1.0959210619330406, + "reward_change_std": 0.4228264205157757, + "reward_std": 0.7947516813874245, + "rewards/cosine_scaled_reward": -0.015432262793183327, + "rewards/format_reward": 0.8541666716337204, + "step": 348 + }, + { + "advantage_max": 1.4082676097750664, + "advantage_mean": 1.490116141589226e-08, + "advantage_min": -0.6084829457104206, + "advantage_std": 0.7429373823106289, + "completion_length": 2923.291717529297, + "epoch": 0.39885714285714285, + "grad_norm": 0.7672387361526489, + "kl": 0.4090576171875, + "lambda_div_used": 0.5, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0578, + "reward": -0.14835473091807216, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14835473091807216, + "reward_after_std": 0.7429374102503061, + "reward_before_mean": 0.3137938645668328, + "reward_before_std": 0.7114957068115473, + "reward_change_max": 0.00020164251327514648, + "reward_change_mean": -0.46214855602011085, + "reward_change_min": -0.9401774033904076, + "reward_change_std": 0.35716398153454065, + "reward_std": 0.7429374195635319, + "rewards/cosine_scaled_reward": -0.14518641866743565, + "rewards/format_reward": 0.6041666828095913, + "step": 349 + }, + { + "advantage_max": 1.752659372985363, + "advantage_mean": 1.4280279847511679e-08, + "advantage_min": -0.6503000631928444, + "advantage_std": 0.8873385712504387, + "completion_length": 2393.0000915527344, + "epoch": 0.4, + "grad_norm": 0.501510500907898, + "kl": 0.3831787109375, + "lambda_div_used": 0.5, + "learning_rate": 3.250000000000001e-07, + "loss": 0.028, + "reward": 0.06549177691340446, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06549177691340446, + "reward_after_std": 0.8873385488986969, + "reward_before_mean": 0.6462145633995533, + "reward_before_std": 0.724578857421875, + "reward_change_max": 0.0005541294813156128, + "reward_change_mean": -0.5807227715849876, + "reward_change_min": -0.9543578177690506, + "reward_change_std": 0.3548112027347088, + "reward_std": 0.8873385824263096, + "rewards/cosine_scaled_reward": -0.08314273924042936, + "rewards/format_reward": 0.8125000111758709, + "step": 350 + }, + { + "advantage_max": 1.7768183425068855, + "advantage_mean": -1.1796752963366686e-08, + "advantage_min": -0.7786560505628586, + "advantage_std": 0.944592297077179, + "completion_length": 2501.250030517578, + "epoch": 0.40114285714285713, + "grad_norm": 0.3541778028011322, + "kl": 0.29620361328125, + "lambda_div_used": 0.5, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0245, + "reward": 0.1836626399308443, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1836626399308443, + "reward_after_std": 0.944592297077179, + "reward_before_mean": 0.8567006252706051, + "reward_before_std": 0.8989718146622181, + "reward_change_max": 0.0011807605624198914, + "reward_change_mean": -0.6730380021035671, + "reward_change_min": -1.1930915638804436, + "reward_change_std": 0.481119092553854, + "reward_std": 0.944592297077179, + "rewards/cosine_scaled_reward": 0.011683644726872444, + "rewards/format_reward": 0.8333333414047956, + "step": 351 + }, + { + "advantage_max": 1.2776615843176842, + "advantage_mean": -9.313225579621331e-09, + "advantage_min": -0.6007811687886715, + "advantage_std": 0.678571205586195, + "completion_length": 2365.375045776367, + "epoch": 0.4022857142857143, + "grad_norm": 0.9657369256019592, + "kl": 0.3677520751953125, + "lambda_div_used": 0.5, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0034, + "reward": -0.07690786942839622, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07690786942839622, + "reward_after_std": 0.6785712130367756, + "reward_before_mean": 0.4603999052196741, + "reward_before_std": 0.6251944825053215, + "reward_change_max": 0.00337374210357666, + "reward_change_mean": -0.5373077914118767, + "reward_change_min": -0.9584220610558987, + "reward_change_std": 0.3818345069885254, + "reward_std": 0.6785712391138077, + "rewards/cosine_scaled_reward": -0.05105004645884037, + "rewards/format_reward": 0.5625000093132257, + "step": 352 + }, + { + "advantage_max": 1.450112447142601, + "advantage_mean": 1.8626452047421083e-09, + "advantage_min": -0.5225486978888512, + "advantage_std": 0.7335505895316601, + "completion_length": 2045.3958854675293, + "epoch": 0.4034285714285714, + "grad_norm": 0.47462838888168335, + "kl": 0.257476806640625, + "lambda_div_used": 0.5, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0158, + "reward": 0.1838024971075356, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1838024971075356, + "reward_after_std": 0.7335505895316601, + "reward_before_mean": 0.9056749492883682, + "reward_before_std": 0.5095305219292641, + "reward_change_max": 0.0010958164930343628, + "reward_change_mean": -0.7218724116683006, + "reward_change_min": -1.0607609003782272, + "reward_change_std": 0.40031613036990166, + "reward_std": 0.7335506342351437, + "rewards/cosine_scaled_reward": 0.015337456949055195, + "rewards/format_reward": 0.8750000074505806, + "step": 353 + }, + { + "advantage_max": 1.4784216433763504, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.6035023629665375, + "advantage_std": 0.7698302268981934, + "completion_length": 2046.0833892822266, + "epoch": 0.4045714285714286, + "grad_norm": 0.22698451578617096, + "kl": 0.18927001953125, + "lambda_div_used": 0.5, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0088, + "reward": 0.19753902312368155, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19753902312368155, + "reward_after_std": 0.7698302119970322, + "reward_before_mean": 0.9317657127976418, + "reward_before_std": 0.6076288931071758, + "reward_change_max": 0.0, + "reward_change_mean": -0.734226655215025, + "reward_change_min": -1.204187534749508, + "reward_change_std": 0.4471469521522522, + "reward_std": 0.7698302567005157, + "rewards/cosine_scaled_reward": 0.03879951499402523, + "rewards/format_reward": 0.8541666716337204, + "step": 354 + }, + { + "advantage_max": 2.0405396223068237, + "advantage_mean": -6.208817904251873e-10, + "advantage_min": -0.8608497157692909, + "advantage_std": 1.0653215050697327, + "completion_length": 2223.6458587646484, + "epoch": 0.4057142857142857, + "grad_norm": 0.48912322521209717, + "kl": 0.2689208984375, + "lambda_div_used": 0.5, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0411, + "reward": 0.24965599924325943, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24965599924325943, + "reward_after_std": 1.0653215199708939, + "reward_before_mean": 0.9300570599734783, + "reward_before_std": 0.9955869130790234, + "reward_change_max": 0.0, + "reward_change_mean": -0.6804010719060898, + "reward_change_min": -1.240929253399372, + "reward_change_std": 0.4811771549284458, + "reward_std": 1.0653215497732162, + "rewards/cosine_scaled_reward": 0.027528513222932816, + "rewards/format_reward": 0.8750000111758709, + "step": 355 + }, + { + "advantage_max": 1.5031840428709984, + "advantage_mean": -1.428027990302283e-08, + "advantage_min": -0.7539886124432087, + "advantage_std": 0.7986224293708801, + "completion_length": 2421.729248046875, + "epoch": 0.40685714285714286, + "grad_norm": 0.6188843250274658, + "kl": 0.25262451171875, + "lambda_div_used": 0.5, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0175, + "reward": 0.19870756931777578, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19870756931777578, + "reward_after_std": 0.7986224070191383, + "reward_before_mean": 0.9287926075048745, + "reward_before_std": 0.713905394077301, + "reward_change_max": 0.0, + "reward_change_mean": -0.7300850711762905, + "reward_change_min": -1.2293099090456963, + "reward_change_std": 0.4734327495098114, + "reward_std": 0.7986224070191383, + "rewards/cosine_scaled_reward": 0.04772963561117649, + "rewards/format_reward": 0.8333333395421505, + "step": 356 + }, + { + "advantage_max": 1.3350956961512566, + "advantage_mean": -1.3659398390153399e-08, + "advantage_min": -0.633977860212326, + "advantage_std": 0.6937114223837852, + "completion_length": 2763.125045776367, + "epoch": 0.408, + "grad_norm": 0.7168110013008118, + "kl": 0.2630615234375, + "lambda_div_used": 0.5, + "learning_rate": 3.062313053727671e-07, + "loss": 0.0174, + "reward": 0.08040890609845519, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08040890609845519, + "reward_after_std": 0.6937114372849464, + "reward_before_mean": 0.7407789751887321, + "reward_before_std": 0.5677717514336109, + "reward_change_max": 0.0, + "reward_change_mean": -0.6603700965642929, + "reward_change_min": -0.9959189742803574, + "reward_change_std": 0.3936575651168823, + "reward_std": 0.6937114521861076, + "rewards/cosine_scaled_reward": -0.08794385753571987, + "rewards/format_reward": 0.9166666865348816, + "step": 357 + }, + { + "advantage_max": 1.592109739780426, + "advantage_mean": 4.967053712778835e-09, + "advantage_min": -0.716572854667902, + "advantage_std": 0.8424624130129814, + "completion_length": 1910.2292022705078, + "epoch": 0.40914285714285714, + "grad_norm": 0.42959484457969666, + "kl": 0.201019287109375, + "lambda_div_used": 0.5, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0346, + "reward": 0.24396879551932216, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24396879551932216, + "reward_after_std": 0.8424624279141426, + "reward_before_mean": 0.9911262951791286, + "reward_before_std": 0.7452986799180508, + "reward_change_max": 0.0, + "reward_change_mean": -0.7471575364470482, + "reward_change_min": -1.3087777346372604, + "reward_change_std": 0.48442544788122177, + "reward_std": 0.8424624502658844, + "rewards/cosine_scaled_reward": 0.11014649923890829, + "rewards/format_reward": 0.770833345130086, + "step": 358 + }, + { + "advantage_max": 1.335417702794075, + "advantage_mean": 6.829698862009792e-09, + "advantage_min": -0.6110520102083683, + "advantage_std": 0.7061552852392197, + "completion_length": 2318.479248046875, + "epoch": 0.4102857142857143, + "grad_norm": 1.0732530355453491, + "kl": 0.234130859375, + "lambda_div_used": 0.5, + "learning_rate": 3.0097380284049523e-07, + "loss": -0.0101, + "reward": 0.0727979297953425, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0727979297953425, + "reward_after_std": 0.70615528896451, + "reward_before_mean": 0.7263930886983871, + "reward_before_std": 0.6077985875308514, + "reward_change_max": 0.0003136545419692993, + "reward_change_mean": -0.6535951718688011, + "reward_change_min": -1.060469426214695, + "reward_change_std": 0.41288536973297596, + "reward_std": 0.70615528896451, + "rewards/cosine_scaled_reward": -0.07430345751345158, + "rewards/format_reward": 0.8750000149011612, + "step": 359 + }, + { + "advantage_max": 1.9765265434980392, + "advantage_mean": -2.1109978876054925e-08, + "advantage_min": -0.9272864870727062, + "advantage_std": 1.044951420277357, + "completion_length": 2600.2291870117188, + "epoch": 0.4114285714285714, + "grad_norm": 0.5990117192268372, + "kl": 0.23577880859375, + "lambda_div_used": 0.5, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.024, + "reward": 0.38619683496654034, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38619683496654034, + "reward_after_std": 1.044951420277357, + "reward_before_mean": 1.1979414029046893, + "reward_before_std": 0.9707056246697903, + "reward_change_max": 0.0, + "reward_change_mean": -0.8117445930838585, + "reward_change_min": -1.418171539902687, + "reward_change_std": 0.5478472858667374, + "reward_std": 1.044951420277357, + "rewards/cosine_scaled_reward": 0.1302206851541996, + "rewards/format_reward": 0.9375000149011612, + "step": 360 + }, + { + "advantage_max": 1.6750199496746063, + "advantage_mean": 6.208820124697922e-10, + "advantage_min": -0.7383632361888885, + "advantage_std": 0.8688630610704422, + "completion_length": 2427.5834350585938, + "epoch": 0.4125714285714286, + "grad_norm": 0.7121224403381348, + "kl": 0.23345947265625, + "lambda_div_used": 0.5, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0544, + "reward": 0.19519370747730136, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19519370747730136, + "reward_after_std": 0.8688630871474743, + "reward_before_mean": 0.8941241502761841, + "reward_before_std": 0.744682066142559, + "reward_change_max": 0.0008524805307388306, + "reward_change_mean": -0.698930449783802, + "reward_change_min": -1.1726604774594307, + "reward_change_std": 0.4409833699464798, + "reward_std": 0.8688631132245064, + "rewards/cosine_scaled_reward": 0.009562073741108179, + "rewards/format_reward": 0.8750000149011612, + "step": 361 + }, + { + "advantage_max": 1.3919198587536812, + "advantage_mean": -5.587935336670569e-09, + "advantage_min": -0.5790497735142708, + "advantage_std": 0.7130658477544785, + "completion_length": 1741.7708892822266, + "epoch": 0.4137142857142857, + "grad_norm": 0.450083464384079, + "kl": 0.19146728515625, + "lambda_div_used": 0.5, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0206, + "reward": 0.21085366362240165, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21085366362240165, + "reward_after_std": 0.7130658328533173, + "reward_before_mean": 0.9695420237258077, + "reward_before_std": 0.5190842002630234, + "reward_change_max": 0.0, + "reward_change_mean": -0.7586883679032326, + "reward_change_min": -1.1238619238138199, + "reward_change_std": 0.4279701504856348, + "reward_std": 0.7130658328533173, + "rewards/cosine_scaled_reward": 0.0472710095345974, + "rewards/format_reward": 0.8750000111758709, + "step": 362 + }, + { + "advantage_max": 1.424420714378357, + "advantage_mean": -1.1102230246251565e-16, + "advantage_min": -0.5779132470488548, + "advantage_std": 0.7376838810741901, + "completion_length": 1992.645881652832, + "epoch": 0.41485714285714287, + "grad_norm": 0.5427188873291016, + "kl": 0.257598876953125, + "lambda_div_used": 0.5, + "learning_rate": 2.9060545772359305e-07, + "loss": -0.0058, + "reward": 0.0010278723202645779, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0010278723202645779, + "reward_after_std": 0.7376838736236095, + "reward_before_mean": 0.5802722265943885, + "reward_before_std": 0.6190046742558479, + "reward_change_max": 0.00023803859949111938, + "reward_change_mean": -0.5792443305253983, + "reward_change_min": -0.9397610351443291, + "reward_change_std": 0.3671391997486353, + "reward_std": 0.737683892250061, + "rewards/cosine_scaled_reward": -0.04319723695516586, + "rewards/format_reward": 0.6666666846722364, + "step": 363 + }, + { + "advantage_max": 1.311921313405037, + "advantage_mean": -8.071462553882469e-09, + "advantage_min": -0.5705104358494282, + "advantage_std": 0.6872405484318733, + "completion_length": 2710.31258392334, + "epoch": 0.416, + "grad_norm": 1.0595557689666748, + "kl": 0.29632568359375, + "lambda_div_used": 0.5, + "learning_rate": 2.8804466342921987e-07, + "loss": -0.0062, + "reward": -0.25245581939816475, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.25245581939816475, + "reward_after_std": 0.6872405298054218, + "reward_before_mean": 0.1285827998071909, + "reward_before_std": 0.658259104937315, + "reward_change_max": 0.0, + "reward_change_mean": -0.3810386322438717, + "reward_change_min": -0.6986820474267006, + "reward_change_std": 0.2910507880151272, + "reward_std": 0.6872405484318733, + "rewards/cosine_scaled_reward": -0.23779193311929703, + "rewards/format_reward": 0.6041666772216558, + "step": 364 + }, + { + "advantage_max": 1.4988937079906464, + "advantage_mean": -7.450581263057643e-09, + "advantage_min": -0.6771756447851658, + "advantage_std": 0.7947739884257317, + "completion_length": 2861.166732788086, + "epoch": 0.41714285714285715, + "grad_norm": 0.3315470218658447, + "kl": 0.22308349609375, + "lambda_div_used": 0.5, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0188, + "reward": 0.1647136379033327, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1647136379033327, + "reward_after_std": 0.7947739958763123, + "reward_before_mean": 0.8680481066694483, + "reward_before_std": 0.7035925425589085, + "reward_change_max": 0.00020241737365722656, + "reward_change_mean": -0.7033344469964504, + "reward_change_min": -1.2183981984853745, + "reward_change_std": 0.4677523523569107, + "reward_std": 0.7947740480303764, + "rewards/cosine_scaled_reward": 0.0069406908005476, + "rewards/format_reward": 0.8541666716337204, + "step": 365 + }, + { + "advantage_max": 1.6599657125771046, + "advantage_mean": -2.4524827946237338e-08, + "advantage_min": -0.7904684916138649, + "advantage_std": 0.89173923432827, + "completion_length": 2031.7500457763672, + "epoch": 0.41828571428571426, + "grad_norm": 0.7609602212905884, + "kl": 0.1444244384765625, + "lambda_div_used": 0.5, + "learning_rate": 2.829615010283344e-07, + "loss": -0.0237, + "reward": 0.24849661067128181, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24849661067128181, + "reward_after_std": 0.8917392194271088, + "reward_before_mean": 0.9921067655086517, + "reward_before_std": 0.8401160351932049, + "reward_change_max": 0.0, + "reward_change_mean": -0.7436101827770472, + "reward_change_min": -1.3646418452262878, + "reward_change_std": 0.5249740164726973, + "reward_std": 0.8917392492294312, + "rewards/cosine_scaled_reward": 0.15230336226522923, + "rewards/format_reward": 0.6875000149011612, + "step": 366 + }, + { + "advantage_max": 1.6305503770709038, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7210628725588322, + "advantage_std": 0.8612675108015537, + "completion_length": 2874.041732788086, + "epoch": 0.41942857142857143, + "grad_norm": 1.068894624710083, + "kl": 0.2226409912109375, + "lambda_div_used": 0.5, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.063, + "reward": -0.04002854856662452, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.04002854856662452, + "reward_after_std": 0.8612675480544567, + "reward_before_mean": 0.4689618442207575, + "reward_before_std": 0.8422180972993374, + "reward_change_max": 0.0019412413239479065, + "reward_change_mean": -0.5089903902262449, + "reward_change_min": -0.9655529074370861, + "reward_change_std": 0.3873988389968872, + "reward_std": 0.8612675666809082, + "rewards/cosine_scaled_reward": -0.06760243279859424, + "rewards/format_reward": 0.6041666809469461, + "step": 367 + }, + { + "advantage_max": 1.5631348192691803, + "advantage_mean": -2.483527106189598e-09, + "advantage_min": -0.6437996104359627, + "advantage_std": 0.8016529567539692, + "completion_length": 2851.6459045410156, + "epoch": 0.4205714285714286, + "grad_norm": 0.7713169455528259, + "kl": 0.19476318359375, + "lambda_div_used": 0.5, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.0495, + "reward": -0.07841504114912823, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07841504114912823, + "reward_after_std": 0.8016529642045498, + "reward_before_mean": 0.41246860893443227, + "reward_before_std": 0.7121658660471439, + "reward_change_max": 0.0004137009382247925, + "reward_change_mean": -0.4908836465328932, + "reward_change_min": -0.7905861400067806, + "reward_change_std": 0.31734895519912243, + "reward_std": 0.801652979105711, + "rewards/cosine_scaled_reward": -0.09584904834628105, + "rewards/format_reward": 0.6041666753590107, + "step": 368 + }, + { + "advantage_max": 1.8276142477989197, + "advantage_mean": -2.2351742789972207e-08, + "advantage_min": -0.8692605495452881, + "advantage_std": 0.9705227017402649, + "completion_length": 2686.7709350585938, + "epoch": 0.4217142857142857, + "grad_norm": 0.6053704619407654, + "kl": 0.17791748046875, + "lambda_div_used": 0.5, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0124, + "reward": 0.264951853081584, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.264951853081584, + "reward_after_std": 0.9705226868391037, + "reward_before_mean": 0.9941436583176255, + "reward_before_std": 0.9082366935908794, + "reward_change_max": 0.0004303380846977234, + "reward_change_mean": -0.7291918061673641, + "reward_change_min": -1.2517874836921692, + "reward_change_std": 0.4964812193065882, + "reward_std": 0.9705227166414261, + "rewards/cosine_scaled_reward": 0.09082182496786118, + "rewards/format_reward": 0.8125000111758709, + "step": 369 + }, + { + "advantage_max": 1.2040704488754272, + "advantage_mean": 6.8296991950766994e-09, + "advantage_min": -0.5248738452792168, + "advantage_std": 0.6303714476525784, + "completion_length": 2822.75008392334, + "epoch": 0.4228571428571429, + "grad_norm": 0.30178558826446533, + "kl": 0.2030181884765625, + "lambda_div_used": 0.5, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0336, + "reward": -0.024024151323828846, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.024024151323828846, + "reward_after_std": 0.630371455103159, + "reward_before_mean": 0.573287246748805, + "reward_before_std": 0.5030009597539902, + "reward_change_max": 0.000520557165145874, + "reward_change_mean": -0.5973114091902971, + "reward_change_min": -0.9545686095952988, + "reward_change_std": 0.382950097322464, + "reward_std": 0.6303714849054813, + "rewards/cosine_scaled_reward": -0.04668972175568342, + "rewards/format_reward": 0.6666666809469461, + "step": 370 + }, + { + "advantage_max": 1.5700139477849007, + "advantage_mean": -1.117587167254186e-08, + "advantage_min": -0.7256846129894257, + "advantage_std": 0.8189797066152096, + "completion_length": 1951.1875228881836, + "epoch": 0.424, + "grad_norm": 0.39446350932121277, + "kl": 0.1736907958984375, + "lambda_div_used": 0.5, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0311, + "reward": 0.2522589797154069, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2522589797154069, + "reward_after_std": 0.8189797289669514, + "reward_before_mean": 1.0146636981517076, + "reward_before_std": 0.6875501796603203, + "reward_change_max": 0.003395289182662964, + "reward_change_mean": -0.7624046634882689, + "reward_change_min": -1.200857788324356, + "reward_change_std": 0.47052861377596855, + "reward_std": 0.8189797662198544, + "rewards/cosine_scaled_reward": 0.1114985030144453, + "rewards/format_reward": 0.7916666846722364, + "step": 371 + }, + { + "advantage_max": 1.7297367379069328, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.7292061150074005, + "advantage_std": 0.9062994085252285, + "completion_length": 3032.854217529297, + "epoch": 0.42514285714285716, + "grad_norm": 0.546402633190155, + "kl": 0.2222137451171875, + "lambda_div_used": 0.5, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0465, + "reward": 0.04156911559402943, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04156911559402943, + "reward_after_std": 0.9062994159758091, + "reward_before_mean": 0.6037184139713645, + "reward_before_std": 0.8658935464918613, + "reward_change_max": 3.7536025047302246e-05, + "reward_change_mean": -0.5621493104845285, + "reward_change_min": -1.030746005475521, + "reward_change_std": 0.40250076726078987, + "reward_std": 0.9062994495034218, + "rewards/cosine_scaled_reward": -0.02105746790766716, + "rewards/format_reward": 0.6458333414047956, + "step": 372 + }, + { + "advantage_max": 1.5753349885344505, + "advantage_mean": 8.071462664904772e-09, + "advantage_min": -0.707958310842514, + "advantage_std": 0.8195471204817295, + "completion_length": 1873.8750381469727, + "epoch": 0.42628571428571427, + "grad_norm": 0.45761746168136597, + "kl": 0.1535797119140625, + "lambda_div_used": 0.5, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0228, + "reward": 0.09441595152020454, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09441595152020454, + "reward_after_std": 0.8195471279323101, + "reward_before_mean": 0.7264794651418924, + "reward_before_std": 0.7281909249722958, + "reward_change_max": 0.0031985342502593994, + "reward_change_mean": -0.6320635080337524, + "reward_change_min": -1.0189965330064297, + "reward_change_std": 0.4044807106256485, + "reward_std": 0.8195471614599228, + "rewards/cosine_scaled_reward": -0.04301029210910201, + "rewards/format_reward": 0.8125000149011612, + "step": 373 + }, + { + "advantage_max": 1.6910846680402756, + "advantage_mean": -1.1175871006408045e-08, + "advantage_min": -0.8846486583352089, + "advantage_std": 0.9025316834449768, + "completion_length": 2393.2500610351562, + "epoch": 0.42742857142857144, + "grad_norm": 0.5066131353378296, + "kl": 0.156524658203125, + "lambda_div_used": 0.5, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0508, + "reward": 0.2205169820226729, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2205169820226729, + "reward_after_std": 0.9025316908955574, + "reward_before_mean": 0.9377603754401207, + "reward_before_std": 0.8492592498660088, + "reward_change_max": 0.0010761022567749023, + "reward_change_mean": -0.7172434497624636, + "reward_change_min": -1.2053956873714924, + "reward_change_std": 0.497776135802269, + "reward_std": 0.9025317057967186, + "rewards/cosine_scaled_reward": 0.06263019423931837, + "rewards/format_reward": 0.8125000223517418, + "step": 374 + }, + { + "advantage_max": 1.5614767000079155, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.7359337955713272, + "advantage_std": 0.8408215641975403, + "completion_length": 2382.5000762939453, + "epoch": 0.42857142857142855, + "grad_norm": 0.6809465289115906, + "kl": 0.1686859130859375, + "lambda_div_used": 0.5, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0181, + "reward": 0.1346543780528009, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1346543780528009, + "reward_after_std": 0.8408215567469597, + "reward_before_mean": 0.7986902371048927, + "reward_before_std": 0.7959046922624111, + "reward_change_max": 0.0, + "reward_change_mean": -0.6640358716249466, + "reward_change_min": -1.1981551013886929, + "reward_change_std": 0.4808881878852844, + "reward_std": 0.8408215865492821, + "rewards/cosine_scaled_reward": 0.04517845343798399, + "rewards/format_reward": 0.7083333414047956, + "step": 375 + }, + { + "advantage_max": 1.4033148437738419, + "advantage_mean": 3.725290298461914e-09, + "advantage_min": -0.508766308426857, + "advantage_std": 0.7132799662649632, + "completion_length": 2357.812530517578, + "epoch": 0.4297142857142857, + "grad_norm": 0.3875832259654999, + "kl": 0.2390899658203125, + "lambda_div_used": 0.5, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0401, + "reward": -0.024911897722631693, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.024911897722631693, + "reward_after_std": 0.7132799625396729, + "reward_before_mean": 0.5406988188624382, + "reward_before_std": 0.57003128901124, + "reward_change_max": 0.008424557745456696, + "reward_change_mean": -0.5656107012182474, + "reward_change_min": -0.927064124494791, + "reward_change_std": 0.35486311838030815, + "reward_std": 0.7132799699902534, + "rewards/cosine_scaled_reward": -0.0942339263856411, + "rewards/format_reward": 0.7291666753590107, + "step": 376 + }, + { + "advantage_max": 1.6421222761273384, + "advantage_mean": -7.450580763457282e-09, + "advantage_min": -0.8725541532039642, + "advantage_std": 0.8950701281428337, + "completion_length": 3138.166732788086, + "epoch": 0.4308571428571429, + "grad_norm": 1.4470289945602417, + "kl": 0.2535400390625, + "lambda_div_used": 0.5, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0475, + "reward": 0.032397462986409664, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.032397462986409664, + "reward_after_std": 0.8950701355934143, + "reward_before_mean": 0.6025219317525625, + "reward_before_std": 0.9364787600934505, + "reward_change_max": 0.0008480995893478394, + "reward_change_mean": -0.5701244231313467, + "reward_change_min": -1.1732884608209133, + "reward_change_std": 0.4806421175599098, + "reward_std": 0.8950701430439949, + "rewards/cosine_scaled_reward": -0.03207239834591746, + "rewards/format_reward": 0.6666666902601719, + "step": 377 + }, + { + "advantage_max": 1.7775244414806366, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.7913789357990026, + "advantage_std": 0.9315557107329369, + "completion_length": 2286.7917251586914, + "epoch": 0.432, + "grad_norm": 0.6633191108703613, + "kl": 0.189361572265625, + "lambda_div_used": 0.5, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0556, + "reward": 0.2488978197798133, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2488978197798133, + "reward_after_std": 0.9315557107329369, + "reward_before_mean": 0.9738697209395468, + "reward_before_std": 0.8460350800305605, + "reward_change_max": 0.00010737031698226929, + "reward_change_mean": -0.7249718643724918, + "reward_change_min": -1.23944041877985, + "reward_change_std": 0.47084952518343925, + "reward_std": 0.9315557405352592, + "rewards/cosine_scaled_reward": 0.09110149601474404, + "rewards/format_reward": 0.7916666772216558, + "step": 378 + }, + { + "advantage_max": 1.577236846089363, + "advantage_mean": 1.3659398057086491e-08, + "advantage_min": -0.7336809299886227, + "advantage_std": 0.8440136685967445, + "completion_length": 2862.6875610351562, + "epoch": 0.43314285714285716, + "grad_norm": 0.9866610765457153, + "kl": 0.466796875, + "lambda_div_used": 0.5, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0855, + "reward": -0.02831041906028986, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02831041906028986, + "reward_after_std": 0.8440136685967445, + "reward_before_mean": 0.49924127757549286, + "reward_before_std": 0.8402791954576969, + "reward_change_max": 0.0014529749751091003, + "reward_change_mean": -0.5275516845285892, + "reward_change_min": -0.9985620677471161, + "reward_change_std": 0.41906842961907387, + "reward_std": 0.8440137207508087, + "rewards/cosine_scaled_reward": -0.11496270447969437, + "rewards/format_reward": 0.729166679084301, + "step": 379 + }, + { + "advantage_max": 1.4116209298372269, + "advantage_mean": -1.2417632477834672e-09, + "advantage_min": -0.6222640015184879, + "advantage_std": 0.7395900189876556, + "completion_length": 2217.3958892822266, + "epoch": 0.4342857142857143, + "grad_norm": 0.822210431098938, + "kl": 0.193084716796875, + "lambda_div_used": 0.5, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0237, + "reward": 0.06631459016352892, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06631459016352892, + "reward_after_std": 0.7395900264382362, + "reward_before_mean": 0.7042246758937836, + "reward_before_std": 0.6468055509030819, + "reward_change_max": 0.0, + "reward_change_mean": -0.6379101015627384, + "reward_change_min": -1.0936972200870514, + "reward_change_std": 0.4125612024217844, + "reward_std": 0.7395900562405586, + "rewards/cosine_scaled_reward": -0.04372099880129099, + "rewards/format_reward": 0.7916666753590107, + "step": 380 + }, + { + "advantage_max": 1.4688269421458244, + "advantage_mean": -6.829698917520943e-09, + "advantage_min": -0.548859566450119, + "advantage_std": 0.7594586610794067, + "completion_length": 2741.0417709350586, + "epoch": 0.43542857142857144, + "grad_norm": 0.5852844715118408, + "kl": 0.35443115234375, + "lambda_div_used": 0.5, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0173, + "reward": -0.06746639730408788, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06746639730408788, + "reward_after_std": 0.7594586685299873, + "reward_before_mean": 0.44514533365145326, + "reward_before_std": 0.6602870542556047, + "reward_change_max": 0.00012650340795516968, + "reward_change_mean": -0.5126117654144764, + "reward_change_min": -0.8934906348586082, + "reward_change_std": 0.34667243622243404, + "reward_std": 0.7594587020576, + "rewards/cosine_scaled_reward": -0.12117732595652342, + "rewards/format_reward": 0.6875000055879354, + "step": 381 + }, + { + "advantage_max": 1.1954265832901, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.4305326081812382, + "advantage_std": 0.6082721762359142, + "completion_length": 2293.479232788086, + "epoch": 0.43657142857142855, + "grad_norm": 0.45443591475486755, + "kl": 0.295379638671875, + "lambda_div_used": 0.5, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0369, + "reward": -0.11909189762081951, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11909189762081951, + "reward_after_std": 0.6082722060382366, + "reward_before_mean": 0.39205896970815957, + "reward_before_std": 0.4652012325823307, + "reward_change_max": 0.0, + "reward_change_mean": -0.5111508592963219, + "reward_change_min": -0.8269638493657112, + "reward_change_std": 0.29517384245991707, + "reward_std": 0.6082722283899784, + "rewards/cosine_scaled_reward": -0.24147052597254515, + "rewards/format_reward": 0.8750000074505806, + "step": 382 + }, + { + "advantage_max": 1.583274468779564, + "advantage_mean": 2.7755575615628914e-16, + "advantage_min": -0.7815160192549229, + "advantage_std": 0.8540562726557255, + "completion_length": 2804.7709350585938, + "epoch": 0.4377142857142857, + "grad_norm": 0.3750695586204529, + "kl": 0.305908203125, + "lambda_div_used": 0.5, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0362, + "reward": 0.13424018677324057, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13424018677324057, + "reward_after_std": 0.8540562726557255, + "reward_before_mean": 0.7994630765169859, + "reward_before_std": 0.830976240336895, + "reward_change_max": 0.0006273016333580017, + "reward_change_mean": -0.6652229018509388, + "reward_change_min": -1.246145885437727, + "reward_change_std": 0.4925944656133652, + "reward_std": 0.85405632853508, + "rewards/cosine_scaled_reward": 0.024731531739234924, + "rewards/format_reward": 0.7500000037252903, + "step": 383 + }, + { + "advantage_max": 1.7960882484912872, + "advantage_mean": -6.829699084054397e-09, + "advantage_min": -0.6961954347789288, + "advantage_std": 0.9222677126526833, + "completion_length": 1781.5208892822266, + "epoch": 0.43885714285714283, + "grad_norm": 0.8119909763336182, + "kl": 0.182159423828125, + "lambda_div_used": 0.5, + "learning_rate": 2.3967120531894857e-07, + "loss": -0.0102, + "reward": 0.4542910009622574, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4542910009622574, + "reward_after_std": 0.9222677052021027, + "reward_before_mean": 1.344484157860279, + "reward_before_std": 0.7018172033131123, + "reward_change_max": 0.0, + "reward_change_mean": -0.8901931792497635, + "reward_change_min": -1.4151117950677872, + "reward_change_std": 0.5127660743892193, + "reward_std": 0.9222677275538445, + "rewards/cosine_scaled_reward": 0.2764087514951825, + "rewards/format_reward": 0.7916666772216558, + "step": 384 + }, + { + "advantage_max": 1.6422509998083115, + "advantage_mean": -1.4435500406140278e-08, + "advantage_min": -0.7044455334544182, + "advantage_std": 0.8583366200327873, + "completion_length": 2314.541748046875, + "epoch": 0.44, + "grad_norm": 0.5646221041679382, + "kl": 0.2369384765625, + "lambda_div_used": 0.5, + "learning_rate": 2.374037332934512e-07, + "loss": -0.0053, + "reward": 0.029954310972243547, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.029954310972243547, + "reward_after_std": 0.8583366423845291, + "reward_before_mean": 0.597434401512146, + "reward_before_std": 0.7965768575668335, + "reward_change_max": 0.005019478499889374, + "reward_change_mean": -0.5674800910055637, + "reward_change_min": -1.0640814229846, + "reward_change_std": 0.4066210687160492, + "reward_std": 0.8583366572856903, + "rewards/cosine_scaled_reward": -0.10753281530924141, + "rewards/format_reward": 0.8125000149011612, + "step": 385 + }, + { + "advantage_max": 1.875676967203617, + "advantage_mean": -3.2285851103708296e-08, + "advantage_min": -0.614168468862772, + "advantage_std": 0.9387934468686581, + "completion_length": 2594.166717529297, + "epoch": 0.44114285714285717, + "grad_norm": 1.6637388467788696, + "kl": 0.306884765625, + "lambda_div_used": 0.5, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0001, + "reward": 0.4086938863620162, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4086938863620162, + "reward_after_std": 0.9387934617698193, + "reward_before_mean": 1.2516701593995094, + "reward_before_std": 0.633562033995986, + "reward_change_max": 0.0, + "reward_change_mean": -0.8429762609302998, + "reward_change_min": -1.2401651069521904, + "reward_change_std": 0.4739220403134823, + "reward_std": 0.938793495297432, + "rewards/cosine_scaled_reward": 0.1883350731804967, + "rewards/format_reward": 0.8750000074505806, + "step": 386 + }, + { + "advantage_max": 1.226422742009163, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.5920934341847897, + "advantage_std": 0.6487264335155487, + "completion_length": 2671.6875610351562, + "epoch": 0.4422857142857143, + "grad_norm": 1.0913386344909668, + "kl": 0.326934814453125, + "lambda_div_used": 0.5, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0016, + "reward": 0.07431225699838251, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07431225699838251, + "reward_after_std": 0.6487264335155487, + "reward_before_mean": 0.7488601338118315, + "reward_before_std": 0.5205418951809406, + "reward_change_max": 0.0, + "reward_change_mean": -0.6745478585362434, + "reward_change_min": -1.0816974267363548, + "reward_change_std": 0.4158578272908926, + "reward_std": 0.6487264707684517, + "rewards/cosine_scaled_reward": -0.042236629873514175, + "rewards/format_reward": 0.8333333469927311, + "step": 387 + }, + { + "advantage_max": 1.7276557385921478, + "advantage_mean": -1.1175871339474952e-08, + "advantage_min": -0.5782233960926533, + "advantage_std": 0.8705500811338425, + "completion_length": 2406.3750762939453, + "epoch": 0.44342857142857145, + "grad_norm": 1.3232017755508423, + "kl": 0.28704833984375, + "lambda_div_used": 0.5, + "learning_rate": 2.306931685585657e-07, + "loss": -0.0104, + "reward": 0.1922937948256731, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1922937948256731, + "reward_after_std": 0.8705500811338425, + "reward_before_mean": 0.8794347532093525, + "reward_before_std": 0.6561367399990559, + "reward_change_max": 0.0, + "reward_change_mean": -0.6871409565210342, + "reward_change_min": -1.0342309921979904, + "reward_change_std": 0.3947345446795225, + "reward_std": 0.870550125837326, + "rewards/cosine_scaled_reward": 0.043884020298719406, + "rewards/format_reward": 0.7916666828095913, + "step": 388 + }, + { + "advantage_max": 1.5205080583691597, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.6496899351477623, + "advantage_std": 0.788145937025547, + "completion_length": 2370.562530517578, + "epoch": 0.44457142857142856, + "grad_norm": 0.3369554281234741, + "kl": 0.2927093505859375, + "lambda_div_used": 0.5, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0241, + "reward": -0.006450021639466286, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.006450021639466286, + "reward_after_std": 0.7881459295749664, + "reward_before_mean": 0.547332945279777, + "reward_before_std": 0.6886685937643051, + "reward_change_max": 0.0006812885403633118, + "reward_change_mean": -0.5537829957902431, + "reward_change_min": -0.9398513734340668, + "reward_change_std": 0.37046326510608196, + "reward_std": 0.788145937025547, + "rewards/cosine_scaled_reward": -0.11175020085647702, + "rewards/format_reward": 0.7708333414047956, + "step": 389 + }, + { + "advantage_max": 1.827544629573822, + "advantage_mean": 1.241763458725842e-08, + "advantage_min": -0.7890815921127796, + "advantage_std": 0.9409858584403992, + "completion_length": 2891.9375915527344, + "epoch": 0.44571428571428573, + "grad_norm": 0.9549670815467834, + "kl": 0.42510986328125, + "lambda_div_used": 0.5, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0339, + "reward": 0.024634618312120438, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.024634618312120438, + "reward_after_std": 0.9409858658909798, + "reward_before_mean": 0.5577679611742496, + "reward_before_std": 0.8641040101647377, + "reward_change_max": 0.0, + "reward_change_mean": -0.5331333354115486, + "reward_change_min": -0.9533309638500214, + "reward_change_std": 0.37772860564291477, + "reward_std": 0.9409858882427216, + "rewards/cosine_scaled_reward": -0.09611603221856058, + "rewards/format_reward": 0.7500000186264515, + "step": 390 + }, + { + "advantage_max": 1.8195180594921112, + "advantage_mean": -2.0489097307674342e-08, + "advantage_min": -0.8196974471211433, + "advantage_std": 0.9608958922326565, + "completion_length": 2568.979232788086, + "epoch": 0.44685714285714284, + "grad_norm": 0.8521519899368286, + "kl": 0.34222412109375, + "lambda_div_used": 0.5, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.053, + "reward": 0.20352406054735184, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20352406054735184, + "reward_after_std": 0.9608958698809147, + "reward_before_mean": 0.8823814336210489, + "reward_before_std": 0.9071417227387428, + "reward_change_max": 0.0, + "reward_change_mean": -0.6788573786616325, + "reward_change_min": -1.235791377723217, + "reward_change_std": 0.48160158656537533, + "reward_std": 0.9608958885073662, + "rewards/cosine_scaled_reward": 0.055774035543436185, + "rewards/format_reward": 0.7708333432674408, + "step": 391 + }, + { + "advantage_max": 1.5429241210222244, + "advantage_mean": 5.58793583627093e-09, + "advantage_min": -0.752219345420599, + "advantage_std": 0.8274786025285721, + "completion_length": 2352.1875610351562, + "epoch": 0.448, + "grad_norm": 0.43700075149536133, + "kl": 0.310302734375, + "lambda_div_used": 0.5, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0189, + "reward": 0.17132935347035527, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17132935347035527, + "reward_after_std": 0.8274786174297333, + "reward_before_mean": 0.876284271478653, + "reward_before_std": 0.7766976878046989, + "reward_change_max": 0.0, + "reward_change_mean": -0.7049549445509911, + "reward_change_min": -1.256986565887928, + "reward_change_std": 0.4841836094856262, + "reward_std": 0.8274786546826363, + "rewards/cosine_scaled_reward": -0.009774532169103622, + "rewards/format_reward": 0.8958333507180214, + "step": 392 + }, + { + "advantage_max": 2.1338966339826584, + "advantage_mean": -2.421438782818086e-08, + "advantage_min": -1.005941316485405, + "advantage_std": 1.1352594494819641, + "completion_length": 2590.8333587646484, + "epoch": 0.4491428571428571, + "grad_norm": 0.5280225872993469, + "kl": 0.28961181640625, + "lambda_div_used": 0.5, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0103, + "reward": 0.3125934284180403, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3125934284180403, + "reward_after_std": 1.1352594494819641, + "reward_before_mean": 1.0306870639324188, + "reward_before_std": 1.1202639937400818, + "reward_change_max": 0.0010864585638046265, + "reward_change_mean": -0.7180936299264431, + "reward_change_min": -1.4779117703437805, + "reward_change_std": 0.5617240853607655, + "reward_std": 1.1352594941854477, + "rewards/cosine_scaled_reward": 0.09867685753852129, + "rewards/format_reward": 0.8333333469927311, + "step": 393 + }, + { + "advantage_max": 1.255016416311264, + "advantage_mean": 6.829698917520943e-09, + "advantage_min": -0.5600562617182732, + "advantage_std": 0.656397633254528, + "completion_length": 3040.604248046875, + "epoch": 0.4502857142857143, + "grad_norm": 0.9631423950195312, + "kl": 0.3890380859375, + "lambda_div_used": 0.5, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0085, + "reward": -0.19103789888322353, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19103789888322353, + "reward_after_std": 0.656397633254528, + "reward_before_mean": 0.25348040368407965, + "reward_before_std": 0.5955763068050146, + "reward_change_max": 0.0, + "reward_change_mean": -0.4445183016359806, + "reward_change_min": -0.8603153452277184, + "reward_change_std": 0.32232517190277576, + "reward_std": 0.6563976444303989, + "rewards/cosine_scaled_reward": -0.22742647491395473, + "rewards/format_reward": 0.7083333432674408, + "step": 394 + }, + { + "advantage_max": 1.6628983914852142, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.868357315659523, + "advantage_std": 0.9002252444624901, + "completion_length": 2143.541702270508, + "epoch": 0.4514285714285714, + "grad_norm": 0.5944837927818298, + "kl": 0.2022247314453125, + "lambda_div_used": 0.5, + "learning_rate": 2.1558482853517253e-07, + "loss": -0.0058, + "reward": 0.07094830134883523, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07094830134883523, + "reward_after_std": 0.9002252593636513, + "reward_before_mean": 0.6628658212721348, + "reward_before_std": 0.9293262884020805, + "reward_change_max": 0.00043542683124542236, + "reward_change_mean": -0.5919175185263157, + "reward_change_min": -1.1777884289622307, + "reward_change_std": 0.48178502917289734, + "reward_std": 0.9002252817153931, + "rewards/cosine_scaled_reward": -0.043567102402448654, + "rewards/format_reward": 0.7500000149011612, + "step": 395 + }, + { + "advantage_max": 1.619390420615673, + "advantage_mean": -1.1175870895385742e-08, + "advantage_min": -0.6089432537555695, + "advantage_std": 0.8248656615614891, + "completion_length": 2691.229232788086, + "epoch": 0.45257142857142857, + "grad_norm": 0.497490257024765, + "kl": 0.25360107421875, + "lambda_div_used": 0.5, + "learning_rate": 2.134908592756607e-07, + "loss": -0.0032, + "reward": 0.21576578076928854, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21576578076928854, + "reward_after_std": 0.8248656615614891, + "reward_before_mean": 0.9347635172307491, + "reward_before_std": 0.6298792697489262, + "reward_change_max": 0.0, + "reward_change_mean": -0.7189977429807186, + "reward_change_min": -1.1502107232809067, + "reward_change_std": 0.42266157269477844, + "reward_std": 0.824865709990263, + "rewards/cosine_scaled_reward": 0.009048409294337034, + "rewards/format_reward": 0.9166666865348816, + "step": 396 + }, + { + "advantage_max": 1.3563490435481071, + "advantage_mean": -9.313225801665936e-09, + "advantage_min": -0.6180124171078205, + "advantage_std": 0.7036689929664135, + "completion_length": 2026.4167175292969, + "epoch": 0.45371428571428574, + "grad_norm": 0.31323692202568054, + "kl": 0.11944580078125, + "lambda_div_used": 0.5, + "learning_rate": 2.1141329099692406e-07, + "loss": -0.0147, + "reward": 0.08167102443985641, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08167102443985641, + "reward_after_std": 0.7036689929664135, + "reward_before_mean": 0.7375619001686573, + "reward_before_std": 0.5679947603493929, + "reward_change_max": 0.0009193271398544312, + "reward_change_mean": -0.6558908764272928, + "reward_change_min": -0.989147812128067, + "reward_change_std": 0.3919084258377552, + "reward_std": 0.7036690339446068, + "rewards/cosine_scaled_reward": -0.027052395045757294, + "rewards/format_reward": 0.7916666828095913, + "step": 397 + }, + { + "advantage_max": 1.7759979516267776, + "advantage_mean": 7.45058065243498e-09, + "advantage_min": -0.6430985629558563, + "advantage_std": 0.9113304987549782, + "completion_length": 2401.791717529297, + "epoch": 0.45485714285714285, + "grad_norm": 1.0113723278045654, + "kl": 0.3887786865234375, + "lambda_div_used": 0.5, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.016, + "reward": 0.09548487048596144, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09548487048596144, + "reward_after_std": 0.9113305136561394, + "reward_before_mean": 0.6916496542398818, + "reward_before_std": 0.7774554453790188, + "reward_change_max": 0.0, + "reward_change_mean": -0.5961647741496563, + "reward_change_min": -1.03419828414917, + "reward_change_std": 0.38981775380671024, + "reward_std": 0.9113305732607841, + "rewards/cosine_scaled_reward": -0.018758506514132023, + "rewards/format_reward": 0.7291666828095913, + "step": 398 + }, + { + "advantage_max": 2.001268118619919, + "advantage_mean": -1.924733378233512e-08, + "advantage_min": -0.8032764531672001, + "advantage_std": 1.0470364093780518, + "completion_length": 2351.2500762939453, + "epoch": 0.456, + "grad_norm": 0.7004992365837097, + "kl": 0.22613525390625, + "lambda_div_used": 0.5, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0115, + "reward": 0.30928437784314156, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.30928437784314156, + "reward_after_std": 1.047036424279213, + "reward_before_mean": 1.0472409576177597, + "reward_before_std": 0.9616851769387722, + "reward_change_max": 0.0, + "reward_change_mean": -0.7379565685987473, + "reward_change_min": -1.4024077132344246, + "reward_change_std": 0.5100381188094616, + "reward_std": 1.0470364391803741, + "rewards/cosine_scaled_reward": 0.05487045622430742, + "rewards/format_reward": 0.9375000074505806, + "step": 399 + }, + { + "advantage_max": 1.948826715350151, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -0.8613158576190472, + "advantage_std": 1.0191247761249542, + "completion_length": 1702.4584121704102, + "epoch": 0.45714285714285713, + "grad_norm": 0.3669349253177643, + "kl": 0.1160430908203125, + "lambda_div_used": 0.5, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.0115, + "reward": 0.41814972274005413, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41814972274005413, + "reward_after_std": 1.0191247910261154, + "reward_before_mean": 1.2584092626348138, + "reward_before_std": 0.8928911443799734, + "reward_change_max": 0.0, + "reward_change_mean": -0.8402595855295658, + "reward_change_min": -1.408696487545967, + "reward_change_std": 0.545951347798109, + "reward_std": 1.019124835729599, + "rewards/cosine_scaled_reward": 0.1917046275921166, + "rewards/format_reward": 0.8750000037252903, + "step": 400 + }, + { + "advantage_max": 1.6143862754106522, + "advantage_mean": -1.3659397724019584e-08, + "advantage_min": -0.7051677592098713, + "advantage_std": 0.8424163311719894, + "completion_length": 2862.0209350585938, + "epoch": 0.4582857142857143, + "grad_norm": 0.35205700993537903, + "kl": 0.2542724609375, + "lambda_div_used": 0.5, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0116, + "reward": 0.2912533753551543, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2912533753551543, + "reward_after_std": 0.8424163162708282, + "reward_before_mean": 1.0814423598349094, + "reward_before_std": 0.6909527480602264, + "reward_change_max": 0.0003457888960838318, + "reward_change_mean": -0.7901889868080616, + "reward_change_min": -1.2221315279603004, + "reward_change_std": 0.4830316975712776, + "reward_std": 0.8424163609743118, + "rewards/cosine_scaled_reward": 0.10322117432951927, + "rewards/format_reward": 0.8750000074505806, + "step": 401 + }, + { + "advantage_max": 1.3721887990832329, + "advantage_mean": 1.1796752963366686e-08, + "advantage_min": -0.639917079359293, + "advantage_std": 0.7129794172942638, + "completion_length": 2224.3333740234375, + "epoch": 0.4594285714285714, + "grad_norm": 0.7739508748054504, + "kl": 0.1800537109375, + "lambda_div_used": 0.5, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0708, + "reward": 0.03190509416162968, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03190509416162968, + "reward_after_std": 0.7129794359207153, + "reward_before_mean": 0.6415856056846678, + "reward_before_std": 0.6075125262141228, + "reward_change_max": 0.0, + "reward_change_mean": -0.609680525958538, + "reward_change_min": -0.9582468569278717, + "reward_change_std": 0.382707916200161, + "reward_std": 0.7129794433712959, + "rewards/cosine_scaled_reward": -0.12712387926876545, + "rewards/format_reward": 0.8958333432674408, + "step": 402 + }, + { + "advantage_max": 1.3863247409462929, + "advantage_mean": -1.8626450382086546e-09, + "advantage_min": -0.633481714874506, + "advantage_std": 0.7379178702831268, + "completion_length": 2153.1042251586914, + "epoch": 0.4605714285714286, + "grad_norm": 0.45951032638549805, + "kl": 0.164215087890625, + "lambda_div_used": 0.5, + "learning_rate": 1.9929791578083655e-07, + "loss": -0.0093, + "reward": 0.006359277293086052, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.006359277293086052, + "reward_after_std": 0.7379178553819656, + "reward_before_mean": 0.5869014756754041, + "reward_before_std": 0.687812514603138, + "reward_change_max": 0.0007327944040298462, + "reward_change_mean": -0.5805421750992537, + "reward_change_min": -1.0237832926213741, + "reward_change_std": 0.4055717270821333, + "reward_std": 0.7379178702831268, + "rewards/cosine_scaled_reward": -0.07113261707127094, + "rewards/format_reward": 0.7291666697710752, + "step": 403 + }, + { + "advantage_max": 1.2565838545560837, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.6241967566311359, + "advantage_std": 0.6679689809679985, + "completion_length": 2440.2917404174805, + "epoch": 0.4617142857142857, + "grad_norm": 0.26222601532936096, + "kl": 0.2291107177734375, + "lambda_div_used": 0.5, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0283, + "reward": -0.009857988567091525, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.009857988567091525, + "reward_after_std": 0.6679689884185791, + "reward_before_mean": 0.5863540228456259, + "reward_before_std": 0.5875301137566566, + "reward_change_max": 0.002009287476539612, + "reward_change_mean": -0.59621203225106, + "reward_change_min": -1.0139516070485115, + "reward_change_std": 0.3946251608431339, + "reward_std": 0.66796899959445, + "rewards/cosine_scaled_reward": -0.06098964437842369, + "rewards/format_reward": 0.708333345130086, + "step": 404 + }, + { + "advantage_max": 1.655549019575119, + "advantage_mean": 6.208817460162663e-09, + "advantage_min": -0.7989021204411983, + "advantage_std": 0.8907373733818531, + "completion_length": 2022.0833892822266, + "epoch": 0.46285714285714286, + "grad_norm": 0.33377718925476074, + "kl": 0.201019287109375, + "lambda_div_used": 0.5, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0165, + "reward": 0.1627146191895008, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1627146191895008, + "reward_after_std": 0.8907373733818531, + "reward_before_mean": 0.8349231742322445, + "reward_before_std": 0.8655808642506599, + "reward_change_max": 0.0005633682012557983, + "reward_change_mean": -0.6722085531800985, + "reward_change_min": -1.2954475656151772, + "reward_change_std": 0.4969491269439459, + "reward_std": 0.8907373957335949, + "rewards/cosine_scaled_reward": 0.0528782494366169, + "rewards/format_reward": 0.7291666828095913, + "step": 405 + }, + { + "advantage_max": 1.8264697641134262, + "advantage_mean": -4.967053546245381e-09, + "advantage_min": -0.7284509651362896, + "advantage_std": 0.9489434212446213, + "completion_length": 2138.2083587646484, + "epoch": 0.464, + "grad_norm": 0.3909973204135895, + "kl": 0.17059326171875, + "lambda_div_used": 0.5, + "learning_rate": 1.934696604901642e-07, + "loss": -0.003, + "reward": 0.1820035995915532, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1820035995915532, + "reward_after_std": 0.9489434361457825, + "reward_before_mean": 0.844778798520565, + "reward_before_std": 0.8595172390341759, + "reward_change_max": 0.0005043521523475647, + "reward_change_mean": -0.6627751663327217, + "reward_change_min": -1.2455066293478012, + "reward_change_std": 0.4535139240324497, + "reward_std": 0.9489434435963631, + "rewards/cosine_scaled_reward": 0.0057227155193686485, + "rewards/format_reward": 0.8333333395421505, + "step": 406 + }, + { + "advantage_max": 1.3399841859936714, + "advantage_mean": -1.6763806731656672e-08, + "advantage_min": -0.43382854759693146, + "advantage_std": 0.6706324480473995, + "completion_length": 2403.0208892822266, + "epoch": 0.46514285714285714, + "grad_norm": 0.22056791186332703, + "kl": 0.1808319091796875, + "lambda_div_used": 0.5, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0168, + "reward": 0.16278862720355392, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16278862720355392, + "reward_after_std": 0.670632466673851, + "reward_before_mean": 0.8940027691423893, + "reward_before_std": 0.399973401799798, + "reward_change_max": 0.0, + "reward_change_mean": -0.7312141358852386, + "reward_change_min": -1.0532505437731743, + "reward_change_std": 0.3912728149443865, + "reward_std": 0.6706324815750122, + "rewards/cosine_scaled_reward": 0.040751357562839985, + "rewards/format_reward": 0.8125000055879354, + "step": 407 + }, + { + "advantage_max": 1.7584019675850868, + "advantage_mean": -2.85605594174676e-08, + "advantage_min": -0.6405363604426384, + "advantage_std": 0.9045485965907574, + "completion_length": 2371.291748046875, + "epoch": 0.4662857142857143, + "grad_norm": 0.550900936126709, + "kl": 0.2185821533203125, + "lambda_div_used": 0.5, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0123, + "reward": 0.32463838160037994, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32463838160037994, + "reward_after_std": 0.9045485965907574, + "reward_before_mean": 1.120386364404112, + "reward_before_std": 0.6943012624979019, + "reward_change_max": 0.0008317306637763977, + "reward_change_mean": -0.7957480065524578, + "reward_change_min": -1.2995992079377174, + "reward_change_std": 0.5055977776646614, + "reward_std": 0.9045485965907574, + "rewards/cosine_scaled_reward": 0.15394318150356412, + "rewards/format_reward": 0.8125000186264515, + "step": 408 + }, + { + "advantage_max": 1.32723917812109, + "advantage_mean": 1.4901162082026076e-08, + "advantage_min": -0.6812616810202599, + "advantage_std": 0.705879982560873, + "completion_length": 3048.041717529297, + "epoch": 0.4674285714285714, + "grad_norm": 0.5131399631500244, + "kl": 0.278533935546875, + "lambda_div_used": 0.5, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0211, + "reward": -0.08907385356724262, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08907385356724262, + "reward_after_std": 0.705879982560873, + "reward_before_mean": 0.43432591343298554, + "reward_before_std": 0.6671581901609898, + "reward_change_max": 0.0005308240652084351, + "reward_change_mean": -0.5233997739851475, + "reward_change_min": -0.9856052212417126, + "reward_change_std": 0.38404569029808044, + "reward_std": 0.7058799862861633, + "rewards/cosine_scaled_reward": -0.09533703187480569, + "rewards/format_reward": 0.6250000186264515, + "step": 409 + }, + { + "advantage_max": 1.9289898574352264, + "advantage_mean": -2.483527050678447e-09, + "advantage_min": -0.6834102421998978, + "advantage_std": 0.9876251555979252, + "completion_length": 2450.3750381469727, + "epoch": 0.4685714285714286, + "grad_norm": 1.2392657995224, + "kl": 0.25274658203125, + "lambda_div_used": 0.5, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0502, + "reward": 0.04491107352077961, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04491107352077961, + "reward_after_std": 0.987625103443861, + "reward_before_mean": 0.5722492169588804, + "reward_before_std": 0.8849713280797005, + "reward_change_max": 0.0, + "reward_change_mean": -0.5273381508886814, + "reward_change_min": -1.0481892675161362, + "reward_change_std": 0.380878571420908, + "reward_std": 0.9876251295208931, + "rewards/cosine_scaled_reward": -0.06804206012748182, + "rewards/format_reward": 0.7083333358168602, + "step": 410 + }, + { + "advantage_max": 1.8131102174520493, + "advantage_mean": -4.346172532976311e-09, + "advantage_min": -0.8035260625183582, + "advantage_std": 0.9476320967078209, + "completion_length": 2871.729278564453, + "epoch": 0.4697142857142857, + "grad_norm": 0.8493767976760864, + "kl": 0.27581787109375, + "lambda_div_used": 0.5, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0695, + "reward": 0.2603566190227866, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2603566190227866, + "reward_after_std": 0.9476321116089821, + "reward_before_mean": 0.9874628521502018, + "reward_before_std": 0.8463159576058388, + "reward_change_max": 0.0006547495722770691, + "reward_change_mean": -0.7271062415093184, + "reward_change_min": -1.2260434813797474, + "reward_change_std": 0.4771372377872467, + "reward_std": 0.9476321414113045, + "rewards/cosine_scaled_reward": 0.08748140814714134, + "rewards/format_reward": 0.8125000111758709, + "step": 411 + }, + { + "advantage_max": 1.9703435078263283, + "advantage_mean": -3.7252904094842165e-09, + "advantage_min": -0.9085872285068035, + "advantage_std": 1.0412596613168716, + "completion_length": 2834.0208892822266, + "epoch": 0.47085714285714286, + "grad_norm": 1.3024187088012695, + "kl": 0.22625732421875, + "lambda_div_used": 0.5, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0245, + "reward": 0.18393388949334621, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18393388949334621, + "reward_after_std": 1.041259691119194, + "reward_before_mean": 0.8306164983659983, + "reward_before_std": 1.0156190879642963, + "reward_change_max": 0.0005686357617378235, + "reward_change_mean": -0.6466826163232327, + "reward_change_min": -1.296832486987114, + "reward_change_std": 0.5055333897471428, + "reward_std": 1.0412597358226776, + "rewards/cosine_scaled_reward": 0.04030823614448309, + "rewards/format_reward": 0.7500000074505806, + "step": 412 + }, + { + "advantage_max": 1.5172990262508392, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.7004434801638126, + "advantage_std": 0.803725078701973, + "completion_length": 2530.0834197998047, + "epoch": 0.472, + "grad_norm": 0.7498294115066528, + "kl": 0.19140625, + "lambda_div_used": 0.5, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0558, + "reward": 0.10242907330393791, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10242907330393791, + "reward_after_std": 0.803725078701973, + "reward_before_mean": 0.7471863450482488, + "reward_before_std": 0.7387400958687067, + "reward_change_max": 0.0, + "reward_change_mean": -0.6447572745382786, + "reward_change_min": -1.173839956521988, + "reward_change_std": 0.4428400434553623, + "reward_std": 0.8037250824272633, + "rewards/cosine_scaled_reward": -0.011823497712612152, + "rewards/format_reward": 0.7708333544433117, + "step": 413 + }, + { + "advantage_max": 1.4408729001879692, + "advantage_mean": 1.8626451825376478e-08, + "advantage_min": -0.597054660320282, + "advantage_std": 0.7476470805704594, + "completion_length": 3157.0625610351562, + "epoch": 0.47314285714285714, + "grad_norm": 0.9261155128479004, + "kl": 0.346923828125, + "lambda_div_used": 0.5, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0193, + "reward": -0.11320638004690409, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11320638004690409, + "reward_after_std": 0.74764708802104, + "reward_before_mean": 0.3687801326159388, + "reward_before_std": 0.6687690652906895, + "reward_change_max": 0.0027580782771110535, + "reward_change_mean": -0.48198647797107697, + "reward_change_min": -0.8558552041649818, + "reward_change_std": 0.324984148144722, + "reward_std": 0.747647114098072, + "rewards/cosine_scaled_reward": -0.21144328452646732, + "rewards/format_reward": 0.791666679084301, + "step": 414 + }, + { + "advantage_max": 1.704764910042286, + "advantage_mean": -7.450580929990736e-09, + "advantage_min": -0.9032006934285164, + "advantage_std": 0.9267212748527527, + "completion_length": 3225.979278564453, + "epoch": 0.4742857142857143, + "grad_norm": 0.5711144208908081, + "kl": 0.39208984375, + "lambda_div_used": 0.5, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0613, + "reward": 0.13933263439685106, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13933263439685106, + "reward_after_std": 0.9267212599515915, + "reward_before_mean": 0.7884473074227571, + "reward_before_std": 0.9520576298236847, + "reward_change_max": 0.0004018843173980713, + "reward_change_mean": -0.6491146758198738, + "reward_change_min": -1.266348458826542, + "reward_change_std": 0.5081395395100117, + "reward_std": 0.9267212674021721, + "rewards/cosine_scaled_reward": 0.019223633222281933, + "rewards/format_reward": 0.7500000298023224, + "step": 415 + }, + { + "advantage_max": 1.9375486299395561, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.8114347271621227, + "advantage_std": 1.0063435733318329, + "completion_length": 2236.8958740234375, + "epoch": 0.4754285714285714, + "grad_norm": 0.6455732583999634, + "kl": 0.1776275634765625, + "lambda_div_used": 0.5, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0696, + "reward": 0.17256421316415071, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17256421316415071, + "reward_after_std": 1.0063435807824135, + "reward_before_mean": 0.8131153769791126, + "reward_before_std": 0.9146289005875587, + "reward_change_max": 0.0, + "reward_change_mean": -0.6405511423945427, + "reward_change_min": -1.2199120596051216, + "reward_change_std": 0.4531344957649708, + "reward_std": 1.0063436180353165, + "rewards/cosine_scaled_reward": 0.0003076721914112568, + "rewards/format_reward": 0.8125000055879354, + "step": 416 + }, + { + "advantage_max": 1.4362558871507645, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.7358811981976032, + "advantage_std": 0.7816518843173981, + "completion_length": 3103.854248046875, + "epoch": 0.4765714285714286, + "grad_norm": 0.5906895399093628, + "kl": 0.3399658203125, + "lambda_div_used": 0.5, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0153, + "reward": -0.08510342193767428, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.08510342193767428, + "reward_after_std": 0.7816518843173981, + "reward_before_mean": 0.4210092220455408, + "reward_before_std": 0.8183082491159439, + "reward_change_max": 0.0, + "reward_change_mean": -0.5061126090586185, + "reward_change_min": -1.0609179846942425, + "reward_change_std": 0.4223988838493824, + "reward_std": 0.7816519141197205, + "rewards/cosine_scaled_reward": -0.11241207923740149, + "rewards/format_reward": 0.6458333600312471, + "step": 417 + }, + { + "advantage_max": 1.923146240413189, + "advantage_mean": -1.8316011041186187e-08, + "advantage_min": -0.8123641051352024, + "advantage_std": 0.9936596788465977, + "completion_length": 2348.250072479248, + "epoch": 0.4777142857142857, + "grad_norm": 0.48226863145828247, + "kl": 0.217620849609375, + "lambda_div_used": 0.5, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0215, + "reward": 0.39290976664051414, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.39290976664051414, + "reward_after_std": 0.9936597011983395, + "reward_before_mean": 1.216740008443594, + "reward_before_std": 0.8273234628140926, + "reward_change_max": 0.0, + "reward_change_mean": -0.8238302320241928, + "reward_change_min": -1.355735719203949, + "reward_change_std": 0.5216735042631626, + "reward_std": 0.9936597235500813, + "rewards/cosine_scaled_reward": 0.17086999164894223, + "rewards/format_reward": 0.8750000074505806, + "step": 418 + }, + { + "advantage_max": 1.910450629889965, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.8536994196474552, + "advantage_std": 1.012555181980133, + "completion_length": 2614.666748046875, + "epoch": 0.47885714285714287, + "grad_norm": 0.873965859413147, + "kl": 0.208770751953125, + "lambda_div_used": 0.5, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0198, + "reward": 0.08720649965107441, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08720649965107441, + "reward_after_std": 1.012555181980133, + "reward_before_mean": 0.6550634186714888, + "reward_before_std": 1.015907321125269, + "reward_change_max": 0.0020074471831321716, + "reward_change_mean": -0.5678569041192532, + "reward_change_min": -1.0999210849404335, + "reward_change_std": 0.4567646738141775, + "reward_std": 1.012555219233036, + "rewards/cosine_scaled_reward": -0.037051646038889885, + "rewards/format_reward": 0.7291666828095913, + "step": 419 + }, + { + "advantage_max": 1.4565354362130165, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.7246251739561558, + "advantage_std": 0.7659836560487747, + "completion_length": 2339.833381652832, + "epoch": 0.48, + "grad_norm": 0.440544456243515, + "kl": 0.2689208984375, + "lambda_div_used": 0.5, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0333, + "reward": 0.05484509962843731, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05484509962843731, + "reward_after_std": 0.7659836485981941, + "reward_before_mean": 0.6696557952091098, + "reward_before_std": 0.6934031620621681, + "reward_change_max": 0.0011351853609085083, + "reward_change_mean": -0.6148106604814529, + "reward_change_min": -1.0451572611927986, + "reward_change_std": 0.4126305319368839, + "reward_std": 0.7659836895763874, + "rewards/cosine_scaled_reward": -0.0818388033658266, + "rewards/format_reward": 0.833333358168602, + "step": 420 + }, + { + "advantage_max": 1.380041942000389, + "advantage_mean": 4.346172144398253e-09, + "advantage_min": -0.6508830934762955, + "advantage_std": 0.7286509647965431, + "completion_length": 3022.604248046875, + "epoch": 0.48114285714285715, + "grad_norm": 0.36839476227760315, + "kl": 0.2734375, + "lambda_div_used": 0.5, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0201, + "reward": -0.10270002530887723, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10270002530887723, + "reward_after_std": 0.7286509685218334, + "reward_before_mean": 0.40078302239999175, + "reward_before_std": 0.6932071186602116, + "reward_change_max": 0.0006034299731254578, + "reward_change_mean": -0.5034830346703529, + "reward_change_min": -0.9662024602293968, + "reward_change_std": 0.36624561436474323, + "reward_std": 0.728650975972414, + "rewards/cosine_scaled_reward": -0.18502516951411963, + "rewards/format_reward": 0.7708333544433117, + "step": 421 + }, + { + "advantage_max": 1.4664915353059769, + "advantage_mean": -1.2417635808503746e-09, + "advantage_min": -0.6892965100705624, + "advantage_std": 0.7829396314918995, + "completion_length": 2686.6042098999023, + "epoch": 0.48228571428571426, + "grad_norm": 0.6950385570526123, + "kl": 0.24615478515625, + "lambda_div_used": 0.5, + "learning_rate": 1.6508608292777203e-07, + "loss": -0.0204, + "reward": -0.0029990673065185547, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0029990673065185547, + "reward_after_std": 0.782939612865448, + "reward_before_mean": 0.5685033015906811, + "reward_before_std": 0.7512042485177517, + "reward_change_max": 0.0011105537414550781, + "reward_change_mean": -0.5715023390948772, + "reward_change_min": -1.053415346890688, + "reward_change_std": 0.41840689815580845, + "reward_std": 0.7829396203160286, + "rewards/cosine_scaled_reward": -0.1115816955716582, + "rewards/format_reward": 0.7916666902601719, + "step": 422 + }, + { + "advantage_max": 1.5644587278366089, + "advantage_mean": 6.829699028543246e-09, + "advantage_min": -0.6998874768614769, + "advantage_std": 0.8275195769965649, + "completion_length": 2256.145881652832, + "epoch": 0.48342857142857143, + "grad_norm": 0.4108107388019562, + "kl": 0.2389373779296875, + "lambda_div_used": 0.5, + "learning_rate": 1.6346804638120098e-07, + "loss": -0.0092, + "reward": -0.020866421051323414, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.020866421051323414, + "reward_after_std": 0.8275195844471455, + "reward_before_mean": 0.5108129326254129, + "reward_before_std": 0.8039413914084435, + "reward_change_max": 0.0, + "reward_change_mean": -0.5316793769598007, + "reward_change_min": -1.0674263015389442, + "reward_change_std": 0.40754328295588493, + "reward_std": 0.8275196105241776, + "rewards/cosine_scaled_reward": -0.0987602099776268, + "rewards/format_reward": 0.7083333469927311, + "step": 423 + }, + { + "advantage_max": 1.4423941150307655, + "advantage_mean": 8.071462442860167e-09, + "advantage_min": -0.562180645763874, + "advantage_std": 0.7390119582414627, + "completion_length": 2725.2084045410156, + "epoch": 0.4845714285714286, + "grad_norm": 0.6571977734565735, + "kl": 0.2581787109375, + "lambda_div_used": 0.5, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0731, + "reward": -0.05772208608686924, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05772208608686924, + "reward_after_std": 0.7390119433403015, + "reward_before_mean": 0.4672268598806113, + "reward_before_std": 0.6280698869377375, + "reward_change_max": 0.0, + "reward_change_mean": -0.5249489285051823, + "reward_change_min": -0.8993471413850784, + "reward_change_std": 0.3353810776025057, + "reward_std": 0.7390119507908821, + "rewards/cosine_scaled_reward": -0.13096991274505854, + "rewards/format_reward": 0.7291666809469461, + "step": 424 + }, + { + "advantage_max": 1.8298065513372421, + "advantage_mean": 4.967054101356894e-09, + "advantage_min": -0.79290621727705, + "advantage_std": 0.9439056888222694, + "completion_length": 2086.1250534057617, + "epoch": 0.4857142857142857, + "grad_norm": 0.7611392140388489, + "kl": 0.239532470703125, + "lambda_div_used": 0.5, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.0603, + "reward": 0.516562958873692, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.516562958873692, + "reward_after_std": 0.9439056888222694, + "reward_before_mean": 1.4521526768803596, + "reward_before_std": 0.7164515964686871, + "reward_change_max": 0.00011243671178817749, + "reward_change_mean": -0.935589674860239, + "reward_change_min": -1.415509656071663, + "reward_change_std": 0.5498905442655087, + "reward_std": 0.9439057037234306, + "rewards/cosine_scaled_reward": 0.33024298259988427, + "rewards/format_reward": 0.7916666828095913, + "step": 425 + }, + { + "advantage_max": 1.8577021807432175, + "advantage_mean": 9.313225912688239e-09, + "advantage_min": -0.7165437117218971, + "advantage_std": 0.9620238281786442, + "completion_length": 1966.6458892822266, + "epoch": 0.4868571428571429, + "grad_norm": 0.23527927696704865, + "kl": 0.141265869140625, + "lambda_div_used": 0.5, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.004, + "reward": 0.2146987458691001, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2146987458691001, + "reward_after_std": 0.9620238430798054, + "reward_before_mean": 0.8918719813227654, + "reward_before_std": 0.8488549254834652, + "reward_change_max": 0.0008971467614173889, + "reward_change_mean": -0.6771732289344072, + "reward_change_min": -1.1791602671146393, + "reward_change_std": 0.4466224256902933, + "reward_std": 0.962023850530386, + "rewards/cosine_scaled_reward": 0.018852660432457924, + "rewards/format_reward": 0.8541666679084301, + "step": 426 + }, + { + "advantage_max": 1.620961919426918, + "advantage_mean": -5.58793539218172e-09, + "advantage_min": -0.7730313017964363, + "advantage_std": 0.8482746072113514, + "completion_length": 2883.041748046875, + "epoch": 0.488, + "grad_norm": 0.36235061287879944, + "kl": 0.2977294921875, + "lambda_div_used": 0.5, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0417, + "reward": 0.17976178135722876, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17976178135722876, + "reward_after_std": 0.8482745960354805, + "reward_before_mean": 0.8755260724574327, + "reward_before_std": 0.7476286813616753, + "reward_change_max": 0.0008819624781608582, + "reward_change_mean": -0.6957642920315266, + "reward_change_min": -1.1062694638967514, + "reward_change_std": 0.4417474362999201, + "reward_std": 0.8482746034860611, + "rewards/cosine_scaled_reward": 0.031513024296145886, + "rewards/format_reward": 0.812500013038516, + "step": 427 + }, + { + "advantage_max": 1.90454863011837, + "advantage_mean": -9.313225690643634e-09, + "advantage_min": -0.8171779625117779, + "advantage_std": 1.0046098306775093, + "completion_length": 2616.3125610351562, + "epoch": 0.48914285714285716, + "grad_norm": 0.5639275312423706, + "kl": 0.276397705078125, + "lambda_div_used": 0.5, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0197, + "reward": 0.23964783176779747, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23964783176779747, + "reward_after_std": 1.0046098455786705, + "reward_before_mean": 0.9390165340155363, + "reward_before_std": 0.9415052011609077, + "reward_change_max": 0.0, + "reward_change_mean": -0.6993687264621258, + "reward_change_min": -1.348164975643158, + "reward_change_std": 0.4951324574649334, + "reward_std": 1.0046098679304123, + "rewards/cosine_scaled_reward": 0.04242492467164993, + "rewards/format_reward": 0.8541666939854622, + "step": 428 + }, + { + "advantage_max": 1.7529038935899734, + "advantage_mean": 1.1175871561519557e-08, + "advantage_min": -0.6102720461785793, + "advantage_std": 0.8972237780690193, + "completion_length": 2280.5000762939453, + "epoch": 0.49028571428571427, + "grad_norm": 0.29874134063720703, + "kl": 0.1969451904296875, + "lambda_div_used": 0.5, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0362, + "reward": 0.0367011446505785, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0367011446505785, + "reward_after_std": 0.8972237929701805, + "reward_before_mean": 0.5863402839750051, + "reward_before_std": 0.7750564068555832, + "reward_change_max": 0.0, + "reward_change_mean": -0.549639143049717, + "reward_change_min": -1.0111135095357895, + "reward_change_std": 0.3604156728833914, + "reward_std": 0.8972238451242447, + "rewards/cosine_scaled_reward": -0.09224652778357267, + "rewards/format_reward": 0.770833345130086, + "step": 429 + }, + { + "advantage_max": 1.4815399199724197, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.5698779486119747, + "advantage_std": 0.7527001649141312, + "completion_length": 2175.6459197998047, + "epoch": 0.49142857142857144, + "grad_norm": 0.29431992769241333, + "kl": 0.1795654296875, + "lambda_div_used": 0.5, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0208, + "reward": 0.08262787573039532, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08262787573039532, + "reward_after_std": 0.7527001649141312, + "reward_before_mean": 0.7200266793370247, + "reward_before_std": 0.5634596981108189, + "reward_change_max": 0.0, + "reward_change_mean": -0.6373988222330809, + "reward_change_min": -0.9826225861907005, + "reward_change_std": 0.38093653693795204, + "reward_std": 0.7527001947164536, + "rewards/cosine_scaled_reward": 0.016263334080576897, + "rewards/format_reward": 0.6875000018626451, + "step": 430 + }, + { + "advantage_max": 1.316926158964634, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.6289765909314156, + "advantage_std": 0.7088558524847031, + "completion_length": 2656.604202270508, + "epoch": 0.49257142857142855, + "grad_norm": 0.37576258182525635, + "kl": 0.30914306640625, + "lambda_div_used": 0.5, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0404, + "reward": -0.039196502417325974, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.039196502417325974, + "reward_after_std": 0.7088558524847031, + "reward_before_mean": 0.5267140232026577, + "reward_before_std": 0.6843018792569637, + "reward_change_max": 0.0008359923958778381, + "reward_change_mean": -0.5659105181694031, + "reward_change_min": -1.0442478582262993, + "reward_change_std": 0.4136344399303198, + "reward_std": 0.7088558599352837, + "rewards/cosine_scaled_reward": -0.12205966003239155, + "rewards/format_reward": 0.7708333469927311, + "step": 431 + }, + { + "advantage_max": 1.4318501353263855, + "advantage_mean": -1.179675312990014e-08, + "advantage_min": -0.808821115642786, + "advantage_std": 0.7790350094437599, + "completion_length": 2498.979217529297, + "epoch": 0.4937142857142857, + "grad_norm": 0.3581421375274658, + "kl": 0.3209228515625, + "lambda_div_used": 0.5, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0232, + "reward": -0.008112411946058273, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.008112411946058273, + "reward_after_std": 0.7790350392460823, + "reward_before_mean": 0.5635525565594435, + "reward_before_std": 0.7991535924375057, + "reward_change_max": 0.0005855560302734375, + "reward_change_mean": -0.5716649480164051, + "reward_change_min": -1.0400940477848053, + "reward_change_std": 0.4370091240853071, + "reward_std": 0.7790350429713726, + "rewards/cosine_scaled_reward": -0.05155707709491253, + "rewards/format_reward": 0.6666666939854622, + "step": 432 + }, + { + "advantage_max": 1.39846720546484, + "advantage_mean": 1.2417631367611648e-09, + "advantage_min": -0.6483557894825935, + "advantage_std": 0.7378144934773445, + "completion_length": 2703.9167098999023, + "epoch": 0.4948571428571429, + "grad_norm": 0.4320548176765442, + "kl": 0.274993896484375, + "lambda_div_used": 0.5, + "learning_rate": 1.483363816965435e-07, + "loss": 0.004, + "reward": 0.14599608164280653, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14599608164280653, + "reward_after_std": 0.7378145009279251, + "reward_before_mean": 0.8464012066833675, + "reward_before_std": 0.6108515709638596, + "reward_change_max": 0.0, + "reward_change_mean": -0.70040512830019, + "reward_change_min": -1.126664161682129, + "reward_change_std": 0.4516938291490078, + "reward_std": 0.7378145381808281, + "rewards/cosine_scaled_reward": 0.03778391517698765, + "rewards/format_reward": 0.7708333507180214, + "step": 433 + }, + { + "advantage_max": 1.0967141687870026, + "advantage_mean": 1.3348957383918503e-08, + "advantage_min": -0.5869733244180679, + "advantage_std": 0.5921955332159996, + "completion_length": 2893.7500610351562, + "epoch": 0.496, + "grad_norm": 0.4491467773914337, + "kl": 0.31512451171875, + "lambda_div_used": 0.5, + "learning_rate": 1.469297078922642e-07, + "loss": 0.023, + "reward": -0.17803913820534945, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17803913820534945, + "reward_after_std": 0.5921955332159996, + "reward_before_mean": 0.3083352339453995, + "reward_before_std": 0.5735643208026886, + "reward_change_max": 0.0010740086436271667, + "reward_change_mean": -0.4863743484020233, + "reward_change_min": -0.881128154695034, + "reward_change_std": 0.34586321376264095, + "reward_std": 0.5921955406665802, + "rewards/cosine_scaled_reward": -0.23124905675649643, + "rewards/format_reward": 0.770833358168602, + "step": 434 + }, + { + "advantage_max": 1.555536113679409, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.6525837108492851, + "advantage_std": 0.8070865571498871, + "completion_length": 2240.395866394043, + "epoch": 0.49714285714285716, + "grad_norm": 0.6314480900764465, + "kl": 0.2317047119140625, + "lambda_div_used": 0.5, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0486, + "reward": 0.04277882166206837, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04277882166206837, + "reward_after_std": 0.8070865720510483, + "reward_before_mean": 0.6320224069058895, + "reward_before_std": 0.7188326716423035, + "reward_change_max": 0.002196013927459717, + "reward_change_mean": -0.5892436020076275, + "reward_change_min": -1.0437349304556847, + "reward_change_std": 0.39640600606799126, + "reward_std": 0.8070865944027901, + "rewards/cosine_scaled_reward": -0.09023878816515207, + "rewards/format_reward": 0.8125000111758709, + "step": 435 + }, + { + "advantage_max": 1.803834691643715, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.9135415107011795, + "advantage_std": 0.9677546098828316, + "completion_length": 2222.687568664551, + "epoch": 0.4982857142857143, + "grad_norm": 0.7510201334953308, + "kl": 0.2289886474609375, + "lambda_div_used": 0.5, + "learning_rate": 1.4417536311769885e-07, + "loss": -0.0192, + "reward": 0.2787493225187063, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2787493225187063, + "reward_after_std": 0.967754602432251, + "reward_before_mean": 1.0235793087631464, + "reward_before_std": 0.9413213469088078, + "reward_change_max": 0.001415349543094635, + "reward_change_mean": -0.7448300048708916, + "reward_change_min": -1.3619326502084732, + "reward_change_std": 0.5367412976920605, + "reward_std": 0.9677546247839928, + "rewards/cosine_scaled_reward": 0.10553962551057339, + "rewards/format_reward": 0.8125000111758709, + "step": 436 + }, + { + "advantage_max": 1.3153732120990753, + "advantage_mean": -1.241763464276957e-08, + "advantage_min": -0.5812518484890461, + "advantage_std": 0.6810687147080898, + "completion_length": 2788.8126220703125, + "epoch": 0.49942857142857144, + "grad_norm": 0.6309353709220886, + "kl": 0.2413330078125, + "lambda_div_used": 0.5, + "learning_rate": 1.4282782639029128e-07, + "loss": -0.0033, + "reward": -0.02365095540881157, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02365095540881157, + "reward_after_std": 0.681068692356348, + "reward_before_mean": 0.555051636591088, + "reward_before_std": 0.5716553665697575, + "reward_change_max": 0.0, + "reward_change_mean": -0.5787026062607765, + "reward_change_min": -0.9294457882642746, + "reward_change_std": 0.3546513319015503, + "reward_std": 0.6810687109827995, + "rewards/cosine_scaled_reward": -0.14955753087997437, + "rewards/format_reward": 0.854166679084301, + "step": 437 + }, + { + "advantage_max": 1.6681238859891891, + "advantage_mean": 0.0, + "advantage_min": -0.6987838484346867, + "advantage_std": 0.8795321509242058, + "completion_length": 2744.104248046875, + "epoch": 0.5005714285714286, + "grad_norm": 0.7771314382553101, + "kl": 0.3111572265625, + "lambda_div_used": 0.5, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.035, + "reward": 0.0005856994539499283, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0005856994539499283, + "reward_after_std": 0.8795321434736252, + "reward_before_mean": 0.5383487353101373, + "reward_before_std": 0.8421776629984379, + "reward_change_max": 0.0006900280714035034, + "reward_change_mean": -0.5377630740404129, + "reward_change_min": -1.0730514042079449, + "reward_change_std": 0.4223514683544636, + "reward_std": 0.87953220307827, + "rewards/cosine_scaled_reward": -0.0745756197720766, + "rewards/format_reward": 0.6875000093132257, + "step": 438 + }, + { + "advantage_max": 1.3797827512025833, + "advantage_mean": -2.483526884144993e-08, + "advantage_min": -0.6489520594477654, + "advantage_std": 0.7218637317419052, + "completion_length": 2213.416717529297, + "epoch": 0.5017142857142857, + "grad_norm": 0.3220473527908325, + "kl": 0.212005615234375, + "lambda_div_used": 0.5, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0013, + "reward": -0.04800033336505294, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.04800033336505294, + "reward_after_std": 0.7218637317419052, + "reward_before_mean": 0.49656740692444146, + "reward_before_std": 0.6511244103312492, + "reward_change_max": 0.0005797073245048523, + "reward_change_mean": -0.5445677675306797, + "reward_change_min": -0.9406459517776966, + "reward_change_std": 0.3685037661343813, + "reward_std": 0.7218637466430664, + "rewards/cosine_scaled_reward": -0.12671629822580144, + "rewards/format_reward": 0.750000013038516, + "step": 439 + }, + { + "advantage_max": 1.1950260624289513, + "advantage_mean": 1.1796752796833232e-08, + "advantage_min": -0.5321464128792286, + "advantage_std": 0.6297563761472702, + "completion_length": 2962.5833892822266, + "epoch": 0.5028571428571429, + "grad_norm": 0.6554329991340637, + "kl": 0.315338134765625, + "lambda_div_used": 0.5, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0261, + "reward": -0.10440824367105961, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10440824367105961, + "reward_after_std": 0.6297563761472702, + "reward_before_mean": 0.42778288945555687, + "reward_before_std": 0.5683407075703144, + "reward_change_max": 0.0014156848192214966, + "reward_change_mean": -0.5321911424398422, + "reward_change_min": -0.9416075497865677, + "reward_change_std": 0.35448674857616425, + "reward_std": 0.6297563910484314, + "rewards/cosine_scaled_reward": -0.18194188922643661, + "rewards/format_reward": 0.7916666753590107, + "step": 440 + }, + { + "advantage_max": 1.675834745168686, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.8089264556765556, + "advantage_std": 0.8974254056811333, + "completion_length": 2361.8750915527344, + "epoch": 0.504, + "grad_norm": 0.7889370918273926, + "kl": 0.27734375, + "lambda_div_used": 0.5, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0561, + "reward": 0.04729715920984745, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.04729715920984745, + "reward_after_std": 0.8974253907799721, + "reward_before_mean": 0.6199708860367537, + "reward_before_std": 0.8931392878293991, + "reward_change_max": 0.0019818097352981567, + "reward_change_mean": -0.5726737715303898, + "reward_change_min": -1.117475550621748, + "reward_change_std": 0.44693936966359615, + "reward_std": 0.8974254131317139, + "rewards/cosine_scaled_reward": -0.0650145672261715, + "rewards/format_reward": 0.7500000223517418, + "step": 441 + }, + { + "advantage_max": 1.7124353647232056, + "advantage_mean": -1.2417634753791873e-08, + "advantage_min": -0.7255078367888927, + "advantage_std": 0.90172154083848, + "completion_length": 2513.104232788086, + "epoch": 0.5051428571428571, + "grad_norm": 0.49326351284980774, + "kl": 0.217529296875, + "lambda_div_used": 0.5, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0213, + "reward": 0.16501678587519564, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16501678587519564, + "reward_after_std": 0.9017215259373188, + "reward_before_mean": 0.8267796207219362, + "reward_before_std": 0.8075233958661556, + "reward_change_max": 0.0017507299780845642, + "reward_change_mean": -0.6617628708481789, + "reward_change_min": -1.2036407738924026, + "reward_change_std": 0.4718140196055174, + "reward_std": 0.9017215520143509, + "rewards/cosine_scaled_reward": 0.03838980197906494, + "rewards/format_reward": 0.7500000111758709, + "step": 442 + }, + { + "advantage_max": 1.2792986631393433, + "advantage_mean": 1.4280279847511679e-08, + "advantage_min": -0.6353005319833755, + "advantage_std": 0.6750058308243752, + "completion_length": 2782.416763305664, + "epoch": 0.5062857142857143, + "grad_norm": 0.4678073823451996, + "kl": 0.3636474609375, + "lambda_div_used": 0.5, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0335, + "reward": -0.15951000433415174, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.15951000433415174, + "reward_after_std": 0.6750058382749557, + "reward_before_mean": 0.30732602812349796, + "reward_before_std": 0.6360847689211369, + "reward_change_max": 0.001194782555103302, + "reward_change_mean": -0.46683603525161743, + "reward_change_min": -0.7657738365232944, + "reward_change_std": 0.3201506529003382, + "reward_std": 0.6750058494508266, + "rewards/cosine_scaled_reward": -0.17967032874003053, + "rewards/format_reward": 0.6666666883975267, + "step": 443 + }, + { + "advantage_max": 1.3639702945947647, + "advantage_mean": -3.1044091186593903e-09, + "advantage_min": -0.7017063722014427, + "advantage_std": 0.7363520376384258, + "completion_length": 2507.6459045410156, + "epoch": 0.5074285714285715, + "grad_norm": 0.3171975314617157, + "kl": 0.2984619140625, + "lambda_div_used": 0.5, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0351, + "reward": -0.00795650389045477, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.00795650389045477, + "reward_after_std": 0.7363520339131355, + "reward_before_mean": 0.574550624784024, + "reward_before_std": 0.7271179482340813, + "reward_change_max": 0.0, + "reward_change_mean": -0.5825071297585964, + "reward_change_min": -1.123153805732727, + "reward_change_std": 0.4219965375959873, + "reward_std": 0.7363520637154579, + "rewards/cosine_scaled_reward": -0.09814136102795601, + "rewards/format_reward": 0.7708333544433117, + "step": 444 + }, + { + "advantage_max": 1.7807418704032898, + "advantage_mean": -1.2417634698280722e-08, + "advantage_min": -0.8873736076056957, + "advantage_std": 0.9604065492749214, + "completion_length": 2646.0209045410156, + "epoch": 0.5085714285714286, + "grad_norm": 0.555528461933136, + "kl": 0.234771728515625, + "lambda_div_used": 0.5, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.006, + "reward": 0.1469159945845604, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1469159945845604, + "reward_after_std": 0.9604065716266632, + "reward_before_mean": 0.7886058418080211, + "reward_before_std": 0.974107701331377, + "reward_change_max": 0.0008414238691329956, + "reward_change_mean": -0.641689844429493, + "reward_change_min": -1.2327545881271362, + "reward_change_std": 0.49856342375278473, + "reward_std": 0.9604065902531147, + "rewards/cosine_scaled_reward": -0.0223637567833066, + "rewards/format_reward": 0.8333333507180214, + "step": 445 + }, + { + "advantage_max": 1.600982904434204, + "advantage_mean": 2.483526884144993e-09, + "advantage_min": -0.7412622272968292, + "advantage_std": 0.8439879864454269, + "completion_length": 2478.3750915527344, + "epoch": 0.5097142857142857, + "grad_norm": 1.7437587976455688, + "kl": 0.28363037109375, + "lambda_div_used": 0.5, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0567, + "reward": 0.06024339620489627, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06024339620489627, + "reward_after_std": 0.8439880236983299, + "reward_before_mean": 0.6513098394498229, + "reward_before_std": 0.7912632301449776, + "reward_change_max": 0.0, + "reward_change_mean": -0.5910664387047291, + "reward_change_min": -1.0746823698282242, + "reward_change_std": 0.4154123105108738, + "reward_std": 0.8439880311489105, + "rewards/cosine_scaled_reward": -0.04934508353471756, + "rewards/format_reward": 0.7500000111758709, + "step": 446 + }, + { + "advantage_max": 1.665705218911171, + "advantage_mean": 4.967053546245381e-09, + "advantage_min": -0.7389098927378654, + "advantage_std": 0.887108825147152, + "completion_length": 2225.3334045410156, + "epoch": 0.5108571428571429, + "grad_norm": 0.531279444694519, + "kl": 0.19537353515625, + "lambda_div_used": 0.5, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0211, + "reward": 0.08284095581620932, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08284095581620932, + "reward_after_std": 0.8871088400483131, + "reward_before_mean": 0.6888705305755138, + "reward_before_std": 0.8519846573472023, + "reward_change_max": 0.0006781443953514099, + "reward_change_mean": -0.6060296073555946, + "reward_change_min": -1.178616851568222, + "reward_change_std": 0.4650719538331032, + "reward_std": 0.8871088586747646, + "rewards/cosine_scaled_reward": -0.040981391444802284, + "rewards/format_reward": 0.7708333432674408, + "step": 447 + }, + { + "advantage_max": 1.5247330516576767, + "advantage_mean": -1.2728075537982164e-08, + "advantage_min": -0.6495399959385395, + "advantage_std": 0.8035099133849144, + "completion_length": 2038.2083740234375, + "epoch": 0.512, + "grad_norm": 0.3488284945487976, + "kl": 0.244049072265625, + "lambda_div_used": 0.5, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0037, + "reward": 0.11860301904380322, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11860301904380322, + "reward_after_std": 0.803509958088398, + "reward_before_mean": 0.7717763539403677, + "reward_before_std": 0.7233231142163277, + "reward_change_max": 0.0, + "reward_change_mean": -0.6531733274459839, + "reward_change_min": -1.1682813242077827, + "reward_change_std": 0.4387435019016266, + "reward_std": 0.8035099804401398, + "rewards/cosine_scaled_reward": -0.02036183699965477, + "rewards/format_reward": 0.8125000074505806, + "step": 448 + }, + { + "advantage_max": 1.645022451877594, + "advantage_mean": -6.208817182606907e-09, + "advantage_min": -0.6213822662830353, + "advantage_std": 0.8313011080026627, + "completion_length": 2265.312530517578, + "epoch": 0.5131428571428571, + "grad_norm": 0.40414363145828247, + "kl": 0.171142578125, + "lambda_div_used": 0.5, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0142, + "reward": 0.1660279119387269, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1660279119387269, + "reward_after_std": 0.8313010893762112, + "reward_before_mean": 0.8451403537765145, + "reward_before_std": 0.6286548934876919, + "reward_change_max": 0.0, + "reward_change_mean": -0.6791124641895294, + "reward_change_min": -1.0571254119277, + "reward_change_std": 0.3865590952336788, + "reward_std": 0.8313011229038239, + "rewards/cosine_scaled_reward": -0.01492983102798462, + "rewards/format_reward": 0.8750000055879354, + "step": 449 + }, + { + "advantage_max": 1.3215680569410324, + "advantage_mean": -1.7384688799637615e-08, + "advantage_min": -0.6460866183042526, + "advantage_std": 0.6979735009372234, + "completion_length": 2327.875045776367, + "epoch": 0.5142857142857142, + "grad_norm": 0.342936247587204, + "kl": 0.1763763427734375, + "lambda_div_used": 0.5, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0263, + "reward": 0.08694206736981869, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08694206736981869, + "reward_after_std": 0.6979734934866428, + "reward_before_mean": 0.7526519640232436, + "reward_before_std": 0.5924910455942154, + "reward_change_max": 0.00038858503103256226, + "reward_change_mean": -0.665709912776947, + "reward_change_min": -1.07553119212389, + "reward_change_std": 0.42237728647887707, + "reward_std": 0.6979735009372234, + "rewards/cosine_scaled_reward": -0.029924023896455765, + "rewards/format_reward": 0.8125000223517418, + "step": 450 + }, + { + "advantage_max": 1.5565338879823685, + "advantage_mean": 8.381903282561609e-09, + "advantage_min": -0.6266820058226585, + "advantage_std": 0.8044048026204109, + "completion_length": 2260.375030517578, + "epoch": 0.5154285714285715, + "grad_norm": 0.5136589407920837, + "kl": 0.171142578125, + "lambda_div_used": 0.5, + "learning_rate": 1.260741462457165e-07, + "loss": -0.0038, + "reward": 0.22477071173489094, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22477071173489094, + "reward_after_std": 0.8044047877192497, + "reward_before_mean": 0.9543191902339458, + "reward_before_std": 0.6328833512961864, + "reward_change_max": 0.0, + "reward_change_mean": -0.729548454284668, + "reward_change_min": -1.1411267966032028, + "reward_change_std": 0.44372114166617393, + "reward_std": 0.8044048100709915, + "rewards/cosine_scaled_reward": 0.09174292162060738, + "rewards/format_reward": 0.7708333414047956, + "step": 451 + }, + { + "advantage_max": 1.6044269427657127, + "advantage_mean": -1.2417629147165599e-09, + "advantage_min": -0.6126443706452847, + "advantage_std": 0.8330548368394375, + "completion_length": 2954.604217529297, + "epoch": 0.5165714285714286, + "grad_norm": 0.5223681330680847, + "kl": 0.2507781982421875, + "lambda_div_used": 0.5, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0356, + "reward": -0.0031112791039049625, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0031112791039049625, + "reward_after_std": 0.8330548293888569, + "reward_before_mean": 0.5448109768331051, + "reward_before_std": 0.7421528398990631, + "reward_change_max": 0.000568874180316925, + "reward_change_mean": -0.547922252677381, + "reward_change_min": -0.9984243176877499, + "reward_change_std": 0.38183396589010954, + "reward_std": 0.8330548629164696, + "rewards/cosine_scaled_reward": 0.0015721451491117477, + "rewards/format_reward": 0.5416666753590107, + "step": 452 + }, + { + "advantage_max": 1.6805711835622787, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.7923614680767059, + "advantage_std": 0.8882839158177376, + "completion_length": 2572.4792098999023, + "epoch": 0.5177142857142857, + "grad_norm": 0.5597829222679138, + "kl": 0.3200225830078125, + "lambda_div_used": 0.5, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0241, + "reward": 0.1551264775916934, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1551264775916934, + "reward_after_std": 0.8882838860154152, + "reward_before_mean": 0.8060325691476464, + "reward_before_std": 0.8305098079144955, + "reward_change_max": 0.0, + "reward_change_mean": -0.6509060971438885, + "reward_change_min": -1.1471052765846252, + "reward_change_std": 0.44953753612935543, + "reward_std": 0.8882839009165764, + "rewards/cosine_scaled_reward": 0.007182922679930925, + "rewards/format_reward": 0.7916666828095913, + "step": 453 + }, + { + "advantage_max": 1.5762149766087532, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.7216840162873268, + "advantage_std": 0.8445982038974762, + "completion_length": 2720.687530517578, + "epoch": 0.5188571428571429, + "grad_norm": 0.554466187953949, + "kl": 0.1890716552734375, + "lambda_div_used": 0.5, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0008, + "reward": -0.006341944914311171, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.006341944914311171, + "reward_after_std": 0.8445982038974762, + "reward_before_mean": 0.5473174899816513, + "reward_before_std": 0.8355911895632744, + "reward_change_max": 0.0, + "reward_change_mean": -0.5536594353616238, + "reward_change_min": -1.1242877095937729, + "reward_change_std": 0.42637988552451134, + "reward_std": 0.8445982374250889, + "rewards/cosine_scaled_reward": -0.07009126897901297, + "rewards/format_reward": 0.687500013038516, + "step": 454 + }, + { + "advantage_max": 1.4252081513404846, + "advantage_mean": 1.7384688355548406e-08, + "advantage_min": -0.6050524637103081, + "advantage_std": 0.7403261326253414, + "completion_length": 2962.2709045410156, + "epoch": 0.52, + "grad_norm": 0.8583621382713318, + "kl": 0.329345703125, + "lambda_div_used": 0.5, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0206, + "reward": -0.07628545328043401, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07628545328043401, + "reward_after_std": 0.7403261326253414, + "reward_before_mean": 0.44156032614409924, + "reward_before_std": 0.652463223785162, + "reward_change_max": 0.0005906671285629272, + "reward_change_mean": -0.517845768481493, + "reward_change_min": -0.9065490663051605, + "reward_change_std": 0.357125923037529, + "reward_std": 0.7403261587023735, + "rewards/cosine_scaled_reward": -0.15421984996646643, + "rewards/format_reward": 0.7500000149011612, + "step": 455 + }, + { + "advantage_max": 1.9315543174743652, + "advantage_mean": 6.829698862009792e-09, + "advantage_min": -0.6773689016699791, + "advantage_std": 0.979364488273859, + "completion_length": 2906.0416984558105, + "epoch": 0.5211428571428571, + "grad_norm": 1.3658193349838257, + "kl": 0.2467193603515625, + "lambda_div_used": 0.5, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0767, + "reward": 0.042131487280130386, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.042131487280130386, + "reward_after_std": 0.9793644994497299, + "reward_before_mean": 0.5710029774345458, + "reward_before_std": 0.8479274027049541, + "reward_change_max": 0.0, + "reward_change_mean": -0.5288714915513992, + "reward_change_min": -0.9232162311673164, + "reward_change_std": 0.35106181912124157, + "reward_std": 0.9793645069003105, + "rewards/cosine_scaled_reward": -0.08949852362275124, + "rewards/format_reward": 0.7500000111758709, + "step": 456 + }, + { + "advantage_max": 1.3629868924617767, + "advantage_mean": 1.3038516655239363e-08, + "advantage_min": -0.5921227112412453, + "advantage_std": 0.7106032706797123, + "completion_length": 2766.6458740234375, + "epoch": 0.5222857142857142, + "grad_norm": 0.4502284526824951, + "kl": 0.3090972900390625, + "lambda_div_used": 0.5, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0296, + "reward": 0.027612894773483276, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.027612894773483276, + "reward_after_std": 0.7106032818555832, + "reward_before_mean": 0.6371276685968041, + "reward_before_std": 0.605956356972456, + "reward_change_max": 0.0, + "reward_change_mean": -0.6095147393643856, + "reward_change_min": -1.0139854177832603, + "reward_change_std": 0.39748547598719597, + "reward_std": 0.7106032893061638, + "rewards/cosine_scaled_reward": 0.016480496153235435, + "rewards/format_reward": 0.6041666753590107, + "step": 457 + }, + { + "advantage_max": 1.346614234149456, + "advantage_mean": 1.428027990302283e-08, + "advantage_min": -0.5400624051690102, + "advantage_std": 0.7049011290073395, + "completion_length": 2300.4791946411133, + "epoch": 0.5234285714285715, + "grad_norm": 0.29569682478904724, + "kl": 0.207855224609375, + "lambda_div_used": 0.5, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.02, + "reward": -0.03890596283599734, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03890596283599734, + "reward_after_std": 0.7049011215567589, + "reward_before_mean": 0.517438106238842, + "reward_before_std": 0.6150476150214672, + "reward_change_max": 0.0, + "reward_change_mean": -0.5563440751284361, + "reward_change_min": -0.9435974843800068, + "reward_change_std": 0.3535274900496006, + "reward_std": 0.7049011215567589, + "rewards/cosine_scaled_reward": -0.13711428828537464, + "rewards/format_reward": 0.7916666679084301, + "step": 458 + }, + { + "advantage_max": 1.4628484919667244, + "advantage_mean": -4.346171977864799e-09, + "advantage_min": -0.65413873270154, + "advantage_std": 0.7680129557847977, + "completion_length": 2132.062545776367, + "epoch": 0.5245714285714286, + "grad_norm": 0.40495961904525757, + "kl": 0.19858551025390625, + "lambda_div_used": 0.5, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.011, + "reward": 0.19387144222855568, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19387144222855568, + "reward_after_std": 0.7680129408836365, + "reward_before_mean": 0.9214918408542871, + "reward_before_std": 0.6441125757992268, + "reward_change_max": 0.0, + "reward_change_mean": -0.7276204153895378, + "reward_change_min": -1.1330279782414436, + "reward_change_std": 0.4475998468697071, + "reward_std": 0.7680129408836365, + "rewards/cosine_scaled_reward": 0.03366258554160595, + "rewards/format_reward": 0.8541666828095913, + "step": 459 + }, + { + "advantage_max": 1.3919820860028267, + "advantage_mean": 4.6566129563441194e-09, + "advantage_min": -0.6502477303147316, + "advantage_std": 0.7406989708542824, + "completion_length": 3168.9375610351562, + "epoch": 0.5257142857142857, + "grad_norm": 0.6914077401161194, + "kl": 0.31787109375, + "lambda_div_used": 0.5, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.047, + "reward": -0.1056380441877991, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1056380441877991, + "reward_after_std": 0.7406989708542824, + "reward_before_mean": 0.3909294670447707, + "reward_before_std": 0.7229253053665161, + "reward_change_max": 0.0012809410691261292, + "reward_change_mean": -0.49656750820577145, + "reward_change_min": -1.0154965370893478, + "reward_change_std": 0.3839583992958069, + "reward_std": 0.7406989932060242, + "rewards/cosine_scaled_reward": -0.18995194137096405, + "rewards/format_reward": 0.7708333544433117, + "step": 460 + }, + { + "advantage_max": 1.7813145220279694, + "advantage_mean": -8.071462387349015e-09, + "advantage_min": -0.9457485042512417, + "advantage_std": 0.96094960719347, + "completion_length": 2815.604263305664, + "epoch": 0.5268571428571428, + "grad_norm": 0.6786468625068665, + "kl": 0.2430419921875, + "lambda_div_used": 0.5, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0144, + "reward": 0.20053629763424397, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20053629763424397, + "reward_after_std": 0.9609495922923088, + "reward_before_mean": 0.8884876975789666, + "reward_before_std": 0.9701001457870007, + "reward_change_max": 0.0006676092743873596, + "reward_change_mean": -0.6879513971507549, + "reward_change_min": -1.2457620240747929, + "reward_change_std": 0.516149502247572, + "reward_std": 0.9609496183693409, + "rewards/cosine_scaled_reward": 0.1004938306286931, + "rewards/format_reward": 0.6875000111758709, + "step": 461 + }, + { + "advantage_max": 1.4213928058743477, + "advantage_mean": 6.4222452911266714e-09, + "advantage_min": -0.5725329555571079, + "advantage_std": 0.7395026087760925, + "completion_length": 2614.8125610351562, + "epoch": 0.528, + "grad_norm": 0.7102219462394714, + "kl": 0.2266845703125, + "lambda_div_used": 0.5, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0127, + "reward": -0.033994670637184754, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.033994670637184754, + "reward_after_std": 0.7395026162266731, + "reward_before_mean": 0.5079695815220475, + "reward_before_std": 0.6631190590560436, + "reward_change_max": 0.0, + "reward_change_mean": -0.5419642440974712, + "reward_change_min": -0.986947949975729, + "reward_change_std": 0.36053442023694515, + "reward_std": 0.7395026385784149, + "rewards/cosine_scaled_reward": -0.11059854784980416, + "rewards/format_reward": 0.7291666734963655, + "step": 462 + }, + { + "advantage_max": 2.1029365062713623, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.8385254740715027, + "advantage_std": 1.095969557762146, + "completion_length": 2805.3333740234375, + "epoch": 0.5291428571428571, + "grad_norm": 1.0408904552459717, + "kl": 0.3365631103515625, + "lambda_div_used": 0.5, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0304, + "reward": 0.1863800287246704, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1863800287246704, + "reward_after_std": 1.0959695428609848, + "reward_before_mean": 0.7977765635587275, + "reward_before_std": 1.050209902226925, + "reward_change_max": 0.0, + "reward_change_mean": -0.6113965678960085, + "reward_change_min": -1.2300319075584412, + "reward_change_std": 0.4674637410789728, + "reward_std": 1.0959695726633072, + "rewards/cosine_scaled_reward": 0.055138289637397975, + "rewards/format_reward": 0.6875000018626451, + "step": 463 + }, + { + "advantage_max": 1.4044927433133125, + "advantage_mean": -8.692344233285354e-09, + "advantage_min": -0.5210793204605579, + "advantage_std": 0.7161996513605118, + "completion_length": 1916.3125381469727, + "epoch": 0.5302857142857142, + "grad_norm": 0.8636742830276489, + "kl": 0.178009033203125, + "lambda_div_used": 0.5, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0443, + "reward": 0.20765853859484196, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20765853859484196, + "reward_after_std": 0.7161996364593506, + "reward_before_mean": 0.9627609495073557, + "reward_before_std": 0.48609965573996305, + "reward_change_max": 0.0006634891033172607, + "reward_change_mean": -0.7551024369895458, + "reward_change_min": -1.1348499171435833, + "reward_change_std": 0.43385453149676323, + "reward_std": 0.7161996439099312, + "rewards/cosine_scaled_reward": 0.054297154769301414, + "rewards/format_reward": 0.8541666772216558, + "step": 464 + }, + { + "advantage_max": 1.9507509917020798, + "advantage_mean": -4.967053768289986e-09, + "advantage_min": -0.887705996632576, + "advantage_std": 1.0343635454773903, + "completion_length": 2737.395881652832, + "epoch": 0.5314285714285715, + "grad_norm": 0.669562578201294, + "kl": 0.25555419921875, + "lambda_div_used": 0.5, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0124, + "reward": 0.14993727079126984, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14993727079126984, + "reward_after_std": 1.0343635752797127, + "reward_before_mean": 0.765050558373332, + "reward_before_std": 1.0290489830076694, + "reward_change_max": 0.0, + "reward_change_mean": -0.6151133142411709, + "reward_change_min": -1.1888877488672733, + "reward_change_std": 0.47263885475695133, + "reward_std": 1.0343635752797127, + "rewards/cosine_scaled_reward": 0.007525268010795116, + "rewards/format_reward": 0.7500000279396772, + "step": 465 + }, + { + "advantage_max": 1.6696715205907822, + "advantage_mean": -2.2351742234860694e-08, + "advantage_min": -0.6946392580866814, + "advantage_std": 0.8632129430770874, + "completion_length": 3091.541748046875, + "epoch": 0.5325714285714286, + "grad_norm": 1.2827321290969849, + "kl": 0.28610992431640625, + "lambda_div_used": 0.5, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0023, + "reward": 0.3170170905068517, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3170170905068517, + "reward_after_std": 0.8632129468023777, + "reward_before_mean": 1.1178719718009233, + "reward_before_std": 0.6960075739771128, + "reward_change_max": 0.0, + "reward_change_mean": -0.80085489153862, + "reward_change_min": -1.2640929967164993, + "reward_change_std": 0.4748579952865839, + "reward_std": 0.8632129579782486, + "rewards/cosine_scaled_reward": 0.1631026342511177, + "rewards/format_reward": 0.7916666716337204, + "step": 466 + }, + { + "advantage_max": 1.4761824756860733, + "advantage_mean": 1.3038516433194758e-08, + "advantage_min": -0.6751852855086327, + "advantage_std": 0.7853565439581871, + "completion_length": 2753.9375762939453, + "epoch": 0.5337142857142857, + "grad_norm": 0.511631965637207, + "kl": 0.2637786865234375, + "lambda_div_used": 0.5, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0275, + "reward": -0.0035535165516193956, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0035535165516193956, + "reward_after_std": 0.7853565439581871, + "reward_before_mean": 0.5619046837091446, + "reward_before_std": 0.744366992264986, + "reward_change_max": 0.000514104962348938, + "reward_change_mean": -0.5654581896960735, + "reward_change_min": -1.037429817020893, + "reward_change_std": 0.39547770842909813, + "reward_std": 0.7853565737605095, + "rewards/cosine_scaled_reward": -0.1565476767718792, + "rewards/format_reward": 0.8750000223517418, + "step": 467 + }, + { + "advantage_max": 1.8538229018449783, + "advantage_mean": -1.2417635808503746e-09, + "advantage_min": -0.7242231853306293, + "advantage_std": 0.9473971761763096, + "completion_length": 2780.291732788086, + "epoch": 0.5348571428571428, + "grad_norm": 0.6883957386016846, + "kl": 0.2642822265625, + "lambda_div_used": 0.5, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0303, + "reward": 0.10545292682945728, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10545292682945728, + "reward_after_std": 0.9473971761763096, + "reward_before_mean": 0.6927674971520901, + "reward_before_std": 0.8236076533794403, + "reward_change_max": 0.0, + "reward_change_mean": -0.5873145572841167, + "reward_change_min": -1.057724367827177, + "reward_change_std": 0.38849309273064137, + "reward_std": 0.9473971910774708, + "rewards/cosine_scaled_reward": -0.04944960871944204, + "rewards/format_reward": 0.7916666865348816, + "step": 468 + }, + { + "advantage_max": 1.807606466114521, + "advantage_mean": 0.0, + "advantage_min": -0.7852521277964115, + "advantage_std": 0.9635585993528366, + "completion_length": 2637.2083740234375, + "epoch": 0.536, + "grad_norm": 0.83745276927948, + "kl": 0.30572509765625, + "lambda_div_used": 0.5, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0347, + "reward": 0.08417265303432941, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08417265303432941, + "reward_after_std": 0.9635586142539978, + "reward_before_mean": 0.6696641687303782, + "reward_before_std": 0.9555434696376324, + "reward_change_max": 0.00029350072145462036, + "reward_change_mean": -0.5854915156960487, + "reward_change_min": -1.1475455537438393, + "reward_change_std": 0.4715314581990242, + "reward_std": 0.9635586366057396, + "rewards/cosine_scaled_reward": -0.04016792553011328, + "rewards/format_reward": 0.7500000186264515, + "step": 469 + }, + { + "advantage_max": 1.3150828257203102, + "advantage_mean": 1.2107193830823704e-08, + "advantage_min": -0.686145231127739, + "advantage_std": 0.7068404294550419, + "completion_length": 3181.7084045410156, + "epoch": 0.5371428571428571, + "grad_norm": 0.4413515627384186, + "kl": 0.29815673828125, + "lambda_div_used": 0.5, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0485, + "reward": -0.16447675041854382, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.16447675041854382, + "reward_after_std": 0.7068404443562031, + "reward_before_mean": 0.2920903190970421, + "reward_before_std": 0.7121127918362617, + "reward_change_max": 0.00026867538690567017, + "reward_change_mean": -0.45656704902648926, + "reward_change_min": -0.8190466426312923, + "reward_change_std": 0.34242652729153633, + "reward_std": 0.7068404592573643, + "rewards/cosine_scaled_reward": -0.17687150929123163, + "rewards/format_reward": 0.6458333544433117, + "step": 470 + }, + { + "advantage_max": 1.77982447296381, + "advantage_mean": 6.208816794028849e-10, + "advantage_min": -0.6413669362664223, + "advantage_std": 0.9082127101719379, + "completion_length": 2911.2084045410156, + "epoch": 0.5382857142857143, + "grad_norm": 0.8663266897201538, + "kl": 0.2624359130859375, + "lambda_div_used": 0.5, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0105, + "reward": 0.22175636049360037, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22175636049360037, + "reward_after_std": 0.9082127138972282, + "reward_before_mean": 0.9290354289114475, + "reward_before_std": 0.7207024209201336, + "reward_change_max": 0.0, + "reward_change_mean": -0.7072790637612343, + "reward_change_min": -1.1375292390584946, + "reward_change_std": 0.42616303265094757, + "reward_std": 0.9082127138972282, + "rewards/cosine_scaled_reward": 0.04785105166956782, + "rewards/format_reward": 0.8333333469927311, + "step": 471 + }, + { + "advantage_max": 1.4702309519052505, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.6833783108741045, + "advantage_std": 0.7934335358440876, + "completion_length": 2542.3958892822266, + "epoch": 0.5394285714285715, + "grad_norm": 0.3770515024662018, + "kl": 0.21100616455078125, + "lambda_div_used": 0.5, + "learning_rate": 1.0857018009286381e-07, + "loss": -0.0088, + "reward": -0.08202749770134687, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08202749770134687, + "reward_after_std": 0.7934335470199585, + "reward_before_mean": 0.42360311560332775, + "reward_before_std": 0.8034615498036146, + "reward_change_max": 0.0008336007595062256, + "reward_change_mean": -0.5056306086480618, + "reward_change_min": -0.9713359847664833, + "reward_change_std": 0.4131198935210705, + "reward_std": 0.7934335544705391, + "rewards/cosine_scaled_reward": -0.15278178313747048, + "rewards/format_reward": 0.7291666716337204, + "step": 472 + }, + { + "advantage_max": 1.4210163056850433, + "advantage_mean": -8.692344621863413e-09, + "advantage_min": -0.5511983409523964, + "advantage_std": 0.7278570458292961, + "completion_length": 2934.0000915527344, + "epoch": 0.5405714285714286, + "grad_norm": 0.3252032399177551, + "kl": 0.234100341796875, + "lambda_div_used": 0.5, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0438, + "reward": -0.03442589659243822, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03442589659243822, + "reward_after_std": 0.7278570234775543, + "reward_before_mean": 0.515663924627006, + "reward_before_std": 0.5890849232673645, + "reward_change_max": 0.0013192519545555115, + "reward_change_mean": -0.5500898249447346, + "reward_change_min": -0.8929934203624725, + "reward_change_std": 0.34633047319948673, + "reward_std": 0.7278570309281349, + "rewards/cosine_scaled_reward": -0.08591805072501302, + "rewards/format_reward": 0.6875000093132257, + "step": 473 + }, + { + "advantage_max": 1.601642407476902, + "advantage_mean": -1.9868215517249155e-08, + "advantage_min": -0.6131134703755379, + "advantage_std": 0.8238486312329769, + "completion_length": 2461.9375610351562, + "epoch": 0.5417142857142857, + "grad_norm": 0.7367488741874695, + "kl": 0.2513580322265625, + "lambda_div_used": 0.5, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0143, + "reward": 0.23803382087498903, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23803382087498903, + "reward_after_std": 0.8238486312329769, + "reward_before_mean": 0.9828812517225742, + "reward_before_std": 0.628759897314012, + "reward_change_max": 0.0, + "reward_change_mean": -0.7448474448174238, + "reward_change_min": -1.1187105029821396, + "reward_change_std": 0.44198982790112495, + "reward_std": 0.8238486833870411, + "rewards/cosine_scaled_reward": 0.08519062399864197, + "rewards/format_reward": 0.812500013038516, + "step": 474 + }, + { + "advantage_max": 2.2786176800727844, + "advantage_mean": -8.6923440667519e-09, + "advantage_min": -1.0382463559508324, + "advantage_std": 1.20528145134449, + "completion_length": 2290.6041870117188, + "epoch": 0.5428571428571428, + "grad_norm": 1.510539174079895, + "kl": 0.19629669189453125, + "lambda_div_used": 0.5, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0649, + "reward": 0.4062812924385071, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4062812924385071, + "reward_after_std": 1.2052814662456512, + "reward_before_mean": 1.1780878230929375, + "reward_before_std": 1.165225274860859, + "reward_change_max": 0.0019550248980522156, + "reward_change_mean": -0.7718064785003662, + "reward_change_min": -1.4727793186903, + "reward_change_std": 0.5788527056574821, + "reward_std": 1.2052814960479736, + "rewards/cosine_scaled_reward": 0.18279388174414635, + "rewards/format_reward": 0.8125000111758709, + "step": 475 + }, + { + "advantage_max": 2.106508746743202, + "advantage_mean": -3.0112764060064023e-08, + "advantage_min": -0.8679858073592186, + "advantage_std": 1.1112488955259323, + "completion_length": 2717.4375915527344, + "epoch": 0.544, + "grad_norm": 0.8656560182571411, + "kl": 0.22894287109375, + "lambda_div_used": 0.5, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0287, + "reward": 0.3367365933954716, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3367365933954716, + "reward_after_std": 1.1112489104270935, + "reward_before_mean": 1.082529116421938, + "reward_before_std": 1.0571173802018166, + "reward_change_max": 0.0, + "reward_change_mean": -0.7457925379276276, + "reward_change_min": -1.5623594596982002, + "reward_change_std": 0.5568713434040546, + "reward_std": 1.111248940229416, + "rewards/cosine_scaled_reward": 0.07251455070218071, + "rewards/format_reward": 0.9375000149011612, + "step": 476 + }, + { + "advantage_max": 1.8266699612140656, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.6633548140525818, + "advantage_std": 0.9389680698513985, + "completion_length": 1962.6667251586914, + "epoch": 0.5451428571428572, + "grad_norm": 0.4617604613304138, + "kl": 0.106781005859375, + "lambda_div_used": 0.5, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0078, + "reward": 0.46729777520522475, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.46729777520522475, + "reward_after_std": 0.9389680847525597, + "reward_before_mean": 1.3609278202056885, + "reward_before_std": 0.6903122533112764, + "reward_change_max": 0.0, + "reward_change_mean": -0.8936300873756409, + "reward_change_min": -1.3898151367902756, + "reward_change_std": 0.518205638974905, + "reward_std": 0.9389681071043015, + "rewards/cosine_scaled_reward": 0.18046390381641686, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "advantage_max": 1.443758599460125, + "advantage_mean": 2.0489097030118586e-08, + "advantage_min": -0.5808941088616848, + "advantage_std": 0.7490430325269699, + "completion_length": 2686.3125610351562, + "epoch": 0.5462857142857143, + "grad_norm": 0.5362579822540283, + "kl": 0.19677734375, + "lambda_div_used": 0.5, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0186, + "reward": -0.016255690716207027, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.016255690716207027, + "reward_after_std": 0.7490430511534214, + "reward_before_mean": 0.5467281192541122, + "reward_before_std": 0.6304172240197659, + "reward_change_max": 0.0013319402933120728, + "reward_change_mean": -0.5629837699234486, + "reward_change_min": -0.9032267481088638, + "reward_change_std": 0.3670722022652626, + "reward_std": 0.749043058604002, + "rewards/cosine_scaled_reward": -0.03913595899939537, + "rewards/format_reward": 0.6250000055879354, + "step": 478 + }, + { + "advantage_max": 1.3233840316534042, + "advantage_mean": 6.5192581055750765e-09, + "advantage_min": -0.6653562523424625, + "advantage_std": 0.7084184885025024, + "completion_length": 2855.8750915527344, + "epoch": 0.5474285714285714, + "grad_norm": 0.9758108258247375, + "kl": 0.2758941650390625, + "lambda_div_used": 0.5, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0006, + "reward": -0.041172572411596775, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.041172572411596775, + "reward_after_std": 0.7084184885025024, + "reward_before_mean": 0.5113696761545725, + "reward_before_std": 0.6773869059979916, + "reward_change_max": 0.0, + "reward_change_mean": -0.5525422282516956, + "reward_change_min": -1.0090350657701492, + "reward_change_std": 0.3896159194409847, + "reward_std": 0.708418533205986, + "rewards/cosine_scaled_reward": -0.12973184324800968, + "rewards/format_reward": 0.7708333469927311, + "step": 479 + }, + { + "advantage_max": 1.533621370792389, + "advantage_mean": 6.208817571184966e-09, + "advantage_min": -0.6178735308349133, + "advantage_std": 0.8067186251282692, + "completion_length": 2509.1250228881836, + "epoch": 0.5485714285714286, + "grad_norm": 0.46223902702331543, + "kl": 0.26495361328125, + "lambda_div_used": 0.5, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0208, + "reward": -0.08323651552200317, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08323651552200317, + "reward_after_std": 0.8067186456173658, + "reward_before_mean": 0.4114815816283226, + "reward_before_std": 0.7638159282505512, + "reward_change_max": 0.0004747062921524048, + "reward_change_mean": -0.4947180775925517, + "reward_change_min": -1.0002856254577637, + "reward_change_std": 0.3739140098914504, + "reward_std": 0.8067186698317528, + "rewards/cosine_scaled_reward": -0.16925923340022564, + "rewards/format_reward": 0.7500000074505806, + "step": 480 + }, + { + "advantage_max": 1.533097319304943, + "advantage_mean": -4.967054434423801e-09, + "advantage_min": -0.7590233869850636, + "advantage_std": 0.8262660577893257, + "completion_length": 3085.5625610351562, + "epoch": 0.5497142857142857, + "grad_norm": 1.1266975402832031, + "kl": 0.211395263671875, + "lambda_div_used": 0.5, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0325, + "reward": 0.06830573407933116, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06830573407933116, + "reward_after_std": 0.8262660764157772, + "reward_before_mean": 0.6889901962131262, + "reward_before_std": 0.806677432730794, + "reward_change_max": 0.0006970912218093872, + "reward_change_mean": -0.6206844747066498, + "reward_change_min": -1.1201928928494453, + "reward_change_std": 0.46400916762650013, + "reward_std": 0.8262661173939705, + "rewards/cosine_scaled_reward": -0.030504904687404633, + "rewards/format_reward": 0.7500000111758709, + "step": 481 + }, + { + "advantage_max": 1.8319706320762634, + "advantage_mean": -1.676380750881279e-08, + "advantage_min": -0.6075238063931465, + "advantage_std": 0.9355734586715698, + "completion_length": 2757.5625762939453, + "epoch": 0.5508571428571428, + "grad_norm": 0.35477179288864136, + "kl": 0.259674072265625, + "lambda_div_used": 0.5, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0204, + "reward": 0.4476965293288231, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4476965293288231, + "reward_after_std": 0.9355734437704086, + "reward_before_mean": 1.3339410591870546, + "reward_before_std": 0.6797395423054695, + "reward_change_max": 0.00023727118968963623, + "reward_change_mean": -0.8862445838749409, + "reward_change_min": -1.44765355437994, + "reward_change_std": 0.5364833064377308, + "reward_std": 0.9355734586715698, + "rewards/cosine_scaled_reward": 0.3023871900513768, + "rewards/format_reward": 0.7291666753590107, + "step": 482 + }, + { + "advantage_max": 1.4121412485837936, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.6135230548679829, + "advantage_std": 0.7367845512926579, + "completion_length": 2745.666778564453, + "epoch": 0.552, + "grad_norm": 1.3282009363174438, + "kl": 0.23053741455078125, + "lambda_div_used": 0.5, + "learning_rate": 1.0316552135205837e-07, + "loss": -0.018, + "reward": 0.031401316984556615, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.031401316984556615, + "reward_after_std": 0.7367845550179482, + "reward_before_mean": 0.6348350569605827, + "reward_before_std": 0.6336484104394913, + "reward_change_max": 0.0004684925079345703, + "reward_change_mean": -0.6034337263554335, + "reward_change_min": -1.0589254647493362, + "reward_change_std": 0.3971208855509758, + "reward_std": 0.7367845699191093, + "rewards/cosine_scaled_reward": -0.12008249387145042, + "rewards/format_reward": 0.8750000111758709, + "step": 483 + }, + { + "advantage_max": 1.4934806898236275, + "advantage_mean": -9.313226356777449e-09, + "advantage_min": -0.7406910136342049, + "advantage_std": 0.7948021329939365, + "completion_length": 2448.104202270508, + "epoch": 0.5531428571428572, + "grad_norm": 0.9353572726249695, + "kl": 0.18560791015625, + "lambda_div_used": 0.5, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0357, + "reward": 0.12793250009417534, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12793250009417534, + "reward_after_std": 0.7948021404445171, + "reward_before_mean": 0.8039913112297654, + "reward_before_std": 0.7426050752401352, + "reward_change_max": 0.001218445599079132, + "reward_change_mean": -0.6760587878525257, + "reward_change_min": -1.1402862071990967, + "reward_change_std": 0.46075804345309734, + "reward_std": 0.7948021776974201, + "rewards/cosine_scaled_reward": 0.058245645835995674, + "rewards/format_reward": 0.687500013038516, + "step": 484 + }, + { + "advantage_max": 1.35654865950346, + "advantage_mean": 6.829698917520943e-09, + "advantage_min": -0.48635658249258995, + "advantage_std": 0.6858345717191696, + "completion_length": 2498.666763305664, + "epoch": 0.5542857142857143, + "grad_norm": 0.23835696280002594, + "kl": 0.240142822265625, + "lambda_div_used": 0.5, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0352, + "reward": 0.0007673171348869801, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0007673171348869801, + "reward_after_std": 0.6858346164226532, + "reward_before_mean": 0.5846241675317287, + "reward_before_std": 0.48853123001754284, + "reward_change_max": 0.0003979206085205078, + "reward_change_mean": -0.5838568340986967, + "reward_change_min": -0.8816848546266556, + "reward_change_std": 0.33762601763010025, + "reward_std": 0.6858346238732338, + "rewards/cosine_scaled_reward": -0.13477126159705222, + "rewards/format_reward": 0.8541666828095913, + "step": 485 + }, + { + "advantage_max": 1.3995477855205536, + "advantage_mean": -3.104408841103634e-09, + "advantage_min": -0.6553352884948254, + "advantage_std": 0.7347025461494923, + "completion_length": 2372.375045776367, + "epoch": 0.5554285714285714, + "grad_norm": 0.4456922709941864, + "kl": 0.206756591796875, + "lambda_div_used": 0.5, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0034, + "reward": 0.10065419168677181, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10065419168677181, + "reward_after_std": 0.7347025536000729, + "reward_before_mean": 0.767020778497681, + "reward_before_std": 0.6293009147047997, + "reward_change_max": 0.0004982724785804749, + "reward_change_mean": -0.6663665995001793, + "reward_change_min": -1.0780046060681343, + "reward_change_std": 0.4334941878914833, + "reward_std": 0.7347025647759438, + "rewards/cosine_scaled_reward": -0.04357295297086239, + "rewards/format_reward": 0.854166679084301, + "step": 486 + }, + { + "advantage_max": 1.5991999804973602, + "advantage_mean": -9.31322596819939e-09, + "advantage_min": -0.6759998686611652, + "advantage_std": 0.839877612888813, + "completion_length": 2050.2083892822266, + "epoch": 0.5565714285714286, + "grad_norm": 0.4651009440422058, + "kl": 0.16881561279296875, + "lambda_div_used": 0.5, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0254, + "reward": 0.2241103844717145, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2241103844717145, + "reward_after_std": 0.8398776091635227, + "reward_before_mean": 0.9572561159729958, + "reward_before_std": 0.7173593416810036, + "reward_change_max": 0.0006580352783203125, + "reward_change_mean": -0.7331457510590553, + "reward_change_min": -1.2133165672421455, + "reward_change_std": 0.47992801666259766, + "reward_std": 0.8398776240646839, + "rewards/cosine_scaled_reward": 0.051544721238315105, + "rewards/format_reward": 0.8541666716337204, + "step": 487 + }, + { + "advantage_max": 1.4238049387931824, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.6612549461424351, + "advantage_std": 0.7586982101202011, + "completion_length": 2104.1042404174805, + "epoch": 0.5577142857142857, + "grad_norm": 0.2961161434650421, + "kl": 0.1575164794921875, + "lambda_div_used": 0.5, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0166, + "reward": -0.05783984065055847, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05783984065055847, + "reward_after_std": 0.7586982138454914, + "reward_before_mean": 0.46660646225791425, + "reward_before_std": 0.7264276705682278, + "reward_change_max": 0.0007160604000091553, + "reward_change_mean": -0.5244462713599205, + "reward_change_min": -0.9499593526124954, + "reward_change_std": 0.38819571398198605, + "reward_std": 0.7586982510983944, + "rewards/cosine_scaled_reward": -0.14169678711914457, + "rewards/format_reward": 0.7500000074505806, + "step": 488 + }, + { + "advantage_max": 1.3053287342190742, + "advantage_mean": 1.3659398168108794e-08, + "advantage_min": -0.6531180515885353, + "advantage_std": 0.701459277421236, + "completion_length": 3237.187530517578, + "epoch": 0.5588571428571428, + "grad_norm": 0.876398503780365, + "kl": 0.3499755859375, + "lambda_div_used": 0.5, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0806, + "reward": -0.1897954777814448, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1897954777814448, + "reward_after_std": 0.7014592699706554, + "reward_before_mean": 0.24803152214735746, + "reward_before_std": 0.7108192816376686, + "reward_change_max": 0.0005951672792434692, + "reward_change_mean": -0.437827005982399, + "reward_change_min": -0.8422755375504494, + "reward_change_std": 0.35589798726141453, + "reward_std": 0.7014592848718166, + "rewards/cosine_scaled_reward": -0.13640091847628355, + "rewards/format_reward": 0.5208333414047956, + "step": 489 + }, + { + "advantage_max": 2.04465813934803, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7276268340647221, + "advantage_std": 1.0391230322420597, + "completion_length": 2317.2500915527344, + "epoch": 0.56, + "grad_norm": 0.8721774220466614, + "kl": 0.17469024658203125, + "lambda_div_used": 0.5, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0357, + "reward": 0.2616567127406597, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2616567127406597, + "reward_after_std": 1.0391229949891567, + "reward_before_mean": 0.9548911787569523, + "reward_before_std": 0.8656030613929033, + "reward_change_max": 0.0004154816269874573, + "reward_change_mean": -0.693234434351325, + "reward_change_min": -1.169628955423832, + "reward_change_std": 0.4441659040749073, + "reward_std": 1.039123009890318, + "rewards/cosine_scaled_reward": 0.039945571683347225, + "rewards/format_reward": 0.8750000037252903, + "step": 490 + }, + { + "advantage_max": 1.7204342857003212, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.8566758520901203, + "advantage_std": 0.9151424802839756, + "completion_length": 2481.4375762939453, + "epoch": 0.5611428571428572, + "grad_norm": 1.3411248922348022, + "kl": 0.231109619140625, + "lambda_div_used": 0.5, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0341, + "reward": 0.4715647688135505, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4715647688135505, + "reward_after_std": 0.9151424951851368, + "reward_before_mean": 1.3851352967321873, + "reward_before_std": 0.795163257047534, + "reward_change_max": 0.0, + "reward_change_mean": -0.9135705418884754, + "reward_change_min": -1.4265716075897217, + "reward_change_std": 0.580524630844593, + "reward_std": 0.9151425138115883, + "rewards/cosine_scaled_reward": 0.3071509785950184, + "rewards/format_reward": 0.7708333507180214, + "step": 491 + }, + { + "advantage_max": 1.320159673690796, + "advantage_mean": -8.071462720415923e-09, + "advantage_min": -0.6564004346728325, + "advantage_std": 0.710913211107254, + "completion_length": 2465.729232788086, + "epoch": 0.5622857142857143, + "grad_norm": 0.4086810350418091, + "kl": 0.2469635009765625, + "lambda_div_used": 0.5, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0104, + "reward": 0.11124568968079984, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11124568968079984, + "reward_after_std": 0.7109132148325443, + "reward_before_mean": 0.7901245001703501, + "reward_before_std": 0.6355638056993484, + "reward_change_max": 0.0, + "reward_change_mean": -0.6788788326084614, + "reward_change_min": -1.1075879484415054, + "reward_change_std": 0.44137519784271717, + "reward_std": 0.7109132558107376, + "rewards/cosine_scaled_reward": -0.0007710885256528854, + "rewards/format_reward": 0.791666679084301, + "step": 492 + }, + { + "advantage_max": 2.035825029015541, + "advantage_mean": -3.725290298461914e-09, + "advantage_min": -0.8803831040859222, + "advantage_std": 1.061902403831482, + "completion_length": 2251.6459045410156, + "epoch": 0.5634285714285714, + "grad_norm": 1.2623525857925415, + "kl": 0.1896820068359375, + "lambda_div_used": 0.5, + "learning_rate": 1.005372381963547e-07, + "loss": -0.0228, + "reward": 0.39003518410027027, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.39003518410027027, + "reward_after_std": 1.061902403831482, + "reward_before_mean": 1.1865560039877892, + "reward_before_std": 0.9536248818039894, + "reward_change_max": 0.0007629022002220154, + "reward_change_mean": -0.7965207956731319, + "reward_change_min": -1.3428436070680618, + "reward_change_std": 0.5259374044835567, + "reward_std": 1.0619024187326431, + "rewards/cosine_scaled_reward": 0.11411132011562586, + "rewards/format_reward": 0.9583333432674408, + "step": 493 + }, + { + "advantage_max": 1.9328140318393707, + "advantage_mean": -4.96705393482344e-09, + "advantage_min": -0.8767299056053162, + "advantage_std": 1.0182598046958447, + "completion_length": 1785.6250610351562, + "epoch": 0.5645714285714286, + "grad_norm": 1.4499907493591309, + "kl": 0.122161865234375, + "lambda_div_used": 0.5, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.0295, + "reward": 0.3418647423386574, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3418647423386574, + "reward_after_std": 1.018259834498167, + "reward_before_mean": 1.1130484715104103, + "reward_before_std": 0.9386914111673832, + "reward_change_max": 0.0, + "reward_change_mean": -0.7711836881935596, + "reward_change_min": -1.3634328842163086, + "reward_change_std": 0.525868522003293, + "reward_std": 1.0182598493993282, + "rewards/cosine_scaled_reward": 0.09819087269715965, + "rewards/format_reward": 0.916666679084301, + "step": 494 + }, + { + "advantage_max": 1.592846192419529, + "advantage_mean": -7.450581041013038e-09, + "advantage_min": -0.6558383330702782, + "advantage_std": 0.831727247685194, + "completion_length": 2725.3333587646484, + "epoch": 0.5657142857142857, + "grad_norm": 0.35111257433891296, + "kl": 0.252655029296875, + "lambda_div_used": 0.5, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0314, + "reward": 0.0912869069725275, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0912869069725275, + "reward_after_std": 0.8317272700369358, + "reward_before_mean": 0.720516414847225, + "reward_before_std": 0.7226469293236732, + "reward_change_max": 0.0018553584814071655, + "reward_change_mean": -0.6292295139282942, + "reward_change_min": -1.1326270997524261, + "reward_change_std": 0.4294526055455208, + "reward_std": 0.8317273035645485, + "rewards/cosine_scaled_reward": 0.01650819112546742, + "rewards/format_reward": 0.6875000055879354, + "step": 495 + }, + { + "advantage_max": 1.5743544548749924, + "advantage_mean": -9.313226467799751e-09, + "advantage_min": -0.7295497246086597, + "advantage_std": 0.8451073318719864, + "completion_length": 1919.1875228881836, + "epoch": 0.5668571428571428, + "grad_norm": 0.410314679145813, + "kl": 0.2297515869140625, + "lambda_div_used": 0.5, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0128, + "reward": 0.12698657670989633, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12698657670989633, + "reward_after_std": 0.8451073467731476, + "reward_before_mean": 0.7774940053932369, + "reward_before_std": 0.7901693303138018, + "reward_change_max": 0.0, + "reward_change_mean": -0.6505074352025986, + "reward_change_min": -1.152572087943554, + "reward_change_std": 0.47371556237339973, + "reward_std": 0.8451073616743088, + "rewards/cosine_scaled_reward": 0.003330339677631855, + "rewards/format_reward": 0.7708333469927311, + "step": 496 + }, + { + "advantage_max": 1.628432959318161, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.7225891537964344, + "advantage_std": 0.8677199482917786, + "completion_length": 2582.0625762939453, + "epoch": 0.568, + "grad_norm": 0.6326491236686707, + "kl": 0.25273895263671875, + "lambda_div_used": 0.5, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0302, + "reward": 0.22483675926923752, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22483675926923752, + "reward_after_std": 0.8677199482917786, + "reward_before_mean": 0.964297803118825, + "reward_before_std": 0.7687317673116922, + "reward_change_max": 0.0022812560200691223, + "reward_change_mean": -0.7394610624760389, + "reward_change_min": -1.3346537351608276, + "reward_change_std": 0.5281522907316685, + "reward_std": 0.8677199706435204, + "rewards/cosine_scaled_reward": 0.12798223458230495, + "rewards/format_reward": 0.7083333395421505, + "step": 497 + }, + { + "advantage_max": 1.3664054870605469, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7810567021369934, + "advantage_std": 0.7454097159206867, + "completion_length": 2571.291732788086, + "epoch": 0.5691428571428572, + "grad_norm": 0.7533896565437317, + "kl": 0.250091552734375, + "lambda_div_used": 0.5, + "learning_rate": 1.000438641958131e-07, + "loss": 0.026, + "reward": 0.13466812949627638, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13466812949627638, + "reward_after_std": 0.7454097084701061, + "reward_before_mean": 0.8310234602540731, + "reward_before_std": 0.7226423937827349, + "reward_change_max": 0.0, + "reward_change_mean": -0.6963553391396999, + "reward_change_min": -1.16697296500206, + "reward_change_std": 0.4730456341058016, + "reward_std": 0.7454097121953964, + "rewards/cosine_scaled_reward": -0.0428216177970171, + "rewards/format_reward": 0.916666679084301, + "step": 498 + }, + { + "advantage_max": 2.1150874942541122, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -0.8328440636396408, + "advantage_std": 1.0995317697525024, + "completion_length": 2640.3750762939453, + "epoch": 0.5702857142857143, + "grad_norm": 0.7087387442588806, + "kl": 0.32720947265625, + "lambda_div_used": 0.5, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0227, + "reward": 0.1912521708291024, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1912521708291024, + "reward_after_std": 1.0995317697525024, + "reward_before_mean": 0.8074039425700903, + "reward_before_std": 1.0485225953161716, + "reward_change_max": 0.00030355900526046753, + "reward_change_mean": -0.6161517985165119, + "reward_change_min": -1.195886768400669, + "reward_change_std": 0.46653415262699127, + "reward_std": 1.099531814455986, + "rewards/cosine_scaled_reward": 0.007868630811572075, + "rewards/format_reward": 0.791666679084301, + "step": 499 + }, + { + "advantage_max": 1.8856936693191528, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7711785212159157, + "advantage_std": 0.9831021875143051, + "completion_length": 2979.229248046875, + "epoch": 0.5714285714285714, + "grad_norm": 0.687682569026947, + "kl": 0.3313140869140625, + "lambda_div_used": 0.5, + "learning_rate": 1e-07, + "loss": 0.0507, + "reward": 0.009953925851732492, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.009953925851732492, + "reward_after_std": 0.9831021577119827, + "reward_before_mean": 0.5228100651875138, + "reward_before_std": 0.9547398835420609, + "reward_change_max": 0.0015029683709144592, + "reward_change_mean": -0.5128561519086361, + "reward_change_min": -1.1185838133096695, + "reward_change_std": 0.4156857579946518, + "reward_std": 0.9831021949648857, + "rewards/cosine_scaled_reward": -0.07192830881103873, + "rewards/format_reward": 0.6666666846722364, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0024257735135033726, + "train_runtime": 9466.1521, + "train_samples_per_second": 2.535, + "train_steps_per_second": 0.053 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}