diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.8650365471839905, + "advantage_mean": 5.4016712935922584e-08, + "advantage_min": -0.9214624911546707, + "advantage_std": 0.999835692346096, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.20358379185199738, + "kl": 0.0, + "lambda_div_used": 0.5, + "learning_rate": 2e-08, + "loss": -0.0, + "reward": -0.03908593417145312, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03908593417145312, + "reward_after_std": 0.8219119422137737, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.0007017925381660461, + "reward_change_mean": -0.5288506411015987, + "reward_change_min": -1.0365500748157501, + "reward_change_std": 0.4204680975526571, + "reward_std": 0.8219119869172573, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 1.8198039829730988, + "advantage_mean": 5.4637592672435176e-08, + "advantage_min": -0.9130084365606308, + "advantage_std": 0.9997541680932045, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.18365833163261414, + "kl": 0.0, + "lambda_div_used": 0.5, + "learning_rate": 4e-08, + "loss": -0.0, + "reward": -0.21404163353145123, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.21404163353145123, + "reward_after_std": 0.4922399129718542, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.001632794737815857, + "reward_change_mean": -0.48943919129669666, + "reward_change_min": -0.7970554456114769, + "reward_change_std": 0.3251637788489461, + "reward_std": 0.4922399166971445, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 1.8429554402828217, + "advantage_mean": 1.862645104822036e-08, + "advantage_min": -0.8803069293498993, + "advantage_std": 0.9997477829456329, + "completion_length": 3368.1041870117188, + "epoch": 0.0034285714285714284, + "grad_norm": 0.16678079962730408, + "kl": 4.012882709503174e-05, + "lambda_div_used": 0.5, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": -0.48418260738253593, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.48418260738253593, + "reward_after_std": 0.4452684707939625, + "reward_before_mean": -0.2069278322160244, + "reward_before_std": 0.4541137106716633, + "reward_change_max": 0.0003810301423072815, + "reward_change_mean": -0.2772547733038664, + "reward_change_min": -0.554124977439642, + "reward_change_std": 0.22407594323158264, + "reward_std": 0.44526849314570427, + "rewards/cosine_scaled_reward": -0.17638059332966805, + "rewards/format_reward": 0.14583333395421505, + "step": 3 + }, + { + "advantage_max": 1.9580544233322144, + "advantage_mean": 9.934108535780695e-09, + "advantage_min": -0.7727478072047234, + "advantage_std": 0.9998557344079018, + "completion_length": 2326.5833892822266, + "epoch": 0.004571428571428572, + "grad_norm": 0.2263989895582199, + "kl": 4.519522190093994e-05, + "lambda_div_used": 0.5, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": -0.028838554164394736, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.028838554164394736, + "reward_after_std": 0.938622172921896, + "reward_before_mean": 0.45526737440377474, + "reward_before_std": 0.8286880534142256, + "reward_change_max": 0.0, + "reward_change_mean": -0.4841059297323227, + "reward_change_min": -0.8776258826255798, + "reward_change_std": 0.32474724017083645, + "reward_std": 0.9386221915483475, + "rewards/cosine_scaled_reward": -0.074449656996876, + "rewards/format_reward": 0.6041666697710752, + "step": 4 + }, + { + "advantage_max": 1.9359291642904282, + "advantage_mean": 3.6011140291947186e-08, + "advantage_min": -0.7737014293670654, + "advantage_std": 0.999842680990696, + "completion_length": 3186.0625610351562, + "epoch": 0.005714285714285714, + "grad_norm": 0.18710263073444366, + "kl": 4.155933856964111e-05, + "lambda_div_used": 0.5, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": -0.3549866806715727, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3549866806715727, + "reward_after_std": 0.774379301816225, + "reward_before_mean": -0.08489098830614239, + "reward_before_std": 0.7537979669868946, + "reward_change_max": 0.001534678041934967, + "reward_change_mean": -0.270095681771636, + "reward_change_min": -0.5602017678320408, + "reward_change_std": 0.23432399705052376, + "reward_std": 0.774379301816225, + "rewards/cosine_scaled_reward": -0.2091121654957533, + "rewards/format_reward": 0.3333333432674408, + "step": 5 + }, + { + "advantage_max": 1.863465204834938, + "advantage_mean": 3.539025805743279e-08, + "advantage_min": -0.84102962911129, + "advantage_std": 0.9996843636035919, + "completion_length": 3105.354217529297, + "epoch": 0.006857142857142857, + "grad_norm": 0.20564018189907074, + "kl": 5.272030830383301e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": -0.4135122634470463, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4135122634470463, + "reward_after_std": 0.5014001121744514, + "reward_before_mean": -0.09564527939073741, + "reward_before_std": 0.513142878189683, + "reward_change_max": 0.0008734241127967834, + "reward_change_mean": -0.3178669649641961, + "reward_change_min": -0.6814537905156612, + "reward_change_std": 0.26048805261962116, + "reward_std": 0.501400119625032, + "rewards/cosine_scaled_reward": -0.1728226412087679, + "rewards/format_reward": 0.25000000186264515, + "step": 6 + }, + { + "advantage_max": 1.8490984290838242, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.9447271376848221, + "advantage_std": 0.9998549297451973, + "completion_length": 3099.562530517578, + "epoch": 0.008, + "grad_norm": 0.15761414170265198, + "kl": 2.193450927734375e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.4e-07, + "loss": 0.0, + "reward": -0.014719070866703987, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.014719070866703987, + "reward_after_std": 0.8509783893823624, + "reward_before_mean": 0.5388575592078269, + "reward_before_std": 0.9079956337809563, + "reward_change_max": 0.0003746822476387024, + "reward_change_mean": -0.5535766407847404, + "reward_change_min": -1.1588828563690186, + "reward_change_std": 0.46872530598193407, + "reward_std": 0.850978396832943, + "rewards/cosine_scaled_reward": -0.011821209453046322, + "rewards/format_reward": 0.5625000223517418, + "step": 7 + }, + { + "advantage_max": 1.855475664138794, + "advantage_mean": -2.2972623803241277e-08, + "advantage_min": -0.9538509175181389, + "advantage_std": 0.9998169988393784, + "completion_length": 2739.895851135254, + "epoch": 0.009142857142857144, + "grad_norm": 0.16170156002044678, + "kl": 2.1321699023246765e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.6e-07, + "loss": 0.0, + "reward": 0.041294334921985865, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.041294334921985865, + "reward_after_std": 0.7754815965890884, + "reward_before_mean": 0.6523661861720029, + "reward_before_std": 0.7343525104224682, + "reward_change_max": 0.001979641616344452, + "reward_change_mean": -0.6110718548297882, + "reward_change_min": -0.9722144268453121, + "reward_change_std": 0.4230197472497821, + "reward_std": 0.7754816301167011, + "rewards/cosine_scaled_reward": 0.08659974206238985, + "rewards/format_reward": 0.47916667722165585, + "step": 8 + }, + { + "advantage_max": 1.8438913971185684, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.866303063929081, + "advantage_std": 0.9998209476470947, + "completion_length": 3056.8125610351562, + "epoch": 0.010285714285714285, + "grad_norm": 0.21665655076503754, + "kl": 4.3138861656188965e-05, + "lambda_div_used": 0.5, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": -0.22506628464907408, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.22506628464907408, + "reward_after_std": 0.80972820520401, + "reward_before_mean": 0.15861139819025993, + "reward_before_std": 0.8846573233604431, + "reward_change_max": 0.0, + "reward_change_mean": -0.3836776837706566, + "reward_change_min": -0.9843891076743603, + "reward_change_std": 0.40112666599452496, + "reward_std": 0.8097282461822033, + "rewards/cosine_scaled_reward": -0.10819429811090231, + "rewards/format_reward": 0.37500001303851604, + "step": 9 + }, + { + "advantage_max": 1.8683208376169205, + "advantage_mean": 5.339582997976322e-08, + "advantage_min": -0.854301206767559, + "advantage_std": 0.999813862144947, + "completion_length": 2680.1041870117188, + "epoch": 0.011428571428571429, + "grad_norm": 0.20287226140499115, + "kl": 2.043694257736206e-05, + "lambda_div_used": 0.5, + "learning_rate": 2e-07, + "loss": 0.0, + "reward": -0.23507352324668318, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.23507352324668318, + "reward_after_std": 0.7843089625239372, + "reward_before_mean": 0.1418246179819107, + "reward_before_std": 0.8295328728854656, + "reward_change_max": 0.0011120587587356567, + "reward_change_mean": -0.3768981248140335, + "reward_change_min": -0.8887458518147469, + "reward_change_std": 0.3657720573246479, + "reward_std": 0.7843089960515499, + "rewards/cosine_scaled_reward": -0.12700436916202307, + "rewards/format_reward": 0.3958333432674408, + "step": 10 + }, + { + "advantage_max": 1.929537832736969, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.7646144963800907, + "advantage_std": 0.9998139664530754, + "completion_length": 3314.9375, + "epoch": 0.012571428571428572, + "grad_norm": 0.18723739683628082, + "kl": 3.269314765930176e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "reward": -0.3749279286712408, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3749279286712408, + "reward_after_std": 0.8660313133150339, + "reward_before_mean": -0.14815278816968203, + "reward_before_std": 0.8909500073641539, + "reward_change_max": 0.006466515362262726, + "reward_change_mean": -0.22677514608949423, + "reward_change_min": -0.5190893076360226, + "reward_change_std": 0.23491209978237748, + "reward_std": 0.8660313449800014, + "rewards/cosine_scaled_reward": -0.17824306967668235, + "rewards/format_reward": 0.20833333767950535, + "step": 11 + }, + { + "advantage_max": 1.8934268653392792, + "advantage_mean": -5.587935947293232e-09, + "advantage_min": -0.7801559269428253, + "advantage_std": 0.9998533800244331, + "completion_length": 2306.3333740234375, + "epoch": 0.013714285714285714, + "grad_norm": 0.22430002689361572, + "kl": 3.651529550552368e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.4e-07, + "loss": 0.0, + "reward": -0.05954301607562229, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05954301607562229, + "reward_after_std": 0.8166232109069824, + "reward_before_mean": 0.4518513362854719, + "reward_before_std": 0.771789163351059, + "reward_change_max": 0.002845950424671173, + "reward_change_mean": -0.5113943256437778, + "reward_change_min": -1.007496863603592, + "reward_change_std": 0.3824256267398596, + "reward_std": 0.8166232109069824, + "rewards/cosine_scaled_reward": -0.09699101699516177, + "rewards/format_reward": 0.6458333414047956, + "step": 12 + }, + { + "advantage_max": 1.8332414776086807, + "advantage_mean": 2.3593506426333732e-08, + "advantage_min": -0.9173097312450409, + "advantage_std": 0.9997749254107475, + "completion_length": 3007.687530517578, + "epoch": 0.014857142857142857, + "grad_norm": 0.19439151883125305, + "kl": 3.3482909202575684e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": -0.28582077845931053, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.28582077845931053, + "reward_after_std": 0.6129563190042973, + "reward_before_mean": 0.1077382080256939, + "reward_before_std": 0.6512869298458099, + "reward_change_max": 0.0, + "reward_change_mean": -0.3935589976608753, + "reward_change_min": -0.8552064150571823, + "reward_change_std": 0.33744460716843605, + "reward_std": 0.6129563301801682, + "rewards/cosine_scaled_reward": -0.09196422435343266, + "rewards/format_reward": 0.2916666716337204, + "step": 13 + }, + { + "advantage_max": 1.8766373842954636, + "advantage_mean": -9.313225468599029e-09, + "advantage_min": -0.8543065041303635, + "advantage_std": 0.9998165741562843, + "completion_length": 2791.041717529297, + "epoch": 0.016, + "grad_norm": 0.1748839169740677, + "kl": 2.9399991035461426e-05, + "lambda_div_used": 0.5, + "learning_rate": 2.8e-07, + "loss": 0.0, + "reward": -0.18133868090808392, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.18133868090808392, + "reward_after_std": 0.7546257749199867, + "reward_before_mean": 0.2495754323899746, + "reward_before_std": 0.7416385095566511, + "reward_change_max": 0.0003454461693763733, + "reward_change_mean": -0.43091410771012306, + "reward_change_min": -0.8854171372950077, + "reward_change_std": 0.3447839133441448, + "reward_std": 0.7546257749199867, + "rewards/cosine_scaled_reward": -0.10437896568328142, + "rewards/format_reward": 0.4583333469927311, + "step": 14 + }, + { + "advantage_max": 1.921839565038681, + "advantage_mean": 5.3395828314428684e-08, + "advantage_min": -0.8221309706568718, + "advantage_std": 0.9998021274805069, + "completion_length": 2747.1875343322754, + "epoch": 0.017142857142857144, + "grad_norm": 0.20771348476409912, + "kl": 3.0741095542907715e-05, + "lambda_div_used": 0.5, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": -0.11654636077582836, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11654636077582836, + "reward_after_std": 0.6426295302808285, + "reward_before_mean": 0.39872846752405167, + "reward_before_std": 0.5288418848067522, + "reward_change_max": 0.0006920620799064636, + "reward_change_mean": -0.5152748012915254, + "reward_change_min": -0.812650166451931, + "reward_change_std": 0.3277115412056446, + "reward_std": 0.6426295377314091, + "rewards/cosine_scaled_reward": 0.0014475611969828606, + "rewards/format_reward": 0.3958333358168602, + "step": 15 + }, + { + "advantage_max": 1.838592454791069, + "advantage_mean": 3.601114018092488e-08, + "advantage_min": -0.9086092934012413, + "advantage_std": 0.9996505901217461, + "completion_length": 3414.9583740234375, + "epoch": 0.018285714285714287, + "grad_norm": 0.18958210945129395, + "kl": 3.1381845474243164e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.2e-07, + "loss": 0.0, + "reward": -0.5179620869457722, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5179620869457722, + "reward_after_std": 0.5234112944453955, + "reward_before_mean": -0.28896366874687374, + "reward_before_std": 0.58364612236619, + "reward_change_max": 0.0011472925543785095, + "reward_change_mean": -0.22899843472987413, + "reward_change_min": -0.5847333557903767, + "reward_change_std": 0.25637041311711073, + "reward_std": 0.523411313071847, + "rewards/cosine_scaled_reward": -0.20698183961212635, + "rewards/format_reward": 0.12500000186264515, + "step": 16 + }, + { + "advantage_max": 1.8927734047174454, + "advantage_mean": 2.2351741790771484e-08, + "advantage_min": -0.8768943846225739, + "advantage_std": 0.999830462038517, + "completion_length": 2313.9167251586914, + "epoch": 0.019428571428571427, + "grad_norm": 0.2875772714614868, + "kl": 4.1112303733825684e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "reward": -0.0218079574406147, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0218079574406147, + "reward_after_std": 0.7882238104939461, + "reward_before_mean": 0.5311932638287544, + "reward_before_std": 0.7641869075596333, + "reward_change_max": 0.001342780888080597, + "reward_change_mean": -0.5530012445524335, + "reward_change_min": -1.0787059292197227, + "reward_change_std": 0.42368356697261333, + "reward_std": 0.7882238253951073, + "rewards/cosine_scaled_reward": -0.026070039719343185, + "rewards/format_reward": 0.5833333469927311, + "step": 17 + }, + { + "advantage_max": 1.8699724674224854, + "advantage_mean": 2.5456150465341665e-08, + "advantage_min": -0.8736857995390892, + "advantage_std": 0.9997463449835777, + "completion_length": 3016.1875228881836, + "epoch": 0.02057142857142857, + "grad_norm": 0.1541830599308014, + "kl": 2.8382986783981323e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": -0.3072060104459524, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.3072060104459524, + "reward_after_std": 0.5597724877297878, + "reward_before_mean": 0.08646659925580025, + "reward_before_std": 0.5770146455615759, + "reward_change_max": 0.0017676278948783875, + "reward_change_mean": -0.39367261063307524, + "reward_change_min": -0.765678558498621, + "reward_change_std": 0.3168019922450185, + "reward_std": 0.5597724877297878, + "rewards/cosine_scaled_reward": -0.11301671247929335, + "rewards/format_reward": 0.31250000558793545, + "step": 18 + }, + { + "advantage_max": 1.8800882250070572, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.8829206228256226, + "advantage_std": 0.9998331665992737, + "completion_length": 2785.3750381469727, + "epoch": 0.021714285714285714, + "grad_norm": 0.16840744018554688, + "kl": 2.9861927032470703e-05, + "lambda_div_used": 0.5, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "reward": 0.03168467991054058, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03168467991054058, + "reward_after_std": 0.834330890327692, + "reward_before_mean": 0.6134747294709086, + "reward_before_std": 0.8129289895296097, + "reward_change_max": 0.0006548240780830383, + "reward_change_mean": -0.5817900514230132, + "reward_change_min": -1.0407536514103413, + "reward_change_std": 0.4286394249647856, + "reward_std": 0.8343308977782726, + "rewards/cosine_scaled_reward": 0.06715403066482395, + "rewards/format_reward": 0.47916668094694614, + "step": 19 + }, + { + "advantage_max": 1.8762269914150238, + "advantage_mean": 6.829699084054397e-09, + "advantage_min": -0.7452137358486652, + "advantage_std": 0.9998760744929314, + "completion_length": 2365.3958740234375, + "epoch": 0.022857142857142857, + "grad_norm": 0.18150842189788818, + "kl": 2.7011847123503685e-05, + "lambda_div_used": 0.5, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.11636773869395256, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11636773869395256, + "reward_after_std": 0.9510068409144878, + "reward_before_mean": 0.731377505348064, + "reward_before_std": 0.9209488183259964, + "reward_change_max": 0.0, + "reward_change_mean": -0.6150098070502281, + "reward_change_min": -1.2754083350300789, + "reward_change_std": 0.5005144346505404, + "reward_std": 0.9510068707168102, + "rewards/cosine_scaled_reward": 0.011522093118401244, + "rewards/format_reward": 0.7083333358168602, + "step": 20 + }, + { + "advantage_max": 1.9191124886274338, + "advantage_mean": -3.228584943837376e-08, + "advantage_min": -0.741646058857441, + "advantage_std": 0.9997679218649864, + "completion_length": 2737.666717529297, + "epoch": 0.024, + "grad_norm": 0.18668660521507263, + "kl": 4.217773675918579e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "reward": -0.14399441180285066, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14399441180285066, + "reward_after_std": 0.8878922425210476, + "reward_before_mean": 0.270708542317152, + "reward_before_std": 0.8766209781169891, + "reward_change_max": 0.0014019683003425598, + "reward_change_mean": -0.41470295563340187, + "reward_change_min": -0.8328092768788338, + "reward_change_std": 0.3357093087397516, + "reward_std": 0.8878922797739506, + "rewards/cosine_scaled_reward": -0.09381241840310395, + "rewards/format_reward": 0.4583333395421505, + "step": 21 + }, + { + "advantage_max": 1.9325433522462845, + "advantage_mean": 6.364037852257809e-09, + "advantage_min": -0.8162704780697823, + "advantage_std": 0.9998474344611168, + "completion_length": 1529.2292137145996, + "epoch": 0.025142857142857144, + "grad_norm": 0.3153725266456604, + "kl": 5.3122639656066895e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "reward": 0.18219375910121016, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18219375910121016, + "reward_after_std": 0.7688724808394909, + "reward_before_mean": 0.9053211729042232, + "reward_before_std": 0.6001508925110102, + "reward_change_max": 6.76959753036499e-05, + "reward_change_mean": -0.7231274656951427, + "reward_change_min": -1.1781792268157005, + "reward_change_std": 0.445758156478405, + "reward_std": 0.7688724920153618, + "rewards/cosine_scaled_reward": 0.025577264837920666, + "rewards/format_reward": 0.8541666753590107, + "step": 22 + }, + { + "advantage_max": 1.9284197837114334, + "advantage_mean": 1.3038516710750514e-08, + "advantage_min": -0.799706406891346, + "advantage_std": 0.9998190701007843, + "completion_length": 2694.687545776367, + "epoch": 0.026285714285714287, + "grad_norm": 0.23257306218147278, + "kl": 2.7127563953399658e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.6e-07, + "loss": 0.0, + "reward": -0.232659702480305, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.232659702480305, + "reward_after_std": 0.7457499727606773, + "reward_before_mean": 0.14918443333590403, + "reward_before_std": 0.7016965411603451, + "reward_change_max": 0.0005003586411476135, + "reward_change_mean": -0.38184412755072117, + "reward_change_min": -0.7390884645283222, + "reward_change_std": 0.2957865409553051, + "reward_std": 0.7457499802112579, + "rewards/cosine_scaled_reward": -0.14415779197588563, + "rewards/format_reward": 0.43750000186264515, + "step": 23 + }, + { + "advantage_max": 1.9204401075839996, + "advantage_mean": 1.6453365864199654e-08, + "advantage_min": -0.8407022133469582, + "advantage_std": 0.9998438656330109, + "completion_length": 3135.1250610351562, + "epoch": 0.027428571428571427, + "grad_norm": 0.16080714762210846, + "kl": 3.184378147125244e-05, + "lambda_div_used": 0.5, + "learning_rate": 4.8e-07, + "loss": 0.0, + "reward": -0.15408260421827435, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15408260421827435, + "reward_after_std": 0.9264589920639992, + "reward_before_mean": 0.24006994348019361, + "reward_before_std": 0.9225658029317856, + "reward_change_max": 0.0, + "reward_change_mean": -0.3941525584086776, + "reward_change_min": -0.8041125237941742, + "reward_change_std": 0.3279961505904794, + "reward_std": 0.9264590125530958, + "rewards/cosine_scaled_reward": -0.0778816994279623, + "rewards/format_reward": 0.39583334513008595, + "step": 24 + }, + { + "advantage_max": 1.858852818608284, + "advantage_mean": 6.0147916647323996e-09, + "advantage_min": -0.8925201445817947, + "advantage_std": 0.9998224526643753, + "completion_length": 2680.000015258789, + "epoch": 0.02857142857142857, + "grad_norm": 0.18977677822113037, + "kl": 3.3307820558547974e-05, + "lambda_div_used": 0.5, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": -0.08414146304130554, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08414146304130554, + "reward_after_std": 0.7848474644124508, + "reward_before_mean": 0.42510347068309784, + "reward_before_std": 0.8524695411324501, + "reward_change_max": 0.003357619047164917, + "reward_change_mean": -0.5092449325602502, + "reward_change_min": -1.0072326622903347, + "reward_change_std": 0.4247382814064622, + "reward_std": 0.7848474718630314, + "rewards/cosine_scaled_reward": -0.01661492884159088, + "rewards/format_reward": 0.45833334885537624, + "step": 25 + }, + { + "advantage_max": 1.9238365292549133, + "advantage_mean": 3.539025994481193e-08, + "advantage_min": -0.8145479038357735, + "advantage_std": 0.9997957423329353, + "completion_length": 2942.8333740234375, + "epoch": 0.029714285714285714, + "grad_norm": 0.16458716988563538, + "kl": 2.6823952794075012e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.2e-07, + "loss": 0.0, + "reward": -0.12822037562727928, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12822037562727928, + "reward_after_std": 0.657489363104105, + "reward_before_mean": 0.37513636983931065, + "reward_before_std": 0.5635189693421125, + "reward_change_max": 0.0009020194411277771, + "reward_change_mean": -0.5033567762002349, + "reward_change_min": -0.8012861534953117, + "reward_change_std": 0.3210837971419096, + "reward_std": 0.6574893668293953, + "rewards/cosine_scaled_reward": -0.0728484783321619, + "rewards/format_reward": 0.520833333954215, + "step": 26 + }, + { + "advantage_max": 1.8898457139730453, + "advantage_mean": 3.282912131030713e-08, + "advantage_min": -0.8456361517310143, + "advantage_std": 0.9998165890574455, + "completion_length": 3154.312545776367, + "epoch": 0.030857142857142857, + "grad_norm": 0.18873968720436096, + "kl": 2.8768088668584824e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.4e-07, + "loss": 0.0, + "reward": -0.19466266850940883, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19466266850940883, + "reward_after_std": 0.7585207261145115, + "reward_before_mean": 0.22541327867656946, + "reward_before_std": 0.755299074575305, + "reward_change_max": 0.0006344020366668701, + "reward_change_mean": -0.4200759269297123, + "reward_change_min": -0.8561866730451584, + "reward_change_std": 0.3406699551269412, + "reward_std": 0.7585207633674145, + "rewards/cosine_scaled_reward": -0.053960046730935574, + "rewards/format_reward": 0.33333333767950535, + "step": 27 + }, + { + "advantage_max": 1.8525895327329636, + "advantage_mean": 2.2972624136308184e-08, + "advantage_min": -0.943960890173912, + "advantage_std": 0.9997684359550476, + "completion_length": 2793.0833587646484, + "epoch": 0.032, + "grad_norm": 0.20687302947044373, + "kl": 3.4846365451812744e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.6e-07, + "loss": 0.0, + "reward": -0.07525549922138453, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07525549922138453, + "reward_after_std": 0.7445814274251461, + "reward_before_mean": 0.447348415851593, + "reward_before_std": 0.7081887386739254, + "reward_change_max": 0.0002209991216659546, + "reward_change_mean": -0.5226039234548807, + "reward_change_min": -0.8660258539021015, + "reward_change_std": 0.3674051659181714, + "reward_std": 0.7445814423263073, + "rewards/cosine_scaled_reward": -0.005492456257343292, + "rewards/format_reward": 0.45833334885537624, + "step": 28 + }, + { + "advantage_max": 1.9153321534395218, + "advantage_mean": 4.5324366704235786e-08, + "advantage_min": -0.7868571989238262, + "advantage_std": 0.9998080059885979, + "completion_length": 3342.854248046875, + "epoch": 0.03314285714285714, + "grad_norm": 0.22355221211910248, + "kl": 3.172457218170166e-05, + "lambda_div_used": 0.5, + "learning_rate": 5.8e-07, + "loss": 0.0, + "reward": -0.4529417622834444, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4529417622834444, + "reward_after_std": 0.6748885735869408, + "reward_before_mean": -0.2338971405988559, + "reward_before_std": 0.6635385602712631, + "reward_change_max": 0.002843029797077179, + "reward_change_mean": -0.21904463577084243, + "reward_change_min": -0.4862074926495552, + "reward_change_std": 0.20299789123237133, + "reward_std": 0.674888588488102, + "rewards/cosine_scaled_reward": -0.22111523617058992, + "rewards/format_reward": 0.20833333767950535, + "step": 29 + }, + { + "advantage_max": 1.8778761476278305, + "advantage_mean": -1.2417634476236117e-08, + "advantage_min": -0.8202168717980385, + "advantage_std": 0.9998758137226105, + "completion_length": 2829.6041870117188, + "epoch": 0.03428571428571429, + "grad_norm": 0.21529708802700043, + "kl": 2.3565255105495453e-05, + "lambda_div_used": 0.5, + "learning_rate": 6e-07, + "loss": 0.0, + "reward": 0.029207328334450722, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.029207328334450722, + "reward_after_std": 1.0521993562579155, + "reward_before_mean": 0.5473756925202906, + "reward_before_std": 1.10447221621871, + "reward_change_max": 0.0031651705503463745, + "reward_change_mean": -0.5181683599948883, + "reward_change_min": -1.1787523217499256, + "reward_change_std": 0.47795812133699656, + "reward_std": 1.0521993860602379, + "rewards/cosine_scaled_reward": 0.023687828797847033, + "rewards/format_reward": 0.5000000074505806, + "step": 30 + }, + { + "advantage_max": 1.8974759131669998, + "advantage_mean": 5.463759289447978e-08, + "advantage_min": -0.7851004675030708, + "advantage_std": 0.9997349232435226, + "completion_length": 3072.645835876465, + "epoch": 0.03542857142857143, + "grad_norm": 0.16128847002983093, + "kl": 2.508610486984253e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.2e-07, + "loss": 0.0, + "reward": -0.4339814521372318, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4339814521372318, + "reward_after_std": 0.6195205058902502, + "reward_before_mean": -0.17191330716013908, + "reward_before_std": 0.6293110642582178, + "reward_change_max": 0.0012357085943222046, + "reward_change_mean": -0.26206816267222166, + "reward_change_min": -0.6218099929392338, + "reward_change_std": 0.24361183121800423, + "reward_std": 0.6195205226540565, + "rewards/cosine_scaled_reward": -0.200539980083704, + "rewards/format_reward": 0.2291666716337204, + "step": 31 + }, + { + "advantage_max": 1.8547977358102798, + "advantage_mean": 3.725291408684939e-09, + "advantage_min": -0.9400533139705658, + "advantage_std": 0.9998045861721039, + "completion_length": 3167.854217529297, + "epoch": 0.036571428571428574, + "grad_norm": 0.16295260190963745, + "kl": 2.5317072868347168e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.4e-07, + "loss": 0.0, + "reward": -0.15984043339267373, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15984043339267373, + "reward_after_std": 0.6901145968586206, + "reward_before_mean": 0.31627153418958187, + "reward_before_std": 0.7256025932729244, + "reward_change_max": 0.0024649351835250854, + "reward_change_mean": -0.4761119820177555, + "reward_change_min": -0.9442967176437378, + "reward_change_std": 0.3847173471003771, + "reward_std": 0.6901146098971367, + "rewards/cosine_scaled_reward": -0.05019756080582738, + "rewards/format_reward": 0.4166666753590107, + "step": 32 + }, + { + "advantage_max": 1.8590810298919678, + "advantage_mean": 8.071461832237503e-09, + "advantage_min": -0.8207400739192963, + "advantage_std": 0.9998291581869125, + "completion_length": 3435.3959045410156, + "epoch": 0.037714285714285714, + "grad_norm": 0.1503303498029709, + "kl": 3.756582736968994e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.6e-07, + "loss": 0.0, + "reward": -0.2504210639744997, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2504210639744997, + "reward_after_std": 0.8694077096879482, + "reward_before_mean": 0.08852381771430373, + "reward_before_std": 0.9110365584492683, + "reward_change_max": 0.0003108680248260498, + "reward_change_mean": -0.3389448709785938, + "reward_change_min": -0.7708389461040497, + "reward_change_std": 0.32376272417604923, + "reward_std": 0.8694077171385288, + "rewards/cosine_scaled_reward": -0.07032142765820026, + "rewards/format_reward": 0.2291666716337204, + "step": 33 + }, + { + "advantage_max": 1.8898741006851196, + "advantage_mean": 1.1175870862079051e-07, + "advantage_min": -0.8010011166334152, + "advantage_std": 0.9996843114495277, + "completion_length": 2295.7083587646484, + "epoch": 0.038857142857142854, + "grad_norm": 0.2735695242881775, + "kl": 3.103911876678467e-05, + "lambda_div_used": 0.5, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "reward": -0.04704808286624029, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.04704808286624029, + "reward_after_std": 0.8338280580937862, + "reward_before_mean": 0.4671720117330551, + "reward_before_std": 0.7878899574279785, + "reward_change_max": 0.00018334388732910156, + "reward_change_mean": -0.5142201161943376, + "reward_change_min": -0.9473587274551392, + "reward_change_std": 0.3720804797485471, + "reward_std": 0.8338280990719795, + "rewards/cosine_scaled_reward": -0.03724732855334878, + "rewards/format_reward": 0.5416666716337204, + "step": 34 + }, + { + "advantage_max": 1.829305186867714, + "advantage_mean": 2.6077032810878364e-08, + "advantage_min": -0.8712577894330025, + "advantage_std": 0.9998411163687706, + "completion_length": 2964.6458435058594, + "epoch": 0.04, + "grad_norm": 0.20366409420967102, + "kl": 4.675239324569702e-05, + "lambda_div_used": 0.5, + "learning_rate": 7e-07, + "loss": 0.0, + "reward": -0.009542571380734444, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.009542571380734444, + "reward_after_std": 0.9506381116807461, + "reward_before_mean": 0.514573335647583, + "reward_before_std": 1.050828494131565, + "reward_change_max": 0.00032019615173339844, + "reward_change_mean": -0.5241158995777369, + "reward_change_min": -1.2276764325797558, + "reward_change_std": 0.5117328846827149, + "reward_std": 0.9506381563842297, + "rewards/cosine_scaled_reward": 0.05936999386176467, + "rewards/format_reward": 0.3958333358168602, + "step": 35 + }, + { + "advantage_max": 1.8876190781593323, + "advantage_mean": 9.344269968902807e-08, + "advantage_min": -0.8404295146465302, + "advantage_std": 0.9997332692146301, + "completion_length": 3368.375030517578, + "epoch": 0.04114285714285714, + "grad_norm": 0.158416748046875, + "kl": 3.793835639953613e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.2e-07, + "loss": 0.0, + "reward": -0.518017141148448, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.518017141148448, + "reward_after_std": 0.5544664487242699, + "reward_before_mean": -0.3090785853564739, + "reward_before_std": 0.5590341556817293, + "reward_change_max": 0.0015523284673690796, + "reward_change_mean": -0.20893854810856283, + "reward_change_min": -0.4174285866320133, + "reward_change_std": 0.1820876558776945, + "reward_std": 0.5544664710760117, + "rewards/cosine_scaled_reward": -0.25870596151798964, + "rewards/format_reward": 0.2083333395421505, + "step": 36 + }, + { + "advantage_max": 1.832179456949234, + "advantage_mean": 9.390836297473726e-08, + "advantage_min": -0.9252327382564545, + "advantage_std": 0.9997186735272408, + "completion_length": 3271.250015258789, + "epoch": 0.04228571428571429, + "grad_norm": 0.15664541721343994, + "kl": 2.317875623703003e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.4e-07, + "loss": 0.0, + "reward": -0.5315404608845711, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5315404608845711, + "reward_after_std": 0.40248375572264194, + "reward_before_mean": -0.27842903579585254, + "reward_before_std": 0.40684795938432217, + "reward_change_max": 0.0006852373480796814, + "reward_change_mean": -0.2531114090234041, + "reward_change_min": -0.4775106720626354, + "reward_change_std": 0.20638726092875004, + "reward_std": 0.40248377062380314, + "rewards/cosine_scaled_reward": -0.2433811966329813, + "rewards/format_reward": 0.20833333395421505, + "step": 37 + }, + { + "advantage_max": 1.8640215396881104, + "advantage_mean": 9.126961608707518e-08, + "advantage_min": -0.9243824407458305, + "advantage_std": 0.999757893383503, + "completion_length": 3210.7916870117188, + "epoch": 0.04342857142857143, + "grad_norm": 0.16199976205825806, + "kl": 2.1124258637428284e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "reward": -0.39931502752006054, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.39931502752006054, + "reward_after_std": 0.5454818941652775, + "reward_before_mean": -0.0851539708673954, + "reward_before_std": 0.545851357281208, + "reward_change_max": 0.0036991089582443237, + "reward_change_mean": -0.3141610324382782, + "reward_change_min": -0.6185585148632526, + "reward_change_std": 0.2539242282509804, + "reward_std": 0.545481912791729, + "rewards/cosine_scaled_reward": -0.13632699789013714, + "rewards/format_reward": 0.1875, + "step": 38 + }, + { + "advantage_max": 1.8490605801343918, + "advantage_mean": 4.6566132061443e-09, + "advantage_min": -0.9282248802483082, + "advantage_std": 0.99977096170187, + "completion_length": 2869.5625228881836, + "epoch": 0.044571428571428574, + "grad_norm": 0.1589706391096115, + "kl": 1.77919864654541e-05, + "lambda_div_used": 0.5, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0, + "reward": -0.13201994262635708, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13201994262635708, + "reward_after_std": 0.5264497548341751, + "reward_before_mean": 0.4156036972999573, + "reward_before_std": 0.45706650079227984, + "reward_change_max": 0.0018109232187271118, + "reward_change_mean": -0.5476236194372177, + "reward_change_min": -0.8542361706495285, + "reward_change_std": 0.35812338441610336, + "reward_std": 0.526449766010046, + "rewards/cosine_scaled_reward": -0.03178148064762354, + "rewards/format_reward": 0.4791666753590107, + "step": 39 + }, + { + "advantage_max": 1.890770971775055, + "advantage_mean": 3.911554902202852e-08, + "advantage_min": -0.9282963797450066, + "advantage_std": 0.9997633919119835, + "completion_length": 2680.6250610351562, + "epoch": 0.045714285714285714, + "grad_norm": 0.1795395016670227, + "kl": 3.270059823989868e-05, + "lambda_div_used": 0.5, + "learning_rate": 8e-07, + "loss": 0.0, + "reward": -0.15905895363539457, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15905895363539457, + "reward_after_std": 0.5205851662904024, + "reward_before_mean": 0.3587095569819212, + "reward_before_std": 0.41430299170315266, + "reward_change_max": 0.0008555799722671509, + "reward_change_mean": -0.5177684919908643, + "reward_change_min": -0.8109701350331306, + "reward_change_std": 0.32442333083599806, + "reward_std": 0.5205851849168539, + "rewards/cosine_scaled_reward": -0.07064523361623287, + "rewards/format_reward": 0.5000000093132257, + "step": 40 + }, + { + "advantage_max": 1.934581384062767, + "advantage_mean": 6.5192582443529545e-09, + "advantage_min": -0.7122581228613853, + "advantage_std": 0.9998366460204124, + "completion_length": 3073.5833740234375, + "epoch": 0.046857142857142854, + "grad_norm": 0.16301335394382477, + "kl": 2.850499004125595e-05, + "lambda_div_used": 0.5, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "reward": -0.3758960599079728, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3758960599079728, + "reward_after_std": 0.7847613207995892, + "reward_before_mean": -0.12845315597951412, + "reward_before_std": 0.7612991891801357, + "reward_change_max": 0.0017989054322242737, + "reward_change_mean": -0.24744290485978127, + "reward_change_min": -0.5457473620772362, + "reward_change_std": 0.22195276990532875, + "reward_std": 0.7847613506019115, + "rewards/cosine_scaled_reward": -0.22047658078372478, + "rewards/format_reward": 0.31250000558793545, + "step": 41 + }, + { + "advantage_max": 1.9205241799354553, + "advantage_mean": 2.483526917451684e-08, + "advantage_min": -0.6978275701403618, + "advantage_std": 0.999718151986599, + "completion_length": 2810.895866394043, + "epoch": 0.048, + "grad_norm": 0.22402982413768768, + "kl": 4.731118679046631e-05, + "lambda_div_used": 0.5, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "reward": -0.4632724979892373, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4632724979892373, + "reward_after_std": 0.550530057400465, + "reward_before_mean": -0.20958980754949152, + "reward_before_std": 0.535234902985394, + "reward_change_max": 0.002125099301338196, + "reward_change_mean": -0.25368270359467715, + "reward_change_min": -0.5563341490924358, + "reward_change_std": 0.21589954826049507, + "reward_std": 0.5505300872027874, + "rewards/cosine_scaled_reward": -0.27146157110109925, + "rewards/format_reward": 0.3333333358168602, + "step": 42 + }, + { + "advantage_max": 1.9519257545471191, + "advantage_mean": 1.055498932700516e-08, + "advantage_min": -0.7456157505512238, + "advantage_std": 0.9998450949788094, + "completion_length": 3029.416717529297, + "epoch": 0.04914285714285714, + "grad_norm": 0.20059436559677124, + "kl": 2.3126602172851562e-05, + "lambda_div_used": 0.5, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "reward": -0.2882131487131119, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2882131487131119, + "reward_after_std": 0.8363650441169739, + "reward_before_mean": 0.01781909167766571, + "reward_before_std": 0.8098427765071392, + "reward_change_max": 0.002855464816093445, + "reward_change_mean": -0.30603223550133407, + "reward_change_min": -0.7128670252859592, + "reward_change_std": 0.2696162755601108, + "reward_std": 0.8363650590181351, + "rewards/cosine_scaled_reward": -0.14734045788645744, + "rewards/format_reward": 0.31250000558793545, + "step": 43 + }, + { + "advantage_max": 1.921295627951622, + "advantage_mean": 3.3527614351491764e-08, + "advantage_min": -0.7192764729261398, + "advantage_std": 0.9997594803571701, + "completion_length": 2610.000030517578, + "epoch": 0.05028571428571429, + "grad_norm": 0.2608076333999634, + "kl": 0.0001121722161769867, + "lambda_div_used": 0.5, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "reward": -0.2315717376768589, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2315717376768589, + "reward_after_std": 0.7365182116627693, + "reward_before_mean": 0.15951215103268623, + "reward_before_std": 0.7295054513961077, + "reward_change_max": 0.0021981820464134216, + "reward_change_mean": -0.39108387008309364, + "reward_change_min": -0.790928453207016, + "reward_change_std": 0.3149452833458781, + "reward_std": 0.7365182191133499, + "rewards/cosine_scaled_reward": -0.1285772593691945, + "rewards/format_reward": 0.41666667349636555, + "step": 44 + }, + { + "advantage_max": 1.8415375053882599, + "advantage_mean": 7.078051655895479e-08, + "advantage_min": -0.8002776876091957, + "advantage_std": 0.9997911676764488, + "completion_length": 3401.375, + "epoch": 0.05142857142857143, + "grad_norm": 0.15118639171123505, + "kl": 4.731118679046631e-05, + "lambda_div_used": 0.5, + "learning_rate": 9e-07, + "loss": 0.0, + "reward": -0.3574506975710392, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3574506975710392, + "reward_after_std": 0.7208534479141235, + "reward_before_mean": -0.06056522950530052, + "reward_before_std": 0.7741351164877415, + "reward_change_max": 0.0, + "reward_change_mean": -0.29688544757664204, + "reward_change_min": -0.7843009307980537, + "reward_change_std": 0.30331660620868206, + "reward_std": 0.7208534702658653, + "rewards/cosine_scaled_reward": -0.12403261481085792, + "rewards/format_reward": 0.18750000558793545, + "step": 45 + }, + { + "advantage_max": 1.8736757040023804, + "advantage_mean": -6.6744791915596124e-09, + "advantage_min": -0.9290287122130394, + "advantage_std": 0.9997085630893707, + "completion_length": 3181.1875, + "epoch": 0.052571428571428575, + "grad_norm": 0.21306857466697693, + "kl": 7.120147347450256e-05, + "lambda_div_used": 0.5, + "learning_rate": 9.2e-07, + "loss": 0.0, + "reward": -0.5937137454748154, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5937137454748154, + "reward_after_std": 0.3615282103419304, + "reward_before_mean": -0.3823595102876425, + "reward_before_std": 0.37264155969023705, + "reward_change_max": 0.00016885995864868164, + "reward_change_mean": -0.21135425474494696, + "reward_change_min": -0.4077131114900112, + "reward_change_std": 0.17300888849422336, + "reward_std": 0.3615282140672207, + "rewards/cosine_scaled_reward": -0.26409642212092876, + "rewards/format_reward": 0.1458333395421505, + "step": 46 + }, + { + "advantage_max": 1.8681392222642899, + "advantage_mean": 2.5611371856637533e-08, + "advantage_min": -0.8132197260856628, + "advantage_std": 0.9998373538255692, + "completion_length": 2791.8333892822266, + "epoch": 0.053714285714285714, + "grad_norm": 0.2283652275800705, + "kl": 7.794797420501709e-05, + "lambda_div_used": 0.5, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "reward": -0.02218475693371147, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.02218475693371147, + "reward_after_std": 0.8718568906188011, + "reward_before_mean": 0.511185884475708, + "reward_before_std": 0.9081144295632839, + "reward_change_max": 0.00019782781600952148, + "reward_change_mean": -0.5333706503733993, + "reward_change_min": -1.1346662230789661, + "reward_change_std": 0.4650296289473772, + "reward_std": 0.8718568943440914, + "rewards/cosine_scaled_reward": 0.005592945963144302, + "rewards/format_reward": 0.5, + "step": 47 + }, + { + "advantage_max": 1.8958362936973572, + "advantage_mean": 1.4280280513645494e-08, + "advantage_min": -0.7991106733679771, + "advantage_std": 0.9998115226626396, + "completion_length": 2759.770866394043, + "epoch": 0.054857142857142854, + "grad_norm": 0.30551856756210327, + "kl": 0.00015142187476158142, + "lambda_div_used": 0.5, + "learning_rate": 9.6e-07, + "loss": 0.0, + "reward": -0.21847828477621078, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.21847828477621078, + "reward_after_std": 0.776937173679471, + "reward_before_mean": 0.16552765760570765, + "reward_before_std": 0.7491930667310953, + "reward_change_max": 0.0014309212565422058, + "reward_change_mean": -0.3840059507638216, + "reward_change_min": -0.7702530585229397, + "reward_change_std": 0.29957136139273643, + "reward_std": 0.7769371904432774, + "rewards/cosine_scaled_reward": -0.08390284143388271, + "rewards/format_reward": 0.3333333358168602, + "step": 48 + }, + { + "advantage_max": 1.8815445601940155, + "advantage_mean": 1.179675274132208e-08, + "advantage_min": -0.8456504121422768, + "advantage_std": 0.9998609572649002, + "completion_length": 2267.3125381469727, + "epoch": 0.056, + "grad_norm": 0.24697095155715942, + "kl": 0.00010534748435020447, + "lambda_div_used": 0.5, + "learning_rate": 9.8e-07, + "loss": 0.0, + "reward": -0.055962367448955774, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.055962367448955774, + "reward_after_std": 0.9708420261740685, + "reward_before_mean": 0.4173060655593872, + "reward_before_std": 1.0131547879427671, + "reward_change_max": 0.003098607063293457, + "reward_change_mean": -0.4732684250921011, + "reward_change_min": -1.1020850576460361, + "reward_change_std": 0.44488269835710526, + "reward_std": 0.9708420485258102, + "rewards/cosine_scaled_reward": -0.07259697344852611, + "rewards/format_reward": 0.5625000167638063, + "step": 49 + }, + { + "advantage_max": 1.8893340080976486, + "advantage_mean": 1.8005570812107408e-08, + "advantage_min": -0.8202431090176105, + "advantage_std": 0.9997889772057533, + "completion_length": 2962.0625076293945, + "epoch": 0.05714285714285714, + "grad_norm": 0.1640988290309906, + "kl": 0.00016727298498153687, + "lambda_div_used": 0.5, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": -0.09585870243608952, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09585870243608952, + "reward_after_std": 0.7277687638998032, + "reward_before_mean": 0.41637253458611667, + "reward_before_std": 0.6608343506231904, + "reward_change_max": 0.0, + "reward_change_mean": -0.512231232598424, + "reward_change_min": -0.9351947791874409, + "reward_change_std": 0.3637898899614811, + "reward_std": 0.727768812328577, + "rewards/cosine_scaled_reward": 0.041519587859511375, + "rewards/format_reward": 0.3333333358168602, + "step": 50 + }, + { + "advantage_max": 1.8653928488492966, + "advantage_mean": 5.86733245322435e-08, + "advantage_min": -0.9568943232297897, + "advantage_std": 0.9997608214616776, + "completion_length": 2327.958381652832, + "epoch": 0.05828571428571429, + "grad_norm": 0.24649883806705475, + "kl": 0.00038273632526397705, + "lambda_div_used": 0.5, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0, + "reward": -0.17880022956524044, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.17880022956524044, + "reward_after_std": 0.6422858815640211, + "reward_before_mean": 0.2922116946429014, + "reward_before_std": 0.6047020759433508, + "reward_change_max": 0.001704975962638855, + "reward_change_mean": -0.4710119031369686, + "reward_change_min": -0.8810345865786076, + "reward_change_std": 0.34449261892586946, + "reward_std": 0.6422859076410532, + "rewards/cosine_scaled_reward": -0.10389416851103306, + "rewards/format_reward": 0.5000000055879354, + "step": 51 + }, + { + "advantage_max": 1.823257952928543, + "advantage_mean": -1.1796753018877837e-08, + "advantage_min": -0.8818978518247604, + "advantage_std": 0.9998767971992493, + "completion_length": 2949.541717529297, + "epoch": 0.05942857142857143, + "grad_norm": 0.18382064998149872, + "kl": 0.0005578286945819855, + "lambda_div_used": 0.5, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "reward": 0.14336604299023747, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14336604299023747, + "reward_after_std": 1.1285717599093914, + "reward_before_mean": 0.7415338456630707, + "reward_before_std": 1.2546439096331596, + "reward_change_max": 0.0019373148679733276, + "reward_change_mean": -0.5981677994132042, + "reward_change_min": -1.3573957942426205, + "reward_change_std": 0.5901675391942263, + "reward_std": 1.1285717971622944, + "rewards/cosine_scaled_reward": 0.1311835777014494, + "rewards/format_reward": 0.4791666753590107, + "step": 52 + }, + { + "advantage_max": 1.8426322937011719, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.9017970934510231, + "advantage_std": 0.9998707324266434, + "completion_length": 2719.041717529297, + "epoch": 0.060571428571428575, + "grad_norm": 0.18095125257968903, + "kl": 0.000291973352432251, + "lambda_div_used": 0.5, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "reward": 0.19375211838632822, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.19375211838632822, + "reward_after_std": 1.0046247728168964, + "reward_before_mean": 0.8646276481449604, + "reward_before_std": 1.0044534653425217, + "reward_change_max": 0.005661614239215851, + "reward_change_mean": -0.6708755232393742, + "reward_change_min": -1.3468510583043098, + "reward_change_std": 0.5442535653710365, + "reward_std": 1.0046248212456703, + "rewards/cosine_scaled_reward": 0.11981380297220312, + "rewards/format_reward": 0.6250000111758709, + "step": 53 + }, + { + "advantage_max": 1.838746502995491, + "advantage_mean": -8.07146127712599e-09, + "advantage_min": -0.8344198316335678, + "advantage_std": 0.9998571202158928, + "completion_length": 2901.416717529297, + "epoch": 0.061714285714285715, + "grad_norm": 0.1705823391675949, + "kl": 8.270889520645142e-05, + "lambda_div_used": 0.5, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0, + "reward": 0.13633309258148074, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13633309258148074, + "reward_after_std": 0.9937352724373341, + "reward_before_mean": 0.7676731869578362, + "reward_before_std": 1.0381547771394253, + "reward_change_max": 0.001669757068157196, + "reward_change_mean": -0.6313401181250811, + "reward_change_min": -1.3544641695916653, + "reward_change_std": 0.5637638978660107, + "reward_std": 0.9937352761626244, + "rewards/cosine_scaled_reward": 0.14425326080527157, + "rewards/format_reward": 0.4791666753590107, + "step": 54 + }, + { + "advantage_max": 1.8439057767391205, + "advantage_mean": -6.829699583654758e-09, + "advantage_min": -0.8918868899345398, + "advantage_std": 0.9998131170868874, + "completion_length": 3011.3333740234375, + "epoch": 0.06285714285714286, + "grad_norm": 0.15338917076587677, + "kl": 0.0003665238618850708, + "lambda_div_used": 0.5, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "reward": -0.14612246677279472, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14612246677279472, + "reward_after_std": 0.7486055754125118, + "reward_before_mean": 0.31836483208462596, + "reward_before_std": 0.7372772246599197, + "reward_change_max": 0.0, + "reward_change_mean": -0.46448732912540436, + "reward_change_min": -0.9766540713608265, + "reward_change_std": 0.39013639837503433, + "reward_std": 0.7486055865883827, + "rewards/cosine_scaled_reward": -0.028317579999566078, + "rewards/format_reward": 0.3750000074505806, + "step": 55 + }, + { + "advantage_max": 1.8368095457553864, + "advantage_mean": 2.7318797446440612e-08, + "advantage_min": -0.9046461880207062, + "advantage_std": 0.9997836649417877, + "completion_length": 2900.312515258789, + "epoch": 0.064, + "grad_norm": 0.17617428302764893, + "kl": 0.0005377233028411865, + "lambda_div_used": 0.5, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "reward": -0.13353440910577774, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13353440910577774, + "reward_after_std": 0.6982099693268538, + "reward_before_mean": 0.3613228676840663, + "reward_before_std": 0.7062958665192127, + "reward_change_max": 0.0013622492551803589, + "reward_change_mean": -0.4948572628200054, + "reward_change_min": -0.8601287193596363, + "reward_change_std": 0.3771468782797456, + "reward_std": 0.698210010305047, + "rewards/cosine_scaled_reward": -0.04850525222718716, + "rewards/format_reward": 0.45833334140479565, + "step": 56 + }, + { + "advantage_max": 1.885016068816185, + "advantage_mean": 4.2219957807621e-08, + "advantage_min": -0.8030908405780792, + "advantage_std": 0.9997961819171906, + "completion_length": 3282.9584045410156, + "epoch": 0.06514285714285714, + "grad_norm": 0.1289624273777008, + "kl": 0.0001293867826461792, + "lambda_div_used": 0.5, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0, + "reward": -0.23884075693786144, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23884075693786144, + "reward_after_std": 0.7863927576690912, + "reward_before_mean": 0.13652504794299603, + "reward_before_std": 0.8171067088842392, + "reward_change_max": 0.0025532394647598267, + "reward_change_mean": -0.3753657881170511, + "reward_change_min": -0.8432562984526157, + "reward_change_std": 0.3470982797443867, + "reward_std": 0.78639280423522, + "rewards/cosine_scaled_reward": -0.12965414859354496, + "rewards/format_reward": 0.3958333395421505, + "step": 57 + }, + { + "advantage_max": 1.8578455299139023, + "advantage_mean": 2.793967834868738e-08, + "advantage_min": -0.8778782561421394, + "advantage_std": 0.9998427554965019, + "completion_length": 2379.1250228881836, + "epoch": 0.06628571428571428, + "grad_norm": 0.27240949869155884, + "kl": 0.0014230608940124512, + "lambda_div_used": 0.5, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0001, + "reward": -0.005995278595946729, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.005995278595946729, + "reward_after_std": 0.8489437401294708, + "reward_before_mean": 0.5475127007812262, + "reward_before_std": 0.8708108924329281, + "reward_change_max": 0.0017568692564964294, + "reward_change_mean": -0.5535079715773463, + "reward_change_min": -1.082301240414381, + "reward_change_std": 0.4453202374279499, + "reward_std": 0.8489437624812126, + "rewards/cosine_scaled_reward": -0.049160322174429893, + "rewards/format_reward": 0.645833345130086, + "step": 58 + }, + { + "advantage_max": 1.861334353685379, + "advantage_mean": -1.179675268581093e-08, + "advantage_min": -0.8184168860316277, + "advantage_std": 0.9997863620519638, + "completion_length": 2806.333351135254, + "epoch": 0.06742857142857143, + "grad_norm": 0.15535277128219604, + "kl": 0.00013153068721294403, + "lambda_div_used": 0.5, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0, + "reward": -0.2553709470666945, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2553709470666945, + "reward_after_std": 0.6489655673503876, + "reward_before_mean": 0.14917359128594398, + "reward_before_std": 0.641713622957468, + "reward_change_max": 0.0031897202134132385, + "reward_change_mean": -0.4045445565134287, + "reward_change_min": -0.8240447789430618, + "reward_change_std": 0.3207780700176954, + "reward_std": 0.6489655859768391, + "rewards/cosine_scaled_reward": -0.09207986295223236, + "rewards/format_reward": 0.3333333358168602, + "step": 59 + }, + { + "advantage_max": 1.911918118596077, + "advantage_mean": 1.5543122344752192e-15, + "advantage_min": -0.7607595697045326, + "advantage_std": 0.9998063743114471, + "completion_length": 2909.041702270508, + "epoch": 0.06857142857142857, + "grad_norm": 0.1574772149324417, + "kl": 0.0002549951896071434, + "lambda_div_used": 0.5, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": -0.24616998185229022, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.24616998185229022, + "reward_after_std": 0.7582415752112865, + "reward_before_mean": 0.12705273227766156, + "reward_before_std": 0.7256910298019648, + "reward_change_max": 0.0013196319341659546, + "reward_change_mean": -0.3732227301225066, + "reward_change_min": -0.6842942871153355, + "reward_change_std": 0.2910933867096901, + "reward_std": 0.7582416199147701, + "rewards/cosine_scaled_reward": -0.13439030945301056, + "rewards/format_reward": 0.39583333767950535, + "step": 60 + }, + { + "advantage_max": 1.8177462965250015, + "advantage_mean": -1.241764135961887e-09, + "advantage_min": -1.0356914550065994, + "advantage_std": 0.9997951909899712, + "completion_length": 2884.5834045410156, + "epoch": 0.06971428571428571, + "grad_norm": 0.1754026859998703, + "kl": 0.00044707953929901123, + "lambda_div_used": 0.5, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "reward": -0.10852715838700533, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.10852715838700533, + "reward_after_std": 0.7251477800309658, + "reward_before_mean": 0.400545597076416, + "reward_before_std": 0.7691365052014589, + "reward_change_max": 0.0020439624786376953, + "reward_change_mean": -0.5090727712959051, + "reward_change_min": -0.9695079140365124, + "reward_change_std": 0.41014579124748707, + "reward_std": 0.725147807970643, + "rewards/cosine_scaled_reward": -0.049727211240679026, + "rewards/format_reward": 0.5000000111758709, + "step": 61 + }, + { + "advantage_max": 1.9007128179073334, + "advantage_mean": 1.707424779340272e-08, + "advantage_min": -0.7632358595728874, + "advantage_std": 0.999834880232811, + "completion_length": 2444.4583435058594, + "epoch": 0.07085714285714285, + "grad_norm": 0.20853900909423828, + "kl": 0.0012095272541046143, + "lambda_div_used": 0.5, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "reward": 0.08043081499636173, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08043081499636173, + "reward_after_std": 0.8052340261638165, + "reward_before_mean": 0.7084651309996843, + "reward_before_std": 0.6865715570747852, + "reward_change_max": 0.0, + "reward_change_mean": -0.6280343104153872, + "reward_change_min": -1.0744778029620647, + "reward_change_std": 0.4058781899511814, + "reward_std": 0.8052340373396873, + "rewards/cosine_scaled_reward": 0.05214923154562712, + "rewards/format_reward": 0.6041666697710752, + "step": 62 + }, + { + "advantage_max": 1.8943010717630386, + "advantage_mean": -2.235174201281609e-08, + "advantage_min": -0.8820522204041481, + "advantage_std": 0.9998508021235466, + "completion_length": 2203.541717529297, + "epoch": 0.072, + "grad_norm": 0.25720417499542236, + "kl": 0.0009733438491821289, + "lambda_div_used": 0.5, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0, + "reward": 0.106345753534697, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.106345753534697, + "reward_after_std": 0.8436989672482014, + "reward_before_mean": 0.7488799318671227, + "reward_before_std": 0.7643895111978054, + "reward_change_max": 0.00014873594045639038, + "reward_change_mean": -0.6425342075526714, + "reward_change_min": -1.0930566787719727, + "reward_change_std": 0.43817601166665554, + "reward_std": 0.843698974698782, + "rewards/cosine_scaled_reward": 0.041106633958406746, + "rewards/format_reward": 0.6666666772216558, + "step": 63 + }, + { + "advantage_max": 1.9071584939956665, + "advantage_mean": 5.494803312355856e-08, + "advantage_min": -0.8514663949608803, + "advantage_std": 0.9998283535242081, + "completion_length": 2759.000045776367, + "epoch": 0.07314285714285715, + "grad_norm": 0.1668003350496292, + "kl": 0.00036591291427612305, + "lambda_div_used": 0.5, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "reward": 0.03588845953345299, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.03588845953345299, + "reward_after_std": 0.8736652098596096, + "reward_before_mean": 0.6065978556871414, + "reward_before_std": 0.8280804753303528, + "reward_change_max": 0.0005163624882698059, + "reward_change_mean": -0.5707093523815274, + "reward_change_min": -1.060088012367487, + "reward_change_std": 0.4052878515794873, + "reward_std": 0.8736652284860611, + "rewards/cosine_scaled_reward": 0.05329891119617969, + "rewards/format_reward": 0.5000000111758709, + "step": 64 + }, + { + "advantage_max": 1.9266762882471085, + "advantage_mean": 6.208816460961941e-09, + "advantage_min": -0.7601385116577148, + "advantage_std": 0.9997923299670219, + "completion_length": 2736.000015258789, + "epoch": 0.07428571428571429, + "grad_norm": 0.19557268917560577, + "kl": 0.0006786584854125977, + "lambda_div_used": 0.5, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "reward": -0.2076890431344509, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2076890431344509, + "reward_after_std": 0.6389360800385475, + "reward_before_mean": 0.2348735888954252, + "reward_before_std": 0.5559763349592686, + "reward_change_max": 0.001060768961906433, + "reward_change_mean": -0.4425626490265131, + "reward_change_min": -0.8092819452285767, + "reward_change_std": 0.32220305129885674, + "reward_std": 0.6389361061155796, + "rewards/cosine_scaled_reward": -0.10131320543587208, + "rewards/format_reward": 0.43750000186264515, + "step": 65 + }, + { + "advantage_max": 1.9095334112644196, + "advantage_mean": 2.452482830705982e-08, + "advantage_min": -0.7046549618244171, + "advantage_std": 0.9998053535819054, + "completion_length": 2092.9583473205566, + "epoch": 0.07542857142857143, + "grad_norm": 0.24275831878185272, + "kl": 0.0019240379333496094, + "lambda_div_used": 0.5, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0001, + "reward": -0.10312021151185036, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10312021151185036, + "reward_after_std": 0.705897755920887, + "reward_before_mean": 0.4020507410168648, + "reward_before_std": 0.6098583452403545, + "reward_change_max": 0.002116262912750244, + "reward_change_mean": -0.5051709450781345, + "reward_change_min": -0.9165816679596901, + "reward_change_std": 0.3466827627271414, + "reward_std": 0.7058977633714676, + "rewards/cosine_scaled_reward": -0.05939129926264286, + "rewards/format_reward": 0.520833333954215, + "step": 66 + }, + { + "advantage_max": 1.8870718479156494, + "advantage_mean": 4.594524827261637e-08, + "advantage_min": -0.7717634811997414, + "advantage_std": 0.9997425973415375, + "completion_length": 3376.8333435058594, + "epoch": 0.07657142857142857, + "grad_norm": 0.13818876445293427, + "kl": 0.0005912370979785919, + "lambda_div_used": 0.5, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0, + "reward": -0.5356982201337814, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5356982201337814, + "reward_after_std": 0.5143481884151697, + "reward_before_mean": -0.32520947977900505, + "reward_before_std": 0.5225167237222195, + "reward_change_max": 0.0011303573846817017, + "reward_change_mean": -0.21048873430117965, + "reward_change_min": -0.5026979595422745, + "reward_change_std": 0.20696008298546076, + "reward_std": 0.5143481958657503, + "rewards/cosine_scaled_reward": -0.23552141524851322, + "rewards/format_reward": 0.1458333395421505, + "step": 67 + }, + { + "advantage_max": 1.8678310364484787, + "advantage_mean": -2.8871000701258254e-08, + "advantage_min": -0.8665808811783791, + "advantage_std": 0.9998535141348839, + "completion_length": 2133.0417098999023, + "epoch": 0.07771428571428571, + "grad_norm": 0.22691960632801056, + "kl": 0.001744091510772705, + "lambda_div_used": 0.5, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0001, + "reward": -0.007227955153211951, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.007227955153211951, + "reward_after_std": 0.9634606316685677, + "reward_before_mean": 0.5006781555712223, + "reward_before_std": 0.9779879823327065, + "reward_change_max": 0.0024183765053749084, + "reward_change_mean": -0.5079061184078455, + "reward_change_min": -1.039794061332941, + "reward_change_std": 0.4307109545916319, + "reward_std": 0.9634606577455997, + "rewards/cosine_scaled_reward": -0.05174426478333771, + "rewards/format_reward": 0.604166679084301, + "step": 68 + }, + { + "advantage_max": 1.8920985758304596, + "advantage_mean": 1.6142924885720333e-08, + "advantage_min": -0.8405609056353569, + "advantage_std": 0.9997691139578819, + "completion_length": 2647.4166870117188, + "epoch": 0.07885714285714286, + "grad_norm": 0.27245891094207764, + "kl": 0.0019397735595703125, + "lambda_div_used": 0.5, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0001, + "reward": -0.3990805000066757, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3990805000066757, + "reward_after_std": 0.5694083124399185, + "reward_before_mean": -0.10020093433558941, + "reward_before_std": 0.5409816838800907, + "reward_change_max": 0.0004583820700645447, + "reward_change_mean": -0.2988795740529895, + "reward_change_min": -0.5763977244496346, + "reward_change_std": 0.22476927004754543, + "reward_std": 0.5694083385169506, + "rewards/cosine_scaled_reward": -0.22718380577862263, + "rewards/format_reward": 0.3541666753590107, + "step": 69 + }, + { + "advantage_max": 1.9086698144674301, + "advantage_mean": 2.856055980604566e-08, + "advantage_min": -0.8275108188390732, + "advantage_std": 0.9997518509626389, + "completion_length": 3076.375045776367, + "epoch": 0.08, + "grad_norm": 0.16705681383609772, + "kl": 0.0011624768376350403, + "lambda_div_used": 0.5, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "reward": -0.4171713124960661, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4171713124960661, + "reward_after_std": 0.5041129477322102, + "reward_before_mean": -0.10847730562090874, + "reward_before_std": 0.46587206050753593, + "reward_change_max": 0.0018154531717300415, + "reward_change_mean": -0.30869400314986706, + "reward_change_min": -0.5475729629397392, + "reward_change_std": 0.22376791667193174, + "reward_std": 0.5041129551827908, + "rewards/cosine_scaled_reward": -0.20007198816165328, + "rewards/format_reward": 0.29166667349636555, + "step": 70 + }, + { + "advantage_max": 1.8870573192834854, + "advantage_mean": 1.1796752186210568e-08, + "advantage_min": -0.7760294862091541, + "advantage_std": 0.9997637048363686, + "completion_length": 2732.1458740234375, + "epoch": 0.08114285714285714, + "grad_norm": 0.498686820268631, + "kl": 0.007267266511917114, + "lambda_div_used": 0.5, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0003, + "reward": -0.17455044807866216, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17455044807866216, + "reward_after_std": 0.5208050534129143, + "reward_before_mean": 0.3330681398510933, + "reward_before_std": 0.3784195650368929, + "reward_change_max": 0.0, + "reward_change_mean": -0.5076185669749975, + "reward_change_min": -0.7843321524560452, + "reward_change_std": 0.31202565133571625, + "reward_std": 0.5208050757646561, + "rewards/cosine_scaled_reward": -0.02096594963222742, + "rewards/format_reward": 0.3750000037252903, + "step": 71 + }, + { + "advantage_max": 1.8693189769983292, + "advantage_mean": 2.6697914656814703e-08, + "advantage_min": -0.8475881665945053, + "advantage_std": 0.9998268261551857, + "completion_length": 3021.166702270508, + "epoch": 0.08228571428571428, + "grad_norm": 0.2537727355957031, + "kl": 0.0018652677536010742, + "lambda_div_used": 0.5, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0001, + "reward": -0.30589374899864197, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.30589374899864197, + "reward_after_std": 0.7303144559264183, + "reward_before_mean": 0.0258680060505867, + "reward_before_std": 0.7414015308022499, + "reward_change_max": 0.0, + "reward_change_mean": -0.33176175132393837, + "reward_change_min": -0.7698645628988743, + "reward_change_std": 0.3043969329446554, + "reward_std": 0.7303144857287407, + "rewards/cosine_scaled_reward": -0.1433160022716038, + "rewards/format_reward": 0.3125000111758709, + "step": 72 + }, + { + "advantage_max": 1.87188358604908, + "advantage_mean": 4.3461721443982526e-08, + "advantage_min": -0.8683692142367363, + "advantage_std": 0.99978356808424, + "completion_length": 3466.187530517578, + "epoch": 0.08342857142857144, + "grad_norm": 0.15660125017166138, + "kl": 0.00022289901971817017, + "lambda_div_used": 0.5, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0, + "reward": -0.3592198118567467, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3592198118567467, + "reward_after_std": 0.6919657550752163, + "reward_before_mean": -0.05832542385905981, + "reward_before_std": 0.70926533639431, + "reward_change_max": 0.0, + "reward_change_mean": -0.3008943935856223, + "reward_change_min": -0.6622928902506828, + "reward_change_std": 0.27009566500782967, + "reward_std": 0.6919657699763775, + "rewards/cosine_scaled_reward": -0.14374604681506753, + "rewards/format_reward": 0.22916666977107525, + "step": 73 + }, + { + "advantage_max": 1.937007024884224, + "advantage_mean": 6.705522959116195e-08, + "advantage_min": -0.726312592625618, + "advantage_std": 0.9997691810131073, + "completion_length": 3224.0000610351562, + "epoch": 0.08457142857142858, + "grad_norm": 0.16150522232055664, + "kl": 0.0010072067379951477, + "lambda_div_used": 0.5, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0, + "reward": -0.30428778287023306, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.30428778287023306, + "reward_after_std": 0.693829420953989, + "reward_before_mean": 0.036474743857979774, + "reward_before_std": 0.6149825137108564, + "reward_change_max": 0.0007465705275535583, + "reward_change_mean": -0.3407625099644065, + "reward_change_min": -0.6090046390891075, + "reward_change_std": 0.23605277249589562, + "reward_std": 0.6938294619321823, + "rewards/cosine_scaled_reward": -0.08592929691076279, + "rewards/format_reward": 0.2083333358168602, + "step": 74 + }, + { + "advantage_max": 1.9063157737255096, + "advantage_mean": 1.3969838397187573e-08, + "advantage_min": -0.7801737226545811, + "advantage_std": 0.9998098164796829, + "completion_length": 2924.5000610351562, + "epoch": 0.08571428571428572, + "grad_norm": 0.17792584002017975, + "kl": 0.0008277297019958496, + "lambda_div_used": 0.5, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "reward": -0.09559960942715406, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09559960942715406, + "reward_after_std": 0.7040151692926884, + "reward_before_mean": 0.4155034478753805, + "reward_before_std": 0.5951447263360023, + "reward_change_max": 0.00082358717918396, + "reward_change_mean": -0.5111030461266637, + "reward_change_min": -0.8385867662727833, + "reward_change_std": 0.33908967301249504, + "reward_std": 0.7040151953697205, + "rewards/cosine_scaled_reward": -0.0005816244520246983, + "rewards/format_reward": 0.4166666679084301, + "step": 75 + }, + { + "advantage_max": 1.898203819990158, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.8630974441766739, + "advantage_std": 0.9997747763991356, + "completion_length": 2809.229232788086, + "epoch": 0.08685714285714285, + "grad_norm": 0.1862030029296875, + "kl": 0.0001981779932975769, + "lambda_div_used": 0.5, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "reward": -0.25100927520543337, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.25100927520543337, + "reward_after_std": 0.5281644500792027, + "reward_before_mean": 0.1934042815119028, + "reward_before_std": 0.47337512113153934, + "reward_change_max": 0.0, + "reward_change_mean": -0.4444135669618845, + "reward_change_min": -0.7713957540690899, + "reward_change_std": 0.29939418844878674, + "reward_std": 0.5281644649803638, + "rewards/cosine_scaled_reward": -0.15329785831272602, + "rewards/format_reward": 0.5000000093132257, + "step": 76 + }, + { + "advantage_max": 1.8327363729476929, + "advantage_mean": 2.107117447192053e-08, + "advantage_min": -1.0422530099749565, + "advantage_std": 0.9997787699103355, + "completion_length": 3040.812530517578, + "epoch": 0.088, + "grad_norm": 0.15248002111911774, + "kl": 0.00022713467478752136, + "lambda_div_used": 0.5, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0, + "reward": -0.3516826815903187, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3516826815903187, + "reward_after_std": 0.5596727095544338, + "reward_before_mean": 0.0043606823310256, + "reward_before_std": 0.6001443788409233, + "reward_change_max": 0.0011972561478614807, + "reward_change_mean": -0.3560433629900217, + "reward_change_min": -0.7085311934351921, + "reward_change_std": 0.30133811570703983, + "reward_std": 0.5596727319061756, + "rewards/cosine_scaled_reward": -0.14365299977362156, + "rewards/format_reward": 0.29166667722165585, + "step": 77 + }, + { + "advantage_max": 1.8811450600624084, + "advantage_mean": 4.967053979232361e-08, + "advantage_min": -0.7257413119077682, + "advantage_std": 0.9998095110058784, + "completion_length": 3252.8958587646484, + "epoch": 0.08914285714285715, + "grad_norm": 0.17430712282657623, + "kl": 0.00013465189840644598, + "lambda_div_used": 0.5, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0, + "reward": -0.3133370358264074, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.3133370358264074, + "reward_after_std": 0.8261595070362091, + "reward_before_mean": -0.015162130817770958, + "reward_before_std": 0.8511116541922092, + "reward_change_max": 0.0, + "reward_change_mean": -0.2981749000027776, + "reward_change_min": -0.7532649748027325, + "reward_change_std": 0.295009003020823, + "reward_std": 0.8261595349758863, + "rewards/cosine_scaled_reward": -0.12216439709300175, + "rewards/format_reward": 0.2291666753590107, + "step": 78 + }, + { + "advantage_max": 1.9250884354114532, + "advantage_mean": 1.4280279958533981e-08, + "advantage_min": -0.7625341862440109, + "advantage_std": 0.9998122826218605, + "completion_length": 2306.250015258789, + "epoch": 0.09028571428571429, + "grad_norm": 0.21141712367534637, + "kl": 0.0023859739303588867, + "lambda_div_used": 0.5, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0001, + "reward": -0.12412842572666705, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12412842572666705, + "reward_after_std": 0.7300403695553541, + "reward_before_mean": 0.35858380049467087, + "reward_before_std": 0.6866056565195322, + "reward_change_max": 0.001516088843345642, + "reward_change_mean": -0.48271221574395895, + "reward_change_min": -0.8789361789822578, + "reward_change_std": 0.3496302356943488, + "reward_std": 0.7300403844565153, + "rewards/cosine_scaled_reward": -0.12279144860804081, + "rewards/format_reward": 0.6041666679084301, + "step": 79 + }, + { + "advantage_max": 1.9084865599870682, + "advantage_mean": 4.594524782852716e-08, + "advantage_min": -0.7381228767335415, + "advantage_std": 0.999783106148243, + "completion_length": 3179.9375610351562, + "epoch": 0.09142857142857143, + "grad_norm": 0.17955073714256287, + "kl": 0.0006044581532478333, + "lambda_div_used": 0.5, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "reward": -0.27829574840143323, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.27829574840143323, + "reward_after_std": 0.7623252347111702, + "reward_before_mean": 0.07243560440838337, + "reward_before_std": 0.7849831110797822, + "reward_change_max": 0.0006520003080368042, + "reward_change_mean": -0.3507313448935747, + "reward_change_min": -0.913384735584259, + "reward_change_std": 0.35589488223195076, + "reward_std": 0.7623252794146538, + "rewards/cosine_scaled_reward": -0.14086553594097495, + "rewards/format_reward": 0.3541666679084301, + "step": 80 + }, + { + "advantage_max": 1.874961331486702, + "advantage_mean": 2.980232349791834e-08, + "advantage_min": -0.926294356584549, + "advantage_std": 0.9997445642948151, + "completion_length": 3036.6667098999023, + "epoch": 0.09257142857142857, + "grad_norm": 0.2611143887042999, + "kl": 0.0016833841800689697, + "lambda_div_used": 0.5, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0001, + "reward": -0.3727427292615175, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.3727427292615175, + "reward_after_std": 0.48480862006545067, + "reward_before_mean": -0.014494793489575386, + "reward_before_std": 0.469564750790596, + "reward_change_max": 0.001546420156955719, + "reward_change_mean": -0.3582479301840067, + "reward_change_min": -0.637864962220192, + "reward_change_std": 0.26254395116120577, + "reward_std": 0.48480862379074097, + "rewards/cosine_scaled_reward": -0.15308072790503502, + "rewards/format_reward": 0.29166667349636555, + "step": 81 + }, + { + "advantage_max": 1.9086092114448547, + "advantage_mean": -1.862645193639878e-08, + "advantage_min": -0.8542907983064651, + "advantage_std": 0.9998050406575203, + "completion_length": 2752.7083892822266, + "epoch": 0.09371428571428571, + "grad_norm": 0.19600576162338257, + "kl": 0.00047707557678222656, + "lambda_div_used": 0.5, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0, + "reward": -0.07845883443951607, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07845883443951607, + "reward_after_std": 0.7290177270770073, + "reward_before_mean": 0.4407756347209215, + "reward_before_std": 0.6415084861218929, + "reward_change_max": 0.0006015673279762268, + "reward_change_mean": -0.5192344691604376, + "reward_change_min": -0.8588821068406105, + "reward_change_std": 0.34626587107777596, + "reward_std": 0.7290177717804909, + "rewards/cosine_scaled_reward": 0.03288781363517046, + "rewards/format_reward": 0.3750000037252903, + "step": 82 + }, + { + "advantage_max": 1.8362404704093933, + "advantage_mean": 2.669791410170319e-08, + "advantage_min": -0.8356914222240448, + "advantage_std": 0.9998195692896843, + "completion_length": 2632.9375076293945, + "epoch": 0.09485714285714286, + "grad_norm": 0.1994089037179947, + "kl": 0.0006115809082984924, + "lambda_div_used": 0.5, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "reward": -0.14021942391991615, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14021942391991615, + "reward_after_std": 0.721308272331953, + "reward_before_mean": 0.3346644751727581, + "reward_before_std": 0.710454810410738, + "reward_change_max": 0.0016935467720031738, + "reward_change_mean": -0.4748838823288679, + "reward_change_min": -0.9975110255181789, + "reward_change_std": 0.3862521070986986, + "reward_std": 0.7213082872331142, + "rewards/cosine_scaled_reward": -0.04100109916180372, + "rewards/format_reward": 0.41666667349636555, + "step": 83 + }, + { + "advantage_max": 1.8866354674100876, + "advantage_mean": 1.6763806787167823e-08, + "advantage_min": -0.8430695235729218, + "advantage_std": 0.9998267069458961, + "completion_length": 2886.812530517578, + "epoch": 0.096, + "grad_norm": 0.1975167691707611, + "kl": 0.00033015012741088867, + "lambda_div_used": 0.5, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0, + "reward": -0.05239881947636604, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05239881947636604, + "reward_after_std": 0.8544594086706638, + "reward_before_mean": 0.45468202233314514, + "reward_before_std": 0.8344392105937004, + "reward_change_max": 0.00045930594205856323, + "reward_change_mean": -0.5070808418095112, + "reward_change_min": -1.0089579410851002, + "reward_change_std": 0.39830892719328403, + "reward_std": 0.8544594198465347, + "rewards/cosine_scaled_reward": -0.0018256474286317825, + "rewards/format_reward": 0.45833334885537624, + "step": 84 + }, + { + "advantage_max": 1.8973801732063293, + "advantage_mean": 2.1109978542988017e-08, + "advantage_min": -0.8326839506626129, + "advantage_std": 0.9998261779546738, + "completion_length": 3018.3541870117188, + "epoch": 0.09714285714285714, + "grad_norm": 0.15980154275894165, + "kl": 0.0002526193857192993, + "lambda_div_used": 0.5, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "reward": -0.1795702837407589, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1795702837407589, + "reward_after_std": 0.8903343379497528, + "reward_before_mean": 0.20754530653357506, + "reward_before_std": 0.8969676159322262, + "reward_change_max": 8.164346218109131e-05, + "reward_change_mean": -0.3871155809611082, + "reward_change_min": -0.9163610003888607, + "reward_change_std": 0.3585868049412966, + "reward_std": 0.8903343491256237, + "rewards/cosine_scaled_reward": -0.0941440174356103, + "rewards/format_reward": 0.3958333469927311, + "step": 85 + }, + { + "advantage_max": 1.9123211801052094, + "advantage_mean": 4.1599076183729267e-08, + "advantage_min": -0.7952956557273865, + "advantage_std": 0.9997774809598923, + "completion_length": 2792.125030517578, + "epoch": 0.09828571428571428, + "grad_norm": 0.20445193350315094, + "kl": 0.0007953643798828125, + "lambda_div_used": 0.5, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0, + "reward": -0.12261645495891571, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.12261645495891571, + "reward_after_std": 0.6959991101175547, + "reward_before_mean": 0.3699899148195982, + "reward_before_std": 0.6057371087372303, + "reward_change_max": 0.0005937442183494568, + "reward_change_mean": -0.4926063437014818, + "reward_change_min": -0.8540405631065369, + "reward_change_std": 0.3248249962925911, + "reward_std": 0.6959991175681353, + "rewards/cosine_scaled_reward": -0.023338390979915857, + "rewards/format_reward": 0.4166666679084301, + "step": 86 + }, + { + "advantage_max": 1.8586776107549667, + "advantage_mean": 2.6077032810878364e-08, + "advantage_min": -0.8346791416406631, + "advantage_std": 0.9998296871781349, + "completion_length": 2592.5625762939453, + "epoch": 0.09942857142857142, + "grad_norm": 0.24278458952903748, + "kl": 0.0009903311729431152, + "lambda_div_used": 0.5, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0, + "reward": -0.18643693253397942, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.18643693253397942, + "reward_after_std": 0.7580613270401955, + "reward_before_mean": 0.2400309145450592, + "reward_before_std": 0.7840206138789654, + "reward_change_max": 0.002095058560371399, + "reward_change_mean": -0.4264678508043289, + "reward_change_min": -0.8813573159277439, + "reward_change_std": 0.3639759235084057, + "reward_std": 0.7580613419413567, + "rewards/cosine_scaled_reward": -0.1299845464527607, + "rewards/format_reward": 0.5000000074505806, + "step": 87 + }, + { + "advantage_max": 1.9035822749137878, + "advantage_mean": 9.93410831373609e-09, + "advantage_min": -0.8028503619134426, + "advantage_std": 0.9998452365398407, + "completion_length": 2750.729248046875, + "epoch": 0.10057142857142858, + "grad_norm": 0.22822467982769012, + "kl": 0.0008462667465209961, + "lambda_div_used": 0.5, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0, + "reward": -0.1496799192391336, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1496799192391336, + "reward_after_std": 0.8370900675654411, + "reward_before_mean": 0.27654790971428156, + "reward_before_std": 0.8350924924015999, + "reward_change_max": 0.001572735607624054, + "reward_change_mean": -0.42622782615944743, + "reward_change_min": -0.7948319055140018, + "reward_change_std": 0.33608314860612154, + "reward_std": 0.8370900899171829, + "rewards/cosine_scaled_reward": -0.11172605864703655, + "rewards/format_reward": 0.5000000093132257, + "step": 88 + }, + { + "advantage_max": 1.8878425657749176, + "advantage_mean": 3.476937759927523e-08, + "advantage_min": -0.8848367482423782, + "advantage_std": 0.999799333512783, + "completion_length": 2829.854248046875, + "epoch": 0.10171428571428572, + "grad_norm": 0.1981220543384552, + "kl": 0.0010972023010253906, + "lambda_div_used": 0.5, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "reward": -0.20833131205290556, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.20833131205290556, + "reward_after_std": 0.756609233096242, + "reward_before_mean": 0.1950739398598671, + "reward_before_std": 0.7492302563041449, + "reward_change_max": 0.0, + "reward_change_mean": -0.40340525284409523, + "reward_change_min": -0.8476431369781494, + "reward_change_std": 0.33259575068950653, + "reward_std": 0.7566092498600483, + "rewards/cosine_scaled_reward": -0.100379703566432, + "rewards/format_reward": 0.3958333395421505, + "step": 89 + }, + { + "advantage_max": 1.9099785536527634, + "advantage_mean": 2.359350637082258e-08, + "advantage_min": -0.7695377096533775, + "advantage_std": 0.9997612684965134, + "completion_length": 2356.7916946411133, + "epoch": 0.10285714285714286, + "grad_norm": 0.3218241333961487, + "kl": 0.001089632511138916, + "lambda_div_used": 0.5, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "reward": -0.2230790453031659, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2230790453031659, + "reward_after_std": 0.6019329931586981, + "reward_before_mean": 0.21309048682451248, + "reward_before_std": 0.5367362890392542, + "reward_change_max": 0.0002956688404083252, + "reward_change_mean": -0.43616954190656543, + "reward_change_min": -0.7830835692584515, + "reward_change_std": 0.30072727892547846, + "reward_std": 0.6019330080598593, + "rewards/cosine_scaled_reward": -0.1747047562384978, + "rewards/format_reward": 0.5625, + "step": 90 + }, + { + "advantage_max": 1.8672983944416046, + "advantage_mean": 1.0554989493538613e-08, + "advantage_min": -0.8877752497792244, + "advantage_std": 0.9998028874397278, + "completion_length": 3059.791702270508, + "epoch": 0.104, + "grad_norm": 0.19541896879673004, + "kl": 0.0007580630481243134, + "lambda_div_used": 0.5, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "reward": -0.25269458070397377, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.25269458070397377, + "reward_after_std": 0.7357797585427761, + "reward_before_mean": 0.12885426357388496, + "reward_before_std": 0.7820154465734959, + "reward_change_max": 0.0023800507187843323, + "reward_change_mean": -0.3815488638356328, + "reward_change_min": -0.803812101483345, + "reward_change_std": 0.35552883241325617, + "reward_std": 0.7357797957956791, + "rewards/cosine_scaled_reward": -0.10223953444801737, + "rewards/format_reward": 0.3333333469927311, + "step": 91 + }, + { + "advantage_max": 1.8726870566606522, + "advantage_mean": 2.1730860888524717e-08, + "advantage_min": -0.9580079317092896, + "advantage_std": 0.9998205602169037, + "completion_length": 2739.8541870117188, + "epoch": 0.10514285714285715, + "grad_norm": 0.1873386800289154, + "kl": 0.0021462738513946533, + "lambda_div_used": 0.5, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0001, + "reward": -0.14901877380907536, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.14901877380907536, + "reward_after_std": 0.7104939520359039, + "reward_before_mean": 0.3285065218806267, + "reward_before_std": 0.7255417667329311, + "reward_change_max": 0.001958779990673065, + "reward_change_mean": -0.4775253050029278, + "reward_change_min": -0.9158918745815754, + "reward_change_std": 0.37975638918578625, + "reward_std": 0.7104939669370651, + "rewards/cosine_scaled_reward": -0.08574674651026726, + "rewards/format_reward": 0.5000000149011612, + "step": 92 + }, + { + "advantage_max": 1.8517653942108154, + "advantage_mean": 1.2728075243773063e-07, + "advantage_min": -0.9585303515195847, + "advantage_std": 0.9996733292937279, + "completion_length": 3399.229217529297, + "epoch": 0.10628571428571429, + "grad_norm": 0.20226261019706726, + "kl": 0.0012685954570770264, + "lambda_div_used": 0.5, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0001, + "reward": -0.5985056199133396, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5985056199133396, + "reward_after_std": 0.3514649849385023, + "reward_before_mean": -0.3890782119706273, + "reward_before_std": 0.3508653249591589, + "reward_change_max": 0.0008473768830299377, + "reward_change_mean": -0.2094273860566318, + "reward_change_min": -0.3923211321234703, + "reward_change_std": 0.16505926102399826, + "reward_std": 0.3514649923890829, + "rewards/cosine_scaled_reward": -0.23620576970279217, + "rewards/format_reward": 0.0833333358168602, + "step": 93 + }, + { + "advantage_max": 1.9095520675182343, + "advantage_mean": 5.960464588561365e-08, + "advantage_min": -0.7972255274653435, + "advantage_std": 0.9998055398464203, + "completion_length": 3063.3541870117188, + "epoch": 0.10742857142857143, + "grad_norm": 0.17396514117717743, + "kl": 0.0015499591827392578, + "lambda_div_used": 0.5, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0001, + "reward": -0.24387888237833977, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.24387888237833977, + "reward_after_std": 0.699499323964119, + "reward_before_mean": 0.14552057534456253, + "reward_before_std": 0.6129138586111367, + "reward_change_max": 0.0001614391803741455, + "reward_change_mean": -0.38939943816512823, + "reward_change_min": -0.6290842294692993, + "reward_change_std": 0.2671583485789597, + "reward_std": 0.6994993314146996, + "rewards/cosine_scaled_reward": -0.07307304441928864, + "rewards/format_reward": 0.2916666716337204, + "step": 94 + }, + { + "advantage_max": 1.883233681321144, + "advantage_mean": 5.743155395698807e-09, + "advantage_min": -0.8746126741170883, + "advantage_std": 0.9997252598404884, + "completion_length": 3426.5833740234375, + "epoch": 0.10857142857142857, + "grad_norm": 0.20511451363563538, + "kl": 0.0005182921886444092, + "lambda_div_used": 0.5, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "reward": -0.5034449929371476, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.5034449929371476, + "reward_after_std": 0.5185970328748226, + "reward_before_mean": -0.26567897759377956, + "reward_before_std": 0.5346523299813271, + "reward_change_max": 0.0007946863770484924, + "reward_change_mean": -0.2377660358324647, + "reward_change_min": -0.5584822706878185, + "reward_change_std": 0.2266387245617807, + "reward_std": 0.518597049638629, + "rewards/cosine_scaled_reward": -0.22658948972821236, + "rewards/format_reward": 0.1875000074505806, + "step": 95 + }, + { + "advantage_max": 1.919153854250908, + "advantage_mean": 2.405916721404111e-08, + "advantage_min": -0.8224275931715965, + "advantage_std": 0.9998463094234467, + "completion_length": 2780.291702270508, + "epoch": 0.10971428571428571, + "grad_norm": 0.18453562259674072, + "kl": 0.001372743397951126, + "lambda_div_used": 0.5, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0001, + "reward": -0.05859323777258396, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05859323777258396, + "reward_after_std": 0.9193606749176979, + "reward_before_mean": 0.41439007595181465, + "reward_before_std": 0.8898796737194061, + "reward_change_max": 0.0017773956060409546, + "reward_change_mean": -0.47298329696059227, + "reward_change_min": -0.8093426078557968, + "reward_change_std": 0.3369905035942793, + "reward_std": 0.9193606898188591, + "rewards/cosine_scaled_reward": -0.011554960161447525, + "rewards/format_reward": 0.43750000186264515, + "step": 96 + }, + { + "advantage_max": 1.860081598162651, + "advantage_mean": 5.0912303484196286e-08, + "advantage_min": -0.9762744233012199, + "advantage_std": 0.9997392669320107, + "completion_length": 2987.9375610351562, + "epoch": 0.11085714285714286, + "grad_norm": 0.20188908278942108, + "kl": 0.0014988183975219727, + "lambda_div_used": 0.5, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0001, + "reward": -0.10808135382831097, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10808135382831097, + "reward_after_std": 0.5511326994746923, + "reward_before_mean": 0.44958585873246193, + "reward_before_std": 0.4742696601897478, + "reward_change_max": 0.00031263381242752075, + "reward_change_mean": -0.557667214423418, + "reward_change_min": -0.9108167290687561, + "reward_change_std": 0.3545882785692811, + "reward_std": 0.5511327031999826, + "rewards/cosine_scaled_reward": 0.01645958609879017, + "rewards/format_reward": 0.41666667722165585, + "step": 97 + }, + { + "advantage_max": 1.924290120601654, + "advantage_mean": 2.7318796669284495e-08, + "advantage_min": -0.8337865993380547, + "advantage_std": 0.99977807700634, + "completion_length": 2615.604202270508, + "epoch": 0.112, + "grad_norm": 0.19144760072231293, + "kl": 0.0005843713879585266, + "lambda_div_used": 0.5, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0, + "reward": -0.19257944263517857, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.19257944263517857, + "reward_after_std": 0.6021185424178839, + "reward_before_mean": 0.26773499604314566, + "reward_before_std": 0.48827750235795975, + "reward_change_max": 0.0028692707419395447, + "reward_change_mean": -0.46031447034329176, + "reward_change_min": -0.7352127730846405, + "reward_change_std": 0.28656440041959286, + "reward_std": 0.6021185610443354, + "rewards/cosine_scaled_reward": -0.12654916709288955, + "rewards/format_reward": 0.5208333432674408, + "step": 98 + }, + { + "advantage_max": 1.8842637687921524, + "advantage_mean": -4.967053546245381e-09, + "advantage_min": -0.8146334141492844, + "advantage_std": 0.999771386384964, + "completion_length": 2808.9791717529297, + "epoch": 0.11314285714285714, + "grad_norm": 0.19145610928535461, + "kl": 0.0007163286209106445, + "lambda_div_used": 0.5, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "reward": -0.25064975768327713, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.25064975768327713, + "reward_after_std": 0.558937631547451, + "reward_before_mean": 0.18323423340916634, + "reward_before_std": 0.4840637035667896, + "reward_change_max": 0.000568389892578125, + "reward_change_mean": -0.43388404604047537, + "reward_change_min": -0.7998382151126862, + "reward_change_std": 0.29928822815418243, + "reward_std": 0.5589376464486122, + "rewards/cosine_scaled_reward": -0.0646328553557396, + "rewards/format_reward": 0.3125, + "step": 99 + }, + { + "advantage_max": 1.883774295449257, + "advantage_mean": 3.849466767569254e-08, + "advantage_min": -0.8500948920845985, + "advantage_std": 0.9998322054743767, + "completion_length": 2592.9792098999023, + "epoch": 0.11428571428571428, + "grad_norm": 0.18909044563770294, + "kl": 0.0011529922485351562, + "lambda_div_used": 0.5, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "reward": -0.0791124738752842, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0791124738752842, + "reward_after_std": 0.7873836122453213, + "reward_before_mean": 0.4297027445281856, + "reward_before_std": 0.7936761677265167, + "reward_change_max": 0.000196017324924469, + "reward_change_mean": -0.5088152242824435, + "reward_change_min": -0.9765620827674866, + "reward_change_std": 0.3931038361042738, + "reward_std": 0.7873836234211922, + "rewards/cosine_scaled_reward": -0.035148635506629944, + "rewards/format_reward": 0.5000000093132257, + "step": 100 + }, + { + "advantage_max": 1.8153242319822311, + "advantage_mean": 5.029142047252577e-08, + "advantage_min": -0.938971072435379, + "advantage_std": 0.9997990876436234, + "completion_length": 2627.000045776367, + "epoch": 0.11542857142857142, + "grad_norm": 0.2275507003068924, + "kl": 0.0010591745376586914, + "lambda_div_used": 0.5, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "reward": -0.19237697310745716, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.19237697310745716, + "reward_after_std": 0.6066364720463753, + "reward_before_mean": 0.2787788547575474, + "reward_before_std": 0.5884393192827702, + "reward_change_max": 0.0011661723256111145, + "reward_change_mean": -0.4711558297276497, + "reward_change_min": -0.8935237973928452, + "reward_change_std": 0.3549905549734831, + "reward_std": 0.6066364757716656, + "rewards/cosine_scaled_reward": -0.08977724611759186, + "rewards/format_reward": 0.4583333395421505, + "step": 101 + }, + { + "advantage_max": 1.9111991822719574, + "advantage_mean": 0.0, + "advantage_min": -0.8480004072189331, + "advantage_std": 0.9998131468892097, + "completion_length": 2199.3958892822266, + "epoch": 0.11657142857142858, + "grad_norm": 0.28151506185531616, + "kl": 0.002061605453491211, + "lambda_div_used": 0.5, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0001, + "reward": -0.10260325577110052, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10260325577110052, + "reward_after_std": 0.7042684890329838, + "reward_before_mean": 0.40410364978015423, + "reward_before_std": 0.6136751640588045, + "reward_change_max": 0.00042323023080825806, + "reward_change_mean": -0.506706902757287, + "reward_change_min": -0.8719289116561413, + "reward_change_std": 0.3393041845411062, + "reward_std": 0.7042685002088547, + "rewards/cosine_scaled_reward": -0.14169819233939052, + "rewards/format_reward": 0.687500013038516, + "step": 102 + }, + { + "advantage_max": 1.8461199253797531, + "advantage_mean": 2.359350548264416e-08, + "advantage_min": -0.8268741890788078, + "advantage_std": 0.999856062233448, + "completion_length": 2750.125057220459, + "epoch": 0.11771428571428572, + "grad_norm": 0.274533748626709, + "kl": 0.0018423646688461304, + "lambda_div_used": 0.5, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0001, + "reward": -0.1559738339856267, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.1559738339856267, + "reward_after_std": 0.8337625414133072, + "reward_before_mean": 0.27372268959879875, + "reward_before_std": 0.9095275402069092, + "reward_change_max": 0.0012431740760803223, + "reward_change_mean": -0.42969651333987713, + "reward_change_min": -0.9942761585116386, + "reward_change_std": 0.422477787360549, + "reward_std": 0.8337625414133072, + "rewards/cosine_scaled_reward": -0.09230532869696617, + "rewards/format_reward": 0.4583333432674408, + "step": 103 + }, + { + "advantage_max": 1.8895713835954666, + "advantage_mean": 4.5324367148324995e-08, + "advantage_min": -0.8674125224351883, + "advantage_std": 0.9997276961803436, + "completion_length": 2726.8958435058594, + "epoch": 0.11885714285714286, + "grad_norm": 0.2888650894165039, + "kl": 0.0068280696868896484, + "lambda_div_used": 0.5, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0003, + "reward": -0.2802076867665164, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2802076867665164, + "reward_after_std": 0.45476071164011955, + "reward_before_mean": 0.161182364448905, + "reward_before_std": 0.399234589189291, + "reward_change_max": 0.00044030696153640747, + "reward_change_mean": -0.44139003101736307, + "reward_change_min": -0.7332473956048489, + "reward_change_std": 0.2831938583403826, + "reward_std": 0.45476071909070015, + "rewards/cosine_scaled_reward": -0.1173255043104291, + "rewards/format_reward": 0.39583333395421505, + "step": 104 + }, + { + "advantage_max": 1.8745701760053635, + "advantage_mean": 1.707424779340272e-08, + "advantage_min": -0.8500468209385872, + "advantage_std": 0.9998556524515152, + "completion_length": 2416.8333587646484, + "epoch": 0.12, + "grad_norm": 0.21496868133544922, + "kl": 0.0012444257736206055, + "lambda_div_used": 0.5, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "reward": 0.03997505363076925, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03997505363076925, + "reward_after_std": 0.9438597373664379, + "reward_before_mean": 0.5930667854845524, + "reward_before_std": 0.9317218028008938, + "reward_change_max": 0.0017218366265296936, + "reward_change_mean": -0.5530917001888156, + "reward_change_min": -1.0486153699457645, + "reward_change_std": 0.429962957277894, + "reward_std": 0.9438597410917282, + "rewards/cosine_scaled_reward": 0.036116703413426876, + "rewards/format_reward": 0.5208333414047956, + "step": 105 + }, + { + "advantage_max": 1.9127518236637115, + "advantage_mean": -1.241763913917282e-09, + "advantage_min": -0.7981174066662788, + "advantage_std": 0.9998517706990242, + "completion_length": 2194.812545776367, + "epoch": 0.12114285714285715, + "grad_norm": 0.21088606119155884, + "kl": 0.004343807697296143, + "lambda_div_used": 0.5, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0002, + "reward": 0.23806806560605764, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23806806560605764, + "reward_after_std": 0.8330020643770695, + "reward_before_mean": 0.9928077161312103, + "reward_before_std": 0.6945897105615586, + "reward_change_max": 0.00025169551372528076, + "reward_change_mean": -0.7547396793961525, + "reward_change_min": -1.2395295053720474, + "reward_change_std": 0.4977700933814049, + "reward_std": 0.8330021128058434, + "rewards/cosine_scaled_reward": 0.15265384782105684, + "rewards/format_reward": 0.6875000111758709, + "step": 106 + }, + { + "advantage_max": 1.8897164016962051, + "advantage_mean": 3.973643103449831e-08, + "advantage_min": -0.8593711704015732, + "advantage_std": 0.9998006448149681, + "completion_length": 2883.812515258789, + "epoch": 0.12228571428571429, + "grad_norm": 0.21747052669525146, + "kl": 0.0013761520385742188, + "lambda_div_used": 0.5, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0001, + "reward": -0.13875769823789597, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13875769823789597, + "reward_after_std": 0.6199061535298824, + "reward_before_mean": 0.37559359334409237, + "reward_before_std": 0.5839899033308029, + "reward_change_max": 0.0, + "reward_change_mean": -0.5143513064831495, + "reward_change_min": -0.8711799457669258, + "reward_change_std": 0.36477554962038994, + "reward_std": 0.6199061721563339, + "rewards/cosine_scaled_reward": -0.062203213572502136, + "rewards/format_reward": 0.5000000074505806, + "step": 107 + }, + { + "advantage_max": 1.8762633353471756, + "advantage_mean": 2.483527827834564e-09, + "advantage_min": -0.8940647393465042, + "advantage_std": 0.999830886721611, + "completion_length": 2561.500030517578, + "epoch": 0.12342857142857143, + "grad_norm": 0.23256301879882812, + "kl": 0.00116729736328125, + "lambda_div_used": 0.5, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0, + "reward": 0.07393225142732263, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.07393225142732263, + "reward_after_std": 0.7953523583710194, + "reward_before_mean": 0.7025719881057739, + "reward_before_std": 0.7550234757363796, + "reward_change_max": 0.0, + "reward_change_mean": -0.6286397371441126, + "reward_change_min": -1.0530790612101555, + "reward_change_std": 0.43232874386012554, + "reward_std": 0.7953523695468903, + "rewards/cosine_scaled_reward": 0.059619318693876266, + "rewards/format_reward": 0.5833333507180214, + "step": 108 + }, + { + "advantage_max": 1.9162142127752304, + "advantage_mean": 1.7384688799637615e-08, + "advantage_min": -0.7551117762923241, + "advantage_std": 0.9998111501336098, + "completion_length": 3017.2500762939453, + "epoch": 0.12457142857142857, + "grad_norm": 0.2104494720697403, + "kl": 0.0007964372634887695, + "lambda_div_used": 0.5, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "reward": -0.177821128629148, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.177821128629148, + "reward_after_std": 0.7017932273447514, + "reward_before_mean": 0.26592331333085895, + "reward_before_std": 0.6155474036931992, + "reward_change_max": 0.000747285783290863, + "reward_change_mean": -0.44374443776905537, + "reward_change_min": -0.8128639236092567, + "reward_change_std": 0.3069191947579384, + "reward_std": 0.7017932571470737, + "rewards/cosine_scaled_reward": -0.03370502591133118, + "rewards/format_reward": 0.3333333395421505, + "step": 109 + }, + { + "advantage_max": 1.960287183523178, + "advantage_mean": 1.862645193639878e-08, + "advantage_min": -0.7720007188618183, + "advantage_std": 0.9998722448945045, + "completion_length": 2521.1458740234375, + "epoch": 0.12571428571428572, + "grad_norm": 0.26947009563446045, + "kl": 0.0013470649719238281, + "lambda_div_used": 0.5, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0001, + "reward": -0.05872512166388333, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05872512166388333, + "reward_after_std": 1.0253377817571163, + "reward_before_mean": 0.3726255651563406, + "reward_before_std": 0.9357779249548912, + "reward_change_max": 0.0007335245609283447, + "reward_change_mean": -0.4313506744801998, + "reward_change_min": -0.8192849457263947, + "reward_change_std": 0.31888777762651443, + "reward_std": 1.0253378376364708, + "rewards/cosine_scaled_reward": -0.07410389324650168, + "rewards/format_reward": 0.5208333469927311, + "step": 110 + }, + { + "advantage_max": 1.8416922390460968, + "advantage_mean": 2.483526972962835e-08, + "advantage_min": -0.9220654144883156, + "advantage_std": 0.9998046904802322, + "completion_length": 2686.2083587646484, + "epoch": 0.12685714285714286, + "grad_norm": 0.22409984469413757, + "kl": 0.0019087791442871094, + "lambda_div_used": 0.5, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0001, + "reward": -0.12221940292511135, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": -0.12221940292511135, + "reward_after_std": 0.7193337231874466, + "reward_before_mean": 0.3737189192324877, + "reward_before_std": 0.7402253746986389, + "reward_change_max": 0.0015523731708526611, + "reward_change_mean": -0.4959383327513933, + "reward_change_min": -1.015930712223053, + "reward_change_std": 0.4064535070210695, + "reward_std": 0.7193337418138981, + "rewards/cosine_scaled_reward": -0.04230721411295235, + "rewards/format_reward": 0.4583333544433117, + "step": 111 + }, + { + "advantage_max": 1.9122427105903625, + "advantage_mean": 5.277494635747004e-09, + "advantage_min": -0.794686496257782, + "advantage_std": 0.9998061284422874, + "completion_length": 2904.125045776367, + "epoch": 0.128, + "grad_norm": 0.19855168461799622, + "kl": 0.0010465309023857117, + "lambda_div_used": 0.5, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0, + "reward": -0.17366264760494232, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.17366264760494232, + "reward_after_std": 0.734580010175705, + "reward_before_mean": 0.26212383183883503, + "reward_before_std": 0.6679606847465038, + "reward_change_max": 0.0, + "reward_change_mean": -0.4357864987105131, + "reward_change_min": -0.8571107387542725, + "reward_change_std": 0.3221443220973015, + "reward_std": 0.7345800176262856, + "rewards/cosine_scaled_reward": -0.035604753997176886, + "rewards/format_reward": 0.33333334140479565, + "step": 112 + }, + { + "advantage_max": 1.8980989009141922, + "advantage_mean": 4.967054101356894e-09, + "advantage_min": -0.860293336212635, + "advantage_std": 0.9998377710580826, + "completion_length": 2528.7708740234375, + "epoch": 0.12914285714285714, + "grad_norm": 0.32358023524284363, + "kl": 0.0017886161804199219, + "lambda_div_used": 0.5, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0001, + "reward": -0.22631652595009655, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.22631652595009655, + "reward_after_std": 0.7506244815886021, + "reward_before_mean": 0.1606158297508955, + "reward_before_std": 0.7282139845192432, + "reward_change_max": 0.0006383880972862244, + "reward_change_mean": -0.3869323618710041, + "reward_change_min": -0.7756957449018955, + "reward_change_std": 0.29775483161211014, + "reward_std": 0.7506244964897633, + "rewards/cosine_scaled_reward": -0.15927542932331562, + "rewards/format_reward": 0.4791666753590107, + "step": 113 + }, + { + "advantage_max": 1.9110632240772247, + "advantage_mean": 2.6542694486764162e-08, + "advantage_min": -0.771735567599535, + "advantage_std": 0.9997791200876236, + "completion_length": 2564.1666870117188, + "epoch": 0.13028571428571428, + "grad_norm": 0.20324108004570007, + "kl": 0.0022649765014648438, + "lambda_div_used": 0.5, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0001, + "reward": -0.28347105346620083, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.28347105346620083, + "reward_after_std": 0.6658044643700123, + "reward_before_mean": 0.08999151457101107, + "reward_before_std": 0.6686059813946486, + "reward_change_max": 0.0010268613696098328, + "reward_change_mean": -0.37346256989985704, + "reward_change_min": -0.8506891429424286, + "reward_change_std": 0.32815066166222095, + "reward_std": 0.665804473683238, + "rewards/cosine_scaled_reward": -0.19458758272230625, + "rewards/format_reward": 0.4791666753590107, + "step": 114 + }, + { + "advantage_max": 1.866062507033348, + "advantage_mean": 2.2351741513215728e-08, + "advantage_min": -0.8335398361086845, + "advantage_std": 0.999809741973877, + "completion_length": 2849.583366394043, + "epoch": 0.13142857142857142, + "grad_norm": 0.176808163523674, + "kl": 0.002234935760498047, + "lambda_div_used": 0.5, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0001, + "reward": -0.14792929776012897, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14792929776012897, + "reward_after_std": 0.7658665478229523, + "reward_before_mean": 0.31146786167664686, + "reward_before_std": 0.7670408114790916, + "reward_change_max": 0.0006662756204605103, + "reward_change_mean": -0.45939717441797256, + "reward_change_min": -0.956385251134634, + "reward_change_std": 0.38262984342873096, + "reward_std": 0.7658665888011456, + "rewards/cosine_scaled_reward": -0.052599404007196426, + "rewards/format_reward": 0.4166666679084301, + "step": 115 + }, + { + "advantage_max": 1.8941835165023804, + "advantage_mean": 7.202228013980516e-08, + "advantage_min": -0.8339600563049316, + "advantage_std": 0.999761626124382, + "completion_length": 3221.0625, + "epoch": 0.13257142857142856, + "grad_norm": 0.18061460554599762, + "kl": 0.0020515918731689453, + "lambda_div_used": 0.5, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0001, + "reward": -0.42343843169510365, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.42343843169510365, + "reward_after_std": 0.6135209687054157, + "reward_before_mean": -0.15599404089152813, + "reward_before_std": 0.6033586822450161, + "reward_change_max": 0.0025794655084609985, + "reward_change_mean": -0.2674443628638983, + "reward_change_min": -0.5305989198386669, + "reward_change_std": 0.21838054060935974, + "reward_std": 0.6135209947824478, + "rewards/cosine_scaled_reward": -0.1717470269650221, + "rewards/format_reward": 0.18750000186264515, + "step": 116 + }, + { + "advantage_max": 1.8966728001832962, + "advantage_mean": -1.7384688688615313e-08, + "advantage_min": -0.7671744376420975, + "advantage_std": 0.9998082593083382, + "completion_length": 3019.7291870117188, + "epoch": 0.1337142857142857, + "grad_norm": 0.20431950688362122, + "kl": 0.0031223297119140625, + "lambda_div_used": 0.5, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0001, + "reward": -0.38849915959872305, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.38849915959872305, + "reward_after_std": 0.7110750675201416, + "reward_before_mean": -0.1264194727409631, + "reward_before_std": 0.6984617970883846, + "reward_change_max": 0.0021105334162712097, + "reward_change_mean": -0.26207969430834055, + "reward_change_min": -0.6101348102092743, + "reward_change_std": 0.24454447254538536, + "reward_std": 0.7110750749707222, + "rewards/cosine_scaled_reward": -0.1986264120787382, + "rewards/format_reward": 0.2708333358168602, + "step": 117 + }, + { + "advantage_max": 1.8363988399505615, + "advantage_mean": 2.949188337986186e-08, + "advantage_min": -0.863525852560997, + "advantage_std": 0.999841958284378, + "completion_length": 2949.437515258789, + "epoch": 0.13485714285714287, + "grad_norm": 0.18025439977645874, + "kl": 0.001627206802368164, + "lambda_div_used": 0.5, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0001, + "reward": 0.07480202615261078, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07480202615261078, + "reward_after_std": 0.920928593724966, + "reward_before_mean": 0.6708409860730171, + "reward_before_std": 0.9098874349147081, + "reward_change_max": 0.0014739260077476501, + "reward_change_mean": -0.5960389524698257, + "reward_change_min": -1.1508560217916965, + "reward_change_std": 0.481814730912447, + "reward_std": 0.920928630977869, + "rewards/cosine_scaled_reward": 0.12708715675398707, + "rewards/format_reward": 0.41666666977107525, + "step": 118 + }, + { + "advantage_max": 1.8390760868787766, + "advantage_mean": -6.208820124697922e-10, + "advantage_min": -0.949719063937664, + "advantage_std": 0.9998083710670471, + "completion_length": 2325.479202270508, + "epoch": 0.136, + "grad_norm": 0.2877277135848999, + "kl": 0.0034942626953125, + "lambda_div_used": 0.5, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0001, + "reward": -0.03717762790620327, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03717762790620327, + "reward_after_std": 0.7488083764910698, + "reward_before_mean": 0.5156388609902933, + "reward_before_std": 0.7618674747645855, + "reward_change_max": 0.0008146613836288452, + "reward_change_mean": -0.5528164934366941, + "reward_change_min": -1.0835012346506119, + "reward_change_std": 0.4235940780490637, + "reward_std": 0.7488083802163601, + "rewards/cosine_scaled_reward": -0.013013919815421104, + "rewards/format_reward": 0.541666679084301, + "step": 119 + }, + { + "advantage_max": 1.9311043322086334, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.7216290608048439, + "advantage_std": 0.9998380541801453, + "completion_length": 2253.5417137145996, + "epoch": 0.13714285714285715, + "grad_norm": 0.22529254853725433, + "kl": 0.0031099319458007812, + "lambda_div_used": 0.5, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0001, + "reward": -0.043658461421728134, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.043658461421728134, + "reward_after_std": 0.8314470946788788, + "reward_before_mean": 0.4670662308926694, + "reward_before_std": 0.7415252029895782, + "reward_change_max": 0.0007584840059280396, + "reward_change_mean": -0.5107246488332748, + "reward_change_min": -0.9488695897161961, + "reward_change_std": 0.36425756290555, + "reward_std": 0.8314471282064915, + "rewards/cosine_scaled_reward": -0.058133574202656746, + "rewards/format_reward": 0.5833333414047956, + "step": 120 + }, + { + "advantage_max": 1.9516853392124176, + "advantage_mean": -1.4280279514444771e-08, + "advantage_min": -0.6656368263065815, + "advantage_std": 0.9998772144317627, + "completion_length": 1376.1250305175781, + "epoch": 0.1382857142857143, + "grad_norm": 0.2774488627910614, + "kl": 0.002851724624633789, + "lambda_div_used": 0.5, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0001, + "reward": 0.26350086531601846, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26350086531601846, + "reward_after_std": 0.9549853205680847, + "reward_before_mean": 0.9884882047772408, + "reward_before_std": 0.7882588542997837, + "reward_change_max": 0.0, + "reward_change_mean": -0.7249873355031013, + "reward_change_min": -1.2277339547872543, + "reward_change_std": 0.44961426220834255, + "reward_std": 0.9549853503704071, + "rewards/cosine_scaled_reward": 0.05674408434424549, + "rewards/format_reward": 0.8750000111758709, + "step": 121 + }, + { + "advantage_max": 1.8947256356477737, + "advantage_mean": -9.93410742555767e-09, + "advantage_min": -0.781624436378479, + "advantage_std": 0.99982900172472, + "completion_length": 2792.458335876465, + "epoch": 0.13942857142857143, + "grad_norm": 0.25618037581443787, + "kl": 0.001964092254638672, + "lambda_div_used": 0.5, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0001, + "reward": -0.08971790038049221, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.08971790038049221, + "reward_after_std": 0.8048688769340515, + "reward_before_mean": 0.3975519463419914, + "reward_before_std": 0.7465364970266819, + "reward_change_max": 0.0019213780760765076, + "reward_change_mean": -0.4872698709368706, + "reward_change_min": -0.9815921522676945, + "reward_change_std": 0.37444521114230156, + "reward_std": 0.8048688843846321, + "rewards/cosine_scaled_reward": -0.019974021706730127, + "rewards/format_reward": 0.4375, + "step": 122 + }, + { + "advantage_max": 1.8762762248516083, + "advantage_mean": -6.208817127095756e-09, + "advantage_min": -0.8266568705439568, + "advantage_std": 0.9997798949480057, + "completion_length": 2543.937515258789, + "epoch": 0.14057142857142857, + "grad_norm": 0.1804003268480301, + "kl": 0.0017232894897460938, + "lambda_div_used": 0.5, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0001, + "reward": -0.2562871566042304, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2562871566042304, + "reward_after_std": 0.6744464114308357, + "reward_before_mean": 0.13696255098329857, + "reward_before_std": 0.6849313788115978, + "reward_change_max": 0.0006846264004707336, + "reward_change_mean": -0.39324971940368414, + "reward_change_min": -0.8639900349080563, + "reward_change_std": 0.3413631683215499, + "reward_std": 0.6744464132934809, + "rewards/cosine_scaled_reward": -0.17110206559300423, + "rewards/format_reward": 0.47916667349636555, + "step": 123 + }, + { + "advantage_max": 1.9491935968399048, + "advantage_mean": 1.8626452713554897e-08, + "advantage_min": -0.7547185383737087, + "advantage_std": 0.9998470917344093, + "completion_length": 2096.875030517578, + "epoch": 0.1417142857142857, + "grad_norm": 0.24292317032814026, + "kl": 0.009019851684570312, + "lambda_div_used": 0.5, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0004, + "reward": 0.0013284431770443916, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0013284431770443916, + "reward_after_std": 0.912301491945982, + "reward_before_mean": 0.5245756283402443, + "reward_before_std": 0.841710695065558, + "reward_change_max": 0.0007164850831031799, + "reward_change_mean": -0.5232471814379096, + "reward_change_min": -0.9632034972310066, + "reward_change_std": 0.37779437424615026, + "reward_std": 0.9123015441000462, + "rewards/cosine_scaled_reward": -0.08146218955516815, + "rewards/format_reward": 0.687500013038516, + "step": 124 + }, + { + "advantage_max": 1.9068433791399002, + "advantage_mean": 5.285255755271834e-08, + "advantage_min": -0.7247280701994896, + "advantage_std": 0.999810703098774, + "completion_length": 2890.5208587646484, + "epoch": 0.14285714285714285, + "grad_norm": 0.17092150449752808, + "kl": 0.001659393310546875, + "lambda_div_used": 0.5, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0001, + "reward": -0.09609029325656593, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09609029325656593, + "reward_after_std": 0.7903048172593117, + "reward_before_mean": 0.38883402571082115, + "reward_before_std": 0.7232473976910114, + "reward_change_max": 0.0020884498953819275, + "reward_change_mean": -0.48492429591715336, + "reward_change_min": -0.9192905463278294, + "reward_change_std": 0.3599092774093151, + "reward_std": 0.790304858237505, + "rewards/cosine_scaled_reward": 0.02775033819489181, + "rewards/format_reward": 0.3333333358168602, + "step": 125 + }, + { + "advantage_max": 1.9119371473789215, + "advantage_mean": -1.8626451603331873e-08, + "advantage_min": -0.8382847681641579, + "advantage_std": 0.9998529180884361, + "completion_length": 2823.1458740234375, + "epoch": 0.144, + "grad_norm": 0.18794958293437958, + "kl": 0.001481771469116211, + "lambda_div_used": 0.5, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0001, + "reward": -0.1438035280443728, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1438035280443728, + "reward_after_std": 0.842867836356163, + "reward_before_mean": 0.2903837040066719, + "reward_before_std": 0.84921033680439, + "reward_change_max": 0.0, + "reward_change_mean": -0.4341872353106737, + "reward_change_min": -0.8519404716789722, + "reward_change_std": 0.36564615555107594, + "reward_std": 0.8428678773343563, + "rewards/cosine_scaled_reward": -0.0839748103171587, + "rewards/format_reward": 0.4583333432674408, + "step": 126 + }, + { + "advantage_max": 1.8188635557889938, + "advantage_mean": 3.973643153409867e-08, + "advantage_min": -0.9549058154225349, + "advantage_std": 0.9997655674815178, + "completion_length": 2931.812530517578, + "epoch": 0.14514285714285713, + "grad_norm": 0.17654724419116974, + "kl": 0.0027322769165039062, + "lambda_div_used": 0.5, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0001, + "reward": -0.4510874133557081, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4510874133557081, + "reward_after_std": 0.4549293704330921, + "reward_before_mean": -0.14654914196580648, + "reward_before_std": 0.4743167757987976, + "reward_change_max": 0.002840563654899597, + "reward_change_mean": -0.30453827790915966, + "reward_change_min": -0.6247300133109093, + "reward_change_std": 0.25821792520582676, + "reward_std": 0.4549293927848339, + "rewards/cosine_scaled_reward": -0.26077457517385483, + "rewards/format_reward": 0.37500000931322575, + "step": 127 + }, + { + "advantage_max": 1.844459444284439, + "advantage_mean": 4.097819439330408e-08, + "advantage_min": -1.0102382525801659, + "advantage_std": 0.9997452944517136, + "completion_length": 2836.083366394043, + "epoch": 0.1462857142857143, + "grad_norm": 0.17293789982795715, + "kl": 0.003336310386657715, + "lambda_div_used": 0.5, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0001, + "reward": 0.02156132459640503, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.02156132459640503, + "reward_after_std": 0.7180147506296635, + "reward_before_mean": 0.6425447445362806, + "reward_before_std": 0.6847621034830809, + "reward_change_max": 0.0016322359442710876, + "reward_change_mean": -0.6209834087640047, + "reward_change_min": -1.057206965982914, + "reward_change_std": 0.44490036740899086, + "reward_std": 0.7180147618055344, + "rewards/cosine_scaled_reward": 0.10252236761152744, + "rewards/format_reward": 0.43750000558793545, + "step": 128 + }, + { + "advantage_max": 1.8426750153303146, + "advantage_mean": 2.7318795670083773e-08, + "advantage_min": -0.8593291789293289, + "advantage_std": 0.99974674731493, + "completion_length": 3238.3125, + "epoch": 0.14742857142857144, + "grad_norm": 0.17253991961479187, + "kl": 0.002398967742919922, + "lambda_div_used": 0.5, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0001, + "reward": -0.35933883488178253, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.35933883488178253, + "reward_after_std": 0.5568939186632633, + "reward_before_mean": -0.006960507482290268, + "reward_before_std": 0.5875996574759483, + "reward_change_max": 0.000954747200012207, + "reward_change_mean": -0.3523783441632986, + "reward_change_min": -0.7693795971572399, + "reward_change_std": 0.30717823654413223, + "reward_std": 0.5568939335644245, + "rewards/cosine_scaled_reward": -0.11806358769536018, + "rewards/format_reward": 0.22916666977107525, + "step": 129 + }, + { + "advantage_max": 1.8513092994689941, + "advantage_mean": 6.270905628102952e-08, + "advantage_min": -0.8922067731618881, + "advantage_std": 0.9997543543577194, + "completion_length": 2752.5625534057617, + "epoch": 0.14857142857142858, + "grad_norm": 0.17327053844928741, + "kl": 0.0019626617431640625, + "lambda_div_used": 0.5, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0001, + "reward": -0.37150960601866245, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.37150960601866245, + "reward_after_std": 0.5050280168652534, + "reward_before_mean": -0.019790776073932648, + "reward_before_std": 0.49720101431012154, + "reward_change_max": 0.0008353963494300842, + "reward_change_mean": -0.35171885043382645, + "reward_change_min": -0.6718562357127666, + "reward_change_std": 0.2648839596658945, + "reward_std": 0.5050280280411243, + "rewards/cosine_scaled_reward": -0.18697872385382652, + "rewards/format_reward": 0.35416667722165585, + "step": 130 + }, + { + "advantage_max": 1.838008999824524, + "advantage_mean": 2.1265198602016255e-08, + "advantage_min": -0.9664673283696175, + "advantage_std": 0.9998275339603424, + "completion_length": 2747.7083740234375, + "epoch": 0.14971428571428572, + "grad_norm": 0.20217643678188324, + "kl": 0.0033931732177734375, + "lambda_div_used": 0.5, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0001, + "reward": 0.06368017196655273, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06368017196655273, + "reward_after_std": 0.761361688375473, + "reward_before_mean": 0.7030438333749771, + "reward_before_std": 0.7445781577844173, + "reward_change_max": 0.00028298795223236084, + "reward_change_mean": -0.639363644644618, + "reward_change_min": -1.077939011156559, + "reward_change_std": 0.46623699367046356, + "reward_std": 0.7613617070019245, + "rewards/cosine_scaled_reward": 0.10152191109955311, + "rewards/format_reward": 0.5000000167638063, + "step": 131 + }, + { + "advantage_max": 1.901751920580864, + "advantage_mean": 1.5832484268063496e-08, + "advantage_min": -0.7725710570812225, + "advantage_std": 0.999824121594429, + "completion_length": 2550.3333435058594, + "epoch": 0.15085714285714286, + "grad_norm": 0.18242870271205902, + "kl": 0.0016298294067382812, + "lambda_div_used": 0.5, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0001, + "reward": -0.11272692680358887, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11272692680358887, + "reward_after_std": 0.8790729530155659, + "reward_before_mean": 0.3309657648205757, + "reward_before_std": 0.8492833152413368, + "reward_change_max": 0.0006166920065879822, + "reward_change_mean": -0.44369266368448734, + "reward_change_min": -0.9826720170676708, + "reward_change_std": 0.36282812524586916, + "reward_std": 0.8790729679167271, + "rewards/cosine_scaled_reward": -0.05326713342219591, + "rewards/format_reward": 0.43750000558793545, + "step": 132 + }, + { + "advantage_max": 1.9136138558387756, + "advantage_mean": 2.4835269396561444e-08, + "advantage_min": -0.8313917182385921, + "advantage_std": 0.9997948184609413, + "completion_length": 2903.9583740234375, + "epoch": 0.152, + "grad_norm": 0.23344391584396362, + "kl": 0.0031654834747314453, + "lambda_div_used": 0.5, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0001, + "reward": -0.3133470695465803, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.3133470695465803, + "reward_after_std": 0.6490304917097092, + "reward_before_mean": 0.032596178352832794, + "reward_before_std": 0.5889531373977661, + "reward_change_max": 0.0016025900840759277, + "reward_change_mean": -0.3459432474337518, + "reward_change_min": -0.5833385325968266, + "reward_change_std": 0.23549414426088333, + "reward_std": 0.6490305289626122, + "rewards/cosine_scaled_reward": -0.16078524757176638, + "rewards/format_reward": 0.35416666977107525, + "step": 133 + }, + { + "advantage_max": 1.9435906410217285, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.6701120026409626, + "advantage_std": 0.9998712614178658, + "completion_length": 2339.979202270508, + "epoch": 0.15314285714285714, + "grad_norm": 0.22259725630283356, + "kl": 0.004046916961669922, + "lambda_div_used": 0.5, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0002, + "reward": 0.10023209895007312, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10023209895007312, + "reward_after_std": 1.0333988890051842, + "reward_before_mean": 0.6736355237662792, + "reward_before_std": 0.9573747776448727, + "reward_change_max": 4.84660267829895e-05, + "reward_change_mean": -0.5734034404158592, + "reward_change_min": -1.0696075037121773, + "reward_change_std": 0.4389838185161352, + "reward_std": 1.033398911356926, + "rewards/cosine_scaled_reward": 0.024317767238244414, + "rewards/format_reward": 0.6250000037252903, + "step": 134 + }, + { + "advantage_max": 1.9079030454158783, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7340277917683125, + "advantage_std": 0.9998625218868256, + "completion_length": 1714.1250381469727, + "epoch": 0.15428571428571428, + "grad_norm": 0.2778604030609131, + "kl": 0.003810882568359375, + "lambda_div_used": 0.5, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0002, + "reward": 0.16280546970665455, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16280546970665455, + "reward_after_std": 0.857132826000452, + "reward_before_mean": 0.8465496301651001, + "reward_before_std": 0.7335850484669209, + "reward_change_max": 0.0013803690671920776, + "reward_change_mean": -0.6837441800162196, + "reward_change_min": -1.207035694271326, + "reward_change_std": 0.4760168734937906, + "reward_std": 0.8571328409016132, + "rewards/cosine_scaled_reward": 0.08994148019701242, + "rewards/format_reward": 0.6666666734963655, + "step": 135 + }, + { + "advantage_max": 1.9024064391851425, + "advantage_mean": 2.483526828633842e-09, + "advantage_min": -0.8795400336384773, + "advantage_std": 0.9998568445444107, + "completion_length": 2289.0208740234375, + "epoch": 0.15542857142857142, + "grad_norm": 0.21803708374500275, + "kl": 0.003295421600341797, + "lambda_div_used": 0.5, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0001, + "reward": -0.003703461028635502, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.003703461028635502, + "reward_after_std": 0.86224514991045, + "reward_before_mean": 0.5360340485349298, + "reward_before_std": 0.8398735448718071, + "reward_change_max": 0.0023953020572662354, + "reward_change_mean": -0.5397375160828233, + "reward_change_min": -1.0290844030678272, + "reward_change_std": 0.4134146720170975, + "reward_std": 0.8622451946139336, + "rewards/cosine_scaled_reward": -0.013232994824647903, + "rewards/format_reward": 0.5625000093132257, + "step": 136 + }, + { + "advantage_max": 1.8921066671609879, + "advantage_mean": 8.568168108347152e-08, + "advantage_min": -0.9159591495990753, + "advantage_std": 0.9996925368905067, + "completion_length": 2726.250030517578, + "epoch": 0.15657142857142858, + "grad_norm": 0.15787829458713531, + "kl": 0.0028100013732910156, + "lambda_div_used": 0.5, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0001, + "reward": -0.45353410951793194, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.45353410951793194, + "reward_after_std": 0.40450212359428406, + "reward_before_mean": -0.14005180727690458, + "reward_before_std": 0.3673751577734947, + "reward_change_max": 0.0007824301719665527, + "reward_change_mean": -0.31348231015726924, + "reward_change_min": -0.5197167657315731, + "reward_change_std": 0.21080828132107854, + "reward_std": 0.40450213477015495, + "rewards/cosine_scaled_reward": -0.24710923619568348, + "rewards/format_reward": 0.3541666679084301, + "step": 137 + }, + { + "advantage_max": 1.9471510499715805, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.7620296776294708, + "advantage_std": 0.999866396188736, + "completion_length": 2125.145866394043, + "epoch": 0.15771428571428572, + "grad_norm": 0.20413421094417572, + "kl": 0.0021791458129882812, + "lambda_div_used": 0.5, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0001, + "reward": 0.08020066749304533, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08020066749304533, + "reward_after_std": 0.9415929988026619, + "reward_before_mean": 0.6609942256473005, + "reward_before_std": 0.8445362187922001, + "reward_change_max": 0.0, + "reward_change_mean": -0.5807935297489166, + "reward_change_min": -1.122881568968296, + "reward_change_std": 0.4123641811311245, + "reward_std": 0.9415930137038231, + "rewards/cosine_scaled_reward": -0.02366957487538457, + "rewards/format_reward": 0.7083333432674408, + "step": 138 + }, + { + "advantage_max": 1.8990636467933655, + "advantage_mean": 3.4148494587604716e-08, + "advantage_min": -0.7599098831415176, + "advantage_std": 0.9998147487640381, + "completion_length": 2946.229232788086, + "epoch": 0.15885714285714286, + "grad_norm": 0.17725327610969543, + "kl": 0.0037374496459960938, + "lambda_div_used": 0.5, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0001, + "reward": -0.23691413225606084, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23691413225606084, + "reward_after_std": 0.8540695495903492, + "reward_before_mean": 0.11466125026345253, + "reward_before_std": 0.8710706867277622, + "reward_change_max": 0.0020438656210899353, + "reward_change_mean": -0.351575399748981, + "reward_change_min": -0.9406367465853691, + "reward_change_std": 0.35900538228452206, + "reward_std": 0.8540695644915104, + "rewards/cosine_scaled_reward": -0.16141936974599957, + "rewards/format_reward": 0.4375000037252903, + "step": 139 + }, + { + "advantage_max": 1.9330047219991684, + "advantage_mean": 1.0554989604560916e-08, + "advantage_min": -0.7634199261665344, + "advantage_std": 0.9998235404491425, + "completion_length": 2628.354202270508, + "epoch": 0.16, + "grad_norm": 0.2571854889392853, + "kl": 0.005505561828613281, + "lambda_div_used": 0.5, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0002, + "reward": -0.185523915104568, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.185523915104568, + "reward_after_std": 0.6811343766748905, + "reward_before_mean": 0.2555219102650881, + "reward_before_std": 0.5780453644692898, + "reward_change_max": 0.0009342730045318604, + "reward_change_mean": -0.4410457955673337, + "reward_change_min": -0.8106644526124, + "reward_change_std": 0.29249521903693676, + "reward_std": 0.6811344102025032, + "rewards/cosine_scaled_reward": -0.10140572674572468, + "rewards/format_reward": 0.45833334513008595, + "step": 140 + }, + { + "advantage_max": 1.9670456051826477, + "advantage_mean": 2.3593505704688766e-08, + "advantage_min": -0.6404533721506596, + "advantage_std": 0.9998448416590691, + "completion_length": 2478.6041870117188, + "epoch": 0.16114285714285714, + "grad_norm": 0.20180006325244904, + "kl": 0.0043582916259765625, + "lambda_div_used": 0.5, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0002, + "reward": -0.25295988253492396, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.25295988253492396, + "reward_after_std": 0.8347440958023071, + "reward_before_mean": 0.07825877517461777, + "reward_before_std": 0.7710718885064125, + "reward_change_max": 0.0029845386743545532, + "reward_change_mean": -0.33121864404529333, + "reward_change_min": -0.6887594200670719, + "reward_change_std": 0.26015863846987486, + "reward_std": 0.8347441107034683, + "rewards/cosine_scaled_reward": -0.2212872877717018, + "rewards/format_reward": 0.5208333432674408, + "step": 141 + }, + { + "advantage_max": 1.9127947390079498, + "advantage_mean": 2.4214387661647407e-08, + "advantage_min": -0.8311641737818718, + "advantage_std": 0.9998164772987366, + "completion_length": 2687.9375610351562, + "epoch": 0.16228571428571428, + "grad_norm": 0.17241254448890686, + "kl": 0.0032384395599365234, + "lambda_div_used": 0.5, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0001, + "reward": -0.052776604890823364, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.052776604890823364, + "reward_after_std": 0.8440326936542988, + "reward_before_mean": 0.4517287611961365, + "reward_before_std": 0.7939337026327848, + "reward_change_max": 0.0012746453285217285, + "reward_change_mean": -0.5045053567737341, + "reward_change_min": -0.921326007694006, + "reward_change_std": 0.3809546297416091, + "reward_std": 0.8440327122807503, + "rewards/cosine_scaled_reward": -0.05538563430309296, + "rewards/format_reward": 0.5625000055879354, + "step": 142 + }, + { + "advantage_max": 1.8830101788043976, + "advantage_mean": 1.4435500572673732e-08, + "advantage_min": -0.869749516248703, + "advantage_std": 0.9998139664530754, + "completion_length": 2302.7083740234375, + "epoch": 0.16342857142857142, + "grad_norm": 0.19639165699481964, + "kl": 0.0040073394775390625, + "lambda_div_used": 0.5, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0002, + "reward": -0.1497055273503065, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1497055273503065, + "reward_after_std": 0.6513480730354786, + "reward_before_mean": 0.34076992981135845, + "reward_before_std": 0.6215660385787487, + "reward_change_max": 0.0, + "reward_change_mean": -0.4904754748567939, + "reward_change_min": -0.8741833716630936, + "reward_change_std": 0.34840161446481943, + "reward_std": 0.6513480953872204, + "rewards/cosine_scaled_reward": -0.1421150453388691, + "rewards/format_reward": 0.6250000223517418, + "step": 143 + }, + { + "advantage_max": 1.9165276736021042, + "advantage_mean": -1.0865431776529988e-08, + "advantage_min": -0.8160995990037918, + "advantage_std": 0.999792642891407, + "completion_length": 2817.812530517578, + "epoch": 0.16457142857142856, + "grad_norm": 0.18998044729232788, + "kl": 0.0037641525268554688, + "lambda_div_used": 0.5, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0002, + "reward": -0.19920555595308542, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.19920555595308542, + "reward_after_std": 0.7285573910921812, + "reward_before_mean": 0.22228339943103492, + "reward_before_std": 0.6816221624612808, + "reward_change_max": 0.0012655630707740784, + "reward_change_mean": -0.4214889407157898, + "reward_change_min": -0.9015527628362179, + "reward_change_std": 0.34488642401993275, + "reward_std": 0.7285573966801167, + "rewards/cosine_scaled_reward": -0.06594165321439505, + "rewards/format_reward": 0.3541666716337204, + "step": 144 + }, + { + "advantage_max": 1.9259110987186432, + "advantage_mean": -9.313230187046884e-10, + "advantage_min": -0.7588883340358734, + "advantage_std": 0.9998414367437363, + "completion_length": 2125.31254196167, + "epoch": 0.1657142857142857, + "grad_norm": 0.2980290949344635, + "kl": 0.0061130523681640625, + "lambda_div_used": 0.5, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0002, + "reward": 0.04788858536630869, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04788858536630869, + "reward_after_std": 0.7997053451836109, + "reward_before_mean": 0.649374682456255, + "reward_before_std": 0.6824689619243145, + "reward_change_max": 0.0015939399600028992, + "reward_change_mean": -0.601486106403172, + "reward_change_min": -1.022288154810667, + "reward_change_std": 0.3961034547537565, + "reward_std": 0.7997053638100624, + "rewards/cosine_scaled_reward": 0.0017706537619233131, + "rewards/format_reward": 0.6458333395421505, + "step": 145 + }, + { + "advantage_max": 1.8907168805599213, + "advantage_mean": 1.459072107579118e-08, + "advantage_min": -0.8794617429375648, + "advantage_std": 0.9998200982809067, + "completion_length": 1808.7708740234375, + "epoch": 0.16685714285714287, + "grad_norm": 0.2139863818883896, + "kl": 0.0027594566345214844, + "lambda_div_used": 0.5, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0001, + "reward": -0.08791144005954266, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08791144005954266, + "reward_after_std": 0.6702312454581261, + "reward_before_mean": 0.4437525011599064, + "reward_before_std": 0.6040500663220882, + "reward_change_max": 1.3709068298339844e-05, + "reward_change_mean": -0.5316639486700296, + "reward_change_min": -0.9739903621375561, + "reward_change_std": 0.35773606039583683, + "reward_std": 0.670231256633997, + "rewards/cosine_scaled_reward": -0.16354042233433574, + "rewards/format_reward": 0.7708333395421505, + "step": 146 + }, + { + "advantage_max": 1.939362108707428, + "advantage_mean": 4.811833542728294e-09, + "advantage_min": -0.8192641139030457, + "advantage_std": 0.9998601749539375, + "completion_length": 1971.7500534057617, + "epoch": 0.168, + "grad_norm": 0.2149391919374466, + "kl": 0.004222869873046875, + "lambda_div_used": 0.5, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0002, + "reward": 0.08826066565234214, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08826066565234214, + "reward_after_std": 0.9593193866312504, + "reward_before_mean": 0.6639640498906374, + "reward_before_std": 0.8700052294880152, + "reward_change_max": 0.0, + "reward_change_mean": -0.5757033489644527, + "reward_change_min": -0.9369436614215374, + "reward_change_std": 0.37583223544061184, + "reward_std": 0.9593194648623466, + "rewards/cosine_scaled_reward": -0.0013513155281543732, + "rewards/format_reward": 0.6666666772216558, + "step": 147 + }, + { + "advantage_max": 1.8952298015356064, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.8259298205375671, + "advantage_std": 0.9998547807335854, + "completion_length": 1961.854232788086, + "epoch": 0.16914285714285715, + "grad_norm": 0.2422906756401062, + "kl": 0.0032510757446289062, + "lambda_div_used": 0.5, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0001, + "reward": 0.02922473382204771, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.02922473382204771, + "reward_after_std": 0.7920557893812656, + "reward_before_mean": 0.6183740254491568, + "reward_before_std": 0.7007161788642406, + "reward_change_max": 0.0, + "reward_change_mean": -0.589149309322238, + "reward_change_min": -1.0229771696031094, + "reward_change_std": 0.3872289936989546, + "reward_std": 0.7920558117330074, + "rewards/cosine_scaled_reward": -0.04497966240160167, + "rewards/format_reward": 0.7083333414047956, + "step": 148 + }, + { + "advantage_max": 1.9160983711481094, + "advantage_mean": 3.973643092347601e-08, + "advantage_min": -0.8291295692324638, + "advantage_std": 0.999817244708538, + "completion_length": 2609.937545776367, + "epoch": 0.1702857142857143, + "grad_norm": 0.19904126226902008, + "kl": 0.003222942352294922, + "lambda_div_used": 0.5, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0001, + "reward": -0.07225888641551137, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07225888641551137, + "reward_after_std": 0.8874395098537207, + "reward_before_mean": 0.40166381001472473, + "reward_before_std": 0.848232576623559, + "reward_change_max": 7.143616676330566e-05, + "reward_change_mean": -0.47392270155251026, + "reward_change_min": -0.8683690465986729, + "reward_change_std": 0.36992147751152515, + "reward_std": 0.8874395303428173, + "rewards/cosine_scaled_reward": -0.0804181108251214, + "rewards/format_reward": 0.5625000074505806, + "step": 149 + }, + { + "advantage_max": 1.851120799779892, + "advantage_mean": 2.6387473095468295e-08, + "advantage_min": -0.9206305295228958, + "advantage_std": 0.9998620450496674, + "completion_length": 2259.666702270508, + "epoch": 0.17142857142857143, + "grad_norm": 0.235165536403656, + "kl": 0.0041065216064453125, + "lambda_div_used": 0.5, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0002, + "reward": 0.12295411620289087, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12295411620289087, + "reward_after_std": 0.9792751409113407, + "reward_before_mean": 0.7382536884397268, + "reward_before_std": 0.9881106838583946, + "reward_change_max": 0.0017604902386665344, + "reward_change_mean": -0.6152995843440294, + "reward_change_min": -1.2774429582059383, + "reward_change_std": 0.5003240220248699, + "reward_std": 0.9792751893401146, + "rewards/cosine_scaled_reward": 0.03579352074302733, + "rewards/format_reward": 0.6666666828095913, + "step": 150 + }, + { + "advantage_max": 1.860343113541603, + "advantage_mean": 3.725290076417309e-09, + "advantage_min": -0.9063407108187675, + "advantage_std": 0.9998343735933304, + "completion_length": 2302.7500534057617, + "epoch": 0.17257142857142857, + "grad_norm": 0.24778403341770172, + "kl": 0.004438161849975586, + "lambda_div_used": 0.5, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0002, + "reward": 0.07336704945191741, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07336704945191741, + "reward_after_std": 0.8510355353355408, + "reward_before_mean": 0.6922492235898972, + "reward_before_std": 0.83472590893507, + "reward_change_max": 0.0, + "reward_change_mean": -0.6188821662217379, + "reward_change_min": -1.1263868436217308, + "reward_change_std": 0.45005420222878456, + "reward_std": 0.851035550236702, + "rewards/cosine_scaled_reward": 0.033624591305851936, + "rewards/format_reward": 0.6250000074505806, + "step": 151 + }, + { + "advantage_max": 1.8980120420455933, + "advantage_mean": 1.3659397612997282e-08, + "advantage_min": -0.8315541744232178, + "advantage_std": 0.9997904226183891, + "completion_length": 2617.5833740234375, + "epoch": 0.1737142857142857, + "grad_norm": 0.25885194540023804, + "kl": 0.0038526058197021484, + "lambda_div_used": 0.5, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0002, + "reward": -0.24192199483513832, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.24192199483513832, + "reward_after_std": 0.6966664083302021, + "reward_before_mean": 0.1597299799323082, + "reward_before_std": 0.7053615525364876, + "reward_change_max": 0.0009815618395805359, + "reward_change_mean": -0.40165199153125286, + "reward_change_min": -0.8559901602566242, + "reward_change_std": 0.3405442573130131, + "reward_std": 0.6966664344072342, + "rewards/cosine_scaled_reward": -0.1180516816675663, + "rewards/format_reward": 0.39583334140479565, + "step": 152 + }, + { + "advantage_max": 1.862968534231186, + "advantage_mean": 4.6566130562641916e-08, + "advantage_min": -0.9057076796889305, + "advantage_std": 0.9997600317001343, + "completion_length": 2691.729217529297, + "epoch": 0.17485714285714285, + "grad_norm": 0.26509541273117065, + "kl": 0.008235931396484375, + "lambda_div_used": 0.5, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0003, + "reward": -0.41280866833403707, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.41280866833403707, + "reward_after_std": 0.4682539440691471, + "reward_before_mean": -0.08330497704446316, + "reward_before_std": 0.4649371337145567, + "reward_change_max": 0.0011959150433540344, + "reward_change_mean": -0.3295036805793643, + "reward_change_min": -0.6323241330683231, + "reward_change_std": 0.25634999945759773, + "reward_std": 0.4682539738714695, + "rewards/cosine_scaled_reward": -0.22915249690413475, + "rewards/format_reward": 0.37500000931322575, + "step": 153 + }, + { + "advantage_max": 1.8755891919136047, + "advantage_mean": 1.8626452213954536e-08, + "advantage_min": -0.9043065384030342, + "advantage_std": 0.9998662620782852, + "completion_length": 2791.416717529297, + "epoch": 0.176, + "grad_norm": 0.16538317501544952, + "kl": 0.0028333663940429688, + "lambda_div_used": 0.5, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0001, + "reward": 0.12848534993827343, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12848534993827343, + "reward_after_std": 0.9189904667437077, + "reward_before_mean": 0.7679711561650038, + "reward_before_std": 0.8985546752810478, + "reward_change_max": 0.0, + "reward_change_mean": -0.6394858248531818, + "reward_change_min": -1.227436114102602, + "reward_change_std": 0.48260986618697643, + "reward_std": 0.9189904779195786, + "rewards/cosine_scaled_reward": 0.09231891017407179, + "rewards/format_reward": 0.5833333488553762, + "step": 154 + }, + { + "advantage_max": 1.8813521564006805, + "advantage_mean": 1.3038517820973539e-08, + "advantage_min": -0.9157970994710922, + "advantage_std": 0.9998358935117722, + "completion_length": 2411.0625381469727, + "epoch": 0.17714285714285713, + "grad_norm": 0.22895239293575287, + "kl": 0.0044879913330078125, + "lambda_div_used": 0.5, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0002, + "reward": 0.12336456589400768, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12336456589400768, + "reward_after_std": 0.8058106414973736, + "reward_before_mean": 0.7900370554998517, + "reward_before_std": 0.7316606715321541, + "reward_change_max": 0.0, + "reward_change_mean": -0.6666725035756826, + "reward_change_min": -1.1328043192625046, + "reward_change_std": 0.44681636057794094, + "reward_std": 0.8058106563985348, + "rewards/cosine_scaled_reward": 0.10335184819996357, + "rewards/format_reward": 0.5833333395421505, + "step": 155 + }, + { + "advantage_max": 1.8762772977352142, + "advantage_mean": 3.228584977144067e-08, + "advantage_min": -0.8056568801403046, + "advantage_std": 0.9997596368193626, + "completion_length": 2534.2500381469727, + "epoch": 0.1782857142857143, + "grad_norm": 0.18885566294193268, + "kl": 0.0033082962036132812, + "lambda_div_used": 0.5, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0001, + "reward": -0.15194648504257202, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15194648504257202, + "reward_after_std": 0.7984406873583794, + "reward_before_mean": 0.2926546409726143, + "reward_before_std": 0.8270257189869881, + "reward_change_max": 0.00426897406578064, + "reward_change_mean": -0.44460115022957325, + "reward_change_min": -0.9271150156855583, + "reward_change_std": 0.39101812755689025, + "reward_std": 0.7984407059848309, + "rewards/cosine_scaled_reward": -0.07242266833782196, + "rewards/format_reward": 0.4375000037252903, + "step": 156 + }, + { + "advantage_max": 1.9505676925182343, + "advantage_mean": 1.614292466367573e-08, + "advantage_min": -0.7314069494605064, + "advantage_std": 0.9998128190636635, + "completion_length": 2602.187530517578, + "epoch": 0.17942857142857144, + "grad_norm": 0.24203114211559296, + "kl": 0.005136966705322266, + "lambda_div_used": 0.5, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0002, + "reward": -0.09863819554448128, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09863819554448128, + "reward_after_std": 0.7120418101549149, + "reward_before_mean": 0.40562048298306763, + "reward_before_std": 0.5621872544288635, + "reward_change_max": 0.00021963566541671753, + "reward_change_mean": -0.5042586587369442, + "reward_change_min": -0.8021534122526646, + "reward_change_std": 0.307680306956172, + "reward_std": 0.7120418287813663, + "rewards/cosine_scaled_reward": -0.06802311120554805, + "rewards/format_reward": 0.5416666716337204, + "step": 157 + }, + { + "advantage_max": 1.909734457731247, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.8408237770199776, + "advantage_std": 0.9998659491539001, + "completion_length": 2733.8126220703125, + "epoch": 0.18057142857142858, + "grad_norm": 0.19597436487674713, + "kl": 0.0046710968017578125, + "lambda_div_used": 0.5, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0002, + "reward": 0.04880722239613533, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04880722239613533, + "reward_after_std": 0.8823314607143402, + "reward_before_mean": 0.6228008596226573, + "reward_before_std": 0.8027093037962914, + "reward_change_max": 0.0017058923840522766, + "reward_change_mean": -0.573993619531393, + "reward_change_min": -1.032454714179039, + "reward_change_std": 0.4067836385220289, + "reward_std": 0.8823314979672432, + "rewards/cosine_scaled_reward": 0.05098374653607607, + "rewards/format_reward": 0.5208333525806665, + "step": 158 + }, + { + "advantage_max": 1.8789568841457367, + "advantage_mean": 4.346172643998614e-09, + "advantage_min": -0.8822707831859589, + "advantage_std": 0.9997832998633385, + "completion_length": 2412.5208435058594, + "epoch": 0.18171428571428572, + "grad_norm": 0.22717958688735962, + "kl": 0.0044708251953125, + "lambda_div_used": 0.5, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0002, + "reward": -0.21282217151019722, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.21282217151019722, + "reward_after_std": 0.5436347834765911, + "reward_before_mean": 0.25775578059256077, + "reward_before_std": 0.5021079070866108, + "reward_change_max": 0.0008254200220108032, + "reward_change_mean": -0.4705779440701008, + "reward_change_min": -0.8097075633704662, + "reward_change_std": 0.31645464431494474, + "reward_std": 0.5436348021030426, + "rewards/cosine_scaled_reward": -0.13153879530727863, + "rewards/format_reward": 0.5208333432674408, + "step": 159 + }, + { + "advantage_max": 1.9354328662157059, + "advantage_mean": 1.4901161637936866e-08, + "advantage_min": -0.7828179746866226, + "advantage_std": 0.9998691380023956, + "completion_length": 2073.2500381469727, + "epoch": 0.18285714285714286, + "grad_norm": 0.20922227203845978, + "kl": 0.0052356719970703125, + "lambda_div_used": 0.5, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0002, + "reward": 0.041329525411129, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.041329525411129, + "reward_after_std": 0.8853391669690609, + "reward_before_mean": 0.6063848384656012, + "reward_before_std": 0.8054082207381725, + "reward_change_max": 0.0010421797633171082, + "reward_change_mean": -0.5650553349405527, + "reward_change_min": -0.9835452996194363, + "reward_change_std": 0.3828981779515743, + "reward_std": 0.8853391967713833, + "rewards/cosine_scaled_reward": -0.03014090470969677, + "rewards/format_reward": 0.6666666734963655, + "step": 160 + }, + { + "advantage_max": 1.9191998690366745, + "advantage_mean": 3.414849514271623e-08, + "advantage_min": -0.741759903728962, + "advantage_std": 0.9998334273695946, + "completion_length": 2272.854232788086, + "epoch": 0.184, + "grad_norm": 0.2741566002368927, + "kl": 0.00377655029296875, + "lambda_div_used": 0.5, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0002, + "reward": -0.12755455309525132, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12755455309525132, + "reward_after_std": 0.7911244854331017, + "reward_before_mean": 0.3247405719012022, + "reward_before_std": 0.7095393165946007, + "reward_change_max": 0.0008605495095252991, + "reward_change_mean": -0.45229510217905045, + "reward_change_min": -0.8612713702023029, + "reward_change_std": 0.313374862074852, + "reward_std": 0.7911244966089725, + "rewards/cosine_scaled_reward": -0.12929638382047415, + "rewards/format_reward": 0.5833333358168602, + "step": 161 + }, + { + "advantage_max": 1.868800163269043, + "advantage_mean": 3.7252905427109795e-08, + "advantage_min": -0.8891083151102066, + "advantage_std": 0.9998169168829918, + "completion_length": 2741.6458892822266, + "epoch": 0.18514285714285714, + "grad_norm": 0.2599841356277466, + "kl": 0.007129669189453125, + "lambda_div_used": 0.5, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0003, + "reward": -0.12564774230122566, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12564774230122566, + "reward_after_std": 0.7360602058470249, + "reward_before_mean": 0.3576655611395836, + "reward_before_std": 0.7290375046432018, + "reward_change_max": 0.00034596025943756104, + "reward_change_mean": -0.4833132941275835, + "reward_change_min": -0.9254280216991901, + "reward_change_std": 0.37433141842484474, + "reward_std": 0.7360602542757988, + "rewards/cosine_scaled_reward": -0.02950056130066514, + "rewards/format_reward": 0.4166666716337204, + "step": 162 + }, + { + "advantage_max": 1.8910458385944366, + "advantage_mean": -1.1486311457531428e-08, + "advantage_min": -0.7969113737344742, + "advantage_std": 0.9998064860701561, + "completion_length": 2245.729202270508, + "epoch": 0.18628571428571428, + "grad_norm": 0.2186397910118103, + "kl": 0.005253791809082031, + "lambda_div_used": 0.5, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0002, + "reward": 0.010679369792342186, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.010679369792342186, + "reward_after_std": 0.683979082852602, + "reward_before_mean": 0.6219610050320625, + "reward_before_std": 0.5548157496377826, + "reward_change_max": 0.0006725862622261047, + "reward_change_mean": -0.6112816166132689, + "reward_change_min": -0.9902759939432144, + "reward_change_std": 0.39613597467541695, + "reward_std": 0.6839791089296341, + "rewards/cosine_scaled_reward": 0.019313829019665718, + "rewards/format_reward": 0.5833333432674408, + "step": 163 + }, + { + "advantage_max": 1.8810230642557144, + "advantage_mean": 2.1265200045306187e-08, + "advantage_min": -0.8811631724238396, + "advantage_std": 0.9998394995927811, + "completion_length": 1964.4583587646484, + "epoch": 0.18742857142857142, + "grad_norm": 0.2334863245487213, + "kl": 0.0047149658203125, + "lambda_div_used": 0.5, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0002, + "reward": 0.07969965832307935, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07969965832307935, + "reward_after_std": 0.7769861966371536, + "reward_before_mean": 0.7223222488537431, + "reward_before_std": 0.7593985050916672, + "reward_change_max": 0.0, + "reward_change_mean": -0.642622577957809, + "reward_change_min": -1.1558948084712029, + "reward_change_std": 0.46158906538039446, + "reward_std": 0.7769861966371536, + "rewards/cosine_scaled_reward": -0.0034222062677145004, + "rewards/format_reward": 0.7291666828095913, + "step": 164 + }, + { + "advantage_max": 1.9029300063848495, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.8189666792750359, + "advantage_std": 0.9998203292489052, + "completion_length": 2241.9166946411133, + "epoch": 0.18857142857142858, + "grad_norm": 0.22668945789337158, + "kl": 0.005153656005859375, + "lambda_div_used": 0.5, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0002, + "reward": -0.16207364294677973, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.16207364294677973, + "reward_after_std": 0.7425038516521454, + "reward_before_mean": 0.2870515671093017, + "reward_before_std": 0.7249557636678219, + "reward_change_max": 0.0007998421788215637, + "reward_change_mean": -0.44912520330399275, + "reward_change_min": -0.9289770163595676, + "reward_change_std": 0.3635756126604974, + "reward_std": 0.7425038702785969, + "rewards/cosine_scaled_reward": -0.12730755750089884, + "rewards/format_reward": 0.541666679084301, + "step": 165 + }, + { + "advantage_max": 1.906169667840004, + "advantage_mean": 1.552204276222824e-08, + "advantage_min": -0.8235228583216667, + "advantage_std": 0.9998261034488678, + "completion_length": 2244.4583587646484, + "epoch": 0.18971428571428572, + "grad_norm": 0.16937896609306335, + "kl": 0.0038509368896484375, + "lambda_div_used": 0.5, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0002, + "reward": 0.02075265534222126, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02075265534222126, + "reward_after_std": 0.7293945774435997, + "reward_before_mean": 0.6221222206950188, + "reward_before_std": 0.6278940867632627, + "reward_change_max": 0.0005093738436698914, + "reward_change_mean": -0.6013695877045393, + "reward_change_min": -1.004751831293106, + "reward_change_std": 0.38687464594841003, + "reward_std": 0.7293945997953415, + "rewards/cosine_scaled_reward": 0.008977774530649185, + "rewards/format_reward": 0.6041666697710752, + "step": 166 + }, + { + "advantage_max": 1.8805283606052399, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.9127615913748741, + "advantage_std": 0.9998705834150314, + "completion_length": 2170.187545776367, + "epoch": 0.19085714285714286, + "grad_norm": 0.19412115216255188, + "kl": 0.0035266876220703125, + "lambda_div_used": 0.5, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0001, + "reward": 0.07650089706294239, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07650089706294239, + "reward_after_std": 0.9583298601210117, + "reward_before_mean": 0.6533183455467224, + "reward_before_std": 0.9510982520878315, + "reward_change_max": 0.006050758063793182, + "reward_change_mean": -0.5768174193799496, + "reward_change_min": -1.1144284754991531, + "reward_change_std": 0.4647065959870815, + "reward_std": 0.9583299160003662, + "rewards/cosine_scaled_reward": -0.06917417328804731, + "rewards/format_reward": 0.7916666865348816, + "step": 167 + }, + { + "advantage_max": 1.8871784955263138, + "advantage_mean": -8.071462831438225e-09, + "advantage_min": -0.8822121098637581, + "advantage_std": 0.9998352527618408, + "completion_length": 2507.8334197998047, + "epoch": 0.192, + "grad_norm": 0.19946350157260895, + "kl": 0.0037517547607421875, + "lambda_div_used": 0.5, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0002, + "reward": -0.021524932235479355, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.021524932235479355, + "reward_after_std": 0.8910724967718124, + "reward_before_mean": 0.4931417219340801, + "reward_before_std": 0.8521817494183779, + "reward_change_max": 0.001464754343032837, + "reward_change_mean": -0.5146666690707207, + "reward_change_min": -1.010201033204794, + "reward_change_std": 0.3852034341543913, + "reward_std": 0.8910725526511669, + "rewards/cosine_scaled_reward": -0.05551247042603791, + "rewards/format_reward": 0.6041666772216558, + "step": 168 + }, + { + "advantage_max": 1.9182345271110535, + "advantage_mean": -3.4769377266208323e-08, + "advantage_min": -0.9020901657640934, + "advantage_std": 0.999874897301197, + "completion_length": 1449.6041946411133, + "epoch": 0.19314285714285714, + "grad_norm": 0.20374953746795654, + "kl": 0.0037746429443359375, + "lambda_div_used": 0.5, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0002, + "reward": 0.4316548388451338, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4316548388451338, + "reward_after_std": 0.8835392966866493, + "reward_before_mean": 1.3296481654979289, + "reward_before_std": 0.7325374772772193, + "reward_change_max": 0.0, + "reward_change_mean": -0.8979933187365532, + "reward_change_min": -1.3769907057285309, + "reward_change_std": 0.554233618080616, + "reward_std": 0.8835393264889717, + "rewards/cosine_scaled_reward": 0.21690738759934902, + "rewards/format_reward": 0.8958333432674408, + "step": 169 + }, + { + "advantage_max": 1.9222121238708496, + "advantage_mean": 4.718701318573437e-08, + "advantage_min": -0.8292897716164589, + "advantage_std": 0.9997989609837532, + "completion_length": 2184.1875534057617, + "epoch": 0.19428571428571428, + "grad_norm": 0.21339966356754303, + "kl": 0.004436492919921875, + "lambda_div_used": 0.5, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0002, + "reward": -0.003595355898141861, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.003595355898141861, + "reward_after_std": 0.7801594603806734, + "reward_before_mean": 0.56236975453794, + "reward_before_std": 0.7188321929425001, + "reward_change_max": 0.0028308257460594177, + "reward_change_mean": -0.5659651174210012, + "reward_change_min": -1.0077791698276997, + "reward_change_std": 0.40986311715096235, + "reward_std": 0.7801594976335764, + "rewards/cosine_scaled_reward": -6.513111293315887e-05, + "rewards/format_reward": 0.5625, + "step": 170 + }, + { + "advantage_max": 1.8645301908254623, + "advantage_mean": 1.870406252102441e-08, + "advantage_min": -0.8515758588910103, + "advantage_std": 0.9998170509934425, + "completion_length": 2218.6041717529297, + "epoch": 0.19542857142857142, + "grad_norm": 0.1987854391336441, + "kl": 0.0031156539916992188, + "lambda_div_used": 0.5, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0001, + "reward": 0.025004766881465912, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.025004766881465912, + "reward_after_std": 0.7621033377945423, + "reward_before_mean": 0.6254060752689838, + "reward_before_std": 0.7110554575920105, + "reward_change_max": 0.0008187741041183472, + "reward_change_mean": -0.600401321426034, + "reward_change_min": -1.0654651895165443, + "reward_change_std": 0.4122706390917301, + "reward_std": 0.7621033787727356, + "rewards/cosine_scaled_reward": 0.04186970740556717, + "rewards/format_reward": 0.5416666679084301, + "step": 171 + }, + { + "advantage_max": 1.8990549445152283, + "advantage_mean": 2.483527605789959e-09, + "advantage_min": -0.8405355215072632, + "advantage_std": 0.9998717159032822, + "completion_length": 2534.520866394043, + "epoch": 0.19657142857142856, + "grad_norm": 0.26087817549705505, + "kl": 0.0061130523681640625, + "lambda_div_used": 0.5, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0002, + "reward": 0.19893252104520798, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19893252104520798, + "reward_after_std": 0.9219256825745106, + "reward_before_mean": 0.8899438828229904, + "reward_before_std": 0.8286309987306595, + "reward_change_max": 0.0008333176374435425, + "reward_change_mean": -0.6910113664343953, + "reward_change_min": -1.1952459029853344, + "reward_change_std": 0.4888560585677624, + "reward_std": 0.9219257161021233, + "rewards/cosine_scaled_reward": 0.1741385916247964, + "rewards/format_reward": 0.5416666809469461, + "step": 172 + }, + { + "advantage_max": 1.9473352134227753, + "advantage_mean": 7.450581041013038e-09, + "advantage_min": -0.7159592658281326, + "advantage_std": 0.9998235329985619, + "completion_length": 1390.0833473205566, + "epoch": 0.1977142857142857, + "grad_norm": 0.2918720543384552, + "kl": 0.004482269287109375, + "lambda_div_used": 0.5, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0002, + "reward": -0.030134814442135394, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.030134814442135394, + "reward_after_std": 0.7218679785728455, + "reward_before_mean": 0.5274257343262434, + "reward_before_std": 0.5976778594776988, + "reward_change_max": 0.00031603872776031494, + "reward_change_mean": -0.5575605668127537, + "reward_change_min": -0.9151461683213711, + "reward_change_std": 0.35466235876083374, + "reward_std": 0.7218680009245872, + "rewards/cosine_scaled_reward": -0.1321204612031579, + "rewards/format_reward": 0.7916666679084301, + "step": 173 + }, + { + "advantage_max": 1.9343837201595306, + "advantage_mean": 9.934107203513065e-09, + "advantage_min": -0.6875580325722694, + "advantage_std": 0.9998670294880867, + "completion_length": 1463.7500305175781, + "epoch": 0.19885714285714284, + "grad_norm": 0.21818959712982178, + "kl": 0.005323886871337891, + "lambda_div_used": 0.5, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0002, + "reward": 0.22527608275413513, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22527608275413513, + "reward_after_std": 0.9232716746628284, + "reward_before_mean": 0.9317025779746473, + "reward_before_std": 0.7586650950834155, + "reward_change_max": 0.0010546371340751648, + "reward_change_mean": -0.7064264751970768, + "reward_change_min": -1.2322439178824425, + "reward_change_std": 0.45952145755290985, + "reward_std": 0.9232717081904411, + "rewards/cosine_scaled_reward": 0.028351284796372056, + "rewards/format_reward": 0.875, + "step": 174 + }, + { + "advantage_max": 1.8499791771173477, + "advantage_mean": -6.2088170160734535e-09, + "advantage_min": -1.020237274467945, + "advantage_std": 0.999834693968296, + "completion_length": 2484.833366394043, + "epoch": 0.2, + "grad_norm": 0.22397536039352417, + "kl": 0.0039272308349609375, + "lambda_div_used": 0.5, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0002, + "reward": 0.09528359724208713, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09528359724208713, + "reward_after_std": 0.7255423031747341, + "reward_before_mean": 0.7738265693187714, + "reward_before_std": 0.7208334431052208, + "reward_change_max": 0.00027216970920562744, + "reward_change_mean": -0.6785429371520877, + "reward_change_min": -1.1358166001737118, + "reward_change_std": 0.4691190980374813, + "reward_std": 0.725542314350605, + "rewards/cosine_scaled_reward": 0.07441327720880508, + "rewards/format_reward": 0.6250000167638063, + "step": 175 + }, + { + "advantage_max": 1.905025526881218, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.8770438581705093, + "advantage_std": 0.9998798817396164, + "completion_length": 2178.458396911621, + "epoch": 0.20114285714285715, + "grad_norm": 0.2587198317050934, + "kl": 0.00556182861328125, + "lambda_div_used": 0.5, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0002, + "reward": 0.1341405614912219, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1341405614912219, + "reward_after_std": 1.0044050849974155, + "reward_before_mean": 0.7469082167372108, + "reward_before_std": 0.9681647643446922, + "reward_change_max": 0.0, + "reward_change_mean": -0.6127676479518414, + "reward_change_min": -1.131796881556511, + "reward_change_std": 0.4701329004019499, + "reward_std": 1.0044051185250282, + "rewards/cosine_scaled_reward": 0.02970409602858126, + "rewards/format_reward": 0.6875000074505806, + "step": 176 + }, + { + "advantage_max": 1.8849133551120758, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.8384641855955124, + "advantage_std": 0.9998805969953537, + "completion_length": 2451.854232788086, + "epoch": 0.2022857142857143, + "grad_norm": 0.24274566769599915, + "kl": 0.0052585601806640625, + "lambda_div_used": 0.5, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0002, + "reward": 0.02091561071574688, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.02091561071574688, + "reward_after_std": 0.9315523132681847, + "reward_before_mean": 0.5612329412251711, + "reward_before_std": 0.9423600323498249, + "reward_change_max": 0.0008768588304519653, + "reward_change_mean": -0.5403173211961985, + "reward_change_min": -1.0866288468241692, + "reward_change_std": 0.4319887850433588, + "reward_std": 0.9315523356199265, + "rewards/cosine_scaled_reward": -0.031883541494607925, + "rewards/format_reward": 0.625000013038516, + "step": 177 + }, + { + "advantage_max": 1.858227476477623, + "advantage_mean": -3.1044086745701804e-09, + "advantage_min": -0.8183949738740921, + "advantage_std": 0.9998875185847282, + "completion_length": 1927.0833740234375, + "epoch": 0.20342857142857143, + "grad_norm": 0.26487603783607483, + "kl": 0.00604248046875, + "lambda_div_used": 0.5, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0002, + "reward": 0.2407649210654199, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2407649210654199, + "reward_after_std": 1.03031674772501, + "reward_before_mean": 0.9408783931285143, + "reward_before_std": 1.0674152709543705, + "reward_change_max": 0.00026485323905944824, + "reward_change_mean": -0.7001134771853685, + "reward_change_min": -1.4129510670900345, + "reward_change_std": 0.5647882856428623, + "reward_std": 1.0303168073296547, + "rewards/cosine_scaled_reward": 0.13710585562512279, + "rewards/format_reward": 0.6666666809469461, + "step": 178 + }, + { + "advantage_max": 1.928291454911232, + "advantage_mean": 1.8005570590062803e-08, + "advantage_min": -0.7337657734751701, + "advantage_std": 0.9998143911361694, + "completion_length": 2320.0417098999023, + "epoch": 0.20457142857142857, + "grad_norm": 0.2769847512245178, + "kl": 0.0048007965087890625, + "lambda_div_used": 0.5, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0002, + "reward": -0.2737759065348655, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2737759065348655, + "reward_after_std": 0.7116647139191628, + "reward_before_mean": 0.0847318172454834, + "reward_before_std": 0.6560190003365278, + "reward_change_max": 0.0031480640172958374, + "reward_change_mean": -0.3585077226161957, + "reward_change_min": -0.7125654704868793, + "reward_change_std": 0.2866736575961113, + "reward_std": 0.7116647399961948, + "rewards/cosine_scaled_reward": -0.22846743231639266, + "rewards/format_reward": 0.5416666679084301, + "step": 179 + }, + { + "advantage_max": 1.9269533902406693, + "advantage_mean": -2.1109979209121832e-08, + "advantage_min": -0.6805262081325054, + "advantage_std": 0.9998768717050552, + "completion_length": 1817.520881652832, + "epoch": 0.2057142857142857, + "grad_norm": 0.3480105400085449, + "kl": 0.0066127777099609375, + "lambda_div_used": 0.5, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0003, + "reward": 0.24234669422730803, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24234669422730803, + "reward_after_std": 1.059589195996523, + "reward_before_mean": 0.924484197050333, + "reward_before_std": 0.9491253048181534, + "reward_change_max": 0.0, + "reward_change_mean": -0.6821374967694283, + "reward_change_min": -1.345071405172348, + "reward_change_std": 0.5074834516271949, + "reward_std": 1.0595892071723938, + "rewards/cosine_scaled_reward": 0.10807542316615582, + "rewards/format_reward": 0.7083333358168602, + "step": 180 + }, + { + "advantage_max": 1.9125166982412338, + "advantage_mean": 9.934107536579972e-09, + "advantage_min": -0.8180154636502266, + "advantage_std": 0.9997949376702309, + "completion_length": 2856.791717529297, + "epoch": 0.20685714285714285, + "grad_norm": 0.23662905395030975, + "kl": 0.0064849853515625, + "lambda_div_used": 0.5, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0003, + "reward": -0.23404993303120136, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.23404993303120136, + "reward_after_std": 0.6118505820631981, + "reward_before_mean": 0.1888514976017177, + "reward_before_std": 0.515215652063489, + "reward_change_max": 0.0014413893222808838, + "reward_change_mean": -0.4229014324955642, + "reward_change_min": -0.6865375861525536, + "reward_change_std": 0.2758261002600193, + "reward_std": 0.6118505895137787, + "rewards/cosine_scaled_reward": -0.0826575867831707, + "rewards/format_reward": 0.3541666753590107, + "step": 181 + }, + { + "advantage_max": 1.8652609288692474, + "advantage_mean": -1.055498977109437e-08, + "advantage_min": -0.9345456510782242, + "advantage_std": 0.9998304173350334, + "completion_length": 1919.791732788086, + "epoch": 0.208, + "grad_norm": 0.16495570540428162, + "kl": 0.002300739288330078, + "lambda_div_used": 0.5, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0001, + "reward": 0.10570523329079151, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10570523329079151, + "reward_after_std": 0.7955246269702911, + "reward_before_mean": 0.7623180113732815, + "reward_before_std": 0.7287193089723587, + "reward_change_max": 0.002306625247001648, + "reward_change_mean": -0.6566128022968769, + "reward_change_min": -1.0960058122873306, + "reward_change_std": 0.43216282688081264, + "reward_std": 0.7955246269702911, + "rewards/cosine_scaled_reward": 0.016575670335441828, + "rewards/format_reward": 0.7291666753590107, + "step": 182 + }, + { + "advantage_max": 1.9063877165317535, + "advantage_mean": -3.7834980481932234e-08, + "advantage_min": -0.8252428323030472, + "advantage_std": 0.9998625591397285, + "completion_length": 1387.1667022705078, + "epoch": 0.20914285714285713, + "grad_norm": 0.23810604214668274, + "kl": 0.00701904296875, + "lambda_div_used": 0.5, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0003, + "reward": 0.35889948764815927, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.35889948764815927, + "reward_after_std": 0.8050801753997803, + "reward_before_mean": 1.217661987990141, + "reward_before_std": 0.6298352889716625, + "reward_change_max": 0.0, + "reward_change_mean": -0.8587625250220299, + "reward_change_min": -1.343634694814682, + "reward_change_std": 0.5051140710711479, + "reward_std": 0.8050802126526833, + "rewards/cosine_scaled_reward": 0.14008096978068352, + "rewards/format_reward": 0.9375000074505806, + "step": 183 + }, + { + "advantage_max": 1.9439354538917542, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -0.7739471718668938, + "advantage_std": 0.9997957646846771, + "completion_length": 2074.5417289733887, + "epoch": 0.2102857142857143, + "grad_norm": 0.2490999549627304, + "kl": 0.0060520172119140625, + "lambda_div_used": 0.5, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0002, + "reward": -0.20943114906549454, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.20943114906549454, + "reward_after_std": 0.6082219518721104, + "reward_before_mean": 0.23564279254060239, + "reward_before_std": 0.49706319300457835, + "reward_change_max": 0.0028117522597312927, + "reward_change_mean": -0.44507398270070553, + "reward_change_min": -0.7033988237380981, + "reward_change_std": 0.28294625133275986, + "reward_std": 0.608221959322691, + "rewards/cosine_scaled_reward": -0.20509526692330837, + "rewards/format_reward": 0.6458333358168602, + "step": 184 + }, + { + "advantage_max": 1.9413893222808838, + "advantage_mean": 4.718701052119911e-08, + "advantage_min": -0.8057427629828453, + "advantage_std": 0.9997518807649612, + "completion_length": 1958.3750228881836, + "epoch": 0.21142857142857144, + "grad_norm": 0.2674250900745392, + "kl": 0.004154205322265625, + "lambda_div_used": 0.5, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0002, + "reward": -0.3591794992535142, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.3591794992535142, + "reward_after_std": 0.5219908468425274, + "reward_before_mean": -0.010549581609666348, + "reward_before_std": 0.439737007021904, + "reward_change_max": 0.0004829689860343933, + "reward_change_mean": -0.3486299216747284, + "reward_change_min": -0.5860956497490406, + "reward_change_std": 0.23113016970455647, + "reward_std": 0.5219908636063337, + "rewards/cosine_scaled_reward": -0.3073581252247095, + "rewards/format_reward": 0.6041666753590107, + "step": 185 + }, + { + "advantage_max": 1.873088613152504, + "advantage_mean": 1.490116141589226e-08, + "advantage_min": -0.9214615821838379, + "advantage_std": 0.9998159259557724, + "completion_length": 2246.875015258789, + "epoch": 0.21257142857142858, + "grad_norm": 0.2050282061100006, + "kl": 0.00543212890625, + "lambda_div_used": 0.5, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0002, + "reward": -0.07845413440372795, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07845413440372795, + "reward_after_std": 0.6444421708583832, + "reward_before_mean": 0.47786211501806974, + "reward_before_std": 0.6074944473803043, + "reward_change_max": 0.002060152590274811, + "reward_change_mean": -0.556316233240068, + "reward_change_min": -0.9550213851034641, + "reward_change_std": 0.38717483170330524, + "reward_std": 0.6444421894848347, + "rewards/cosine_scaled_reward": -0.04231897369027138, + "rewards/format_reward": 0.5625000055879354, + "step": 186 + }, + { + "advantage_max": 1.9475088268518448, + "advantage_mean": 1.3038517154839724e-08, + "advantage_min": -0.7693781480193138, + "advantage_std": 0.9998098015785217, + "completion_length": 1656.9375305175781, + "epoch": 0.21371428571428572, + "grad_norm": 0.23222263157367706, + "kl": 0.005061149597167969, + "lambda_div_used": 0.5, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0002, + "reward": -0.08286092977505177, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08286092977505177, + "reward_after_std": 0.5727328844368458, + "reward_before_mean": 0.4811726361513138, + "reward_before_std": 0.43661591596901417, + "reward_change_max": 0.0015158876776695251, + "reward_change_mean": -0.564033567905426, + "reward_change_min": -0.8926524370908737, + "reward_change_std": 0.33777882531285286, + "reward_std": 0.5727329030632973, + "rewards/cosine_scaled_reward": -0.15524702798575163, + "rewards/format_reward": 0.7916666716337204, + "step": 187 + }, + { + "advantage_max": 1.9176759123802185, + "advantage_mean": -4.967054045845742e-09, + "advantage_min": -0.7783575281500816, + "advantage_std": 0.9998136684298515, + "completion_length": 2514.562515258789, + "epoch": 0.21485714285714286, + "grad_norm": 0.1731446236371994, + "kl": 0.0059967041015625, + "lambda_div_used": 0.5, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0002, + "reward": -0.20618502353318036, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.20618502353318036, + "reward_after_std": 0.7404297292232513, + "reward_before_mean": 0.19920311123132706, + "reward_before_std": 0.6630725599825382, + "reward_change_max": 0.00040778517723083496, + "reward_change_mean": -0.40538814663887024, + "reward_change_min": -0.7672394849359989, + "reward_change_std": 0.28316234797239304, + "reward_std": 0.7404297553002834, + "rewards/cosine_scaled_reward": -0.15039845742285252, + "rewards/format_reward": 0.5000000074505806, + "step": 188 + }, + { + "advantage_max": 1.9747483879327774, + "advantage_mean": 7.450580596923828e-09, + "advantage_min": -0.6897084377706051, + "advantage_std": 0.999849870800972, + "completion_length": 1798.0417098999023, + "epoch": 0.216, + "grad_norm": 0.24236349761486053, + "kl": 0.0056133270263671875, + "lambda_div_used": 0.5, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0002, + "reward": -0.020142017863690853, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.020142017863690853, + "reward_after_std": 0.8742521069943905, + "reward_before_mean": 0.4924778901040554, + "reward_before_std": 0.7583403587341309, + "reward_change_max": 0.0004052966833114624, + "reward_change_mean": -0.5126199284568429, + "reward_change_min": -0.873991385102272, + "reward_change_std": 0.3210005727596581, + "reward_std": 0.8742521218955517, + "rewards/cosine_scaled_reward": -0.0975110623985529, + "rewards/format_reward": 0.687500013038516, + "step": 189 + }, + { + "advantage_max": 1.9481079131364822, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.6872792914509773, + "advantage_std": 0.9998851120471954, + "completion_length": 1400.145881652832, + "epoch": 0.21714285714285714, + "grad_norm": 0.22386141121387482, + "kl": 0.0051937103271484375, + "lambda_div_used": 0.5, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0002, + "reward": 0.2527433391660452, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2527433391660452, + "reward_after_std": 0.9781098589301109, + "reward_before_mean": 0.9614727329462767, + "reward_before_std": 0.8016847297549248, + "reward_change_max": 0.0, + "reward_change_mean": -0.7087293863296509, + "reward_change_min": -1.2133204266428947, + "reward_change_std": 0.45133110880851746, + "reward_std": 0.9781098812818527, + "rewards/cosine_scaled_reward": 0.0432363604195416, + "rewards/format_reward": 0.8750000149011612, + "step": 190 + }, + { + "advantage_max": 1.9580977708101273, + "advantage_mean": 3.725290298461914e-09, + "advantage_min": -0.6801789999008179, + "advantage_std": 0.9998925402760506, + "completion_length": 1348.5625534057617, + "epoch": 0.21828571428571428, + "grad_norm": 0.26126664876937866, + "kl": 0.004961967468261719, + "lambda_div_used": 0.5, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0002, + "reward": 0.320645788917318, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.320645788917318, + "reward_after_std": 1.0616176091134548, + "reward_before_mean": 1.060829535126686, + "reward_before_std": 0.8928857175633311, + "reward_change_max": 0.002303190529346466, + "reward_change_mean": -0.7401837222278118, + "reward_change_min": -1.3448436558246613, + "reward_change_std": 0.4931131489574909, + "reward_std": 1.0616176202893257, + "rewards/cosine_scaled_reward": 0.07208140660077333, + "rewards/format_reward": 0.9166666716337204, + "step": 191 + }, + { + "advantage_max": 1.9228132516145706, + "advantage_mean": 1.179675312990014e-08, + "advantage_min": -0.7693631574511528, + "advantage_std": 0.9998264163732529, + "completion_length": 1864.6250305175781, + "epoch": 0.21942857142857142, + "grad_norm": 0.21476708352565765, + "kl": 0.005016326904296875, + "lambda_div_used": 0.5, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0002, + "reward": -0.013189246295951307, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.013189246295951307, + "reward_after_std": 0.6955150477588177, + "reward_before_mean": 0.5694208387285471, + "reward_before_std": 0.5938720889389515, + "reward_change_max": 0.0, + "reward_change_mean": -0.5826100930571556, + "reward_change_min": -1.0174332857131958, + "reward_change_std": 0.368743147701025, + "reward_std": 0.6955150812864304, + "rewards/cosine_scaled_reward": -0.12153958529233932, + "rewards/format_reward": 0.8125, + "step": 192 + }, + { + "advantage_max": 1.9259341210126877, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -0.7410866692662239, + "advantage_std": 0.9997951835393906, + "completion_length": 2657.7708587646484, + "epoch": 0.22057142857142858, + "grad_norm": 0.5041037201881409, + "kl": 0.016092300415039062, + "lambda_div_used": 0.5, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0006, + "reward": -0.32176475087180734, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32176475087180734, + "reward_after_std": 0.5517542436718941, + "reward_before_mean": 0.04636443965137005, + "reward_before_std": 0.48256898298859596, + "reward_change_max": 0.0009066537022590637, + "reward_change_mean": -0.3681291975080967, + "reward_change_min": -0.6095230802893639, + "reward_change_std": 0.2412662087008357, + "reward_std": 0.5517542473971844, + "rewards/cosine_scaled_reward": -0.19556778552941978, + "rewards/format_reward": 0.43750000186264515, + "step": 193 + }, + { + "advantage_max": 1.8814585208892822, + "advantage_mean": -2.8560559917067962e-08, + "advantage_min": -0.7467570975422859, + "advantage_std": 0.999844953417778, + "completion_length": 2386.416732788086, + "epoch": 0.22171428571428572, + "grad_norm": 0.19957709312438965, + "kl": 0.006275177001953125, + "lambda_div_used": 0.5, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0003, + "reward": 0.24586467817425728, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.24586467817425728, + "reward_after_std": 0.8887559995055199, + "reward_before_mean": 0.9962658639997244, + "reward_before_std": 0.8165496792644262, + "reward_change_max": 0.0, + "reward_change_mean": -0.7504012230783701, + "reward_change_min": -1.3230471685528755, + "reward_change_std": 0.5285136736929417, + "reward_std": 0.8887560218572617, + "rewards/cosine_scaled_reward": 0.12313293479382992, + "rewards/format_reward": 0.7500000074505806, + "step": 194 + }, + { + "advantage_max": 1.944841906428337, + "advantage_mean": 1.3969838702498905e-08, + "advantage_min": -0.7255363836884499, + "advantage_std": 0.9998145550489426, + "completion_length": 1739.8542022705078, + "epoch": 0.22285714285714286, + "grad_norm": 0.22262509167194366, + "kl": 0.00640106201171875, + "lambda_div_used": 0.5, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0003, + "reward": -0.14309865795075893, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.14309865795075893, + "reward_after_std": 0.700147919356823, + "reward_before_mean": 0.3267921321094036, + "reward_before_std": 0.6149430330842733, + "reward_change_max": 0.00010732561349868774, + "reward_change_mean": -0.4698908142745495, + "reward_change_min": -0.9223340824246407, + "reward_change_std": 0.3288527149707079, + "reward_std": 0.7001479230821133, + "rewards/cosine_scaled_reward": -0.20118727069348097, + "rewards/format_reward": 0.7291666753590107, + "step": 195 + }, + { + "advantage_max": 1.8364529013633728, + "advantage_mean": 1.11758712839638e-08, + "advantage_min": -0.9304062947630882, + "advantage_std": 0.9998329728841782, + "completion_length": 2679.729263305664, + "epoch": 0.224, + "grad_norm": 0.22063890099525452, + "kl": 0.007236480712890625, + "lambda_div_used": 0.5, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0003, + "reward": -0.1205627042800188, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1205627042800188, + "reward_after_std": 0.7646887302398682, + "reward_before_mean": 0.36539409251417965, + "reward_before_std": 0.8065420165657997, + "reward_change_max": 7.483363151550293e-05, + "reward_change_mean": -0.48595677874982357, + "reward_change_min": -0.9786637127399445, + "reward_change_std": 0.40618606097996235, + "reward_std": 0.7646887451410294, + "rewards/cosine_scaled_reward": -0.05688631488010287, + "rewards/format_reward": 0.4791666716337204, + "step": 196 + }, + { + "advantage_max": 1.9239116162061691, + "advantage_mean": -9.54605661185326e-09, + "advantage_min": -0.7319557182490826, + "advantage_std": 0.9998846724629402, + "completion_length": 1059.9167022705078, + "epoch": 0.22514285714285714, + "grad_norm": 0.2716602385044098, + "kl": 0.005367279052734375, + "lambda_div_used": 0.5, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0002, + "reward": 0.40557946916669607, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.40557946916669607, + "reward_after_std": 1.0015158616006374, + "reward_before_mean": 1.2390838451683521, + "reward_before_std": 0.8481772616505623, + "reward_change_max": 0.0, + "reward_change_mean": -0.8335044011473656, + "reward_change_min": -1.465621568262577, + "reward_change_std": 0.5422770120203495, + "reward_std": 1.0015158914029598, + "rewards/cosine_scaled_reward": 0.1507919318974018, + "rewards/format_reward": 0.9375, + "step": 197 + }, + { + "advantage_max": 1.911670058965683, + "advantage_mean": 2.110997909809953e-08, + "advantage_min": -0.8416686952114105, + "advantage_std": 0.9998601600527763, + "completion_length": 1591.2083587646484, + "epoch": 0.22628571428571428, + "grad_norm": 0.2324543595314026, + "kl": 0.006961822509765625, + "lambda_div_used": 0.5, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0003, + "reward": 0.2818256893660873, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2818256893660873, + "reward_after_std": 0.9285234659910202, + "reward_before_mean": 1.0414979457855225, + "reward_before_std": 0.8359009139239788, + "reward_change_max": 0.0, + "reward_change_mean": -0.7596722422167659, + "reward_change_min": -1.2616542540490627, + "reward_change_std": 0.504207344725728, + "reward_std": 0.9285234957933426, + "rewards/cosine_scaled_reward": 0.10408229497261345, + "rewards/format_reward": 0.8333333358168602, + "step": 198 + }, + { + "advantage_max": 1.9385438710451126, + "advantage_mean": -2.1730859334212482e-09, + "advantage_min": -0.7928804978728294, + "advantage_std": 0.9998569265007973, + "completion_length": 1582.166732788086, + "epoch": 0.22742857142857142, + "grad_norm": 0.22295568883419037, + "kl": 0.0064563751220703125, + "lambda_div_used": 0.5, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0003, + "reward": 0.047651538625359535, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.047651538625359535, + "reward_after_std": 0.8108061738312244, + "reward_before_mean": 0.6391786062158644, + "reward_before_std": 0.6576478350907564, + "reward_change_max": 0.0, + "reward_change_mean": -0.591527096927166, + "reward_change_min": -0.955488495528698, + "reward_change_std": 0.36143919453024864, + "reward_std": 0.8108061775565147, + "rewards/cosine_scaled_reward": -0.1491607059724629, + "rewards/format_reward": 0.9375000074505806, + "step": 199 + }, + { + "advantage_max": 1.9287515133619308, + "advantage_mean": -6.208816794028849e-10, + "advantage_min": -0.8186419308185577, + "advantage_std": 0.999849408864975, + "completion_length": 1619.4583740234375, + "epoch": 0.22857142857142856, + "grad_norm": 0.2281968593597412, + "kl": 0.005645751953125, + "lambda_div_used": 0.5, + "learning_rate": 7.75e-07, + "loss": 0.0002, + "reward": 0.13060855865478516, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13060855865478516, + "reward_after_std": 0.787628311663866, + "reward_before_mean": 0.8036444410681725, + "reward_before_std": 0.6729063596576452, + "reward_change_max": 0.0010903030633926392, + "reward_change_mean": -0.673035865649581, + "reward_change_min": -1.091851219534874, + "reward_change_std": 0.4272896870970726, + "reward_std": 0.7876283414661884, + "rewards/cosine_scaled_reward": 0.005988870281726122, + "rewards/format_reward": 0.7916666753590107, + "step": 200 + }, + { + "advantage_max": 1.9076077789068222, + "advantage_mean": 1.6763806343078613e-08, + "advantage_min": -0.7760433480143547, + "advantage_std": 0.9998678788542747, + "completion_length": 1999.1667022705078, + "epoch": 0.2297142857142857, + "grad_norm": 0.23005475103855133, + "kl": 0.0055484771728515625, + "lambda_div_used": 0.5, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0002, + "reward": 0.4647516645491123, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4647516645491123, + "reward_after_std": 0.9089144691824913, + "reward_before_mean": 1.3805565685033798, + "reward_before_std": 0.7163506662473083, + "reward_change_max": 0.0, + "reward_change_mean": -0.9158048909157515, + "reward_change_min": -1.4860983304679394, + "reward_change_std": 0.5765415318310261, + "reward_std": 0.9089145064353943, + "rewards/cosine_scaled_reward": 0.2840282618999481, + "rewards/format_reward": 0.8125000074505806, + "step": 201 + }, + { + "advantage_max": 1.9722920954227448, + "advantage_mean": -2.0566708336389183e-08, + "advantage_min": -0.6951834484934807, + "advantage_std": 0.9998218566179276, + "completion_length": 1457.9166946411133, + "epoch": 0.23085714285714284, + "grad_norm": 0.20306335389614105, + "kl": 0.004856109619140625, + "lambda_div_used": 0.5, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0002, + "reward": 0.2708722506649792, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2708722506649792, + "reward_after_std": 0.6537715718150139, + "reward_before_mean": 1.1032776683568954, + "reward_before_std": 0.3559281248599291, + "reward_change_max": 0.0013939812779426575, + "reward_change_mean": -0.8324054088443518, + "reward_change_min": -1.1846450828015804, + "reward_change_std": 0.455369858071208, + "reward_std": 0.6537716016173363, + "rewards/cosine_scaled_reward": 0.14538880321197212, + "rewards/format_reward": 0.8125, + "step": 202 + }, + { + "advantage_max": 1.9697763472795486, + "advantage_mean": 1.2417631367611648e-09, + "advantage_min": -0.7334681376814842, + "advantage_std": 0.9998695030808449, + "completion_length": 1679.4375610351562, + "epoch": 0.232, + "grad_norm": 0.23335202038288116, + "kl": 0.0074920654296875, + "lambda_div_used": 0.5, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0003, + "reward": 0.15621353359892964, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15621353359892964, + "reward_after_std": 0.8448192551732063, + "reward_before_mean": 0.8237780556082726, + "reward_before_std": 0.6491473764181137, + "reward_change_max": 0.0, + "reward_change_mean": -0.6675645336508751, + "reward_change_min": -1.0493612885475159, + "reward_change_std": 0.399025097489357, + "reward_std": 0.8448192626237869, + "rewards/cosine_scaled_reward": -0.015194314531981945, + "rewards/format_reward": 0.854166679084301, + "step": 203 + }, + { + "advantage_max": 1.9231015890836716, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.7674050070345402, + "advantage_std": 0.9998611137270927, + "completion_length": 1489.666732788086, + "epoch": 0.23314285714285715, + "grad_norm": 0.2765462100505829, + "kl": 0.006114959716796875, + "lambda_div_used": 0.5, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0002, + "reward": 0.19802786083891988, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19802786083891988, + "reward_after_std": 0.8996425718069077, + "reward_before_mean": 0.8953988589346409, + "reward_before_std": 0.8019166607409716, + "reward_change_max": 0.0, + "reward_change_mean": -0.6973709799349308, + "reward_change_min": -1.293586179614067, + "reward_change_std": 0.4740810338407755, + "reward_std": 0.8996425941586494, + "rewards/cosine_scaled_reward": -0.00021725334227085114, + "rewards/format_reward": 0.8958333432674408, + "step": 204 + }, + { + "advantage_max": 1.9331641048192978, + "advantage_mean": -4.097819450432638e-08, + "advantage_min": -0.7692599110305309, + "advantage_std": 0.9998975172638893, + "completion_length": 1505.895881652832, + "epoch": 0.2342857142857143, + "grad_norm": 0.24515070021152496, + "kl": 0.0053730010986328125, + "lambda_div_used": 0.5, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0002, + "reward": 0.45918071921914816, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.45918071921914816, + "reward_after_std": 1.0181467235088348, + "reward_before_mean": 1.3325093338498846, + "reward_before_std": 0.8213769532740116, + "reward_change_max": 0.0004552304744720459, + "reward_change_mean": -0.8733286261558533, + "reward_change_min": -1.388043962419033, + "reward_change_std": 0.5551399476826191, + "reward_std": 1.018146738409996, + "rewards/cosine_scaled_reward": 0.2287546508014202, + "rewards/format_reward": 0.8750000074505806, + "step": 205 + }, + { + "advantage_max": 1.8465500622987747, + "advantage_mean": 1.4280280458134342e-08, + "advantage_min": -0.9646818488836288, + "advantage_std": 0.9998023062944412, + "completion_length": 2294.312545776367, + "epoch": 0.23542857142857143, + "grad_norm": 0.2301677167415619, + "kl": 0.0046977996826171875, + "lambda_div_used": 0.5, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0002, + "reward": -0.29218528768979013, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.29218528768979013, + "reward_after_std": 0.5265999808907509, + "reward_before_mean": 0.11779676657170057, + "reward_before_std": 0.5209280513226986, + "reward_change_max": 0.0007386580109596252, + "reward_change_mean": -0.40998202189803123, + "reward_change_min": -0.7738494612276554, + "reward_change_std": 0.3092892915010452, + "reward_std": 0.5265999883413315, + "rewards/cosine_scaled_reward": -0.2536016311496496, + "rewards/format_reward": 0.625000013038516, + "step": 206 + }, + { + "advantage_max": 1.9277321547269821, + "advantage_mean": -5.587935947293232e-09, + "advantage_min": -0.7697168327867985, + "advantage_std": 0.9998654946684837, + "completion_length": 1900.0416870117188, + "epoch": 0.23657142857142857, + "grad_norm": 0.3748758137226105, + "kl": 0.007366180419921875, + "lambda_div_used": 0.5, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0003, + "reward": -0.02678732480853796, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.02678732480853796, + "reward_after_std": 0.8746708072721958, + "reward_before_mean": 0.4839032180607319, + "reward_before_std": 0.8162506558001041, + "reward_change_max": 0.0, + "reward_change_mean": -0.5106905549764633, + "reward_change_min": -1.037591204047203, + "reward_change_std": 0.3922302946448326, + "reward_std": 0.8746708072721958, + "rewards/cosine_scaled_reward": -0.15388172399252653, + "rewards/format_reward": 0.7916666753590107, + "step": 207 + }, + { + "advantage_max": 1.9064188599586487, + "advantage_mean": 2.1109978876054925e-08, + "advantage_min": -0.9026965498924255, + "advantage_std": 0.9998533874750137, + "completion_length": 1185.145866394043, + "epoch": 0.2377142857142857, + "grad_norm": 0.22525237500667572, + "kl": 0.00638580322265625, + "lambda_div_used": 0.5, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0003, + "reward": 0.2584262453019619, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2584262453019619, + "reward_after_std": 0.7765984460711479, + "reward_before_mean": 1.0444153249263763, + "reward_before_std": 0.6186449788510799, + "reward_change_max": 0.0, + "reward_change_mean": -0.7859890535473824, + "reward_change_min": -1.2380337715148926, + "reward_change_std": 0.4751918613910675, + "reward_std": 0.7765984572470188, + "rewards/cosine_scaled_reward": 0.0534576578065753, + "rewards/format_reward": 0.9375000149011612, + "step": 208 + }, + { + "advantage_max": 1.9135385006666183, + "advantage_mean": -6.5192582443529545e-09, + "advantage_min": -0.9110362008213997, + "advantage_std": 0.9998896718025208, + "completion_length": 1635.7917098999023, + "epoch": 0.23885714285714285, + "grad_norm": 0.2753134071826935, + "kl": 0.009868621826171875, + "lambda_div_used": 0.5, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0004, + "reward": 0.2827942790463567, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2827942790463567, + "reward_after_std": 0.9956619068980217, + "reward_before_mean": 1.0158954737707973, + "reward_before_std": 0.890892380848527, + "reward_change_max": 0.001428067684173584, + "reward_change_mean": -0.733101200312376, + "reward_change_min": -1.203175701200962, + "reward_change_std": 0.5036097802221775, + "reward_std": 0.9956619516015053, + "rewards/cosine_scaled_reward": 0.12253105826675892, + "rewards/format_reward": 0.7708333544433117, + "step": 209 + }, + { + "advantage_max": 1.9230344742536545, + "advantage_mean": -1.862645149230957e-09, + "advantage_min": -0.8080510422587395, + "advantage_std": 0.9998234212398529, + "completion_length": 1600.104232788086, + "epoch": 0.24, + "grad_norm": 0.21205396950244904, + "kl": 0.005344390869140625, + "lambda_div_used": 0.5, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0002, + "reward": 0.10949054697994143, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10949054697994143, + "reward_after_std": 0.6526135057210922, + "reward_before_mean": 0.8106731325387955, + "reward_before_std": 0.505129705183208, + "reward_change_max": 0.0018544942140579224, + "reward_change_mean": -0.7011825982481241, + "reward_change_min": -1.0848392620682716, + "reward_change_std": 0.42527284659445286, + "reward_std": 0.6526135131716728, + "rewards/cosine_scaled_reward": -0.03216344257816672, + "rewards/format_reward": 0.8750000055879354, + "step": 210 + }, + { + "advantage_max": 1.9566478729248047, + "advantage_mean": -3.725290520506519e-09, + "advantage_min": -0.6811323426663876, + "advantage_std": 0.9998495057225227, + "completion_length": 1779.6250610351562, + "epoch": 0.24114285714285713, + "grad_norm": 0.26686230301856995, + "kl": 0.008592605590820312, + "lambda_div_used": 0.5, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0003, + "reward": 0.23951259814202785, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23951259814202785, + "reward_after_std": 0.825482502579689, + "reward_before_mean": 0.990179181098938, + "reward_before_std": 0.6018722131848335, + "reward_change_max": 0.00048663467168807983, + "reward_change_mean": -0.7506665643304586, + "reward_change_min": -1.2245620265603065, + "reward_change_std": 0.4667343068867922, + "reward_std": 0.8254825361073017, + "rewards/cosine_scaled_reward": 0.15133956633508205, + "rewards/format_reward": 0.6875000037252903, + "step": 211 + }, + { + "advantage_max": 1.903679609298706, + "advantage_mean": -3.104407619858307e-09, + "advantage_min": -0.8234128206968307, + "advantage_std": 0.9998500868678093, + "completion_length": 1217.854190826416, + "epoch": 0.2422857142857143, + "grad_norm": 0.2410409301519394, + "kl": 0.00655364990234375, + "lambda_div_used": 0.5, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0003, + "reward": 0.33111227909103036, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33111227909103036, + "reward_after_std": 0.8645887561142445, + "reward_before_mean": 1.150598868727684, + "reward_before_std": 0.7242950107902288, + "reward_change_max": 0.0, + "reward_change_mean": -0.8194866478443146, + "reward_change_min": -1.380204539746046, + "reward_change_std": 0.5173613056540489, + "reward_std": 0.8645887933671474, + "rewards/cosine_scaled_reward": 0.1482161059975624, + "rewards/format_reward": 0.8541666716337204, + "step": 212 + }, + { + "advantage_max": 1.8833979219198227, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.8867584019899368, + "advantage_std": 0.9998794943094254, + "completion_length": 1776.3750648498535, + "epoch": 0.24342857142857144, + "grad_norm": 0.29762017726898193, + "kl": 0.008733749389648438, + "lambda_div_used": 0.5, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0003, + "reward": 0.14349895459599793, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.14349895459599793, + "reward_after_std": 0.9624890685081482, + "reward_before_mean": 0.7755306456238031, + "reward_before_std": 0.9363698288798332, + "reward_change_max": 0.0, + "reward_change_mean": -0.6320316605269909, + "reward_change_min": -1.2054100409150124, + "reward_change_std": 0.4742110073566437, + "reward_std": 0.9624891020357609, + "rewards/cosine_scaled_reward": -0.008068038150668144, + "rewards/format_reward": 0.7916666716337204, + "step": 213 + }, + { + "advantage_max": 1.896130695939064, + "advantage_mean": 3.104409507237449e-10, + "advantage_min": -0.8746867999434471, + "advantage_std": 0.9998463988304138, + "completion_length": 2014.4792175292969, + "epoch": 0.24457142857142858, + "grad_norm": 0.20922031998634338, + "kl": 0.007541656494140625, + "lambda_div_used": 0.5, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0003, + "reward": 0.0535255391150713, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0535255391150713, + "reward_after_std": 0.8194720596075058, + "reward_before_mean": 0.6575473733246326, + "reward_before_std": 0.756176769733429, + "reward_change_max": 0.0028605982661247253, + "reward_change_mean": -0.6040218118578196, + "reward_change_min": -1.0416854172945023, + "reward_change_std": 0.41382226534187794, + "reward_std": 0.8194721043109894, + "rewards/cosine_scaled_reward": -0.03580965753644705, + "rewards/format_reward": 0.7291666865348816, + "step": 214 + }, + { + "advantage_max": 1.915054827928543, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.8241091445088387, + "advantage_std": 0.9998098164796829, + "completion_length": 1411.7500228881836, + "epoch": 0.24571428571428572, + "grad_norm": 0.25436699390411377, + "kl": 0.00496673583984375, + "lambda_div_used": 0.5, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0002, + "reward": -0.07637928635813296, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07637928635813296, + "reward_after_std": 0.5911289788782597, + "reward_before_mean": 0.4801142290234566, + "reward_before_std": 0.47789497673511505, + "reward_change_max": 0.0, + "reward_change_mean": -0.5564935095608234, + "reward_change_min": -0.9470963031053543, + "reward_change_std": 0.33488226495683193, + "reward_std": 0.5911289900541306, + "rewards/cosine_scaled_reward": -0.17660956643521786, + "rewards/format_reward": 0.8333333432674408, + "step": 215 + }, + { + "advantage_max": 1.8878718316555023, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.9693049490451813, + "advantage_std": 0.9998598992824554, + "completion_length": 1302.7292098999023, + "epoch": 0.24685714285714286, + "grad_norm": 0.290322870016098, + "kl": 0.00699615478515625, + "lambda_div_used": 0.5, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0003, + "reward": 0.3085772795602679, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3085772795602679, + "reward_after_std": 0.7910725995898247, + "reward_before_mean": 1.1356257870793343, + "reward_before_std": 0.6947167627513409, + "reward_change_max": 0.0, + "reward_change_mean": -0.8270485177636147, + "reward_change_min": -1.3118191435933113, + "reward_change_std": 0.5104956552386284, + "reward_std": 0.7910726070404053, + "rewards/cosine_scaled_reward": 0.11989622749388218, + "rewards/format_reward": 0.8958333395421505, + "step": 216 + }, + { + "advantage_max": 1.9119762033224106, + "advantage_mean": 1.986821618338297e-08, + "advantage_min": -0.7242433242499828, + "advantage_std": 0.9998660087585449, + "completion_length": 1618.4791946411133, + "epoch": 0.248, + "grad_norm": 0.21120773255825043, + "kl": 0.00627899169921875, + "lambda_div_used": 0.5, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0003, + "reward": 0.20981781790032983, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.20981781790032983, + "reward_after_std": 0.9317884184420109, + "reward_before_mean": 0.905009500682354, + "reward_before_std": 0.818368062376976, + "reward_change_max": 0.0008935406804084778, + "reward_change_mean": -0.6951915994286537, + "reward_change_min": -1.2974179834127426, + "reward_change_std": 0.490953104570508, + "reward_std": 0.9317884258925915, + "rewards/cosine_scaled_reward": 0.056671383790671825, + "rewards/format_reward": 0.7916666679084301, + "step": 217 + }, + { + "advantage_max": 1.918167695403099, + "advantage_mean": 2.7939677238464355e-09, + "advantage_min": -0.767826035618782, + "advantage_std": 0.9998479783535004, + "completion_length": 1574.7292175292969, + "epoch": 0.24914285714285714, + "grad_norm": 0.2458459585905075, + "kl": 0.006359100341796875, + "lambda_div_used": 0.5, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0003, + "reward": -0.017704853788018227, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.017704853788018227, + "reward_after_std": 0.7833386063575745, + "reward_before_mean": 0.5354463215917349, + "reward_before_std": 0.7116248942911625, + "reward_change_max": 0.0005803108215332031, + "reward_change_mean": -0.5531511753797531, + "reward_change_min": -1.0884820893406868, + "reward_change_std": 0.38788705691695213, + "reward_std": 0.7833386063575745, + "rewards/cosine_scaled_reward": -0.15936017641797662, + "rewards/format_reward": 0.854166679084301, + "step": 218 + }, + { + "advantage_max": 1.9283503293991089, + "advantage_mean": 1.862645371275562e-09, + "advantage_min": -0.8017013743519783, + "advantage_std": 0.9998396635055542, + "completion_length": 1339.4791946411133, + "epoch": 0.2502857142857143, + "grad_norm": 0.348197340965271, + "kl": 0.0066318511962890625, + "lambda_div_used": 0.5, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0003, + "reward": 0.24929398368112743, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24929398368112743, + "reward_after_std": 0.7916696332395077, + "reward_before_mean": 1.0231999158859253, + "reward_before_std": 0.6326823104172945, + "reward_change_max": 0.0, + "reward_change_mean": -0.7739059310406446, + "reward_change_min": -1.2800477743148804, + "reward_change_std": 0.4866549037396908, + "reward_std": 0.7916696481406689, + "rewards/cosine_scaled_reward": 0.09493327140808105, + "rewards/format_reward": 0.8333333432674408, + "step": 219 + }, + { + "advantage_max": 1.897721529006958, + "advantage_mean": 1.6142925329809543e-08, + "advantage_min": -0.8538015857338905, + "advantage_std": 0.9997849836945534, + "completion_length": 1803.0416831970215, + "epoch": 0.25142857142857145, + "grad_norm": 0.2825516164302826, + "kl": 0.0066280364990234375, + "lambda_div_used": 0.5, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0003, + "reward": -0.23762081807944924, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.23762081807944924, + "reward_after_std": 0.4904538653790951, + "reward_before_mean": 0.2176198773086071, + "reward_before_std": 0.4150450900197029, + "reward_change_max": 0.0025578737258911133, + "reward_change_mean": -0.4552406966686249, + "reward_change_min": -0.7237915322184563, + "reward_change_std": 0.29777046479284763, + "reward_std": 0.49045388400554657, + "rewards/cosine_scaled_reward": -0.2661900743842125, + "rewards/format_reward": 0.7500000074505806, + "step": 220 + }, + { + "advantage_max": 1.9148018211126328, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.8129096515476704, + "advantage_std": 0.9998365119099617, + "completion_length": 1439.7291831970215, + "epoch": 0.25257142857142856, + "grad_norm": 0.20042772591114044, + "kl": 0.0054264068603515625, + "lambda_div_used": 0.5, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0002, + "reward": 0.16458496823906898, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16458496823906898, + "reward_after_std": 0.7355330400168896, + "reward_before_mean": 0.8835766538977623, + "reward_before_std": 0.5434940056875348, + "reward_change_max": 0.0, + "reward_change_mean": -0.7189916595816612, + "reward_change_min": -1.1088164262473583, + "reward_change_std": 0.4312316067516804, + "reward_std": 0.7355330511927605, + "rewards/cosine_scaled_reward": 0.0251216241158545, + "rewards/format_reward": 0.8333333432674408, + "step": 221 + }, + { + "advantage_max": 1.8958768248558044, + "advantage_mean": -7.916242328320777e-09, + "advantage_min": -0.9265587478876114, + "advantage_std": 0.9998396784067154, + "completion_length": 1538.4167022705078, + "epoch": 0.2537142857142857, + "grad_norm": 0.2274702936410904, + "kl": 0.006367683410644531, + "lambda_div_used": 0.5, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0003, + "reward": 0.12984950304962695, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12984950304962695, + "reward_after_std": 0.7608728632330894, + "reward_before_mean": 0.8141781985759735, + "reward_before_std": 0.6723398230969906, + "reward_change_max": 0.0017403960227966309, + "reward_change_mean": -0.6843287535011768, + "reward_change_min": -1.0775920823216438, + "reward_change_std": 0.4317518901079893, + "reward_std": 0.7608729153871536, + "rewards/cosine_scaled_reward": 0.011255767196416855, + "rewards/format_reward": 0.791666679084301, + "step": 222 + }, + { + "advantage_max": 1.967112883925438, + "advantage_mean": 1.893689272058907e-08, + "advantage_min": -0.8013791739940643, + "advantage_std": 0.9998081922531128, + "completion_length": 1757.1667022705078, + "epoch": 0.25485714285714284, + "grad_norm": 0.24869827926158905, + "kl": 0.0062732696533203125, + "lambda_div_used": 0.5, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0003, + "reward": 0.12367848050780594, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12367848050780594, + "reward_after_std": 0.6536546088755131, + "reward_before_mean": 0.8322453033179045, + "reward_before_std": 0.45517710596323013, + "reward_change_max": 0.0, + "reward_change_mean": -0.7085668370127678, + "reward_change_min": -1.0539904236793518, + "reward_change_std": 0.39596980810165405, + "reward_std": 0.6536546275019646, + "rewards/cosine_scaled_reward": 0.06195598840713501, + "rewards/format_reward": 0.7083333395421505, + "step": 223 + }, + { + "advantage_max": 1.9077493101358414, + "advantage_mean": 1.2417635808503746e-09, + "advantage_min": -0.7769346758723259, + "advantage_std": 0.9998728260397911, + "completion_length": 1987.6875457763672, + "epoch": 0.256, + "grad_norm": 0.20385780930519104, + "kl": 0.0055694580078125, + "lambda_div_used": 0.5, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0002, + "reward": 0.21556761115789413, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.21556761115789413, + "reward_after_std": 0.9941208474338055, + "reward_before_mean": 0.8981098346412182, + "reward_before_std": 0.9162913672626019, + "reward_change_max": 0.0, + "reward_change_mean": -0.682542210444808, + "reward_change_min": -1.266989454627037, + "reward_change_std": 0.4777718782424927, + "reward_std": 0.9941208772361279, + "rewards/cosine_scaled_reward": 0.032388224732130766, + "rewards/format_reward": 0.8333333432674408, + "step": 224 + }, + { + "advantage_max": 1.8822802901268005, + "advantage_mean": -8.692344177774203e-09, + "advantage_min": -0.8069443702697754, + "advantage_std": 0.9998818635940552, + "completion_length": 2217.208396911621, + "epoch": 0.2571428571428571, + "grad_norm": 0.2534785270690918, + "kl": 0.010219573974609375, + "lambda_div_used": 0.5, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0004, + "reward": 0.05662869522348046, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05662869522348046, + "reward_after_std": 0.9689956679940224, + "reward_before_mean": 0.6146425190381706, + "reward_before_std": 0.9694811850786209, + "reward_change_max": 0.0, + "reward_change_mean": -0.558013841509819, + "reward_change_min": -1.179277814924717, + "reward_change_std": 0.4618273414671421, + "reward_std": 0.9689956903457642, + "rewards/cosine_scaled_reward": -0.015595396980643272, + "rewards/format_reward": 0.6458333432674408, + "step": 225 + }, + { + "advantage_max": 1.8899008184671402, + "advantage_mean": 8.537124229768267e-09, + "advantage_min": -0.8222019150853157, + "advantage_std": 0.999876007437706, + "completion_length": 1630.270881652832, + "epoch": 0.2582857142857143, + "grad_norm": 0.20453056693077087, + "kl": 0.006317138671875, + "lambda_div_used": 0.5, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0003, + "reward": 0.27888658829033375, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.27888658829033375, + "reward_after_std": 0.9403040483593941, + "reward_before_mean": 1.036810528486967, + "reward_before_std": 0.8681198097765446, + "reward_change_max": 0.0, + "reward_change_mean": -0.7579239271581173, + "reward_change_min": -1.3337569385766983, + "reward_change_std": 0.5241223052144051, + "reward_std": 0.9403041005134583, + "rewards/cosine_scaled_reward": 0.09132191189564764, + "rewards/format_reward": 0.8541666716337204, + "step": 226 + }, + { + "advantage_max": 1.971393644809723, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.6958120688796043, + "advantage_std": 0.9998596981167793, + "completion_length": 1262.6667098999023, + "epoch": 0.25942857142857145, + "grad_norm": 0.3305748999118805, + "kl": 0.009073257446289062, + "lambda_div_used": 0.5, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0004, + "reward": 0.11022061249241233, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11022061249241233, + "reward_after_std": 0.8254881426692009, + "reward_before_mean": 0.7461133226752281, + "reward_before_std": 0.6367629840970039, + "reward_change_max": 0.0, + "reward_change_mean": -0.6358926966786385, + "reward_change_min": -1.0167369693517685, + "reward_change_std": 0.3645213767886162, + "reward_std": 0.8254881650209427, + "rewards/cosine_scaled_reward": -0.11652669706381857, + "rewards/format_reward": 0.9791666716337204, + "step": 227 + }, + { + "advantage_max": 1.9378290474414825, + "advantage_mean": -2.2041301228625798e-08, + "advantage_min": -0.7519373595714569, + "advantage_std": 0.9998505935072899, + "completion_length": 1381.7292022705078, + "epoch": 0.26057142857142856, + "grad_norm": 0.234617680311203, + "kl": 0.00640869140625, + "lambda_div_used": 0.5, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0003, + "reward": 0.2944382159039378, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2944382159039378, + "reward_after_std": 0.8001749962568283, + "reward_before_mean": 1.0973990336060524, + "reward_before_std": 0.5767837278544903, + "reward_change_max": 0.0, + "reward_change_mean": -0.8029608391225338, + "reward_change_min": -1.2629943564534187, + "reward_change_std": 0.46727374754846096, + "reward_std": 0.8001750260591507, + "rewards/cosine_scaled_reward": 0.142449501901865, + "rewards/format_reward": 0.8125000149011612, + "step": 228 + }, + { + "advantage_max": 1.9648645520210266, + "advantage_mean": -1.4901161971003773e-08, + "advantage_min": -0.7115680947899818, + "advantage_std": 0.9998573809862137, + "completion_length": 1574.3125762939453, + "epoch": 0.26171428571428573, + "grad_norm": 0.2357558012008667, + "kl": 0.007843017578125, + "lambda_div_used": 0.5, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0003, + "reward": 0.1340332217514515, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1340332217514515, + "reward_after_std": 0.8145388253033161, + "reward_before_mean": 0.7931003291159868, + "reward_before_std": 0.6152823623269796, + "reward_change_max": 0.0, + "reward_change_mean": -0.6590671092271805, + "reward_change_min": -1.0245969370007515, + "reward_change_std": 0.38715394400060177, + "reward_std": 0.8145388327538967, + "rewards/cosine_scaled_reward": -0.009699843125417829, + "rewards/format_reward": 0.812500013038516, + "step": 229 + }, + { + "advantage_max": 1.8860953599214554, + "advantage_mean": 1.8626452658043746e-08, + "advantage_min": -0.9193001091480255, + "advantage_std": 0.9998148381710052, + "completion_length": 1827.3542175292969, + "epoch": 0.26285714285714284, + "grad_norm": 0.22835993766784668, + "kl": 0.008272171020507812, + "lambda_div_used": 0.5, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0003, + "reward": -0.1569704683497548, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1569704683497548, + "reward_after_std": 0.6100475452840328, + "reward_before_mean": 0.33352593518793583, + "reward_before_std": 0.544262558221817, + "reward_change_max": 0.0, + "reward_change_mean": -0.49049638770520687, + "reward_change_min": -0.8691147491335869, + "reward_change_std": 0.3244625609368086, + "reward_std": 0.6100475862622261, + "rewards/cosine_scaled_reward": -0.21865370776504278, + "rewards/format_reward": 0.7708333507180214, + "step": 230 + }, + { + "advantage_max": 1.9178503304719925, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.8343557715415955, + "advantage_std": 0.9998718351125717, + "completion_length": 1749.9166946411133, + "epoch": 0.264, + "grad_norm": 0.22359751164913177, + "kl": 0.008632659912109375, + "lambda_div_used": 0.5, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0003, + "reward": 0.22381134470924735, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22381134470924735, + "reward_after_std": 0.86066984385252, + "reward_before_mean": 0.9546388499438763, + "reward_before_std": 0.7596081979572773, + "reward_change_max": 0.0020909160375595093, + "reward_change_mean": -0.7308275178074837, + "reward_change_min": -1.3097406700253487, + "reward_change_std": 0.4961383566260338, + "reward_std": 0.8606698885560036, + "rewards/cosine_scaled_reward": 0.07106942869722843, + "rewards/format_reward": 0.8125000111758709, + "step": 231 + }, + { + "advantage_max": 1.9307349771261215, + "advantage_mean": 7.574757443506996e-08, + "advantage_min": -0.7901148796081543, + "advantage_std": 0.9997728392481804, + "completion_length": 1812.833381652832, + "epoch": 0.2651428571428571, + "grad_norm": 0.2341795712709427, + "kl": 0.008274078369140625, + "lambda_div_used": 0.5, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0003, + "reward": -0.07405400322750211, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.07405400322750211, + "reward_after_std": 0.7224782053381205, + "reward_before_mean": 0.448210122063756, + "reward_before_std": 0.6184751987457275, + "reward_change_max": 0.00040543824434280396, + "reward_change_mean": -0.5222641108557582, + "reward_change_min": -0.9738937392830849, + "reward_change_std": 0.34536191495135427, + "reward_std": 0.7224782090634108, + "rewards/cosine_scaled_reward": -0.16131161339581013, + "rewards/format_reward": 0.7708333432674408, + "step": 232 + }, + { + "advantage_max": 1.9683336466550827, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.7432735189795494, + "advantage_std": 0.999840646982193, + "completion_length": 1129.708366394043, + "epoch": 0.2662857142857143, + "grad_norm": 0.3380148708820343, + "kl": 0.0073413848876953125, + "lambda_div_used": 0.5, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0003, + "reward": -0.01638099644333124, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.01638099644333124, + "reward_after_std": 0.7068706303834915, + "reward_before_mean": 0.5511052198708057, + "reward_before_std": 0.5402777791023254, + "reward_change_max": 0.0, + "reward_change_mean": -0.5674862191081047, + "reward_change_min": -0.8964772894978523, + "reward_change_std": 0.3298127166926861, + "reward_std": 0.7068706452846527, + "rewards/cosine_scaled_reward": -0.19319739658385515, + "rewards/format_reward": 0.9375000074505806, + "step": 233 + }, + { + "advantage_max": 1.9574606865644455, + "advantage_mean": 3.725291075618031e-09, + "advantage_min": -0.7137792631983757, + "advantage_std": 0.9998077154159546, + "completion_length": 1706.3542175292969, + "epoch": 0.2674285714285714, + "grad_norm": 0.23307549953460693, + "kl": 0.0071849822998046875, + "lambda_div_used": 0.5, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0003, + "reward": 0.01580604538321495, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.01580604538321495, + "reward_after_std": 0.702125009149313, + "reward_before_mean": 0.62279331125319, + "reward_before_std": 0.5630986513569951, + "reward_change_max": 0.0, + "reward_change_mean": -0.6069872789084911, + "reward_change_min": -0.9930330999195576, + "reward_change_std": 0.39121099561452866, + "reward_std": 0.7021250482648611, + "rewards/cosine_scaled_reward": -0.05318668344989419, + "rewards/format_reward": 0.7291666772216558, + "step": 234 + }, + { + "advantage_max": 1.9572331607341766, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.7316535785794258, + "advantage_std": 0.9998847916722298, + "completion_length": 1356.958381652832, + "epoch": 0.26857142857142857, + "grad_norm": 0.2740318775177002, + "kl": 0.007434844970703125, + "lambda_div_used": 0.5, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0003, + "reward": 0.38920610025525093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.38920610025525093, + "reward_after_std": 0.9113083258271217, + "reward_before_mean": 1.2318087760359049, + "reward_before_std": 0.6570627186447382, + "reward_change_max": 0.0014033988118171692, + "reward_change_mean": -0.8426027111709118, + "reward_change_min": -1.267478797584772, + "reward_change_std": 0.48250637575984, + "reward_std": 0.9113083481788635, + "rewards/cosine_scaled_reward": 0.1784043600782752, + "rewards/format_reward": 0.8750000055879354, + "step": 235 + }, + { + "advantage_max": 1.9679247587919235, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.6973557993769646, + "advantage_std": 0.9998724982142448, + "completion_length": 1780.5417251586914, + "epoch": 0.26971428571428574, + "grad_norm": 0.22148916125297546, + "kl": 0.0067691802978515625, + "lambda_div_used": 0.5, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0003, + "reward": 0.02068489557132125, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.02068489557132125, + "reward_after_std": 0.9226878210902214, + "reward_before_mean": 0.5527768121100962, + "reward_before_std": 0.8031711392104626, + "reward_change_max": 0.0, + "reward_change_mean": -0.5320919454097748, + "reward_change_min": -0.9312163218855858, + "reward_change_std": 0.3605116531252861, + "reward_std": 0.9226878210902214, + "rewards/cosine_scaled_reward": -0.10902826674282551, + "rewards/format_reward": 0.770833333954215, + "step": 236 + }, + { + "advantage_max": 1.9593615680932999, + "advantage_mean": -2.4835269063494536e-08, + "advantage_min": -0.7497310638427734, + "advantage_std": 0.9998056143522263, + "completion_length": 1496.6666870117188, + "epoch": 0.27085714285714285, + "grad_norm": 0.21749630570411682, + "kl": 0.006267547607421875, + "lambda_div_used": 0.5, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0003, + "reward": 0.08538356237113476, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08538356237113476, + "reward_after_std": 0.6719049979001284, + "reward_before_mean": 0.7545446641743183, + "reward_before_std": 0.4781134817749262, + "reward_change_max": 0.000691726803779602, + "reward_change_mean": -0.6691611055284739, + "reward_change_min": -1.0085529759526253, + "reward_change_std": 0.3762901732698083, + "reward_std": 0.6719050072133541, + "rewards/cosine_scaled_reward": -0.03939435165375471, + "rewards/format_reward": 0.8333333432674408, + "step": 237 + }, + { + "advantage_max": 1.9590356647968292, + "advantage_mean": -1.1175871450497255e-08, + "advantage_min": -0.745018545538187, + "advantage_std": 0.999871663749218, + "completion_length": 1041.8750381469727, + "epoch": 0.272, + "grad_norm": 0.23885765671730042, + "kl": 0.007602691650390625, + "lambda_div_used": 0.5, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0003, + "reward": 0.26911926828324795, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26911926828324795, + "reward_after_std": 0.8938291482627392, + "reward_before_mean": 1.0215275082737207, + "reward_before_std": 0.6879322770982981, + "reward_change_max": 0.00012250244617462158, + "reward_change_mean": -0.7524082288146019, + "reward_change_min": -1.1642607524991035, + "reward_change_std": 0.4326386693865061, + "reward_std": 0.8938291892409325, + "rewards/cosine_scaled_reward": 0.03159707225859165, + "rewards/format_reward": 0.9583333432674408, + "step": 238 + }, + { + "advantage_max": 1.9281752556562424, + "advantage_mean": -3.290673267208888e-08, + "advantage_min": -0.7456583306193352, + "advantage_std": 0.999865360558033, + "completion_length": 1458.6667098999023, + "epoch": 0.27314285714285713, + "grad_norm": 0.21850885450839996, + "kl": 0.0056705474853515625, + "lambda_div_used": 0.5, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0002, + "reward": 0.46745580551214516, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.46745580551214516, + "reward_after_std": 0.8389881551265717, + "reward_before_mean": 1.4097450375556946, + "reward_before_std": 0.6188498791307211, + "reward_change_max": 0.0, + "reward_change_mean": -0.942289263010025, + "reward_change_min": -1.4645648337900639, + "reward_change_std": 0.570222893729806, + "reward_std": 0.8389881774783134, + "rewards/cosine_scaled_reward": 0.2986225029453635, + "rewards/format_reward": 0.8125000018626451, + "step": 239 + }, + { + "advantage_max": 1.943466305732727, + "advantage_mean": 1.552204287325054e-08, + "advantage_min": -0.7473693750798702, + "advantage_std": 0.9998158067464828, + "completion_length": 1844.145866394043, + "epoch": 0.2742857142857143, + "grad_norm": 0.37496596574783325, + "kl": 0.011318206787109375, + "lambda_div_used": 0.5, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0005, + "reward": -0.23318948596715927, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.23318948596715927, + "reward_after_std": 0.6114565394818783, + "reward_before_mean": 0.1918274350464344, + "reward_before_std": 0.5235918611288071, + "reward_change_max": 0.0006126239895820618, + "reward_change_mean": -0.42501691612415016, + "reward_change_min": -0.7172445729374886, + "reward_change_std": 0.2740626563318074, + "reward_std": 0.6114565506577492, + "rewards/cosine_scaled_reward": -0.2686696262098849, + "rewards/format_reward": 0.7291666846722364, + "step": 240 + }, + { + "advantage_max": 1.9376345574855804, + "advantage_mean": 1.5522043428362053e-08, + "advantage_min": -0.7810809761285782, + "advantage_std": 0.999825157225132, + "completion_length": 1782.8333587646484, + "epoch": 0.2754285714285714, + "grad_norm": 0.26208123564720154, + "kl": 0.0093231201171875, + "lambda_div_used": 0.5, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0004, + "reward": -0.16795344126876444, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.16795344126876444, + "reward_after_std": 0.6143370009958744, + "reward_before_mean": 0.309583880007267, + "reward_before_std": 0.496045283973217, + "reward_change_max": 0.0004753321409225464, + "reward_change_mean": -0.47753732465207577, + "reward_change_min": -0.7713849879801273, + "reward_change_std": 0.2903636433184147, + "reward_std": 0.6143370270729065, + "rewards/cosine_scaled_reward": -0.22020806092768908, + "rewards/format_reward": 0.7500000093132257, + "step": 241 + }, + { + "advantage_max": 1.9451895356178284, + "advantage_mean": 1.6653345369377348e-16, + "advantage_min": -0.7966984063386917, + "advantage_std": 0.9998231902718544, + "completion_length": 1370.9791946411133, + "epoch": 0.2765714285714286, + "grad_norm": 0.32628756761550903, + "kl": 0.009744644165039062, + "lambda_div_used": 0.5, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0004, + "reward": -0.0009593330323696136, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0009593330323696136, + "reward_after_std": 0.7521246317774057, + "reward_before_mean": 0.5692737740464509, + "reward_before_std": 0.6212347452528775, + "reward_change_max": 0.0, + "reward_change_mean": -0.5702331103384495, + "reward_change_min": -0.8771290257573128, + "reward_change_std": 0.34795637615025043, + "reward_std": 0.7521246485412121, + "rewards/cosine_scaled_reward": -0.16327978996559978, + "rewards/format_reward": 0.8958333432674408, + "step": 242 + }, + { + "advantage_max": 1.9557791501283646, + "advantage_mean": -4.967053768289986e-09, + "advantage_min": -0.7319461777806282, + "advantage_std": 0.9998825341463089, + "completion_length": 1414.0416946411133, + "epoch": 0.2777142857142857, + "grad_norm": 0.2184247523546219, + "kl": 0.0065326690673828125, + "lambda_div_used": 0.5, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0003, + "reward": 0.2227376624941826, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2227376624941826, + "reward_after_std": 0.9276175498962402, + "reward_before_mean": 0.9240134842693806, + "reward_before_std": 0.7810794413089752, + "reward_change_max": 0.0009024292230606079, + "reward_change_mean": -0.7012757956981659, + "reward_change_min": -1.248128518462181, + "reward_change_std": 0.4496439266949892, + "reward_std": 0.9276175945997238, + "rewards/cosine_scaled_reward": 0.014090052805840969, + "rewards/format_reward": 0.8958333395421505, + "step": 243 + }, + { + "advantage_max": 1.9719029814004898, + "advantage_mean": -5.89837656495007e-09, + "advantage_min": -0.6661188155412674, + "advantage_std": 0.9999043568968773, + "completion_length": 1704.1458854675293, + "epoch": 0.27885714285714286, + "grad_norm": 0.23143883049488068, + "kl": 0.00717926025390625, + "lambda_div_used": 0.5, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0003, + "reward": 0.4157985597848892, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4157985597848892, + "reward_after_std": 1.1085382103919983, + "reward_before_mean": 1.2161034047603607, + "reward_before_std": 0.8881407734006643, + "reward_change_max": 0.0004424676299095154, + "reward_change_mean": -0.8003048803657293, + "reward_change_min": -1.3503287062048912, + "reward_change_std": 0.49696409702301025, + "reward_std": 1.108538269996643, + "rewards/cosine_scaled_reward": 0.20180170447565615, + "rewards/format_reward": 0.8125000055879354, + "step": 244 + }, + { + "advantage_max": 1.932635858654976, + "advantage_mean": -1.76951293617833e-08, + "advantage_min": -0.7667308263480663, + "advantage_std": 0.999868243932724, + "completion_length": 1864.5208740234375, + "epoch": 0.28, + "grad_norm": 0.2242145985364914, + "kl": 0.0067577362060546875, + "lambda_div_used": 0.5, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0003, + "reward": 0.09724759729579091, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09724759729579091, + "reward_after_std": 0.9333448335528374, + "reward_before_mean": 0.6936495788395405, + "reward_before_std": 0.8179382756352425, + "reward_change_max": 0.0, + "reward_change_mean": -0.5964019894599915, + "reward_change_min": -1.0710543617606163, + "reward_change_std": 0.4002857990562916, + "reward_std": 0.9333448670804501, + "rewards/cosine_scaled_reward": -0.02817522920668125, + "rewards/format_reward": 0.7500000093132257, + "step": 245 + }, + { + "advantage_max": 1.9313433915376663, + "advantage_mean": 1.3659398501175701e-08, + "advantage_min": -0.7306452617049217, + "advantage_std": 0.999847486615181, + "completion_length": 1426.0000610351562, + "epoch": 0.28114285714285714, + "grad_norm": 0.21626295149326324, + "kl": 0.007900238037109375, + "lambda_div_used": 0.5, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0003, + "reward": 0.04067504871636629, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.04067504871636629, + "reward_after_std": 0.7928140051662922, + "reward_before_mean": 0.6401501521468163, + "reward_before_std": 0.6862226165831089, + "reward_change_max": 0.000470772385597229, + "reward_change_mean": -0.5994750969111919, + "reward_change_min": -1.0783002860844135, + "reward_change_std": 0.39229152724146843, + "reward_std": 0.7928140312433243, + "rewards/cosine_scaled_reward": -0.12784160394221544, + "rewards/format_reward": 0.8958333395421505, + "step": 246 + }, + { + "advantage_max": 1.909987896680832, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.8228132203221321, + "advantage_std": 0.9998472183942795, + "completion_length": 1998.7500457763672, + "epoch": 0.2822857142857143, + "grad_norm": 0.23221814632415771, + "kl": 0.0082244873046875, + "lambda_div_used": 0.5, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0003, + "reward": -0.10598975839093328, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.10598975839093328, + "reward_after_std": 0.7731035724282265, + "reward_before_mean": 0.37022377736866474, + "reward_before_std": 0.7434089183807373, + "reward_change_max": 0.0009429380297660828, + "reward_change_mean": -0.4762135464698076, + "reward_change_min": -0.9990293830633163, + "reward_change_std": 0.36950034089386463, + "reward_std": 0.7731035761535168, + "rewards/cosine_scaled_reward": -0.15863812156021595, + "rewards/format_reward": 0.687500013038516, + "step": 247 + }, + { + "advantage_max": 1.9852658063173294, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.6675059422850609, + "advantage_std": 0.9998695775866508, + "completion_length": 1363.3125305175781, + "epoch": 0.2834285714285714, + "grad_norm": 0.3437131643295288, + "kl": 0.008054733276367188, + "lambda_div_used": 0.5, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0003, + "reward": 0.3367419361602515, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3367419361602515, + "reward_after_std": 0.8527059704065323, + "reward_before_mean": 1.1518472461029887, + "reward_before_std": 0.582564564421773, + "reward_change_max": 0.0, + "reward_change_mean": -0.8151053376495838, + "reward_change_min": -1.2186406515538692, + "reward_change_std": 0.45573683083057404, + "reward_std": 0.8527059927582741, + "rewards/cosine_scaled_reward": 0.15925694815814495, + "rewards/format_reward": 0.833333333954215, + "step": 248 + }, + { + "advantage_max": 1.9299704134464264, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.759776271879673, + "advantage_std": 0.9998761713504791, + "completion_length": 1248.5000457763672, + "epoch": 0.2845714285714286, + "grad_norm": 0.30233049392700195, + "kl": 0.012414932250976562, + "lambda_div_used": 0.5, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0005, + "reward": 0.4085944064427167, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4085944064427167, + "reward_after_std": 0.8884996175765991, + "reward_before_mean": 1.2769385250285268, + "reward_before_std": 0.6949853375554085, + "reward_change_max": 0.0018889382481575012, + "reward_change_mean": -0.8683441504836082, + "reward_change_min": -1.3353769183158875, + "reward_change_std": 0.5192967671900988, + "reward_std": 0.8884996548295021, + "rewards/cosine_scaled_reward": 0.2218025820911862, + "rewards/format_reward": 0.8333333395421505, + "step": 249 + }, + { + "advantage_max": 1.897703930735588, + "advantage_mean": 2.514571070810767e-08, + "advantage_min": -0.8732591867446899, + "advantage_std": 0.9998427405953407, + "completion_length": 1361.4583702087402, + "epoch": 0.2857142857142857, + "grad_norm": 0.3435612916946411, + "kl": 0.008312225341796875, + "lambda_div_used": 0.5, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0003, + "reward": 0.05800286494195461, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05800286494195461, + "reward_after_std": 0.7511843629181385, + "reward_before_mean": 0.6857307204045355, + "reward_before_std": 0.6802803799510002, + "reward_change_max": 0.0, + "reward_change_mean": -0.6277278400957584, + "reward_change_min": -1.1124539822340012, + "reward_change_std": 0.41588929295539856, + "reward_std": 0.7511844001710415, + "rewards/cosine_scaled_reward": -0.11546798469498754, + "rewards/format_reward": 0.916666679084301, + "step": 250 + }, + { + "advantage_max": 1.9394332319498062, + "advantage_mean": -1.2417634476236117e-08, + "advantage_min": -0.8116177469491959, + "advantage_std": 0.9998854398727417, + "completion_length": 1301.6875381469727, + "epoch": 0.28685714285714287, + "grad_norm": 0.26373091340065, + "kl": 0.009695053100585938, + "lambda_div_used": 0.5, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0004, + "reward": 0.24317791312932968, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.24317791312932968, + "reward_after_std": 0.9907942861318588, + "reward_before_mean": 0.9422217644751072, + "reward_before_std": 0.8384118843823671, + "reward_change_max": 0.0, + "reward_change_mean": -0.6990438550710678, + "reward_change_min": -1.1878609210252762, + "reward_change_std": 0.4547945335507393, + "reward_std": 0.9907942861318588, + "rewards/cosine_scaled_reward": 0.03361086605582386, + "rewards/format_reward": 0.8750000074505806, + "step": 251 + }, + { + "advantage_max": 1.9412772208452225, + "advantage_mean": 1.490116141589226e-08, + "advantage_min": -0.8077448084950447, + "advantage_std": 0.9998365715146065, + "completion_length": 1653.0208587646484, + "epoch": 0.288, + "grad_norm": 0.22626619040966034, + "kl": 0.0092926025390625, + "lambda_div_used": 0.5, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0004, + "reward": -0.016541813500225544, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.016541813500225544, + "reward_after_std": 0.6735903918743134, + "reward_before_mean": 0.5694787334650755, + "reward_before_std": 0.5151693597435951, + "reward_change_max": 0.0, + "reward_change_mean": -0.5860205516219139, + "reward_change_min": -0.8795042932033539, + "reward_change_std": 0.3404890410602093, + "reward_std": 0.6735904067754745, + "rewards/cosine_scaled_reward": -0.1110939746722579, + "rewards/format_reward": 0.7916666846722364, + "step": 252 + }, + { + "advantage_max": 1.8977369666099548, + "advantage_mean": 4.9670543234014986e-09, + "advantage_min": -0.7935145944356918, + "advantage_std": 0.9998346045613289, + "completion_length": 1753.9167175292969, + "epoch": 0.28914285714285715, + "grad_norm": 0.2937052547931671, + "kl": 0.013032913208007812, + "lambda_div_used": 0.5, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0005, + "reward": 0.07536422368139029, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07536422368139029, + "reward_after_std": 0.8575031999498606, + "reward_before_mean": 0.6837189728394151, + "reward_before_std": 0.8161356411874294, + "reward_change_max": 0.0, + "reward_change_mean": -0.6083547510206699, + "reward_change_min": -1.1546659246087074, + "reward_change_std": 0.4495114888995886, + "reward_std": 0.8575032278895378, + "rewards/cosine_scaled_reward": -0.05397386848926544, + "rewards/format_reward": 0.7916666828095913, + "step": 253 + }, + { + "advantage_max": 1.874565601348877, + "advantage_mean": 5.587935669737476e-09, + "advantage_min": -0.8698348104953766, + "advantage_std": 0.9998382553458214, + "completion_length": 1759.9791946411133, + "epoch": 0.29028571428571426, + "grad_norm": 0.3259371221065521, + "kl": 0.012237548828125, + "lambda_div_used": 0.5, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0005, + "reward": -0.029336320236325264, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.029336320236325264, + "reward_after_std": 0.7416080571711063, + "reward_before_mean": 0.5269271868746728, + "reward_before_std": 0.6862723678350449, + "reward_change_max": 0.00011439621448516846, + "reward_change_mean": -0.5562635250389576, + "reward_change_min": -0.9738981239497662, + "reward_change_std": 0.38014569878578186, + "reward_std": 0.7416080869734287, + "rewards/cosine_scaled_reward": -0.142786405980587, + "rewards/format_reward": 0.812500013038516, + "step": 254 + }, + { + "advantage_max": 1.9653310924768448, + "advantage_mean": 3.7873786884468075e-08, + "advantage_min": -0.6673059165477753, + "advantage_std": 0.9997921586036682, + "completion_length": 1813.2917098999023, + "epoch": 0.2914285714285714, + "grad_norm": 0.22582511603832245, + "kl": 0.009113311767578125, + "lambda_div_used": 0.5, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0004, + "reward": -0.19223103299736977, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.19223103299736977, + "reward_after_std": 0.6329327449202538, + "reward_before_mean": 0.2562922164797783, + "reward_before_std": 0.4785165935754776, + "reward_change_max": 0.00114508718252182, + "reward_change_mean": -0.4485232476145029, + "reward_change_min": -0.7125186286866665, + "reward_change_std": 0.26121316477656364, + "reward_std": 0.632932759821415, + "rewards/cosine_scaled_reward": -0.23643723208806477, + "rewards/format_reward": 0.7291666716337204, + "step": 255 + }, + { + "advantage_max": 1.922520250082016, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.8391077145934105, + "advantage_std": 0.9998262971639633, + "completion_length": 1616.6458587646484, + "epoch": 0.2925714285714286, + "grad_norm": 0.289453387260437, + "kl": 0.010000228881835938, + "lambda_div_used": 0.5, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0004, + "reward": 0.08426090609282255, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08426090609282255, + "reward_after_std": 0.7107764109969139, + "reward_before_mean": 0.7408394683152437, + "reward_before_std": 0.5993353240191936, + "reward_change_max": 0.0, + "reward_change_mean": -0.6565785631537437, + "reward_change_min": -1.0525353848934174, + "reward_change_std": 0.40571545250713825, + "reward_std": 0.7107764147222042, + "rewards/cosine_scaled_reward": -0.04624694274389185, + "rewards/format_reward": 0.8333333469927311, + "step": 256 + }, + { + "advantage_max": 1.7898119240999222, + "advantage_mean": -6.519258355375257e-09, + "advantage_min": -1.0971575528383255, + "advantage_std": 0.9998827576637268, + "completion_length": 1864.1250610351562, + "epoch": 0.2937142857142857, + "grad_norm": 0.25969284772872925, + "kl": 0.00867462158203125, + "lambda_div_used": 0.5, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0003, + "reward": 0.293114073574543, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.293114073574543, + "reward_after_std": 0.9689504988491535, + "reward_before_mean": 1.0681517720222473, + "reward_before_std": 1.044908344745636, + "reward_change_max": 0.0009504184126853943, + "reward_change_mean": -0.7750377468764782, + "reward_change_min": -1.4173622876405716, + "reward_change_std": 0.6037551872432232, + "reward_std": 0.9689505062997341, + "rewards/cosine_scaled_reward": 0.15907588601112366, + "rewards/format_reward": 0.7500000260770321, + "step": 257 + }, + { + "advantage_max": 1.913239747285843, + "advantage_mean": -2.483527050678447e-09, + "advantage_min": -0.7339973300695419, + "advantage_std": 0.9998701959848404, + "completion_length": 1717.6250534057617, + "epoch": 0.2948571428571429, + "grad_norm": 0.2277102768421173, + "kl": 0.007541656494140625, + "lambda_div_used": 0.5, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0003, + "reward": 0.14000254310667515, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14000254310667515, + "reward_after_std": 0.9483177699148655, + "reward_before_mean": 0.7783935330808163, + "reward_before_std": 0.8779289349913597, + "reward_change_max": 0.0, + "reward_change_mean": -0.6383909955620766, + "reward_change_min": -1.2183105871081352, + "reward_change_std": 0.4686681590974331, + "reward_std": 0.9483177699148655, + "rewards/cosine_scaled_reward": -0.03788657521363348, + "rewards/format_reward": 0.8541666716337204, + "step": 258 + }, + { + "advantage_max": 1.960391491651535, + "advantage_mean": -1.1175871339474952e-08, + "advantage_min": -0.6969268918037415, + "advantage_std": 0.9998453557491302, + "completion_length": 1366.3958740234375, + "epoch": 0.296, + "grad_norm": 0.27390286326408386, + "kl": 0.00946807861328125, + "lambda_div_used": 0.5, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0004, + "reward": 0.17380837351083755, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17380837351083755, + "reward_after_std": 0.709467351436615, + "reward_before_mean": 0.9051290564239025, + "reward_before_std": 0.48298365622758865, + "reward_change_max": 0.0, + "reward_change_mean": -0.7313206605613232, + "reward_change_min": -1.1136563420295715, + "reward_change_std": 0.40611193887889385, + "reward_std": 0.7094673663377762, + "rewards/cosine_scaled_reward": 0.015064499340951443, + "rewards/format_reward": 0.8750000055879354, + "step": 259 + }, + { + "advantage_max": 1.902090847492218, + "advantage_mean": -5.551115123125783e-16, + "advantage_min": -0.8360799662768841, + "advantage_std": 0.9998557269573212, + "completion_length": 1141.3958644866943, + "epoch": 0.29714285714285715, + "grad_norm": 0.27379411458969116, + "kl": 0.00658416748046875, + "lambda_div_used": 0.5, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0003, + "reward": 0.33953036181628704, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.33953036181628704, + "reward_after_std": 0.8164723627269268, + "reward_before_mean": 1.1856674198061228, + "reward_before_std": 0.6960519719868898, + "reward_change_max": 0.00030853599309921265, + "reward_change_mean": -0.8461369834840298, + "reward_change_min": -1.3422378227114677, + "reward_change_std": 0.533370828256011, + "reward_std": 0.816472377628088, + "rewards/cosine_scaled_reward": 0.10325032752007246, + "rewards/format_reward": 0.9791666716337204, + "step": 260 + }, + { + "advantage_max": 1.9397707134485245, + "advantage_mean": 5.215406695402436e-08, + "advantage_min": -0.7806360647082329, + "advantage_std": 0.9997915849089622, + "completion_length": 2092.395866394043, + "epoch": 0.29828571428571427, + "grad_norm": 0.2074173092842102, + "kl": 0.008243560791015625, + "lambda_div_used": 0.5, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0003, + "reward": -0.11741140764206648, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11741140764206648, + "reward_after_std": 0.7313024029135704, + "reward_before_mean": 0.3657475281506777, + "reward_before_std": 0.625818207859993, + "reward_change_max": 0.0005079880356788635, + "reward_change_mean": -0.4831589162349701, + "reward_change_min": -0.8495705388486385, + "reward_change_std": 0.30140868201851845, + "reward_std": 0.7313024085015059, + "rewards/cosine_scaled_reward": -0.11920957826077938, + "rewards/format_reward": 0.6041666716337204, + "step": 261 + }, + { + "advantage_max": 1.9241239577531815, + "advantage_mean": 1.9247333560290514e-08, + "advantage_min": -0.7412631884217262, + "advantage_std": 0.999830037355423, + "completion_length": 1531.395896911621, + "epoch": 0.29942857142857143, + "grad_norm": 0.2947309613227844, + "kl": 0.010721206665039062, + "lambda_div_used": 0.5, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0004, + "reward": -0.11896365694701672, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11896365694701672, + "reward_after_std": 0.6233756765723228, + "reward_before_mean": 0.3966408409178257, + "reward_before_std": 0.5298713929951191, + "reward_change_max": 0.001282908022403717, + "reward_change_mean": -0.5156044885516167, + "reward_change_min": -0.9353062510490417, + "reward_change_std": 0.3425426837056875, + "reward_std": 0.6233757026493549, + "rewards/cosine_scaled_reward": -0.20792959071695805, + "rewards/format_reward": 0.8125000111758709, + "step": 262 + }, + { + "advantage_max": 1.9853202849626541, + "advantage_mean": 1.8626452047421083e-08, + "advantage_min": -0.6768524274230003, + "advantage_std": 0.9998548626899719, + "completion_length": 1272.0625228881836, + "epoch": 0.30057142857142854, + "grad_norm": 0.23958361148834229, + "kl": 0.007053375244140625, + "lambda_div_used": 0.5, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0003, + "reward": -0.035472466610372066, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.035472466610372066, + "reward_after_std": 0.7921707406640053, + "reward_before_mean": 0.487296462059021, + "reward_before_std": 0.6246878579258919, + "reward_change_max": 0.0, + "reward_change_mean": -0.5227689333260059, + "reward_change_min": -0.8227484747767448, + "reward_change_std": 0.2987172771245241, + "reward_std": 0.7921707481145859, + "rewards/cosine_scaled_reward": -0.23551844991743565, + "rewards/format_reward": 0.9583333432674408, + "step": 263 + }, + { + "advantage_max": 1.94135420024395, + "advantage_mean": -1.738468852208186e-08, + "advantage_min": -0.7536604218184948, + "advantage_std": 0.99985770881176, + "completion_length": 1475.1250305175781, + "epoch": 0.3017142857142857, + "grad_norm": 0.22166408598423004, + "kl": 0.007488250732421875, + "lambda_div_used": 0.5, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0003, + "reward": 0.054106075898744166, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.054106075898744166, + "reward_after_std": 0.7976480908691883, + "reward_before_mean": 0.6590630635619164, + "reward_before_std": 0.6566238440573215, + "reward_change_max": 0.0, + "reward_change_mean": -0.6049569994211197, + "reward_change_min": -0.9770181886851788, + "reward_change_std": 0.37318576872348785, + "reward_std": 0.797648124396801, + "rewards/cosine_scaled_reward": -0.11838514357805252, + "rewards/format_reward": 0.8958333395421505, + "step": 264 + }, + { + "advantage_max": 1.9491015672683716, + "advantage_mean": -1.1796752907855534e-08, + "advantage_min": -0.708407960832119, + "advantage_std": 0.9998629614710808, + "completion_length": 1303.1042251586914, + "epoch": 0.3028571428571429, + "grad_norm": 0.25498053431510925, + "kl": 0.008037567138671875, + "lambda_div_used": 0.5, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0003, + "reward": 0.2292765413003508, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2292765413003508, + "reward_after_std": 0.8382696844637394, + "reward_before_mean": 0.9684329330921173, + "reward_before_std": 0.6394072566181421, + "reward_change_max": 0.0, + "reward_change_mean": -0.739156398922205, + "reward_change_min": -1.1916795074939728, + "reward_change_std": 0.44865376502275467, + "reward_std": 0.8382696956396103, + "rewards/cosine_scaled_reward": 0.005049763713032007, + "rewards/format_reward": 0.9583333358168602, + "step": 265 + }, + { + "advantage_max": 1.9380334466695786, + "advantage_mean": 8.6923440667519e-09, + "advantage_min": -0.7872503623366356, + "advantage_std": 0.9998449459671974, + "completion_length": 1485.3750305175781, + "epoch": 0.304, + "grad_norm": 0.24378226697444916, + "kl": 0.008502960205078125, + "lambda_div_used": 0.5, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0003, + "reward": 0.026207237504422665, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.026207237504422665, + "reward_after_std": 0.762421753257513, + "reward_before_mean": 0.6195590551942587, + "reward_before_std": 0.6429805513471365, + "reward_change_max": 1.6391277313232422e-06, + "reward_change_mean": -0.5933518260717392, + "reward_change_min": -1.0302874110639095, + "reward_change_std": 0.3743545040488243, + "reward_std": 0.762421753257513, + "rewards/cosine_scaled_reward": -0.11730381986126304, + "rewards/format_reward": 0.8541666679084301, + "step": 266 + }, + { + "advantage_max": 1.9066883474588394, + "advantage_mean": 2.23517424569053e-08, + "advantage_min": -0.8579533696174622, + "advantage_std": 0.9998121857643127, + "completion_length": 2088.250015258789, + "epoch": 0.30514285714285716, + "grad_norm": 0.2329891175031662, + "kl": 0.012308120727539062, + "lambda_div_used": 0.5, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0005, + "reward": -0.20661406670114957, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.20661406670114957, + "reward_after_std": 0.6844783090054989, + "reward_before_mean": 0.21980984695255756, + "reward_before_std": 0.6176054794341326, + "reward_change_max": 0.003687061369419098, + "reward_change_mean": -0.42642390355467796, + "reward_change_min": -0.7731238566339016, + "reward_change_std": 0.3042891379445791, + "reward_std": 0.6844783164560795, + "rewards/cosine_scaled_reward": -0.192178413271904, + "rewards/format_reward": 0.6041666753590107, + "step": 267 + }, + { + "advantage_max": 1.9255520403385162, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -0.8325854539871216, + "advantage_std": 0.9998375400900841, + "completion_length": 1344.5625305175781, + "epoch": 0.3062857142857143, + "grad_norm": 0.31328123807907104, + "kl": 0.010145187377929688, + "lambda_div_used": 0.5, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0004, + "reward": 0.04860646743327379, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04860646743327379, + "reward_after_std": 0.7247105650603771, + "reward_before_mean": 0.6737147830426693, + "reward_before_std": 0.6382793746888638, + "reward_change_max": 0.0, + "reward_change_mean": -0.6251083053648472, + "reward_change_min": -1.0648128241300583, + "reward_change_std": 0.41542051173746586, + "reward_std": 0.7247105725109577, + "rewards/cosine_scaled_reward": -0.09022596850991249, + "rewards/format_reward": 0.8541666753590107, + "step": 268 + }, + { + "advantage_max": 1.9525828659534454, + "advantage_mean": 2.8560558584800333e-08, + "advantage_min": -0.7449537627398968, + "advantage_std": 0.9998025745153427, + "completion_length": 1480.8333587646484, + "epoch": 0.30742857142857144, + "grad_norm": 0.2323724776506424, + "kl": 0.00814056396484375, + "lambda_div_used": 0.5, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0003, + "reward": 0.09112085448578, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09112085448578, + "reward_after_std": 0.6920596230775118, + "reward_before_mean": 0.763656685128808, + "reward_before_std": 0.5157211106270552, + "reward_change_max": 0.0005363896489143372, + "reward_change_mean": -0.6725358641706407, + "reward_change_min": -1.0665729567408562, + "reward_change_std": 0.4021889283321798, + "reward_std": 0.6920596417039633, + "rewards/cosine_scaled_reward": -0.03483833000063896, + "rewards/format_reward": 0.8333333358168602, + "step": 269 + }, + { + "advantage_max": 1.9452837705612183, + "advantage_mean": 1.2417634254191512e-08, + "advantage_min": -0.6652894914150238, + "advantage_std": 0.9998657330870628, + "completion_length": 1606.645866394043, + "epoch": 0.30857142857142855, + "grad_norm": 0.18927238881587982, + "kl": 0.009557723999023438, + "lambda_div_used": 0.5, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0004, + "reward": 0.21252715727314353, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21252715727314353, + "reward_after_std": 0.8603941760957241, + "reward_before_mean": 0.9310585322091356, + "reward_before_std": 0.6936173308640718, + "reward_change_max": 0.0003393515944480896, + "reward_change_mean": -0.7185313515365124, + "reward_change_min": -1.1931066066026688, + "reward_change_std": 0.4510572552680969, + "reward_std": 0.8603942133486271, + "rewards/cosine_scaled_reward": 0.03844592347741127, + "rewards/format_reward": 0.8541666679084301, + "step": 270 + }, + { + "advantage_max": 1.9340698719024658, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.8037978634238243, + "advantage_std": 0.9998864904046059, + "completion_length": 1319.333381652832, + "epoch": 0.3097142857142857, + "grad_norm": 0.29956671595573425, + "kl": 0.00847625732421875, + "lambda_div_used": 0.5, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0003, + "reward": 0.37543570157140493, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37543570157140493, + "reward_after_std": 0.9555936455726624, + "reward_before_mean": 1.198203792795539, + "reward_before_std": 0.7853490561246872, + "reward_change_max": 0.0, + "reward_change_mean": -0.8227681331336498, + "reward_change_min": -1.3215494900941849, + "reward_change_std": 0.5134271755814552, + "reward_std": 0.9555936753749847, + "rewards/cosine_scaled_reward": 0.15118524804711342, + "rewards/format_reward": 0.8958333432674408, + "step": 271 + }, + { + "advantage_max": 1.9177703112363815, + "advantage_mean": 3.414849514271623e-09, + "advantage_min": -0.8598247207701206, + "advantage_std": 0.9998156502842903, + "completion_length": 1830.6250305175781, + "epoch": 0.31085714285714283, + "grad_norm": 0.29469630122184753, + "kl": 0.011463165283203125, + "lambda_div_used": 0.5, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0005, + "reward": -0.05768436938524246, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05768436938524246, + "reward_after_std": 0.6672587916254997, + "reward_before_mean": 0.4954916397109628, + "reward_before_std": 0.5718173142522573, + "reward_change_max": 0.0, + "reward_change_mean": -0.5531760081648827, + "reward_change_min": -0.9284788183867931, + "reward_change_std": 0.3455266337841749, + "reward_std": 0.6672588251531124, + "rewards/cosine_scaled_reward": -0.13767085410654545, + "rewards/format_reward": 0.7708333358168602, + "step": 272 + }, + { + "advantage_max": 1.9324713498353958, + "advantage_mean": 2.1730858223989458e-09, + "advantage_min": -0.802878201007843, + "advantage_std": 0.9998410195112228, + "completion_length": 1315.0000381469727, + "epoch": 0.312, + "grad_norm": 0.24351400136947632, + "kl": 0.00801849365234375, + "lambda_div_used": 0.5, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0003, + "reward": 0.13825794821605086, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.13825794821605086, + "reward_after_std": 0.7033094502985477, + "reward_before_mean": 0.8465951606631279, + "reward_before_std": 0.5551106706261635, + "reward_change_max": 0.0007760822772979736, + "reward_change_mean": -0.7083372462075204, + "reward_change_min": -1.0916896015405655, + "reward_change_std": 0.4183959634974599, + "reward_std": 0.703309491276741, + "rewards/cosine_scaled_reward": 0.006630909629166126, + "rewards/format_reward": 0.8333333488553762, + "step": 273 + }, + { + "advantage_max": 1.9511011093854904, + "advantage_mean": 2.6077033199456423e-08, + "advantage_min": -0.7403512001037598, + "advantage_std": 0.999863512814045, + "completion_length": 1045.1458549499512, + "epoch": 0.31314285714285717, + "grad_norm": 0.28778988122940063, + "kl": 0.0078582763671875, + "lambda_div_used": 0.5, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0003, + "reward": 0.29660653905011714, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.29660653905011714, + "reward_after_std": 0.857206005603075, + "reward_before_mean": 1.0843129493296146, + "reward_before_std": 0.6621245350688696, + "reward_change_max": 0.0, + "reward_change_mean": -0.7877063974738121, + "reward_change_min": -1.2632982358336449, + "reward_change_std": 0.4729926325380802, + "reward_std": 0.8572060279548168, + "rewards/cosine_scaled_reward": 0.052573127672076225, + "rewards/format_reward": 0.9791666716337204, + "step": 274 + }, + { + "advantage_max": 1.9466314613819122, + "advantage_mean": -1.8626451714354175e-08, + "advantage_min": -0.7949444241821766, + "advantage_std": 0.999883271753788, + "completion_length": 1440.5208740234375, + "epoch": 0.3142857142857143, + "grad_norm": 0.24470460414886475, + "kl": 0.008533477783203125, + "lambda_div_used": 0.5, + "learning_rate": 5.5e-07, + "loss": 0.0003, + "reward": 0.39895466226153076, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.39895466226153076, + "reward_after_std": 0.9324078634381294, + "reward_before_mean": 1.2469081226736307, + "reward_before_std": 0.7196089113131166, + "reward_change_max": 0.00023962557315826416, + "reward_change_mean": -0.8479535095393658, + "reward_change_min": -1.2866101413965225, + "reward_change_std": 0.509355966001749, + "reward_std": 0.9324079155921936, + "rewards/cosine_scaled_reward": 0.19637071434408426, + "rewards/format_reward": 0.8541666772216558, + "step": 275 + }, + { + "advantage_max": 1.9146538376808167, + "advantage_mean": 3.104408841103634e-09, + "advantage_min": -0.7443368807435036, + "advantage_std": 0.9998732730746269, + "completion_length": 1471.916732788086, + "epoch": 0.31542857142857145, + "grad_norm": 0.3321644365787506, + "kl": 0.0133209228515625, + "lambda_div_used": 0.5, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0005, + "reward": 0.14495250955224037, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14495250955224037, + "reward_after_std": 0.9756482355296612, + "reward_before_mean": 0.76621616166085, + "reward_before_std": 0.8980020098388195, + "reward_change_max": 0.0011306777596473694, + "reward_change_mean": -0.6212636511772871, + "reward_change_min": -1.210908930748701, + "reward_change_std": 0.44276667572557926, + "reward_std": 0.9756482467055321, + "rewards/cosine_scaled_reward": -0.023141922429203987, + "rewards/format_reward": 0.8125000055879354, + "step": 276 + }, + { + "advantage_max": 1.9444350749254227, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.7333899475634098, + "advantage_std": 0.9998810589313507, + "completion_length": 1408.020881652832, + "epoch": 0.31657142857142856, + "grad_norm": 0.34184730052948, + "kl": 0.013032913208007812, + "lambda_div_used": 0.5, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0005, + "reward": 0.2797253541648388, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2797253541648388, + "reward_after_std": 0.9698216766119003, + "reward_before_mean": 1.0142830004915595, + "reward_before_std": 0.7941651232540607, + "reward_change_max": 0.0, + "reward_change_mean": -0.734557643532753, + "reward_change_min": -1.2394614443182945, + "reward_change_std": 0.4602707177400589, + "reward_std": 0.9698216803371906, + "rewards/cosine_scaled_reward": 0.06964147090911865, + "rewards/format_reward": 0.8750000074505806, + "step": 277 + }, + { + "advantage_max": 1.9824930280447006, + "advantage_mean": -1.2417631367611648e-09, + "advantage_min": -0.7233392670750618, + "advantage_std": 0.9998651966452599, + "completion_length": 1325.6458930969238, + "epoch": 0.3177142857142857, + "grad_norm": 0.2691362202167511, + "kl": 0.0094451904296875, + "lambda_div_used": 0.5, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0004, + "reward": 0.3478627223521471, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3478627223521471, + "reward_after_std": 0.8299554251134396, + "reward_before_mean": 1.1734108105301857, + "reward_before_std": 0.5487337484955788, + "reward_change_max": 0.0006553307175636292, + "reward_change_mean": -0.8255480825901031, + "reward_change_min": -1.1466122642159462, + "reward_change_std": 0.4497763179242611, + "reward_std": 0.8299554325640202, + "rewards/cosine_scaled_reward": 0.12837206269614398, + "rewards/format_reward": 0.9166666716337204, + "step": 278 + }, + { + "advantage_max": 1.9485204070806503, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -0.8334387242794037, + "advantage_std": 0.9998544678092003, + "completion_length": 1508.8542022705078, + "epoch": 0.31885714285714284, + "grad_norm": 0.18029850721359253, + "kl": 0.0090484619140625, + "lambda_div_used": 0.5, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0004, + "reward": 0.13744146656244993, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13744146656244993, + "reward_after_std": 0.8153170794248581, + "reward_before_mean": 0.8065908160060644, + "reward_before_std": 0.6509456839412451, + "reward_change_max": 0.0, + "reward_change_mean": -0.6691493093967438, + "reward_change_min": -1.0829177498817444, + "reward_change_std": 0.40314605459570885, + "reward_std": 0.8153170831501484, + "rewards/cosine_scaled_reward": -0.06545462599024177, + "rewards/format_reward": 0.9375000149011612, + "step": 279 + }, + { + "advantage_max": 1.8773256838321686, + "advantage_mean": 3.104408619059029e-09, + "advantage_min": -0.8730496391654015, + "advantage_std": 0.9998912662267685, + "completion_length": 1677.208381652832, + "epoch": 0.32, + "grad_norm": 0.274914026260376, + "kl": 0.011020660400390625, + "lambda_div_used": 0.5, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0004, + "reward": 0.4246948091313243, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4246948091313243, + "reward_after_std": 1.0214713141322136, + "reward_before_mean": 1.2751869540661573, + "reward_before_std": 0.9706107303500175, + "reward_change_max": 0.0, + "reward_change_mean": -0.8504920788109303, + "reward_change_min": -1.5552897602319717, + "reward_change_std": 0.5969200953841209, + "reward_std": 1.0214713215827942, + "rewards/cosine_scaled_reward": 0.20009343978017569, + "rewards/format_reward": 0.875, + "step": 280 + }, + { + "advantage_max": 1.9586692303419113, + "advantage_mean": 3.104408841103634e-09, + "advantage_min": -0.7022324539721012, + "advantage_std": 0.9998129606246948, + "completion_length": 2271.416702270508, + "epoch": 0.3211428571428571, + "grad_norm": 0.25421464443206787, + "kl": 0.01201629638671875, + "lambda_div_used": 0.5, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0005, + "reward": -0.1680087298154831, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1680087298154831, + "reward_after_std": 0.7381243333220482, + "reward_before_mean": 0.27042799443006516, + "reward_before_std": 0.659267095848918, + "reward_change_max": 0.0005218088626861572, + "reward_change_mean": -0.4384367326274514, + "reward_change_min": -0.7785316742956638, + "reward_change_std": 0.3023688681423664, + "reward_std": 0.73812435567379, + "rewards/cosine_scaled_reward": -0.16686933673918247, + "rewards/format_reward": 0.6041666679084301, + "step": 281 + }, + { + "advantage_max": 1.958927944302559, + "advantage_mean": 3.104408619059029e-09, + "advantage_min": -0.719094455242157, + "advantage_std": 0.9998442083597183, + "completion_length": 1348.5208740234375, + "epoch": 0.3222857142857143, + "grad_norm": 0.23964226245880127, + "kl": 0.008609771728515625, + "lambda_div_used": 0.5, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0003, + "reward": 0.13726119976490736, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.13726119976490736, + "reward_after_std": 0.7930195443332195, + "reward_before_mean": 0.81022091768682, + "reward_before_std": 0.5720008015632629, + "reward_change_max": 0.0, + "reward_change_mean": -0.6729597263038158, + "reward_change_min": -0.9675496965646744, + "reward_change_std": 0.37686675041913986, + "reward_std": 0.793019562959671, + "rewards/cosine_scaled_reward": -0.05322289373725653, + "rewards/format_reward": 0.916666679084301, + "step": 282 + }, + { + "advantage_max": 1.9157568961381912, + "advantage_mean": -1.2728075482471013e-08, + "advantage_min": -0.8272853344678879, + "advantage_std": 0.9998943582177162, + "completion_length": 1776.1042175292969, + "epoch": 0.32342857142857145, + "grad_norm": 0.21559320390224457, + "kl": 0.008855819702148438, + "lambda_div_used": 0.5, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0004, + "reward": 0.44334246404469013, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.44334246404469013, + "reward_after_std": 1.0401191785931587, + "reward_before_mean": 1.3003057707101107, + "reward_before_std": 0.907433059066534, + "reward_change_max": 0.00027686357498168945, + "reward_change_mean": -0.8569633327424526, + "reward_change_min": -1.4866943806409836, + "reward_change_std": 0.5639832280576229, + "reward_std": 1.0401191860437393, + "rewards/cosine_scaled_reward": 0.21265287976711988, + "rewards/format_reward": 0.8750000074505806, + "step": 283 + }, + { + "advantage_max": 1.926544651389122, + "advantage_mean": 8.847564458847046e-09, + "advantage_min": -0.7751799561083317, + "advantage_std": 0.999853253364563, + "completion_length": 1054.770866394043, + "epoch": 0.32457142857142857, + "grad_norm": 0.34369370341300964, + "kl": 0.006954193115234375, + "lambda_div_used": 0.5, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0003, + "reward": 0.12165421736426651, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12165421736426651, + "reward_after_std": 0.8663975708186626, + "reward_before_mean": 0.7515358105301857, + "reward_before_std": 0.7445918060839176, + "reward_change_max": 0.0019152984023094177, + "reward_change_mean": -0.6298815757036209, + "reward_change_min": -1.1273687332868576, + "reward_change_std": 0.40246682800352573, + "reward_std": 0.8663975708186626, + "rewards/cosine_scaled_reward": -0.1138154431246221, + "rewards/format_reward": 0.9791666716337204, + "step": 284 + }, + { + "advantage_max": 1.9379163980484009, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -0.7704954259097576, + "advantage_std": 0.9998484328389168, + "completion_length": 992.1875305175781, + "epoch": 0.32571428571428573, + "grad_norm": 0.26117557287216187, + "kl": 0.008707046508789062, + "lambda_div_used": 0.5, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0003, + "reward": 0.15157003700733185, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.15157003700733185, + "reward_after_std": 0.843847218900919, + "reward_before_mean": 0.8257862031459808, + "reward_before_std": 0.7248164545744658, + "reward_change_max": 0.0, + "reward_change_mean": -0.6742161959409714, + "reward_change_min": -1.11558248847723, + "reward_change_std": 0.42356468737125397, + "reward_std": 0.8438472338020802, + "rewards/cosine_scaled_reward": -0.07669024355709553, + "rewards/format_reward": 0.9791666716337204, + "step": 285 + }, + { + "advantage_max": 1.927560493350029, + "advantage_mean": -1.2728075149404106e-08, + "advantage_min": -0.8058949783444405, + "advantage_std": 0.9998480826616287, + "completion_length": 1363.7083740234375, + "epoch": 0.32685714285714285, + "grad_norm": 0.27410030364990234, + "kl": 0.006923675537109375, + "lambda_div_used": 0.5, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0003, + "reward": 0.14017292112112045, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14017292112112045, + "reward_after_std": 0.7733436599373817, + "reward_before_mean": 0.8225498106330633, + "reward_before_std": 0.6446866802871227, + "reward_change_max": 0.0, + "reward_change_mean": -0.6823768839240074, + "reward_change_min": -1.1488277539610863, + "reward_change_std": 0.4256325364112854, + "reward_std": 0.7733436599373817, + "rewards/cosine_scaled_reward": -0.057475125417113304, + "rewards/format_reward": 0.9375, + "step": 286 + }, + { + "advantage_max": 1.9195478856563568, + "advantage_mean": 1.6142925107764938e-08, + "advantage_min": -0.7916990965604782, + "advantage_std": 0.9998394548892975, + "completion_length": 1224.8958625793457, + "epoch": 0.328, + "grad_norm": 0.29854050278663635, + "kl": 0.0105438232421875, + "lambda_div_used": 0.5, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0004, + "reward": 0.09376479079946876, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09376479079946876, + "reward_after_std": 0.7171185463666916, + "reward_before_mean": 0.7573619782924652, + "reward_before_std": 0.5836118310689926, + "reward_change_max": 0.0, + "reward_change_mean": -0.66359718516469, + "reward_change_min": -1.1100826933979988, + "reward_change_std": 0.4130848478525877, + "reward_std": 0.7171185612678528, + "rewards/cosine_scaled_reward": -0.02756902575492859, + "rewards/format_reward": 0.8125000074505806, + "step": 287 + }, + { + "advantage_max": 1.9400405883789062, + "advantage_mean": -1.2417632477834672e-09, + "advantage_min": -0.7356844134628773, + "advantage_std": 0.9998426660895348, + "completion_length": 1373.5000457763672, + "epoch": 0.3291428571428571, + "grad_norm": 0.22060362994670868, + "kl": 0.009063720703125, + "lambda_div_used": 0.5, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0004, + "reward": -0.04765269602648914, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.04765269602648914, + "reward_after_std": 0.7224587984383106, + "reward_before_mean": 0.50075370259583, + "reward_before_std": 0.622645877301693, + "reward_change_max": 0.0, + "reward_change_mean": -0.5484064146876335, + "reward_change_min": -1.0198993384838104, + "reward_change_std": 0.3653989788144827, + "reward_std": 0.7224588245153427, + "rewards/cosine_scaled_reward": -0.21837315894663334, + "rewards/format_reward": 0.9375000074505806, + "step": 288 + }, + { + "advantage_max": 1.9686802327632904, + "advantage_mean": -8.692344177774203e-09, + "advantage_min": -0.7215582430362701, + "advantage_std": 0.9998724237084389, + "completion_length": 1411.6250381469727, + "epoch": 0.3302857142857143, + "grad_norm": 0.3080616891384125, + "kl": 0.0121307373046875, + "lambda_div_used": 0.5, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0005, + "reward": 0.1366715773474425, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1366715773474425, + "reward_after_std": 0.8801541402935982, + "reward_before_mean": 0.7729975432157516, + "reward_before_std": 0.7023350726813078, + "reward_change_max": 0.0006599947810173035, + "reward_change_mean": -0.6363259479403496, + "reward_change_min": -1.0092889964580536, + "reward_change_std": 0.3881163038313389, + "reward_std": 0.880154199898243, + "rewards/cosine_scaled_reward": -0.030167920514941216, + "rewards/format_reward": 0.8333333395421505, + "step": 289 + }, + { + "advantage_max": 1.9003676027059555, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.7423394173383713, + "advantage_std": 0.9998863041400909, + "completion_length": 1045.5416984558105, + "epoch": 0.3314285714285714, + "grad_norm": 0.3107107877731323, + "kl": 0.0094146728515625, + "lambda_div_used": 0.5, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0004, + "reward": 0.42231632210314274, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.42231632210314274, + "reward_after_std": 1.0070499442517757, + "reward_before_mean": 1.2733629271388054, + "reward_before_std": 0.8752043275162578, + "reward_change_max": 0.0, + "reward_change_mean": -0.851046584546566, + "reward_change_min": -1.5561627894639969, + "reward_change_std": 0.5719406455755234, + "reward_std": 1.007049947977066, + "rewards/cosine_scaled_reward": 0.1575147584080696, + "rewards/format_reward": 0.9583333358168602, + "step": 290 + }, + { + "advantage_max": 1.961781457066536, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.7081636115908623, + "advantage_std": 0.9998603463172913, + "completion_length": 1178.5625305175781, + "epoch": 0.3325714285714286, + "grad_norm": 0.2246701866388321, + "kl": 0.0083160400390625, + "lambda_div_used": 0.5, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0003, + "reward": 0.13762659142958, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13762659142958, + "reward_after_std": 0.8188576325774193, + "reward_before_mean": 0.8014777625649003, + "reward_before_std": 0.6381174903362989, + "reward_change_max": 0.0, + "reward_change_mean": -0.66385118663311, + "reward_change_min": -1.0244802385568619, + "reward_change_std": 0.3820841684937477, + "reward_std": 0.8188576474785805, + "rewards/cosine_scaled_reward": -0.07842779252678156, + "rewards/format_reward": 0.9583333358168602, + "step": 291 + }, + { + "advantage_max": 1.919129803776741, + "advantage_mean": -4.113341445233232e-09, + "advantage_min": -0.8278881087899208, + "advantage_std": 0.9998489618301392, + "completion_length": 1414.7917098999023, + "epoch": 0.33371428571428574, + "grad_norm": 0.2348564714193344, + "kl": 0.009510040283203125, + "lambda_div_used": 0.5, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0004, + "reward": 0.1004533120431006, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1004533120431006, + "reward_after_std": 0.7959478050470352, + "reward_before_mean": 0.7488336265087128, + "reward_before_std": 0.6854219231754541, + "reward_change_max": 0.0, + "reward_change_mean": -0.6483803391456604, + "reward_change_min": -1.0386323034763336, + "reward_change_std": 0.4167475663125515, + "reward_std": 0.7959478460252285, + "rewards/cosine_scaled_reward": -0.07349985092878342, + "rewards/format_reward": 0.8958333432674408, + "step": 292 + }, + { + "advantage_max": 1.9526210129261017, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.8244255632162094, + "advantage_std": 0.9998187869787216, + "completion_length": 1080.5000228881836, + "epoch": 0.33485714285714285, + "grad_norm": 0.24422034621238708, + "kl": 0.009332656860351562, + "lambda_div_used": 0.5, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0004, + "reward": 0.24010583432391286, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24010583432391286, + "reward_after_std": 0.6077372506260872, + "reward_before_mean": 1.0529423654079437, + "reward_before_std": 0.36602981574833393, + "reward_change_max": 0.0, + "reward_change_mean": -0.8128365054726601, + "reward_change_min": -1.1496832966804504, + "reward_change_std": 0.43168095126748085, + "reward_std": 0.6077372655272484, + "rewards/cosine_scaled_reward": 0.03688783012330532, + "rewards/format_reward": 0.9791666716337204, + "step": 293 + }, + { + "advantage_max": 1.90102319419384, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.7834720239043236, + "advantage_std": 0.9998085051774979, + "completion_length": 1687.7083740234375, + "epoch": 0.336, + "grad_norm": 0.2405546009540558, + "kl": 0.01241302490234375, + "lambda_div_used": 0.5, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0005, + "reward": -0.004911705851554871, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.004911705851554871, + "reward_after_std": 0.7069712989032269, + "reward_before_mean": 0.583304937928915, + "reward_before_std": 0.6382516892626882, + "reward_change_max": 0.0, + "reward_change_mean": -0.5882166475057602, + "reward_change_min": -1.0201143249869347, + "reward_change_std": 0.40058210119605064, + "reward_std": 0.7069713175296783, + "rewards/cosine_scaled_reward": -0.09376422129571438, + "rewards/format_reward": 0.7708333432674408, + "step": 294 + }, + { + "advantage_max": 1.8850695043802261, + "advantage_mean": -1.490116174895917e-08, + "advantage_min": -0.8308575078845024, + "advantage_std": 0.9998729452490807, + "completion_length": 1527.8542022705078, + "epoch": 0.33714285714285713, + "grad_norm": 0.25586745142936707, + "kl": 0.01103973388671875, + "lambda_div_used": 0.5, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0004, + "reward": 0.36468934011645615, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.36468934011645615, + "reward_after_std": 0.9241388477385044, + "reward_before_mean": 1.1961545972153544, + "reward_before_std": 0.8213022798299789, + "reward_change_max": 0.0, + "reward_change_mean": -0.8314652666449547, + "reward_change_min": -1.457765981554985, + "reward_change_std": 0.5527912154793739, + "reward_std": 0.9241388812661171, + "rewards/cosine_scaled_reward": 0.1605772953480482, + "rewards/format_reward": 0.8750000074505806, + "step": 295 + }, + { + "advantage_max": 1.8951444178819656, + "advantage_mean": -1.1331091731570098e-08, + "advantage_min": -0.9481487721204758, + "advantage_std": 0.9998307302594185, + "completion_length": 1485.3125610351562, + "epoch": 0.3382857142857143, + "grad_norm": 0.2452612966299057, + "kl": 0.00897216796875, + "lambda_div_used": 0.5, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0004, + "reward": 0.08178290724754333, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08178290724754333, + "reward_after_std": 0.6763971261680126, + "reward_before_mean": 0.7581503801047802, + "reward_before_std": 0.6051702238619328, + "reward_change_max": 0.0, + "reward_change_mean": -0.6763674877583981, + "reward_change_min": -1.109766460955143, + "reward_change_std": 0.4316418059170246, + "reward_std": 0.6763971447944641, + "rewards/cosine_scaled_reward": -0.08967481926083565, + "rewards/format_reward": 0.9375000149011612, + "step": 296 + }, + { + "advantage_max": 1.9076000899076462, + "advantage_mean": 1.459072151988039e-08, + "advantage_min": -0.8135585486888885, + "advantage_std": 0.9998480305075645, + "completion_length": 1972.2292022705078, + "epoch": 0.3394285714285714, + "grad_norm": 0.27767202258110046, + "kl": 0.015995025634765625, + "lambda_div_used": 0.5, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0006, + "reward": 0.09356885030865669, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09356885030865669, + "reward_after_std": 0.7759539633989334, + "reward_before_mean": 0.7381240706890821, + "reward_before_std": 0.6730439588427544, + "reward_change_max": 0.0, + "reward_change_mean": -0.6445552557706833, + "reward_change_min": -1.1503963880240917, + "reward_change_std": 0.4329614248126745, + "reward_std": 0.7759539783000946, + "rewards/cosine_scaled_reward": -0.016354622319340706, + "rewards/format_reward": 0.7708333488553762, + "step": 297 + }, + { + "advantage_max": 1.9864699989557266, + "advantage_mean": -1.9557773955902746e-08, + "advantage_min": -0.670455165207386, + "advantage_std": 0.9998606741428375, + "completion_length": 1538.7500534057617, + "epoch": 0.3405714285714286, + "grad_norm": 0.21603329479694366, + "kl": 0.008434295654296875, + "lambda_div_used": 0.5, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0003, + "reward": 0.1843671938404441, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1843671938404441, + "reward_after_std": 0.8246655911207199, + "reward_before_mean": 0.878794476389885, + "reward_before_std": 0.5901540853083134, + "reward_change_max": 0.0, + "reward_change_mean": -0.6944273039698601, + "reward_change_min": -1.0942930579185486, + "reward_change_std": 0.39840497076511383, + "reward_std": 0.8246655985713005, + "rewards/cosine_scaled_reward": -0.029352783225476742, + "rewards/format_reward": 0.9375, + "step": 298 + }, + { + "advantage_max": 1.9436794072389603, + "advantage_mean": -9.313225746154785e-09, + "advantage_min": -0.7619923055171967, + "advantage_std": 0.9998595044016838, + "completion_length": 1167.5416870117188, + "epoch": 0.3417142857142857, + "grad_norm": 0.22107698023319244, + "kl": 0.008447647094726562, + "lambda_div_used": 0.5, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0003, + "reward": 0.1298121795989573, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1298121795989573, + "reward_after_std": 0.7819931656122208, + "reward_before_mean": 0.803707379847765, + "reward_before_std": 0.6334694363176823, + "reward_change_max": 0.0, + "reward_change_mean": -0.6738951951265335, + "reward_change_min": -1.084967590868473, + "reward_change_std": 0.4023057296872139, + "reward_std": 0.781993180513382, + "rewards/cosine_scaled_reward": -0.08772965613752604, + "rewards/format_reward": 0.9791666716337204, + "step": 299 + }, + { + "advantage_max": 1.9793277829885483, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.6793212965130806, + "advantage_std": 0.9998233988881111, + "completion_length": 1506.3333702087402, + "epoch": 0.34285714285714286, + "grad_norm": 0.31695958971977234, + "kl": 0.0128173828125, + "lambda_div_used": 0.5, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0005, + "reward": 0.02915547788143158, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02915547788143158, + "reward_after_std": 0.6308148801326752, + "reward_before_mean": 0.6603523679077625, + "reward_before_std": 0.42772908695042133, + "reward_change_max": 0.0, + "reward_change_mean": -0.6311969514936209, + "reward_change_min": -0.9433031231164932, + "reward_change_std": 0.35703119821846485, + "reward_std": 0.6308148987591267, + "rewards/cosine_scaled_reward": -0.09690714068710804, + "rewards/format_reward": 0.8541666716337204, + "step": 300 + }, + { + "advantage_max": 1.9420295059680939, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7559764087200165, + "advantage_std": 0.9998279139399529, + "completion_length": 1254.3125457763672, + "epoch": 0.344, + "grad_norm": 0.2822733223438263, + "kl": 0.014049530029296875, + "lambda_div_used": 0.5, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0006, + "reward": 0.03332954691722989, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.03332954691722989, + "reward_after_std": 0.6856257766485214, + "reward_before_mean": 0.652209609746933, + "reward_before_std": 0.5623303577303886, + "reward_change_max": 0.0, + "reward_change_mean": -0.6188800632953644, + "reward_change_min": -1.0174327939748764, + "reward_change_std": 0.3780553489923477, + "reward_std": 0.6856257915496826, + "rewards/cosine_scaled_reward": -0.1426452063024044, + "rewards/format_reward": 0.9375000149011612, + "step": 301 + }, + { + "advantage_max": 1.8940949887037277, + "advantage_mean": 1.0865430444262358e-08, + "advantage_min": -0.8010035902261734, + "advantage_std": 0.9998576045036316, + "completion_length": 1526.37504196167, + "epoch": 0.34514285714285714, + "grad_norm": 0.3930858075618744, + "kl": 0.01357269287109375, + "lambda_div_used": 0.5, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0005, + "reward": 0.1928790423553437, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1928790423553437, + "reward_after_std": 0.8109844736754894, + "reward_before_mean": 0.9092248368542641, + "reward_before_std": 0.6983633888885379, + "reward_change_max": 0.0008866190910339355, + "reward_change_mean": -0.7163458056747913, + "reward_change_min": -1.235180925577879, + "reward_change_std": 0.4744630251079798, + "reward_std": 0.8109845034778118, + "rewards/cosine_scaled_reward": 0.06919574737548828, + "rewards/format_reward": 0.7708333376795053, + "step": 302 + }, + { + "advantage_max": 1.8796220421791077, + "advantage_mean": -3.6476800815976596e-09, + "advantage_min": -0.8963010087609291, + "advantage_std": 0.9998479634523392, + "completion_length": 1107.0208740234375, + "epoch": 0.3462857142857143, + "grad_norm": 0.3062381148338318, + "kl": 0.009563446044921875, + "lambda_div_used": 0.5, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0004, + "reward": 0.2196194063872099, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2196194063872099, + "reward_after_std": 0.7703746408224106, + "reward_before_mean": 0.9780805017799139, + "reward_before_std": 0.7009712867438793, + "reward_change_max": 0.0, + "reward_change_mean": -0.7584610693156719, + "reward_change_min": -1.1881650909781456, + "reward_change_std": 0.47840745374560356, + "reward_std": 0.7703746594488621, + "rewards/cosine_scaled_reward": 0.009873565286397934, + "rewards/format_reward": 0.9583333432674408, + "step": 303 + }, + { + "advantage_max": 1.9196258336305618, + "advantage_mean": -2.6387469986843826e-09, + "advantage_min": -0.7958096042275429, + "advantage_std": 0.9998370930552483, + "completion_length": 1217.895851135254, + "epoch": 0.3474285714285714, + "grad_norm": 0.27057960629463196, + "kl": 0.010829925537109375, + "lambda_div_used": 0.5, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0004, + "reward": 0.10770251415669918, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10770251415669918, + "reward_after_std": 0.754070594906807, + "reward_before_mean": 0.7725160010159016, + "reward_before_std": 0.6370177734643221, + "reward_change_max": 0.0, + "reward_change_mean": -0.6648134812712669, + "reward_change_min": -1.0958143062889576, + "reward_change_std": 0.4159948546439409, + "reward_std": 0.7540706358850002, + "rewards/cosine_scaled_reward": -0.05124201602302492, + "rewards/format_reward": 0.875, + "step": 304 + }, + { + "advantage_max": 1.9056095480918884, + "advantage_mean": 1.2728075482471013e-08, + "advantage_min": -0.8551415503025055, + "advantage_std": 0.9998487681150436, + "completion_length": 1382.4375381469727, + "epoch": 0.3485714285714286, + "grad_norm": 0.3034442961215973, + "kl": 0.0097503662109375, + "lambda_div_used": 0.5, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0004, + "reward": 0.05656467331573367, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.05656467331573367, + "reward_after_std": 0.785954438149929, + "reward_before_mean": 0.6694810688495636, + "reward_before_std": 0.7176000475883484, + "reward_change_max": 0.0, + "reward_change_mean": -0.6129163838922977, + "reward_change_min": -1.1551690623164177, + "reward_change_std": 0.4239853620529175, + "reward_std": 0.7859544530510902, + "rewards/cosine_scaled_reward": -0.10275947768241167, + "rewards/format_reward": 0.8750000223517418, + "step": 305 + }, + { + "advantage_max": 1.929152637720108, + "advantage_mean": -3.725290964595729e-09, + "advantage_min": -0.8490537628531456, + "advantage_std": 0.9998709782958031, + "completion_length": 1078.0625228881836, + "epoch": 0.3497142857142857, + "grad_norm": 0.3089045584201813, + "kl": 0.0123291015625, + "lambda_div_used": 0.5, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0005, + "reward": 0.45340352691709995, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.45340352691709995, + "reward_after_std": 0.8783687688410282, + "reward_before_mean": 1.3650562167167664, + "reward_before_std": 0.6945241689682007, + "reward_change_max": 0.0, + "reward_change_mean": -0.9116526544094086, + "reward_change_min": -1.4638760089874268, + "reward_change_std": 0.5430373214185238, + "reward_std": 0.8783687688410282, + "rewards/cosine_scaled_reward": 0.20336140575818717, + "rewards/format_reward": 0.9583333432674408, + "step": 306 + }, + { + "advantage_max": 1.928852841258049, + "advantage_mean": -6.829699139565548e-09, + "advantage_min": -0.7465885579586029, + "advantage_std": 0.9998717904090881, + "completion_length": 1224.1875457763672, + "epoch": 0.35085714285714287, + "grad_norm": 0.29672563076019287, + "kl": 0.010662078857421875, + "lambda_div_used": 0.5, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0004, + "reward": 0.26565046235919, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26565046235919, + "reward_after_std": 0.9444398619234562, + "reward_before_mean": 1.0031846947968006, + "reward_before_std": 0.8245226852595806, + "reward_change_max": 0.0, + "reward_change_mean": -0.7375341951847076, + "reward_change_min": -1.283432550728321, + "reward_change_std": 0.4830533228814602, + "reward_std": 0.9444398768246174, + "rewards/cosine_scaled_reward": 0.03284231084398925, + "rewards/format_reward": 0.9375000074505806, + "step": 307 + }, + { + "advantage_max": 1.9126890301704407, + "advantage_mean": -3.1044089521259366e-09, + "advantage_min": -0.7414450347423553, + "advantage_std": 0.9998699352145195, + "completion_length": 1937.0000610351562, + "epoch": 0.352, + "grad_norm": 0.21619656682014465, + "kl": 0.014070510864257812, + "lambda_div_used": 0.5, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0006, + "reward": 0.15780878346413374, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.15780878346413374, + "reward_after_std": 0.9465266540646553, + "reward_before_mean": 0.8064774088561535, + "reward_before_std": 0.8910163529217243, + "reward_change_max": 0.0, + "reward_change_mean": -0.6486686486750841, + "reward_change_min": -1.1572811380028725, + "reward_change_std": 0.45672182738780975, + "reward_std": 0.9465266764163971, + "rewards/cosine_scaled_reward": -0.023844645358622074, + "rewards/format_reward": 0.8541666697710752, + "step": 308 + }, + { + "advantage_max": 1.9088157713413239, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.7664674371480942, + "advantage_std": 0.999833844602108, + "completion_length": 1647.5000457763672, + "epoch": 0.35314285714285715, + "grad_norm": 0.27004313468933105, + "kl": 0.013214111328125, + "lambda_div_used": 0.5, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0005, + "reward": 0.02324374718591571, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.02324374718591571, + "reward_after_std": 0.7216699905693531, + "reward_before_mean": 0.6317189037799835, + "reward_before_std": 0.6191937811672688, + "reward_change_max": 0.0001543685793876648, + "reward_change_mean": -0.6084751673042774, + "reward_change_min": -1.0604383125901222, + "reward_change_std": 0.3989776838570833, + "reward_std": 0.721670001745224, + "rewards/cosine_scaled_reward": -0.10080722998827696, + "rewards/format_reward": 0.8333333414047956, + "step": 309 + }, + { + "advantage_max": 1.9126400649547577, + "advantage_mean": 4.967054212379196e-09, + "advantage_min": -0.8069312274456024, + "advantage_std": 0.9998295158147812, + "completion_length": 1398.7917022705078, + "epoch": 0.35428571428571426, + "grad_norm": 0.30042949318885803, + "kl": 0.01859283447265625, + "lambda_div_used": 0.5, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0007, + "reward": 0.017530305543914437, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.017530305543914437, + "reward_after_std": 0.6874769181013107, + "reward_before_mean": 0.626902480609715, + "reward_before_std": 0.5805229172110558, + "reward_change_max": 0.0, + "reward_change_mean": -0.6093721650540829, + "reward_change_min": -1.0393969900906086, + "reward_change_std": 0.387352529913187, + "reward_std": 0.6874769255518913, + "rewards/cosine_scaled_reward": -0.12404878530651331, + "rewards/format_reward": 0.8750000037252903, + "step": 310 + }, + { + "advantage_max": 1.9714401960372925, + "advantage_mean": -1.3038516266661304e-08, + "advantage_min": -0.6754600629210472, + "advantage_std": 0.9998734816908836, + "completion_length": 1114.9375228881836, + "epoch": 0.3554285714285714, + "grad_norm": 0.2700650990009308, + "kl": 0.009521484375, + "lambda_div_used": 0.5, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0004, + "reward": 0.30486697098240256, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.30486697098240256, + "reward_after_std": 0.9124359712004662, + "reward_before_mean": 1.0774826928973198, + "reward_before_std": 0.6932089999318123, + "reward_change_max": 0.0, + "reward_change_mean": -0.7726157456636429, + "reward_change_min": -1.2057588621973991, + "reward_change_std": 0.45093817077577114, + "reward_std": 0.9124359861016273, + "rewards/cosine_scaled_reward": 0.0491579994559288, + "rewards/format_reward": 0.9791666716337204, + "step": 311 + }, + { + "advantage_max": 1.9676008075475693, + "advantage_mean": -1.2417635808503746e-09, + "advantage_min": -0.6602420620620251, + "advantage_std": 0.9998646304011345, + "completion_length": 1168.2708587646484, + "epoch": 0.3565714285714286, + "grad_norm": 0.23943375051021576, + "kl": 0.00844573974609375, + "lambda_div_used": 0.5, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0003, + "reward": 0.44616672629490495, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.44616672629490495, + "reward_after_std": 0.8134817592799664, + "reward_before_mean": 1.368921009125188, + "reward_before_std": 0.495204322040081, + "reward_change_max": 0.0, + "reward_change_mean": -0.922754317522049, + "reward_change_min": -1.3639105558395386, + "reward_change_std": 0.5105717405676842, + "reward_std": 0.8134817853569984, + "rewards/cosine_scaled_reward": 0.22612717002630234, + "rewards/format_reward": 0.9166666679084301, + "step": 312 + }, + { + "advantage_max": 1.9355697929859161, + "advantage_mean": -1.241763458725842e-08, + "advantage_min": -0.6672081351280212, + "advantage_std": 0.9998272061347961, + "completion_length": 1695.708366394043, + "epoch": 0.3577142857142857, + "grad_norm": 0.24104565382003784, + "kl": 0.01529693603515625, + "lambda_div_used": 0.5, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0006, + "reward": 0.009465799666941166, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.009465799666941166, + "reward_after_std": 0.7200150936841965, + "reward_before_mean": 0.6010144352912903, + "reward_before_std": 0.5657152414787561, + "reward_change_max": 0.0007646605372428894, + "reward_change_mean": -0.5915486626327038, + "reward_change_min": -0.921930406242609, + "reward_change_std": 0.3610827811062336, + "reward_std": 0.7200151309370995, + "rewards/cosine_scaled_reward": -0.06407611817121506, + "rewards/format_reward": 0.7291666716337204, + "step": 313 + }, + { + "advantage_max": 1.918437272310257, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.9093864634633064, + "advantage_std": 0.9998309463262558, + "completion_length": 1344.1875228881836, + "epoch": 0.3588571428571429, + "grad_norm": 0.31180670857429504, + "kl": 0.013317108154296875, + "lambda_div_used": 0.5, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0005, + "reward": 0.20816711336374283, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20816711336374283, + "reward_after_std": 0.7800188288092613, + "reward_before_mean": 0.9478923492133617, + "reward_before_std": 0.6674759928137064, + "reward_change_max": 0.0, + "reward_change_mean": -0.739725261926651, + "reward_change_min": -1.2057388499379158, + "reward_change_std": 0.4727477263659239, + "reward_std": 0.7800188288092613, + "rewards/cosine_scaled_reward": 0.07811282994225621, + "rewards/format_reward": 0.7916666679084301, + "step": 314 + }, + { + "advantage_max": 1.9160043001174927, + "advantage_mean": 4.346172532976311e-09, + "advantage_min": -0.7502782195806503, + "advantage_std": 0.9998539909720421, + "completion_length": 1994.2708892822266, + "epoch": 0.36, + "grad_norm": 0.31821468472480774, + "kl": 0.029644012451171875, + "lambda_div_used": 0.5, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.0012, + "reward": 0.11614364665001631, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11614364665001631, + "reward_after_std": 0.8035916015505791, + "reward_before_mean": 0.7763093619141728, + "reward_before_std": 0.6844161916524172, + "reward_change_max": 0.0012766644358634949, + "reward_change_mean": -0.6601656787097454, + "reward_change_min": -1.1163662187755108, + "reward_change_std": 0.44189007207751274, + "reward_std": 0.8035916239023209, + "rewards/cosine_scaled_reward": 0.013154652551747859, + "rewards/format_reward": 0.7500000111758709, + "step": 315 + }, + { + "advantage_max": 1.9073859602212906, + "advantage_mean": 3.6011140736036396e-08, + "advantage_min": -0.8746335953474045, + "advantage_std": 0.9998039901256561, + "completion_length": 2026.1042556762695, + "epoch": 0.36114285714285715, + "grad_norm": 0.33457982540130615, + "kl": 0.023563385009765625, + "lambda_div_used": 0.5, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0009, + "reward": -0.21840902511030436, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.21840902511030436, + "reward_after_std": 0.5321423746645451, + "reward_before_mean": 0.24390754383057356, + "reward_before_std": 0.46077893674373627, + "reward_change_max": 0.000392720103263855, + "reward_change_mean": -0.46231655217707157, + "reward_change_min": -0.7747361436486244, + "reward_change_std": 0.29503229446709156, + "reward_std": 0.5321423932909966, + "rewards/cosine_scaled_reward": -0.24262958019971848, + "rewards/format_reward": 0.7291666865348816, + "step": 316 + }, + { + "advantage_max": 1.9794776886701584, + "advantage_mean": 1.1175871006408045e-08, + "advantage_min": -0.7000977098941803, + "advantage_std": 0.9998548403382301, + "completion_length": 1663.4167175292969, + "epoch": 0.36228571428571427, + "grad_norm": 0.34562116861343384, + "kl": 0.0205841064453125, + "lambda_div_used": 0.5, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0008, + "reward": 0.04382408410310745, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04382408410310745, + "reward_after_std": 0.7862890549004078, + "reward_before_mean": 0.6326780554954894, + "reward_before_std": 0.602029662579298, + "reward_change_max": 0.0, + "reward_change_mean": -0.5888539738953114, + "reward_change_min": -0.8865768276154995, + "reward_change_std": 0.3419474679976702, + "reward_std": 0.7862890884280205, + "rewards/cosine_scaled_reward": -0.02741097833495587, + "rewards/format_reward": 0.6875000074505806, + "step": 317 + }, + { + "advantage_max": 1.9762675315141678, + "advantage_mean": -2.5456151353520085e-08, + "advantage_min": -0.6477275937795639, + "advantage_std": 0.9998506307601929, + "completion_length": 1250.791706085205, + "epoch": 0.36342857142857143, + "grad_norm": 0.3656529188156128, + "kl": 0.02227783203125, + "lambda_div_used": 0.5, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0009, + "reward": 0.05467936210334301, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05467936210334301, + "reward_after_std": 0.8251577764749527, + "reward_before_mean": 0.6445684731006622, + "reward_before_std": 0.6484403479844332, + "reward_change_max": 0.0, + "reward_change_mean": -0.5898891296237707, + "reward_change_min": -0.9933161847293377, + "reward_change_std": 0.3516414873301983, + "reward_std": 0.8251578062772751, + "rewards/cosine_scaled_reward": -0.1256324439891614, + "rewards/format_reward": 0.895833333954215, + "step": 318 + }, + { + "advantage_max": 1.9682789146900177, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.7039951980113983, + "advantage_std": 0.9998318552970886, + "completion_length": 1496.895881652832, + "epoch": 0.36457142857142855, + "grad_norm": 0.3081127107143402, + "kl": 0.011016845703125, + "lambda_div_used": 0.5, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0004, + "reward": -0.004789367318153381, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.004789367318153381, + "reward_after_std": 0.6943408586084843, + "reward_before_mean": 0.5780723858624697, + "reward_before_std": 0.5280365757644176, + "reward_change_max": 0.0, + "reward_change_mean": -0.5828617438673973, + "reward_change_min": -0.9427573829889297, + "reward_change_std": 0.335221491754055, + "reward_std": 0.6943408660590649, + "rewards/cosine_scaled_reward": -0.14846382848918438, + "rewards/format_reward": 0.8750000074505806, + "step": 319 + }, + { + "advantage_max": 1.9441251009702682, + "advantage_mean": -1.6142924885720333e-08, + "advantage_min": -0.7833335176110268, + "advantage_std": 0.9998515844345093, + "completion_length": 1153.4791870117188, + "epoch": 0.3657142857142857, + "grad_norm": 0.3333378732204437, + "kl": 0.0160675048828125, + "lambda_div_used": 0.5, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0006, + "reward": 0.20300496055278927, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20300496055278927, + "reward_after_std": 0.7820871509611607, + "reward_before_mean": 0.9307644553482533, + "reward_before_std": 0.6208079401403666, + "reward_change_max": 0.0, + "reward_change_mean": -0.7277594916522503, + "reward_change_min": -1.1636041551828384, + "reward_change_std": 0.4276655428111553, + "reward_std": 0.7820871770381927, + "rewards/cosine_scaled_reward": -0.0033677939791232347, + "rewards/format_reward": 0.9375, + "step": 320 + }, + { + "advantage_max": 1.9495942294597626, + "advantage_mean": -1.614292477469803e-08, + "advantage_min": -0.7533436268568039, + "advantage_std": 0.9998676404356956, + "completion_length": 1021.6666946411133, + "epoch": 0.3668571428571429, + "grad_norm": 0.2943817377090454, + "kl": 0.00847625732421875, + "lambda_div_used": 0.5, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0003, + "reward": 0.3986971661215648, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3986971661215648, + "reward_after_std": 0.8560882285237312, + "reward_before_mean": 1.268271841108799, + "reward_before_std": 0.6319062225520611, + "reward_change_max": 0.0, + "reward_change_mean": -0.8695746436715126, + "reward_change_min": -1.3420241624116898, + "reward_change_std": 0.5012955367565155, + "reward_std": 0.8560882434248924, + "rewards/cosine_scaled_reward": 0.13413588888943195, + "rewards/format_reward": 1.0, + "step": 321 + }, + { + "advantage_max": 1.9425024837255478, + "advantage_mean": 1.0554989438027462e-08, + "advantage_min": -0.6427893787622452, + "advantage_std": 0.9998832494020462, + "completion_length": 1422.2292137145996, + "epoch": 0.368, + "grad_norm": 0.36897826194763184, + "kl": 0.026554107666015625, + "lambda_div_used": 0.5, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0011, + "reward": 0.06227410305291414, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06227410305291414, + "reward_after_std": 1.005720667541027, + "reward_before_mean": 0.6011641100049019, + "reward_before_std": 0.9237909056246281, + "reward_change_max": 0.0, + "reward_change_mean": -0.5388900116086006, + "reward_change_min": -1.156266689300537, + "reward_change_std": 0.4076250493526459, + "reward_std": 1.0057206749916077, + "rewards/cosine_scaled_reward": -0.11608462547883391, + "rewards/format_reward": 0.8333333432674408, + "step": 322 + }, + { + "advantage_max": 1.95187209546566, + "advantage_mean": -2.0799538202886936e-08, + "advantage_min": -0.8534364998340607, + "advantage_std": 0.9998331591486931, + "completion_length": 1464.7500305175781, + "epoch": 0.36914285714285716, + "grad_norm": 0.40443792939186096, + "kl": 0.021869659423828125, + "lambda_div_used": 0.5, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0009, + "reward": 0.11423857533372939, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11423857533372939, + "reward_after_std": 0.6722201742231846, + "reward_before_mean": 0.8066409444436431, + "reward_before_std": 0.49490773119032383, + "reward_change_max": 0.0024028271436691284, + "reward_change_mean": -0.6924023889005184, + "reward_change_min": -1.0461347699165344, + "reward_change_std": 0.40142657794058323, + "reward_std": 0.6722202003002167, + "rewards/cosine_scaled_reward": -0.013346204534173012, + "rewards/format_reward": 0.8333333395421505, + "step": 323 + }, + { + "advantage_max": 1.8911200314760208, + "advantage_mean": 1.3038516377683607e-08, + "advantage_min": -0.9142744615674019, + "advantage_std": 0.9998573586344719, + "completion_length": 1039.583366394043, + "epoch": 0.3702857142857143, + "grad_norm": 0.3049314618110657, + "kl": 0.01213836669921875, + "lambda_div_used": 0.5, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0005, + "reward": 0.03350969776511192, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03350969776511192, + "reward_after_std": 0.7819260433316231, + "reward_before_mean": 0.6313423737883568, + "reward_before_std": 0.7142659351229668, + "reward_change_max": 0.0011547952890396118, + "reward_change_mean": -0.5978326722979546, + "reward_change_min": -1.0388338789343834, + "reward_change_std": 0.3994863033294678, + "reward_std": 0.7819260433316231, + "rewards/cosine_scaled_reward": -0.1426621489226818, + "rewards/format_reward": 0.9166666865348816, + "step": 324 + }, + { + "advantage_max": 1.9915322363376617, + "advantage_mean": 2.980232371996294e-08, + "advantage_min": -0.6144200935959816, + "advantage_std": 0.9998652711510658, + "completion_length": 1653.1250305175781, + "epoch": 0.37142857142857144, + "grad_norm": 0.26348766684532166, + "kl": 0.014278411865234375, + "lambda_div_used": 0.5, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0006, + "reward": 0.17073870450258255, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17073870450258255, + "reward_after_std": 0.8961613662540913, + "reward_before_mean": 0.835974670946598, + "reward_before_std": 0.6641275025904179, + "reward_change_max": 0.0009778067469596863, + "reward_change_mean": -0.6652359329164028, + "reward_change_min": -1.0335832685232162, + "reward_change_std": 0.38337370892986655, + "reward_std": 0.8961613737046719, + "rewards/cosine_scaled_reward": 0.02215397759573534, + "rewards/format_reward": 0.7916666679084301, + "step": 325 + }, + { + "advantage_max": 1.895740658044815, + "advantage_mean": 1.6142924996742636e-08, + "advantage_min": -0.8704542517662048, + "advantage_std": 0.9998702183365822, + "completion_length": 1409.4375305175781, + "epoch": 0.37257142857142855, + "grad_norm": 0.43456870317459106, + "kl": 0.022167205810546875, + "lambda_div_used": 0.5, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0009, + "reward": 0.27470920979976654, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.27470920979976654, + "reward_after_std": 0.9201184548437595, + "reward_before_mean": 1.0299480855464935, + "reward_before_std": 0.8468910120427608, + "reward_change_max": 0.0, + "reward_change_mean": -0.7552388813346624, + "reward_change_min": -1.3375147059559822, + "reward_change_std": 0.5060300789773464, + "reward_std": 0.9201184548437595, + "rewards/cosine_scaled_reward": 0.0983073660172522, + "rewards/format_reward": 0.8333333469927311, + "step": 326 + }, + { + "advantage_max": 1.9163780510425568, + "advantage_mean": -5.2774945524802774e-09, + "advantage_min": -0.8060482665896416, + "advantage_std": 0.9998558238148689, + "completion_length": 1544.0000305175781, + "epoch": 0.3737142857142857, + "grad_norm": 0.24153485894203186, + "kl": 0.01259613037109375, + "lambda_div_used": 0.5, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0005, + "reward": 0.2656072140671313, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2656072140671313, + "reward_after_std": 0.7916287072002888, + "reward_before_mean": 1.0449545159935951, + "reward_before_std": 0.6116082668304443, + "reward_change_max": 0.0, + "reward_change_mean": -0.7793473340570927, + "reward_change_min": -1.2160406894981861, + "reward_change_std": 0.4596743304282427, + "reward_std": 0.7916287481784821, + "rewards/cosine_scaled_reward": 0.11622725054621696, + "rewards/format_reward": 0.8125000074505806, + "step": 327 + }, + { + "advantage_max": 1.9556438773870468, + "advantage_mean": 9.623666974434286e-09, + "advantage_min": -0.7209471166133881, + "advantage_std": 0.999858446419239, + "completion_length": 1625.8958587646484, + "epoch": 0.37485714285714283, + "grad_norm": 0.4186374545097351, + "kl": 0.0170440673828125, + "lambda_div_used": 0.5, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0007, + "reward": -0.05386994406580925, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05386994406580925, + "reward_after_std": 0.8116550669074059, + "reward_before_mean": 0.45288407802581787, + "reward_before_std": 0.7180973216891289, + "reward_change_max": 0.00030559301376342773, + "reward_change_mean": -0.5067540071904659, + "reward_change_min": -0.9800437428057194, + "reward_change_std": 0.35215629637241364, + "reward_std": 0.8116550669074059, + "rewards/cosine_scaled_reward": -0.15897464100271463, + "rewards/format_reward": 0.770833345130086, + "step": 328 + }, + { + "advantage_max": 1.9007090032100677, + "advantage_mean": 1.4280280180578586e-08, + "advantage_min": -0.9063881933689117, + "advantage_std": 0.9998352080583572, + "completion_length": 1053.7916793823242, + "epoch": 0.376, + "grad_norm": 0.29686838388442993, + "kl": 0.012237548828125, + "lambda_div_used": 0.5, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0005, + "reward": 0.19925944739952683, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19925944739952683, + "reward_after_std": 0.7242099829018116, + "reward_before_mean": 0.9510072600096464, + "reward_before_std": 0.6169087514281273, + "reward_change_max": 0.0, + "reward_change_mean": -0.7517478205263615, + "reward_change_min": -1.2093592062592506, + "reward_change_std": 0.4579017572104931, + "reward_std": 0.7242099903523922, + "rewards/cosine_scaled_reward": -0.003663059324026108, + "rewards/format_reward": 0.9583333432674408, + "step": 329 + }, + { + "advantage_max": 1.9330773949623108, + "advantage_mean": -4.346172088887101e-09, + "advantage_min": -0.779715783894062, + "advantage_std": 0.9998436868190765, + "completion_length": 1192.6667175292969, + "epoch": 0.37714285714285717, + "grad_norm": 0.44872358441352844, + "kl": 0.020023345947265625, + "lambda_div_used": 0.5, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0008, + "reward": 0.06654795771464705, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06654795771464705, + "reward_after_std": 0.7801444493234158, + "reward_before_mean": 0.6826637480407953, + "reward_before_std": 0.673134308308363, + "reward_change_max": 0.0, + "reward_change_mean": -0.616115789860487, + "reward_change_min": -1.0302981063723564, + "reward_change_std": 0.39053851924836636, + "reward_std": 0.7801444493234158, + "rewards/cosine_scaled_reward": -0.11700147949159145, + "rewards/format_reward": 0.916666679084301, + "step": 330 + }, + { + "advantage_max": 1.943891003727913, + "advantage_mean": 6.208816905051151e-09, + "advantage_min": -0.7582138329744339, + "advantage_std": 0.9998082295060158, + "completion_length": 1751.770866394043, + "epoch": 0.3782857142857143, + "grad_norm": 0.44164517521858215, + "kl": 0.018161773681640625, + "lambda_div_used": 0.5, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0007, + "reward": -0.15124160097911954, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15124160097911954, + "reward_after_std": 0.6162938810884953, + "reward_before_mean": 0.33426031470298767, + "reward_before_std": 0.49191057682037354, + "reward_change_max": 0.0008933767676353455, + "reward_change_mean": -0.4855019422248006, + "reward_change_min": -0.7947644628584385, + "reward_change_std": 0.29183477628976107, + "reward_std": 0.6162938885390759, + "rewards/cosine_scaled_reward": -0.18703650496900082, + "rewards/format_reward": 0.7083333414047956, + "step": 331 + }, + { + "advantage_max": 1.9274078607559204, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.6919441595673561, + "advantage_std": 0.9998692721128464, + "completion_length": 1333.5208587646484, + "epoch": 0.37942857142857145, + "grad_norm": 0.3751254975795746, + "kl": 0.015411376953125, + "lambda_div_used": 0.5, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0006, + "reward": 0.12781556928530335, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12781556928530335, + "reward_after_std": 0.889508455991745, + "reward_before_mean": 0.7664818149060011, + "reward_before_std": 0.8011388294398785, + "reward_change_max": 0.0011112168431282043, + "reward_change_mean": -0.6386662609875202, + "reward_change_min": -1.1745759025216103, + "reward_change_std": 0.44522836804389954, + "reward_std": 0.8895084857940674, + "rewards/cosine_scaled_reward": -0.05425910046324134, + "rewards/format_reward": 0.8750000037252903, + "step": 332 + }, + { + "advantage_max": 1.9375999569892883, + "advantage_mean": 2.374872591637267e-08, + "advantage_min": -0.7582232654094696, + "advantage_std": 0.9998498037457466, + "completion_length": 1146.5208740234375, + "epoch": 0.38057142857142856, + "grad_norm": 0.27129772305488586, + "kl": 0.011959075927734375, + "lambda_div_used": 0.5, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0005, + "reward": 0.1619847072288394, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1619847072288394, + "reward_after_std": 0.80493438616395, + "reward_before_mean": 0.8567570652812719, + "reward_before_std": 0.6830338407307863, + "reward_change_max": 0.0, + "reward_change_mean": -0.6947723813354969, + "reward_change_min": -1.1889399215579033, + "reward_change_std": 0.442412793636322, + "reward_std": 0.8049344308674335, + "rewards/cosine_scaled_reward": -0.050788127817213535, + "rewards/format_reward": 0.9583333358168602, + "step": 333 + }, + { + "advantage_max": 1.9378287494182587, + "advantage_mean": 3.539025866805545e-08, + "advantage_min": -0.7852133959531784, + "advantage_std": 0.9998596906661987, + "completion_length": 1976.9167175292969, + "epoch": 0.38171428571428573, + "grad_norm": 0.42113620042800903, + "kl": 0.026947021484375, + "lambda_div_used": 0.5, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.0011, + "reward": -0.13296086061745882, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13296086061745882, + "reward_after_std": 0.834507130086422, + "reward_before_mean": 0.300143308006227, + "reward_before_std": 0.76963946595788, + "reward_change_max": 0.0035352930426597595, + "reward_change_mean": -0.43310416489839554, + "reward_change_min": -0.8661820441484451, + "reward_change_std": 0.3252943940460682, + "reward_std": 0.8345071524381638, + "rewards/cosine_scaled_reward": -0.19367835018783808, + "rewards/format_reward": 0.6875000093132257, + "step": 334 + }, + { + "advantage_max": 1.897423803806305, + "advantage_mean": -2.1109979209121832e-08, + "advantage_min": -0.8362590447068214, + "advantage_std": 0.9998617917299271, + "completion_length": 1236.3750534057617, + "epoch": 0.38285714285714284, + "grad_norm": 0.4480448067188263, + "kl": 0.0216522216796875, + "lambda_div_used": 0.5, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0009, + "reward": 0.21662542037665844, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21662542037665844, + "reward_after_std": 0.8816086277365685, + "reward_before_mean": 0.9350887164473534, + "reward_before_std": 0.8172469660639763, + "reward_change_max": 0.0, + "reward_change_mean": -0.7184633240103722, + "reward_change_min": -1.356583371758461, + "reward_change_std": 0.49653930589556694, + "reward_std": 0.8816086612641811, + "rewards/cosine_scaled_reward": 0.009211016818881035, + "rewards/format_reward": 0.9166666716337204, + "step": 335 + }, + { + "advantage_max": 1.9454235136508942, + "advantage_mean": -3.4148496252939253e-09, + "advantage_min": -0.799261599779129, + "advantage_std": 0.9998770728707314, + "completion_length": 1380.0209121704102, + "epoch": 0.384, + "grad_norm": 0.43987566232681274, + "kl": 0.016510009765625, + "lambda_div_used": 0.5, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0007, + "reward": 0.2827145103365183, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2827145103365183, + "reward_after_std": 0.9053149372339249, + "reward_before_mean": 1.0367488861083984, + "reward_before_std": 0.719157699495554, + "reward_change_max": 0.0, + "reward_change_mean": -0.7540343515574932, + "reward_change_min": -1.2173069790005684, + "reward_change_std": 0.4567888453602791, + "reward_std": 0.9053149521350861, + "rewards/cosine_scaled_reward": 0.10170776396989822, + "rewards/format_reward": 0.8333333432674408, + "step": 336 + }, + { + "advantage_max": 1.9325546622276306, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.7538226917386055, + "advantage_std": 0.9998637139797211, + "completion_length": 1289.333381652832, + "epoch": 0.3851428571428571, + "grad_norm": 0.3083600103855133, + "kl": 0.012012481689453125, + "lambda_div_used": 0.5, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0005, + "reward": 0.17610874178353697, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17610874178353697, + "reward_after_std": 0.9225818663835526, + "reward_before_mean": 0.8412223365157843, + "reward_before_std": 0.7876620050519705, + "reward_change_max": 0.0, + "reward_change_mean": -0.6651136055588722, + "reward_change_min": -1.143676407635212, + "reward_change_std": 0.41685409285128117, + "reward_std": 0.9225818961858749, + "rewards/cosine_scaled_reward": -0.0689721773378551, + "rewards/format_reward": 0.9791666716337204, + "step": 337 + }, + { + "advantage_max": 1.9723588973283768, + "advantage_mean": -1.8626452602532595e-08, + "advantage_min": -0.734128400683403, + "advantage_std": 0.9998419284820557, + "completion_length": 1366.9375457763672, + "epoch": 0.3862857142857143, + "grad_norm": 0.34599220752716064, + "kl": 0.01892852783203125, + "lambda_div_used": 0.5, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0008, + "reward": 0.2359230676665902, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2359230676665902, + "reward_after_std": 0.7228243090212345, + "reward_before_mean": 1.0056013464927673, + "reward_before_std": 0.4889524318277836, + "reward_change_max": 0.001291126012802124, + "reward_change_mean": -0.7696783049032092, + "reward_change_min": -1.112348735332489, + "reward_change_std": 0.42689187824726105, + "reward_std": 0.7228243388235569, + "rewards/cosine_scaled_reward": 0.05488398531451821, + "rewards/format_reward": 0.895833333954215, + "step": 338 + }, + { + "advantage_max": 1.8810593783855438, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.88125079870224, + "advantage_std": 0.9998427778482437, + "completion_length": 1647.0000305175781, + "epoch": 0.38742857142857146, + "grad_norm": 0.32401907444000244, + "kl": 0.026760101318359375, + "lambda_div_used": 0.5, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0011, + "reward": 0.010632646270096302, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.010632646270096302, + "reward_after_std": 0.7342008873820305, + "reward_before_mean": 0.5953217758797109, + "reward_before_std": 0.6697694882750511, + "reward_change_max": 0.0, + "reward_change_mean": -0.5846891477704048, + "reward_change_min": -0.9830212518572807, + "reward_change_std": 0.38205210864543915, + "reward_std": 0.7342009283602238, + "rewards/cosine_scaled_reward": -0.12942244857549667, + "rewards/format_reward": 0.8541666865348816, + "step": 339 + }, + { + "advantage_max": 1.9713415503501892, + "advantage_mean": 2.4835267176115394e-09, + "advantage_min": -0.7005641125142574, + "advantage_std": 0.9998614490032196, + "completion_length": 1372.1875305175781, + "epoch": 0.38857142857142857, + "grad_norm": 0.24274250864982605, + "kl": 0.012508392333984375, + "lambda_div_used": 0.5, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0005, + "reward": 0.056589219719171524, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.056589219719171524, + "reward_after_std": 0.856462549418211, + "reward_before_mean": 0.6404929962009192, + "reward_before_std": 0.7010131273418665, + "reward_change_max": 0.0, + "reward_change_mean": -0.5839037746191025, + "reward_change_min": -0.9369787387549877, + "reward_change_std": 0.3534848652780056, + "reward_std": 0.8564625568687916, + "rewards/cosine_scaled_reward": -0.11725351912900805, + "rewards/format_reward": 0.8750000055879354, + "step": 340 + }, + { + "advantage_max": 1.9696991741657257, + "advantage_mean": -1.9247334503980085e-08, + "advantage_min": -0.6668812446296215, + "advantage_std": 0.9998695105314255, + "completion_length": 1139.9792022705078, + "epoch": 0.38971428571428574, + "grad_norm": 0.27058425545692444, + "kl": 0.01445770263671875, + "lambda_div_used": 0.5, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0006, + "reward": 0.24952181614935398, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.24952181614935398, + "reward_after_std": 0.9109466038644314, + "reward_before_mean": 0.9754927828907967, + "reward_before_std": 0.6992852129042149, + "reward_change_max": 0.0, + "reward_change_mean": -0.725970946252346, + "reward_change_min": -1.183552272617817, + "reward_change_std": 0.4466256331652403, + "reward_std": 0.9109466522932053, + "rewards/cosine_scaled_reward": 0.03982970770448446, + "rewards/format_reward": 0.8958333358168602, + "step": 341 + }, + { + "advantage_max": 1.898781567811966, + "advantage_mean": -1.2417635808503746e-09, + "advantage_min": -0.8452117443084717, + "advantage_std": 0.9998630881309509, + "completion_length": 1458.0833435058594, + "epoch": 0.39085714285714285, + "grad_norm": 0.3426169157028198, + "kl": 0.0243072509765625, + "lambda_div_used": 0.5, + "learning_rate": 3.471051066897562e-07, + "loss": 0.001, + "reward": 0.13850300945341587, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13850300945341587, + "reward_after_std": 0.8392068706452847, + "reward_before_mean": 0.800941426306963, + "reward_before_std": 0.7777084633708, + "reward_change_max": 8.752942085266113e-05, + "reward_change_mean": -0.662438403815031, + "reward_change_min": -1.175816796720028, + "reward_change_std": 0.4522414803504944, + "reward_std": 0.8392068967223167, + "rewards/cosine_scaled_reward": -0.047445970587432384, + "rewards/format_reward": 0.8958333432674408, + "step": 342 + }, + { + "advantage_max": 1.9382910281419754, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.7976772412657738, + "advantage_std": 0.9998707920312881, + "completion_length": 1346.5000305175781, + "epoch": 0.392, + "grad_norm": 0.32439133524894714, + "kl": 0.014873504638671875, + "lambda_div_used": 0.5, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0006, + "reward": 0.21847632061690092, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21847632061690092, + "reward_after_std": 0.9025693461298943, + "reward_before_mean": 0.9282007124274969, + "reward_before_std": 0.7880385238677263, + "reward_change_max": 0.0, + "reward_change_mean": -0.7097243778407574, + "reward_change_min": -1.207001969218254, + "reward_change_std": 0.4577541835606098, + "reward_std": 0.9025693461298943, + "rewards/cosine_scaled_reward": -0.00464966893196106, + "rewards/format_reward": 0.9375000149011612, + "step": 343 + }, + { + "advantage_max": 1.971865400671959, + "advantage_mean": -3.414849514271623e-08, + "advantage_min": -0.7260220609605312, + "advantage_std": 0.9998649135231972, + "completion_length": 1340.458351135254, + "epoch": 0.3931428571428571, + "grad_norm": 0.25376710295677185, + "kl": 0.017496109008789062, + "lambda_div_used": 0.5, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0007, + "reward": 0.4441331517882645, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4441331517882645, + "reward_after_std": 0.8279833234846592, + "reward_before_mean": 1.3637418150901794, + "reward_before_std": 0.5514478012919426, + "reward_change_max": 0.0, + "reward_change_mean": -0.9196087047457695, + "reward_change_min": -1.3315959051251411, + "reward_change_std": 0.5144347231835127, + "reward_std": 0.8279833309352398, + "rewards/cosine_scaled_reward": 0.2339542363770306, + "rewards/format_reward": 0.895833333954215, + "step": 344 + }, + { + "advantage_max": 1.8874634951353073, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.8752384632825851, + "advantage_std": 0.9998615980148315, + "completion_length": 1664.3125610351562, + "epoch": 0.3942857142857143, + "grad_norm": 0.4946304261684418, + "kl": 0.0244293212890625, + "lambda_div_used": 0.5, + "learning_rate": 3.387377967463493e-07, + "loss": 0.001, + "reward": 0.14852892188355327, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.14852892188355327, + "reward_after_std": 0.8501206710934639, + "reward_before_mean": 0.8226657584309578, + "reward_before_std": 0.7914411835372448, + "reward_change_max": 0.0013569816946983337, + "reward_change_mean": -0.6741368658840656, + "reward_change_min": -1.1830965094268322, + "reward_change_std": 0.467490840703249, + "reward_std": 0.8501207120716572, + "rewards/cosine_scaled_reward": 0.0050828717648983, + "rewards/format_reward": 0.812500013038516, + "step": 345 + }, + { + "advantage_max": 1.9018942415714264, + "advantage_mean": 6.208816238917336e-10, + "advantage_min": -0.8773997947573662, + "advantage_std": 0.9998459815979004, + "completion_length": 1397.2500228881836, + "epoch": 0.3954285714285714, + "grad_norm": 0.2937193214893341, + "kl": 0.01808929443359375, + "lambda_div_used": 0.5, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0007, + "reward": 0.0976130670751445, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0976130670751445, + "reward_after_std": 0.7293423525989056, + "reward_before_mean": 0.7637905050069094, + "reward_before_std": 0.6217326875776052, + "reward_change_max": 0.00037025660276412964, + "reward_change_mean": -0.6661774702370167, + "reward_change_min": -1.1007812693715096, + "reward_change_std": 0.4196817334741354, + "reward_std": 0.729342382401228, + "rewards/cosine_scaled_reward": -0.07643807306885719, + "rewards/format_reward": 0.9166666865348816, + "step": 346 + }, + { + "advantage_max": 1.9513923674821854, + "advantage_mean": 6.3640377412355065e-09, + "advantage_min": -0.766438364982605, + "advantage_std": 0.9998519346117973, + "completion_length": 1442.7083587646484, + "epoch": 0.3965714285714286, + "grad_norm": 0.3007684051990509, + "kl": 0.01589202880859375, + "lambda_div_used": 0.5, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0006, + "reward": 0.014781358651816845, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.014781358651816845, + "reward_after_std": 0.7751567997038364, + "reward_before_mean": 0.5895684882998466, + "reward_before_std": 0.6202553771436214, + "reward_change_max": 0.001266680657863617, + "reward_change_mean": -0.5747871249914169, + "reward_change_min": -0.8554559350013733, + "reward_change_std": 0.33717326261103153, + "reward_std": 0.7751568369567394, + "rewards/cosine_scaled_reward": -0.17396577447652817, + "rewards/format_reward": 0.9375000149011612, + "step": 347 + }, + { + "advantage_max": 1.9060746431350708, + "advantage_mean": 1.0477379408513343e-08, + "advantage_min": -0.8555322960019112, + "advantage_std": 0.9998508542776108, + "completion_length": 1647.4792251586914, + "epoch": 0.3977142857142857, + "grad_norm": 0.518498420715332, + "kl": 0.033458709716796875, + "lambda_div_used": 0.5, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0013, + "reward": 0.14236977510154247, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14236977510154247, + "reward_after_std": 0.7683847695589066, + "reward_before_mean": 0.834890816360712, + "reward_before_std": 0.6821971833705902, + "reward_change_max": 0.0012637749314308167, + "reward_change_mean": -0.6925210300832987, + "reward_change_min": -1.134917676448822, + "reward_change_std": 0.45355450361967087, + "reward_std": 0.7683847993612289, + "rewards/cosine_scaled_reward": 0.04244539514183998, + "rewards/format_reward": 0.750000013038516, + "step": 348 + }, + { + "advantage_max": 1.8973789811134338, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.843739926815033, + "advantage_std": 0.9998513013124466, + "completion_length": 1307.8125305175781, + "epoch": 0.39885714285714285, + "grad_norm": 0.30172500014305115, + "kl": 0.019256591796875, + "lambda_div_used": 0.5, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0008, + "reward": 0.258987728622742, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.258987728622742, + "reward_after_std": 0.80018550157547, + "reward_before_mean": 1.0375699400901794, + "reward_before_std": 0.7033422328531742, + "reward_change_max": 0.0, + "reward_change_mean": -0.7785822227597237, + "reward_change_min": -1.2887407094240189, + "reward_change_std": 0.4977263957262039, + "reward_std": 0.80018550157547, + "rewards/cosine_scaled_reward": 0.03961828793399036, + "rewards/format_reward": 0.9583333432674408, + "step": 349 + }, + { + "advantage_max": 1.965741217136383, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.6967620141804218, + "advantage_std": 0.9998733699321747, + "completion_length": 961.3958740234375, + "epoch": 0.4, + "grad_norm": 0.3641263246536255, + "kl": 0.0122528076171875, + "lambda_div_used": 0.5, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0005, + "reward": 0.1289899628609419, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1289899628609419, + "reward_after_std": 0.9607405439019203, + "reward_before_mean": 0.7350360294803977, + "reward_before_std": 0.7971515450626612, + "reward_change_max": 0.0033571943640708923, + "reward_change_mean": -0.6060460731387138, + "reward_change_min": -1.0657524466514587, + "reward_change_std": 0.3789667785167694, + "reward_std": 0.9607405923306942, + "rewards/cosine_scaled_reward": -0.12206532340496778, + "rewards/format_reward": 0.9791666716337204, + "step": 350 + }, + { + "advantage_max": 1.9018863588571548, + "advantage_mean": 9.235616160729876e-09, + "advantage_min": -0.7558177262544632, + "advantage_std": 0.999875046312809, + "completion_length": 1295.708366394043, + "epoch": 0.40114285714285713, + "grad_norm": 0.340808242559433, + "kl": 0.018463134765625, + "lambda_div_used": 0.5, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0007, + "reward": 0.23417375516146421, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23417375516146421, + "reward_after_std": 0.9156857281923294, + "reward_before_mean": 0.9571765847504139, + "reward_before_std": 0.858966302126646, + "reward_change_max": 0.0011729896068572998, + "reward_change_mean": -0.7230028323829174, + "reward_change_min": -1.339646216481924, + "reward_change_std": 0.5007602255791426, + "reward_std": 0.9156857430934906, + "rewards/cosine_scaled_reward": 0.04108827468007803, + "rewards/format_reward": 0.8750000055879354, + "step": 351 + }, + { + "advantage_max": 1.9002157002687454, + "advantage_mean": 9.934107758624577e-09, + "advantage_min": -0.7841279283165932, + "advantage_std": 0.9998011961579323, + "completion_length": 1612.3125305175781, + "epoch": 0.4022857142857143, + "grad_norm": 0.6093102097511292, + "kl": 0.02661895751953125, + "lambda_div_used": 0.5, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0011, + "reward": -0.020187399117276073, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.020187399117276073, + "reward_after_std": 0.5602889284491539, + "reward_before_mean": 0.5988853393937461, + "reward_before_std": 0.4055903349071741, + "reward_change_max": 0.0, + "reward_change_mean": -0.6190727520734072, + "reward_change_min": -0.973639614880085, + "reward_change_std": 0.3668802008032799, + "reward_std": 0.5602889433503151, + "rewards/cosine_scaled_reward": -0.06514065247029066, + "rewards/format_reward": 0.7291666697710752, + "step": 352 + }, + { + "advantage_max": 1.9760214239358902, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.647670142352581, + "advantage_std": 0.9998504817485809, + "completion_length": 980.2083702087402, + "epoch": 0.4034285714285714, + "grad_norm": 0.40995076298713684, + "kl": 0.00830078125, + "lambda_div_used": 0.5, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0003, + "reward": 0.362104510422796, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.362104510422796, + "reward_after_std": 0.7626262679696083, + "reward_before_mean": 1.230547845363617, + "reward_before_std": 0.48653218522667885, + "reward_change_max": 0.0, + "reward_change_mean": -0.8684433326125145, + "reward_change_min": -1.3135455027222633, + "reward_change_std": 0.4772064797580242, + "reward_std": 0.7626262977719307, + "rewards/cosine_scaled_reward": 0.11527392640709877, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "advantage_max": 1.9692391902208328, + "advantage_mean": -1.241763691872677e-09, + "advantage_min": -0.7419257685542107, + "advantage_std": 0.9998397529125214, + "completion_length": 1016.020866394043, + "epoch": 0.4045714285714286, + "grad_norm": 0.2783952057361603, + "kl": 0.013134002685546875, + "lambda_div_used": 0.5, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0005, + "reward": 0.2590313320979476, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2590313320979476, + "reward_after_std": 0.7170304656028748, + "reward_before_mean": 1.0561553873121738, + "reward_before_std": 0.4702160977758467, + "reward_change_max": 0.0, + "reward_change_mean": -0.7971240431070328, + "reward_change_min": -1.1697766482830048, + "reward_change_std": 0.4459417313337326, + "reward_std": 0.7170304767787457, + "rewards/cosine_scaled_reward": 0.05932767526246607, + "rewards/format_reward": 0.9375000149011612, + "step": 354 + }, + { + "advantage_max": 1.9122939109802246, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.7931451685726643, + "advantage_std": 0.9998869970440865, + "completion_length": 1000.5625305175781, + "epoch": 0.4057142857142857, + "grad_norm": 0.2880876064300537, + "kl": 0.010530471801757812, + "lambda_div_used": 0.5, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0004, + "reward": 0.256419240264222, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.256419240264222, + "reward_after_std": 0.9639424160122871, + "reward_before_mean": 0.9679636843502522, + "reward_before_std": 0.8596122078597546, + "reward_change_max": 0.0, + "reward_change_mean": -0.7115443907678127, + "reward_change_min": -1.2375174909830093, + "reward_change_std": 0.46681670658290386, + "reward_std": 0.9639424830675125, + "rewards/cosine_scaled_reward": -0.005601532757282257, + "rewards/format_reward": 0.9791666716337204, + "step": 355 + }, + { + "advantage_max": 1.9162172675132751, + "advantage_mean": -1.8626451603331873e-08, + "advantage_min": -0.8011289536952972, + "advantage_std": 0.999864473938942, + "completion_length": 1473.395851135254, + "epoch": 0.40685714285714286, + "grad_norm": 0.27427616715431213, + "kl": 0.026065826416015625, + "lambda_div_used": 0.5, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.001, + "reward": 0.15733138285577297, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.15733138285577297, + "reward_after_std": 0.8644469156861305, + "reward_before_mean": 0.8291059145703912, + "reward_before_std": 0.7782079391181469, + "reward_change_max": 0.0, + "reward_change_mean": -0.6717745885252953, + "reward_change_min": -1.1893984526395798, + "reward_change_std": 0.45820096507668495, + "reward_std": 0.8644469529390335, + "rewards/cosine_scaled_reward": -0.022947038523852825, + "rewards/format_reward": 0.875, + "step": 356 + }, + { + "advantage_max": 1.9351384490728378, + "advantage_mean": 2.1730860333413204e-08, + "advantage_min": -0.8235296234488487, + "advantage_std": 0.9998565465211868, + "completion_length": 1561.2500457763672, + "epoch": 0.408, + "grad_norm": 0.3253626823425293, + "kl": 0.024013519287109375, + "lambda_div_used": 0.5, + "learning_rate": 3.062313053727671e-07, + "loss": 0.001, + "reward": -0.026991624385118484, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.026991624385118484, + "reward_after_std": 0.8327379450201988, + "reward_before_mean": 0.4940991383045912, + "reward_before_std": 0.7405038252472878, + "reward_change_max": 0.0, + "reward_change_mean": -0.5210907440632582, + "reward_change_min": -0.9241106547415257, + "reward_change_std": 0.3496807739138603, + "reward_std": 0.83273795992136, + "rewards/cosine_scaled_reward": -0.13836710306350142, + "rewards/format_reward": 0.7708333488553762, + "step": 357 + }, + { + "advantage_max": 1.906896635890007, + "advantage_mean": 3.104408619059029e-09, + "advantage_min": -0.7559650018811226, + "advantage_std": 0.9998820126056671, + "completion_length": 1381.4166793823242, + "epoch": 0.40914285714285714, + "grad_norm": 0.24099020659923553, + "kl": 0.015361785888671875, + "lambda_div_used": 0.5, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0006, + "reward": 0.367542517837137, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.367542517837137, + "reward_after_std": 0.971061497926712, + "reward_before_mean": 1.1805765418102965, + "reward_before_std": 0.8555642701685429, + "reward_change_max": 0.0, + "reward_change_mean": -0.8130340054631233, + "reward_change_min": -1.4749982208013535, + "reward_change_std": 0.5291310101747513, + "reward_std": 0.9710615314543247, + "rewards/cosine_scaled_reward": 0.12153824418783188, + "rewards/format_reward": 0.9375, + "step": 358 + }, + { + "advantage_max": 1.9519437849521637, + "advantage_mean": 9.313226023710541e-09, + "advantage_min": -0.7854745984077454, + "advantage_std": 0.9998278766870499, + "completion_length": 960.1666946411133, + "epoch": 0.4102857142857143, + "grad_norm": 0.4961448311805725, + "kl": 0.017177581787109375, + "lambda_div_used": 0.5, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0007, + "reward": 0.07578269951045513, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07578269951045513, + "reward_after_std": 0.6794970296323299, + "reward_before_mean": 0.7309711538255215, + "reward_before_std": 0.5242785401642323, + "reward_change_max": 0.0, + "reward_change_mean": -0.6551884561777115, + "reward_change_min": -0.9929383620619774, + "reward_change_std": 0.38128501921892166, + "reward_std": 0.679497055709362, + "rewards/cosine_scaled_reward": -0.12409775704145432, + "rewards/format_reward": 0.9791666716337204, + "step": 359 + }, + { + "advantage_max": 1.921691581606865, + "advantage_mean": -3.849466723160333e-08, + "advantage_min": -0.8046199455857277, + "advantage_std": 0.9998729974031448, + "completion_length": 1303.9375381469727, + "epoch": 0.4114285714285714, + "grad_norm": 0.33698728680610657, + "kl": 0.0240478515625, + "lambda_div_used": 0.5, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.001, + "reward": 0.4048253740184009, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4048253740184009, + "reward_after_std": 0.8957697823643684, + "reward_before_mean": 1.270486131310463, + "reward_before_std": 0.7351746968924999, + "reward_change_max": 0.0, + "reward_change_mean": -0.865660771727562, + "reward_change_min": -1.4209284782409668, + "reward_change_std": 0.535039871931076, + "reward_std": 0.895769789814949, + "rewards/cosine_scaled_reward": 0.14565971928823274, + "rewards/format_reward": 0.9791666716337204, + "step": 360 + }, + { + "advantage_max": 1.9461275935173035, + "advantage_mean": -2.0178656301439446e-08, + "advantage_min": -0.6842218115925789, + "advantage_std": 0.9998639598488808, + "completion_length": 1284.0625305175781, + "epoch": 0.4125714285714286, + "grad_norm": 0.2980879545211792, + "kl": 0.020538330078125, + "lambda_div_used": 0.5, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0008, + "reward": 0.20743734575808048, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.20743734575808048, + "reward_after_std": 0.8618561178445816, + "reward_before_mean": 0.9113494399935007, + "reward_before_std": 0.6808569990098476, + "reward_change_max": 0.0015008747577667236, + "reward_change_mean": -0.7039121389389038, + "reward_change_min": -1.143224611878395, + "reward_change_std": 0.4279782176017761, + "reward_std": 0.8618561401963234, + "rewards/cosine_scaled_reward": -0.013075282797217369, + "rewards/format_reward": 0.9375000074505806, + "step": 361 + }, + { + "advantage_max": 1.9330978840589523, + "advantage_mean": 1.8626450937198058e-09, + "advantage_min": -0.8092173300683498, + "advantage_std": 0.9998206868767738, + "completion_length": 880.7500190734863, + "epoch": 0.4137142857142857, + "grad_norm": 0.3951859474182129, + "kl": 0.01638031005859375, + "lambda_div_used": 0.5, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0007, + "reward": 0.1559063233435154, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1559063233435154, + "reward_after_std": 0.647941593080759, + "reward_before_mean": 0.8897985154762864, + "reward_before_std": 0.4787643002346158, + "reward_change_max": 0.0, + "reward_change_mean": -0.7338921837508678, + "reward_change_min": -1.1151539906859398, + "reward_change_std": 0.4269598387181759, + "reward_std": 0.6479416117072105, + "rewards/cosine_scaled_reward": -0.023850757628679276, + "rewards/format_reward": 0.9375, + "step": 362 + }, + { + "advantage_max": 1.9025491178035736, + "advantage_mean": -3.476937759927523e-08, + "advantage_min": -0.7924175783991814, + "advantage_std": 0.9998592659831047, + "completion_length": 936.8750152587891, + "epoch": 0.41485714285714287, + "grad_norm": 0.35128331184387207, + "kl": 0.013484954833984375, + "lambda_div_used": 0.5, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0005, + "reward": 0.3590309312567115, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3590309312567115, + "reward_after_std": 0.7893019616603851, + "reward_before_mean": 1.216321088373661, + "reward_before_std": 0.6109752170741558, + "reward_change_max": 0.0, + "reward_change_mean": -0.8572902157902718, + "reward_change_min": -1.3023911118507385, + "reward_change_std": 0.5073912441730499, + "reward_std": 0.7893019765615463, + "rewards/cosine_scaled_reward": 0.11857721768319607, + "rewards/format_reward": 0.9791666716337204, + "step": 363 + }, + { + "advantage_max": 1.9401460587978363, + "advantage_mean": 1.3969839451899446e-09, + "advantage_min": -0.7830497920513153, + "advantage_std": 0.9998208284378052, + "completion_length": 1427.3542098999023, + "epoch": 0.416, + "grad_norm": 0.41800156235694885, + "kl": 0.02063751220703125, + "lambda_div_used": 0.5, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0008, + "reward": -0.08546914509497583, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08546914509497583, + "reward_after_std": 0.6147534213960171, + "reward_before_mean": 0.45134393498301506, + "reward_before_std": 0.49978536926209927, + "reward_change_max": 0.0, + "reward_change_mean": -0.5368130728602409, + "reward_change_min": -0.8912122398614883, + "reward_change_std": 0.32553112506866455, + "reward_std": 0.6147534511983395, + "rewards/cosine_scaled_reward": -0.2326613813638687, + "rewards/format_reward": 0.916666679084301, + "step": 364 + }, + { + "advantage_max": 1.9052964746952057, + "advantage_mean": -1.2417632477834672e-09, + "advantage_min": -0.7776142209768295, + "advantage_std": 0.9998699054121971, + "completion_length": 1549.3750381469727, + "epoch": 0.41714285714285715, + "grad_norm": 0.5369855165481567, + "kl": 0.035430908203125, + "lambda_div_used": 0.5, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0014, + "reward": 0.005877653602510691, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.005877653602510691, + "reward_after_std": 0.8726274520158768, + "reward_before_mean": 0.5426001232117414, + "reward_before_std": 0.8404004983603954, + "reward_change_max": 0.0007102638483047485, + "reward_change_mean": -0.5367224644869566, + "reward_change_min": -1.1121207065880299, + "reward_change_std": 0.41939173452556133, + "reward_std": 0.8726274818181992, + "rewards/cosine_scaled_reward": -0.0932832807302475, + "rewards/format_reward": 0.7291666697710752, + "step": 365 + }, + { + "advantage_max": 1.9043861776590347, + "advantage_mean": 6.208818126296478e-09, + "advantage_min": -0.8616622500121593, + "advantage_std": 0.9998663142323494, + "completion_length": 1251.1667022705078, + "epoch": 0.41828571428571426, + "grad_norm": 0.3164905607700348, + "kl": 0.0154266357421875, + "lambda_div_used": 0.5, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0006, + "reward": 0.32734458870254457, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.32734458870254457, + "reward_after_std": 0.8601884730160236, + "reward_before_mean": 1.138159309513867, + "reward_before_std": 0.7124536950141191, + "reward_change_max": 0.0, + "reward_change_mean": -0.8108147121965885, + "reward_change_min": -1.284881740808487, + "reward_change_std": 0.5022755339741707, + "reward_std": 0.8601884730160236, + "rewards/cosine_scaled_reward": 0.10032964125275612, + "rewards/format_reward": 0.9375000074505806, + "step": 366 + }, + { + "advantage_max": 1.9257488250732422, + "advantage_mean": -1.8626452602532595e-09, + "advantage_min": -0.8024434819817543, + "advantage_std": 0.9998431578278542, + "completion_length": 1324.3958892822266, + "epoch": 0.41942857142857143, + "grad_norm": 0.3114745616912842, + "kl": 0.015369415283203125, + "lambda_div_used": 0.5, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0006, + "reward": 0.008062966400757432, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.008062966400757432, + "reward_after_std": 0.7330645881593227, + "reward_before_mean": 0.593254167586565, + "reward_before_std": 0.6409398391842842, + "reward_change_max": 0.0, + "reward_change_mean": -0.5851911939680576, + "reward_change_min": -1.088577315211296, + "reward_change_std": 0.3858226127922535, + "reward_std": 0.7330645956099033, + "rewards/cosine_scaled_reward": -0.16170626878738403, + "rewards/format_reward": 0.916666679084301, + "step": 367 + }, + { + "advantage_max": 1.961761862039566, + "advantage_mean": -6.208817127095756e-09, + "advantage_min": -0.6810972690582275, + "advantage_std": 0.9998715221881866, + "completion_length": 1611.4792098999023, + "epoch": 0.4205714285714286, + "grad_norm": 0.6100217700004578, + "kl": 0.026096343994140625, + "lambda_div_used": 0.5, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.001, + "reward": 0.0674855774268508, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0674855774268508, + "reward_after_std": 0.9106010496616364, + "reward_before_mean": 0.6422249140887288, + "reward_before_std": 0.7696337550878525, + "reward_change_max": 0.0, + "reward_change_mean": -0.5747393220663071, + "reward_change_min": -1.0176613926887512, + "reward_change_std": 0.3697497956454754, + "reward_std": 0.9106010720133781, + "rewards/cosine_scaled_reward": -0.10597089910879731, + "rewards/format_reward": 0.8541666679084301, + "step": 368 + }, + { + "advantage_max": 1.9500256478786469, + "advantage_mean": 1.7074247571358114e-09, + "advantage_min": -0.7334987670183182, + "advantage_std": 0.9998728185892105, + "completion_length": 1466.7291946411133, + "epoch": 0.4217142857142857, + "grad_norm": 0.48714351654052734, + "kl": 0.028980255126953125, + "lambda_div_used": 0.5, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0012, + "reward": 0.13486522855237126, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13486522855237126, + "reward_after_std": 0.9019280709326267, + "reward_before_mean": 0.7667449675500393, + "reward_before_std": 0.7737538255751133, + "reward_change_max": 0.0004113316535949707, + "reward_change_mean": -0.6318797618150711, + "reward_change_min": -1.1101751253008842, + "reward_change_std": 0.41809099167585373, + "reward_std": 0.9019280709326267, + "rewards/cosine_scaled_reward": -0.06454417761415243, + "rewards/format_reward": 0.8958333432674408, + "step": 369 + }, + { + "advantage_max": 2.000591605901718, + "advantage_mean": -9.934107314535368e-09, + "advantage_min": -0.6458289884030819, + "advantage_std": 0.9998491033911705, + "completion_length": 1292.6667213439941, + "epoch": 0.4228571428571429, + "grad_norm": 0.3272714614868164, + "kl": 0.024097442626953125, + "lambda_div_used": 0.5, + "learning_rate": 2.729523361034538e-07, + "loss": 0.001, + "reward": 0.11595443380065262, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11595443380065262, + "reward_after_std": 0.7612169794738293, + "reward_before_mean": 0.7688500918447971, + "reward_before_std": 0.5021704901009798, + "reward_change_max": 0.0, + "reward_change_mean": -0.6528956815600395, + "reward_change_min": -0.9266734272241592, + "reward_change_std": 0.3518510889261961, + "reward_std": 0.7612170018255711, + "rewards/cosine_scaled_reward": -0.07390830665826797, + "rewards/format_reward": 0.9166666865348816, + "step": 370 + }, + { + "advantage_max": 1.9721637219190598, + "advantage_mean": -2.7318796669284495e-08, + "advantage_min": -0.6613831929862499, + "advantage_std": 0.9998557940125465, + "completion_length": 752.2500305175781, + "epoch": 0.424, + "grad_norm": 0.3771182894706726, + "kl": 0.013050079345703125, + "lambda_div_used": 0.5, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0005, + "reward": 0.4583681761287153, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4583681761287153, + "reward_after_std": 0.8134524710476398, + "reward_before_mean": 1.3909370601177216, + "reward_before_std": 0.5117593847680837, + "reward_change_max": 0.0, + "reward_change_mean": -0.9325688779354095, + "reward_change_min": -1.3584751039743423, + "reward_change_std": 0.524842644110322, + "reward_std": 0.8134525120258331, + "rewards/cosine_scaled_reward": 0.2058851895853877, + "rewards/format_reward": 0.9791666716337204, + "step": 371 + }, + { + "advantage_max": 1.9488580971956253, + "advantage_mean": -8.692344177774203e-09, + "advantage_min": -0.7659207582473755, + "advantage_std": 0.9998446479439735, + "completion_length": 1342.9375381469727, + "epoch": 0.42514285714285716, + "grad_norm": 0.240267813205719, + "kl": 0.01468658447265625, + "lambda_div_used": 0.5, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0006, + "reward": 0.24963407404720783, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.24963407404720783, + "reward_after_std": 0.7213221676647663, + "reward_before_mean": 1.0393864251673222, + "reward_before_std": 0.5199251472949982, + "reward_change_max": 0.0, + "reward_change_mean": -0.7897523567080498, + "reward_change_min": -1.1659668758511543, + "reward_change_std": 0.4499006439000368, + "reward_std": 0.7213221788406372, + "rewards/cosine_scaled_reward": 0.030109863728284836, + "rewards/format_reward": 0.9791666716337204, + "step": 372 + }, + { + "advantage_max": 1.958022728562355, + "advantage_mean": -3.104408563547878e-09, + "advantage_min": -0.677016519010067, + "advantage_std": 0.99983249604702, + "completion_length": 759.9166946411133, + "epoch": 0.42628571428571427, + "grad_norm": 0.37422969937324524, + "kl": 0.011119842529296875, + "lambda_div_used": 0.5, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0004, + "reward": 0.19378744415007532, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19378744415007532, + "reward_after_std": 0.8117514979094267, + "reward_before_mean": 0.8973379731178284, + "reward_before_std": 0.6298465020954609, + "reward_change_max": 0.0, + "reward_change_mean": -0.7035505324602127, + "reward_change_min": -1.1113643571734428, + "reward_change_std": 0.41324375942349434, + "reward_std": 0.811751514673233, + "rewards/cosine_scaled_reward": -0.05133102275431156, + "rewards/format_reward": 1.0, + "step": 373 + }, + { + "advantage_max": 1.9328200817108154, + "advantage_mean": -7.140139812733537e-09, + "advantage_min": -0.865662969648838, + "advantage_std": 0.9998394921422005, + "completion_length": 1035.5416870117188, + "epoch": 0.42742857142857144, + "grad_norm": 0.3418057858943939, + "kl": 0.011562347412109375, + "lambda_div_used": 0.5, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0005, + "reward": 0.2122154445387423, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2122154445387423, + "reward_after_std": 0.729767944663763, + "reward_before_mean": 0.9647579044103622, + "reward_before_std": 0.5690576434135437, + "reward_change_max": 0.0, + "reward_change_mean": -0.7525424435734749, + "reward_change_min": -1.1482946649193764, + "reward_change_std": 0.44476964697241783, + "reward_std": 0.7297679595649242, + "rewards/cosine_scaled_reward": -0.007204409688711166, + "rewards/format_reward": 0.9791666716337204, + "step": 374 + }, + { + "advantage_max": 1.9218790829181671, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.7422648146748543, + "advantage_std": 0.9998613074421883, + "completion_length": 1885.0625228881836, + "epoch": 0.42857142857142855, + "grad_norm": 0.36158016324043274, + "kl": 0.052459716796875, + "lambda_div_used": 0.5, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0021, + "reward": 0.16519121266901493, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16519121266901493, + "reward_after_std": 0.9700677208602428, + "reward_before_mean": 0.7985913008451462, + "reward_before_std": 0.8461023792624474, + "reward_change_max": 0.0019219592213630676, + "reward_change_mean": -0.6334001235663891, + "reward_change_min": -1.1537334434688091, + "reward_change_std": 0.44267112016677856, + "reward_std": 0.9700677394866943, + "rewards/cosine_scaled_reward": 0.08679564902558923, + "rewards/format_reward": 0.6250000074505806, + "step": 375 + }, + { + "advantage_max": 1.9197021126747131, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.8161247000098228, + "advantage_std": 0.999818779528141, + "completion_length": 1328.5625534057617, + "epoch": 0.4297142857142857, + "grad_norm": 0.3313137888908386, + "kl": 0.02817535400390625, + "lambda_div_used": 0.5, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0011, + "reward": 0.03532506921328604, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.03532506921328604, + "reward_after_std": 0.5968778096139431, + "reward_before_mean": 0.6952200355008245, + "reward_before_std": 0.48110751807689667, + "reward_change_max": 0.0, + "reward_change_mean": -0.6598949693143368, + "reward_change_min": -1.0436818599700928, + "reward_change_std": 0.3981757313013077, + "reward_std": 0.5968778170645237, + "rewards/cosine_scaled_reward": -0.10030667018145323, + "rewards/format_reward": 0.8958333358168602, + "step": 376 + }, + { + "advantage_max": 1.8938241600990295, + "advantage_mean": 1.2728075482471013e-08, + "advantage_min": -0.8486315608024597, + "advantage_std": 0.9998703300952911, + "completion_length": 1734.0209121704102, + "epoch": 0.4308571428571429, + "grad_norm": 0.3574160933494568, + "kl": 0.041107177734375, + "lambda_div_used": 0.5, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0016, + "reward": 0.17134802043437958, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17134802043437958, + "reward_after_std": 0.9034502916038036, + "reward_before_mean": 0.8413319541141391, + "reward_before_std": 0.8715251758694649, + "reward_change_max": 0.0, + "reward_change_mean": -0.6699839308857918, + "reward_change_min": -1.2517807893455029, + "reward_change_std": 0.4853241294622421, + "reward_std": 0.9034503139555454, + "rewards/cosine_scaled_reward": -0.006417365744709969, + "rewards/format_reward": 0.854166679084301, + "step": 377 + }, + { + "advantage_max": 1.9512610882520676, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.673358790576458, + "advantage_std": 0.999885655939579, + "completion_length": 1136.0000228881836, + "epoch": 0.432, + "grad_norm": 0.3083365559577942, + "kl": 0.016523361206054688, + "lambda_div_used": 0.5, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.0007, + "reward": 0.431393014267087, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.431393014267087, + "reward_after_std": 1.0133287981152534, + "reward_before_mean": 1.2786421403288841, + "reward_before_std": 0.8173676989972591, + "reward_change_max": 0.0, + "reward_change_mean": -0.8472491428256035, + "reward_change_min": -1.4520145133137703, + "reward_change_std": 0.5355722364038229, + "reward_std": 1.0133288130164146, + "rewards/cosine_scaled_reward": 0.1705710692331195, + "rewards/format_reward": 0.9375, + "step": 378 + }, + { + "advantage_max": 1.9463636577129364, + "advantage_mean": 3.725290853573426e-09, + "advantage_min": -0.7703254446387291, + "advantage_std": 0.9998480081558228, + "completion_length": 1578.1875457763672, + "epoch": 0.43314285714285716, + "grad_norm": 0.33663100004196167, + "kl": 0.03569793701171875, + "lambda_div_used": 0.5, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0014, + "reward": 0.18515793047845364, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18515793047845364, + "reward_after_std": 0.7468986734747887, + "reward_before_mean": 0.9092491827905178, + "reward_before_std": 0.5696240924298763, + "reward_change_max": 0.0, + "reward_change_mean": -0.724091213196516, + "reward_change_min": -1.1145296394824982, + "reward_change_std": 0.4156108219176531, + "reward_std": 0.7468986958265305, + "rewards/cosine_scaled_reward": -0.014125420711934566, + "rewards/format_reward": 0.9375000074505806, + "step": 379 + }, + { + "advantage_max": 1.9473781883716583, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.7745387181639671, + "advantage_std": 0.9998561814427376, + "completion_length": 1598.6875228881836, + "epoch": 0.4342857142857143, + "grad_norm": 0.3826024532318115, + "kl": 0.038959503173828125, + "lambda_div_used": 0.5, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0016, + "reward": 0.168661929666996, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.168661929666996, + "reward_after_std": 0.8322513662278652, + "reward_before_mean": 0.8563592098653316, + "reward_before_std": 0.6955513991415501, + "reward_change_max": 0.0005630478262901306, + "reward_change_mean": -0.6876972541213036, + "reward_change_min": -1.0310806632041931, + "reward_change_std": 0.4104973468929529, + "reward_std": 0.8322513960301876, + "rewards/cosine_scaled_reward": 0.0010962523519992828, + "rewards/format_reward": 0.8541666772216558, + "step": 380 + }, + { + "advantage_max": 1.8988536298274994, + "advantage_mean": -1.7384688355548406e-08, + "advantage_min": -0.9226270765066147, + "advantage_std": 0.9998243972659111, + "completion_length": 1515.3542175292969, + "epoch": 0.43542857142857144, + "grad_norm": 0.4202130138874054, + "kl": 0.038272857666015625, + "lambda_div_used": 0.5, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0015, + "reward": -0.0928139602765441, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0928139602765441, + "reward_after_std": 0.6203138083219528, + "reward_before_mean": 0.4476607348769903, + "reward_before_std": 0.5513483323156834, + "reward_change_max": 0.0, + "reward_change_mean": -0.5404747053980827, + "reward_change_min": -0.9399674460291862, + "reward_change_std": 0.35836709290742874, + "reward_std": 0.6203138120472431, + "rewards/cosine_scaled_reward": -0.19283631443977356, + "rewards/format_reward": 0.8333333507180214, + "step": 381 + }, + { + "advantage_max": 1.9759876430034637, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.6738567687571049, + "advantage_std": 0.999846376478672, + "completion_length": 897.5833549499512, + "epoch": 0.43657142857142855, + "grad_norm": 0.30360478162765503, + "kl": 0.010141372680664062, + "lambda_div_used": 0.5, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0004, + "reward": 0.05183810880407691, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.05183810880407691, + "reward_after_std": 0.7484404519200325, + "reward_before_mean": 0.6674462854862213, + "reward_before_std": 0.5618630647659302, + "reward_change_max": 0.0, + "reward_change_mean": -0.6156081818044186, + "reward_change_min": -0.9522461071610451, + "reward_change_std": 0.34806813672184944, + "reward_std": 0.7484404593706131, + "rewards/cosine_scaled_reward": -0.16627686785068363, + "rewards/format_reward": 1.0, + "step": 382 + }, + { + "advantage_max": 1.9031931459903717, + "advantage_mean": -1.3038516488705909e-08, + "advantage_min": -0.7755768671631813, + "advantage_std": 0.999890647828579, + "completion_length": 1310.645881652832, + "epoch": 0.4377142857142857, + "grad_norm": 0.4743720591068268, + "kl": 0.034099578857421875, + "lambda_div_used": 0.5, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0014, + "reward": 0.3746328540146351, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3746328540146351, + "reward_after_std": 1.0508150830864906, + "reward_before_mean": 1.163354642689228, + "reward_before_std": 0.9778174087405205, + "reward_change_max": 0.0, + "reward_change_mean": -0.7887218073010445, + "reward_change_min": -1.4353682398796082, + "reward_change_std": 0.5492018274962902, + "reward_std": 1.0508150979876518, + "rewards/cosine_scaled_reward": 0.1650106585584581, + "rewards/format_reward": 0.8333333358168602, + "step": 383 + }, + { + "advantage_max": 1.9253188371658325, + "advantage_mean": -3.414849525373853e-08, + "advantage_min": -0.8059746026992798, + "advantage_std": 0.9998823553323746, + "completion_length": 1079.7708740234375, + "epoch": 0.43885714285714283, + "grad_norm": 0.352461576461792, + "kl": 0.0101470947265625, + "lambda_div_used": 0.5, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0004, + "reward": 0.5411808973294683, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.5411808973294683, + "reward_after_std": 0.9512509405612946, + "reward_before_mean": 1.5009002909064293, + "reward_before_std": 0.7456874251365662, + "reward_change_max": 0.008606597781181335, + "reward_change_mean": -0.9597193524241447, + "reward_change_min": -1.4905616790056229, + "reward_change_std": 0.5804052986204624, + "reward_std": 0.9512509629130363, + "rewards/cosine_scaled_reward": 0.2712834384292364, + "rewards/format_reward": 0.9583333432674408, + "step": 384 + }, + { + "advantage_max": 1.9382314532995224, + "advantage_mean": 1.2417634531747268e-08, + "advantage_min": -0.7810436561703682, + "advantage_std": 0.9998589232563972, + "completion_length": 1364.6666946411133, + "epoch": 0.44, + "grad_norm": 0.3704834282398224, + "kl": 0.019073486328125, + "lambda_div_used": 0.5, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0008, + "reward": 0.13947630883194506, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.13947630883194506, + "reward_after_std": 0.8770334720611572, + "reward_before_mean": 0.7803040593862534, + "reward_before_std": 0.7611396610736847, + "reward_change_max": 0.0, + "reward_change_mean": -0.6408277489244938, + "reward_change_min": -1.1368419975042343, + "reward_change_std": 0.4232936166226864, + "reward_std": 0.8770334757864475, + "rewards/cosine_scaled_reward": -0.07859798357822001, + "rewards/format_reward": 0.9375000074505806, + "step": 385 + }, + { + "advantage_max": 1.9047647416591644, + "advantage_mean": 4.346172088887101e-09, + "advantage_min": -0.8413906320929527, + "advantage_std": 0.9998516365885735, + "completion_length": 1433.645866394043, + "epoch": 0.44114285714285717, + "grad_norm": 0.41787880659103394, + "kl": 0.04235076904296875, + "lambda_div_used": 0.5, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0017, + "reward": 0.30202385812299326, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.30202385812299326, + "reward_after_std": 0.7839195318520069, + "reward_before_mean": 1.120539478957653, + "reward_before_std": 0.6518189385533333, + "reward_change_max": 0.0, + "reward_change_mean": -0.8185156136751175, + "reward_change_min": -1.297236330807209, + "reward_change_std": 0.512270912528038, + "reward_std": 0.7839195430278778, + "rewards/cosine_scaled_reward": 0.08110305480659008, + "rewards/format_reward": 0.9583333432674408, + "step": 386 + }, + { + "advantage_max": 1.9116900265216827, + "advantage_mean": 8.07146305348283e-09, + "advantage_min": -0.8842339888215065, + "advantage_std": 0.9998463243246078, + "completion_length": 1939.6041946411133, + "epoch": 0.4422857142857143, + "grad_norm": 0.5493029952049255, + "kl": 0.0702667236328125, + "lambda_div_used": 0.5, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0028, + "reward": 0.05763331870548427, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05763331870548427, + "reward_after_std": 0.7158108092844486, + "reward_before_mean": 0.6914464961737394, + "reward_before_std": 0.6264186557382345, + "reward_change_max": 0.000324346125125885, + "reward_change_mean": -0.6338131837546825, + "reward_change_min": -1.006659995764494, + "reward_change_std": 0.4054126776754856, + "reward_std": 0.7158108279109001, + "rewards/cosine_scaled_reward": 0.0019732341170310974, + "rewards/format_reward": 0.6875000167638063, + "step": 387 + }, + { + "advantage_max": 1.8982749581336975, + "advantage_mean": -6.208817349140361e-10, + "advantage_min": -0.867442212998867, + "advantage_std": 0.9998432621359825, + "completion_length": 1462.5833740234375, + "epoch": 0.44342857142857145, + "grad_norm": 0.3895389437675476, + "kl": 0.03160667419433594, + "lambda_div_used": 0.5, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0013, + "reward": 0.0997003959491849, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0997003959491849, + "reward_after_std": 0.7914495207369328, + "reward_before_mean": 0.7393311122432351, + "reward_before_std": 0.7161987256258726, + "reward_change_max": 0.002636954188346863, + "reward_change_mean": -0.6396307125687599, + "reward_change_min": -1.1351251155138016, + "reward_change_std": 0.4340355843305588, + "reward_std": 0.7914495468139648, + "rewards/cosine_scaled_reward": -0.0574177885428071, + "rewards/format_reward": 0.8541666753590107, + "step": 388 + }, + { + "advantage_max": 1.960044041275978, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -0.6981809362769127, + "advantage_std": 0.9998580366373062, + "completion_length": 1519.270881652832, + "epoch": 0.44457142857142856, + "grad_norm": 0.3216893970966339, + "kl": 0.034030914306640625, + "lambda_div_used": 0.5, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0014, + "reward": 0.06332766944251489, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06332766944251489, + "reward_after_std": 0.875417023897171, + "reward_before_mean": 0.6491998098790646, + "reward_before_std": 0.7423716522753239, + "reward_change_max": 0.0, + "reward_change_mean": -0.5858721435070038, + "reward_change_min": -1.0857658833265305, + "reward_change_std": 0.3880383335053921, + "reward_std": 0.8754170686006546, + "rewards/cosine_scaled_reward": -0.1337334355339408, + "rewards/format_reward": 0.9166666716337204, + "step": 389 + }, + { + "advantage_max": 1.9449383169412613, + "advantage_mean": 2.220446049250313e-16, + "advantage_min": -0.7815985605120659, + "advantage_std": 0.9998255670070648, + "completion_length": 1706.4792098999023, + "epoch": 0.44571428571428573, + "grad_norm": 0.25515004992485046, + "kl": 0.037689208984375, + "lambda_div_used": 0.5, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0015, + "reward": 0.08829102944582701, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08829102944582701, + "reward_after_std": 0.7169809453189373, + "reward_before_mean": 0.7542915940284729, + "reward_before_std": 0.5898414496332407, + "reward_change_max": 0.0, + "reward_change_mean": -0.6660005636513233, + "reward_change_min": -1.0728442445397377, + "reward_change_std": 0.40890334732830524, + "reward_std": 0.7169809490442276, + "rewards/cosine_scaled_reward": -0.029104202054440975, + "rewards/format_reward": 0.8125, + "step": 390 + }, + { + "advantage_max": 1.9109623730182648, + "advantage_mean": -6.2088170160734535e-09, + "advantage_min": -0.8808049410581589, + "advantage_std": 0.9998817071318626, + "completion_length": 1412.6250267028809, + "epoch": 0.44685714285714284, + "grad_norm": 0.6990765929222107, + "kl": 0.051448822021484375, + "lambda_div_used": 0.5, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0021, + "reward": 0.1668861098587513, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1668861098587513, + "reward_after_std": 0.9577151201665401, + "reward_before_mean": 0.8118191917892545, + "reward_before_std": 0.8829915300011635, + "reward_change_max": 0.0, + "reward_change_mean": -0.644933145493269, + "reward_change_min": -1.16757021099329, + "reward_change_std": 0.455503998324275, + "reward_std": 0.9577151350677013, + "rewards/cosine_scaled_reward": -0.010757071897387505, + "rewards/format_reward": 0.8333333432674408, + "step": 391 + }, + { + "advantage_max": 1.8998601883649826, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.8490258902311325, + "advantage_std": 0.9998376965522766, + "completion_length": 1412.666732788086, + "epoch": 0.448, + "grad_norm": 0.41337519884109497, + "kl": 0.0423736572265625, + "lambda_div_used": 0.5, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0017, + "reward": 0.133538922178559, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.133538922178559, + "reward_after_std": 0.7546857632696629, + "reward_before_mean": 0.8264464624226093, + "reward_before_std": 0.6899518445134163, + "reward_change_max": 0.00036785751581192017, + "reward_change_mean": -0.6929075047373772, + "reward_change_min": -1.1742151752114296, + "reward_change_std": 0.46688779070973396, + "reward_std": 0.754685778170824, + "rewards/cosine_scaled_reward": -0.05552679859101772, + "rewards/format_reward": 0.9375000074505806, + "step": 392 + }, + { + "advantage_max": 1.9469918012619019, + "advantage_mean": -9.934107536579972e-09, + "advantage_min": -0.7732015550136566, + "advantage_std": 0.9998616725206375, + "completion_length": 1526.2500305175781, + "epoch": 0.4491428571428571, + "grad_norm": 0.5304766893386841, + "kl": 0.03658294677734375, + "lambda_div_used": 0.5, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0015, + "reward": 0.11220249300822616, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11220249300822616, + "reward_after_std": 0.8990034684538841, + "reward_before_mean": 0.724617250263691, + "reward_before_std": 0.7731786463409662, + "reward_change_max": 0.0, + "reward_change_mean": -0.6124147698283195, + "reward_change_min": -0.9759417399764061, + "reward_change_std": 0.3854426145553589, + "reward_std": 0.8990034759044647, + "rewards/cosine_scaled_reward": -0.0543580437079072, + "rewards/format_reward": 0.8333333395421505, + "step": 393 + }, + { + "advantage_max": 1.9288842529058456, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.7776024453341961, + "advantage_std": 0.9998523741960526, + "completion_length": 1408.5625076293945, + "epoch": 0.4502857142857143, + "grad_norm": 0.5574933290481567, + "kl": 0.03571319580078125, + "lambda_div_used": 0.5, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0014, + "reward": 0.016732539370423183, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.016732539370423183, + "reward_after_std": 0.8311260640621185, + "reward_before_mean": 0.5725190471857786, + "reward_before_std": 0.745930090546608, + "reward_change_max": 0.0, + "reward_change_mean": -0.555786494165659, + "reward_change_min": -0.9936203956604004, + "reward_change_std": 0.37467138282954693, + "reward_std": 0.8311261013150215, + "rewards/cosine_scaled_reward": -0.11999049689620733, + "rewards/format_reward": 0.8125000149011612, + "step": 394 + }, + { + "advantage_max": 1.8790218234062195, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.9174651429057121, + "advantage_std": 0.9998451918363571, + "completion_length": 1520.1041984558105, + "epoch": 0.4514285714285714, + "grad_norm": 0.4611024558544159, + "kl": 0.04803466796875, + "lambda_div_used": 0.5, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0019, + "reward": 0.2775235758163035, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2775235758163035, + "reward_after_std": 0.8445008955895901, + "reward_before_mean": 1.0525891445577145, + "reward_before_std": 0.7422155807726085, + "reward_change_max": 0.0, + "reward_change_mean": -0.7750656194984913, + "reward_change_min": -1.3609160706400871, + "reward_change_std": 0.5207080245018005, + "reward_std": 0.8445009030401707, + "rewards/cosine_scaled_reward": 0.14087790716439486, + "rewards/format_reward": 0.770833333954215, + "step": 395 + }, + { + "advantage_max": 1.9562052339315414, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.7776111736893654, + "advantage_std": 0.9998528361320496, + "completion_length": 1210.6041870117188, + "epoch": 0.45257142857142857, + "grad_norm": 0.3211035430431366, + "kl": 0.022783279418945312, + "lambda_div_used": 0.5, + "learning_rate": 2.134908592756607e-07, + "loss": 0.0009, + "reward": 0.15681475645396858, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15681475645396858, + "reward_after_std": 0.8517627380788326, + "reward_before_mean": 0.8255343623459339, + "reward_before_std": 0.7159137614071369, + "reward_change_max": 0.0029743388295173645, + "reward_change_mean": -0.6687196530401707, + "reward_change_min": -1.1430072411894798, + "reward_change_std": 0.43173813447356224, + "reward_std": 0.8517627380788326, + "rewards/cosine_scaled_reward": -0.03514947555959225, + "rewards/format_reward": 0.8958333432674408, + "step": 396 + }, + { + "advantage_max": 1.902836725115776, + "advantage_mean": 7.450580596923828e-09, + "advantage_min": -0.7827468067407608, + "advantage_std": 0.9998227432370186, + "completion_length": 1434.4375305175781, + "epoch": 0.45371428571428574, + "grad_norm": 0.43412744998931885, + "kl": 0.033294677734375, + "lambda_div_used": 0.5, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0013, + "reward": -0.0025264378637075424, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0025264378637075424, + "reward_after_std": 0.7433805800974369, + "reward_before_mean": 0.583407724276185, + "reward_before_std": 0.7002998664975166, + "reward_change_max": 0.0011475682258605957, + "reward_change_mean": -0.5859341472387314, + "reward_change_min": -1.1278854310512543, + "reward_change_std": 0.4434995539486408, + "reward_std": 0.743380606174469, + "rewards/cosine_scaled_reward": -0.1145461443811655, + "rewards/format_reward": 0.8125000037252903, + "step": 397 + }, + { + "advantage_max": 1.9327199161052704, + "advantage_mean": 6.208817682207268e-09, + "advantage_min": -0.7297875881195068, + "advantage_std": 0.9998441785573959, + "completion_length": 1530.7709121704102, + "epoch": 0.45485714285714285, + "grad_norm": 0.6134384274482727, + "kl": 0.046314239501953125, + "lambda_div_used": 0.5, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0019, + "reward": -0.03442497365176678, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03442497365176678, + "reward_after_std": 0.7637092061340809, + "reward_before_mean": 0.5058358758687973, + "reward_before_std": 0.6379642691463232, + "reward_change_max": 0.0020051226019859314, + "reward_change_mean": -0.5402608290314674, + "reward_change_min": -0.9148640409111977, + "reward_change_std": 0.3525990601629019, + "reward_std": 0.7637092582881451, + "rewards/cosine_scaled_reward": -0.14291540812700987, + "rewards/format_reward": 0.7916666716337204, + "step": 398 + }, + { + "advantage_max": 1.9521061778068542, + "advantage_mean": -2.7318796558262193e-08, + "advantage_min": -0.7446748539805412, + "advantage_std": 0.9998859688639641, + "completion_length": 1348.770866394043, + "epoch": 0.456, + "grad_norm": 0.33135783672332764, + "kl": 0.03333282470703125, + "lambda_div_used": 0.5, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0013, + "reward": 0.31152006052434444, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.31152006052434444, + "reward_after_std": 1.0044073984026909, + "reward_before_mean": 1.0513861402869225, + "reward_before_std": 0.8144489899277687, + "reward_change_max": 0.00045236945152282715, + "reward_change_mean": -0.739866092801094, + "reward_change_min": -1.2442336976528168, + "reward_change_std": 0.4599989354610443, + "reward_std": 1.0044074207544327, + "rewards/cosine_scaled_reward": 0.046526393853127956, + "rewards/format_reward": 0.9583333432674408, + "step": 399 + }, + { + "advantage_max": 1.9472700208425522, + "advantage_mean": -2.188608116959756e-08, + "advantage_min": -0.8032184466719627, + "advantage_std": 0.9998786151409149, + "completion_length": 1004.2083587646484, + "epoch": 0.45714285714285713, + "grad_norm": 0.34391146898269653, + "kl": 0.019023895263671875, + "lambda_div_used": 0.5, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0008, + "reward": 0.47748311748728156, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.47748311748728156, + "reward_after_std": 0.9301084578037262, + "reward_before_mean": 1.3930362164974213, + "reward_before_std": 0.7088299039751291, + "reward_change_max": 0.0, + "reward_change_mean": -0.91555305570364, + "reward_change_min": -1.4300957471132278, + "reward_change_std": 0.5428194254636765, + "reward_std": 0.9301084876060486, + "rewards/cosine_scaled_reward": 0.20693476125597954, + "rewards/format_reward": 0.9791666716337204, + "step": 400 + }, + { + "advantage_max": 1.918205127120018, + "advantage_mean": 6.829699361610153e-09, + "advantage_min": -0.8171031698584557, + "advantage_std": 0.9998555332422256, + "completion_length": 1798.8750305175781, + "epoch": 0.4582857142857143, + "grad_norm": 0.4476446211338043, + "kl": 0.07358551025390625, + "lambda_div_used": 0.5, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0029, + "reward": 0.11297351177199744, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11297351177199744, + "reward_after_std": 0.825851283967495, + "reward_before_mean": 0.752119667828083, + "reward_before_std": 0.7276361435651779, + "reward_change_max": 0.0, + "reward_change_mean": -0.6391461752355099, + "reward_change_min": -1.0911386832594872, + "reward_change_std": 0.4222437683492899, + "reward_std": 0.825851283967495, + "rewards/cosine_scaled_reward": 0.0010598432272672653, + "rewards/format_reward": 0.7500000111758709, + "step": 401 + }, + { + "advantage_max": 1.926381230354309, + "advantage_mean": -1.373700808660061e-08, + "advantage_min": -0.8074382990598679, + "advantage_std": 0.9998680353164673, + "completion_length": 1267.895881652832, + "epoch": 0.4594285714285714, + "grad_norm": 0.4774840474128723, + "kl": 0.050079345703125, + "lambda_div_used": 0.5, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.002, + "reward": 0.1499855355359614, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1499855355359614, + "reward_after_std": 0.853604331612587, + "reward_before_mean": 0.8121988624334335, + "reward_before_std": 0.7147294506430626, + "reward_change_max": 0.0, + "reward_change_mean": -0.6622133255004883, + "reward_change_min": -1.0627883076667786, + "reward_change_std": 0.42420555651187897, + "reward_std": 0.8536043539643288, + "rewards/cosine_scaled_reward": -0.04181723203510046, + "rewards/format_reward": 0.8958333507180214, + "step": 402 + }, + { + "advantage_max": 1.9298964142799377, + "advantage_mean": -1.9247333560290514e-08, + "advantage_min": -0.855850912630558, + "advantage_std": 0.9998518601059914, + "completion_length": 1122.0000305175781, + "epoch": 0.4605714285714286, + "grad_norm": 0.37893974781036377, + "kl": 0.034740447998046875, + "lambda_div_used": 0.5, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0014, + "reward": 0.27812413964420557, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.27812413964420557, + "reward_after_std": 0.7593046091496944, + "reward_before_mean": 1.073563028126955, + "reward_before_std": 0.5789857367053628, + "reward_change_max": 0.0, + "reward_change_mean": -0.79543886333704, + "reward_change_min": -1.182148739695549, + "reward_change_std": 0.46591442450881004, + "reward_std": 0.7593046091496944, + "rewards/cosine_scaled_reward": 0.057614823803305626, + "rewards/format_reward": 0.9583333358168602, + "step": 403 + }, + { + "advantage_max": 1.9724492132663727, + "advantage_mean": -2.6387474177935744e-08, + "advantage_min": -0.7148680537939072, + "advantage_std": 0.9998226389288902, + "completion_length": 1216.8750457763672, + "epoch": 0.4617142857142857, + "grad_norm": 0.6647066473960876, + "kl": 0.03928375244140625, + "lambda_div_used": 0.5, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0016, + "reward": 0.22924650724348794, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22924650724348794, + "reward_after_std": 0.6133319586515427, + "reward_before_mean": 1.0240289568901062, + "reward_before_std": 0.336720185354352, + "reward_change_max": 0.0, + "reward_change_mean": -0.7947824373841286, + "reward_change_min": -1.0856451392173767, + "reward_change_std": 0.4123692698776722, + "reward_std": 0.6133319735527039, + "rewards/cosine_scaled_reward": 0.022431131452322006, + "rewards/format_reward": 0.9791666716337204, + "step": 404 + }, + { + "advantage_max": 1.9340013265609741, + "advantage_mean": -2.2584572989536866e-08, + "advantage_min": -0.8358136937022209, + "advantage_std": 0.9998725801706314, + "completion_length": 1360.708396911621, + "epoch": 0.46285714285714286, + "grad_norm": 0.39807194471359253, + "kl": 0.04998016357421875, + "lambda_div_used": 0.5, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.002, + "reward": 0.4218410551548004, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4218410551548004, + "reward_after_std": 0.8566682934761047, + "reward_before_mean": 1.3111839480698109, + "reward_before_std": 0.6185014648362994, + "reward_change_max": 0.0007586926221847534, + "reward_change_mean": -0.8893428482115269, + "reward_change_min": -1.2902274504303932, + "reward_change_std": 0.5102062933146954, + "reward_std": 0.8566683232784271, + "rewards/cosine_scaled_reward": 0.18684194469824433, + "rewards/format_reward": 0.9375000074505806, + "step": 405 + }, + { + "advantage_max": 1.9196098744869232, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.8075883463025093, + "advantage_std": 0.9998895972967148, + "completion_length": 1349.770881652832, + "epoch": 0.464, + "grad_norm": 0.4375143349170685, + "kl": 0.03128814697265625, + "lambda_div_used": 0.5, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0013, + "reward": 0.3450129013508558, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3450129013508558, + "reward_after_std": 1.0195146724581718, + "reward_before_mean": 1.1223999299108982, + "reward_before_std": 0.8690014518797398, + "reward_change_max": 0.0, + "reward_change_mean": -0.777387011796236, + "reward_change_min": -1.3255415260791779, + "reward_change_std": 0.5084087513387203, + "reward_std": 1.0195147022604942, + "rewards/cosine_scaled_reward": 0.10286661703139544, + "rewards/format_reward": 0.916666679084301, + "step": 406 + }, + { + "advantage_max": 1.9780168533325195, + "advantage_mean": -3.539025983378963e-08, + "advantage_min": -0.7044540494680405, + "advantage_std": 0.9998190328478813, + "completion_length": 1630.479206085205, + "epoch": 0.46514285714285714, + "grad_norm": 0.36417001485824585, + "kl": 0.0523529052734375, + "lambda_div_used": 0.5, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0021, + "reward": 0.1611375161446631, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1611375161446631, + "reward_after_std": 0.6852178312838078, + "reward_before_mean": 0.8822555118240416, + "reward_before_std": 0.444427031558007, + "reward_change_max": 0.0, + "reward_change_mean": -0.7211180254817009, + "reward_change_min": -1.059138998389244, + "reward_change_std": 0.4020147733390331, + "reward_std": 0.6852178387343884, + "rewards/cosine_scaled_reward": 0.024461084976792336, + "rewards/format_reward": 0.8333333432674408, + "step": 407 + }, + { + "advantage_max": 1.9174300134181976, + "advantage_mean": 5.8983765094389184e-09, + "advantage_min": -0.857127234339714, + "advantage_std": 0.9998722821474075, + "completion_length": 1546.0000610351562, + "epoch": 0.4662857142857143, + "grad_norm": 0.4313151240348816, + "kl": 0.03270912170410156, + "lambda_div_used": 0.5, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0013, + "reward": 0.07606437988579273, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07606437988579273, + "reward_after_std": 0.8793220743536949, + "reward_before_mean": 0.6768680065870285, + "reward_before_std": 0.7902133017778397, + "reward_change_max": 0.0029953643679618835, + "reward_change_mean": -0.6008036248385906, + "reward_change_min": -1.1326103135943413, + "reward_change_std": 0.43921875581145287, + "reward_std": 0.8793221041560173, + "rewards/cosine_scaled_reward": -0.06781600136309862, + "rewards/format_reward": 0.8125000223517418, + "step": 408 + }, + { + "advantage_max": 1.9145079553127289, + "advantage_mean": 1.2417631367611648e-09, + "advantage_min": -0.7866168022155762, + "advantage_std": 0.9998549968004227, + "completion_length": 1843.708381652832, + "epoch": 0.4674285714285714, + "grad_norm": 0.3521879017353058, + "kl": 0.05733489990234375, + "lambda_div_used": 0.5, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0023, + "reward": 0.09330637939274311, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09330637939274311, + "reward_after_std": 0.9056757166981697, + "reward_before_mean": 0.6961738504469395, + "reward_before_std": 0.8515808396041393, + "reward_change_max": 0.0, + "reward_change_mean": -0.6028674840927124, + "reward_change_min": -1.1532711759209633, + "reward_change_std": 0.4412507191300392, + "reward_std": 0.9056757390499115, + "rewards/cosine_scaled_reward": -0.058163101435638964, + "rewards/format_reward": 0.8125000055879354, + "step": 409 + }, + { + "advantage_max": 1.9500681310892105, + "advantage_mean": -3.104408285992122e-09, + "advantage_min": -0.6658405214548111, + "advantage_std": 0.9998787567019463, + "completion_length": 1263.2083587646484, + "epoch": 0.4685714285714286, + "grad_norm": 0.2878897488117218, + "kl": 0.019916534423828125, + "lambda_div_used": 0.5, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0008, + "reward": 0.19085952546447515, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19085952546447515, + "reward_after_std": 0.9523617699742317, + "reward_before_mean": 0.8548973593860865, + "reward_before_std": 0.7926736399531364, + "reward_change_max": 0.0, + "reward_change_mean": -0.6640378534793854, + "reward_change_min": -1.1314915791153908, + "reward_change_std": 0.42829754017293453, + "reward_std": 0.9523618072271347, + "rewards/cosine_scaled_reward": -0.030884657404385507, + "rewards/format_reward": 0.916666679084301, + "step": 410 + }, + { + "advantage_max": 1.9501308053731918, + "advantage_mean": 2.5456150520852816e-08, + "advantage_min": -0.722774401307106, + "advantage_std": 0.9998585432767868, + "completion_length": 2099.645908355713, + "epoch": 0.4697142857142857, + "grad_norm": 0.4024723768234253, + "kl": 0.06557846069335938, + "lambda_div_used": 0.5, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0026, + "reward": 0.04722657427191734, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04722657427191734, + "reward_after_std": 0.8598898909986019, + "reward_before_mean": 0.627219133079052, + "reward_before_std": 0.742165463976562, + "reward_change_max": 0.0003335103392601013, + "reward_change_mean": -0.5799925178289413, + "reward_change_min": -0.9638400673866272, + "reward_change_std": 0.3880952801555395, + "reward_std": 0.8598899245262146, + "rewards/cosine_scaled_reward": -0.061390455812215805, + "rewards/format_reward": 0.7500000055879354, + "step": 411 + }, + { + "advantage_max": 1.9316768646240234, + "advantage_mean": 6.5192582443529545e-09, + "advantage_min": -0.7628925666213036, + "advantage_std": 0.999868169426918, + "completion_length": 1022.5000228881836, + "epoch": 0.47085714285714286, + "grad_norm": 0.42367619276046753, + "kl": 0.018037796020507812, + "lambda_div_used": 0.5, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0007, + "reward": 0.1572526041418314, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1572526041418314, + "reward_after_std": 0.8611396998167038, + "reward_before_mean": 0.8268182594329119, + "reward_before_std": 0.748228445649147, + "reward_change_max": 0.0, + "reward_change_mean": -0.6695656627416611, + "reward_change_min": -1.220998875796795, + "reward_change_std": 0.4380665756762028, + "reward_std": 0.8611396998167038, + "rewards/cosine_scaled_reward": -0.07617420144379139, + "rewards/format_reward": 0.9791666716337204, + "step": 412 + }, + { + "advantage_max": 1.9105447828769684, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.7985327839851379, + "advantage_std": 0.9998429268598557, + "completion_length": 1399.7917022705078, + "epoch": 0.472, + "grad_norm": 0.4110771119594574, + "kl": 0.03682518005371094, + "lambda_div_used": 0.5, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0015, + "reward": 0.19052385329268873, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19052385329268873, + "reward_after_std": 0.7446985505521297, + "reward_before_mean": 0.9286044277250767, + "reward_before_std": 0.6417394857853651, + "reward_change_max": 0.0, + "reward_change_mean": -0.738080620765686, + "reward_change_min": -1.2469749003648758, + "reward_change_std": 0.4651348330080509, + "reward_std": 0.74469855427742, + "rewards/cosine_scaled_reward": 0.00596888642758131, + "rewards/format_reward": 0.9166666716337204, + "step": 413 + }, + { + "advantage_max": 1.9046124964952469, + "advantage_mean": -5.587935447692871e-09, + "advantage_min": -0.8260460719466209, + "advantage_std": 0.9998347759246826, + "completion_length": 1609.083396911621, + "epoch": 0.47314285714285714, + "grad_norm": 0.36283785104751587, + "kl": 0.032196044921875, + "lambda_div_used": 0.5, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0013, + "reward": -0.037756118923425674, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.037756118923425674, + "reward_after_std": 0.7010756619274616, + "reward_before_mean": 0.5217839451506734, + "reward_before_std": 0.6131279207766056, + "reward_change_max": 0.00289192795753479, + "reward_change_mean": -0.5595400780439377, + "reward_change_min": -1.0230029672384262, + "reward_change_std": 0.3680335786193609, + "reward_std": 0.7010756768286228, + "rewards/cosine_scaled_reward": -0.1661913748830557, + "rewards/format_reward": 0.8541666753590107, + "step": 414 + }, + { + "advantage_max": 1.915537714958191, + "advantage_mean": 2.6697914157214342e-08, + "advantage_min": -0.7383338809013367, + "advantage_std": 0.9998762533068657, + "completion_length": 1897.583396911621, + "epoch": 0.4742857142857143, + "grad_norm": 0.7078869938850403, + "kl": 0.08300018310546875, + "lambda_div_used": 0.5, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0033, + "reward": -0.02524216379970312, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.02524216379970312, + "reward_after_std": 0.9051150903105736, + "reward_before_mean": 0.4787615801615175, + "reward_before_std": 0.8592314906418324, + "reward_change_max": 0.0013706609606742859, + "reward_change_mean": -0.5040037520229816, + "reward_change_min": -1.025732345879078, + "reward_change_std": 0.39429098181426525, + "reward_std": 0.9051151052117348, + "rewards/cosine_scaled_reward": -0.10436921380460262, + "rewards/format_reward": 0.6875000093132257, + "step": 415 + }, + { + "advantage_max": 1.9446207731962204, + "advantage_mean": -9.313227966600834e-10, + "advantage_min": -0.7540571428835392, + "advantage_std": 0.9998806416988373, + "completion_length": 1226.6458740234375, + "epoch": 0.4754285714285714, + "grad_norm": 0.6419525146484375, + "kl": 0.0194091796875, + "lambda_div_used": 0.5, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0008, + "reward": 0.34382338635623455, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.34382338635623455, + "reward_after_std": 0.9477375037968159, + "reward_before_mean": 1.1409937180578709, + "reward_before_std": 0.7824789434671402, + "reward_change_max": 0.0, + "reward_change_mean": -0.7971703410148621, + "reward_change_min": -1.2335463464260101, + "reward_change_std": 0.475644176825881, + "reward_std": 0.9477375410497189, + "rewards/cosine_scaled_reward": 0.07049683481454849, + "rewards/format_reward": 1.0, + "step": 416 + }, + { + "advantage_max": 1.9392386972904205, + "advantage_mean": 3.1044089521259366e-10, + "advantage_min": -0.8309417217969894, + "advantage_std": 0.999857485294342, + "completion_length": 1679.3125457763672, + "epoch": 0.4765714285714286, + "grad_norm": 0.41789889335632324, + "kl": 0.03730010986328125, + "lambda_div_used": 0.5, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0015, + "reward": 0.18889883533120155, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18889883533120155, + "reward_after_std": 0.8192192353308201, + "reward_before_mean": 0.8952180985361338, + "reward_before_std": 0.6806813403964043, + "reward_change_max": 0.0, + "reward_change_mean": -0.7063192613422871, + "reward_change_min": -1.1340601965785027, + "reward_change_std": 0.431840430945158, + "reward_std": 0.8192192912101746, + "rewards/cosine_scaled_reward": 0.010109039023518562, + "rewards/format_reward": 0.8750000149011612, + "step": 417 + }, + { + "advantage_max": 1.9405268132686615, + "advantage_mean": -3.725290520506519e-09, + "advantage_min": -0.7386938184499741, + "advantage_std": 0.9998854398727417, + "completion_length": 1269.7500305175781, + "epoch": 0.4777142857142857, + "grad_norm": 0.4162716269493103, + "kl": 0.0435333251953125, + "lambda_div_used": 0.5, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0017, + "reward": 0.416334574110806, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.416334574110806, + "reward_after_std": 0.980750635266304, + "reward_before_mean": 1.2588410302996635, + "reward_before_std": 0.79679533559829, + "reward_change_max": 0.0006934776902198792, + "reward_change_mean": -0.8425064440816641, + "reward_change_min": -1.3780925124883652, + "reward_change_std": 0.5267263427376747, + "reward_std": 0.980750672519207, + "rewards/cosine_scaled_reward": 0.18150383047759533, + "rewards/format_reward": 0.8958333395421505, + "step": 418 + }, + { + "advantage_max": 1.919666275382042, + "advantage_mean": -1.4590721297835785e-08, + "advantage_min": -0.7799930199980736, + "advantage_std": 0.9998503774404526, + "completion_length": 1480.1458740234375, + "epoch": 0.47885714285714287, + "grad_norm": 0.41415131092071533, + "kl": 0.05619621276855469, + "lambda_div_used": 0.5, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.0022, + "reward": 0.328931987285614, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.328931987285614, + "reward_after_std": 0.8104345016181469, + "reward_before_mean": 1.158056017011404, + "reward_before_std": 0.640655167400837, + "reward_change_max": 0.0, + "reward_change_mean": -0.8291240483522415, + "reward_change_min": -1.3252276219427586, + "reward_change_std": 0.4964596051722765, + "reward_std": 0.8104345090687275, + "rewards/cosine_scaled_reward": 0.15194466523826122, + "rewards/format_reward": 0.8541666716337204, + "step": 419 + }, + { + "advantage_max": 1.9785042852163315, + "advantage_mean": -1.490116141589226e-08, + "advantage_min": -0.6844378933310509, + "advantage_std": 0.9998235180974007, + "completion_length": 966.3333740234375, + "epoch": 0.48, + "grad_norm": 0.5011756420135498, + "kl": 0.02069854736328125, + "lambda_div_used": 0.5, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0008, + "reward": 0.05833008675836027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05833008675836027, + "reward_after_std": 0.612760417163372, + "reward_before_mean": 0.7137523256242275, + "reward_before_std": 0.39219198003411293, + "reward_change_max": 0.0, + "reward_change_mean": -0.65542221814394, + "reward_change_min": -0.9523419812321663, + "reward_change_std": 0.34629401564598083, + "reward_std": 0.6127604395151138, + "rewards/cosine_scaled_reward": -0.14312385022640228, + "rewards/format_reward": 1.0, + "step": 420 + }, + { + "advantage_max": 1.9388664364814758, + "advantage_mean": 8.692344399818808e-09, + "advantage_min": -0.7572106420993805, + "advantage_std": 0.9998262673616409, + "completion_length": 1458.2708854675293, + "epoch": 0.48114285714285715, + "grad_norm": 0.5800455212593079, + "kl": 0.0557098388671875, + "lambda_div_used": 0.5, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0022, + "reward": -0.0324164031771943, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.0324164031771943, + "reward_after_std": 0.7277033217251301, + "reward_before_mean": 0.5232896497473121, + "reward_before_std": 0.6215553600341082, + "reward_change_max": 0.0010679811239242554, + "reward_change_mean": -0.5557060427963734, + "reward_change_min": -0.9187875688076019, + "reward_change_std": 0.3641525115817785, + "reward_std": 0.7277033478021622, + "rewards/cosine_scaled_reward": -0.1654385207220912, + "rewards/format_reward": 0.854166679084301, + "step": 421 + }, + { + "advantage_max": 1.9458617120981216, + "advantage_mean": -7.140140034778142e-09, + "advantage_min": -0.7656347528100014, + "advantage_std": 0.9998919069766998, + "completion_length": 1688.0000457763672, + "epoch": 0.48228571428571426, + "grad_norm": 0.4291648864746094, + "kl": 0.061893463134765625, + "lambda_div_used": 0.5, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0025, + "reward": 0.19870495703071356, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19870495703071356, + "reward_after_std": 1.0406667441129684, + "reward_before_mean": 0.8401019177399576, + "reward_before_std": 0.8951727598905563, + "reward_change_max": 0.0, + "reward_change_mean": -0.6413969695568085, + "reward_change_min": -1.067455343902111, + "reward_change_std": 0.41564703918993473, + "reward_std": 1.0406667664647102, + "rewards/cosine_scaled_reward": 0.0033842832781374454, + "rewards/format_reward": 0.8333333414047956, + "step": 422 + }, + { + "advantage_max": 1.9458054602146149, + "advantage_mean": 1.2417634809303024e-08, + "advantage_min": -0.7163062021136284, + "advantage_std": 0.9998630881309509, + "completion_length": 2009.3125610351562, + "epoch": 0.48342857142857143, + "grad_norm": 0.5385973453521729, + "kl": 0.09122467041015625, + "lambda_div_used": 0.5, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0037, + "reward": -0.002161663491278887, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.002161663491278887, + "reward_after_std": 0.8518885150551796, + "reward_before_mean": 0.5340993963181973, + "reward_before_std": 0.7732574082911015, + "reward_change_max": 0.0011147409677505493, + "reward_change_mean": -0.5362610556185246, + "reward_change_min": -0.997920099645853, + "reward_change_std": 0.3805042449384928, + "reward_std": 0.8518885150551796, + "rewards/cosine_scaled_reward": -0.08711698092520237, + "rewards/format_reward": 0.708333345130086, + "step": 423 + }, + { + "advantage_max": 1.932769998908043, + "advantage_mean": 1.552204320631745e-08, + "advantage_min": -0.7818415835499763, + "advantage_std": 0.9998092278838158, + "completion_length": 1649.0000381469727, + "epoch": 0.4845714285714286, + "grad_norm": 0.43892186880111694, + "kl": 0.048885345458984375, + "lambda_div_used": 0.5, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.002, + "reward": -0.14956187270581722, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.14956187270581722, + "reward_after_std": 0.6577580161392689, + "reward_before_mean": 0.3286280228057876, + "reward_before_std": 0.5828495901077986, + "reward_change_max": 0.0017966404557228088, + "reward_change_mean": -0.47818990983068943, + "reward_change_min": -0.839508980512619, + "reward_change_std": 0.32613570243120193, + "reward_std": 0.6577580310404301, + "rewards/cosine_scaled_reward": -0.21068599075078964, + "rewards/format_reward": 0.7500000074505806, + "step": 424 + }, + { + "advantage_max": 1.900831788778305, + "advantage_mean": -3.166496842510469e-08, + "advantage_min": -0.8162828199565411, + "advantage_std": 0.9999080747365952, + "completion_length": 1217.9167175292969, + "epoch": 0.4857142857142857, + "grad_norm": 0.2568250596523285, + "kl": 0.0250244140625, + "lambda_div_used": 0.5, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.001, + "reward": 0.6311993859708309, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6311993859708309, + "reward_after_std": 1.1424714177846909, + "reward_before_mean": 1.6125700250267982, + "reward_before_std": 1.0077669620513916, + "reward_change_max": 0.0, + "reward_change_mean": -0.9813706278800964, + "reward_change_min": -1.6586797833442688, + "reward_change_std": 0.6413742937147617, + "reward_std": 1.1424714773893356, + "rewards/cosine_scaled_reward": 0.3271183331380598, + "rewards/format_reward": 0.9583333358168602, + "step": 425 + }, + { + "advantage_max": 1.9204679131507874, + "advantage_mean": 9.623666918923135e-09, + "advantage_min": -0.820533998310566, + "advantage_std": 0.999837689101696, + "completion_length": 1694.708366394043, + "epoch": 0.4868571428571429, + "grad_norm": 0.6034704446792603, + "kl": 0.09422683715820312, + "lambda_div_used": 0.5, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0038, + "reward": 0.1135131117189303, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1135131117189303, + "reward_after_std": 0.767748273909092, + "reward_before_mean": 0.7725034542381763, + "reward_before_std": 0.6063254494220018, + "reward_change_max": 0.0, + "reward_change_mean": -0.6589903496205807, + "reward_change_min": -1.0663022696971893, + "reward_change_std": 0.4144749026745558, + "reward_std": 0.7677483111619949, + "rewards/cosine_scaled_reward": -0.01999828591942787, + "rewards/format_reward": 0.812500013038516, + "step": 426 + }, + { + "advantage_max": 1.9189137816429138, + "advantage_mean": -6.829698917520943e-09, + "advantage_min": -0.8139993920922279, + "advantage_std": 0.9998674094676971, + "completion_length": 1720.7500534057617, + "epoch": 0.488, + "grad_norm": 0.48113617300987244, + "kl": 0.04186248779296875, + "lambda_div_used": 0.5, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0017, + "reward": 0.171463415666949, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.171463415666949, + "reward_after_std": 0.9078935757279396, + "reward_before_mean": 0.8412778452038765, + "reward_before_std": 0.8320760056376457, + "reward_change_max": 0.0, + "reward_change_mean": -0.6698144376277924, + "reward_change_min": -1.2227418944239616, + "reward_change_std": 0.46701946668326855, + "reward_std": 0.9078936129808426, + "rewards/cosine_scaled_reward": 0.014388916082680225, + "rewards/format_reward": 0.8125000186264515, + "step": 427 + }, + { + "advantage_max": 1.937966212630272, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.7897857651114464, + "advantage_std": 0.9998549371957779, + "completion_length": 1417.1041946411133, + "epoch": 0.48914285714285716, + "grad_norm": 0.6769260764122009, + "kl": 0.032806396484375, + "lambda_div_used": 0.5, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0013, + "reward": 0.05286524537950754, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05286524537950754, + "reward_after_std": 0.8130511678755283, + "reward_before_mean": 0.6512714847922325, + "reward_before_std": 0.6781092509627342, + "reward_change_max": 0.0005588680505752563, + "reward_change_mean": -0.5984062403440475, + "reward_change_min": -0.9986972808837891, + "reward_change_std": 0.36525629833340645, + "reward_std": 0.8130511939525604, + "rewards/cosine_scaled_reward": -0.12228094134479761, + "rewards/format_reward": 0.8958333432674408, + "step": 428 + }, + { + "advantage_max": 1.9107749164104462, + "advantage_mean": -9.313226023710541e-09, + "advantage_min": -0.7637681402266026, + "advantage_std": 0.9998604357242584, + "completion_length": 1157.8750495910645, + "epoch": 0.49028571428571427, + "grad_norm": 0.4458828866481781, + "kl": 0.0574798583984375, + "lambda_div_used": 0.5, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0023, + "reward": 0.16714679636061192, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16714679636061192, + "reward_after_std": 0.8375038094818592, + "reward_before_mean": 0.8551300168037415, + "reward_before_std": 0.7358640916645527, + "reward_change_max": 0.0, + "reward_change_mean": -0.6879832223057747, + "reward_change_min": -1.2509738504886627, + "reward_change_std": 0.4568104110658169, + "reward_std": 0.8375038281083107, + "rewards/cosine_scaled_reward": -0.06201833672821522, + "rewards/format_reward": 0.9791666716337204, + "step": 429 + }, + { + "advantage_max": 1.9214459359645844, + "advantage_mean": -3.104408619059029e-09, + "advantage_min": -0.7615657895803452, + "advantage_std": 0.9998621419072151, + "completion_length": 1377.2708892822266, + "epoch": 0.49142857142857144, + "grad_norm": 0.47784435749053955, + "kl": 0.054538726806640625, + "lambda_div_used": 0.5, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0022, + "reward": 0.21472038747742772, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21472038747742772, + "reward_after_std": 0.8057798445224762, + "reward_before_mean": 0.9536108346655965, + "reward_before_std": 0.6560990251600742, + "reward_change_max": 0.0, + "reward_change_mean": -0.7388904243707657, + "reward_change_min": -1.1853134781122208, + "reward_change_std": 0.46452105045318604, + "reward_std": 0.805779866874218, + "rewards/cosine_scaled_reward": 0.03930539125576615, + "rewards/format_reward": 0.875, + "step": 430 + }, + { + "advantage_max": 1.9473706632852554, + "advantage_mean": -5.122274382429737e-09, + "advantage_min": -0.7392523810267448, + "advantage_std": 0.9998333901166916, + "completion_length": 1522.8333740234375, + "epoch": 0.49257142857142855, + "grad_norm": 0.5233393907546997, + "kl": 0.08725738525390625, + "lambda_div_used": 0.5, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0035, + "reward": 0.0229043357539922, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0229043357539922, + "reward_after_std": 0.6961653456091881, + "reward_before_mean": 0.6314683072268963, + "reward_before_std": 0.5440207023639232, + "reward_change_max": 0.0, + "reward_change_mean": -0.6085639595985413, + "reward_change_min": -1.0415047705173492, + "reward_change_std": 0.36842281371355057, + "reward_std": 0.6961653828620911, + "rewards/cosine_scaled_reward": -0.09051587281282991, + "rewards/format_reward": 0.8125, + "step": 431 + }, + { + "advantage_max": 1.8979507982730865, + "advantage_mean": -3.570069795344466e-09, + "advantage_min": -0.7884392961859703, + "advantage_std": 0.9998320043087006, + "completion_length": 1696.8541870117188, + "epoch": 0.4937142857142857, + "grad_norm": 0.44175800681114197, + "kl": 0.06729888916015625, + "lambda_div_used": 0.5, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0027, + "reward": -0.02060939557850361, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.02060939557850361, + "reward_after_std": 0.7018601484596729, + "reward_before_mean": 0.5527903139591217, + "reward_before_std": 0.5883015915751457, + "reward_change_max": 0.00041546672582626343, + "reward_change_mean": -0.573399730026722, + "reward_change_min": -0.9393155761063099, + "reward_change_std": 0.35436554066836834, + "reward_std": 0.7018601670861244, + "rewards/cosine_scaled_reward": -0.11943817464634776, + "rewards/format_reward": 0.7916666716337204, + "step": 432 + }, + { + "advantage_max": 1.964208960533142, + "advantage_mean": -1.1796753240922442e-08, + "advantage_min": -0.6931241601705551, + "advantage_std": 0.9998587071895599, + "completion_length": 1315.8333587646484, + "epoch": 0.4948571428571429, + "grad_norm": 0.43583711981773376, + "kl": 0.021282196044921875, + "lambda_div_used": 0.5, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0009, + "reward": 0.28905248921364546, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.28905248921364546, + "reward_after_std": 0.7677325867116451, + "reward_before_mean": 1.0941534340381622, + "reward_before_std": 0.504822900518775, + "reward_change_max": 0.0002372339367866516, + "reward_change_mean": -0.8051009066402912, + "reward_change_min": -1.2095557525753975, + "reward_change_std": 0.44658362865448, + "reward_std": 0.7677325941622257, + "rewards/cosine_scaled_reward": 0.06791000673547387, + "rewards/format_reward": 0.9583333358168602, + "step": 433 + }, + { + "advantage_max": 1.90696319937706, + "advantage_mean": 3.1044085080367267e-09, + "advantage_min": -0.8565580695867538, + "advantage_std": 0.9998314157128334, + "completion_length": 1433.1042022705078, + "epoch": 0.496, + "grad_norm": 0.4644394516944885, + "kl": 0.045490264892578125, + "lambda_div_used": 0.5, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0018, + "reward": -0.05052966655057389, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05052966655057389, + "reward_after_std": 0.6372621580958366, + "reward_before_mean": 0.513536169193685, + "reward_before_std": 0.5464147813618183, + "reward_change_max": 0.0025284886360168457, + "reward_change_mean": -0.5640658438205719, + "reward_change_min": -0.9717100188136101, + "reward_change_std": 0.3585630767047405, + "reward_std": 0.6372621655464172, + "rewards/cosine_scaled_reward": -0.20156525447964668, + "rewards/format_reward": 0.916666679084301, + "step": 434 + }, + { + "advantage_max": 1.9291100949048996, + "advantage_mean": 3.725290353973065e-09, + "advantage_min": -0.8469479605555534, + "advantage_std": 0.999834306538105, + "completion_length": 1254.3542098999023, + "epoch": 0.49714285714285716, + "grad_norm": 0.5041975975036621, + "kl": 0.0749053955078125, + "lambda_div_used": 0.5, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.003, + "reward": 0.05097749922424555, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.05097749922424555, + "reward_after_std": 0.7026257328689098, + "reward_before_mean": 0.6741197109222412, + "reward_before_std": 0.5542252194136381, + "reward_change_max": 0.0, + "reward_change_mean": -0.6231421791017056, + "reward_change_min": -1.0012230202555656, + "reward_change_std": 0.37164881080389023, + "reward_std": 0.7026257365942001, + "rewards/cosine_scaled_reward": -0.14210683782584965, + "rewards/format_reward": 0.9583333432674408, + "step": 435 + }, + { + "advantage_max": 1.9043562710285187, + "advantage_mean": -1.6142925440831846e-08, + "advantage_min": -0.829190157353878, + "advantage_std": 0.9998815432190895, + "completion_length": 1361.083366394043, + "epoch": 0.4982857142857143, + "grad_norm": 0.6900902390480042, + "kl": 0.06423187255859375, + "lambda_div_used": 0.5, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.0026, + "reward": 0.2773757204413414, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2773757204413414, + "reward_after_std": 0.9254966825246811, + "reward_before_mean": 1.0332196983508766, + "reward_before_std": 0.8475972041487694, + "reward_change_max": 0.0005481541156768799, + "reward_change_mean": -0.7558439932763577, + "reward_change_min": -1.2967700064182281, + "reward_change_std": 0.5047092605382204, + "reward_std": 0.9254966899752617, + "rewards/cosine_scaled_reward": 0.07910984754562378, + "rewards/format_reward": 0.875, + "step": 436 + }, + { + "advantage_max": 1.9277342706918716, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.8391797617077827, + "advantage_std": 0.9998452290892601, + "completion_length": 1271.4167251586914, + "epoch": 0.49942857142857144, + "grad_norm": 0.4023401141166687, + "kl": 0.03212738037109375, + "lambda_div_used": 0.5, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0013, + "reward": 0.2056784473825246, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2056784473825246, + "reward_after_std": 0.7642825543880463, + "reward_before_mean": 0.947747528553009, + "reward_before_std": 0.6257410123944283, + "reward_change_max": 0.0, + "reward_change_mean": -0.7420690581202507, + "reward_change_min": -1.1740047186613083, + "reward_change_std": 0.45448190718889236, + "reward_std": 0.7642825618386269, + "rewards/cosine_scaled_reward": -0.005292933899909258, + "rewards/format_reward": 0.9583333432674408, + "step": 437 + }, + { + "advantage_max": 1.9327512085437775, + "advantage_mean": 2.7318795781106076e-08, + "advantage_min": -0.7429120875895023, + "advantage_std": 0.9998285323381424, + "completion_length": 1894.7500534057617, + "epoch": 0.5005714285714286, + "grad_norm": 0.6199660301208496, + "kl": 0.08026504516601562, + "lambda_div_used": 0.5, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0032, + "reward": -0.031040742062032223, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.031040742062032223, + "reward_after_std": 0.6640561632812023, + "reward_before_mean": 0.5457404367625713, + "reward_before_std": 0.5272583463229239, + "reward_change_max": 0.0, + "reward_change_mean": -0.5767811760306358, + "reward_change_min": -0.9341515824198723, + "reward_change_std": 0.360694108530879, + "reward_std": 0.6640561930835247, + "rewards/cosine_scaled_reward": -0.10212977975606918, + "rewards/format_reward": 0.7500000111758709, + "step": 438 + }, + { + "advantage_max": 1.919052854180336, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.8913252726197243, + "advantage_std": 0.9998451843857765, + "completion_length": 1347.8542098999023, + "epoch": 0.5017142857142857, + "grad_norm": 0.5470811128616333, + "kl": 0.0438079833984375, + "lambda_div_used": 0.5, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0018, + "reward": 0.04901101998984814, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04901101998984814, + "reward_after_std": 0.7144185900688171, + "reward_before_mean": 0.6771840838191565, + "reward_before_std": 0.6143386028707027, + "reward_change_max": 0.002023160457611084, + "reward_change_mean": -0.6281730607151985, + "reward_change_min": -1.0101951658725739, + "reward_change_std": 0.39536717906594276, + "reward_std": 0.7144186124205589, + "rewards/cosine_scaled_reward": -0.09890797361731529, + "rewards/format_reward": 0.8750000149011612, + "step": 439 + }, + { + "advantage_max": 1.961982324719429, + "advantage_mean": 1.2728075454715437e-08, + "advantage_min": -0.7568341344594955, + "advantage_std": 0.9998323395848274, + "completion_length": 1275.854206085205, + "epoch": 0.5028571428571429, + "grad_norm": 0.45573917031288147, + "kl": 0.042362213134765625, + "lambda_div_used": 0.5, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0017, + "reward": -0.05465042544528842, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.05465042544528842, + "reward_after_std": 0.6868558041751385, + "reward_before_mean": 0.4895972586236894, + "reward_before_std": 0.5479109510779381, + "reward_change_max": 0.0007667094469070435, + "reward_change_mean": -0.5442476458847523, + "reward_change_min": -0.8939982280135155, + "reward_change_std": 0.3350295424461365, + "reward_std": 0.6868558302521706, + "rewards/cosine_scaled_reward": -0.21353472862392664, + "rewards/format_reward": 0.916666679084301, + "step": 440 + }, + { + "advantage_max": 1.9711784422397614, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.6901766732335091, + "advantage_std": 0.9998734667897224, + "completion_length": 1920.6667175292969, + "epoch": 0.504, + "grad_norm": 0.773413360118866, + "kl": 0.0863800048828125, + "lambda_div_used": 0.5, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0034, + "reward": 0.05949468910694122, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.05949468910694122, + "reward_after_std": 0.9693707525730133, + "reward_before_mean": 0.6075320821255445, + "reward_before_std": 0.853855162858963, + "reward_change_max": 0.0, + "reward_change_mean": -0.5480373837053776, + "reward_change_min": -0.9951094016432762, + "reward_change_std": 0.37295518442988396, + "reward_std": 0.9693707972764969, + "rewards/cosine_scaled_reward": -0.06081731495214626, + "rewards/format_reward": 0.7291666809469461, + "step": 441 + }, + { + "advantage_max": 1.9738472253084183, + "advantage_mean": -2.483526828633842e-09, + "advantage_min": -0.6937413960695267, + "advantage_std": 0.9998617991805077, + "completion_length": 1337.0416946411133, + "epoch": 0.5051428571428571, + "grad_norm": 0.5474178791046143, + "kl": 0.04239082336425781, + "lambda_div_used": 0.5, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0017, + "reward": 0.2079712525010109, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2079712525010109, + "reward_after_std": 0.802063100039959, + "reward_before_mean": 0.9304749630391598, + "reward_before_std": 0.582705058157444, + "reward_change_max": 0.0, + "reward_change_mean": -0.7225036658346653, + "reward_change_min": -1.1098268404603004, + "reward_change_std": 0.41106531769037247, + "reward_std": 0.8020631074905396, + "rewards/cosine_scaled_reward": 0.017320780083537102, + "rewards/format_reward": 0.8958333432674408, + "step": 442 + }, + { + "advantage_max": 1.9156748950481415, + "advantage_mean": 1.1175870673341137e-08, + "advantage_min": -0.854647807776928, + "advantage_std": 0.9998498931527138, + "completion_length": 1900.0834121704102, + "epoch": 0.5062857142857143, + "grad_norm": 0.6384137272834778, + "kl": 0.07502937316894531, + "lambda_div_used": 0.5, + "learning_rate": 1.351615817851748e-07, + "loss": 0.003, + "reward": 0.0032209542114287615, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0032209542114287615, + "reward_after_std": 0.7304340079426765, + "reward_before_mean": 0.5884513519704342, + "reward_before_std": 0.6362341642379761, + "reward_change_max": 0.0014985054731369019, + "reward_change_mean": -0.5852304063737392, + "reward_change_min": -0.9630409777164459, + "reward_change_std": 0.37647127173841, + "reward_std": 0.7304340153932571, + "rewards/cosine_scaled_reward": -0.08077432494610548, + "rewards/format_reward": 0.7500000111758709, + "step": 443 + }, + { + "advantage_max": 1.8896929323673248, + "advantage_mean": 4.035731082652205e-09, + "advantage_min": -0.808488741517067, + "advantage_std": 0.9998615458607674, + "completion_length": 1932.8125190734863, + "epoch": 0.5074285714285715, + "grad_norm": 0.7674197554588318, + "kl": 0.12237548828125, + "lambda_div_used": 0.5, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0049, + "reward": -0.03767237951979041, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.03767237951979041, + "reward_after_std": 0.8582460209727287, + "reward_before_mean": 0.47329113259911537, + "reward_before_std": 0.831679854542017, + "reward_change_max": 0.001330450177192688, + "reward_change_mean": -0.5109634958207607, + "reward_change_min": -0.9912841245532036, + "reward_change_std": 0.39683387242257595, + "reward_std": 0.8582460507750511, + "rewards/cosine_scaled_reward": -0.10710444860160351, + "rewards/format_reward": 0.6875000111758709, + "step": 444 + }, + { + "advantage_max": 1.9227261245250702, + "advantage_mean": 2.7939677571531263e-08, + "advantage_min": -0.8477248698472977, + "advantage_std": 0.999822311103344, + "completion_length": 1753.2917175292969, + "epoch": 0.5085714285714286, + "grad_norm": 0.4278562366962433, + "kl": 0.08892059326171875, + "lambda_div_used": 0.5, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0036, + "reward": -0.11785220762249082, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11785220762249082, + "reward_after_std": 0.5929725170135498, + "reward_before_mean": 0.4046326084062457, + "reward_before_std": 0.47818462178111076, + "reward_change_max": 0.0008755475282669067, + "reward_change_mean": -0.5224847923964262, + "reward_change_min": -0.8103570342063904, + "reward_change_std": 0.31598146446049213, + "reward_std": 0.5929725207388401, + "rewards/cosine_scaled_reward": -0.2039337046444416, + "rewards/format_reward": 0.8125000111758709, + "step": 445 + }, + { + "advantage_max": 1.9457627087831497, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.7387218102812767, + "advantage_std": 0.9998513013124466, + "completion_length": 1702.1458740234375, + "epoch": 0.5097142857142857, + "grad_norm": 0.5779604911804199, + "kl": 0.07822990417480469, + "lambda_div_used": 0.5, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0031, + "reward": 0.15322877001017332, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15322877001017332, + "reward_after_std": 0.7949897982180119, + "reward_before_mean": 0.8317308221012354, + "reward_before_std": 0.6429251153022051, + "reward_change_max": 0.0012955516576766968, + "reward_change_mean": -0.6785020679235458, + "reward_change_min": -1.1380436643958092, + "reward_change_std": 0.4298906698822975, + "reward_std": 0.794989813119173, + "rewards/cosine_scaled_reward": -0.0008012736216187477, + "rewards/format_reward": 0.8333333395421505, + "step": 446 + }, + { + "advantage_max": 1.9582972824573517, + "advantage_mean": 1.3038516266661304e-08, + "advantage_min": -0.7675222232937813, + "advantage_std": 0.9998466297984123, + "completion_length": 1413.083396911621, + "epoch": 0.5108571428571429, + "grad_norm": 0.6830373406410217, + "kl": 0.084716796875, + "lambda_div_used": 0.5, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0034, + "reward": 0.09648872714024037, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09648872714024037, + "reward_after_std": 0.7626160159707069, + "reward_before_mean": 0.7410371452569962, + "reward_before_std": 0.5982570890337229, + "reward_change_max": 0.0017501115798950195, + "reward_change_mean": -0.6445484086871147, + "reward_change_min": -0.991340659558773, + "reward_change_std": 0.38689782470464706, + "reward_std": 0.7626160383224487, + "rewards/cosine_scaled_reward": -0.0982314352877438, + "rewards/format_reward": 0.9375000074505806, + "step": 447 + }, + { + "advantage_max": 1.9502499103546143, + "advantage_mean": 1.8626452269465688e-08, + "advantage_min": -0.7153580188751221, + "advantage_std": 0.999836340546608, + "completion_length": 1293.833366394043, + "epoch": 0.512, + "grad_norm": 0.7012367844581604, + "kl": 0.06674957275390625, + "lambda_div_used": 0.5, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0027, + "reward": 0.12058082967996597, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12058082967996597, + "reward_after_std": 0.7431465946137905, + "reward_before_mean": 0.783910283818841, + "reward_before_std": 0.5813306700438261, + "reward_change_max": 0.0, + "reward_change_mean": -0.663329441100359, + "reward_change_min": -1.070579469203949, + "reward_change_std": 0.40219525434076786, + "reward_std": 0.7431466057896614, + "rewards/cosine_scaled_reward": -0.05596153810620308, + "rewards/format_reward": 0.8958333432674408, + "step": 448 + }, + { + "advantage_max": 1.9743008613586426, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.680752731859684, + "advantage_std": 0.9998580440878868, + "completion_length": 1440.93754196167, + "epoch": 0.5131428571428571, + "grad_norm": 0.8411633372306824, + "kl": 0.1081695556640625, + "lambda_div_used": 0.5, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0043, + "reward": -0.0015947124920785427, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0015947124920785427, + "reward_after_std": 0.798382893204689, + "reward_before_mean": 0.5481637455523014, + "reward_before_std": 0.6414137668907642, + "reward_change_max": 0.003213651478290558, + "reward_change_mean": -0.54975844360888, + "reward_change_min": -0.9199419319629669, + "reward_change_std": 0.3311528917402029, + "reward_std": 0.7983829081058502, + "rewards/cosine_scaled_reward": -0.16341814678162336, + "rewards/format_reward": 0.8750000055879354, + "step": 449 + }, + { + "advantage_max": 1.9542711079120636, + "advantage_mean": 3.414849514271623e-09, + "advantage_min": -0.8025156818330288, + "advantage_std": 0.9998338893055916, + "completion_length": 1186.5625305175781, + "epoch": 0.5142857142857142, + "grad_norm": 0.6107338666915894, + "kl": 0.037349700927734375, + "lambda_div_used": 0.5, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.0015, + "reward": 0.14779857732355595, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.14779857732355595, + "reward_after_std": 0.6910727322101593, + "reward_before_mean": 0.8608616031706333, + "reward_before_std": 0.4924622122198343, + "reward_change_max": 0.0, + "reward_change_mean": -0.7130630314350128, + "reward_change_min": -1.0870602205395699, + "reward_change_std": 0.4094325453042984, + "reward_std": 0.6910727508366108, + "rewards/cosine_scaled_reward": -0.038319210056215525, + "rewards/format_reward": 0.9375000149011612, + "step": 450 + }, + { + "advantage_max": 1.9720109701156616, + "advantage_mean": -2.173086144363623e-08, + "advantage_min": -0.7459082752466202, + "advantage_std": 0.999847374856472, + "completion_length": 1231.645881652832, + "epoch": 0.5154285714285715, + "grad_norm": 0.5005615949630737, + "kl": 0.066131591796875, + "lambda_div_used": 0.5, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0026, + "reward": 0.06357445800676942, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06357445800676942, + "reward_after_std": 0.7245562225580215, + "reward_before_mean": 0.6849512457847595, + "reward_before_std": 0.5289575774222612, + "reward_change_max": 0.004507869482040405, + "reward_change_mean": -0.6213767826557159, + "reward_change_min": -0.9343273863196373, + "reward_change_std": 0.36326562985777855, + "reward_std": 0.7245562374591827, + "rewards/cosine_scaled_reward": -0.11585774272680283, + "rewards/format_reward": 0.9166666865348816, + "step": 451 + }, + { + "advantage_max": 1.910423904657364, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7892148867249489, + "advantage_std": 0.9998941868543625, + "completion_length": 1935.5208740234375, + "epoch": 0.5165714285714286, + "grad_norm": 0.815445065498352, + "kl": 0.12743377685546875, + "lambda_div_used": 0.5, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0051, + "reward": 0.1091558001935482, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1091558001935482, + "reward_after_std": 1.0659672245383263, + "reward_before_mean": 0.6719845505431294, + "reward_before_std": 1.0267342068254948, + "reward_change_max": 0.00015228241682052612, + "reward_change_mean": -0.5628287335857749, + "reward_change_min": -1.069099210202694, + "reward_change_std": 0.43420621007680893, + "reward_std": 1.0659672319889069, + "rewards/cosine_scaled_reward": 0.013075600378215313, + "rewards/format_reward": 0.6458333525806665, + "step": 452 + }, + { + "advantage_max": 1.9004657417535782, + "advantage_mean": 7.450581041013038e-09, + "advantage_min": -0.8262704908847809, + "advantage_std": 0.9998724386096001, + "completion_length": 1443.791706085205, + "epoch": 0.5177142857142857, + "grad_norm": 0.9717079997062683, + "kl": 0.076995849609375, + "lambda_div_used": 0.5, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0031, + "reward": 0.15508080273866653, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.15508080273866653, + "reward_after_std": 0.8549620658159256, + "reward_before_mean": 0.8279006769880652, + "reward_before_std": 0.7649585343897343, + "reward_change_max": 0.0015206411480903625, + "reward_change_mean": -0.6728198602795601, + "reward_change_min": -1.1892420575022697, + "reward_change_std": 0.442673247307539, + "reward_std": 0.8549620807170868, + "rewards/cosine_scaled_reward": -0.002716338261961937, + "rewards/format_reward": 0.8333333432674408, + "step": 453 + }, + { + "advantage_max": 1.9408542066812515, + "advantage_mean": 1.4280279847511679e-08, + "advantage_min": -0.7722970768809319, + "advantage_std": 0.9998204112052917, + "completion_length": 1339.0000534057617, + "epoch": 0.5188571428571429, + "grad_norm": 0.5383586883544922, + "kl": 0.05023193359375, + "lambda_div_used": 0.5, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.002, + "reward": -0.027189110405743122, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.027189110405743122, + "reward_after_std": 0.6605150178074837, + "reward_before_mean": 0.5520283579826355, + "reward_before_std": 0.5278488723561168, + "reward_change_max": 0.0, + "reward_change_mean": -0.5792174749076366, + "reward_change_min": -0.9451394081115723, + "reward_change_std": 0.35080388747155666, + "reward_std": 0.6605150178074837, + "rewards/cosine_scaled_reward": -0.18231916427612305, + "rewards/format_reward": 0.916666679084301, + "step": 454 + }, + { + "advantage_max": 1.9397364258766174, + "advantage_mean": -3.104408841103634e-09, + "advantage_min": -0.7429406382143497, + "advantage_std": 0.9998341724276543, + "completion_length": 1487.0000534057617, + "epoch": 0.52, + "grad_norm": 0.6561453342437744, + "kl": 0.07395362854003906, + "lambda_div_used": 0.5, + "learning_rate": 1.220245676671809e-07, + "loss": 0.003, + "reward": -0.03200409188866615, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03200409188866615, + "reward_after_std": 0.685878798365593, + "reward_before_mean": 0.5347689781337976, + "reward_before_std": 0.5669304095208645, + "reward_change_max": 0.00017894059419631958, + "reward_change_mean": -0.5667730458080769, + "reward_change_min": -0.9794808402657509, + "reward_change_std": 0.3551515229046345, + "reward_std": 0.6858788095414639, + "rewards/cosine_scaled_reward": -0.17011553049087524, + "rewards/format_reward": 0.8750000111758709, + "step": 455 + }, + { + "advantage_max": 1.9430626034736633, + "advantage_mean": 1.1486311790598336e-08, + "advantage_min": -0.7365816906094551, + "advantage_std": 0.9998258501291275, + "completion_length": 1819.020896911621, + "epoch": 0.5211428571428571, + "grad_norm": 0.5760971903800964, + "kl": 0.11542129516601562, + "lambda_div_used": 0.5, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0046, + "reward": -0.06889674533158541, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.06889674533158541, + "reward_after_std": 0.7080087102949619, + "reward_before_mean": 0.45906742848455906, + "reward_before_std": 0.6147968918085098, + "reward_change_max": 0.0, + "reward_change_mean": -0.5279641784727573, + "reward_change_min": -0.9316188842058182, + "reward_change_std": 0.3530385736376047, + "reward_std": 0.7080087289214134, + "rewards/cosine_scaled_reward": -0.1558829639106989, + "rewards/format_reward": 0.770833345130086, + "step": 456 + }, + { + "advantage_max": 1.9111386984586716, + "advantage_mean": -7.761022269292539e-09, + "advantage_min": -0.8441537171602249, + "advantage_std": 0.9998257532715797, + "completion_length": 1603.3541870117188, + "epoch": 0.5222857142857142, + "grad_norm": 0.42133650183677673, + "kl": 0.11229705810546875, + "lambda_div_used": 0.5, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0045, + "reward": 0.11794697493314743, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11794697493314743, + "reward_after_std": 0.746201453730464, + "reward_before_mean": 0.7967136232182384, + "reward_before_std": 0.6390935992822051, + "reward_change_max": 0.0001236647367477417, + "reward_change_mean": -0.6787666529417038, + "reward_change_min": -1.113053236156702, + "reward_change_std": 0.4282074421644211, + "reward_std": 0.7462014760822058, + "rewards/cosine_scaled_reward": 0.0025234604254364967, + "rewards/format_reward": 0.7916666697710752, + "step": 457 + }, + { + "advantage_max": 1.9392741024494171, + "advantage_mean": 1.940255400789681e-09, + "advantage_min": -0.7527594342827797, + "advantage_std": 0.9998307451605797, + "completion_length": 1405.750015258789, + "epoch": 0.5234285714285715, + "grad_norm": 0.8316559195518494, + "kl": 0.068511962890625, + "lambda_div_used": 0.5, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.0027, + "reward": 0.012192606925964355, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.012192606925964355, + "reward_after_std": 0.7541647478938103, + "reward_before_mean": 0.5929140914231539, + "reward_before_std": 0.6398164816200733, + "reward_change_max": 0.0, + "reward_change_mean": -0.5807214863598347, + "reward_change_min": -0.9291879385709763, + "reward_change_std": 0.3721654526889324, + "reward_std": 0.7541647478938103, + "rewards/cosine_scaled_reward": -0.12020963057875633, + "rewards/format_reward": 0.8333333395421505, + "step": 458 + }, + { + "advantage_max": 1.9433845430612564, + "advantage_mean": -8.847564902936256e-09, + "advantage_min": -0.7697709649801254, + "advantage_std": 0.9998654574155807, + "completion_length": 1161.9583587646484, + "epoch": 0.5245714285714286, + "grad_norm": 0.4935070276260376, + "kl": 0.054248809814453125, + "lambda_div_used": 0.5, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0022, + "reward": 0.30431293696165085, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.30431293696165085, + "reward_after_std": 0.8699779510498047, + "reward_before_mean": 1.0904029458761215, + "reward_before_std": 0.7013438567519188, + "reward_change_max": 0.0, + "reward_change_mean": -0.7860900089144707, + "reward_change_min": -1.30374313890934, + "reward_change_std": 0.4662424735724926, + "reward_std": 0.8699779585003853, + "rewards/cosine_scaled_reward": 0.045201453380286694, + "rewards/format_reward": 1.0, + "step": 459 + }, + { + "advantage_max": 1.894415706396103, + "advantage_mean": 3.3306690738754696e-16, + "advantage_min": -0.8416037708520889, + "advantage_std": 0.9998557269573212, + "completion_length": 1902.7708740234375, + "epoch": 0.5257142857142857, + "grad_norm": 0.5548562407493591, + "kl": 0.100677490234375, + "lambda_div_used": 0.5, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.004, + "reward": 0.12215450627263635, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12215450627263635, + "reward_after_std": 0.7530257254838943, + "reward_before_mean": 0.8011749666184187, + "reward_before_std": 0.644983071833849, + "reward_change_max": 0.0003541409969329834, + "reward_change_mean": -0.6790204904973507, + "reward_change_min": -1.1368926838040352, + "reward_change_std": 0.43394239246845245, + "reward_std": 0.7530257627367973, + "rewards/cosine_scaled_reward": -0.016079182736575603, + "rewards/format_reward": 0.8333333507180214, + "step": 460 + }, + { + "advantage_max": 1.961041271686554, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.7419874519109726, + "advantage_std": 0.9998823627829552, + "completion_length": 1571.8541946411133, + "epoch": 0.5268571428571428, + "grad_norm": 0.7262814044952393, + "kl": 0.06304168701171875, + "lambda_div_used": 0.5, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0025, + "reward": 0.22672313824295998, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.22672313824295998, + "reward_after_std": 0.9908672571182251, + "reward_before_mean": 0.909737903624773, + "reward_before_std": 0.827164052054286, + "reward_change_max": 0.0015447810292243958, + "reward_change_mean": -0.6830147504806519, + "reward_change_min": -1.196632169187069, + "reward_change_std": 0.4549138732254505, + "reward_std": 0.9908672869205475, + "rewards/cosine_scaled_reward": 0.038202277704840526, + "rewards/format_reward": 0.8333333507180214, + "step": 461 + }, + { + "advantage_max": 1.9567998200654984, + "advantage_mean": 1.3659398168108794e-08, + "advantage_min": -0.7012544423341751, + "advantage_std": 0.9998421967029572, + "completion_length": 1640.125015258789, + "epoch": 0.528, + "grad_norm": 0.7010632157325745, + "kl": 0.12639236450195312, + "lambda_div_used": 0.5, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0051, + "reward": -0.08827551966533065, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.08827551966533065, + "reward_after_std": 0.7464075610041618, + "reward_before_mean": 0.4057305008172989, + "reward_before_std": 0.6126368492841721, + "reward_change_max": 0.0030915439128875732, + "reward_change_mean": -0.4940060283988714, + "reward_change_min": -0.836365569382906, + "reward_change_std": 0.3115391172468662, + "reward_std": 0.7464075684547424, + "rewards/cosine_scaled_reward": -0.2033847626298666, + "rewards/format_reward": 0.8125000055879354, + "step": 462 + }, + { + "advantage_max": 1.9140111058950424, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.7328698076307774, + "advantage_std": 0.999872162938118, + "completion_length": 1977.6042022705078, + "epoch": 0.5291428571428571, + "grad_norm": 0.8431422710418701, + "kl": 0.11789321899414062, + "lambda_div_used": 0.5, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0047, + "reward": 0.08569935825653374, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08569935825653374, + "reward_after_std": 0.8853895887732506, + "reward_before_mean": 0.6880028424784541, + "reward_before_std": 0.7817164659500122, + "reward_change_max": 0.002470634877681732, + "reward_change_mean": -0.6023034751415253, + "reward_change_min": -1.1469271406531334, + "reward_change_std": 0.4306590985506773, + "reward_std": 0.8853896260261536, + "rewards/cosine_scaled_reward": -0.010165270417928696, + "rewards/format_reward": 0.7083333469927311, + "step": 463 + }, + { + "advantage_max": 1.9133918732404709, + "advantage_mean": 9.934107536579972e-09, + "advantage_min": -0.8422577381134033, + "advantage_std": 0.999857671558857, + "completion_length": 1337.8125228881836, + "epoch": 0.5302857142857142, + "grad_norm": 0.8812828660011292, + "kl": 0.09333038330078125, + "lambda_div_used": 0.5, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0037, + "reward": 0.2514381390064955, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2514381390064955, + "reward_after_std": 0.7524741515517235, + "reward_before_mean": 1.0340424440801144, + "reward_before_std": 0.5952115915715694, + "reward_change_max": 0.0028984099626541138, + "reward_change_mean": -0.782604280859232, + "reward_change_min": -1.204802818596363, + "reward_change_std": 0.47602505423128605, + "reward_std": 0.7524741850793362, + "rewards/cosine_scaled_reward": 0.10035453177988529, + "rewards/format_reward": 0.8333333395421505, + "step": 464 + }, + { + "advantage_max": 1.9208644330501556, + "advantage_mean": 5.587935447692871e-09, + "advantage_min": -0.8015524484217167, + "advantage_std": 0.9998502060770988, + "completion_length": 1781.9375457763672, + "epoch": 0.5314285714285715, + "grad_norm": 0.9359230399131775, + "kl": 0.13791275024414062, + "lambda_div_used": 0.5, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0055, + "reward": -0.023814965039491653, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.023814965039491653, + "reward_after_std": 0.7645898275077343, + "reward_before_mean": 0.5261010061949492, + "reward_before_std": 0.678345825523138, + "reward_change_max": 0.0007695704698562622, + "reward_change_mean": -0.5499159917235374, + "reward_change_min": -0.9802470579743385, + "reward_change_std": 0.3772691786289215, + "reward_std": 0.764589861035347, + "rewards/cosine_scaled_reward": -0.12236616667360067, + "rewards/format_reward": 0.7708333432674408, + "step": 465 + }, + { + "advantage_max": 1.9382754564285278, + "advantage_mean": -5.587935447692871e-09, + "advantage_min": -0.7668173387646675, + "advantage_std": 0.9998831152915955, + "completion_length": 1432.2292098999023, + "epoch": 0.5325714285714286, + "grad_norm": 0.9250560402870178, + "kl": 0.0701141357421875, + "lambda_div_used": 0.5, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0028, + "reward": 0.3115917220711708, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3115917220711708, + "reward_after_std": 0.9879258796572685, + "reward_before_mean": 1.0618979572318494, + "reward_before_std": 0.8279051408171654, + "reward_change_max": 0.0, + "reward_change_mean": -0.7503062412142754, + "reward_change_min": -1.3201973289251328, + "reward_change_std": 0.48962003737688065, + "reward_std": 0.9879258833825588, + "rewards/cosine_scaled_reward": 0.09344895218964666, + "rewards/format_reward": 0.8750000074505806, + "step": 466 + }, + { + "advantage_max": 1.8915677815675735, + "advantage_mean": 9.313225746154785e-09, + "advantage_min": -0.8872964084148407, + "advantage_std": 0.9998693838715553, + "completion_length": 1769.1667251586914, + "epoch": 0.5337142857142857, + "grad_norm": 0.6294421553611755, + "kl": 0.1210784912109375, + "lambda_div_used": 0.5, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0048, + "reward": 0.12419883778784424, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12419883778784424, + "reward_after_std": 0.903032261878252, + "reward_before_mean": 0.7542639700695872, + "reward_before_std": 0.8407903723418713, + "reward_change_max": 0.0016758441925048828, + "reward_change_mean": -0.6300651095807552, + "reward_change_min": -1.129418022930622, + "reward_change_std": 0.4298090599477291, + "reward_std": 0.9030322767794132, + "rewards/cosine_scaled_reward": -0.04995136708021164, + "rewards/format_reward": 0.8541666865348816, + "step": 467 + }, + { + "advantage_max": 1.9225873202085495, + "advantage_mean": -1.2417633588057697e-09, + "advantage_min": -0.7797147929668427, + "advantage_std": 0.9998800531029701, + "completion_length": 1557.770881652832, + "epoch": 0.5348571428571428, + "grad_norm": 0.6982704401016235, + "kl": 0.06484222412109375, + "lambda_div_used": 0.5, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0026, + "reward": 0.18363811075687408, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18363811075687408, + "reward_after_std": 0.9357991591095924, + "reward_before_mean": 0.8452462187269703, + "reward_before_std": 0.8119201101362705, + "reward_change_max": 0.0, + "reward_change_mean": -0.6616080775856972, + "reward_change_min": -1.198854148387909, + "reward_change_std": 0.4518035836517811, + "reward_std": 0.9357991963624954, + "rewards/cosine_scaled_reward": -0.01487693004310131, + "rewards/format_reward": 0.8750000149011612, + "step": 468 + }, + { + "advantage_max": 1.890088975429535, + "advantage_mean": 2.980232283178452e-08, + "advantage_min": -0.8545728474855423, + "advantage_std": 0.9998464807868004, + "completion_length": 1714.7709007263184, + "epoch": 0.536, + "grad_norm": 0.9024974703788757, + "kl": 0.16271209716796875, + "lambda_div_used": 0.5, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0065, + "reward": -0.060654870234429836, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.060654870234429836, + "reward_after_std": 0.6992764100432396, + "reward_before_mean": 0.47816853411495686, + "reward_before_std": 0.6197003275156021, + "reward_change_max": 0.0035224109888076782, + "reward_change_mean": -0.5388233549892902, + "reward_change_min": -0.932822585105896, + "reward_change_std": 0.3747980333864689, + "reward_std": 0.6992764323949814, + "rewards/cosine_scaled_reward": -0.1567490752786398, + "rewards/format_reward": 0.791666679084301, + "step": 469 + }, + { + "advantage_max": 1.9362804740667343, + "advantage_mean": 4.967053435223079e-09, + "advantage_min": -0.8115110918879509, + "advantage_std": 0.9998445808887482, + "completion_length": 2205.7083740234375, + "epoch": 0.5371428571428571, + "grad_norm": 1.068304181098938, + "kl": 0.21833038330078125, + "lambda_div_used": 0.5, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.0087, + "reward": -0.10846987180411816, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.10846987180411816, + "reward_after_std": 0.7410068362951279, + "reward_before_mean": 0.37174488231539726, + "reward_before_std": 0.6479252725839615, + "reward_change_max": 0.0, + "reward_change_mean": -0.48021476343274117, + "reward_change_min": -0.8555741608142853, + "reward_change_std": 0.31600991636514664, + "reward_std": 0.7410068362951279, + "rewards/cosine_scaled_reward": -0.16829423449235037, + "rewards/format_reward": 0.7083333525806665, + "step": 470 + }, + { + "advantage_max": 1.8785135746002197, + "advantage_mean": 3.1044086745701804e-09, + "advantage_min": -0.8254240527749062, + "advantage_std": 0.9998672753572464, + "completion_length": 2018.7917098999023, + "epoch": 0.5382857142857143, + "grad_norm": 1.2153743505477905, + "kl": 0.1783733367919922, + "lambda_div_used": 0.5, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0071, + "reward": 0.15960154053755105, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.15960154053755105, + "reward_after_std": 0.8837853036820889, + "reward_before_mean": 0.8326390506699681, + "reward_before_std": 0.8361635878682137, + "reward_change_max": 0.002089478075504303, + "reward_change_mean": -0.6730375066399574, + "reward_change_min": -1.2551670372486115, + "reward_change_std": 0.519281305372715, + "reward_std": 0.8837853148579597, + "rewards/cosine_scaled_reward": 0.05173617601394653, + "rewards/format_reward": 0.7291666828095913, + "step": 471 + }, + { + "advantage_max": 1.9424489885568619, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.680642619729042, + "advantage_std": 0.9998679831624031, + "completion_length": 1611.5209045410156, + "epoch": 0.5394285714285715, + "grad_norm": 0.6896313428878784, + "kl": 0.11153030395507812, + "lambda_div_used": 0.5, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0045, + "reward": 0.03950534947216511, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03950534947216511, + "reward_after_std": 0.8686452694237232, + "reward_before_mean": 0.6119452454149723, + "reward_before_std": 0.7634111884981394, + "reward_change_max": 0.0, + "reward_change_mean": -0.5724398903548717, + "reward_change_min": -1.1462013721466064, + "reward_change_std": 0.40038398280739784, + "reward_std": 0.8686453029513359, + "rewards/cosine_scaled_reward": -0.11069405497983098, + "rewards/format_reward": 0.8333333358168602, + "step": 472 + }, + { + "advantage_max": 1.914126843214035, + "advantage_mean": 2.2351742678949904e-08, + "advantage_min": -0.8491419404745102, + "advantage_std": 0.9998372197151184, + "completion_length": 1868.1875762939453, + "epoch": 0.5405714285714286, + "grad_norm": 0.8767368793487549, + "kl": 0.1471405029296875, + "lambda_div_used": 0.5, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0059, + "reward": -0.07519353553652763, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07519353553652763, + "reward_after_std": 0.693619716912508, + "reward_before_mean": 0.4554354092106223, + "reward_before_std": 0.6181728020310402, + "reward_change_max": 0.0007615610957145691, + "reward_change_mean": -0.5306289177387953, + "reward_change_min": -0.8539394959807396, + "reward_change_std": 0.34847177751362324, + "reward_std": 0.6936197318136692, + "rewards/cosine_scaled_reward": -0.1472823154181242, + "rewards/format_reward": 0.7500000055879354, + "step": 473 + }, + { + "advantage_max": 1.9438211470842361, + "advantage_mean": -1.0554989215982857e-08, + "advantage_min": -0.8066472448408604, + "advantage_std": 0.9998703449964523, + "completion_length": 1492.1042098999023, + "epoch": 0.5417142857142857, + "grad_norm": 0.5975131988525391, + "kl": 0.14487838745117188, + "lambda_div_used": 0.5, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0058, + "reward": 0.5643487274646759, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5643487274646759, + "reward_after_std": 0.8311857730150223, + "reward_before_mean": 1.5848356559872627, + "reward_before_std": 0.5555077791213989, + "reward_change_max": 0.0, + "reward_change_mean": -1.0204869396984577, + "reward_change_min": -1.4674010053277016, + "reward_change_std": 0.5802014097571373, + "reward_std": 0.8311858177185059, + "rewards/cosine_scaled_reward": 0.34450116008520126, + "rewards/format_reward": 0.8958333395421505, + "step": 474 + }, + { + "advantage_max": 1.9134457558393478, + "advantage_mean": -1.924733428193548e-08, + "advantage_min": -0.7586380168795586, + "advantage_std": 0.9998574778437614, + "completion_length": 1629.0417022705078, + "epoch": 0.5428571428571428, + "grad_norm": 0.5304942727088928, + "kl": 0.10820388793945312, + "lambda_div_used": 0.5, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0043, + "reward": 0.18582246452569962, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18582246452569962, + "reward_after_std": 0.7934923768043518, + "reward_before_mean": 0.9009908102452755, + "reward_before_std": 0.6717259753495455, + "reward_change_max": 0.0006563067436218262, + "reward_change_mean": -0.7151683606207371, + "reward_change_min": -1.2400154992938042, + "reward_change_std": 0.46559458039700985, + "reward_std": 0.7934924028813839, + "rewards/cosine_scaled_reward": 0.06507872650399804, + "rewards/format_reward": 0.770833345130086, + "step": 475 + }, + { + "advantage_max": 1.9225990921258926, + "advantage_mean": -1.552204331733975e-08, + "advantage_min": -0.8317600563168526, + "advantage_std": 0.9998689591884613, + "completion_length": 1525.9792098999023, + "epoch": 0.544, + "grad_norm": 0.7439927458763123, + "kl": 0.07448959350585938, + "lambda_div_used": 0.5, + "learning_rate": 1.063017833182728e-07, + "loss": 0.003, + "reward": 0.31535858660936356, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.31535858660936356, + "reward_after_std": 0.8980049230158329, + "reward_before_mean": 1.1031142249703407, + "reward_before_std": 0.7534971106797457, + "reward_change_max": 0.0, + "reward_change_mean": -0.7877555936574936, + "reward_change_min": -1.2570370063185692, + "reward_change_std": 0.4909738227725029, + "reward_std": 0.8980049341917038, + "rewards/cosine_scaled_reward": 0.10364041291177273, + "rewards/format_reward": 0.895833358168602, + "step": 476 + }, + { + "advantage_max": 1.8741195350885391, + "advantage_mean": 2.220446049250313e-16, + "advantage_min": -0.8873728774487972, + "advantage_std": 0.9998980090022087, + "completion_length": 1399.2917175292969, + "epoch": 0.5451428571428572, + "grad_norm": 0.583365797996521, + "kl": 0.11350250244140625, + "lambda_div_used": 0.5, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0045, + "reward": 0.4705557865090668, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4705557865090668, + "reward_after_std": 1.0551037564873695, + "reward_before_mean": 1.347284235060215, + "reward_before_std": 0.9735592044889927, + "reward_change_max": 0.001387663185596466, + "reward_change_mean": -0.8767284750938416, + "reward_change_min": -1.5517780631780624, + "reward_change_std": 0.6072789244353771, + "reward_std": 1.0551037788391113, + "rewards/cosine_scaled_reward": 0.23614212637767196, + "rewards/format_reward": 0.8750000223517418, + "step": 477 + }, + { + "advantage_max": 1.9357862919569016, + "advantage_mean": -4.967053657267684e-09, + "advantage_min": -0.8154433369636536, + "advantage_std": 0.9998578280210495, + "completion_length": 2062.395881652832, + "epoch": 0.5462857142857143, + "grad_norm": 0.5441526770591736, + "kl": 0.16361618041992188, + "lambda_div_used": 0.5, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0065, + "reward": 0.009207317605614662, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.009207317605614662, + "reward_after_std": 0.8506947085261345, + "reward_before_mean": 0.5521553927101195, + "reward_before_std": 0.7576607428491116, + "reward_change_max": 0.0030072256922721863, + "reward_change_mean": -0.5429480616003275, + "reward_change_min": -0.9525541067123413, + "reward_change_std": 0.37604556791484356, + "reward_std": 0.8506947420537472, + "rewards/cosine_scaled_reward": -0.0780889829620719, + "rewards/format_reward": 0.7083333432674408, + "step": 478 + }, + { + "advantage_max": 1.9259414672851562, + "advantage_mean": -1.0554989493538613e-08, + "advantage_min": -0.8462852165102959, + "advantage_std": 0.9998517110943794, + "completion_length": 1818.8958587646484, + "epoch": 0.5474285714285714, + "grad_norm": 0.8077680468559265, + "kl": 0.18693161010742188, + "lambda_div_used": 0.5, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0075, + "reward": 0.08315641060471535, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08315641060471535, + "reward_after_std": 0.7782744280993938, + "reward_before_mean": 0.7172673875465989, + "reward_before_std": 0.6326955072581768, + "reward_change_max": 0.0008339658379554749, + "reward_change_mean": -0.634110989049077, + "reward_change_min": -1.057134885340929, + "reward_change_std": 0.3948863986879587, + "reward_std": 0.7782744280993938, + "rewards/cosine_scaled_reward": -0.04761631414294243, + "rewards/format_reward": 0.8125000055879354, + "step": 479 + }, + { + "advantage_max": 1.8997021317481995, + "advantage_mean": -6.519258410886408e-09, + "advantage_min": -0.8673086017370224, + "advantage_std": 0.9998597353696823, + "completion_length": 1601.9375534057617, + "epoch": 0.5485714285714286, + "grad_norm": 0.9863570928573608, + "kl": 0.14463424682617188, + "lambda_div_used": 0.5, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0058, + "reward": -0.006774002220481634, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.006774002220481634, + "reward_after_std": 0.8246656432747841, + "reward_before_mean": 0.5343326807487756, + "reward_before_std": 0.7610690146684647, + "reward_change_max": 0.0005768761038780212, + "reward_change_mean": -0.5411067046225071, + "reward_change_min": -1.0265009850263596, + "reward_change_std": 0.388399351388216, + "reward_std": 0.8246656730771065, + "rewards/cosine_scaled_reward": -0.13908366532996297, + "rewards/format_reward": 0.8125000149011612, + "step": 480 + }, + { + "advantage_max": 1.9240192770957947, + "advantage_mean": 2.7939678071131624e-09, + "advantage_min": -0.8690285757184029, + "advantage_std": 0.9998449757695198, + "completion_length": 1750.354232788086, + "epoch": 0.5497142857142857, + "grad_norm": 1.2178927659988403, + "kl": 0.13234710693359375, + "lambda_div_used": 0.5, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0053, + "reward": -0.010232685133814812, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.010232685133814812, + "reward_after_std": 0.792898278683424, + "reward_before_mean": 0.5407127062790096, + "reward_before_std": 0.7064782101660967, + "reward_change_max": 0.0, + "reward_change_mean": -0.5509453937411308, + "reward_change_min": -0.9236162602901459, + "reward_change_std": 0.3657870851457119, + "reward_std": 0.7928983122110367, + "rewards/cosine_scaled_reward": -0.1463103265268728, + "rewards/format_reward": 0.833333358168602, + "step": 481 + }, + { + "advantage_max": 1.9587009698152542, + "advantage_mean": 2.483527383745354e-09, + "advantage_min": -0.6773890405893326, + "advantage_std": 0.9998364299535751, + "completion_length": 1611.2500381469727, + "epoch": 0.5508571428571428, + "grad_norm": 0.6870055794715881, + "kl": 0.1389312744140625, + "lambda_div_used": 0.5, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0056, + "reward": 0.06978282704949379, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06978282704949379, + "reward_after_std": 0.7692562937736511, + "reward_before_mean": 0.6915448009967804, + "reward_before_std": 0.6328919120132923, + "reward_change_max": 0.001624472439289093, + "reward_change_mean": -0.6217619744129479, + "reward_change_min": -1.0122926868498325, + "reward_change_std": 0.38024843856692314, + "reward_std": 0.7692563086748123, + "rewards/cosine_scaled_reward": -0.03964426927268505, + "rewards/format_reward": 0.770833333954215, + "step": 482 + }, + { + "advantage_max": 1.9146474301815033, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.8112232387065887, + "advantage_std": 0.9998473078012466, + "completion_length": 1654.4583740234375, + "epoch": 0.552, + "grad_norm": 1.1116269826889038, + "kl": 0.12012290954589844, + "lambda_div_used": 0.5, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0048, + "reward": 0.06403302727267146, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06403302727267146, + "reward_after_std": 0.8078042268753052, + "reward_before_mean": 0.6758778803050518, + "reward_before_std": 0.7425609044730663, + "reward_change_max": 0.0, + "reward_change_mean": -0.6118448786437511, + "reward_change_min": -1.0602629855275154, + "reward_change_std": 0.42626157216727734, + "reward_std": 0.8078042753040791, + "rewards/cosine_scaled_reward": -0.0787277314811945, + "rewards/format_reward": 0.8333333395421505, + "step": 483 + }, + { + "advantage_max": 1.879467323422432, + "advantage_mean": -6.208814573582799e-10, + "advantage_min": -0.8875841423869133, + "advantage_std": 0.9998355507850647, + "completion_length": 1322.7500457763672, + "epoch": 0.5531428571428572, + "grad_norm": 0.5565298199653625, + "kl": 0.09266090393066406, + "lambda_div_used": 0.5, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0037, + "reward": 0.12074141576886177, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12074141576886177, + "reward_after_std": 0.7823374047875404, + "reward_before_mean": 0.7933894079178572, + "reward_before_std": 0.7216227240860462, + "reward_change_max": 0.0009933337569236755, + "reward_change_mean": -0.6726479884237051, + "reward_change_min": -1.105917226523161, + "reward_change_std": 0.4536776263266802, + "reward_std": 0.7823374196887016, + "rewards/cosine_scaled_reward": -0.04080530256032944, + "rewards/format_reward": 0.8750000149011612, + "step": 484 + }, + { + "advantage_max": 1.9454391449689865, + "advantage_mean": -1.2417635808503746e-09, + "advantage_min": -0.7871048152446747, + "advantage_std": 0.9998297542333603, + "completion_length": 1602.2291793823242, + "epoch": 0.5542857142857143, + "grad_norm": 0.7432307600975037, + "kl": 0.15546417236328125, + "lambda_div_used": 0.5, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0062, + "reward": 0.10268004890531301, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10268004890531301, + "reward_after_std": 0.6411448940634727, + "reward_before_mean": 0.7937132641673088, + "reward_before_std": 0.47726221568882465, + "reward_change_max": 0.0, + "reward_change_mean": -0.6910331957042217, + "reward_change_min": -1.0900376811623573, + "reward_change_std": 0.40093186870217323, + "reward_std": 0.6411449201405048, + "rewards/cosine_scaled_reward": -0.061476717703044415, + "rewards/format_reward": 0.916666679084301, + "step": 485 + }, + { + "advantage_max": 1.9508067667484283, + "advantage_mean": 6.208817460162663e-09, + "advantage_min": -0.7736483179032803, + "advantage_std": 0.9998454228043556, + "completion_length": 1003.0625267028809, + "epoch": 0.5554285714285714, + "grad_norm": 0.9682008028030396, + "kl": 0.0872650146484375, + "lambda_div_used": 0.5, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0035, + "reward": 0.12867436837404966, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12867436837404966, + "reward_after_std": 0.7180111818015575, + "reward_before_mean": 0.8117377087473869, + "reward_before_std": 0.5166822988539934, + "reward_change_max": 0.0017078518867492676, + "reward_change_mean": -0.6830633729696274, + "reward_change_min": -1.0265196487307549, + "reward_change_std": 0.3901584856212139, + "reward_std": 0.7180111892521381, + "rewards/cosine_scaled_reward": -0.052464481675997376, + "rewards/format_reward": 0.9166666865348816, + "step": 486 + }, + { + "advantage_max": 1.9592144191265106, + "advantage_mean": -1.2417634698280722e-09, + "advantage_min": -0.6940793693065643, + "advantage_std": 0.9998834952712059, + "completion_length": 1188.1041774749756, + "epoch": 0.5565714285714286, + "grad_norm": 0.5985164642333984, + "kl": 0.08372879028320312, + "lambda_div_used": 0.5, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0034, + "reward": 0.41113134508486837, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.41113134508486837, + "reward_after_std": 0.9559298977255821, + "reward_before_mean": 1.257676038891077, + "reward_before_std": 0.7205435046926141, + "reward_change_max": 0.0, + "reward_change_mean": -0.8465447202324867, + "reward_change_min": -1.3234855383634567, + "reward_change_std": 0.5153034143149853, + "reward_std": 0.9559298977255821, + "rewards/cosine_scaled_reward": 0.19133803341537714, + "rewards/format_reward": 0.8750000074505806, + "step": 487 + }, + { + "advantage_max": 1.9492617100477219, + "advantage_mean": 2.359350581571107e-08, + "advantage_min": -0.7352440245449543, + "advantage_std": 0.999819852411747, + "completion_length": 1420.7708702087402, + "epoch": 0.5577142857142857, + "grad_norm": 0.833846390247345, + "kl": 0.15200424194335938, + "lambda_div_used": 0.5, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0061, + "reward": -0.00616908073425293, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.00616908073425293, + "reward_after_std": 0.6220297068357468, + "reward_before_mean": 0.6039720326662064, + "reward_before_std": 0.4390154229477048, + "reward_change_max": 0.0, + "reward_change_mean": -0.6101410929113626, + "reward_change_min": -0.9060985743999481, + "reward_change_std": 0.3509222362190485, + "reward_std": 0.6220297180116177, + "rewards/cosine_scaled_reward": -0.11468065832741559, + "rewards/format_reward": 0.8333333414047956, + "step": 488 + }, + { + "advantage_max": 1.8662956207990646, + "advantage_mean": 3.228584999348527e-08, + "advantage_min": -0.9588077291846275, + "advantage_std": 0.9998302757740021, + "completion_length": 1769.5000457763672, + "epoch": 0.5588571428571428, + "grad_norm": 1.0365986824035645, + "kl": 0.1967620849609375, + "lambda_div_used": 0.5, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0079, + "reward": -0.1484443813096732, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1484443813096732, + "reward_after_std": 0.6222599521279335, + "reward_before_mean": 0.34515629429370165, + "reward_before_std": 0.5821492820978165, + "reward_change_max": 0.0, + "reward_change_mean": -0.49360066652297974, + "reward_change_min": -0.8139010816812515, + "reward_change_std": 0.3350451663136482, + "reward_std": 0.6222599819302559, + "rewards/cosine_scaled_reward": -0.20242186076939106, + "rewards/format_reward": 0.7500000186264515, + "step": 489 + }, + { + "advantage_max": 1.992196410894394, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.6759535558521748, + "advantage_std": 0.999831385910511, + "completion_length": 1338.2083740234375, + "epoch": 0.56, + "grad_norm": 0.7504719495773315, + "kl": 0.09897232055664062, + "lambda_div_used": 0.5, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.004, + "reward": 0.0957014646846801, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.0957014646846801, + "reward_after_std": 0.7212163619697094, + "reward_before_mean": 0.7547863759100437, + "reward_before_std": 0.49603763967752457, + "reward_change_max": 0.0, + "reward_change_mean": -0.6590849310159683, + "reward_change_min": -0.9607073739171028, + "reward_change_std": 0.3600234966725111, + "reward_std": 0.72121636942029, + "rewards/cosine_scaled_reward": -0.07052347250282764, + "rewards/format_reward": 0.8958333432674408, + "step": 490 + }, + { + "advantage_max": 1.9146278351545334, + "advantage_mean": -1.1175871117430347e-08, + "advantage_min": -0.8270855993032455, + "advantage_std": 0.9998930171132088, + "completion_length": 1563.9166793823242, + "epoch": 0.5611428571428572, + "grad_norm": 0.5624725818634033, + "kl": 0.09905624389648438, + "lambda_div_used": 0.5, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.004, + "reward": 0.39007012732326984, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.39007012732326984, + "reward_after_std": 1.004666656255722, + "reward_before_mean": 1.2099284492433071, + "reward_before_std": 0.86801877617836, + "reward_change_max": 0.0, + "reward_change_mean": -0.8198583498597145, + "reward_change_min": -1.4678706899285316, + "reward_change_std": 0.5273435860872269, + "reward_std": 1.0046666860580444, + "rewards/cosine_scaled_reward": 0.15704754507169127, + "rewards/format_reward": 0.8958333432674408, + "step": 491 + }, + { + "advantage_max": 1.9598166197538376, + "advantage_mean": 3.352761424046946e-08, + "advantage_min": -0.6968494616448879, + "advantage_std": 0.9997868090867996, + "completion_length": 1500.0625190734863, + "epoch": 0.5622857142857143, + "grad_norm": 1.76079261302948, + "kl": 0.13095855712890625, + "lambda_div_used": 0.5, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0052, + "reward": -0.011532854987308383, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.011532854987308383, + "reward_after_std": 0.6456209793686867, + "reward_before_mean": 0.5838748067617416, + "reward_before_std": 0.48374018631875515, + "reward_change_max": 0.0, + "reward_change_mean": -0.595407678745687, + "reward_change_min": -0.9056248366832733, + "reward_change_std": 0.34459487441927195, + "reward_std": 0.6456209793686867, + "rewards/cosine_scaled_reward": -0.11431259848177433, + "rewards/format_reward": 0.8125000149011612, + "step": 492 + }, + { + "advantage_max": 1.9004540145397186, + "advantage_mean": 1.3038516377683607e-08, + "advantage_min": -0.862372025847435, + "advantage_std": 0.9998877570033073, + "completion_length": 1285.2916793823242, + "epoch": 0.5634285714285714, + "grad_norm": 0.5592947602272034, + "kl": 0.09164047241210938, + "lambda_div_used": 0.5, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0037, + "reward": 0.3626462905667722, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3626462905667722, + "reward_after_std": 1.0406964905560017, + "reward_before_mean": 1.1492395093664527, + "reward_before_std": 0.9662736691534519, + "reward_change_max": 0.0, + "reward_change_mean": -0.786593209952116, + "reward_change_min": -1.3980813696980476, + "reward_change_std": 0.5360561832785606, + "reward_std": 1.0406964905560017, + "rewards/cosine_scaled_reward": 0.11628641374409199, + "rewards/format_reward": 0.916666679084301, + "step": 493 + }, + { + "advantage_max": 1.9788424372673035, + "advantage_mean": -6.208817182606907e-09, + "advantage_min": -0.7454531416296959, + "advantage_std": 0.9998568743467331, + "completion_length": 1362.6042022705078, + "epoch": 0.5645714285714286, + "grad_norm": 0.7242301106452942, + "kl": 0.09954833984375, + "lambda_div_used": 0.5, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.004, + "reward": 0.12059944961220026, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.12059944961220026, + "reward_after_std": 0.7983547747135162, + "reward_before_mean": 0.7689751125872135, + "reward_before_std": 0.6040369383990765, + "reward_change_max": 0.0, + "reward_change_mean": -0.6483756825327873, + "reward_change_min": -0.9728981480002403, + "reward_change_std": 0.3582359105348587, + "reward_std": 0.7983547821640968, + "rewards/cosine_scaled_reward": -0.08426245115697384, + "rewards/format_reward": 0.9375000149011612, + "step": 494 + }, + { + "advantage_max": 1.87544085085392, + "advantage_mean": -1.8626451825376478e-08, + "advantage_min": -0.9081234857439995, + "advantage_std": 0.9998789876699448, + "completion_length": 1417.7500457763672, + "epoch": 0.5657142857142857, + "grad_norm": 0.6767681241035461, + "kl": 0.0517578125, + "lambda_div_used": 0.5, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0021, + "reward": 0.3090813576709479, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3090813576709479, + "reward_after_std": 0.9749084785580635, + "reward_before_mean": 1.0812942683696747, + "reward_before_std": 0.9502445831894875, + "reward_change_max": 0.0, + "reward_change_mean": -0.7722129225730896, + "reward_change_min": -1.4743258655071259, + "reward_change_std": 0.5595591589808464, + "reward_std": 0.9749084934592247, + "rewards/cosine_scaled_reward": 0.07189711090177298, + "rewards/format_reward": 0.9375000149011612, + "step": 495 + }, + { + "advantage_max": 1.9674073159694672, + "advantage_mean": 1.7384688022481498e-08, + "advantage_min": -0.7077538371086121, + "advantage_std": 0.9998735785484314, + "completion_length": 1427.5416946411133, + "epoch": 0.5668571428571428, + "grad_norm": 0.7200669050216675, + "kl": 0.10258865356445312, + "lambda_div_used": 0.5, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0041, + "reward": 0.2750055827200413, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2750055827200413, + "reward_after_std": 0.907068207859993, + "reward_before_mean": 1.0247598066926003, + "reward_before_std": 0.6894198805093765, + "reward_change_max": 0.0008356347680091858, + "reward_change_mean": -0.7497542221099138, + "reward_change_min": -1.2161314561963081, + "reward_change_std": 0.4413383901119232, + "reward_std": 0.9070682227611542, + "rewards/cosine_scaled_reward": 0.13737990334630013, + "rewards/format_reward": 0.7500000037252903, + "step": 496 + }, + { + "advantage_max": 1.9448639154434204, + "advantage_mean": -4.1443856013678726e-08, + "advantage_min": -0.7576332688331604, + "advantage_std": 0.9998561069369316, + "completion_length": 1287.0208740234375, + "epoch": 0.568, + "grad_norm": 0.7512621879577637, + "kl": 0.16324996948242188, + "lambda_div_used": 0.5, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0065, + "reward": 0.35428231628611684, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35428231628611684, + "reward_after_std": 0.787726778537035, + "reward_before_mean": 1.2080298122018576, + "reward_before_std": 0.5447326581925154, + "reward_change_max": 0.0021923109889030457, + "reward_change_mean": -0.8537475019693375, + "reward_change_min": -1.3086559250950813, + "reward_change_std": 0.49219193682074547, + "reward_std": 0.7877267859876156, + "rewards/cosine_scaled_reward": 0.13526488654315472, + "rewards/format_reward": 0.9375, + "step": 497 + }, + { + "advantage_max": 1.9822159558534622, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.6983235441148281, + "advantage_std": 0.9998753666877747, + "completion_length": 1643.0416870117188, + "epoch": 0.5691428571428572, + "grad_norm": 0.4986113905906677, + "kl": 0.11568832397460938, + "lambda_div_used": 0.5, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0046, + "reward": 0.029189766384661198, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.029189766384661198, + "reward_after_std": 1.0231481902301311, + "reward_before_mean": 0.5286800367757678, + "reward_before_std": 0.8786374758929014, + "reward_change_max": 0.0018067136406898499, + "reward_change_mean": -0.49949030485004187, + "reward_change_min": -0.8799277618527412, + "reward_change_std": 0.32619673386216164, + "reward_std": 1.0231482051312923, + "rewards/cosine_scaled_reward": -0.12107665184885263, + "rewards/format_reward": 0.770833345130086, + "step": 498 + }, + { + "advantage_max": 1.921624556183815, + "advantage_mean": -1.2107193636534674e-08, + "advantage_min": -0.7888023443520069, + "advantage_std": 0.9998480603098869, + "completion_length": 1392.8333892822266, + "epoch": 0.5702857142857143, + "grad_norm": 0.8271004557609558, + "kl": 0.08435821533203125, + "lambda_div_used": 0.5, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0034, + "reward": 0.13609783939318731, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13609783939318731, + "reward_after_std": 0.7598652169108391, + "reward_before_mean": 0.817673472687602, + "reward_before_std": 0.6392671652138233, + "reward_change_max": 0.009077653288841248, + "reward_change_mean": -0.6815756633877754, + "reward_change_min": -1.0989074856042862, + "reward_change_std": 0.4282492324709892, + "reward_std": 0.7598652243614197, + "rewards/cosine_scaled_reward": -0.07032993622124195, + "rewards/format_reward": 0.9583333358168602, + "step": 499 + }, + { + "advantage_max": 1.9175880998373032, + "advantage_mean": 2.483526606589237e-09, + "advantage_min": -0.7598303630948067, + "advantage_std": 0.9998560920357704, + "completion_length": 1688.8541946411133, + "epoch": 0.5714285714285714, + "grad_norm": 0.9154338240623474, + "kl": 0.20852279663085938, + "lambda_div_used": 0.5, + "learning_rate": 1e-07, + "loss": 0.0083, + "reward": -0.03248512791469693, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03248512791469693, + "reward_after_std": 0.8366284519433975, + "reward_before_mean": 0.4850434511899948, + "reward_before_std": 0.754515565931797, + "reward_change_max": 0.001007281243801117, + "reward_change_mean": -0.5175285935401917, + "reward_change_min": -0.9981810115277767, + "reward_change_std": 0.3586902804672718, + "reward_std": 0.8366284593939781, + "rewards/cosine_scaled_reward": -0.1428949534893036, + "rewards/format_reward": 0.770833345130086, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0009665181785810547, + "train_runtime": 55686.6375, + "train_samples_per_second": 0.431, + "train_steps_per_second": 0.009 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}